├── .gitignore
├── Final Project
│   ├── OpenStreetMap-UWS.ipynb
│   ├── README.md
│   └── img
│       ├── 1.png
│       └── 2.png
├── Lesson 1 - Data Extraction Fundamentals
│   ├── 07-Parsing CSV Files
│   │   ├── beatles-diskography.csv
│   │   └── simple.py
│   ├── 11-Reading Excel Files
│   │   ├── 2013_ERCOT_Hourly_Load_Data.xls
│   │   ├── 2013_ERCOT_Hourly_Load_Data.zip
│   │   └── reading_excel_files.py
│   ├── 14-JSON Playground
│   │   ├── aritist
│   │   ├── musicbrainz.py
│   │   └── release
│   └── Problem Set 1
│       ├── 01-Using CSV Module
│       │   ├── 745090.csv
│       │   └── parsecsv.py
│       ├── 02-Excel To CSV
│       │   ├── 2013_ERCOT_Hourly_Load_Data.xls
│       │   ├── 2013_ERCOT_Hourly_Load_Data.zip
│       │   ├── 2013_Max_Loads.csv
│       │   ├── example.csv
│       │   └── excel_csv.py
│       └── 03-Wrangling JSON
│           ├── nytimes.py
│           └── popular-viewed-1.json
├── Lesson 2 - Data in More Complex Formats
│   ├── 07-Extracting Data
│   │   ├── authors.py
│   │   └── exampleResearchArticle.xml
│   ├── 08-Handling Attributes
│   │   ├── authors.py
│   │   └── exampleResearchArticle.xml
│   ├── 18-Using Beautiful Soup
│   │   ├── html_soup.py
│   │   └── page_source.html
│   └── Problem Set 2
│       ├── 01-Carrier List
│       │   ├── carriers.py
│       │   └── options.html
│       ├── 02-Airport List
│       │   ├── airports.py
│       │   └── options.html
│       ├── 03-Processing All
│       │   ├── data.zip
│       │   ├── data
│       │   │   └── FL-ATL.html
│       │   └── process.py
│       ├── 04-Patent Database
│       │   ├── patent.data
│       │   └── patent.py
│       └── 05-Processing Patents
│           ├── patent.data
│           ├── patent.data-0
│           ├── patent.data-1
│           ├── patent.data-2
│           ├── patent.data-3
│           └── split_data.py
├── Lesson 3 - Data Quality
│   ├── 12-Correcting Validity
│   │   ├── FIXME-autos.csv
│   │   ├── autos-valid.csv
│   │   ├── autos.csv
│   │   └── validity.py
│   └── Problem Set 3
│       ├── 01-Auditing Data Quality
│       │   ├── audit.py
│       │   └── cities.csv
│       ├── 03-Fixing the Area
│       │   ├── area.py
│       │   └── cities.csv
│       ├── 05-Fixing Name
│       │   ├── cities.csv
│       │   └── name.py
│       └── 06-Crossfield Auditing
│           ├── cities.csv
│           └── location.py
├── Lesson 4 - Working with MongoDB
│   ├── 10-Finding Porsche
│   │   ├── example_car.json
│   │   └── find_porsche.py
│   ├── 14-Inserting Multiple Documents
│   │   ├── autos-small.csv
│   │   ├── autos.py
│   │   └── insert.py
│   ├── 18-Range Queries
│   │   ├── example_city.txt
│   │   └── find_cities.py
│   ├── 23-Using $in Operator
│   │   ├── example.json
│   │   └── find_cars.py
│   ├── 25-Dot Notation
│   │   ├── dot_find.py
│   │   └── example_auto.txt
│   └── Problem Set 4
│       ├── 01-Preparing Data
│       │   ├── arachnid.csv
│       │   └── processing.py
│       ├── 02-Inserting into DB
│       │   ├── arachnid.json
│       │   └── dbinsert.py
│       └── 03-Updating Schema
│           ├── arachnid.csv
│           └── update.py
├── Lesson 5 - Analyzing Data
│   ├── 05-Using group
│   │   └── Using group.py
│   ├── 10-Using match and project
│   │   └── Using match and project.py
│   ├── 12-Using unwind
│   │   └── Using unwind.py
│   ├── 14-Using push
│   │   └── Using push.py
│   ├── 16-Same Operator
│   │   └── Same Operator.py
│   └── Problem Set 5
│       ├── 01-Most Common City Name
│       │   └── Most Common City Name.py
│       ├── 02-Region Cities
│       │   └── Region Cities.py
│       └── 03-Average Population
│           └── Average Population.py
├── Lesson 6 - Case Study - OpenStreetMap Data
│   ├── 03-Iterative Parsing
│   │   ├── example.osm
│   │   └── mapparser.py
│   ├── 07-Tag Types
│   │   ├── example.osm
│   │   └── tags.py
│   ├── 08-Exploring Users
│   │   ├── example.osm
│   │   └── users.py
│   ├── 11-Improving Street Names
│   │   ├── audit.py
│   │   └── example.osm
│   └── 12-Preparing for Database - MongoDB
│       ├── data.py
│       ├── example.osm
│       └── example.osm.json
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Mac OS
2 | .DS_Store
3 | 
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 | 
9 | # C extensions
10 | *.so
11 | 
12 | # Distribution / packaging
13 | .Python
14 | env/
15 | build/
16 |
develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # PyBuilder 52 | target/ 53 | 54 | # IPython Notebook 55 | .ipynb_checkpoints 56 | 57 | # PyCharm 58 | .idea 59 | 60 | 61 | -------------------------------------------------------------------------------- /Final Project/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ziyanfeng/udacity-data-wrangling-mongodb/88072c8711f2d00143e80cae6d832251179c7f4d/Final Project/README.md -------------------------------------------------------------------------------- /Final Project/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ziyanfeng/udacity-data-wrangling-mongodb/88072c8711f2d00143e80cae6d832251179c7f4d/Final Project/img/1.png -------------------------------------------------------------------------------- /Final Project/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ziyanfeng/udacity-data-wrangling-mongodb/88072c8711f2d00143e80cae6d832251179c7f4d/Final Project/img/2.png -------------------------------------------------------------------------------- /Lesson 1 - Data Extraction Fundamentals/07-Parsing CSV Files/beatles-diskography.csv: -------------------------------------------------------------------------------- 1 | Title,Released,Label,UK Chart Position,US Chart Position,BPI Certification,RIAA Certification 2 | Please Please Me,22 March 1963,Parlophone(UK),1,—,Gold,Platinum 3 | With the Beatles,22 November 1963,Parlophone(UK),1,—,Platinum,Gold 4 | Beatlemania! With the Beatles,25 November 1963,Capitol(CAN),—,—,, 5 | Introducing... The Beatles,10 January 1964,Vee-Jay(US),—,2,, 6 | Meet the Beatles!,20 January 1964,Capitol(US),—,1,,5xPlatinum 7 | Twist and Shout,3 February 1964,Capitol(CAN),—,—,, 8 | The Beatles' Second Album,10 April 1964,Capitol(US),—,1,,2xPlatinum 9 | The Beatles' Long Tall Sally,11 May 1964,Capitol(CAN),—,—,, 10 | A Hard Day's Night,26 June 1964,United Artists(US)[C],—,1,,4xPlatinum 11 | ,10 July 1964,Parlophone(UK),1,—,Gold, 12 | Something New,20 July 1964,Capitol(US),—,2,,Platinum 13 | Beatles for Sale,4 December 1964,Parlophone(UK),1,—,Gold,Platinum 14 | Beatles '65,15 December 1964,Capitol(US),—,1,,3xPlatinum 15 | Beatles VI,14 June 1965,"Parlophone(NZ), Capitol(US)",—,1,,Platinum 16 | Help!,6 August 1965,Parlophone(UK),1,—,Platinum, 17 | ,13 August 1965,Capitol(US)[C],—,1,,3xPlatinum 18 | Rubber Soul,3 December 1965,Parlophone(UK),1,—,Platinum, 19 | ,6 December 1965,Capitol(US)[C],—,1,,6xPlatinum 20 | Yesterday and Today,15 June 1966,Capitol(US),—,1,,2xPlatinum 21 | Revolver,5 August 1966,Parlophone(UK),1,—,Platinum, 22 | ,8 August 1966,Capitol(US)[C],—,1,,5xPlatinum 23 | Sgt. 
Pepper's Lonely Hearts Club Band,1 June 1967,"Parlophone(UK), Capitol(US)",1,1,3xPlatinum,11xPlatinum
24 | Magical Mystery Tour,27 November 1967,"Parlophone(UK), Capitol(US)",31[D],1,Platinum,6xPlatinum
25 | The Beatles,22 November 1968,"Apple(UK), Capitol(US)",1,1,Platinum,19xPlatinum
26 | Yellow Submarine,13 January 1969,"Apple(UK), Capitol(US)",3,2,Silver,Platinum
27 | Abbey Road,26 September 1969,"Apple(UK), Capitol(US)",1,1,2xPlatinum,12xPlatinum
28 | Let It Be,8 May 1970,"Apple(UK),United Artists(US)",1,1,Gold,4xPlatinum
--------------------------------------------------------------------------------
/Lesson 1 - Data Extraction Fundamentals/07-Parsing CSV Files/simple.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Your task is to read the input DATAFILE line by line, and for the first 10 lines (not including the header)
4 | # split each line on "," and then for each line, create a dictionary
5 | # where the key is the header title of the field, and the value is the value of that field in the row.
6 | # The function parse_file should return a list of dictionaries,
7 | # each data line in the file being a single list entry.
8 | # Field names and values should not contain extra whitespace, like spaces or newline characters.
9 | # You can use the Python string method strip() to remove the extra whitespace.
10 | # You have to parse only the first 10 data lines in this exercise,
11 | # so the returned list should have 10 entries!
12 | import os
13 | 
14 | DATADIR = ""
15 | DATAFILE = "beatles-diskography.csv"
16 | 
17 | 
18 | def parse_file(datafile):
19 |     data = []
20 |     with open(datafile, "r") as f:
21 |         header = f.readline().split(",")
22 | 
23 |         counter = 0
24 |         for line in f:
25 |             if counter == 10:
26 |                 break
27 | 
28 |             fields = line.split(",")
29 |             entry = {}
30 | 
31 |             for i, value in enumerate(fields):
32 |                 entry[header[i].strip()] = value.strip()
33 | 
34 |             data.append(entry)
35 |             counter += 1
36 | 
37 |     return data
38 | 
39 | 
40 | def test():
41 |     # a simple test of your implementation
42 |     datafile = os.path.join(DATADIR, DATAFILE)
43 |     d = parse_file(datafile)
44 |     firstline = {'Title': 'Please Please Me', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)',
45 |                  'Released': '22 March 1963', 'US Chart Position': "\xe2\x80\x94", 'RIAA Certification': 'Platinum',
46 |                  'BPI Certification': 'Gold'}
47 |     tenthline = {'Title': '', 'UK Chart Position': '1', 'Label': 'Parlophone(UK)', 'Released': '10 July 1964',
48 |                  'US Chart Position': "\xe2\x80\x94", 'RIAA Certification': '', 'BPI Certification': 'Gold'}
49 | 
50 |     assert d[0] == firstline
51 |     assert d[9] == tenthline
52 | 
53 | 
54 | test()
55 | # The em dash "—" in the CSV is read as the UTF-8 byte sequence "\xe2\x80\x94" under Python 2
56 | 
--------------------------------------------------------------------------------
/Lesson 1 - Data Extraction Fundamentals/11-Reading Excel Files/2013_ERCOT_Hourly_Load_Data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ziyanfeng/udacity-data-wrangling-mongodb/88072c8711f2d00143e80cae6d832251179c7f4d/Lesson 1 - Data Extraction Fundamentals/11-Reading Excel Files/2013_ERCOT_Hourly_Load_Data.xls
--------------------------------------------------------------------------------
/Lesson 1 - Data Extraction Fundamentals/11-Reading Excel Files/2013_ERCOT_Hourly_Load_Data.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ziyanfeng/udacity-data-wrangling-mongodb/88072c8711f2d00143e80cae6d832251179c7f4d/Lesson 1 - Data Extraction Fundamentals/11-Reading Excel Files/2013_ERCOT_Hourly_Load_Data.zip -------------------------------------------------------------------------------- /Lesson 1 - Data Extraction Fundamentals/11-Reading Excel Files/reading_excel_files.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Your task is as follows: 5 | - read the provided Excel file 6 | - find and return the min, max and average values for the COAST region 7 | - find and return the time value for the min and max entries 8 | - the time values should be returned as Python tuples 9 | 10 | Please see the test function for the expected return format 11 | """ 12 | 13 | import xlrd 14 | from zipfile import ZipFile 15 | 16 | datafile = "2013_ERCOT_Hourly_Load_Data" 17 | 18 | 19 | def open_zip(data_file): 20 | with ZipFile('{0}.zip'.format(data_file), 'r') as myzip: 21 | myzip.extractall() 22 | 23 | 24 | def parse_file(data_file): 25 | workbook = xlrd.open_workbook('{0}.xls'.format(data_file)) 26 | sheet = workbook.sheet_by_index(0) 27 | 28 | # example on how you can get the data 29 | sheet_data = [[sheet.cell_value(rowi, coli) for coli in range(sheet.ncols)] for rowi in range(sheet.nrows)] 30 | 31 | # other useful methods: 32 | # print "\nROWS, COLUMNS, and CELLS:" 33 | # print "Number of rows in the sheet:", 34 | # print sheet.nrows 35 | # print "Type of data in cell (row 3, col 2):", 36 | # print sheet.cell_type(3, 2) 37 | # print "Value in cell (row 3, col 2):", 38 | # print sheet.cell_value(3, 2) 39 | # print "Get a slice of values in column 3, from rows 1-3:" 40 | # print sheet.col_values(3, start_rowx=1, end_rowx=4) 41 | 42 | # print "\nDATES:" 43 | # print "Type of data in cell (row 1, col 0):", 44 | # print sheet.cell_type(1, 0) 45 | # exceltime = sheet.cell_value(1, 0) 46 | # print "Time in Excel format:", 47 | # print exceltime 48 | # print "Convert time to a Python datetime tuple, from the Excel float:", 49 | # print xlrd.xldate_as_tuple(exceltime, 0) 50 | 51 | data = { 52 | 'maxvalue': 0, 53 | 'minvalue': 0, 54 | 'avgcoast': 0, 55 | 'maxtime': (0, 0, 0, 0, 0, 0), 56 | 'mintime': (0, 0, 0, 0, 0, 0) 57 | } 58 | 59 | coastvalues = [sheet_data[r][1] for r in range(1, len(sheet_data))] # COAST value is in column 1 60 | data["maxvalue"] = max(coastvalues) 61 | data["minvalue"] = min(coastvalues) 62 | data["avgcoast"] = sum(coastvalues) / float(len(coastvalues)) 63 | 64 | exceltimes = [sheet_data[r][0] for r in range(1, len(sheet_data))] # time value is in column 0 65 | data["maxtime"] = xlrd.xldate_as_tuple(exceltimes[coastvalues.index(max(coastvalues))], 0) 66 | data["mintime"] = xlrd.xldate_as_tuple(exceltimes[coastvalues.index(min(coastvalues))], 0) 67 | 68 | return data 69 | 70 | 71 | def test(): 72 | open_zip(datafile) 73 | data = parse_file(datafile) 74 | 75 | assert data['maxtime'] == (2013, 8, 13, 17, 0, 0) 76 | assert round(data['maxvalue'], 10) == round(18779.02551, 10) 77 | 78 | 79 | test() 80 | -------------------------------------------------------------------------------- /Lesson 1 - Data Extraction Fundamentals/14-JSON Playground/aritist: -------------------------------------------------------------------------------- 1 | { 2 | "area": { 3 | "id": "8a754a16-0027-3a29-b6d7-2b40ea0481ed", 4 | "name": "United Kingdom", 5 | "sort-name": "United Kingdom" 6 | }, 7 | "country": "GB", 8 | "disambiguation": 
"60s band from the UK", 9 | "id": "9282c8b4-ca0b-4c6b-b7e3-4f7762dfc4d6", 10 | "life-span": { 11 | "begin": "1967", 12 | "ended": null 13 | }, 14 | "name": "Nirvana", 15 | "score": "100", 16 | "sort-name": "Nirvana", 17 | "tags": [ 18 | { 19 | "count": 1, 20 | "name": "rock" 21 | }, 22 | { 23 | "count": 1, 24 | "name": "pop" 25 | }, 26 | { 27 | "count": 1, 28 | "name": "progressive rock" 29 | }, 30 | { 31 | "count": 1, 32 | "name": "orchestral" 33 | }, 34 | { 35 | "count": 1, 36 | "name": "british" 37 | }, 38 | { 39 | "count": 1, 40 | "name": "power pop" 41 | }, 42 | { 43 | "count": 1, 44 | "name": "psychedelic rock" 45 | }, 46 | { 47 | "count": 1, 48 | "name": "soft rock" 49 | }, 50 | { 51 | "count": 1, 52 | "name": "symphonic rock" 53 | }, 54 | { 55 | "count": 1, 56 | "name": "english" 57 | } 58 | ], 59 | "type": "Group" 60 | } -------------------------------------------------------------------------------- /Lesson 1 - Data Extraction Fundamentals/14-JSON Playground/musicbrainz.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # To experiment with this code freely you will have to run this code locally. 4 | # Take a look at the main() function for an example of how to use the code. 5 | # We have provided example json output in the other code editor tabs for you to 6 | # look at, but you will not be able to run any queries through our UI. 7 | 8 | import json 9 | import requests 10 | 11 | 12 | BASE_URL = "http://musicbrainz.org/ws/2/" 13 | ARTIST_URL = BASE_URL + "artist/" 14 | 15 | # query parameters are given to the requests.get function as a dictionary; this 16 | # variable contains some starter parameters. 17 | query_type = {"simple": {}, 18 | "atr": {"inc": "aliases+tags+ratings"}, 19 | "aliases": {"inc": "aliases"}, 20 | "releases": {"inc": "releases"}} 21 | 22 | 23 | def query_site(url, params, uid="", fmt="json"): 24 | # This is the main function for making queries to the musicbrainz API. 25 | # A json document should be returned by the query. 26 | params["fmt"] = fmt 27 | r = requests.get(url + uid, params=params) 28 | print "requesting", r.url 29 | 30 | if r.status_code == requests.codes.ok: 31 | return r.json() 32 | else: 33 | r.raise_for_status() 34 | 35 | 36 | def query_by_name(url, params, name): 37 | # This adds an artist name to the query parameters before making 38 | # an API call to the function above. 39 | params["query"] = "artist:" + name 40 | return query_site(url, params) 41 | 42 | 43 | def pretty_print(data, indent=4): 44 | # After we get our output, we can format it to be more readable 45 | # by using this function. 46 | if type(data) == dict: 47 | print json.dumps(data, indent=indent, sort_keys=True) 48 | else: 49 | print data 50 | 51 | 52 | def main(): 53 | """ 54 | Modify the function calls and indexing below to answer the questions on 55 | the next quiz. HINT: Note how the output we get from the site is a 56 | multi-level JSON document, so try making print statements to step through 57 | the structure one level at a time or copy the output to a separate output 58 | file. 
59 |     """
60 |     # results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
61 |     # pretty_print(results)
62 |     #
63 |     # artist_id = results["artists"][1]["id"]
64 |     # print "\nARTIST:"
65 |     # pretty_print(results["artists"][1])
66 |     #
67 |     # artist_data = query_site(ARTIST_URL, query_type["releases"], artist_id)
68 |     # releases = artist_data["releases"]
69 |     # print "\nONE RELEASE:"
70 |     # pretty_print(releases[0], indent=2)
71 |     # release_titles = [r["title"] for r in releases]
72 |     #
73 |     # print "\nALL TITLES:"
74 |     # for t in release_titles:
75 |     #     print t
76 | 
77 |     # Question 1: How many bands named "First Aid Kit"?
78 |     query_results = query_by_name(ARTIST_URL, query_type["simple"], "First Aid Kit")
79 |     # pretty_print(query_results)
80 |     count_FAK = 0
81 |     for artist in query_results["artists"]:
82 |         if artist["name"] == "First Aid Kit":
83 |             count_FAK += 1
84 |     print "\nQ1: There are {0} bands named First Aid Kit".format(count_FAK)
85 | 
86 |     # Question 2: Begin_area name for Queen?
87 |     query_results = query_by_name(ARTIST_URL, query_type["simple"], "Queen")
88 |     # pretty_print(query_results)
89 |     Queen = query_results["artists"][0]
90 |     print "\nQ2: The begin-area name for Queen is " + Queen["begin-area"]["name"]
91 | 
92 |     # Question 3: Spanish alias for The Beatles?
93 |     query_results = query_by_name(ARTIST_URL, query_type["simple"], "The Beatles")
94 |     # pretty_print(query_results)
95 |     for alias in query_results["artists"][0]["aliases"]:
96 |         if alias["locale"] == "es":
97 |             print "\nQ3: The Spanish alias for The Beatles is " + alias["name"]
98 | 
99 |     # Question 4: Nirvana disambiguation?
100 |     query_results = query_by_name(ARTIST_URL, query_type["simple"], "Nirvana")
101 |     # pretty_print(query_results)
102 |     print "\nQ4: The disambiguation for Nirvana is " + query_results["artists"][0]["disambiguation"]
103 | 
104 |     # Question 5: When was One Direction formed?
105 | query_results = query_by_name(ARTIST_URL, query_type["simple"], "One Direction") 106 | # pretty_print(query_results) 107 | print "\nQ5:One Direction was formed in " + query_results["artists"][0]["life-span"]["begin"] 108 | 109 | if __name__ == '__main__': 110 | main() 111 | -------------------------------------------------------------------------------- /Lesson 1 - Data Extraction Fundamentals/14-JSON Playground/release: -------------------------------------------------------------------------------- 1 | { 2 | "barcode": null, 3 | "country": "GB", 4 | "date": "1969", 5 | "disambiguation": "", 6 | "id": "0b44cb36-550a-491d-bfd9-8751271f9de7", 7 | "packaging": null, 8 | "quality": "normal", 9 | "release-events": [ 10 | { 11 | "area": { 12 | "disambiguation": "", 13 | "id": "8a754a16-0027-3a29-b6d7-2b40ea0481ed", 14 | "iso_3166_1_codes": [ 15 | "GB" 16 | ], 17 | "iso_3166_2_codes": [], 18 | "iso_3166_3_codes": [], 19 | "name": "United Kingdom", 20 | "sort-name": "United Kingdom" 21 | }, 22 | "date": "1969" 23 | } 24 | ], 25 | "status": "Official", 26 | "text-representation": { 27 | "language": "eng", 28 | "script": "Latn" 29 | }, 30 | "title": "To Markos III" 31 | } -------------------------------------------------------------------------------- /Lesson 1 - Data Extraction Fundamentals/Problem Set 1/01-Using CSV Module/745090.csv: -------------------------------------------------------------------------------- 1 | 745090,"MOUNTAIN VIEW MOFFETT FLD NAS",CA,-8.0,37.400,-122.050,12 2 | Date (MM/DD/YYYY),Time (HH:MM),ETR (W/m^2),ETRN (W/m^2),GHI (W/m^2),GHI source,GHI uncert (%),DNI (W/m^2),DNI source,DNI uncert (%),DHI (W/m^2),DHI source,DHI uncert (%),GH illum (lx),GH illum source,Global illum uncert (%),DN illum (lx),DN illum source,DN illum uncert (%),DH illum (lx),DH illum source,DH illum uncert (%),Zenith lum (cd/m^2),Zenith lum source,Zenith lum uncert (%),TotCld (tenths),TotCld source,TotCld uncert (code),OpqCld (tenths),OpqCld source,OpqCld uncert (code),Dry-bulb (C),Dry-bulb source,Dry-bulb uncert (code),Dew-point (C),Dew-point source,Dew-point uncert (code),RHum (%),RHum source,RHum uncert (code),Pressure (mbar),Pressure source,Pressure uncert (code),Wdir (degrees),Wdir source,Wdir uncert (code),Wspd (m/s),Wspd source,Wspd uncert (code),Hvis (m),Hvis source,Hvis uncert (code),CeilHgt (m),CeilHgt source,CeilHgt uncert (code),Pwat (cm),Pwat source,Pwat uncert (code),AOD (unitless),AOD source,AOD uncert (code),Alb (unitless),Alb source,Alb uncert (code),Lprecip depth (mm),Lprecip quantity (hr),Lprecip source,Lprecip uncert (code) 3 | 01/01/2005,01:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,3,E,9,3,E,9,8.0,A,7,6.0,A,7,87,A,7,1013,A,7,150,A,7,2.1,A,7,16100,A,7,77777,A,7,1.1,E,8,0.099,F,8,0.160,F,8,0,1,A,7 4 | 01/01/2005,02:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,7.0,A,7,93,A,7,1013,A,7,0,A,7,0.0,A,7,12900,A,7,930,A,7,1.1,E,8,0.099,F,8,0.160,F,8,0,1,A,7 5 | 01/01/2005,03:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,8,E,9,8,E,9,7.0,A,7,6.0,A,7,93,A,7,1013,A,7,120,A,7,2.1,A,7,16100,A,7,2100,A,7,1.1,E,8,0.099,F,8,0.160,F,8,0,1,A,7 6 | 01/01/2005,04:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,7.0,A,7,93,A,7,1014,A,7,160,A,7,2.6,A,7,16100,A,7,1500,A,7,1.1,E,8,0.099,F,8,0.160,F,8,0,1,A,7 7 | 01/01/2005,05:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,1,E,9,1,E,9,7.0,A,7,6.0,A,7,93,A,7,1014,A,7,120,A,7,1.5,A,7,16100,A,7,77777,A,7,1.1,E,8,0.099,F,8,0.160,F,8,0,1,A,7 8 | 
01/01/2005,06:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,8,E,9,8,E,9,7.0,A,7,6.0,A,7,93,A,7,1015,A,7,0,A,7,0.0,A,7,16100,A,7,2700,A,7,1.1,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 9 | 01/01/2005,07:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,3,E,9,3,E,9,6.0,A,7,5.0,A,7,93,A,7,1015,A,7,120,A,7,2.6,A,7,16100,A,7,77777,A,7,1.1,E,8,0.099,F,8,0.160,F,8,0,1,A,7 10 | 01/01/2005,08:00,41,837,3,2,8,67,2,15,2,2,8,617,2,8,3807,2,15,416,2,8,79,2,19,0,E,9,0,A,7,7.0,A,7,6.0,A,7,93,A,7,1016,A,7,160,A,7,2.6,A,7,16100,A,7,77777,A,7,1.1,E,8,0.099,F,8,0.160,F,8,0,1,A,7 11 | 01/01/2005,09:00,252,1415,120,2,8,406,2,15,48,2,8,12317,2,8,29552,2,15,7025,2,8,854,2,19,0,E,9,0,E,9,8.0,A,7,7.0,A,7,93,A,7,1016,A,7,160,A,7,2.1,A,7,16100,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 12 | 01/01/2005,10:00,451,1415,282,2,8,501,2,15,121,2,8,29039,2,8,45085,2,15,14598,2,8,2319,2,19,8,E,9,8,E,9,11.0,A,7,7.0,A,7,76,A,7,1017,A,7,140,A,7,3.6,A,7,16100,A,7,2400,A,7,1.2,E,8,0.099,F,8,0.160,F,8,1,6,A,7 13 | 01/01/2005,11:00,597,1415,151,2,8,18,2,15,144,2,8,17332,2,8,1313,2,15,16775,2,8,5749,2,19,4,E,9,4,E,9,12.0,A,7,6.0,A,7,67,A,7,1017,A,7,150,A,7,3.1,A,7,16100,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 14 | 01/01/2005,12:00,681,1415,338,2,8,214,2,15,234,2,8,36467,2,8,21729,2,15,25970,2,8,5761,2,19,10,E,9,10,E,9,12.0,A,7,4.0,A,7,58,A,7,1016,A,7,0,A,7,0.0,A,7,16100,A,7,1500,A,7,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 15 | 01/01/2005,13:00,695,1415,342,2,8,236,2,15,226,2,8,37154,2,8,24081,2,15,25279,2,8,5600,2,19,10,E,9,10,E,9,13.0,A,7,5.0,A,7,58,A,7,1016,A,7,310,B,8,3.1,A,7,16100,A,7,1800,A,7,1.3,E,8,0.099,F,8,0.160,F,8,0,1,A,7 16 | 01/01/2005,14:00,639,1415,74,2,8,0,2,15,74,2,8,8979,2,8,0,2,15,8979,2,8,3440,2,19,4,E,9,4,E,9,13.0,A,7,4.0,A,7,54,A,7,1015,A,7,270,B,8,1.5,A,7,16100,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 17 | 01/01/2005,15:00,517,1415,291,2,8,264,2,15,195,2,8,31224,2,8,25246,2,15,21956,2,8,4497,2,19,10,E,9,10,E,9,11.0,A,7,3.0,A,7,58,A,7,1016,A,7,220,A,7,4.6,A,7,16100,A,7,1500,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 18 | 01/01/2005,16:00,338,1415,97,2,8,0,2,15,97,2,8,10737,2,8,0,2,15,10737,2,8,3242,2,19,8,E,9,8,E,9,12.0,A,7,4.0,A,7,58,A,7,1015,A,7,230,A,7,3.6,A,7,16100,A,7,2400,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 19 | 01/01/2005,17:00,115,1403,17,2,8,0,2,15,17,2,8,1978,2,8,0,2,15,1978,2,8,628,2,19,10,E,9,10,E,9,12.0,A,7,5.0,A,7,62,A,7,1015,A,7,230,A,7,2.1,A,7,16100,A,7,1500,A,7,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 20 | 01/01/2005,18:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,10.0,A,7,6.0,A,7,76,A,7,1015,A,7,190,A,7,2.1,A,7,16100,A,7,1500,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 21 | 01/01/2005,19:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,8,E,9,8,E,9,9.0,A,7,6.0,A,7,81,A,7,1016,A,7,160,A,7,1.5,A,7,16100,A,7,2400,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,3,A,7 22 | 01/01/2005,20:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,9.0,A,7,7.0,A,7,87,A,7,1016,A,7,140,A,7,2.1,A,7,16100,A,7,1500,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 23 | 01/01/2005,21:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,9.0,A,7,7.0,A,7,87,A,7,1016,A,7,0,A,7,0.0,A,7,14500,A,7,1440,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 24 | 01/01/2005,22:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,7.0,A,7,93,A,7,1016,A,7,110,A,7,2.6,A,7,16100,A,7,1500,A,7,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 25 | 
01/01/2005,23:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,7.0,A,7,93,A,7,1016,A,7,130,A,7,2.6,A,7,16100,A,7,1500,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 26 | 01/01/2005,24:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,9,E,9,9,E,9,8.0,A,7,6.0,A,7,87,A,7,1016,A,7,150,A,7,2.1,A,7,16100,A,7,2100,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 27 | 01/02/2005,01:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,7.0,A,7,6.0,A,7,93,A,7,1015,A,7,130,A,7,3.1,A,7,16100,A,7,77777,A,7,1.3,E,8,0.099,F,8,0.160,F,8,0,1,A,7 28 | 01/02/2005,02:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,9,E,9,9,E,9,8.0,A,7,6.0,A,7,87,A,7,1015,A,7,150,A,7,3.6,A,7,16100,A,7,1080,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 29 | 01/02/2005,03:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,9.0,A,7,6.0,A,7,81,A,7,1014,A,7,180,A,7,2.6,A,7,16100,A,7,1500,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 30 | 01/02/2005,04:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,6.0,A,7,87,A,7,1014,A,7,140,A,7,2.1,A,7,16100,A,7,1800,A,7,1.3,E,8,0.099,F,8,0.160,F,8,0,1,A,7 31 | 01/02/2005,05:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,7.0,A,7,93,A,7,1013,A,7,110,A,7,2.1,A,7,16100,A,7,1410,A,7,1.3,E,8,0.099,F,8,0.160,F,8,0,1,A,7 32 | 01/02/2005,06:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,6.0,A,7,87,A,7,1013,A,7,150,A,7,3.1,A,7,16100,A,7,990,A,7,1.3,E,8,0.099,F,8,0.160,F,8,0,1,A,7 33 | 01/02/2005,07:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,7.0,A,7,93,A,7,1013,A,7,140,A,7,2.6,A,7,16100,A,7,930,A,7,1.3,E,8,0.099,F,8,0.160,F,8,0,1,A,7 34 | 01/02/2005,08:00,40,837,2,2,8,0,2,15,2,2,8,3,2,8,0,2,15,3,2,8,8,2,19,10,E,9,10,E,9,9.0,A,7,5.0,A,7,76,A,7,1013,A,7,160,A,7,4.1,A,7,16100,A,7,1350,A,7,1.3,E,8,0.099,F,8,0.160,F,8,0,1,A,7 35 | 01/02/2005,09:00,252,1415,60,2,8,10,2,15,58,2,8,67,2,8,3,2,15,67,2,8,201,2,19,10,E,9,10,E,9,8.0,A,7,6.0,A,7,87,A,7,1013,A,7,120,A,7,4.1,A,7,16100,A,7,1350,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 36 | 01/02/2005,10:00,451,1415,53,2,8,0,2,15,53,2,8,63,2,8,0,2,15,63,2,8,229,2,19,10,E,9,10,E,9,9.0,A,7,6.0,A,7,81,A,7,1013,A,7,140,A,7,4.1,A,7,16100,A,7,1500,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,6,A,7 37 | 01/02/2005,11:00,598,1415,108,2,8,0,2,15,108,2,8,126,2,8,0,2,15,126,2,8,462,2,19,10,E,9,10,E,9,9.0,A,7,7.0,A,7,87,A,7,1013,A,7,130,A,7,4.1,A,7,16100,A,7,1410,A,7,1.3,E,8,0.099,F,8,0.160,F,8,0,1,A,7 38 | 01/02/2005,12:00,682,1415,116,2,8,0,2,15,116,2,8,137,2,8,0,2,15,137,2,8,518,2,19,10,E,9,10,E,9,9.0,A,7,7.0,A,7,87,A,7,1012,A,7,130,A,7,4.6,A,7,16100,A,7,1410,A,7,1.3,E,8,0.099,F,8,0.160,F,8,0,1,A,7 39 | 01/02/2005,13:00,697,1415,106,2,8,0,2,15,106,2,8,126,2,8,0,2,15,126,2,8,484,2,19,10,E,9,10,E,9,9.0,A,7,7.0,A,7,87,A,7,1010,A,7,130,A,7,3.1,A,7,16100,A,7,1440,A,7,1.3,E,8,0.099,F,8,0.160,F,8,0,1,A,7 40 | 01/02/2005,14:00,642,1415,74,2,8,0,2,15,74,2,8,90,2,8,0,2,15,90,2,8,344,2,19,10,E,9,10,E,9,9.0,A,7,7.0,A,7,87,A,7,1010,A,7,0,A,7,0.0,A,7,11300,A,7,840,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 41 | 01/02/2005,15:00,520,1415,84,2,8,0,2,15,84,2,8,99,2,8,0,2,15,99,2,8,356,2,19,10,E,9,10,E,9,9.0,A,7,8.0,A,7,93,A,7,1009,A,7,130,A,7,3.1,A,7,9700,A,7,1050,A,7,1.4,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 42 | 01/02/2005,16:00,341,1415,86,2,8,0,2,15,86,2,8,96,2,8,0,2,15,96,2,8,302,2,19,10,E,9,10,E,9,9.0,A,7,8.0,A,7,93,A,7,1009,A,7,130,A,7,3.1,A,7,12900,A,7,1110,A,7,1.4,E,8,0.099,F,8,0.160,F,8,0,1,A,7 43 | 
01/02/2005,17:00,119,1415,0,2,8,0,2,15,0,2,8,0,2,8,0,2,15,0,2,8,0,2,19,10,E,9,10,E,9,9.0,A,7,8.0,A,7,93,A,7,1009,A,7,0,A,7,0.0,A,7,16100,A,7,1080,A,7,1.4,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 44 | 01/02/2005,18:00,0,12,0,2,0,0,2,0,0,2,0,0,2,1,0,2,2,0,2,2,0,2,1,10,E,9,10,E,9,8.0,A,7,8.0,A,7,100,A,7,1009,A,7,180,A,7,2.1,A,7,8000,A,7,960,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 45 | 01/02/2005,19:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,7.0,A,7,93,A,7,1008,A,7,200,A,7,2.1,A,7,11300,A,7,1110,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 46 | 01/02/2005,20:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,7.0,A,7,93,A,7,1008,A,7,140,A,7,1.5,A,7,12900,A,7,1200,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 47 | 01/02/2005,21:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,7.0,A,7,93,A,7,1008,A,7,120,A,7,3.1,A,7,16100,A,7,1800,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 48 | 01/02/2005,22:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,7.0,A,7,93,A,7,1009,A,7,0,A,7,0.0,A,7,16100,A,7,1050,A,7,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 49 | 01/02/2005,23:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,B,8,10,B,8,7.5,B,8,6.5,B,8,93,A,7,1009,B,8,0,B,8,0.0,B,8,-9900,?,0,1100,B,8,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 50 | 01/02/2005,24:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,B,8,10,B,8,7.1,B,8,6.4,B,8,95,A,7,1009,B,8,0,B,8,0.0,B,8,-9900,?,0,1150,B,8,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 51 | 01/03/2005,01:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,9,B,8,9,B,8,6.8,B,8,6.4,B,8,97,A,7,1009,B,8,0,B,8,0.0,B,8,-9900,?,0,1200,B,8,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 52 | 01/03/2005,02:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,9,B,8,9,B,8,6.6,B,8,6.3,B,8,98,A,7,1009,B,8,0,B,8,0.0,B,8,-9900,?,0,1250,B,8,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 53 | 01/03/2005,03:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,9,B,8,9,B,8,6.3,B,8,6.1,B,8,99,A,7,1009,B,8,0,B,8,0.0,B,8,-9900,?,0,1300,B,8,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 54 | 01/03/2005,04:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,9,B,8,9,B,8,6.0,B,8,6.0,B,8,100,A,7,1010,B,8,0,B,8,0.0,B,8,-9900,?,0,1350,B,8,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 55 | 01/03/2005,05:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,8,B,8,8,B,8,5.9,B,8,5.9,B,8,100,A,7,1010,B,8,0,B,8,0.0,B,8,-9900,?,0,1400,B,8,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 56 | 01/03/2005,06:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,8,B,8,8,B,8,5.8,B,8,5.8,B,8,100,A,7,1010,B,8,0,B,8,0.0,B,8,-9900,?,0,1450,B,8,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 57 | 01/03/2005,07:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,8,E,9,8,E,9,6.0,A,7,6.0,A,7,100,A,7,1010,A,7,0,A,7,0.0,A,7,16100,A,7,1500,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 58 | 01/03/2005,08:00,40,837,3,2,8,48,2,15,2,2,8,6,2,8,26,2,15,4,2,8,6,2,19,8,B,8,8,B,8,6.0,B,8,6.0,B,8,100,A,7,1010,B,8,0,B,8,0.0,B,8,-9900,?,0,1350,B,8,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 59 | 01/03/2005,09:00,252,1415,85,2,8,105,2,15,66,2,8,92,2,8,78,2,15,78,2,8,141,2,19,9,B,8,9,B,8,6.9,B,8,6.8,B,8,99,A,7,1010,B,8,0,B,8,0.0,B,8,-9900,?,0,1200,B,8,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 60 | 01/03/2005,10:00,452,1415,153,2,8,51,2,15,136,2,8,167,2,8,47,2,15,152,2,8,356,2,19,9,E,9,9,E,9,8.0,A,7,7.0,A,7,93,A,7,1011,A,7,0,A,7,0.0,A,7,16100,A,7,1050,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 61 | 
01/03/2005,11:00,599,1415,302,2,8,257,2,15,192,2,8,318,2,8,257,2,15,209,2,8,422,2,19,8,E,9,8,E,9,9.0,A,7,7.0,A,7,87,A,7,1010,A,7,340,A,7,1.5,A,7,16100,A,7,1440,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 62 | 01/03/2005,12:00,684,1415,291,2,8,119,2,15,233,2,8,319,2,8,119,2,15,261,2,8,669,2,19,8,E,9,8,E,9,10.0,A,7,6.0,A,7,76,A,7,1009,A,7,340,B,8,1.5,A,7,16100,A,7,1350,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 63 | 01/03/2005,13:00,699,1415,261,2,8,67,2,15,228,2,8,287,2,8,67,2,15,253,2,8,667,2,19,10,E,9,10,E,9,10.0,A,7,7.0,A,7,82,A,7,1008,A,7,340,A,7,3.1,A,7,16100,A,7,1050,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,3,A,7 64 | 01/03/2005,14:00,644,1415,376,2,8,284,2,15,247,2,8,404,2,8,285,2,15,274,2,8,598,2,19,2,E,9,2,E,9,11.0,A,7,7.0,A,7,76,A,7,1008,A,7,350,A,7,3.6,A,7,16100,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 65 | 01/03/2005,15:00,523,1415,76,2,8,0,2,15,76,2,8,90,2,8,0,2,15,90,2,8,328,2,19,4,E,9,4,E,9,11.0,A,7,7.0,A,7,76,A,7,1007,A,7,340,A,7,4.1,A,7,16100,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 66 | 01/03/2005,16:00,344,1415,201,2,8,205,2,15,151,2,8,212,2,8,171,2,15,170,2,8,331,2,19,3,E,9,3,E,9,10.0,A,7,7.0,A,7,82,A,7,1008,A,7,350,A,7,2.6,A,7,16100,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,6,A,7 67 | 01/03/2005,17:00,122,1415,0,2,8,0,2,15,0,2,8,0,2,8,0,2,15,0,2,8,0,2,19,7,E,9,7,E,9,9.0,A,7,6.0,A,7,81,A,7,1008,A,7,350,A,7,3.1,A,7,16100,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 68 | 01/03/2005,18:00,0,35,0,2,8,0,2,15,0,2,8,0,2,8,0,2,15,0,2,8,0,2,19,2,E,9,2,E,9,9.0,A,7,7.0,A,7,87,A,7,1008,A,7,350,A,7,3.6,A,7,16100,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 69 | 01/03/2005,19:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,3,E,9,3,E,9,8.0,A,7,6.0,A,7,87,A,7,1008,A,7,340,A,7,3.1,A,7,16100,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 70 | 01/03/2005,20:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,1,E,9,1,E,9,8.0,A,7,6.0,A,7,87,A,7,1008,A,7,350,A,7,4.1,A,7,16100,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 71 | 01/03/2005,21:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,E,9,0,E,9,7.0,A,7,6.0,A,7,93,A,7,1009,A,7,360,A,7,2.1,A,7,16100,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 72 | 01/03/2005,22:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,7.0,A,7,93,A,7,1009,A,7,0,A,7,0.0,A,7,14500,A,7,510,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 73 | 01/03/2005,23:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,7.0,A,7,93,A,7,1009,A,7,180,A,7,1.5,A,7,16100,A,7,570,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 74 | 01/03/2005,24:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,8,E,9,8,E,9,8.0,A,7,6.0,A,7,87,A,7,1008,A,7,200,A,7,1.5,A,7,16100,A,7,630,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 75 | 01/04/2005,01:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,8,E,9,8,E,9,6.0,A,7,6.0,A,7,100,A,7,1008,A,7,0,A,7,0.0,A,7,11300,A,7,630,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 76 | 01/04/2005,02:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,7.0,A,7,6.0,A,7,93,A,7,1008,A,7,200,A,7,1.5,A,7,14500,A,7,630,A,7,1.1,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 77 | 01/04/2005,03:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,6.0,A,7,6.0,A,7,100,A,7,1008,A,7,250,A,7,1.5,A,7,12900,A,7,330,A,7,1.1,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 78 | 01/04/2005,04:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,7.0,A,7,6.0,A,7,93,A,7,1008,A,7,0,A,7,0.0,A,7,14500,A,7,330,A,7,1.0,E,8,0.099,F,8,0.160,F,8,1,6,A,7 79 | 
01/04/2005,05:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,7.0,A,7,5.0,A,7,87,A,7,1008,A,7,180,A,7,1.5,A,7,12900,A,7,990,A,7,1.0,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 80 | 01/04/2005,06:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,7.0,A,7,6.0,A,7,93,A,7,1008,A,7,140,A,7,1.5,A,7,12900,A,7,1260,A,7,1.1,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 81 | 01/04/2005,07:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,7.0,A,7,6.0,A,7,93,A,7,1008,A,7,130,A,7,2.6,A,7,12900,A,7,810,A,7,1.1,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 82 | 01/04/2005,08:00,40,837,3,2,8,64,2,15,2,2,8,6,2,8,36,2,15,4,2,8,8,2,19,10,E,9,10,E,9,7.0,A,7,5.0,A,7,87,A,7,1009,A,7,140,A,7,1.5,A,7,14500,A,7,750,A,7,1.1,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 83 | 01/04/2005,09:00,252,1415,99,2,8,175,2,15,67,2,8,104,2,8,123,2,15,81,2,8,128,2,19,5,E,9,5,E,9,8.0,A,7,6.0,A,7,87,A,7,1009,A,7,160,A,7,1.5,A,7,12900,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 84 | 01/04/2005,10:00,453,1415,153,2,8,56,2,15,135,2,8,168,2,8,52,2,15,151,2,8,354,2,19,6,E,9,6,E,9,9.0,A,7,6.0,A,7,81,A,7,1010,A,7,110,A,7,1.5,A,7,12900,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 85 | 01/04/2005,11:00,601,1415,129,2,8,0,2,15,129,2,8,149,2,8,0,2,15,149,2,8,532,2,19,7,E,9,7,E,9,11.0,A,7,6.0,A,7,71,A,7,1010,A,7,0,A,7,0.0,A,7,16100,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 86 | 01/04/2005,12:00,686,1415,221,2,8,36,2,15,204,2,8,243,2,8,36,2,15,226,2,8,605,2,19,10,E,9,9,E,9,11.0,A,7,6.0,A,7,71,A,7,1010,A,7,10,A,7,2.6,A,7,16100,A,7,930,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 87 | 01/04/2005,13:00,702,1415,81,2,8,0,2,15,81,2,8,99,2,8,0,2,15,99,2,8,385,2,19,10,E,9,10,E,9,11.0,A,7,6.0,A,7,71,A,7,1010,A,7,20,A,7,1.5,A,7,16100,A,7,1500,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 88 | 01/04/2005,14:00,647,1415,74,2,8,0,2,15,74,2,8,90,2,8,0,2,15,90,2,8,345,2,19,10,E,9,10,E,9,11.0,A,7,5.0,A,7,66,A,7,1010,A,7,0,A,7,0.0,A,7,16100,A,7,780,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 89 | 01/04/2005,15:00,526,1415,69,2,8,0,2,15,69,2,8,82,2,8,0,2,15,82,2,8,303,2,19,10,E,9,10,E,9,11.0,A,7,6.0,A,7,71,A,7,1010,A,7,150,A,7,2.1,A,7,16100,A,7,1500,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 90 | 01/04/2005,16:00,348,1415,105,2,8,0,2,15,105,2,8,116,2,8,0,2,15,116,2,8,346,2,19,10,E,9,10,E,9,11.0,A,7,6.0,A,7,71,A,7,1012,A,7,220,A,7,2.1,A,7,14500,A,7,840,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 91 | 01/04/2005,17:00,126,1415,0,2,8,0,2,15,0,2,8,0,2,8,0,2,15,0,2,8,0,2,19,10,E,9,10,E,9,9.0,A,7,7.0,A,7,87,A,7,1012,A,7,0,A,7,0.0,A,7,16100,A,7,1500,A,7,1.2,E,8,0.099,F,8,0.160,F,8,0,1,A,7 92 | 01/04/2005,18:00,0,59,0,2,8,0,2,15,0,2,8,0,2,8,0,2,15,0,2,8,0,2,19,4,E,9,4,E,9,9.0,A,7,7.0,A,7,87,A,7,1013,A,7,160,A,7,2.1,A,7,16100,A,7,77777,A,7,1.3,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 93 | 01/04/2005,19:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,E,9,0,E,9,9.0,A,7,7.0,A,7,87,A,7,1013,A,7,140,A,7,1.5,A,7,16100,A,7,77777,A,7,1.3,E,8,0.099,F,8,0.160,F,8,0,3,A,7 94 | 01/04/2005,20:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,E,9,0,E,9,8.0,A,7,7.0,A,7,93,A,7,1014,A,7,170,A,7,1.5,A,7,12900,A,7,77777,A,7,1.2,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 95 | 01/04/2005,21:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,4,E,9,4,E,9,8.0,A,7,7.0,A,7,93,A,7,1015,A,7,160,A,7,2.6,A,7,12900,A,7,77777,A,7,1.1,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 96 | 
01/04/2005,22:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,E,9,0,E,9,8.0,A,7,6.0,A,7,87,A,7,1015,A,7,180,A,7,1.5,A,7,16100,A,7,77777,A,7,1.1,E,8,0.099,F,8,0.160,F,8,0,6,A,7 97 | 01/04/2005,23:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,E,9,0,E,9,7.0,A,7,6.0,A,7,93,A,7,1016,A,7,140,A,7,2.6,A,7,16100,A,7,77777,A,7,1.1,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 98 | 01/04/2005,24:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,3,E,9,3,E,9,7.0,A,7,5.0,A,7,87,A,7,1016,A,7,150,A,7,3.1,A,7,16100,A,7,77777,A,7,1.1,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 99 | 01/05/2005,01:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,7.0,A,7,5.0,A,7,87,A,7,1017,A,7,150,A,7,2.6,A,7,16100,A,7,1800,A,7,1.1,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 100 | 01/05/2005,02:00,0,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,0,2,0,10,E,9,10,E,9,8.0,A,7,6.0,A,7,87,A,7,1017,A,7,140,A,7,3.6,A,7,16100,A,7,2100,A,7,1.1,E,8,0.099,F,8,0.160,F,8,-9900,-9900,?,0 101 | -------------------------------------------------------------------------------- /Lesson 1 - Data Extraction Fundamentals/Problem Set 1/01-Using CSV Module/parsecsv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | Your task is to process the supplied file and use the csv module to extract data from it. 5 | The data comes from NREL (National Renewable Energy Laboratory) website. Each file 6 | contains information from one meteorological station, in particular - about amount of 7 | solar and wind energy for each hour of day. 8 | 9 | Note that the first line of the datafile is neither data entry, nor header. It is a line 10 | describing the data source. You should extract the name of the station from it. 11 | 12 | The data should be returned as a list of lists (not dictionaries). 13 | You can use the csv modules "reader" method to get data in such format. 14 | Another useful method is next() - to get the next line from the iterator. 15 | You should only change the parse_file function. 
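For example (sketch only, not part of the graded code):

    reader = csv.reader(f)
    station_line = reader.next()   # the first line, a list like ['745090', 'MOUNTAIN VIEW MOFFETT FLD NAS', 'CA', ...]
    name = station_line[1]         # the station name is the second field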
16 | """ 17 | 18 | import csv 19 | import os 20 | 21 | DATADIR = "" 22 | DATAFILE = "745090.csv" 23 | 24 | 25 | def parse_file(datafile): 26 | name = "" 27 | data = [] 28 | with open(datafile, 'rb') as f: 29 | reader = csv.reader(f, delimiter=',') 30 | name += reader.next()[1] 31 | reader.next() # skip header 32 | for row in reader: 33 | data.append(row) 34 | # Do not change the line below 35 | return name, data 36 | 37 | 38 | def test(): 39 | datafile = os.path.join(DATADIR, DATAFILE) 40 | name, data = parse_file(datafile) 41 | 42 | assert name == "MOUNTAIN VIEW MOFFETT FLD NAS" 43 | assert data[0][1] == "01:00" 44 | assert data[2][0] == "01/01/2005" 45 | assert data[2][5] == "2" 46 | 47 | 48 | if __name__ == "__main__": 49 | test() 50 | -------------------------------------------------------------------------------- /Lesson 1 - Data Extraction Fundamentals/Problem Set 1/02-Excel To CSV/2013_ERCOT_Hourly_Load_Data.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ziyanfeng/udacity-data-wrangling-mongodb/88072c8711f2d00143e80cae6d832251179c7f4d/Lesson 1 - Data Extraction Fundamentals/Problem Set 1/02-Excel To CSV/2013_ERCOT_Hourly_Load_Data.xls -------------------------------------------------------------------------------- /Lesson 1 - Data Extraction Fundamentals/Problem Set 1/02-Excel To CSV/2013_ERCOT_Hourly_Load_Data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ziyanfeng/udacity-data-wrangling-mongodb/88072c8711f2d00143e80cae6d832251179c7f4d/Lesson 1 - Data Extraction Fundamentals/Problem Set 1/02-Excel To CSV/2013_ERCOT_Hourly_Load_Data.zip -------------------------------------------------------------------------------- /Lesson 1 - Data Extraction Fundamentals/Problem Set 1/02-Excel To CSV/2013_Max_Loads.csv: -------------------------------------------------------------------------------- 1 | Station|Year|Month|Day|Hour|Max Load 2 | COAST|2013|8|13|17|18779.025510000003 3 | EAST|2013|8|5|17|2380.1654089999956 4 | FAR_WEST|2013|6|26|17|2281.2722140000024 5 | NORTH|2013|8|7|17|1544.7707140000005 6 | NORTH_C|2013|8|7|18|24415.570226999993 7 | SOUTHERN|2013|8|8|16|5494.157645 8 | SOUTH_C|2013|8|8|18|11433.30491600001 9 | WEST|2013|8|7|17|1862.6137649999998 10 | -------------------------------------------------------------------------------- /Lesson 1 - Data Extraction Fundamentals/Problem Set 1/02-Excel To CSV/example.csv: -------------------------------------------------------------------------------- 1 | Station|Year|Month|Day|Hour|Max Load 2 | COAST|2013|01|01|10|12345.6 3 | EAST|2013|01|01|10|12345.6 4 | FAR_WEST|2013|01|01|10|12345.6 5 | NORTH|2013|01|01|10|12345.6 6 | NORTH_C|2013|01|01|10|12345.6 7 | SOUTHERN|2013|01|01|10|12345.6 8 | SOUTH_C|2013|01|01|10|12345.6 9 | WEST|2013|01|01|10|12345.6 -------------------------------------------------------------------------------- /Lesson 1 - Data Extraction Fundamentals/Problem Set 1/02-Excel To CSV/excel_csv.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | Find the time and value of max load for each of the regions 5 | COAST, EAST, FAR_WEST, NORTH, NORTH_C, SOUTHERN, SOUTH_C, WEST 6 | and write the result out in a csv file, using pipe character | as the delimiter. 7 | 8 | An example output can be seen in the "example.csv" file. 
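A worked example of the date conversion used in parse_file below (the numbers are
illustrative): the Excel serial time 41499.7083333 with datemode 0 gives

    xlrd.xldate_as_tuple(41499.7083333, 0)  ->  (2013, 8, 13, 17, 0, 0)

i.e. 2013-08-13 17:00, the COAST max-load hour listed in 2013_Max_Loads.csv above.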
9 | """ 10 | 11 | import xlrd 12 | import csv 13 | from zipfile import ZipFile 14 | 15 | datafile = "2013_ERCOT_Hourly_Load_Data" 16 | outfile = "2013_Max_Loads.csv" 17 | 18 | 19 | def open_zip(data_file): 20 | with ZipFile('{0}.zip'.format(data_file), 'r') as myzip: 21 | myzip.extractall() 22 | 23 | 24 | def parse_file(data_file): 25 | workbook = xlrd.open_workbook('{0}.xls'.format(data_file)) 26 | sheet = workbook.sheet_by_index(0) 27 | data = [] 28 | # YOUR CODE HERE 29 | # Remember that you can use xlrd.xldate_as_tuple(sometime, 0) to convert 30 | # Excel date to Python tuple of (year, month, day, hour, minute, second) 31 | header = ['Station', 'Year', 'Month', 'Day', 'Hour', 'Max Load'] 32 | data.append(header) 33 | stations = sheet.row_values(0, start_colx=1, end_colx=9) 34 | for i in range(len(stations)): 35 | station_values = sheet.col_values(i + 1, start_rowx=1) 36 | max_row = station_values.index(max(station_values)) + 1 37 | Year, Month, Day, Hour, Minute, Second = xlrd.xldate_as_tuple(sheet.cell_value(max_row, 0), 0) 38 | data.append([stations[i], Year, Month, Day, Hour, max(station_values)]) 39 | 40 | return data 41 | 42 | 43 | def save_file(data, filename): 44 | # YOUR CODE HERE 45 | with open(filename, 'wb') as of: 46 | writer = csv.writer(of, delimiter='|') 47 | for row in data: 48 | writer.writerow(row) 49 | 50 | 51 | def test(): 52 | open_zip(datafile) 53 | data = parse_file(datafile) 54 | save_file(data, outfile) 55 | 56 | number_of_rows = 0 57 | stations = [] 58 | 59 | ans = {'FAR_WEST': {'Max Load': '2281.2722140000024', 60 | 'Year': '2013', 61 | 'Month': '6', 62 | 'Day': '26', 63 | 'Hour': '17'}} 64 | correct_stations = ['COAST', 'EAST', 'FAR_WEST', 'NORTH', 65 | 'NORTH_C', 'SOUTHERN', 'SOUTH_C', 'WEST'] 66 | fields = ['Year', 'Month', 'Day', 'Hour', 'Max Load'] 67 | 68 | with open(outfile) as of: 69 | csvfile = csv.DictReader(of, delimiter="|") 70 | for line in csvfile: 71 | station = line['Station'] 72 | if station == 'FAR_WEST': 73 | for field in fields: 74 | # Check if 'Max Load' is within .1 of answer 75 | if field == 'Max Load': 76 | max_answer = round(float(ans[station][field]), 1) 77 | max_line = round(float(line[field]), 1) 78 | assert max_answer == max_line 79 | 80 | # Otherwise check for equality 81 | else: 82 | assert ans[station][field] == line[field] 83 | 84 | number_of_rows += 1 85 | stations.append(station) 86 | 87 | # Output should be 8 lines not including header 88 | assert number_of_rows == 8 89 | 90 | # Check Station Names 91 | assert set(stations) == set(correct_stations) 92 | 93 | 94 | if __name__ == "__main__": 95 | test() 96 | -------------------------------------------------------------------------------- /Lesson 1 - Data Extraction Fundamentals/Problem Set 1/03-Wrangling JSON/nytimes.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | This exercise shows some important concepts that you should be aware about: 5 | - using codecs module to write unicode files 6 | - using authentication with web APIs 7 | - using offset when accessing web APIs 8 | 9 | To run this code locally you have to register at the NYTimes developer site 10 | and get your own API key. You will be able to complete this exercise in our UI 11 | without doing so, as we have provided a sample result. 
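(For orientation, not an extra requirement: with the provided sample file the first
returned list contains entries such as {'Opinion': 'Professors, We Need You!'} and
the second contains thumbnail URLs ending in '-thumbStandard.jpg'; the exact values
are asserted in test() at the bottom of this file.)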
12 | 
13 | Your task is to process the saved file that represents the most popular
14 | articles (by view count) from the last day, and return the following data:
15 | - list of dictionaries, where the dictionary key is "section" and value is "title"
16 | - list of URLs for all media entries with "format": "Standard Thumbnail"
17 | 
18 | All your changes should be in the article_overview function.
19 | The rest of the functions are provided for your convenience, if you want to access
20 | the API by yourself.
21 | """
22 | 
23 | import json
24 | import codecs
25 | import requests
26 | 
27 | URL_MAIN = "http://api.nytimes.com/svc/"
28 | URL_POPULAR = URL_MAIN + "mostpopular/v2/"
29 | API_KEY = {"popular": "",
30 |            "article": ""}
31 | 
32 | 
33 | def get_from_file(kind, period):
34 |     filename = "popular-{0}-{1}.json".format(kind, period)
35 |     with open(filename, "r") as f:
36 |         return json.loads(f.read())
37 | 
38 | 
39 | def article_overview(kind, period):
40 |     data = get_from_file(kind, period)
41 |     titles = []
42 |     urls = []
43 |     # YOUR CODE HERE
44 |     for article in data:
45 |         section = article['section']
46 |         title = article['title']
47 |         titles.append({section: title})
48 |         for media in article['media']:
49 |             for media_metadata in media['media-metadata']:
50 |                 if media_metadata['format'] == 'Standard Thumbnail':
51 |                     urls.append(media_metadata['url'])
52 | 
53 |     return titles, urls
54 | 
55 | 
56 | def query_site(url, target, offset):
57 |     # This will set up the query with the API key and offset
58 |     # Web services often use an offset parameter to return data in small chunks
59 |     # NYTimes returns 20 articles per request; if you want the next 20
60 |     # you have to provide the offset parameter
61 |     if API_KEY["popular"] == "" or API_KEY["article"] == "":
62 |         print "You need to register for a NYTimes Developer account to run this program."
63 |         print "See Instructor notes for information"
64 |         return False
65 |     params = {"api-key": API_KEY[target], "offset": offset}
66 |     r = requests.get(url, params=params)
67 | 
68 |     if r.status_code == requests.codes.ok:
69 |         return r.json()
70 |     else:
71 |         r.raise_for_status()
72 | 
73 | 
74 | def get_popular(url, kind, days, section="all-sections", offset=0):
75 |     # This function will construct the query according to the requirements of the site
76 |     # and return the data, or print an error message if called incorrectly
77 |     if days not in [1, 7, 30]:
78 |         print "Time period can be 1, 7, or 30 days only"
79 |         return False
80 |     if kind not in ["viewed", "shared", "emailed"]:
81 |         print "kind can be only one of viewed/shared/emailed"
82 |         return False
83 | 
84 |     url += "most{0}/{1}/{2}.json".format(kind, section, days)
85 |     data = query_site(url, "popular", offset)
86 | 
87 |     return data
88 | 
89 | 
90 | def save_file(kind, period):
91 |     # This will process all results by calling the API repeatedly with increasing offset values,
92 |     # combine the data and then write all results in a file.
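    # Note: the first get_popular() call below is hard-coded to "viewed" and a one-day
    # period purely to read num_results; the paginated calls inside the loop use the
    # kind and period arguments that were actually passed in.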
93 | data = get_popular(URL_POPULAR, "viewed", 1) 94 | num_results = data["num_results"] 95 | full_data = [] 96 | with codecs.open("popular-{0}-{1}.json".format(kind, period), encoding='utf-8', mode='w') as v: 97 | for offset in range(0, num_results, 20): 98 | data = get_popular(URL_POPULAR, kind, period, offset=offset) 99 | full_data += data["results"] 100 | 101 | v.write(json.dumps(full_data, indent=2)) 102 | 103 | 104 | def test(): 105 | titles, urls = article_overview("viewed", 1) 106 | assert len(titles) == 20 107 | assert len(urls) == 30 108 | assert titles[2] == {'Opinion': 'Professors, We Need You!'} 109 | assert urls[20] == 'http://graphics8.nytimes.com/images/2014/02/17/sports/ICEDANCE/ICEDANCE-thumbStandard.jpg' 110 | 111 | 112 | if __name__ == "__main__": 113 | test() 114 | -------------------------------------------------------------------------------- /Lesson 2 - Data in More Complex Formats/07-Extracting Data/authors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Your task here is to extract data from xml on authors of an article 4 | # and add it to a list, one item for an author. 5 | # See the provided data structure for the expected format. 6 | # The tags for first name, surname and email should map directly 7 | # to the dictionary keys 8 | 9 | import xml.etree.ElementTree as ET 10 | 11 | article_file = "exampleResearchArticle.xml" 12 | 13 | 14 | def get_root(fname): 15 | tree = ET.parse(fname) 16 | return tree.getroot() 17 | 18 | 19 | def get_authors(root): 20 | authors = [] 21 | for author in root.findall('./fm/bibl/aug/au'): 22 | data = { 23 | "fnm": None, 24 | "snm": None, 25 | "email": None 26 | } 27 | 28 | # YOUR CODE HERE 29 | fnm = author.find('./fnm') 30 | snm = author.find('./snm') 31 | email = author.find('./email') 32 | if fnm is not None: 33 | data['fnm'] = fnm.text 34 | if snm is not None: 35 | data['snm'] = snm.text 36 | if email is not None: 37 | data['email'] = email.text 38 | 39 | authors.append(data) 40 | 41 | return authors 42 | 43 | 44 | def test(): 45 | solution = [{'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'}, 46 | {'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'}, 47 | {'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'}, 48 | {'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'}, 49 | {'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'}, 50 | {'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'}, 51 | {'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'}, 52 | {'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}] 53 | 54 | root = get_root(article_file) 55 | data = get_authors(root) 56 | 57 | assert data[0] == solution[0] 58 | assert data[1]["fnm"] == solution[1]["fnm"] 59 | 60 | 61 | test() 62 | -------------------------------------------------------------------------------- /Lesson 2 - Data in More Complex Formats/08-Handling Attributes/authors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Your task here is to extract data from xml on authors of an article 4 | # and add it to a list, one item for an author. 5 | # See the provided data structure for the expected format. 
6 | # The tags for first name, surname and email should map directly
7 | # to the dictionary keys, but you have to extract the attributes from the "insr" tag
8 | # and add them to the list for the dictionary key "insr"
9 | import xml.etree.ElementTree as ET
10 | 
11 | article_file = "exampleResearchArticle.xml"
12 | 
13 | 
14 | def get_root(fname):
15 |     tree = ET.parse(fname)
16 |     return tree.getroot()
17 | 
18 | 
19 | def get_authors(root):
20 |     authors = []
21 |     for author in root.findall('./fm/bibl/aug/au'):
22 | 
23 |         # YOUR CODE HERE (create a new dict for each author so the list entries do not all share one object)
24 |         data = {
25 |             "fnm": None,
26 |             "snm": None,
27 |             "email": None,
28 |             "insr": []
29 |         }
30 |         data["fnm"] = author.find('./fnm').text
31 |         data["snm"] = author.find('./snm').text
32 |         data["email"] = author.find('./email').text
33 |         insr = author.findall('./insr')
34 |         for i in insr:
35 |             data["insr"].append(i.attrib["iid"])
36 | 
37 |         authors.append(data)
38 | 
39 |     return authors
40 | 
41 | 
42 | def test():
43 |     solution = [{'insr': ['I1'], 'fnm': 'Omer', 'snm': 'Mei-Dan', 'email': 'omer@extremegate.com'},
44 |                 {'insr': ['I2'], 'fnm': 'Mike', 'snm': 'Carmont', 'email': 'mcarmont@hotmail.com'},
45 |                 {'insr': ['I3', 'I4'], 'fnm': 'Lior', 'snm': 'Laver', 'email': 'laver17@gmail.com'},
46 |                 {'insr': ['I3'], 'fnm': 'Meir', 'snm': 'Nyska', 'email': 'nyska@internet-zahav.net'},
47 |                 {'insr': ['I8'], 'fnm': 'Hagay', 'snm': 'Kammar', 'email': 'kammarh@gmail.com'},
48 |                 {'insr': ['I3', 'I5'], 'fnm': 'Gideon', 'snm': 'Mann', 'email': 'gideon.mann.md@gmail.com'},
49 |                 {'insr': ['I6'], 'fnm': 'Barnaby', 'snm': 'Clarck', 'email': 'barns.nz@gmail.com'},
50 |                 {'insr': ['I7'], 'fnm': 'Eugene', 'snm': 'Kots', 'email': 'eukots@gmail.com'}]
51 | 
52 |     root = get_root(article_file)
53 |     data = get_authors(root)
54 | 
55 |     assert data[0] == solution[0]
56 |     assert data[1]["insr"] == solution[1]["insr"]
57 | 
58 | 
59 | test()
60 | 
--------------------------------------------------------------------------------
/Lesson 2 - Data in More Complex Formats/18-Using Beautiful Soup/html_soup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 | 
4 | # Please note that the function 'make_request' is provided for your reference only.
5 | # You will not be able to actually use it from within the Udacity web UI.
6 | # Your task is to process the HTML using BeautifulSoup, extract the hidden
7 | # form field values for "__EVENTVALIDATION" and "__VIEWSTATE" and set the appropriate
8 | # values in the data dictionary.
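# (Background, assuming the usual ASP.NET page structure: __EVENTVALIDATION and
# __VIEWSTATE are hidden input elements, which is why the solution below can locate
# them by id and read their 'value' attributes.)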
9 | # All your changes should be in the 'extract_data' function 10 | 11 | from bs4 import BeautifulSoup 12 | import requests 13 | 14 | html_page = "page_source.html" 15 | 16 | 17 | def extract_data(page): 18 | data = {"eventvalidation": "", 19 | "viewstate": ""} 20 | with open(page, "r") as html: 21 | # do something here to find the necessary values 22 | soup = BeautifulSoup(html, "lxml") 23 | ev = soup.find(id='__EVENTVALIDATION') 24 | data['eventvalidation'] = ev['value'] 25 | 26 | vs = soup.find(id='__VIEWSTATE') 27 | data['viewstate'] = vs['value'] 28 | 29 | return data 30 | 31 | 32 | def make_request(data): 33 | eventvalidation = data["eventvalidation"] 34 | viewstate = data["viewstate"] 35 | 36 | r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2", 37 | data={'AirportList': "BOS", 38 | 'CarrierList': "VX", 39 | 'Submit': 'Submit', 40 | "__EVENTTARGET": "", 41 | "__EVENTARGUMENT": "", 42 | "__EVENTVALIDATION": eventvalidation, 43 | "__VIEWSTATE": viewstate 44 | }) 45 | 46 | return r.text 47 | 48 | 49 | def test(): 50 | data = extract_data(html_page) 51 | assert data["eventvalidation"] != "" 52 | assert data["eventvalidation"].startswith("/wEWjAkCoIj1ng0") 53 | assert data["viewstate"].startswith("/wEPDwUKLTI") 54 | 55 | 56 | test() 57 | -------------------------------------------------------------------------------- /Lesson 2 - Data in More Complex Formats/Problem Set 2/01-Carrier List/carriers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Please note that the function 'make_request' is provided for your reference only. 6 | You will not be able to to actually use it from within the Udacity web UI. 7 | All your changes should be in the 'extract_carrier' function. 8 | Also note that the html file is a stripped down version of what is actually on 9 | the website. 10 | 11 | Your task in this exercise is to get a list of all airlines. Exclude all of the 12 | combination values like "All U.S. Carriers" from the data that you return. 13 | You should return a list of codes for the carriers. 
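For reference, the carrier dropdown in options.html is assumed to look roughly
like this (simplified), which is why keeping only two-character values returns
real carrier codes and drops the combination entries:

    <select id="CarrierList">
        <option value="All">All U.S. and Foreign Carriers</option>
        <option value="AllUS">All U.S. Carriers</option>
        <option value="FL">AirTran Airways</option>
        <option value="NK">Spirit Air Lines</option>
        ...
    </select>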
14 | """ 15 | 16 | from bs4 import BeautifulSoup 17 | import requests 18 | 19 | html_page = "options.html" 20 | 21 | 22 | def extract_carriers(page): 23 | data = [] 24 | 25 | with open(page, "r") as html: 26 | # do something here to find the necessary values 27 | soup = BeautifulSoup(html, "lxml") 28 | for carrier in soup.find(id="CarrierList").find_all("option"): 29 | if len(carrier['value']) == 2: 30 | data.append(carrier['value']) 31 | 32 | return data 33 | 34 | 35 | def make_request(data): 36 | eventvalidation = data["eventvalidation"] 37 | viewstate = data["viewstate"] 38 | airport = data["airport"] 39 | carrier = data["carrier"] 40 | 41 | r = requests.post("http://www.transtats.bts.gov/Data_Elements.aspx?Data=2", 42 | data={'AirportList': airport, 43 | 'CarrierList': carrier, 44 | 'Submit': 'Submit', 45 | "__EVENTTARGET": "", 46 | "__EVENTARGUMENT": "", 47 | "__EVENTVALIDATION": eventvalidation, 48 | "__VIEWSTATE": viewstate 49 | }) 50 | 51 | return r.text 52 | 53 | 54 | def test(): 55 | data = extract_carriers(html_page) 56 | assert len(data) == 16 57 | assert "FL" in data 58 | assert "NK" in data 59 | 60 | test() 61 | -------------------------------------------------------------------------------- /Lesson 2 - Data in More Complex Formats/Problem Set 2/02-Airport List/airports.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | Complete the 'extract_airports' function so that it returns a list of airport 6 | codes, excluding any combinations like "All". 7 | """ 8 | 9 | from bs4 import BeautifulSoup 10 | html_page = "options.html" 11 | 12 | 13 | def extract_airports(page): 14 | data = [] 15 | with open(page, "r") as html: 16 | # do something here to find the necessary values 17 | soup = BeautifulSoup(html, "lxml") 18 | for airport in soup.find(id="AirportList").find_all("option"): 19 | if len(airport['value']) == 3: 20 | if airport['value'] != 'All': 21 | data.append(airport['value']) 22 | 23 | return data 24 | 25 | 26 | def test(): 27 | data = extract_airports(html_page) 28 | assert len(data) == 15 29 | assert "ATL" in data 30 | assert "ABR" in data 31 | 32 | test() 33 | -------------------------------------------------------------------------------- /Lesson 2 - Data in More Complex Formats/Problem Set 2/03-Processing All/data.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ziyanfeng/udacity-data-wrangling-mongodb/88072c8711f2d00143e80cae6d832251179c7f4d/Lesson 2 - Data in More Complex Formats/Problem Set 2/03-Processing All/data.zip -------------------------------------------------------------------------------- /Lesson 2 - Data in More Complex Formats/Problem Set 2/03-Processing All/process.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Let's assume that you combined the code from the previous 2 exercises with code 5 | from the lesson on how to build requests, and downloaded all the data locally. 6 | The files are in a directory "data", named after the carrier and airport: 7 | "{}-{}.html".format(carrier, airport), for example "FL-ATL.html". 8 | 9 | The table with flight info has a table class="dataTDRight". Your task is to 10 | extract the flight data from that table as a list of dictionaries, each 11 | dictionary containing relevant data from the file and table row. 
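The rows being parsed are assumed to look roughly like this (markup simplified,
numbers illustrative), with the month column carrying the string TOTAL on the
yearly summary rows that must be skipped:

    <table class="dataTDRight">
        <tr class="dataTDRight"><td>2002</td><td>10</td><td>815,489</td><td>92,565</td><td>908,054</td></tr>
        ...
        <tr class="dataTDRight"><td>2002</td><td>TOTAL</td><td>...</td><td>...</td><td>...</td></tr>
    </table>

Note that, as written, the process_file solution below appends one and the same
info dictionary for every row (the pitfall its own inline comment warns about),
so all rows parsed from a file end up as references to a single dict; building a
fresh per-row dictionary, e.g. row = dict(info), avoids that.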
This is an 12 | example of the data structure you should return: 13 | 14 | data = [{"courier": "FL", 15 | "airport": "ATL", 16 | "year": 2012, 17 | "month": 12, 18 | "flights": {"domestic": 100, 19 | "international": 100} 20 | }, 21 | {"courier": "..."} 22 | ] 23 | 24 | Note - year, month, and the flight data should be integers. 25 | You should skip the rows that contain the TOTAL data for a year. 26 | 27 | There are couple of helper functions to deal with the data files. 28 | Please do not change them for grading purposes. 29 | All your changes should be in the 'process_file' function. 30 | """ 31 | from bs4 import BeautifulSoup 32 | from zipfile import ZipFile 33 | import os 34 | 35 | datadir = "data" 36 | 37 | 38 | def open_zip(data_dir): 39 | with ZipFile('{0}.zip'.format(data_dir), 'r') as myzip: 40 | myzip.extractall() 41 | 42 | 43 | def process_all(data_dir): 44 | files = os.listdir(data_dir) 45 | return files 46 | 47 | 48 | def process_file(f): 49 | """ 50 | This function extracts data from the file given as the function argument in 51 | a list of dictionaries. This is example of the data structure you should 52 | return: 53 | 54 | data = [{"courier": "FL", 55 | "airport": "ATL", 56 | "year": 2012, 57 | "month": 12, 58 | "flights": {"domestic": 100, 59 | "international": 100} 60 | }, 61 | {"courier": "..."} 62 | ] 63 | 64 | 65 | Note - year, month, and the flight data should be integers. 66 | You should skip the rows that contain the TOTAL data for a year. 67 | """ 68 | data = [] 69 | info = dict() 70 | info["courier"], info["airport"] = f[:6].split("-") 71 | # Note: create a new dictionary for each entry in the output data list. 72 | # If you use the info dictionary defined here each element in the list 73 | # will be a reference to the same info dictionary. 74 | with open("{}/{}".format(datadir, f), "r") as html: 75 | soup = BeautifulSoup(html, 'lxml') 76 | trs = soup.find('table', class_='dataTDRight').find_all('tr', class_='dataTDRight') 77 | for tr in trs: 78 | tds = tr.find_all('td') 79 | if tds[1].text != 'TOTAL': 80 | info["year"] = int(tds[0].text) 81 | info["month"] = int(tds[1].text) 82 | info["flights"] = {"domestic": int(tds[2].text.replace(',', '')), 83 | "international": int(tds[3].text.replace(',', ''))} 84 | data.append(info) 85 | 86 | return data 87 | 88 | 89 | def test(): 90 | print "Running a simple test..." 91 | open_zip(datadir) 92 | files = process_all(datadir) 93 | data = [] 94 | # Test will loop over three data files. 95 | for f in files: 96 | data += process_file(f) 97 | 98 | assert len(data) == 399 # Total number of rows 99 | for entry in data[:3]: 100 | assert type(entry["year"]) == int 101 | assert type(entry["month"]) == int 102 | assert type(entry["flights"]["domestic"]) == int 103 | assert len(entry["airport"]) == 3 104 | assert len(entry["courier"]) == 2 105 | assert data[0]["courier"] == 'FL' 106 | assert data[0]["month"] == 10 107 | assert data[-1]["airport"] == "ATL" 108 | assert data[-1]["flights"] == {'international': 108289, 'domestic': 701425} 109 | 110 | print "... success!" 111 | 112 | 113 | if __name__ == "__main__": 114 | test() 115 | -------------------------------------------------------------------------------- /Lesson 2 - Data in More Complex Formats/Problem Set 2/04-Patent Database/patent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | This and the following exercise are using US Patent database. 
The patent.data 6 | file is a small excerpt of much larger datafiles that are available for 7 | download from US Patent website. These files are pretty large ( >100 MB each). 8 | The original file is ~600MB large, you might not be able to open it in a text 9 | editor. 10 | 11 | The data itself is in XML, however there is a problem with how it's formatted. 12 | Please run this script and observe the error. Then find the line that is 13 | causing the error. You can do that by just looking at the datafile in the web 14 | UI, or programmatically. For quiz purposes it does not matter, but as an 15 | exercise we suggest that you try to do it programmatically. 16 | 17 | NOTE: You do not need to correct the error - for now, just find where the error 18 | is occurring. 19 | """ 20 | 21 | import xml.etree.ElementTree as ET 22 | 23 | PATENTS = 'patent.data' 24 | 25 | 26 | def get_root(fname): 27 | 28 | tree = ET.parse(fname) 29 | return tree.getroot() 30 | 31 | 32 | get_root(PATENTS) 33 | 34 | 35 | # Quiz: Result of Parsing the Datafile 36 | # Please enter content of the line that is causing the error: 37 | # line 657 38 | # 39 | 40 | # What do you think is the problem? 41 | # multiple xml schema in the same file 42 | -------------------------------------------------------------------------------- /Lesson 2 - Data in More Complex Formats/Problem Set 2/05-Processing Patents/patent.data-1: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | US 8 | D0696837 9 | S1 10 | 20140107 11 | 12 | 13 | 14 | 15 | US 16 | 29422143 17 | 20120517 18 | 19 | 20 | 29 21 | 22 | 14 23 | 24 | 25 | 10 26 | 0101 27 | 28 | 29 | US 30 | D 1128 31 | 32 | Swirled cookie with three icing stripes 33 | 34 | 35 | 36 | 37 | US 38 | D234099 39 | S 40 | Gobble 41 | 19750100 42 | 43 | 44 | cited by examiner 45 | USD 1129 46 | 47 | 48 | 49 | 50 | US 51 | 6371755 52 | B1 53 | Dearth 54 | 20020400 55 | 56 | 57 | cited by examiner 58 | US431291 59 | 60 | 61 | 62 | 63 | US 64 | 6468569 65 | B1 66 | Dunker et al. 67 | 20021000 68 | 69 | 70 | cited by examiner 71 | US426 94 72 | 73 | 74 | 75 | 76 | US 77 | D493604 78 | S 79 | Willoughby 80 | 20040800 81 | 82 | 83 | cited by examiner 84 | USD 1129 85 | 86 | 87 | 88 | 89 | US 90 | D512199 91 | S 92 | Ibanez 93 | 20051200 94 | 95 | 96 | cited by examiner 97 | USD 1129 98 | 99 | 100 | 101 | 102 | US 103 | D530483 104 | S 105 | Dahl et al. 106 | 20061000 107 | 108 | 109 | cited by examiner 110 | USD 1130 111 | 112 | 113 | 114 | 115 | US 116 | D537562 117 | S 118 | Taniguchi et al. 119 | 20070200 120 | 121 | 122 | cited by examiner 123 | USD28 4 124 | 125 | 126 | 127 | 128 | US 129 | D537563 130 | S 131 | Taniguchi et al. 132 | 20070200 133 | 134 | 135 | cited by examiner 136 | USD28 4 137 | 138 | 139 | 140 | 141 | US 142 | D600426 143 | S 144 | Dennison 145 | 20090900 146 | 147 | 148 | cited by examiner 149 | USD 1125 150 | 151 | 152 | 153 | 154 | US 155 | D609877 156 | S 157 | Kadow-Dougherty 158 | 20100200 159 | 160 | 161 | cited by examiner 162 | USD 1127 163 | 164 | 165 | 166 | 167 | US 168 | D632842 169 | S 170 | Taniguchi 171 | 20110200 172 | 173 | 174 | cited by examiner 175 | USD28 4 176 | 177 | 178 | 179 | 180 | US 181 | D644817 182 | S 183 | Hanna et al. 184 | 20110900 185 | 186 | 187 | cited by examiner 188 | USD 1115 189 | 190 | 191 | 192 | 193 | US 194 | 2003/0157222 195 | A1 196 | Henry et al. 
197 | 20030800 198 | 199 | 200 | cited by examiner 201 | US426 94 202 | 203 | 204 | 1 205 | 1 206 | 207 | 208 | US 209 | D 1100-130 210 | unstructured 211 | 212 | 213 | US 214 | D 1199 215 | 216 | 217 | US 218 | D11 14 219 | 220 | 221 | US 222 | D11 30 223 | 224 | 225 | US 226 | D11 44 227 | 228 | 229 | US 230 | D11 48 231 | 232 | 233 | US 234 | D11 81 235 | 236 | 237 | US 238 | D 7359 239 | 240 | 241 | US 242 | D 7677 243 | 244 | 245 | US 246 | D 7900 247 | 248 | 249 | US 250 | D28 4 251 | 252 | 253 | US 254 | 426 94 255 | 256 | 257 | US 258 | 426503 259 | 260 | 261 | US 262 | 264259 263 | 264 | 265 | 266 | 2 267 | 6 268 | 269 | 270 | 271 | 272 | 273 | Lu 274 | Wei 275 |
276 | Kalamazoo 277 | MI 278 | US 279 |
280 |
281 | 282 | US 283 | 284 |
285 | 286 | 287 | Quinlan 288 | Glenn 289 |
290 | Battle Creek 291 | MI 292 | US 293 |
294 |
295 | 296 | US 297 | 298 |
299 | 300 | 301 | Okrasinski 302 | Nathan 303 |
304 | Augusta 305 | MI 306 | US 307 |
308 |
309 | 310 | US 311 | 312 |
313 |
314 | 315 | 316 | 317 | Lu 318 | Wei 319 |
320 | Kalamazoo 321 | MI 322 | US 323 |
324 |
325 |
326 | 327 | 328 | Quinlan 329 | Glenn 330 |
331 | Battle Creek 332 | MI 333 | US 334 |
335 |
336 |
337 | 338 | 339 | Okrasinski 340 | Nathan 341 |
342 | Augusta 343 | MI 344 | US 345 |
346 |
347 |
348 |
349 | 350 | 351 | 352 | Dickinson Wright PLLC 353 |
354 | unknown 355 |
356 |
357 |
358 |
359 |
360 | 361 | 362 | 363 | Kellogg Company 364 | 02 365 |
366 | Battle Creek 367 | MI 368 | US 369 |
370 |
371 |
372 |
373 | 374 | 375 | Brooks 376 | Cathron 377 | 2911 378 | 379 | 380 | Mroczka 381 | Katie 382 | 383 | 384 |
385 | 386 |
387 | embedded image 388 |
389 |
390 | embedded image 391 |
392 |
393 | embedded image 394 |
395 |
396 | 397 | 398 | 399 |

FIG. 1 is a top view of the swirled cookie with three icing stripes of the present invention.

400 |

FIG. 2 is a first end view of the swirled cookie with three icing stripes of the present invention.

401 |

FIG. 3 is a second end view of the swirled cookie with three icing stripes of the present invention.

402 |

FIG. 4 is a first side view of the swirled cookie with three icing stripes of the present invention.

403 |

FIG. 5 is a second side view of the swirled cookie with three icing stripes of the present invention; and,

404 |

FIG. 6 is a bottom view of the swirled cookie with three icing stripes of the present invention.

405 |
406 | 407 |
408 | CLAIM 409 | 410 | 411 | We claim the ornamental design for the swirled cookie with three icing stripes, as shown. 412 | 413 | 414 |
415 | -------------------------------------------------------------------------------- /Lesson 2 - Data in More Complex Formats/Problem Set 2/05-Processing Patents/patent.data-3: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | US 8 | D0696839 9 | S1 10 | 20140107 11 | 12 | 13 | 14 | 15 | US 16 | 29455647 17 | 20130523 18 | 19 | 20 | 29 21 | 22 | 14 23 | 24 | 25 | 10 26 | 0202 27 | 28 | 29 | US 30 | D 2743 31 | D2739 32 | 33 | Medical scrubs having removable cuffs 34 | 35 | 36 | 37 | 38 | US 39 | 4601066 40 | A 41 | Campbell 42 | 19860700 43 | 44 | 45 | cited by examiner 46 | US 2 70 47 | 48 | 49 | 50 | 51 | US 52 | 4737995 53 | A 54 | Wiley 55 | 19880400 56 | 57 | 58 | cited by examiner 59 | US 2114 60 | 61 | 62 | 63 | 64 | US 65 | 4860388 66 | A 67 | Dean 68 | 19890800 69 | 70 | 71 | cited by examiner 72 | US 2247 73 | 74 | 75 | 76 | 77 | US 78 | D341919 79 | S 80 | Grimes 81 | 19931200 82 | 83 | 84 | cited by examiner 85 | USD 2743 86 | 87 | 88 | 89 | 90 | US 91 | 5539930 92 | A 93 | Sesselmann 94 | 19960700 95 | 96 | 97 | cited by examiner 98 | US 22431 99 | 100 | 101 | 102 | 103 | US 104 | D452988 105 | S 106 | Ergezinger et al. 107 | 20020100 108 | 109 | 110 | cited by examiner 111 | USD 2743 112 | 113 | 114 | 115 | 116 | US 117 | D476137 118 | S 119 | McFarlane 120 | 20030600 121 | 122 | 123 | cited by applicant 124 | 125 | 126 | 127 | 128 | US 129 | 7203974 130 | B2 131 | Jones et al. 132 | 20070400 133 | 134 | 135 | cited by applicant 136 | 137 | 138 | 139 | 140 | US 141 | D551826 142 | S 143 | Lambert 144 | 20071000 145 | 146 | 147 | cited by examiner 148 | USD 2742 149 | 150 | 151 | 152 | 153 | US 154 | D627137 155 | S 156 | Keavey et al. 157 | 20101100 158 | 159 | 160 | cited by applicant 161 | 162 | 163 | 164 | 165 | US 166 | D661466 167 | S 168 | Taft 169 | 20120600 170 | 171 | 172 | cited by examiner 173 | USD 2743 174 | 175 | 176 | 177 | 178 | US 179 | 2012/0311759 180 | A1 181 | Moore 182 | 20121200 183 | 184 | 185 | cited by examiner 186 | US 2 69 187 | 188 | 189 | 1 190 | 1 191 | 192 | 193 | US 194 | D 2717 195 | 196 | 197 | US 198 | D 2731 199 | 200 | 201 | US 202 | D 2749 203 | 204 | 205 | US 206 | D 2750 207 | 208 | 209 | US 210 | D 2840 211 | 212 | 213 | US 214 | D 2841 215 | 216 | 217 | US 218 | D 2844 219 | 220 | 221 | US 222 | D 2847 223 | 224 | 225 | US 226 | D 2848 227 | 228 | 229 | US 230 | D 2850 231 | 232 | 233 | US 234 | D 2853 235 | 236 | 237 | US 238 | D 2742 239 | 240 | 241 | US 242 | D 2743 243 | 244 | 245 | US 246 | D 2857 247 | 248 | 249 | US 250 | D 2739 251 | 252 | 253 | US 254 | D 2720 255 | 256 | 257 | US 258 | 2 90 259 | 260 | 261 | US 262 | 2106 263 | 264 | 265 | US 266 | 2113 267 | 268 | 269 | US 270 | 2115 271 | 272 | 273 | US 274 | 2227 275 | 276 | 277 | US 278 | 2228 279 | 280 | 281 | US 282 | 2242 283 | 284 | 285 | US 286 | 2247 287 | 288 | 289 | 290 | 6 291 | 9 292 | 293 | 294 | 295 | 296 | 297 | Harris 298 | Marilyn 299 |
300 | Powder Springs 301 | GA 302 | US 303 |
304 |
305 | 306 | US 307 | 308 |
309 |
310 | 311 | 312 | 313 | Harris 314 | Marilyn 315 |
316 | Powder Springs 317 | GA 318 | US 319 |
320 |
321 |
322 |
323 | 324 | 325 | 326 | Crose Law LLC 327 |
328 | unknown 329 |
330 |
331 |
332 | 333 | 334 | Crose 335 | Bradley D. 336 |
337 | unknown 338 |
339 |
340 |
341 |
342 |
343 | 344 | 345 | Johnson 346 | Rashida 347 | 2916 348 | 349 | 350 |
351 | 352 |
353 | embedded image 354 |
355 |
356 | embedded image 357 |
358 |
359 | embedded image 360 |
361 |
362 | embedded image 363 |
364 |
365 | embedded image 366 |
367 |
368 | embedded image 369 |
370 |
371 | embedded image 372 |
373 |
374 | 375 | 376 | 377 |

FIG. 1 is a front perspective view of medical scrubs having removable cuffs illustrating the new, original and ornamental design;

378 |

FIG. 2 is a cross-sectional view thereof, taken along line 2-2 in FIG. 1;

379 |

FIG. 3 is a front elevational view thereof;

380 |

FIG. 4 is a rear elevational view thereof;

381 |

FIG. 5 is a right side elevational view thereof;

382 |

FIG. 6 is a left side elevational view thereof;

383 |

FIG. 7 is a top plan view thereof;

384 |

FIG. 8 is a bottom plan view thereof; and,

385 |

FIG. 9 is a front perspective view thereof, showing the cuffs unattached.

386 |

The broken lines shown in the drawings are included for the purpose of illustrating environmental structure and form no part of the claimed design.

387 |
388 | 389 |
390 | CLAIM 391 | 392 | 393 | The ornamental design for medical scrubs having removable cuffs, as shown and described. 394 | 395 | 396 |
397 | -------------------------------------------------------------------------------- /Lesson 2 - Data in More Complex Formats/Problem Set 2/05-Processing Patents/split_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # So, the problem is that the gigantic file is actually not a valid XML, because 5 | # it has several root elements, and XML declarations. 6 | # It is, a matter of fact, a collection of a lot of concatenated XML documents. 7 | # So, one solution would be to split the file into separate documents, 8 | # so that you can process the resulting files as valid XML documents. 9 | 10 | import xml.etree.ElementTree as ET 11 | 12 | PATENTS = 'patent.data' 13 | 14 | 15 | def get_root(fname): 16 | tree = ET.parse(fname) 17 | return tree.getroot() 18 | 19 | 20 | def split_file(filename): 21 | """ 22 | Split the input file into separate files, each containing a single patent. 23 | As a hint - each patent declaration starts with the same line that was 24 | causing the error found in the previous exercises. 25 | 26 | The new files should be saved with filename in the following format: 27 | "{}-{}".format(filename, n) where n is a counter, starting from 0. 28 | """ 29 | with open(filename, 'r') as reader: 30 | filenum = -1 31 | writer = None 32 | for line in reader: 33 | if line.find('') != -1: 34 | if writer: 35 | writer.close() 36 | filenum += 1 37 | writer = open("{}-{}".format(filename, filenum), 'w') 38 | writer.write(line) 39 | writer.close() 40 | 41 | 42 | def test(): 43 | split_file(PATENTS) 44 | for n in range(4): 45 | try: 46 | fname = "{}-{}".format(PATENTS, n) 47 | f = open(fname, "r") 48 | if not f.readline().startswith(" len(area1): 32 | area = float(area0) 33 | else: 34 | area = float(area1) 35 | elif area == 'NULL' or '': 36 | area = None 37 | else: 38 | area = float(area) 39 | 40 | return area 41 | 42 | 43 | def process_file(filename): 44 | # CHANGES TO THIS FUNCTION WILL BE IGNORED WHEN YOU SUBMIT THE EXERCISE 45 | data = [] 46 | 47 | with open(filename, "r") as f: 48 | reader = csv.DictReader(f) 49 | 50 | # skipping the extra metadata 51 | for i in range(3): 52 | reader.next() 53 | 54 | # processing file 55 | for line in reader: 56 | # calling your function to fix the area value 57 | if "areaLand" in line: 58 | line["areaLand"] = fix_area(line["areaLand"]) 59 | data.append(line) 60 | 61 | return data 62 | 63 | 64 | def test(): 65 | data = process_file(CITIES) 66 | 67 | print "Printing three example results:" 68 | for n in range(5, 8): 69 | pprint.pprint(data[n]["areaLand"]) 70 | 71 | assert data[3]["areaLand"] is None 72 | assert data[8]["areaLand"] == 55166700.0 73 | assert data[20]["areaLand"] == 14581600.0 74 | assert data[33]["areaLand"] == 20564500.0 75 | 76 | 77 | if __name__ == "__main__": 78 | test() 79 | -------------------------------------------------------------------------------- /Lesson 3 - Data Quality/Problem Set 3/05-Fixing Name/name.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | In this problem set you work with cities infobox data, audit it, come up with a 5 | cleaning idea and then clean it up. 6 | 7 | In the previous quiz you recognized that the "name" value can be an array (or 8 | list in Python terms). 
It would make it easier to process and query the data 9 | later if all values for the name are in a Python list, instead of being 10 | just a string separated with special characters, like now. 11 | 12 | Finish the function fix_name(). It will recieve a string as an input, and it 13 | will return a list of all the names. If there is only one name, the list will 14 | have only one item in it; if the name is "NULL", the list should be empty. 15 | The rest of the code is just an example on how this function can be used. 16 | """ 17 | 18 | import csv 19 | import pprint 20 | 21 | CITIES = 'cities.csv' 22 | 23 | 24 | def fix_name(name): 25 | 26 | # YOUR CODE HERE 27 | if name.startswith('{'): 28 | name = name.replace('{', '').replace('}', '').split('|') 29 | elif name == 'NULL': 30 | name = [] 31 | else: 32 | name = [name] 33 | return name 34 | 35 | 36 | def process_file(filename): 37 | data = [] 38 | with open(filename, "r") as f: 39 | reader = csv.DictReader(f) 40 | # skipping the extra metadata 41 | for i in range(3): 42 | reader.next() 43 | # processing file 44 | for line in reader: 45 | # calling your function to fix the area value 46 | if "name" in line: 47 | line["name"] = fix_name(line["name"]) 48 | data.append(line) 49 | return data 50 | 51 | 52 | def test(): 53 | data = process_file(CITIES) 54 | 55 | print "Printing 20 results:" 56 | for n in range(20): 57 | pprint.pprint(data[n]["name"]) 58 | 59 | assert data[14]["name"] == ['Negtemiut', 'Nightmute'] 60 | assert data[9]["name"] == ['Pell City Alabama'] 61 | assert data[3]["name"] == ['Kumhari'] 62 | 63 | if __name__ == "__main__": 64 | test() 65 | -------------------------------------------------------------------------------- /Lesson 3 - Data Quality/Problem Set 3/06-Crossfield Auditing/location.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | In this problem set you work with cities infobox data, audit it, come up with a 6 | cleaning idea and then clean it up. 7 | 8 | If you look at the full city data, you will notice that there are couple of 9 | values that seem to provide the same information in different formats: "point" 10 | seems to be the combination of "wgs84_pos#lat" and "wgs84_pos#long". However, 11 | we do not know if that is the case and should check if they are equivalent. 12 | 13 | Finish the function check_loc(). It will recieve 3 strings: first, the combined 14 | value of "point" followed by the separate "wgs84_pos#" values. You have to 15 | extract the lat and long values from the "point" argument and compare them to 16 | the "wgs84_pos# values, returning True or False. 17 | 18 | Note that you do not have to fix the values, only determine if they are 19 | consistent. To fix them in this case you would need more information. Feel free 20 | to discuss possible strategies for fixing this on the discussion forum. 21 | 22 | The rest of the code is just an example on how this function can be used. 23 | Changes to "process_file" function will not be taken into account for grading. 
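For example (values taken from the asserts in test() below):

    check_loc("33.08 75.28", "33.08", "75.28")                                -> True
    check_loc("44.57833333333333 -91.21833333333333", "44.5783", "-91.2183")  -> False

The second pair fails the comparison because the separate fields hold fewer
decimal places than the combined point value.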
24 | """ 25 | 26 | import csv 27 | 28 | CITIES = 'cities.csv' 29 | 30 | 31 | def check_loc(point, lat, longi): 32 | # YOUR CODE HERE 33 | coordinates = point.split(' ') 34 | return coordinates[0] == lat and coordinates[1] == longi 35 | 36 | 37 | def process_file(filename): 38 | data = [] 39 | with open(filename, "r") as f: 40 | reader = csv.DictReader(f) 41 | # skipping the extra matadata 42 | for i in range(3): 43 | reader.next() 44 | # processing file 45 | for line in reader: 46 | # calling your function to check the location 47 | result = check_loc(line["point"], line["wgs84_pos#lat"], line["wgs84_pos#long"]) 48 | if not result: 49 | print "{}: {} != {} {}".format(line["name"], line["point"], line["wgs84_pos#lat"], 50 | line["wgs84_pos#long"]) 51 | data.append(line) 52 | 53 | return data 54 | 55 | 56 | def test(): 57 | assert check_loc("33.08 75.28", "33.08", "75.28") is True 58 | assert check_loc("44.57833333333333 -91.21833333333333", "44.5783", "-91.2183") is False 59 | 60 | 61 | if __name__ == "__main__": 62 | test() 63 | -------------------------------------------------------------------------------- /Lesson 4 - Working with MongoDB/10-Finding Porsche/example_car.json: -------------------------------------------------------------------------------- 1 | { 2 | "layout" : "rear mid-engine rear-wheel-drive layout", 3 | "name" : "Porsche Boxster", 4 | "productionYears" : [ ], 5 | "modelYears" : [ ], 6 | "bodyStyle" : "roadster", 7 | "assembly" : [ 8 | "Finland", 9 | "Germany", 10 | "Stuttgart", 11 | "Uusikaupunki" 12 | ], 13 | "class" : "sports car", 14 | "manufacturer" : "Porsche" 15 | } -------------------------------------------------------------------------------- /Lesson 4 - Working with MongoDB/10-Finding Porsche/find_porsche.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Your task is to complete the 'porsche_query' function and in particular the query 4 | to find all autos where the manufacturer field matches "Porsche". 5 | Please modify only 'porsche_query' function, as only that will be taken into account. 6 | 7 | Your code will be run against a MongoDB instance that we have provided. 8 | If you want to run this code locally on your machine, 9 | you have to install MongoDB and download and insert the dataset. 10 | For instructions related to MongoDB setup and datasets please see Course Materials at 11 | the following link: 12 | https://www.udacity.com/wiki/ud032 13 | """ 14 | 15 | 16 | def porsche_query(): 17 | # Please fill in the query to find all autos manuafactured by Porsche. 18 | query = {'manufacturer': 'Porsche'} 19 | return query 20 | 21 | 22 | # Do not edit code below this line in the online code editor. 23 | # Code here is for local use on your own computer. 
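# For comparison, the same lookup issued directly against the collection is just
#     db.autos.find({"manufacturer": "Porsche"})
# i.e. an exact match on a single top-level field, with no query operators needed.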
24 | def get_db(db_name): 25 | # For local use 26 | from pymongo import MongoClient 27 | client = MongoClient('localhost:27017') 28 | db = client[db_name] 29 | return db 30 | 31 | 32 | def find_porsche(db, query): 33 | # For local use 34 | return db.autos.find(query) 35 | 36 | 37 | if __name__ == "__main__": 38 | # For local use 39 | db = get_db('examples') 40 | query = porsche_query() 41 | results = find_porsche(db, query) 42 | 43 | print "Printing first 3 results\n" 44 | import pprint 45 | for car in results[:3]: 46 | pprint.pprint(car) 47 | -------------------------------------------------------------------------------- /Lesson 4 - Working with MongoDB/14-Inserting Multiple Documents/autos.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from pymongo import MongoClient 4 | import csv 5 | import json 6 | import io 7 | import re 8 | import pprint 9 | 10 | field_map = { 11 | "name": "name", 12 | "bodyStyle_label": "bodyStyle", 13 | "assembly_label": "assembly", 14 | "class_label": "class", 15 | "designer_label": "designer", 16 | "engine_label": "engine", 17 | "length": "length", 18 | "height": "height", 19 | "width": "width", 20 | "weight": "weight", 21 | "wheelbase": "wheelbase", 22 | "layout_label": "layout", 23 | "manufacturer_label": "manufacturer", 24 | "modelEndYear": "modelEndYear", 25 | "modelStartYear": "modelStartYear", 26 | "predecessorLabel": "predecessorLabel", 27 | "productionStartYear": "productionStartYear", 28 | "productionEndYear": "productionEndYear", 29 | "transmission": "transmission" 30 | } 31 | fields = field_map.keys() 32 | 33 | 34 | def skip_lines(input_file, skip): 35 | for i in range(0, skip): 36 | next(input_file) 37 | 38 | 39 | def is_number(s): 40 | try: 41 | float(s) 42 | return True 43 | except ValueError: 44 | return False 45 | 46 | 47 | def strip_automobile(v): 48 | return re.sub(r"\s*\(automobile\)\s*", " ", v) 49 | 50 | 51 | def strip_city(v): 52 | return re.sub(r"\s*\(city\)\s*", " ", v) 53 | 54 | 55 | def parse_array(v): 56 | if (v[0] == "{") and (v[-1] == "}"): 57 | v = v.lstrip("{") 58 | v = v.rstrip("}") 59 | v_array = v.split("|") 60 | v_array = [i.strip() for i in v_array] 61 | return v_array 62 | return v 63 | 64 | 65 | def mm_to_meters(v): 66 | if v < 0.01: 67 | return v * 1000 68 | return v 69 | 70 | 71 | def clean_dimension(d, field, v): 72 | if is_number(v): 73 | if field == "weight": 74 | d[field] = float(v) / 1000.0 75 | else: 76 | d[field] = mm_to_meters(float(v)) 77 | 78 | 79 | def clean_year(d, field, v): 80 | d[field] = v[0:4] 81 | 82 | 83 | def parse_array2(v): 84 | if (v[0] == "{") and (v[-1] == "}"): 85 | v = v.lstrip("{") 86 | v = v.rstrip("}") 87 | v_array = v.split("|") 88 | v_array = [i.strip() for i in v_array] 89 | return (True, v_array) 90 | return (False, v) 91 | 92 | 93 | def ensure_not_array(v): 94 | (is_array, v) = parse_array(v) 95 | if is_array: 96 | return v[0] 97 | return v 98 | 99 | 100 | def ensure_array(v): 101 | (is_array, v) = parse_array2(v) 102 | if is_array: 103 | return v 104 | return [v] 105 | 106 | 107 | def ensure_float(v): 108 | if is_number(v): 109 | return float(v) 110 | 111 | 112 | def ensure_int(v): 113 | if is_number(v): 114 | return int(v) 115 | 116 | 117 | def ensure_year_array(val): 118 | # print "val:", val 119 | vals = ensure_array(val) 120 | year_vals = [] 121 | for v in vals: 122 | v = v[0:4] 123 | v = int(v) 124 | if v: 125 | year_vals.append(v) 126 | return year_vals 127 | 128 | 129 | def empty_val(val): 130 | val = val.strip() 
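# A value counts as empty when, after stripping whitespace, it is either the
# literal string "NULL" or an empty string.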
131 | return (val == "NULL") or (val == "") 132 | 133 | 134 | def years(row, start_field, end_field): 135 | start_val = row[start_field] 136 | end_val = row[end_field] 137 | 138 | if empty_val(start_val) or empty_val(end_val): 139 | return [] 140 | 141 | start_years = ensure_year_array(start_val) 142 | if start_years: 143 | start_years = sorted(start_years) 144 | end_years = ensure_year_array(end_val) 145 | if end_years: 146 | end_years = sorted(end_years) 147 | all_years = [] 148 | if start_years and end_years: 149 | # print start_years 150 | # print end_years 151 | for i in range(0, min(len(start_years), len(end_years))): 152 | for y in range(start_years[i], end_years[i] + 1): 153 | all_years.append(y) 154 | return all_years 155 | 156 | 157 | def process_file(input_file): 158 | input_data = csv.DictReader(open(input_file)) 159 | autos = [] 160 | skip_lines(input_data, 3) 161 | for row in input_data: 162 | auto = {} 163 | model_years = {} 164 | production_years = {} 165 | dimensions = {} 166 | for field, val in row.iteritems(): 167 | if field not in fields or empty_val(val): 168 | continue 169 | if field in ["bodyStyle_label", "class_label", "layout_label"]: 170 | val = val.lower() 171 | val = strip_automobile(val) 172 | val = strip_city(val) 173 | val = val.strip() 174 | val = parse_array(val) 175 | if field in ["length", "width", "height", "weight", "wheelbase"]: 176 | clean_dimension(dimensions, field_map[field], val) 177 | elif field in ["modelStartYear", "modelEndYear"]: 178 | clean_year(model_years, field_map[field], val) 179 | elif field in ["productionStartYear", "productionEndYear"]: 180 | clean_year(production_years, field_map[field], val) 181 | else: 182 | auto[field_map[field]] = val 183 | if dimensions: 184 | auto['dimensions'] = dimensions 185 | auto['modelYears'] = years(row, 'modelStartYear', 'modelEndYear') 186 | auto['productionYears'] = years(row, 'productionStartYear', 'productionEndYear') 187 | autos.append(auto) 188 | return autos 189 | -------------------------------------------------------------------------------- /Lesson 4 - Working with MongoDB/14-Inserting Multiple Documents/insert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Add a single line of code to the insert_autos function that will insert the 4 | automobile data into the 'autos' collection. The data variable that is 5 | returned from the process_file function is a list of dictionaries, as in the 6 | example in the previous video. 7 | """ 8 | 9 | from autos import process_file 10 | 11 | 12 | def insert_autos(infile, db): 13 | data = process_file(infile) 14 | # Add your code here. Insert the data in one command. 15 | for item in data: 16 | db.autos.insert(item) 17 | 18 | 19 | if __name__ == "__main__": 20 | # Code here is for local use on your own computer. 
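# Note: the loop in insert_autos above issues one insert call per document. The
# exercise asks for a single command, which here would be db.autos.insert(data)
# (pymongo 2.x accepts a list) or db.autos.insert_many(data) on pymongo 3+.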
21 | from pymongo import MongoClient 22 | 23 | client = MongoClient("mongodb://localhost:27017") 24 | db = client.examples 25 | 26 | insert_autos('autos-small.csv', db) 27 | print db.autos.find_one() 28 | -------------------------------------------------------------------------------- /Lesson 4 - Working with MongoDB/18-Range Queries/example_city.txt: -------------------------------------------------------------------------------- 1 | { 2 | 'areaCode': ['916'], 3 | 'areaLand': 109271000.0, 4 | 'country': 'United States', 5 | 'elevation': 13.716, 6 | 'foundingDate': datetime.datetime(2000, 7, 1, 0, 0), 7 | 'governmentType': ['Council\u2013manager government'], 8 | 'homepage': ['http://elkgrovecity.org/'], 9 | 'isPartOf': ['California', u'Sacramento County California'], 10 | 'lat': 38.4383, 11 | 'leaderTitle': 'Chief Of Police', 12 | 'lon': -121.382, 13 | 'motto': 'Proud Heritage Bright Future', 14 | 'name': 'City of Elk Grove', 15 | 'population': 155937, 16 | 'postalCode': '95624 95757 95758 95759', 17 | 'timeZone': ['Pacific Time Zone'], 18 | 'utcOffset': ['-7', '-8'] 19 | } -------------------------------------------------------------------------------- /Lesson 4 - Working with MongoDB/18-Range Queries/find_cities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Your task is to write a query that will return all cities 4 | that are founded in 21st century. 5 | Please modify only 'range_query' function, as only that will be taken into account. 6 | 7 | Your code will be run against a MongoDB instance that we have provided. 8 | If you want to run this code locally on your machine, 9 | you have to install MongoDB, download and insert the dataset. 10 | For instructions related to MongoDB setup and datasets please see Course Materials. 11 | """ 12 | 13 | from datetime import datetime 14 | 15 | 16 | def range_query(): 17 | # Modify the below line with your query. 18 | # You can use datetime(year, month, day) to specify date in the query 19 | query = {'foundingDate': {'$gte': datetime(2001, 1, 1)}} 20 | return query 21 | 22 | 23 | # Do not edit code below this line in the online code editor. 24 | # Code here is for local use on your own computer. 
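# The $gte comparison above works because foundingDate is stored as a BSON date
# (see example_city.txt); if the field held date strings instead, the comparison
# would be lexicographic rather than chronological.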
25 | def get_db(): 26 | from pymongo import MongoClient 27 | client = MongoClient('localhost:27017') 28 | db = client.examples 29 | return db 30 | 31 | 32 | if __name__ == "__main__": 33 | # For local use 34 | db = get_db() 35 | query = range_query() 36 | cities = db.cities.find(query) 37 | 38 | print "Found cities:", cities.count() 39 | import pprint 40 | 41 | pprint.pprint(cities[0]) 42 | -------------------------------------------------------------------------------- /Lesson 4 - Working with MongoDB/23-Using $in Operator/example.json: -------------------------------------------------------------------------------- 1 | { 2 | "layout" : "rear mid-engine rear-wheel-drive layout", 3 | "name" : "Porsche Boxster", 4 | "productionYears" : [ ], 5 | "modelYears" : [ ], 6 | "bodyStyle" : "roadster", 7 | "assembly" : [ 8 | "Finland", 9 | "Germany", 10 | "Stuttgart", 11 | "Uusikaupunki" 12 | ], 13 | "class" : "sports car", 14 | "manufacturer" : "Porsche" 15 | } -------------------------------------------------------------------------------- /Lesson 4 - Working with MongoDB/23-Using $in Operator/find_cars.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Your task is to write a query that will return all cars manufactured by 4 | "Ford Motor Company" that are assembled in Germany, United Kingdom, or Japan. 5 | Please modify only 'in_query' function, as only that will be taken into account. 6 | 7 | Your code will be run against a MongoDB instance that we have provided. 8 | If you want to run this code locally on your machine, 9 | you have to install MongoDB, download and insert the dataset. 10 | For instructions related to MongoDB setup and datasets please see Course Materials. 11 | """ 12 | 13 | 14 | def in_query(): 15 | # Modify the below line with your query; try to use the $in operator. 16 | query = {'manufacturer': 'Ford Motor Company', 'assembly': {'$in': ['Germany', 'United Kingdom', 'Japan']}} 17 | 18 | return query 19 | 20 | 21 | # Do not edit code below this line in the online code editor. 22 | # Code here is for local use on your own computer. 23 | def get_db(): 24 | from pymongo import MongoClient 25 | client = MongoClient('localhost:27017') 26 | db = client.examples 27 | return db 28 | 29 | 30 | if __name__ == "__main__": 31 | 32 | db = get_db() 33 | query = in_query() 34 | autos = db.autos.find(query, {"name": 1, "manufacturer": 1, "assembly": 1, "_id": 0}) 35 | 36 | print "Found autos:", autos.count() 37 | import pprint 38 | 39 | for a in autos: 40 | pprint.pprint(a) 41 | -------------------------------------------------------------------------------- /Lesson 4 - Working with MongoDB/25-Dot Notation/dot_find.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Your task is to write a query that will return all cars with width dimension 4 | greater than 2.5. Please modify only the 'dot_query' function, as only that 5 | will be taken into account. 6 | 7 | Your code will be run against a MongoDB instance that we have provided. 8 | If you want to run this code locally on your machine, you will need to install 9 | MongoDB, download and insert the dataset. For instructions related to MongoDB 10 | setup and datasets, please see the Course Materials. 11 | """ 12 | 13 | 14 | def dot_query(): 15 | # Edit the line below with your query - try to use dot notation. 
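# For a nested document such as {"dimensions": {"width": 34.7472, ...}}, dot
# notation addresses the inner field directly, e.g. {"dimensions.width": {"$gt": 2.5}}.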
16 | # You can check out example_auto.txt for an example of the document 17 | # structure in the collection. 18 | query = {'dimensions.width': {'$gt': 2.5}} 19 | return query 20 | 21 | 22 | # Do not edit code below this line in the online code editor. 23 | # Code here is for local use on your own computer. 24 | def get_db(): 25 | from pymongo import MongoClient 26 | client = MongoClient('localhost:27017') 27 | db = client.examples 28 | return db 29 | 30 | 31 | if __name__ == "__main__": 32 | db = get_db() 33 | query = dot_query() 34 | cars = db.cars.find(query) 35 | 36 | print "Printing first 3 results\n" 37 | import pprint 38 | for car in cars[:3]: 39 | pprint.pprint(car) 40 | -------------------------------------------------------------------------------- /Lesson 4 - Working with MongoDB/25-Dot Notation/example_auto.txt: -------------------------------------------------------------------------------- 1 | { 2 | "_id" : ObjectId("52fd438b5a98d65507d288cf"), 3 | "engine" : "Crawler-transporter__1", 4 | "dimensions" : { 5 | "width" : 34.7472, 6 | "length" : 39.9288, 7 | "weight" : 2721000 8 | }, 9 | "transmission" : "16 traction motors powered by four generators", 10 | "modelYears" : [ ], 11 | "productionYears" : [ ], 12 | "manufacturer" : "Marion Power Shovel Company", 13 | "name" : "Crawler-transporter" 14 | } -------------------------------------------------------------------------------- /Lesson 4 - Working with MongoDB/Problem Set 4/01-Preparing Data/processing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | In this problem set you work with another type of infobox data, audit it, 5 | clean it, come up with a data model, insert it into MongoDB and then run some 6 | queries against your database. The set contains data about Arachnid class 7 | animals. 8 | 9 | Your task in this exercise is to parse the file, process only the fields that 10 | are listed in the FIELDS dictionary as keys, and return a list of dictionaries 11 | of cleaned values. 12 | 13 | The following things should be done: 14 | - keys of the dictionary changed according to the mapping in FIELDS dictionary 15 | - trim out redundant description in parenthesis from the 'rdf-schema#label' 16 | field, like "(spider)" 17 | - if 'name' is "NULL" or contains non-alphanumeric characters, set it to the 18 | same value as 'label'. 19 | - if a value of a field is "NULL", convert it to None 20 | - if there is a value in 'synonym', it should be converted to an array (list) 21 | by stripping the "{}" characters and splitting the string on "|". Rest of the 22 | cleanup is up to you, e.g. removing "*" prefixes etc. If there is a singular 23 | synonym, the value should still be formatted in a list. 24 | - strip leading and ending whitespace from all fields, if there is any 25 | - the output structure should be as follows: 26 | 27 | [ { 'label': 'Argiope', 28 | 'uri': 'http://dbpedia.org/resource/Argiope_(spider)', 29 | 'description': 'The genus Argiope includes rather large and spectacular spiders that often ...', 30 | 'name': 'Argiope', 31 | 'synonym': ["One", "Two"], 32 | 'classification': { 33 | 'family': 'Orb-weaver spider', 34 | 'class': 'Arachnid', 35 | 'phylum': 'Arthropod', 36 | 'order': 'Spider', 37 | 'kingdom': 'Animal', 38 | 'genus': None 39 | } 40 | }, 41 | { 'label': ... , }, ... 42 | ] 43 | 44 | * Note that the value associated with the classification key is a dictionary 45 | with taxonomic labels. 
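As a concrete illustration (raw values paraphrased from the csv rather than
quoted verbatim): a label of "Argiope (spider)" is trimmed to "Argiope", a
synonym cell of "{Cyrene Peckham & Peckham}" becomes the one-element list
["Cyrene Peckham & Peckham"], and a name of "NULL" is replaced by the cleaned
label.

Note that, as written, the synonym cleanup loop below calls syn.replace('*', "")
without using the return value, so "*" prefixes are not actually stripped; the
result would need to be reassigned (or handled inside parse_array) for that rule
to take effect.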
46 | """ 47 | import csv 48 | import pprint 49 | import re 50 | 51 | DATAFILE = 'arachnid.csv' 52 | FIELDS = {'rdf-schema#label': 'label', 53 | 'URI': 'uri', 54 | 'rdf-schema#comment': 'description', 55 | 'synonym': 'synonym', 56 | 'name': 'name', 57 | 'family_label': 'family', 58 | 'class_label': 'class', 59 | 'phylum_label': 'phylum', 60 | 'order_label': 'order', 61 | 'kingdom_label': 'kingdom', 62 | 'genus_label': 'genus'} 63 | 64 | 65 | def process_file(filename, fields): 66 | process_fields = fields.keys() 67 | data = [] 68 | with open(filename, "r") as f: 69 | reader = csv.DictReader(f) 70 | for i in range(3): 71 | reader.next() 72 | 73 | for line in reader: 74 | # YOUR CODE HERE 75 | line['rdf-schema#label'] = re.sub('\(.+\)', '', line['rdf-schema#label']).strip() 76 | if line['rdf-schema#label'] == 'NULL': 77 | line['rdf-schema#label'] = None 78 | 79 | if line['name'] == 'NULL' or re.search(r'\W', line['name']): 80 | line['name'] = line['rdf-schema#label'] 81 | 82 | if line['synonym'] == 'NULL': 83 | line['synonym'] = None 84 | else: 85 | line['synonym'] = parse_array(line['synonym']) 86 | for syn in line['synonym']: 87 | syn.replace('*', "") 88 | 89 | item = {} 90 | item['classification'] = {} 91 | 92 | for key in fields: 93 | if line[key] == 'NULL': 94 | line[key] = None 95 | 96 | if re.search(r'_label', key): 97 | item['classification'][fields[key]] = line[key] 98 | else: 99 | item[fields[key]] = line[key] 100 | 101 | data.append(item) 102 | return data 103 | 104 | 105 | def parse_array(v): 106 | if (v[0] == "{") and (v[-1] == "}"): 107 | v = v.lstrip("{") 108 | v = v.rstrip("}") 109 | v_array = v.split("|") 110 | v_array = [i.strip() for i in v_array] 111 | return v_array 112 | return [v] 113 | 114 | 115 | def test(): 116 | data = process_file(DATAFILE, FIELDS) 117 | print "Your first entry:" 118 | pprint.pprint(data[0]) 119 | first_entry = { 120 | "synonym": None, 121 | "name": "Argiope", 122 | "classification": { 123 | "kingdom": "Animal", 124 | "family": "Orb-weaver spider", 125 | "order": "Spider", 126 | "phylum": "Arthropod", 127 | "genus": None, 128 | "class": "Arachnid" 129 | }, 130 | "uri": "http://dbpedia.org/resource/Argiope_(spider)", 131 | "label": "Argiope", 132 | "description": "The genus Argiope includes rather large and spectacular spiders that often have a strikingly " 133 | "coloured abdomen. These spiders are distributed throughout the world. Most countries in " 134 | "tropical or temperate climates host one or more species that are similar in appearance. " 135 | "The etymology of the name is from a Greek name meaning silver-faced." 136 | } 137 | 138 | assert len(data) == 76 139 | assert data[0] == first_entry 140 | assert data[17]["name"] == "Ogdenia" 141 | assert data[48]["label"] == "Hydrachnidiae" 142 | assert data[14]["synonym"] == ["Cyrene Peckham & Peckham"] 143 | 144 | 145 | if __name__ == "__main__": 146 | test() 147 | -------------------------------------------------------------------------------- /Lesson 4 - Working with MongoDB/Problem Set 4/02-Inserting into DB/dbinsert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | Complete the insert_data function to insert the data into MongoDB. 5 | """ 6 | 7 | import json 8 | 9 | 10 | def insert_data(data, db): 11 | # Your code here. 
Insert the data into a collection 'arachnid' 12 | db.arachnid.insert(data) 13 | 14 | 15 | if __name__ == "__main__": 16 | from pymongo import MongoClient 17 | 18 | client = MongoClient("mongodb://localhost:27017") 19 | db = client.examples 20 | 21 | with open('arachnid.json') as f: 22 | data = json.loads(f.read()) 23 | insert_data(data, db) 24 | print db.arachnid.find_one() -------------------------------------------------------------------------------- /Lesson 4 - Working with MongoDB/Problem Set 4/03-Updating Schema/update.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | In this problem set you work with another type of infobox data, audit it, 5 | clean it, come up with a data model, insert it into MongoDB and then run some 6 | queries against your database. The set contains data about Arachnid class. 7 | 8 | For this exercise, the arachnid data is already in the database. You have been 9 | given the task of including 'binomialAuthority' information in the records. 10 | You will do this by processing the arachnid.csv to extract binomial authority 11 | data and then using this data to update the corresponding data base records. 12 | 13 | The following things should be done in the function add_field: 14 | - process the csv file and extract 2 fields - 'rdf-schema#label' and 15 | 'binomialAuthority_label' 16 | - clean up the 'rdf-schema#label' the same way as in the first exercise, 17 | removing redundant "(spider)" suffixes 18 | - return a dictionary with the cleaned 'rdf-schema#label' field values as keys, 19 | and 'binomialAuthority_label' field values as values 20 | - if 'binomialAuthority_label' is "NULL" for a row in the csv, skip the item 21 | 22 | The following should be done in the function update_db: 23 | - query the 'label' field in the database using rdf-schema#label keys from the 24 | data dictionary 25 | - update the documents by adding a new item under 'classification' with the key 26 | 'binomialAuthority' and the binomialAuthority_label value from the data 27 | dictionary as the value 28 | 29 | For item {'Argiope': 'Jill Ward'} in the data dictionary, the resulting document structure 30 | should look like this: 31 | 32 | { 'label': 'Argiope', 33 | 'uri': 'http://dbpedia.org/resource/Argiope_(spider)', 34 | 'description': 'The genus Argiope includes rather large and spectacular spiders that often ...', 35 | 'name': 'Argiope', 36 | 'synonym': ["One", "Two"], 37 | 'classification': { 38 | 'binomialAuthority' : 'Jill Ward' 39 | 'family': 'Orb-weaver spider', 40 | 'class': 'Arachnid', 41 | 'phylum': 'Arthropod', 42 | 'order': 'Spider', 43 | 'kingdom': 'Animal', 44 | 'genus': None 45 | } 46 | } 47 | 48 | Note that the value in the 'binomialAuthority' field is a placeholder; this is only to 49 | demonstrate the output structure form, for the entries that require updating. 50 | """ 51 | import csv 52 | import pprint 53 | import re 54 | 55 | DATAFILE = 'arachnid.csv' 56 | FIELDS = {'rdf-schema#label': 'label', 57 | 'binomialAuthority_label': 'binomialAuthority'} 58 | 59 | 60 | def add_field(filename, fields): 61 | """ 62 | Complete this function to set up a dictionary for adding binomialAuthority 63 | information to the database. 
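The returned dictionary should be keyed by the cleaned label, for example
{'Opisthoncana': 'Embrik Strand', ...} (compare the assertion in test() below).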
64 | """ 65 | process_fields = fields.keys() 66 | data = {} 67 | with open(filename, "r") as f: 68 | reader = csv.DictReader(f) 69 | for i in range(3): 70 | reader.next() 71 | # YOUR CODE HERE 72 | for line in reader: 73 | if line['binomialAuthority_label'] != 'NULL': 74 | label = re.sub('\(.*?\)', '', line['rdf-schema#label']).strip() 75 | value = line['binomialAuthority_label'] 76 | data[label] = value 77 | return data 78 | 79 | 80 | def update_db(data, db): 81 | """ 82 | Use the dictionary you generated from add_field to update the database. 83 | """ 84 | # YOUR CODE HERE 85 | for key in data: 86 | db.arachnid.update({'label': key}, 87 | {"$set": {'classification.binomialAuthority':data[key]}}, multi=True) 88 | 89 | 90 | def test(): 91 | # Please change only the add_field and update_db functions! 92 | # Changes done to this function will not be taken into account 93 | # when doing a Test Run or Submit, they are just for your own reference 94 | # and as an example for running this code locally! 95 | 96 | data = add_field(DATAFILE, FIELDS) 97 | from pymongo import MongoClient 98 | client = MongoClient("mongodb://localhost:27017") 99 | db = client.examples 100 | 101 | update_db(data, db) 102 | 103 | updated = db.arachnid.find_one({'label': 'Opisthoncana'}) 104 | assert updated['classification']['binomialAuthority'] == 'Embrik Strand' 105 | pprint.pprint(data) 106 | 107 | 108 | if __name__ == "__main__": 109 | test() -------------------------------------------------------------------------------- /Lesson 5 - Analyzing Data/05-Using group/Using group.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | The tweets in our twitter collection have a field called "source". This field describes the application 4 | that was used to create the tweet. Following the examples for using the $group operator, your task is 5 | to modify the 'make-pipeline' function to identify most used applications for creating tweets. 6 | As a check on your query, 'web' is listed as the most frequently used application. 7 | 'Ubertwitter' is the second most used. The number of counts should be stored in a field named 'count' 8 | (see the assertion at the end of the script). 9 | 10 | Please modify only the 'make_pipeline' function so that it creates and returns an aggregation pipeline 11 | that can be passed to the MongoDB aggregate function. As in our examples in this lesson, the aggregation 12 | pipeline should be a list of one or more dictionary objects. 13 | Please review the lesson examples if you are unsure of the syntax. 14 | 15 | Your code will be run against a MongoDB instance that we have provided. 16 | If you want to run this code locally on your machine, you have to install MongoDB, 17 | download and insert the dataset. 18 | For instructions related to MongoDB setup and datasets please see Course Materials. 19 | 20 | Please note that the dataset you are using here is a smaller version of the twitter dataset 21 | used in examples in this lesson. 22 | If you attempt some of the same queries that we looked at in the lesson examples, 23 | your results will be different. 
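In pymongo terms, the pipeline built below amounts to:

    db.tweets.aggregate([
        {"$group": {"_id": "$source", "count": {"$sum": 1}}},
        {"$sort": {"count": -1}}
    ])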
24 | """ 25 | 26 | 27 | def get_db(db_name): 28 | from pymongo import MongoClient 29 | client = MongoClient('localhost:27017') 30 | db = client[db_name] 31 | return db 32 | 33 | 34 | def make_pipeline(): 35 | # complete the aggregation pipeline 36 | pipeline = list() 37 | pipeline.append({"$group": {"_id": "$source", 38 | "count": {"$sum": 1}}}) 39 | pipeline.append({"$sort": {"count": -1}}) 40 | return pipeline 41 | 42 | 43 | def tweet_sources(db, pipeline): 44 | return [doc for doc in db.tweets.aggregate(pipeline)] 45 | 46 | if __name__ == '__main__': 47 | db = get_db('twitter') 48 | pipeline = make_pipeline() 49 | result = tweet_sources(db, pipeline) 50 | import pprint 51 | pprint.pprint(result[0]) 52 | assert result[0] == {u'count': 868, u'_id': u'web'} 53 | -------------------------------------------------------------------------------- /Lesson 5 - Analyzing Data/10-Using match and project/Using match and project.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Write an aggregation query to answer this question: 4 | 5 | Of the users in the "Brasilia" timezone who have tweeted 100 times or more, 6 | who has the largest number of followers? 7 | 8 | The following hints will help you solve this problem: 9 | - Time zone is found in the "time_zone" field of the user object in each tweet. 10 | - The number of tweets for each user is found in the "statuses_count" field. 11 | To access these fields you will need to use dot notation (from Lesson 4) 12 | - Your aggregation query should return something like the following: 13 | {u'ok': 1.0, 14 | u'result': [{u'_id': ObjectId('52fd2490bac3fa1975477702'), 15 | u'followers': 2597, 16 | u'screen_name': u'marbles', 17 | u'tweets': 12334}]} 18 | Note that you will need to create the fields 'followers', 'screen_name' and 'tweets'. 19 | 20 | Please modify only the 'make_pipeline' function so that it creates and returns an aggregation 21 | pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson, 22 | the aggregation pipeline should be a list of one or more dictionary objects. 23 | Please review the lesson examples if you are unsure of the syntax. 24 | 25 | Your code will be run against a MongoDB instance that we have provided. If you want to run this code 26 | locally on your machine, you have to install MongoDB, download and insert the dataset. 27 | For instructions related to MongoDB setup and datasets please see Course Materials. 28 | 29 | Please note that the dataset you are using here is a smaller version of the twitter dataset used 30 | in examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 31 | examples, your results will be different. 
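A sketch of the intended pipeline, in pymongo terms:

    db.tweets.aggregate([
        {"$match": {"user.time_zone": "Brasilia",
                    "user.statuses_count": {"$gte": 100}}},
        {"$project": {"followers": "$user.followers_count",
                      "screen_name": "$user.screen_name",
                      "tweets": "$user.statuses_count"}},
        {"$sort": {"followers": -1}},
        {"$limit": 1}
    ])

(The solution below uses $gt: 100, i.e. strictly more than 100 tweets, rather
than $gte: 100.)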
32 | """ 33 | 34 | 35 | def get_db(db_name): 36 | from pymongo import MongoClient 37 | client = MongoClient('localhost:27017') 38 | db = client[db_name] 39 | return db 40 | 41 | 42 | def make_pipeline(): 43 | # complete the aggregation pipeline 44 | pipeline = [] 45 | match = {'$match': {'user.time_zone': 'Brasilia', 46 | 'user.statuses_count': {'$gt': 100}}} 47 | pipeline.append(match) 48 | 49 | project = {'$project': {'followers': '$user.followers_count', 50 | 'screen_name': '$user.screen_name', 51 | 'tweets': '$user.statuses_count'}} 52 | pipeline.append(project) 53 | 54 | sort = {'$sort': {'followers': -1}} 55 | pipeline.append(sort) 56 | 57 | limit = {'$limit': 1} 58 | pipeline.append(limit) 59 | 60 | return pipeline 61 | 62 | 63 | def aggregate(db, pipeline): 64 | return [doc for doc in db.tweets.aggregate(pipeline)] 65 | 66 | 67 | if __name__ == '__main__': 68 | db = get_db('twitter') 69 | pipeline = make_pipeline() 70 | result = aggregate(db, pipeline) 71 | import pprint 72 | 73 | pprint.pprint(result) 74 | assert len(result) == 1 75 | assert result[0]["followers"] == 17209 76 | 77 | -------------------------------------------------------------------------------- /Lesson 5 - Analyzing Data/12-Using unwind/Using unwind.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | For this exercise, let's return to our cities infobox dataset. The question we would like you to answer 4 | is as follows: Which region or district in India contains the most cities? (Make sure that the count of 5 | cities is stored in a field named 'count'; see the assertions at the end of the script.) 6 | 7 | As a starting point, use the solution for the example question we looked at -- "Who includes the most 8 | user mentions in their tweets?" 9 | 10 | One thing to note about the cities data is that the "isPartOf" field contains an array of regions or 11 | districts in which a given city is found. See the example document in Instructor Comments below. 12 | 13 | Please modify only the 'make_pipeline' function so that it creates and returns an aggregation pipeline 14 | that can be passed to the MongoDB aggregate function. As in our examples in this lesson, the aggregation 15 | pipeline should be a list of one or more dictionary objects. Please review the lesson examples if you 16 | are unsure of the syntax. 17 | 18 | Your code will be run against a MongoDB instance that we have provided. If you want to run this code 19 | locally on your machine, you have to install MongoDB, download and insert the dataset. 20 | For instructions related to MongoDB setup and datasets please see Course Materials. 21 | 22 | Please note that the dataset you are using here is a smaller version of the cities collection used in 23 | examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 24 | examples, your results may be different. 
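As an illustration of what $unwind does (hypothetical values, not a document from the grading dataset): a city document such as
    {"name": "Agra", "country": "India", "isPartOf": ["Uttar Pradesh", "Agra district"]}
is expanded into one output document per array element,
    {"name": "Agra", "country": "India", "isPartOf": "Uttar Pradesh"}
    {"name": "Agra", "country": "India", "isPartOf": "Agra district"}
so that a later $group on "$isPartOf" counts each region or district separately.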
25 | """ 26 | 27 | 28 | def get_db(db_name): 29 | from pymongo import MongoClient 30 | client = MongoClient('localhost:27017') 31 | db = client[db_name] 32 | return db 33 | 34 | 35 | def make_pipeline(): 36 | # complete the aggregation pipeline 37 | pipeline = [] 38 | unwind = {'$unwind': '$isPartOf'} 39 | pipeline.append(unwind) 40 | 41 | match = {'$match': {'country': 'India'}} 42 | pipeline.append(match) 43 | 44 | group = {'$group': {'_id': '$isPartOf', 45 | 'count': {'$sum': 1}}} 46 | pipeline.append(group) 47 | 48 | sort = {'$sort': {'count': -1}} 49 | pipeline.append(sort) 50 | 51 | limit = {'$limit': 1} 52 | pipeline.append(limit) 53 | 54 | return pipeline 55 | 56 | 57 | def aggregate(db, pipeline): 58 | return [doc for doc in db.cities.aggregate(pipeline)] 59 | 60 | 61 | if __name__ == '__main__': 62 | db = get_db('examples') 63 | pipeline = make_pipeline() 64 | result = aggregate(db, pipeline) 65 | print "Printing the first result:" 66 | import pprint 67 | 68 | pprint.pprint(result[0]) 69 | assert result[0]["_id"] == "Uttar Pradesh" 70 | assert result[0]["count"] == 623 71 | 72 | 73 | -------------------------------------------------------------------------------- /Lesson 5 - Analyzing Data/14-Using push/Using push.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | $push is similar to $addToSet. The difference is that rather than accumulating only unique values 4 | it aggregates all values into an array. 5 | 6 | Using an aggregation query, count the number of tweets for each user. In the same $group stage, 7 | use $push to accumulate all the tweet texts for each user. Limit your output to the 5 users 8 | with the most tweets. 9 | Your result documents should include only the fields: 10 | "_id" (screen name of user), 11 | "count" (number of tweets found for the user), 12 | "tweet_texts" (a list of the tweet texts found for the user). 13 | 14 | Please modify only the 'make_pipeline' function so that it creates and returns an aggregation 15 | pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson, 16 | the aggregation pipeline should be a list of one or more dictionary objects. 17 | Please review the lesson examples if you are unsure of the syntax. 18 | 19 | Your code will be run against a MongoDB instance that we have provided. If you want to run this code 20 | locally on your machine, you have to install MongoDB, download and insert the dataset. 21 | For instructions related to MongoDB setup and datasets please see Course Materials. 22 | 23 | Please note that the dataset you are using here is a smaller version of the twitter dataset used in 24 | examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 25 | examples, your results will be different. 
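To make the difference concrete with made-up values: if one user has three tweets whose texts are ["hi", "hello", "hi"],
then in a $group stage {"$addToSet": "$text"} accumulates ["hi", "hello"] (duplicates dropped, order not guaranteed),
while {"$push": "$text"} accumulates all three occurrences, ["hi", "hello", "hi"].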
26 | """ 27 | 28 | 29 | def get_db(db_name): 30 | from pymongo import MongoClient 31 | client = MongoClient('localhost:27017') 32 | db = client[db_name] 33 | return db 34 | 35 | 36 | def make_pipeline(): 37 | # complete the aggregation pipeline 38 | pipeline = [] 39 | 40 | group = {'$group': {'_id': '$user.screen_name', 41 | 'count': {'$sum': 1}, 42 | 'tweet_texts': {'$push': '$text'}}} 43 | pipeline.append(group) 44 | 45 | sort = {'$sort': {'count': -1}} 46 | pipeline.append(sort) 47 | 48 | limit = {'$limit': 5} 49 | pipeline.append(limit) 50 | 51 | return pipeline 52 | 53 | 54 | def aggregate(db, pipeline): 55 | return [doc for doc in db.tweets.aggregate(pipeline)] 56 | 57 | 58 | if __name__ == '__main__': 59 | db = get_db('twitter') 60 | pipeline = make_pipeline() 61 | result = aggregate(db, pipeline) 62 | import pprint 63 | 64 | pprint.pprint(result) 65 | assert len(result) == 5 66 | assert result[0]["count"] > result[4]["count"] 67 | -------------------------------------------------------------------------------- /Lesson 5 - Analyzing Data/16-Same Operator/Same Operator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | In an earlier exercise we looked at the cities dataset and asked which region in India contains 4 | the most cities. In this exercise, we'd like you to answer a related question regarding regions in 5 | India. What is the average city population for a region in India? Calculate your answer by first 6 | finding the average population of cities in each region and then by calculating the average of the 7 | regional averages. 8 | 9 | Hint: If you want to accumulate values from all input documents in a group stage, you may use 10 | a constant as the value of the "_id" field. For example, 11 | { "$group" : {"_id" : "India Regional City Population Average", 12 | ... } 13 | 14 | Please modify only the 'make_pipeline' function so that it creates and returns an aggregation 15 | pipeline that can be passed to the MongoDB aggregate function. As in our examples in this lesson, 16 | the aggregation pipeline should be a list of one or more dictionary objects. 17 | Please review the lesson examples if you are unsure of the syntax. 18 | 19 | Your code will be run against a MongoDB instance that we have provided. If you want to run this code 20 | locally on your machine, you have to install MongoDB, download and insert the dataset. 21 | For instructions related to MongoDB setup and datasets please see Course Materials. 22 | 23 | Please note that the dataset you are using here is a smaller version of the cities collection used 24 | in examples in this lesson. If you attempt some of the same queries that we looked at in the lesson 25 | examples, your results will be different.
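To make "the average of the regional averages" concrete with made-up numbers: if region A has cities with
populations 100 and 300 (regional average 200) and region B has a single city with population 50 (regional
average 50), the answer would be the mean of 200 and 50, which is 125, rather than the mean of the three
city populations, which is 150.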
26 | """ 27 | 28 | 29 | def get_db(db_name): 30 | from pymongo import MongoClient 31 | client = MongoClient('localhost:27017') 32 | db = client[db_name] 33 | return db 34 | 35 | 36 | def make_pipeline(): 37 | # complete the aggregation pipeline 38 | pipeline = [] 39 | 40 | match = {'$match': {'country': 'India'}} 41 | pipeline.append(match) 42 | 43 | unwind = {'$unwind': '$isPartOf'} 44 | pipeline.append(unwind) 45 | 46 | group = {'$group': {'_id': '$isPartOf', 47 | 'avg': {'$avg': '$population'}}} 48 | pipeline.append(group) 49 | 50 | group = {'$group': {'_id': 'India Regional City Population Average', 51 | 'avg': {'$avg': '$avg'}}} 52 | pipeline.append(group) 53 | 54 | return pipeline 55 | 56 | 57 | def aggregate(db, pipeline): 58 | return [doc for doc in db.cities.aggregate(pipeline)] 59 | 60 | 61 | if __name__ == '__main__': 62 | db = get_db('examples') 63 | pipeline = make_pipeline() 64 | result = aggregate(db, pipeline) 65 | assert len(result) == 1 66 | # Your result should be close to the value after the minus sign. 67 | assert abs(result[0]["avg"] - 196025.97814809752) < 10 ** -8 68 | import pprint 69 | 70 | pprint.pprint(result) 71 | -------------------------------------------------------------------------------- /Lesson 5 - Analyzing Data/Problem Set 5/01-Most Common City Name/Most Common City Name.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Use an aggregation query to answer the following question. 4 | 5 | What is the most common city name in our cities collection? 6 | 7 | Your first attempt probably identified None as the most frequently occurring 8 | city name. What that actually means is that there are a number of cities 9 | without a name field at all. It's strange that such documents would exist in 10 | this collection and, depending on your situation, might actually warrant 11 | further cleaning. 12 | 13 | To solve this problem the right way, we should really ignore cities that don't 14 | have a name specified. As a hint ask yourself what pipeline operator allows us 15 | to simply filter input? How do we test for the existence of a field? 16 | 17 | Please modify only the 'make_pipeline' function so that it creates and returns 18 | an aggregation pipeline that can be passed to the MongoDB aggregate function. 19 | As in our examples in this lesson, the aggregation pipeline should be a list of 20 | one or more dictionary objects. Please review the lesson examples if you are 21 | unsure of the syntax. 22 | 23 | Your code will be run against a MongoDB instance that we have provided. If you 24 | want to run this code locally on your machine, you have to install MongoDB, 25 | download and insert the dataset. For instructions related to MongoDB setup and 26 | datasets please see Course Materials. 27 | 28 | Please note that the dataset you are using here is a different version of the 29 | cities collection provided in the course materials. If you attempt some of the 30 | same queries that we look at in the problem set, your results may be different. 
31 | """ 32 | 33 | 34 | def get_db(db_name): 35 | from pymongo import MongoClient 36 | client = MongoClient('localhost:27017') 37 | db = client[db_name] 38 | return db 39 | 40 | 41 | def make_pipeline(): 42 | # complete the aggregation pipeline 43 | pipeline = [ ] 44 | pipeline.append({'$match': {'name': {'$exists': 1}}}) 45 | pipeline.append({'$group': {'_id': '$name', 46 | 'count': {'$sum':1}}}) 47 | pipeline.append({'$sort': {'count': -1}}) 48 | pipeline.append({'$limit':1}) 49 | return pipeline 50 | 51 | 52 | def aggregate(db, pipeline): 53 | return [doc for doc in db.cities.aggregate(pipeline)] 54 | 55 | 56 | if __name__ == '__main__': 57 | # The following statements will be used to test your code by the grader. 58 | # Any modifications to the code past this point will not be reflected by 59 | # the Test Run. 60 | db = get_db('examples') 61 | pipeline = make_pipeline() 62 | result = aggregate(db, pipeline) 63 | import pprint 64 | pprint.pprint(result[0]) 65 | assert len(result) == 1 66 | assert result[0] == {'_id': 'Shahpur', 'count': 6} 67 | -------------------------------------------------------------------------------- /Lesson 5 - Analyzing Data/Problem Set 5/02-Region Cities/Region Cities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Use an aggregation query to answer the following question. 4 | 5 | Which Region in India has the largest number of cities with longitude between 6 | 75 and 80? 7 | 8 | Please modify only the 'make_pipeline' function so that it creates and returns 9 | an aggregation pipeline that can be passed to the MongoDB aggregate function. 10 | As in our examples in this lesson, the aggregation pipeline should be a list of 11 | one or more dictionary objects. Please review the lesson examples if you are 12 | unsure of the syntax. 13 | 14 | Your code will be run against a MongoDB instance that we have provided. If you 15 | want to run this code locally on your machine, you have to install MongoDB, 16 | download and insert the dataset. For instructions related to MongoDB setup and 17 | datasets please see Course Materials. 18 | 19 | Please note that the dataset you are using here is a different version of the 20 | cities collection provided in the course materials. If you attempt some of the 21 | same queries that we look at in the problem set, your results may be different. 22 | """ 23 | 24 | 25 | def get_db(db_name): 26 | from pymongo import MongoClient 27 | client = MongoClient('localhost:27017') 28 | db = client[db_name] 29 | return db 30 | 31 | 32 | def make_pipeline(): 33 | # complete the aggregation pipeline 34 | pipeline = [] 35 | 36 | match = {'$match': {'country': 'India', 37 | 'lon': {'$gte': 75, 38 | '$lte': 80}}} 39 | pipeline.append(match) 40 | 41 | unwind = {'$unwind': '$isPartOf'} 42 | pipeline.append(unwind) 43 | 44 | group = {'$group': {'_id': '$isPartOf', 45 | 'count': {'$sum': 1}}} 46 | pipeline.append(group) 47 | 48 | pipeline.append({'$sort': {'count': -1}}) 49 | pipeline.append({'$limit': 1}) 50 | 51 | return pipeline 52 | 53 | 54 | def aggregate(db, pipeline): 55 | return [doc for doc in db.cities.aggregate(pipeline)] 56 | 57 | 58 | if __name__ == '__main__': 59 | # The following statements will be used to test your code by the grader. 60 | # Any modifications to the code past this point will not be reflected by 61 | # the Test Run. 
62 | db = get_db('examples') 63 | pipeline = make_pipeline() 64 | result = aggregate(db, pipeline) 65 | import pprint 66 | 67 | pprint.pprint(result[0]) 68 | assert len(result) == 1 69 | assert result[0]["_id"] == 'Tamil Nadu' 70 | assert result[0]["count"] == 424 71 | -------------------------------------------------------------------------------- /Lesson 5 - Analyzing Data/Problem Set 5/03-Average Population/Average Population.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Use an aggregation query to answer the following question. 4 | 5 | Extrapolating from an earlier exercise in this lesson, find the average 6 | regional city population for all countries in the cities collection. What we 7 | are asking here is that you first calculate the average city population for each 8 | region in a country and then calculate the average of all the regional averages 9 | for a country. 10 | As a hint, _id fields in group stages need not be single values. They can 11 | also be compound keys (documents composed of multiple fields). You will use the 12 | same aggregation operator in more than one stage in writing this aggregation 13 | query. I encourage you to write it one stage at a time and test after writing 14 | each stage. 15 | 16 | Please modify only the 'make_pipeline' function so that it creates and returns 17 | an aggregation pipeline that can be passed to the MongoDB aggregate function. 18 | As in our examples in this lesson, the aggregation pipeline should be a list of 19 | one or more dictionary objects. Please review the lesson examples if you are 20 | unsure of the syntax. 21 | 22 | Your code will be run against a MongoDB instance that we have provided. If you 23 | want to run this code locally on your machine, you have to install MongoDB, 24 | download and insert the dataset. For instructions related to MongoDB setup and 25 | datasets please see Course Materials. 26 | 27 | Please note that the dataset you are using here is a different version of the 28 | cities collection provided in the course materials. If you attempt some of the 29 | same queries that we look at in the problem set, your results may be different. 30 | """ 31 | 32 | 33 | def get_db(db_name): 34 | from pymongo import MongoClient 35 | client = MongoClient('localhost:27017') 36 | db = client[db_name] 37 | return db 38 | 39 | 40 | def make_pipeline(): 41 | # complete the aggregation pipeline 42 | pipeline = [] 43 | 44 | pipeline.append({'$unwind': '$isPartOf'}) 45 | 46 | group = {'$group': {'_id': {'country': '$country', 47 | 'region': '$isPartOf'}, 48 | 'avg': {'$avg': '$population'}}} 49 | pipeline.append(group) 50 | 51 | group = {'$group': {'_id': '$_id.country', 52 | 'avgRegionalPopulation': {'$avg': '$avg'}}} 53 | pipeline.append(group) 54 | return pipeline 55 | 56 | 57 | def aggregate(db, pipeline): 58 | return [doc for doc in db.cities.aggregate(pipeline)] 59 | 60 | 61 | if __name__ == '__main__': 62 | # The following statements will be used to test your code by the grader. 63 | # Any modifications to the code past this point will not be reflected by 64 | # the Test Run. 
65 | db = get_db('examples') 66 | pipeline = make_pipeline() 67 | result = aggregate(db, pipeline) 68 | import pprint 69 | 70 | if len(result) < 150: 71 | pprint.pprint(result) 72 | else: 73 | pprint.pprint(result[:100]) 74 | key_pop = 0 75 | for country in result: 76 | if country["_id"] == 'Lithuania': 77 | assert country["_id"] == 'Lithuania' 78 | assert abs(country["avgRegionalPopulation"] - 14750.784447977203) < 1e-10 79 | key_pop = country["avgRegionalPopulation"] 80 | assert {'_id': 'Lithuania', 'avgRegionalPopulation': key_pop} in result 81 | -------------------------------------------------------------------------------- /Lesson 6 - Case Study - OpenStreetMap Data/03-Iterative Parsing/example.osm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /Lesson 6 - Case Study - OpenStreetMap Data/03-Iterative Parsing/mapparser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Your task is to use the iterative parsing to process the map file and 5 | find out not only what tags are there, but also how many, to get the 6 | feeling on how much of which data you can expect to have in the map. 7 | Fill out the count_tags function. It should return a dictionary with the 8 | tag name as the key and number of times this tag can be encountered in 9 | the map as value. 10 | 11 | Note that your code will be tested with a different data file than the 'example.osm' 12 | """ 13 | 14 | import xml.etree.ElementTree as ET 15 | import pprint 16 | 17 | 18 | def count_tags(filename): 19 | # YOUR CODE HERE 20 | tags = {} 21 | for event, elem in ET.iterparse(filename): 22 | if elem.tag not in tags: 23 | tags[elem.tag] = 1 24 | else: 25 | tags[elem.tag] += 1 26 | return tags 27 | 28 | 29 | def test(): 30 | tags = count_tags('example.osm') 31 | pprint.pprint(tags) 32 | assert tags == {'bounds': 1, 33 | 'member': 3, 34 | 'nd': 4, 35 | 'node': 20, 36 | 'osm': 1, 37 | 'relation': 1, 38 | 'tag': 7, 39 | 'way': 1} 40 | 41 | 42 | if __name__ == "__main__": 43 | test() 44 | -------------------------------------------------------------------------------- /Lesson 6 - Case Study - OpenStreetMap Data/07-Tag Types/example.osm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /Lesson 6 - Case Study - OpenStreetMap Data/07-Tag Types/tags.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import xml.etree.cElementTree as ET 5 | import pprint 6 | import re 7 | 8 | """ 9 | Your task is to explore the data a bit more. 10 | Before you process the data and add it into your database, you should check the 11 | "k" value for each "" and see if there are any potential problems. 12 | 13 | We have provided you with 3 regular expressions to check for certain patterns 14 | in the tags. 
As we saw in the quiz earlier, we would like to change the data 15 | model and expand the "addr:street" type of keys to a dictionary like this: 16 | {"address": {"street": "Some value"}} 17 | So, we have to see if we have such tags, and if we have any tags with 18 | problematic characters. 19 | 20 | Please complete the function 'key_type', such that we have a count of each of 21 | four tag categories in a dictionary: 22 | "lower", for tags that contain only lowercase letters and are valid, 23 | "lower_colon", for otherwise valid tags with a colon in their names, 24 | "problemchars", for tags with problematic characters, and 25 | "other", for other tags that do not fall into the other three categories. 26 | See the 'process_map' and 'test' functions for examples of the expected format. 27 | """ 28 | 29 | lower = re.compile(r'^([a-z]|_)*$') 30 | lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$') 31 | problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]') 32 | 33 | 34 | def key_type(element, keys): 35 | if element.tag == "tag": 36 | # YOUR CODE HERE 37 | 38 | if lower.search(element.attrib['k']): 39 | keys['lower'] += 1 40 | elif lower_colon.search(element.attrib['k']): 41 | keys['lower_colon'] += 1 42 | elif problemchars.search(element.attrib['k']): 43 | keys['problemchars'] += 1 44 | else: 45 | keys['other'] += 1 46 | 47 | return keys 48 | 49 | 50 | def process_map(filename): 51 | keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0} 52 | for _, element in ET.iterparse(filename): 53 | keys = key_type(element, keys) 54 | 55 | return keys 56 | 57 | 58 | def test(): 59 | # You can use another testfile 'map.osm' to look at your solution 60 | # Note that the assertion below will be incorrect then. 61 | # Note as well that the test function here is only used in the Test Run; 62 | # when you submit, your code will be checked against a different dataset. 63 | keys = process_map('example.osm') 64 | pprint.pprint(keys) 65 | assert keys == {'lower': 5, 'lower_colon': 0, 'other': 1, 'problemchars': 1} 66 | 67 | 68 | if __name__ == "__main__": 69 | test() 70 | -------------------------------------------------------------------------------- /Lesson 6 - Case Study - OpenStreetMap Data/08-Exploring Users/example.osm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /Lesson 6 - Case Study - OpenStreetMap Data/08-Exploring Users/users.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import xml.etree.cElementTree as ET 5 | import pprint 6 | 7 | """ 8 | Your task is to explore the data a bit more. 9 | The first task is a fun one - find out how many unique users 10 | have contributed to the map in this particular area! 
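In the OSM XML, top-level node, way and relation elements carry "uid" and "user" attributes identifying the
contributor who last edited them, for example (hypothetical element, not taken from example.osm): a node with
attributes id="123", uid="4567", user="somebody". Child elements such as tag and nd do not, which is why the
code below checks that "uid" is present in element.attrib before adding it to the set.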
11 | 12 | The function process_map should return a set of unique user IDs ("uid") 13 | """ 14 | 15 | 16 | def get_user(element): 17 | return 18 | 19 | 20 | def process_map(filename): 21 | users = set() 22 | for _, element in ET.iterparse(filename): 23 | if "uid" in element.attrib: 24 | users.add(element.attrib["uid"]) 25 | return users 26 | 27 | 28 | def test(): 29 | 30 | users = process_map('example.osm') 31 | pprint.pprint(users) 32 | assert len(users) == 6 33 | 34 | 35 | if __name__ == "__main__": 36 | test() 37 | -------------------------------------------------------------------------------- /Lesson 6 - Case Study - OpenStreetMap Data/11-Improving Street Names/audit.py: -------------------------------------------------------------------------------- 1 | """ 2 | Your task in this exercise has two steps: 3 | 4 | - audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 5 | the unexpected street types to the appropriate ones in the expected list. 6 | You have to add mappings only for the actual problems you find in this OSMFILE, 7 | not a generalized solution, since that may and will depend on the particular area you are auditing. 8 | - write the update_name function, to actually fix the street name. 9 | The function takes a string with street name as an argument and should return the fixed name 10 | We have provided a simple test so that you see what exactly is expected 11 | """ 12 | 13 | import xml.etree.cElementTree as ET 14 | from collections import defaultdict 15 | import re 16 | import pprint 17 | 18 | OSMFILE = "example.osm" 19 | street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE) 20 | 21 | 22 | expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 23 | "Trail", "Parkway", "Commons"] 24 | 25 | # UPDATE THIS VARIABLE 26 | mapping = { "St": "Street", 27 | "St.": "Street", 28 | "Ave": "Avenue", 29 | "Rd.": "Road" 30 | } 31 | 32 | 33 | def audit_street_type(street_types, street_name): 34 | m = street_type_re.search(street_name) 35 | if m: 36 | street_type = m.group() 37 | if street_type not in expected: 38 | street_types[street_type].add(street_name) 39 | 40 | 41 | def is_street_name(elem): 42 | return (elem.attrib['k'] == "addr:street") 43 | 44 | 45 | def audit(osmfile): 46 | osm_file = open(osmfile, "r") 47 | street_types = defaultdict(set) 48 | for event, elem in ET.iterparse(osm_file, events=("start",)): 49 | 50 | if elem.tag == "node" or elem.tag == "way": 51 | for tag in elem.iter("tag"): 52 | if is_street_name(tag): 53 | audit_street_type(street_types, tag.attrib['v']) 54 | osm_file.close() 55 | return street_types 56 | 57 | 58 | def update_name(name, mapping): 59 | 60 | # YOUR CODE HERE 61 | m = street_type_re.search(name) 62 | if m: 63 | street_type = m.group() 64 | if street_type not in expected: 65 | name = re.sub(street_type_re, mapping[street_type], name) 66 | 67 | return name 68 | 69 | 70 | def test(): 71 | st_types = audit(OSMFILE) 72 | assert len(st_types) == 3 73 | pprint.pprint(dict(st_types)) 74 | 75 | for st_type, ways in st_types.iteritems(): 76 | for name in ways: 77 | better_name = update_name(name, mapping) 78 | print name, "=>", better_name 79 | if name == "West Lexington St.": 80 | assert better_name == "West Lexington Street" 81 | if name == "Baldwin Rd.": 82 | assert better_name == "Baldwin Road" 83 | 84 | 85 | if __name__ == '__main__': 86 | test() 87 | -------------------------------------------------------------------------------- /Lesson 6 - Case Study - OpenStreetMap 
Data/11-Improving Street Names/example.osm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /Lesson 6 - Case Study - OpenStreetMap Data/12-Preparing for Database - MongoDB/data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import xml.etree.cElementTree as ET 5 | import pprint 6 | import re 7 | import codecs 8 | import json 9 | 10 | """ 11 | Your task is to wrangle the data and transform the shape of the data 12 | into the model we mentioned earlier. The output should be a list of dictionaries 13 | that look like this: 14 | 15 | { 16 | "id": "2406124091", 17 | "type": "node", 18 | "visible":"true", 19 | "created": { 20 | "version":"2", 21 | "changeset":"17206049", 22 | "timestamp":"2013-08-03T16:43:42Z", 23 | "user":"linuxUser16", 24 | "uid":"1219059" 25 | }, 26 | "pos": [41.9757030, -87.6921867], 27 | "address": { 28 | "housenumber": "5157", 29 | "postcode": "60625", 30 | "street": "North Lincoln Ave" 31 | }, 32 | "amenity": "restaurant", 33 | "cuisine": "mexican", 34 | "name": "La Cabana De Don Luis", 35 | "phone": "1 (773)-271-5176" 36 | } 37 | 38 | You have to complete the function 'shape_element'. 39 | We have provided a function that will parse the map file, and call the function with the element 40 | as an argument. You should return a dictionary, containing the shaped data for that element. 41 | We have also provided a way to save the data in a file, so that you could use 42 | mongoimport later on to import the shaped data into MongoDB. 43 | 44 | Note that in this exercise we do not use the 'update street name' procedures 45 | you worked on in the previous exercise. If you are using this code in your final 46 | project, you are strongly encouraged to use the code from the previous exercise to 47 | update the street names before you save them to JSON. 48 | 49 | In particular the following things should be done: 50 | - you should process only 2 types of top level tags: "node" and "way" 51 | - all attributes of "node" and "way" should be turned into regular key/value pairs, except: 52 | - attributes in the CREATED array should be added under a key "created" 53 | - attributes for latitude and longitude should be added to a "pos" array, 54 | for use in geospatial indexing. Make sure the values inside the "pos" array are floats 55 | and not strings. 56 | - if the second level tag "k" value contains problematic characters, it should be ignored 57 | - if the second level tag "k" value starts with "addr:", it should be added to a dictionary "address" 58 | - if the second level tag "k" value does not start with "addr:", but contains ":", you can 59 | process it in a way that you feel is best. For example, you might split it into a two-level 60 | dictionary like with "addr:", or otherwise convert the ":" to create a valid key. 61 | - if there is a second ":" that separates the type/direction of a street, 62 | the tag should be ignored, for example: 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | should be turned into: 72 | 73 | {... 74 | "address": { 75 | "housenumber": 5158, 76 | "street": "North Lincoln Avenue" 77 | }, 78 | "amenity": "pharmacy", 79 | ...
80 | } 81 | 82 | - for "way" specifically: 83 | 84 | 85 | 86 | 87 | should be turned into 88 | "node_refs": ["305896090", "1719825889"] 89 | """ 90 | 91 | lower = re.compile(r'^([a-z]|_)*$') 92 | lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$') 93 | problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]') 94 | 95 | CREATED = ["version", "changeset", "timestamp", "user", "uid"] 96 | 97 | 98 | def shape_element(element): 99 | node = {} 100 | if element.tag == "node" or element.tag == "way": 101 | # YOUR CODE HERE 102 | node['type'] = element.tag 103 | 104 | # Parse attributes 105 | for a in element.attrib: 106 | if a in CREATED: 107 | if 'created' not in node: 108 | node['created'] = {} 109 | node['created'][a] = element.attrib[a] 110 | 111 | elif a in ['lat', 'lon']: 112 | if 'pos' not in node: 113 | node['pos'] = [None, None] 114 | if a == 'lat': 115 | node['pos'][0] = float(element.attrib[a]) 116 | else: 117 | node['pos'][1] = float(element.attrib[a]) 118 | 119 | else: 120 | node[a] = element.attrib[a] 121 | 122 | # Iterate tag children 123 | for tag in element.iter("tag"): 124 | if not problemchars.search(tag.attrib['k']): 125 | # Tags with single colon 126 | if lower_colon.search(tag.attrib['k']): 127 | 128 | # Single colon beginning with addr 129 | if tag.attrib['k'].find('addr') == 0: 130 | if 'address' not in node: 131 | node['address'] = {} 132 | 133 | sub_attr = tag.attrib['k'].split(':', 1) 134 | node['address'][sub_attr[1]] = tag.attrib['v'] 135 | 136 | # All other single colons processed normally 137 | else: 138 | node[tag.attrib['k']] = tag.attrib['v'] 139 | 140 | # Tags with no colon 141 | elif tag.attrib['k'].find(':') == -1: 142 | node[tag.attrib['k']] = tag.attrib['v'] 143 | 144 | # Iterate nd children 145 | for nd in element.iter("nd"): 146 | if 'node_refs' not in node: 147 | node['node_refs'] = [] 148 | node['node_refs'].append(nd.attrib['ref']) 149 | 150 | return node 151 | else: 152 | return None 153 | 154 | 155 | def process_map(file_in, pretty=False): 156 | # You do not need to change this file 157 | file_out = "{0}.json".format(file_in) 158 | data = [] 159 | with codecs.open(file_out, "w") as fo: 160 | for _, element in ET.iterparse(file_in): 161 | el = shape_element(element) 162 | if el: 163 | data.append(el) 164 | if pretty: 165 | fo.write(json.dumps(el, indent=2) + "\n") 166 | else: 167 | fo.write(json.dumps(el) + "\n") 168 | return data 169 | 170 | 171 | def test(): 172 | # NOTE: if you are running this code on your computer, with a larger dataset, 173 | # call the process_map procedure with pretty=False. The pretty=True option adds 174 | # additional spaces to the output, making it significantly larger. 
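    # Once example.osm.json has been written, one way to load it into MongoDB from the
    # command line is mongoimport. A sketch only (it assumes a local mongod on the default
    # port; the 'examples' database matches earlier lessons, and the collection name 'osm'
    # is just an illustrative choice):
    #
    #   mongoimport --db examples --collection osm --file example.osm.json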
175 | data = process_map('example.osm', True) 176 | # pprint.pprint(data) 177 | 178 | correct_first_elem = { 179 | "id": "261114295", 180 | "visible": "true", 181 | "type": "node", 182 | "pos": [41.9730791, -87.6866303], 183 | "created": { 184 | "changeset": "11129782", 185 | "user": "bbmiller", 186 | "version": "7", 187 | "uid": "451048", 188 | "timestamp": "2012-03-28T18:31:23Z" 189 | } 190 | } 191 | assert data[0] == correct_first_elem 192 | assert data[-1]["address"] == { 193 | "street": "West Lexington St.", 194 | "housenumber": "1412" 195 | } 196 | assert data[-1]["node_refs"] == ["2199822281", "2199822390", "2199822392", "2199822369", 197 | "2199822370", "2199822284", "2199822281"] 198 | 199 | 200 | if __name__ == "__main__": 201 | test() 202 | -------------------------------------------------------------------------------- /Lesson 6 - Case Study - OpenStreetMap Data/12-Preparing for Database - MongoDB/example.osm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /Lesson 6 - Case Study - OpenStreetMap Data/12-Preparing for Database - MongoDB/example.osm.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "261114295", 3 | "visible": "true", 4 | "type": "node", 5 | "pos": [ 6 | 41.9730791, 7 | -87.6866303 8 | ], 9 | "created": { 10 | "changeset": "11129782", 11 | "user": "bbmiller", 12 | "version": "7", 13 | "uid": "451048", 14 | "timestamp": "2012-03-28T18:31:23Z" 15 | } 16 | } 17 | { 18 | "id": "261114296", 19 | "visible": "true", 20 | "type": "node", 21 | "pos": [ 22 | 41.9730416, 23 | -87.6878512 24 | ], 25 | "created": { 26 | "changeset": "8448766", 27 | "user": "bbmiller", 28 | "version": "6", 29 | "uid": "451048", 30 | "timestamp": "2011-06-15T17:04:54Z" 31 | } 32 | } 33 | { 34 | "id": "261114299", 35 | "visible": "true", 36 | "type": "node", 37 | "pos": [ 38 | 41.9729565, 39 | -87.6939548 40 | ], 41 | "created": { 42 | "changeset": "8581395", 43 | "user": "bbmiller", 44 | "version": "5", 45 | "uid": "451048", 46 | "timestamp": "2011-06-29T14:14:14Z" 47 | } 48 | } 49 | { 50 | "id": "261146436", 51 | "visible": "true", 52 | "type": "node", 53 | "pos": [ 54 | 41.970738, 55 | -87.6976025 56 | ], 57 | "created": { 58 | "changeset": "8581395", 59 | "user": "bbmiller", 60 | "version": "5", 61 | "uid": "451048", 62 | "timestamp": "2011-06-29T14:14:14Z" 63 | } 64 | } 65 | { 66 | "id": "261147304", 67 | "visible": "true", 68 | "type": "node", 69 | "pos": [ 70 | 41.9740068, 71 | -87.6988576 72 | ], 73 | "created": { 74 | "changeset": "8581395", 75 | "user": "bbmiller", 76 | "version": "7", 77 | "uid": "451048", 78 | "timestamp": "2011-06-29T14:14:15Z" 79 | } 80 | } 81 | { 82 | "id": "261224274", 83 | "visible": "true", 84 | "type": "node", 85 | "pos": [ 86 | 41.9707656, 87 | -87.6938669 88 | ], 89 | "created": { 90 | "changeset": "8581395", 91 | "user": "bbmiller", 92 | "version": "5", 93 | "uid": "451048", 94 | "timestamp": "2011-06-29T14:14:14Z" 95 | } 96 | } 97 | { 98 | "id": "293816175", 99 | "visible": "true", 100 | "type": "node", 101 | "pos": [ 102 | 41.9730154, 103 | -87.6890403 104 | ], 105 | "created": { 106 | "changeset": "8448766", 107 | "user": "bbmiller", 108 | "version": "47", 109 | "uid": "451048", 110 | 
"timestamp": "2011-06-15T16:55:37Z" 111 | } 112 | } 113 | { 114 | "id": "305896090", 115 | "visible": "true", 116 | "type": "node", 117 | "pos": [ 118 | 41.9749225, 119 | -87.6891198 120 | ], 121 | "created": { 122 | "changeset": "15348240", 123 | "user": "Umbugbene", 124 | "version": "37", 125 | "uid": "567034", 126 | "timestamp": "2013-03-13T07:46:29Z" 127 | } 128 | } 129 | { 130 | "id": "317636974", 131 | "visible": "true", 132 | "type": "node", 133 | "pos": [ 134 | 41.9740292, 135 | -87.701243 136 | ], 137 | "created": { 138 | "changeset": "15348240", 139 | "user": "Umbugbene", 140 | "version": "12", 141 | "uid": "567034", 142 | "timestamp": "2013-03-13T08:02:56Z" 143 | } 144 | } 145 | { 146 | "id": "317636971", 147 | "visible": "true", 148 | "type": "node", 149 | "pos": [ 150 | 41.9740556, 151 | -87.6979712 152 | ], 153 | "created": { 154 | "changeset": "15348240", 155 | "user": "Umbugbene", 156 | "version": "13", 157 | "uid": "567034", 158 | "timestamp": "2013-03-13T08:08:01Z" 159 | } 160 | } 161 | { 162 | "id": "317637399", 163 | "visible": "true", 164 | "type": "node", 165 | "pos": [ 166 | 41.9705609, 167 | -87.7012048 168 | ], 169 | "created": { 170 | "changeset": "14927972", 171 | "user": "Umbugbene", 172 | "version": "2", 173 | "uid": "567034", 174 | "timestamp": "2013-02-05T22:43:49Z" 175 | } 176 | } 177 | { 178 | "id": "317637398", 179 | "visible": "true", 180 | "type": "node", 181 | "pos": [ 182 | 41.9706972, 183 | -87.7012109 184 | ], 185 | "created": { 186 | "changeset": "14927972", 187 | "user": "Umbugbene", 188 | "version": "2", 189 | "uid": "567034", 190 | "timestamp": "2013-02-05T22:43:49Z" 191 | } 192 | } 193 | { 194 | "id": "365214872", 195 | "visible": "true", 196 | "type": "node", 197 | "pos": [ 198 | 41.973113, 199 | -87.6847998 200 | ], 201 | "created": { 202 | "changeset": "8448766", 203 | "user": "bbmiller", 204 | "version": "3", 205 | "uid": "451048", 206 | "timestamp": "2011-06-15T17:04:54Z" 207 | } 208 | } 209 | { 210 | "id": "261299091", 211 | "visible": "true", 212 | "type": "node", 213 | "pos": [ 214 | 41.9747482, 215 | -87.6988886 216 | ], 217 | "created": { 218 | "changeset": "8581395", 219 | "user": "bbmiller", 220 | "version": "6", 221 | "uid": "451048", 222 | "timestamp": "2011-06-29T14:14:15Z" 223 | } 224 | } 225 | { 226 | "id": "261114294", 227 | "visible": "true", 228 | "type": "node", 229 | "pos": [ 230 | 41.9731219, 231 | -87.6841979 232 | ], 233 | "created": { 234 | "changeset": "8448766", 235 | "user": "bbmiller", 236 | "version": "6", 237 | "uid": "451048", 238 | "timestamp": "2011-06-15T17:04:54Z" 239 | } 240 | } 241 | { 242 | "id": "261210804", 243 | "visible": "true", 244 | "type": "node", 245 | "pos": [ 246 | 41.9707217, 247 | -87.7000019 248 | ], 249 | "created": { 250 | "changeset": "3359748", 251 | "user": "woodpeck_fixbot", 252 | "version": "4", 253 | "uid": "147510", 254 | "timestamp": "2009-12-13T00:36:09Z" 255 | } 256 | } 257 | { 258 | "id": "261221422", 259 | "visible": "true", 260 | "type": "node", 261 | "pos": [ 262 | 41.9748542, 263 | -87.6922652 264 | ], 265 | "created": { 266 | "changeset": "8581395", 267 | "user": "bbmiller", 268 | "version": "7", 269 | "uid": "451048", 270 | "timestamp": "2011-06-29T14:14:15Z" 271 | } 272 | } 273 | { 274 | "created": { 275 | "changeset": "8581395", 276 | "user": "bbmiller", 277 | "version": "7", 278 | "uid": "451048", 279 | "timestamp": "2011-06-29T14:14:15Z" 280 | }, 281 | "pos": [ 282 | 41.9758794, 283 | -87.6923639 284 | ], 285 | "visible": "true", 286 | "type": "node", 287 | "id": 
"261221424", 288 | "highway": "traffic_signals" 289 | } 290 | { 291 | "id": "261198953", 292 | "visible": "true", 293 | "type": "node", 294 | "pos": [ 295 | 41.9707413, 296 | -87.6963097 297 | ], 298 | "created": { 299 | "changeset": "8581395", 300 | "user": "bbmiller", 301 | "version": "6", 302 | "uid": "451048", 303 | "timestamp": "2011-06-29T14:14:13Z" 304 | } 305 | } 306 | { 307 | "cuisine": "sausage", 308 | "amenity": "fast_food", 309 | "name": "Shelly's Tasty Freeze", 310 | "created": { 311 | "changeset": "5288876", 312 | "user": "uboot", 313 | "version": "2", 314 | "uid": "26299", 315 | "timestamp": "2010-07-22T16:16:51Z" 316 | }, 317 | "pos": [ 318 | 41.9747374, 319 | -87.6920102 320 | ], 321 | "visible": "true", 322 | "type": "node", 323 | "id": "757860928" 324 | } 325 | { 326 | "node_refs": [ 327 | "2636086179", 328 | "2636086178", 329 | "2636086177", 330 | "2636086176" 331 | ], 332 | "created": { 333 | "changeset": "20187382", 334 | "user": "linuxUser16", 335 | "version": "1", 336 | "uid": "1219059", 337 | "timestamp": "2014-01-25T02:01:54Z" 338 | }, 339 | "visible": "true", 340 | "type": "way", 341 | "id": "258219703", 342 | "highway": "service" 343 | } 344 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Udacity: [Data Wrangling with MongoDB](https://www.udacity.com/course/data-wrangling-with-mongodb--ud032) 2 | 3 | ## [Course Contents](https://www.udacity.com/wiki/ud032) 4 | 5 | This repository contains solutions to all quizzes and problem sets. 6 | 7 | This coursework requires **Python 2.7** and the following Python libraries: 8 | 9 | - [os](https://docs.python.org/2/library/os.html) 10 | - [csv](https://docs.python.org/2/library/csv.html) 11 | - [xlrd](http://xlrd.readthedocs.io/en/latest/) 12 | - [zipfile](https://docs.python.org/2/library/zipfile.html) 13 | - [json](https://docs.python.org/2/library/json.html#module-json) 14 | - [requests](http://docs.python-requests.org/en/master/) 15 | - [codecs](https://docs.python.org/2/library/codecs.html) 16 | - [xml.etree.ElementTree](https://docs.python.org/2/library/xml.etree.elementtree.html) 17 | - [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) 18 | - [pprint](https://docs.python.org/2/library/pprint.html) 19 | - [pymongo](https://api.mongodb.com/python/current/) 20 | - [re](https://docs.python.org/2/library/re.html) 21 | --------------------------------------------------------------------------------