├── .gitignore
├── 600Solutions.sql
├── Mission103Solutions.ipynb
├── Mission146Solutions.ipynb
├── Mission149Solutions.ipynb
├── Mission155Solutions.ipynb
├── Mission165Solutions.ipynb
├── Mission167Solutions.ipynb
├── Mission177Solutions.ipynb
├── Mission188Solution.ipynb
├── Mission191Solutions.ipynb
├── Mission193Solutions.ipynb
├── Mission201Solution.ipynb
├── Mission202Solution.ipynb
├── Mission205Solutions.ipynb
├── Mission207Solutions.ipynb
├── Mission209Solution.ipynb
├── Mission210Solution.ipynb
├── Mission211Solution.ipynb
├── Mission213Solution.ipynb
├── Mission215Solutions.ipynb
├── Mission216Solutions.ipynb
├── Mission217Solutions.ipynb
├── Mission218Solution.ipynb
├── Mission219Solution.ipynb
├── Mission227Solutions.ipynb
├── Mission234Solutions.ipynb
├── Mission240Solutions.ipynb
├── Mission244Solutions.ipynb
├── Mission251Solution.ipynb
├── Mission251Solutions.py
├── Mission257Solutions.ipynb
├── Mission267Solutions.ipynb
├── Mission277Solutions.Rmd
├── Mission280Solutions.ipynb
├── Mission288Solutions.ipynb
├── Mission294Solutions.ipynb
├── Mission304Solutions.ipynb
├── Mission310Solutions.ipynb
├── Mission327Solutions.Rmd
├── Mission348Solutions.ipynb
├── Mission349Solutions.ipynb
├── Mission350Solutions.ipynb
├── Mission356Solutions.ipynb
├── Mission368Solutions.ipynb
├── Mission374Solutions.Rmd
├── Mission376Solutions.Rmd
├── Mission382Solutions.ipynb
├── Mission409Solutions.Rmd
├── Mission410Solutions.Rmd
├── Mission433Solutions.ipynb
├── Mission443Solutions.Rmd
├── Mission449Solutions.Rmd
├── Mission459Solutions.Rmd
├── Mission469Solutions.ipynb
├── Mission475Solutions.Rmd
├── Mission481Solution.ipynb
├── Mission481Solutions.ipynb
├── Mission487Solutions.Rmd
├── Mission498Solutions.Rmd
├── Mission505Solutions.Rmd
├── Mission516Solutions.Rmd
├── Mission518Solutions.Rmd
├── Mission524Solutions.ipynb
├── Mission529Solutions.ipynb
├── Mission530Solutions.ipynb
├── Mission559Solutions.ipynb
├── Mission564Solutions.ipynb
├── Mission569Solutions.ipynb
├── Mission571Solutions.Rmd
├── Mission572Solutions.Rmd
├── Mission610Solutions.ipynb
├── Mission612Solutions.ipynb
├── Mission718Solutions.ipynb
├── Mission730Solutions.ipynb
├── Mission735Solutions.ipynb
├── Mission740Solutions.ipynb
├── Mission745Solutions.ipynb
├── Mission750Solutions.ipynb
├── Mission755Solutions.ipynb
├── Mission764Solutions.ipynb
├── Mission777Solutions.ipynb
├── Mission784Solutions.ipynb
├── Mission790Solutions.ipynb
├── Mission797Solutions.ipynb
├── Mission798Solutions.ipynb
├── Mission804Solutions.ipynb
├── Mission855Solutions.ipynb
├── Mission882Solutions.ipynb
├── Mission893Solutions.ipynb
├── Mission903Solutions.py
├── Mission909Solutions.ipynb
├── Mission9Solutions.ipynb
├── README.md
└── images
    └── schema-screenshot.png


/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .idea
3 | .ipynb_checkpoints
4 | __pycache__
5 | temp.py
6 | *.db
7 | 


--------------------------------------------------------------------------------
/600Solutions.sql:
--------------------------------------------------------------------------------
  1 | /* Screen 3 */
  2 | -- Table descriptions
  3 | SELECT 'Customers' AS table_name, 
  4 |        13 AS number_of_attribute,
  5 |        COUNT(*) AS number_of_row
  6 |   FROM Customers
  7 |   
  8 | UNION ALL
  9 | 
 10 | SELECT 'Products' AS table_name, 
 11 |        9 AS number_of_attribute,
 12 |        COUNT(*) AS number_of_row
 13 |   FROM Products
 14 | 
 15 | UNION ALL
 16 | 
 17 | SELECT 'ProductLines' AS table_name, 
 18 |        4 AS number_of_attribute,
 19 |        COUNT(*) AS number_of_row
 20 |   FROM ProductLines
 21 | 
 22 | UNION ALL
 23 | 
 24 | SELECT 'Orders' AS table_name, 
 25 |        7 AS number_of_attribute,
 26 |        COUNT(*) AS number_of_row
 27 |   FROM Orders
 28 | 
 29 | UNION ALL
 30 | 
 31 | SELECT 'OrderDetails' AS table_name, 
 32 |        5 AS number_of_attribute,
 33 |        COUNT(*) AS number_of_row
 34 |   FROM OrderDetails
 35 | 
 36 | UNION ALL
 37 | 
 38 | SELECT 'Payments' AS table_name, 
 39 |        4 AS number_of_attribute,
 40 |        COUNT(*) AS number_of_row
 41 |   FROM Payments
 42 | 
 43 | UNION ALL
 44 | 
 45 | SELECT 'Employees' AS table_name, 
 46 |        8 AS number_of_attribute,
 47 |        COUNT(*) AS number_of_row
 48 |   FROM Employees
 49 | 
 50 | UNION ALL
 51 | 
 52 | SELECT 'Offices' AS table_name, 
 53 |        9 AS number_of_attribute,
 54 |        COUNT(*) AS number_of_row
 55 |   FROM Offices;
 56 | 
 57 | /* Screen 4 */
 58 | --Low stock
 59 | SELECT productCode, 
 60 |        ROUND(SUM(quantityOrdered) * 1.0 / (SELECT quantityInStock
 61 |                                              FROM products p
 62 |                                             WHERE od.productCode = p.productCode), 2) AS low_stock
 63 |   FROM orderdetails od
 64 |  GROUP BY productCode
 65 |  ORDER BY low_stock DESC
 66 |  LIMIT 10;
 67 |  
 68 | 
 69 | -- Product performance
 70 | SELECT productCode, 
 71 |        SUM(quantityOrdered * priceEach) AS prod_perf
 72 |   FROM orderdetails od
 73 |  GROUP BY productCode 
 74 |  ORDER BY prod_perf DESC
 75 |  LIMIT 10;
 76 |  
 77 | -- Priority Products for restocking
 78 | WITH 
 79 | 
 80 | low_stock_table AS (
 81 | SELECT productCode, 
 82 |        ROUND(SUM(quantityOrdered) * 1.0/(SELECT quantityInStock
 83 |                                            FROM products p
 84 |                                           WHERE od.productCode = p.productCode), 2) AS low_stock
 85 |   FROM orderdetails od
 86 |  GROUP BY productCode
 87 |  ORDER BY low_stock DESC
 88 |  LIMIT 10
 89 | ),
 90 | 
 91 | products_to_restock AS (
 92 | SELECT productCode, 
 93 |        SUM(quantityOrdered * priceEach) AS prod_perf
 94 |   FROM orderdetails od
 95 |  WHERE productCode IN (SELECT productCode
 96 |                          FROM low_stock_table)
 97 |  GROUP BY productCode 
 98 |  ORDER BY prod_perf DESC
 99 |  LIMIT 10
100 | )
101 |     
102 | SELECT productName, productLine
103 |   FROM products AS p
104 |  WHERE productCode IN (SELECT productCode
105 |                          FROM products_to_restock);
106 | 
107 | 
108 | 
109 | /* Screen 5 */
110 | -- revenue by customer
111 | SELECT o.customerNumber, SUM(quantityOrdered * (priceEach - buyPrice)) AS revenue
112 |   FROM products p
113 |   JOIN orderdetails od
114 |     ON p.productCode = od.productCode
115 |   JOIN orders o
116 |     ON o.orderNumber = od.orderNumber
117 |  GROUP BY o.customerNumber;
118 |  
119 | -- Top 5 VIP customers
120 | WITH 
121 | 
122 | money_in_by_customer_table AS (
123 | SELECT o.customerNumber, SUM(quantityOrdered * (priceEach - buyPrice)) AS revenue
124 |   FROM products p
125 |   JOIN orderdetails od
126 |     ON p.productCode = od.productCode
127 |   JOIN orders o
128 |     ON o.orderNumber = od.orderNumber
129 |  GROUP BY o.customerNumber
130 | )
131 | 
132 | SELECT contactLastName, contactFirstName, city, country, mc.revenue
133 |   FROM customers c
134 |   JOIN money_in_by_customer_table mc
135 |     ON mc.customerNumber = c.customerNumber
136 |  ORDER BY mc.revenue DESC
137 |  LIMIT 5;
138 |  
139 |  -- Top 5 less engaging customers
140 | WITH 
141 | 
142 | money_in_by_customer_table AS (
143 | SELECT o.customerNumber, SUM(quantityOrdered * (priceEach - buyPrice)) AS revenue
144 |   FROM products p
145 |   JOIN orderdetails od
146 |     ON p.productCode = od.productCode
147 |   JOIN orders o
148 |     ON o.orderNumber = od.orderNumber
149 |  GROUP BY o.customerNumber
150 | )
151 | 
152 | SELECT contactLastName, contactFirstName, city, country, mc.revenue
153 |   FROM customers c
154 |   JOIN money_in_by_customer_table mc
155 |     ON mc.customerNumber = c.customerNumber
156 |  ORDER BY mc.revenue
157 |  LIMIT 5;
158 |  
159 | -- Customer LTV
160 | WITH 
161 | 
162 | money_in_by_customer_table AS (
163 | SELECT o.customerNumber, SUM(quantityOrdered * (priceEach - buyPrice)) AS revenue
164 |   FROM products p
165 |   JOIN orderdetails od
166 |     ON p.productCode = od.productCode
167 |   JOIN orders o
168 |     ON o.orderNumber = od.orderNumber
169 |  GROUP BY o.customerNumber
170 | )
171 | 
172 | SELECT AVG(mc.revenue) AS ltv
173 |   FROM money_in_by_customer_table mc;
174 |   
175 | 


--------------------------------------------------------------------------------
/Mission207Solutions.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {
 6 |     "collapsed": true
 7 |    },
 8 |    "source": [
 9 |     "## Birth Dates in the United States\n",
10 |     "\n",
11 |     "Here is the raw data behind the story **Some People Are Too Superstitious to Have a Baby on Friday the 13th**, which you can read [here](http://fivethirtyeight.com/features/some-people-are-too-superstitious-to-have-a-baby-on-friday-the-13th/).\n",
12 |     "\n",
13 |     "We'll be working with the dataset from the Centers for Disease Control and Prevention's National National Center for Health Statistics. The dataset has the following structure:\n",
14 |     "\n",
15 |     "- `year` - Year\n",
16 |     "- `month` - Month\n",
17 |     "- `date_of_month` - Day number of the month\n",
18 |     "- `day_of_week` - Day of week, where 1 is Monday and 7 is Sunday\n",
19 |     "- `births` - Number of births"
20 |    ]
21 |   },
22 |   {
23 |    "cell_type": "code",
24 |    "execution_count": null,
25 |    "metadata": {},
26 |    "outputs": [],
27 |    "source": [
28 |     "f = open(\"births.csv\", 'r')\n",
29 |     "text = f.read()\n",
30 |     "print(text)"
31 |    ]
32 |   },
33 |   {
34 |    "cell_type": "code",
35 |    "execution_count": null,
36 |    "metadata": {},
37 |    "outputs": [],
38 |    "source": [
39 |     "lines_list = text.split(\"\\n\")\n",
40 |     "lines_list"
41 |    ]
42 |   },
43 |   {
44 |    "cell_type": "code",
45 |    "execution_count": null,
46 |    "metadata": {},
47 |    "outputs": [],
48 |    "source": [
49 |     "data_no_header = lines_list[1:len(lines_list)]\n",
50 |     "days_counts = dict()\n",
51 |     "\n",
52 |     "for line in data_no_header:\n",
53 |     "    split_line = line.split(\",\")\n",
54 |     "    day_of_week = split_line[3]\n",
55 |     "    num_births = int(split_line[4])\n",
56 |     "\n",
57 |     "    if day_of_week in days_counts:\n",
58 |     "        days_counts[day_of_week] = days_counts[day_of_week] + num_births\n",
59 |     "    else:\n",
60 |     "        days_counts[day_of_week] = num_births\n",
61 |     "\n",
62 |     "days_counts"
63 |    ]
64 |   }
65 |  ],
66 |  "metadata": {
67 |   "anaconda-cloud": {},
68 |   "kernelspec": {
69 |    "display_name": "Python 3",
70 |    "language": "python",
71 |    "name": "python3"
72 |   },
73 |   "language_info": {
74 |    "codemirror_mode": {
75 |     "name": "ipython",
76 |     "version": 3
77 |    },
78 |    "file_extension": ".py",
79 |    "mimetype": "text/x-python",
80 |    "name": "python",
81 |    "nbconvert_exporter": "python",
82 |    "pygments_lexer": "ipython3",
83 |    "version": "3.8.5"
84 |   }
85 |  },
86 |  "nbformat": 4,
87 |  "nbformat_minor": 1
88 | }
89 | 


--------------------------------------------------------------------------------
/Mission215Solutions.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# Introduction to the Data"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {
 16 |     "collapsed": true
 17 |    },
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import pandas as pd\n",
 21 |     "df = pd.read_csv(\"academy_awards.csv\", encoding=\"ISO-8859-1\")\n",
 22 |     "df"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "markdown",
 27 |    "metadata": {},
 28 |    "source": [
 29 |     "# Filtering the Data"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": null,
 35 |    "metadata": {
 36 |     "collapsed": true
 37 |    },
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "df[\"Year\"] = df[\"Year\"].str[0:4]\n",
 41 |     "df[\"Year\"] = df[\"Year\"].astype(\"int64\")\n",
 42 |     "later_than_2000 = df[df[\"Year\"] > 2000]\n",
 43 |     "award_categories = [\"Actor -- Leading Role\",\"Actor -- Supporting Role\", \"Actress -- Leading Role\", \"Actress -- Supporting Role\"]\n",
 44 |     "nominations = later_than_2000[later_than_2000[\"Category\"].isin(award_categories)]"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "# Cleaning up the Won? and Unnamed Columns"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {
 58 |     "collapsed": true
 59 |    },
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "replacements = { \"NO\": 0, \"YES\": 1 }\n",
 63 |     "nominations[\"Won?\"] = nominations[\"Won?\"].map(replacements)\n",
 64 |     "nominations[\"Won\"] = nominations[\"Won?\"]\n",
 65 |     "drop_cols = [\"Won?\",\"Unnamed: 5\", \"Unnamed: 6\",\"Unnamed: 7\", \"Unnamed: 8\", \"Unnamed: 9\", \"Unnamed: 10\"]\n",
 66 |     "final_nominations = nominations.drop(drop_cols, axis=1)"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "markdown",
 71 |    "metadata": {},
 72 |    "source": [
 73 |     "# Cleaning up the Additional Info Column"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": null,
 79 |    "metadata": {
 80 |     "collapsed": true
 81 |    },
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "additional_info_one = final_nominations[\"Additional Info\"].str.rstrip(\"'}\")\n",
 85 |     "additional_info_two = additional_info_one.str.split(\" {'\")\n",
 86 |     "movie_names = additional_info_two.str[0]\n",
 87 |     "characters = additional_info_two.str[1]\n",
 88 |     "final_nominations[\"Movie\"] = movie_names\n",
 89 |     "final_nominations[\"Character\"] = characters\n",
 90 |     "final_nominations = final_nominations.drop(\"Additional Info\", axis=1)\n",
 91 |     "final_nominations"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "metadata": {},
 97 |    "source": [
 98 |     "# Exporting to SQLite"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": null,
104 |    "metadata": {
105 |     "collapsed": true
106 |    },
107 |    "outputs": [],
108 |    "source": [
109 |     "import sqlite3\n",
110 |     "conn = sqlite3.connect(\"nominations.db\")\n",
111 |     "final_nominations.to_sql(\"nominations\", conn, index=False)"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "markdown",
116 |    "metadata": {},
117 |    "source": [
118 |     "# Verifying in SQL"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {
125 |     "collapsed": true
126 |    },
127 |    "outputs": [],
128 |    "source": [
129 |     "query_one = \"pragma table_info(nominations);\"\n",
130 |     "query_two = \"select * from nominations limit 10;\"\n",
131 |     "print(conn.execute(query_one).fetchall())\n",
132 |     "print(conn.execute(query_two).fetchall())\n",
133 |     "conn.close()"
134 |    ]
135 |   }
136 |  ],
137 |  "metadata": {
138 |   "kernelspec": {
139 |    "display_name": "Python 3",
140 |    "language": "python",
141 |    "name": "python3"
142 |   },
143 |   "language_info": {
144 |    "codemirror_mode": {
145 |     "name": "ipython",
146 |     "version": 3
147 |    },
148 |    "file_extension": ".py",
149 |    "mimetype": "text/x-python",
150 |    "name": "python",
151 |    "nbconvert_exporter": "python",
152 |    "pygments_lexer": "ipython3",
153 |    "version": "3.8.5"
154 |   }
155 |  },
156 |  "nbformat": 4,
157 |  "nbformat_minor": 1
158 | }
159 | 


--------------------------------------------------------------------------------
/Mission216Solutions.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Introduction to the Data"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [
 15 |     {
 16 |      "name": "stdout",
 17 |      "output_type": "stream",
 18 |      "text": [
 19 |       "(0, 'Year', 'INTEGER', 0, None, 0)\n",
 20 |       "(1, 'Category', 'TEXT', 0, None, 0)\n",
 21 |       "(2, 'Nominee', 'TEXT', 0, None, 0)\n",
 22 |       "(3, 'Won', 'INTEGER', 0, None, 0)\n",
 23 |       "(4, 'Movie', 'TEXT', 0, None, 0)\n",
 24 |       "(5, 'Character', 'TEXT', 0, None, 0)\n",
 25 |       "(2010, 'Actor -- Leading Role', 'Javier Bardem', 0, 'Biutiful', 'Uxbal')\n",
 26 |       "(2010, 'Actor -- Leading Role', 'Jeff Bridges', 0, 'True Grit', 'Rooster Cogburn')\n",
 27 |       "(2010, 'Actor -- Leading Role', 'Jesse Eisenberg', 0, 'The Social Network', 'Mark Zuckerberg')\n",
 28 |       "(2010, 'Actor -- Leading Role', 'Colin Firth', 1, \"The King's Speech\", 'King George VI')\n",
 29 |       "(2010, 'Actor -- Leading Role', 'James Franco', 0, '127 Hours', 'Aron Ralston')\n",
 30 |       "(2010, 'Actor -- Supporting Role', 'Christian Bale', 1, 'The Fighter', 'Dicky Eklund')\n",
 31 |       "(2010, 'Actor -- Supporting Role', 'John Hawkes', 0, \"Winter's Bone\", 'Teardrop')\n",
 32 |       "(2010, 'Actor -- Supporting Role', 'Jeremy Renner', 0, 'The Town', 'James Coughlin')\n",
 33 |       "(2010, 'Actor -- Supporting Role', 'Mark Ruffalo', 0, 'The Kids Are All Right', 'Paul')\n",
 34 |       "(2010, 'Actor -- Supporting Role', 'Geoffrey Rush', 0, \"The King's Speech\", 'Lionel Logue')\n"
 35 |      ]
 36 |     }
 37 |    ],
 38 |    "source": [
 39 |     "import sqlite3\n",
 40 |     "conn = sqlite3.connect(\"nominations.db\")\n",
 41 |     "schema = conn.execute(\"pragma table_info(nominations);\").fetchall()\n",
 42 |     "first_ten = conn.execute(\"select * from nominations limit 10;\").fetchall()\n",
 43 |     "\n",
 44 |     "for r in schema:\n",
 45 |     "    print(r)\n",
 46 |     "    \n",
 47 |     "for r in first_ten:\n",
 48 |     "    print(r)"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "markdown",
 53 |    "metadata": {},
 54 |    "source": [
 55 |     "# Creating the Ceremonies Table"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 2,
 61 |    "metadata": {},
 62 |    "outputs": [
 63 |     {
 64 |      "name": "stdout",
 65 |      "output_type": "stream",
 66 |      "text": [
 67 |       "[(1, 2010, 'Steve Martin'), (2, 2009, 'Hugh Jackman'), (3, 2008, 'Jon Stewart'), (4, 2007, 'Ellen DeGeneres'), (5, 2006, 'Jon Stewart'), (6, 2005, 'Chris Rock'), (7, 2004, 'Billy Crystal'), (8, 2003, 'Steve Martin'), (9, 2002, 'Whoopi Goldberg'), (10, 2001, 'Steve Martin')]\n",
 68 |       "[(0, 'id', 'integer', 0, None, 1), (1, 'year', 'integer', 0, None, 0), (2, 'host', 'text', 0, None, 0)]\n"
 69 |      ]
 70 |     }
 71 |    ],
 72 |    "source": [
 73 |     "years_hosts = [(2010, \"Steve Martin\"),\n",
 74 |     "               (2009, \"Hugh Jackman\"),\n",
 75 |     "               (2008, \"Jon Stewart\"),\n",
 76 |     "               (2007, \"Ellen DeGeneres\"),\n",
 77 |     "               (2006, \"Jon Stewart\"),\n",
 78 |     "               (2005, \"Chris Rock\"),\n",
 79 |     "               (2004, \"Billy Crystal\"),\n",
 80 |     "               (2003, \"Steve Martin\"),\n",
 81 |     "               (2002, \"Whoopi Goldberg\"),\n",
 82 |     "               (2001, \"Steve Martin\"),\n",
 83 |     "               (2000, \"Billy Crystal\"),\n",
 84 |     "            ]\n",
 85 |     "create_ceremonies = \"create table ceremonies (id integer primary key, year integer, host text);\"\n",
 86 |     "conn.execute(create_ceremonies)\n",
 87 |     "insert_query = \"insert into ceremonies (Year, Host) values (?,?);\"\n",
 88 |     "conn.executemany(insert_query, years_hosts)\n",
 89 |     "\n",
 90 |     "print(conn.execute(\"select * from ceremonies limit 10;\").fetchall())\n",
 91 |     "print(conn.execute(\"pragma table_info(ceremonies);\").fetchall())"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "metadata": {},
 97 |    "source": [
 98 |     "# Foreign Key Constraints"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 3,
104 |    "metadata": {},
105 |    "outputs": [
106 |     {
107 |      "data": {
108 |       "text/plain": [
109 |        "<sqlite3.Cursor at 0x10675e3b0>"
110 |       ]
111 |      },
112 |      "execution_count": 3,
113 |      "metadata": {},
114 |      "output_type": "execute_result"
115 |     }
116 |    ],
117 |    "source": [
118 |     "conn.execute(\"PRAGMA foreign_keys = ON;\")"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "markdown",
123 |    "metadata": {},
124 |    "source": [
125 |     "# Setting up One-to-Many"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 4,
131 |    "metadata": {},
132 |    "outputs": [
133 |     {
134 |      "name": "stdout",
135 |      "output_type": "stream",
136 |      "text": [
137 |       "[(1, 'Actor -- Leading Role', 'Javier Bardem', 'Biutiful', 'Uxbal', '0', 1), (2, 'Actor -- Leading Role', 'Jeff Bridges', 'True Grit', 'Rooster Cogburn', '0', 1), (3, 'Actor -- Leading Role', 'Jesse Eisenberg', 'The Social Network', 'Mark Zuckerberg', '0', 1), (4, 'Actor -- Leading Role', 'Colin Firth', \"The King's Speech\", 'King George VI', '1', 1), (5, 'Actor -- Leading Role', 'James Franco', '127 Hours', 'Aron Ralston', '0', 1)]\n"
138 |      ]
139 |     }
140 |    ],
141 |    "source": [
142 |     "create_nominations_two = '''create table nominations_two \n",
143 |     "(id integer primary key, \n",
144 |     "category text, \n",
145 |     "nominee text, \n",
146 |     "movie text, \n",
147 |     "character text, \n",
148 |     "won integer,\n",
149 |     "ceremony_id integer,\n",
150 |     "foreign key(ceremony_id) references ceremonies(id));\n",
151 |     "'''\n",
152 |     "\n",
153 |     "nom_query = '''\n",
154 |     "select ceremonies.id as ceremony_id, nominations.category as category, \n",
155 |     "nominations.nominee as nominee, nominations.movie as movie, \n",
156 |     "nominations.character as character, nominations.won as won\n",
157 |     "from nominations\n",
158 |     "inner join ceremonies \n",
159 |     "on nominations.year == ceremonies.year\n",
160 |     ";\n",
161 |     "'''\n",
162 |     "joined_nominations = conn.execute(nom_query).fetchall()\n",
163 |     "\n",
164 |     "conn.execute(create_nominations_two)\n",
165 |     "\n",
166 |     "insert_nominations_two = '''insert into nominations_two (ceremony_id, category, nominee, movie, character, won) \n",
167 |     "values (?,?,?,?,?,?);\n",
168 |     "'''\n",
169 |     "\n",
170 |     "conn.executemany(insert_nominations_two, joined_nominations)\n",
171 |     "print(conn.execute(\"select * from nominations_two limit 5;\").fetchall())"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "markdown",
176 |    "metadata": {},
177 |    "source": [
178 |     "# Deleting and Renaming Tables"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "code",
183 |    "execution_count": 5,
184 |    "metadata": {},
185 |    "outputs": [
186 |     {
187 |      "data": {
188 |       "text/plain": [
189 |        "<sqlite3.Cursor at 0x10675e6c0>"
190 |       ]
191 |      },
192 |      "execution_count": 5,
193 |      "metadata": {},
194 |      "output_type": "execute_result"
195 |     }
196 |    ],
197 |    "source": [
198 |     "drop_nominations = \"drop table nominations;\"\n",
199 |     "conn.execute(drop_nominations)\n",
200 |     "\n",
201 |     "rename_nominations_two = \"alter table nominations_two rename to nominations;\"\n",
202 |     "conn.execute(rename_nominations_two)"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "markdown",
207 |    "metadata": {},
208 |    "source": [
209 |     "# Creating a Join Table"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": 6,
215 |    "metadata": {},
216 |    "outputs": [
217 |     {
218 |      "data": {
219 |       "text/plain": [
220 |        "<sqlite3.Cursor at 0x10675e960>"
221 |       ]
222 |      },
223 |      "execution_count": 6,
224 |      "metadata": {},
225 |      "output_type": "execute_result"
226 |     }
227 |    ],
228 |    "source": [
229 |     "create_movies = \"create table movies (id integer primary key,movie text);\"\n",
230 |     "create_actors = \"create table actors (id integer primary key,actor text);\"\n",
231 |     "create_movies_actors = '''create table movies_actors (id INTEGER PRIMARY KEY,\n",
232 |     "movie_id INTEGER references movies(id), actor_id INTEGER references actors(id));\n",
233 |     "'''\n",
234 |     "conn.execute(create_movies)\n",
235 |     "conn.execute(create_actors)\n",
236 |     "conn.execute(create_movies_actors)"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "markdown",
241 |    "metadata": {},
242 |    "source": [
243 |     "# Populating the Movies and Actors Tables"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": 7,
249 |    "metadata": {},
250 |    "outputs": [
251 |     {
252 |      "name": "stdout",
253 |      "output_type": "stream",
254 |      "text": [
255 |       "[(1, 'Biutiful'), (2, 'True Grit'), (3, 'The Social Network'), (4, \"The King's Speech\"), (5, '127 Hours')]\n",
256 |       "[(1, 'Javier Bardem'), (2, 'Jeff Bridges'), (3, 'Jesse Eisenberg'), (4, 'Colin Firth'), (5, 'James Franco')]\n"
257 |      ]
258 |     }
259 |    ],
260 |    "source": [
261 |     "insert_movies = \"insert into movies (movie) select distinct movie from nominations;\"\n",
262 |     "insert_actors = \"insert into actors (actor) select distinct nominee from nominations;\"\n",
263 |     "conn.execute(insert_movies)\n",
264 |     "conn.execute(insert_actors)\n",
265 |     "\n",
266 |     "print(conn.execute(\"select * from movies limit 5;\").fetchall())\n",
267 |     "print(conn.execute(\"select * from actors limit 5;\").fetchall())"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "markdown",
272 |    "metadata": {},
273 |    "source": [
274 |     "# Populating a Join Table"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": 8,
280 |    "metadata": {},
281 |    "outputs": [
282 |     {
283 |      "name": "stdout",
284 |      "output_type": "stream",
285 |      "text": [
286 |       "[(1, 1, 1), (2, 2, 2), (3, 3, 3), (4, 4, 4), (5, 5, 5)]\n"
287 |      ]
288 |     }
289 |    ],
290 |    "source": [
291 |     "pairs_query = \"select movie,nominee from nominations;\"\n",
292 |     "movie_actor_pairs = conn.execute(pairs_query).fetchall()\n",
293 |     "\n",
294 |     "join_table_insert = \"insert into movies_actors (movie_id, actor_id) values ((select id from movies where movie == ?),(select id from actors where actor == ?));\"\n",
295 |     "conn.executemany(join_table_insert,movie_actor_pairs)\n",
296 |     "\n",
297 |     "print(conn.execute(\"select * from movies_actors limit 5;\").fetchall())"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "code",
302 |    "execution_count": null,
303 |    "metadata": {
304 |     "collapsed": true
305 |    },
306 |    "outputs": [],
307 |    "source": []
308 |   }
309 |  ],
310 |  "metadata": {
311 |   "kernelspec": {
312 |    "display_name": "Python 3",
313 |    "language": "python",
314 |    "name": "python3"
315 |   },
316 |   "language_info": {
317 |    "codemirror_mode": {
318 |     "name": "ipython",
319 |     "version": 3
320 |    },
321 |    "file_extension": ".py",
322 |    "mimetype": "text/x-python",
323 |    "name": "python",
324 |    "nbconvert_exporter": "python",
325 |    "pygments_lexer": "ipython3",
326 |    "version": "3.8.5"
327 |   }
328 |  },
329 |  "nbformat": 4,
330 |  "nbformat_minor": 1
331 | }
332 | 


--------------------------------------------------------------------------------
/Mission218Solution.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# U.S. Gun Deaths Guided Project Solutions"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "# Introducing U.S. Gun Deaths Data"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 30,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "import csv\n",
 24 |     "\n",
 25 |     "with open(\"guns.csv\", \"r\") as f:\n",
 26 |     "    reader = csv.reader(f)\n",
 27 |     "    data = list(reader)"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 31,
 33 |    "metadata": {},
 34 |    "outputs": [
 35 |     {
 36 |      "name": "stdout",
 37 |      "output_type": "stream",
 38 |      "text": [
 39 |       "[['', 'year', 'month', 'intent', 'police', 'sex', 'age', 'race', 'hispanic', 'place', 'education'], ['1', '2012', '01', 'Suicide', '0', 'M', '34', 'Asian/Pacific Islander', '100', 'Home', '4'], ['2', '2012', '01', 'Suicide', '0', 'F', '21', 'White', '100', 'Street', '3'], ['3', '2012', '01', 'Suicide', '0', 'M', '60', 'White', '100', 'Other specified', '4'], ['4', '2012', '02', 'Suicide', '0', 'M', '64', 'White', '100', 'Home', '4']]\n"
 40 |      ]
 41 |     }
 42 |    ],
 43 |    "source": [
 44 |     "print(data[:5])"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "markdown",
 49 |    "metadata": {},
 50 |    "source": [
 51 |     "# Removing Headers from a List of Lists"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 32,
 57 |    "metadata": {},
 58 |    "outputs": [
 59 |     {
 60 |      "name": "stdout",
 61 |      "output_type": "stream",
 62 |      "text": [
 63 |       "[['', 'year', 'month', 'intent', 'police', 'sex', 'age', 'race', 'hispanic', 'place', 'education']]\n",
 64 |       "[['1', '2012', '01', 'Suicide', '0', 'M', '34', 'Asian/Pacific Islander', '100', 'Home', '4'], ['2', '2012', '01', 'Suicide', '0', 'F', '21', 'White', '100', 'Street', '3'], ['3', '2012', '01', 'Suicide', '0', 'M', '60', 'White', '100', 'Other specified', '4'], ['4', '2012', '02', 'Suicide', '0', 'M', '64', 'White', '100', 'Home', '4'], ['5', '2012', '02', 'Suicide', '0', 'M', '31', 'White', '100', 'Other specified', '2']]\n"
 65 |      ]
 66 |     }
 67 |    ],
 68 |    "source": [
 69 |     "headers = data[:1]\n",
 70 |     "data = data[1:]\n",
 71 |     "print(headers)\n",
 72 |     "print(data[:5])"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "# Counting Gun Deaths by Year"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 33,
 85 |    "metadata": {},
 86 |    "outputs": [
 87 |     {
 88 |      "data": {
 89 |       "text/plain": [
 90 |        "{'2012': 33563, '2013': 33636, '2014': 33599}"
 91 |       ]
 92 |      },
 93 |      "execution_count": 33,
 94 |      "metadata": {},
 95 |      "output_type": "execute_result"
 96 |     }
 97 |    ],
 98 |    "source": [
 99 |     "years = [row[1] for row in data]\n",
100 |     "\n",
101 |     "year_counts = {}\n",
102 |     "for year in years:\n",
103 |     "    if year not in year_counts:\n",
104 |     "        year_counts[year] = 1\n",
105 |     "    else:  \n",
106 |     "        year_counts[year] += 1\n",
107 |     "\n",
108 |     "year_counts"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "markdown",
113 |    "metadata": {},
114 |    "source": [
115 |     "# Exploring Gun Deaths by Month and Year"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 34,
121 |    "metadata": {},
122 |    "outputs": [
123 |     {
124 |      "data": {
125 |       "text/plain": [
126 |        "[datetime.datetime(2012, 1, 1, 0, 0),\n",
127 |        " datetime.datetime(2012, 1, 1, 0, 0),\n",
128 |        " datetime.datetime(2012, 1, 1, 0, 0),\n",
129 |        " datetime.datetime(2012, 2, 1, 0, 0),\n",
130 |        " datetime.datetime(2012, 2, 1, 0, 0)]"
131 |       ]
132 |      },
133 |      "execution_count": 34,
134 |      "metadata": {},
135 |      "output_type": "execute_result"
136 |     }
137 |    ],
138 |    "source": [
139 |     "import datetime\n",
140 |     "\n",
141 |     "dates = [datetime.datetime(year=int(row[1]), month=int(row[2]), day=1) for row in data]\n",
142 |     "dates[:5]"
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "code",
147 |    "execution_count": 35,
148 |    "metadata": {},
149 |    "outputs": [
150 |     {
151 |      "data": {
152 |       "text/plain": [
153 |        "{datetime.datetime(2012, 1, 1, 0, 0): 2758,\n",
154 |        " datetime.datetime(2012, 2, 1, 0, 0): 2357,\n",
155 |        " datetime.datetime(2012, 3, 1, 0, 0): 2743,\n",
156 |        " datetime.datetime(2012, 4, 1, 0, 0): 2795,\n",
157 |        " datetime.datetime(2012, 5, 1, 0, 0): 2999,\n",
158 |        " datetime.datetime(2012, 6, 1, 0, 0): 2826,\n",
159 |        " datetime.datetime(2012, 7, 1, 0, 0): 3026,\n",
160 |        " datetime.datetime(2012, 8, 1, 0, 0): 2954,\n",
161 |        " datetime.datetime(2012, 9, 1, 0, 0): 2852,\n",
162 |        " datetime.datetime(2012, 10, 1, 0, 0): 2733,\n",
163 |        " datetime.datetime(2012, 11, 1, 0, 0): 2729,\n",
164 |        " datetime.datetime(2012, 12, 1, 0, 0): 2791,\n",
165 |        " datetime.datetime(2013, 1, 1, 0, 0): 2864,\n",
166 |        " datetime.datetime(2013, 2, 1, 0, 0): 2375,\n",
167 |        " datetime.datetime(2013, 3, 1, 0, 0): 2862,\n",
168 |        " datetime.datetime(2013, 4, 1, 0, 0): 2798,\n",
169 |        " datetime.datetime(2013, 5, 1, 0, 0): 2806,\n",
170 |        " datetime.datetime(2013, 6, 1, 0, 0): 2920,\n",
171 |        " datetime.datetime(2013, 7, 1, 0, 0): 3079,\n",
172 |        " datetime.datetime(2013, 8, 1, 0, 0): 2859,\n",
173 |        " datetime.datetime(2013, 9, 1, 0, 0): 2742,\n",
174 |        " datetime.datetime(2013, 10, 1, 0, 0): 2808,\n",
175 |        " datetime.datetime(2013, 11, 1, 0, 0): 2758,\n",
176 |        " datetime.datetime(2013, 12, 1, 0, 0): 2765,\n",
177 |        " datetime.datetime(2014, 1, 1, 0, 0): 2651,\n",
178 |        " datetime.datetime(2014, 2, 1, 0, 0): 2361,\n",
179 |        " datetime.datetime(2014, 3, 1, 0, 0): 2684,\n",
180 |        " datetime.datetime(2014, 4, 1, 0, 0): 2862,\n",
181 |        " datetime.datetime(2014, 5, 1, 0, 0): 2864,\n",
182 |        " datetime.datetime(2014, 6, 1, 0, 0): 2931,\n",
183 |        " datetime.datetime(2014, 7, 1, 0, 0): 2884,\n",
184 |        " datetime.datetime(2014, 8, 1, 0, 0): 2970,\n",
185 |        " datetime.datetime(2014, 9, 1, 0, 0): 2914,\n",
186 |        " datetime.datetime(2014, 10, 1, 0, 0): 2865,\n",
187 |        " datetime.datetime(2014, 11, 1, 0, 0): 2756,\n",
188 |        " datetime.datetime(2014, 12, 1, 0, 0): 2857}"
189 |       ]
190 |      },
191 |      "execution_count": 35,
192 |      "metadata": {},
193 |      "output_type": "execute_result"
194 |     }
195 |    ],
196 |    "source": [
197 |     "date_counts = {}\n",
198 |     "\n",
199 |     "for date in dates:\n",
200 |     "    if date not in date_counts:\n",
201 |     "        date_counts[date] = 0\n",
202 |     "    date_counts[date] += 1\n",
203 |     "\n",
204 |     "date_counts"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "markdown",
209 |    "metadata": {},
210 |    "source": [
211 |     "# Exploring Gun Deaths by Race and Sex"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": 54,
217 |    "metadata": {},
218 |    "outputs": [
219 |     {
220 |      "data": {
221 |       "text/plain": [
222 |        "{'F': 14449, 'M': 86349}"
223 |       ]
224 |      },
225 |      "execution_count": 54,
226 |      "metadata": {},
227 |      "output_type": "execute_result"
228 |     }
229 |    ],
230 |    "source": [
231 |     "sexes = [row[5] for row in data]\n",
232 |     "sex_counts = {}\n",
233 |     "for sex in sexes:\n",
234 |     "    if sex not in sex_counts:\n",
235 |     "        sex_counts[sex] = 0\n",
236 |     "    sex_counts[sex] += 1\n",
237 |     "sex_counts"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": 36,
243 |    "metadata": {},
244 |    "outputs": [
245 |     {
246 |      "data": {
247 |       "text/plain": [
248 |        "{'Asian/Pacific Islander': 1326,\n",
249 |        " 'Black': 23296,\n",
250 |        " 'Hispanic': 9022,\n",
251 |        " 'Native American/Native Alaskan': 917,\n",
252 |        " 'White': 66237}"
253 |       ]
254 |      },
255 |      "execution_count": 36,
256 |      "metadata": {},
257 |      "output_type": "execute_result"
258 |     }
259 |    ],
260 |    "source": [
261 |     "races = [row[7] for row in data]\n",
262 |     "race_counts = {}\n",
263 |     "for race in races:\n",
264 |     "    if race not in race_counts:\n",
265 |     "        race_counts[race] = 0\n",
266 |     "    race_counts[race] += 1\n",
267 |     "race_counts"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "markdown",
272 |    "metadata": {},
273 |    "source": [
274 |     "## Findings So Far\n",
275 |     "\n",
276 |     "Gun deaths in the U.S. seem to disproportionately affect men. They also seem to disproportionately affect minorities, although having some data on the percentage of each race in the overall U.S. population would help.\n",
277 |     "\n",
278 |     "There appears to be a minor seasonal correlation, with gun deaths peaking in the summer and declining in the winter.  It might be useful to filter by intent, to see if different categories of intent have different correlations with season, race, or gender."
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "markdown",
283 |    "metadata": {},
284 |    "source": [
285 |     "# Reading in a Second Dataset"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": 57,
291 |    "metadata": {},
292 |    "outputs": [
293 |     {
294 |      "data": {
295 |       "text/plain": [
296 |        "[['Id',\n",
297 |        "  'Year',\n",
298 |        "  'Id',\n",
299 |        "  'Sex',\n",
300 |        "  'Id',\n",
301 |        "  'Hispanic Origin',\n",
302 |        "  'Id',\n",
303 |        "  'Id2',\n",
304 |        "  'Geography',\n",
305 |        "  'Total',\n",
306 |        "  'Race Alone - White',\n",
307 |        "  'Race Alone - Hispanic',\n",
308 |        "  'Race Alone - Black or African American',\n",
309 |        "  'Race Alone - American Indian and Alaska Native',\n",
310 |        "  'Race Alone - Asian',\n",
311 |        "  'Race Alone - Native Hawaiian and Other Pacific Islander',\n",
312 |        "  'Two or More Races'],\n",
313 |        " ['cen42010',\n",
314 |        "  'April 1, 2010 Census',\n",
315 |        "  'totsex',\n",
316 |        "  'Both Sexes',\n",
317 |        "  'tothisp',\n",
318 |        "  'Total',\n",
319 |        "  '0100000US',\n",
320 |        "  '',\n",
321 |        "  'United States',\n",
322 |        "  '308745538',\n",
323 |        "  '197318956',\n",
324 |        "  '44618105',\n",
325 |        "  '40250635',\n",
326 |        "  '3739506',\n",
327 |        "  '15159516',\n",
328 |        "  '674625',\n",
329 |        "  '6984195']]"
330 |       ]
331 |      },
332 |      "execution_count": 57,
333 |      "metadata": {},
334 |      "output_type": "execute_result"
335 |     }
336 |    ],
337 |    "source": [
338 |     "import csv\n",
339 |     "\n",
340 |     "with open(\"census.csv\", \"r\") as f:\n",
341 |     "    reader = csv.reader(f)\n",
342 |     "    census = list(reader)\n",
343 |     "    \n",
344 |     "census"
345 |    ]
346 |   },
347 |   {
348 |    "cell_type": "markdown",
349 |    "metadata": {},
350 |    "source": [
351 |     "# Computing Rates of Gun Deaths Per Race"
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "code",
356 |    "execution_count": 40,
357 |    "metadata": {},
358 |    "outputs": [
359 |     {
360 |      "data": {
361 |       "text/plain": [
362 |        "{'Asian/Pacific Islander': 8.374309664161762,\n",
363 |        " 'Black': 57.8773477735196,\n",
364 |        " 'Hispanic': 20.220491210910907,\n",
365 |        " 'Native American/Native Alaskan': 24.521955573811088,\n",
366 |        " 'White': 33.56849303419181}"
367 |       ]
368 |      },
369 |      "execution_count": 40,
370 |      "metadata": {},
371 |      "output_type": "execute_result"
372 |     }
373 |    ],
374 |    "source": [
375 |     "mapping = {\n",
376 |     "    \"Asian/Pacific Islander\": 15159516 + 674625,\n",
377 |     "    \"Native American/Native Alaskan\": 3739506,\n",
378 |     "    \"Black\": 40250635,\n",
379 |     "    \"Hispanic\": 44618105,\n",
380 |     "    \"White\": 197318956\n",
381 |     "}\n",
382 |     "\n",
383 |     "race_per_hundredk = {}\n",
384 |     "for k,v in race_counts.items():\n",
385 |     "    race_per_hundredk[k] = (v / mapping[k]) * 100000\n",
386 |     "\n",
387 |     "race_per_hundredk"
388 |    ]
389 |   },
390 |   {
391 |    "cell_type": "markdown",
392 |    "metadata": {},
393 |    "source": [
394 |     "# Filtering By Intent"
395 |    ]
396 |   },
397 |   {
398 |    "cell_type": "code",
399 |    "execution_count": 41,
400 |    "metadata": {},
401 |    "outputs": [
402 |     {
403 |      "data": {
404 |       "text/plain": [
405 |        "{'Asian/Pacific Islander': 3.530346230970155,\n",
406 |        " 'Black': 48.471284987180944,\n",
407 |        " 'Hispanic': 12.627161104219914,\n",
408 |        " 'Native American/Native Alaskan': 8.717729026240365,\n",
409 |        " 'White': 4.6356417981453335}"
410 |       ]
411 |      },
412 |      "execution_count": 41,
413 |      "metadata": {},
414 |      "output_type": "execute_result"
415 |     }
416 |    ],
417 |    "source": [
418 |     "intents = [row[3] for row in data]\n",
419 |     "homicide_race_counts = {}\n",
420 |     "for i,race in enumerate(races):\n",
421 |     "    if race not in homicide_race_counts:\n",
422 |     "        homicide_race_counts[race] = 0\n",
423 |     "    if intents[i] == \"Homicide\":\n",
424 |     "        homicide_race_counts[race] += 1\n",
425 |     "\n",
426 |     "race_per_hundredk = {}\n",
427 |     "for k,v in homicide_race_counts.items():\n",
428 |     "    race_per_hundredk[k] = (v / mapping[k]) * 100000\n",
429 |     "\n",
430 |     "race_per_hundredk     "
431 |    ]
432 |   },
433 |   {
434 |    "cell_type": "markdown",
435 |    "metadata": {},
436 |    "source": [
437 |     "## Findings\n",
438 |     "\n",
439 |     "It appears that gun-related homicides in the U.S. disproportionately affect people in the `Black` and `Hispanic` racial categories.\n",
440 |     "\n",
441 |     "Some areas to investigate further:\n",
442 |     "\n",
443 |     "* The link between month and homicide rate\n",
444 |     "* Homicide rate by gender\n",
445 |     "* The rates of other intents by gender and race\n",
446 |     "* Gun death rates by location and education"
447 |    ]
448 |   }
449 |  ],
450 |  "metadata": {
451 |   "kernelspec": {
452 |    "display_name": "Python 3",
453 |    "language": "python",
454 |    "name": "python3"
455 |   },
456 |   "language_info": {
457 |    "codemirror_mode": {
458 |     "name": "ipython",
459 |     "version": 3
460 |    },
461 |    "file_extension": ".py",
462 |    "mimetype": "text/x-python",
463 |    "name": "python",
464 |    "nbconvert_exporter": "python",
465 |    "pygments_lexer": "ipython3",
466 |    "version": "3.8.5"
467 |   },
468 |   "widgets": {
469 |    "state": {},
470 |    "version": "1.1.1"
471 |   }
472 |  },
473 |  "nbformat": 4,
474 |  "nbformat_minor": 1
475 | }
476 | 


--------------------------------------------------------------------------------
/Mission234Solutions.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import pickle\n",
 12 |     "from btree import Node, BTree, NodeKey\n",
 13 |     "\n",
 14 |     "class DQKV(BTree):\n",
 15 |     "    def __init__(self, type_, values=None):\n",
 16 |     "        self.type = type_\n",
 17 |     "        super().__init__(10)\n",
 18 |     "\n",
 19 |     "    def get(self, key):\n",
 20 |     "        value = self.search(self.root, key)\n",
 21 |     "        if value is None:\n",
 22 |     "            raise KeyError('There is no value for key \"{}\"'.format(key))\n",
 23 |     "        return value\n",
 24 |     "    \n",
 25 |     "    def set(self, key, value):\n",
 26 |     "        if value is None:\n",
 27 |     "            raise ValueError('Cannot store None values')\n",
 28 |     "        if not isinstance(key, self.type):\n",
 29 |     "            raise KeyError('Key must be of type {}'.format(self.type))\n",
 30 |     "        exists = self.search(self.root, key)\n",
 31 |     "        if exists is not None:\n",
 32 |     "            raise ValueError('Cannot store duplicate key values')\n",
 33 |     "            \n",
 34 |     "        node = NodeKey(key, value)\n",
 35 |     "        self.insert(node)\n",
 36 |     "    \n",
 37 |     "    def range_query(self, interval, inclusive=False):\n",
 38 |     "        if not isinstance(interval, (list, tuple)) and len(interval) != 2:\n",
 39 |     "            raise ValueError('The first argument must be a list or tuple of length 2')\n",
 40 |     "        \n",
 41 |     "        lower, upper = interval\n",
 42 |     "        if lower is None:\n",
 43 |     "            return self.less_than(self.root, upper, inclusive=inclusive)\n",
 44 |     "        return self.greater_than(self.root, lower, upper_bound=upper, inclusive=inclusive)\n",
 45 |     "    \n",
 46 |     "    def save(self, filename):\n",
 47 |     "        filename = filename + '.dqdb'\n",
 48 |     "        with open(filename, 'wb') as f:\n",
 49 |     "            pickle.dump(self, f)\n",
 50 |     "            return True\n",
 51 |     "        return False\n",
 52 |     "    \n",
 53 |     "    def load_from_dict(self, dictionary):\n",
 54 |     "        for key, value in dictionary.items():\n",
 55 |     "            self.set(key, value)\n",
 56 |     "    \n",
 57 |     "    @staticmethod\n",
 58 |     "    def load(filename):\n",
 59 |     "        filename = filename + '.dqdb'\n",
 60 |     "        with open(filename, 'rb') as f:\n",
 61 |     "            return pickle.load(f)"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "metadata": {},
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "dq = DQKV(int)\n",
 71 |     "dq.set(1, 'hello')\n",
 72 |     "dq.set(2, 'world')\n",
 73 |     "dq.set(3, 'this')\n",
 74 |     "dq.set(4, 'is')\n",
 75 |     "print(dq.range_query([1,3]))\n",
 76 |     "\n",
 77 |     "dq.save('sample_store')\n",
 78 |     "dqkv = DQKV.load('sample_store')\n",
 79 |     "\n",
 80 |     "print(dqkv.range_query([1,3]))\n",
 81 |     "additional_keys = {\n",
 82 |     "    5: 'a',\n",
 83 |     "    6: 'simple',\n",
 84 |     "    7: 'kv store'\n",
 85 |     "}\n",
 86 |     "dqkv.load_from_dict(additional_keys)\n",
 87 |     "print(dqkv.range_query([4,8]))\n",
 88 |     "print(dqkv.get(5))"
 89 |    ]
 90 |   }
 91 |  ],
 92 |  "metadata": {
 93 |   "kernelspec": {
 94 |    "display_name": "Python 3",
 95 |    "language": "python",
 96 |    "name": "python3"
 97 |   },
 98 |   "language_info": {
 99 |    "codemirror_mode": {
100 |     "name": "ipython",
101 |     "version": 3
102 |    },
103 |    "file_extension": ".py",
104 |    "mimetype": "text/x-python",
105 |    "name": "python",
106 |    "nbconvert_exporter": "python",
107 |    "pygments_lexer": "ipython3",
108 |    "version": "3.5.2"
109 |   }
110 |  },
111 |  "nbformat": 4,
112 |  "nbformat_minor": 2
113 | }
114 | 


--------------------------------------------------------------------------------
/Mission251Solutions.py:
--------------------------------------------------------------------------------
 1 | import csv
 2 | from datetime import datetime
 3 | import io
 4 | import psycopg2
 5 | from urllib import request
 6 | 
 7 | 
 8 | conn = psycopg2.connect(dbname='postgres', user='postgres')
 9 | cur = conn.cursor()
10 | # Autocommit instead of commiting every transaction.
11 | conn.autocommit = True
12 | 
13 | # Create database and users.
14 | cur.execute('CREATE DATABASE ihw')
15 | cur.execute("CREATE USER production WITH PASSWORD 'abc123'")
16 | cur.execute("CREATE USER analyst WITH PASSWORD 'def456'")
17 | 
18 | # Reconnect to ihw database.
19 | conn = psycopg2.connect(dbname='ihw', user='postgres')
20 | conn.autocommit = True
21 | cur = conn.cursor()
22 | 
23 | # Create the table.
24 | cur.execute(
25 |     """
26 |     CREATE TABLE hurricanes (
27 |         fid INTEGER PRIMARY KEY,
28 |         recorded_at TIMESTAMP,
29 |         btid INTEGER,
30 |         name VARCHAR(10),
31 |         lat DECIMAL(4, 1),
32 |         long DECIMAL(4, 1),
33 |         wind_kts SMALLINT,
34 |         pressure INTEGER,
35 |         category VARCHAR(2),
36 |         basin VARCHAR(16),
37 |         shape_length DECIMAL(8, 6)
38 |     )
39 |     """
40 | )
41 | 
42 | # Manage privileges.
43 | cur.execute("REVOKE ALL ON hurricanes FROM production")
44 | cur.execute("REVOKE ALL ON hurricanes FROM analyst")
45 | cur.execute("GRANT SELECT, INSERT, UPDATE ON hurricanes TO production")
46 | cur.execute("GRANT SELECT ON hurricanes TO analyst")
47 | conn.close()
48 | 
49 | # Reconnect with production user.
50 | conn = psycopg2.connect(dbname='ihw', user='production', password='abc123')
51 | cur = conn.cursor()
52 | conn.autocommit = True
53 | 
54 | # Insert the data.
55 | response = request.urlopen('https://dq-content.s3.amazonaws.com/251/storm_data.csv')
56 | reader = csv.reader(io.TextIOWrapper(response))
57 | # Skip the header.
58 | _ = next(reader)
59 | rows = []
60 | for line in reader:
61 |     recorded_at = datetime(int(line[1]), int(line[2]), int(line[3]), hour=int(line[4][:2]), minute=int(line[4][2:-1]))
62 | 
63 |     new_line = [line[0], recorded_at] + line[5:]
64 |     rows.append(
65 |         cur.mogrify(
66 |             "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
67 |             new_line
68 |         ).decode('utf-8')
69 |     )
70 | cur.execute('INSERT INTO hurricanes VALUES ' + ",".join(rows))
71 | 


--------------------------------------------------------------------------------
/Mission267Solutions.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "[('new', 186), ('google', 168), ('bitcoin', 102), ('open', 93), ('programming', 91), ('web', 89), ('data', 86), ('video', 80), ('python', 76), ('code', 73), ('facebook', 72), ('released', 72), ('using', 71), ('2013', 66), ('javascript', 66), ('free', 65), ('source', 65), ('game', 64), ('internet', 63), ('microsoft', 60), ('c', 60), ('linux', 59), ('app', 58), ('pdf', 56), ('work', 55), ('language', 55), ('software', 53), ('2014', 53), ('startup', 52), ('apple', 51), ('use', 51), ('make', 51), ('time', 49), ('yc', 49), ('security', 49), ('nsa', 46), ('github', 46), ('windows', 45), ('world', 42), ('way', 42), ('like', 42), ('1', 41), ('project', 41), ('computer', 41), ('heartbleed', 41), ('git', 38), ('users', 38), ('dont', 38), ('design', 38), ('ios', 38), ('developer', 37), ('os', 37), ('twitter', 37), ('ceo', 37), ('vs', 37), ('life', 37), ('big', 36), ('day', 36), ('android', 35), ('online', 35), ('years', 34), ('simple', 34), ('court', 34), ('guide', 33), ('learning', 33), ('mt', 33), ('api', 33), ('says', 33), ('apps', 33), ('browser', 33), ('server', 32), ('firefox', 32), ('fast', 32), ('gox', 32), ('problem', 32), ('mozilla', 32), ('engine', 32), ('site', 32), ('introducing', 31), ('amazon', 31), ('year', 31), ('support', 30), ('stop', 30), ('built', 30), ('better', 30), ('million', 30), ('people', 30), ('text', 30), ('3', 29), ('does', 29), ('tech', 29), ('development', 29), ('billion', 28), ('developers', 28), ('just', 28), ('library', 28), ('did', 28), ('website', 28), ('money', 28), ('inside', 28)]\n"
 13 |      ]
 14 |     }
 15 |    ],
 16 |    "source": [
 17 |     "from datetime import datetime\n",
 18 |     "import json\n",
 19 |     "import io\n",
 20 |     "import csv\n",
 21 |     "import string\n",
 22 |     "\n",
 23 |     "from pipeline import build_csv, Pipeline\n",
 24 |     "from stop_words import stop_words\n",
 25 |     "\n",
 26 |     "pipeline = Pipeline()\n",
 27 |     "\n",
 28 |     "@pipeline.task()\n",
 29 |     "def file_to_json():\n",
 30 |     "    with open('hn_stories_2014.json', 'r') as f:\n",
 31 |     "        data = json.load(f)\n",
 32 |     "        stories = data['stories']\n",
 33 |     "    return stories\n",
 34 |     "\n",
 35 |     "@pipeline.task(depends_on=file_to_json)\n",
 36 |     "def filter_stories(stories):\n",
 37 |     "    def is_popular(story):\n",
 38 |     "        return story['points'] > 50 and story['num_comments'] > 1 and not story['title'].startswith('Ask HN')\n",
 39 |     "    \n",
 40 |     "    return (\n",
 41 |     "        story for story in stories\n",
 42 |     "        if is_popular(story)\n",
 43 |     "    )\n",
 44 |     "\n",
 45 |     "@pipeline.task(depends_on=filter_stories)\n",
 46 |     "def json_to_csv(stories):\n",
 47 |     "    lines = []\n",
 48 |     "    for story in stories:\n",
 49 |     "        lines.append(\n",
 50 |     "            (story['objectID'], datetime.strptime(story['created_at'], \"%Y-%m-%dT%H:%M:%SZ\"), story['url'], story['points'], story['title'])\n",
 51 |     "        )\n",
 52 |     "    return build_csv(lines, header=['objectID', 'created_at', 'url', 'points', 'title'], file=io.StringIO())\n",
 53 |     "\n",
 54 |     "@pipeline.task(depends_on=json_to_csv)\n",
 55 |     "def extract_titles(csv_file):\n",
 56 |     "    reader = csv.reader(csv_file)\n",
 57 |     "    header = next(reader)\n",
 58 |     "    idx = header.index('title')\n",
 59 |     "    \n",
 60 |     "    return (line[idx] for line in reader)\n",
 61 |     "\n",
 62 |     "@pipeline.task(depends_on=extract_titles)\n",
 63 |     "def clean_title(titles):\n",
 64 |     "    for title in titles:\n",
 65 |     "        title = title.lower()\n",
 66 |     "        title = ''.join(c for c in title if c not in string.punctuation)\n",
 67 |     "        yield title\n",
 68 |     "\n",
 69 |     "@pipeline.task(depends_on=clean_title)\n",
 70 |     "def build_keyword_dictionary(titles):\n",
 71 |     "    word_freq = {}\n",
 72 |     "    for title in titles:\n",
 73 |     "        for word in title.split(' '):\n",
 74 |     "            if word and word not in stop_words:\n",
 75 |     "                if word not in word_freq:\n",
 76 |     "                    word_freq[word] = 1\n",
 77 |     "                word_freq[word] += 1\n",
 78 |     "    return word_freq\n",
 79 |     "\n",
 80 |     "@pipeline.task(depends_on=build_keyword_dictionary)\n",
 81 |     "def top_keywords(word_freq):\n",
 82 |     "    freq_tuple = [\n",
 83 |     "        (word, word_freq[word])\n",
 84 |     "        for word in sorted(word_freq, key=word_freq.get, reverse=True)\n",
 85 |     "    ]\n",
 86 |     "    return freq_tuple[:100]\n",
 87 |     "\n",
 88 |     "ran = pipeline.run()\n",
 89 |     "print(ran[top_keywords])"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": null,
 95 |    "metadata": {
 96 |     "collapsed": true
 97 |    },
 98 |    "outputs": [],
 99 |    "source": []
100 |   }
101 |  ],
102 |  "metadata": {
103 |   "anaconda-cloud": {},
104 |   "kernelspec": {
105 |    "display_name": "Python 3",
106 |    "language": "python",
107 |    "name": "python3"
108 |   },
109 |   "language_info": {
110 |    "codemirror_mode": {
111 |     "name": "ipython",
112 |     "version": 3
113 |    },
114 |    "file_extension": ".py",
115 |    "mimetype": "text/x-python",
116 |    "name": "python",
117 |    "nbconvert_exporter": "python",
118 |    "pygments_lexer": "ipython3",
119 |    "version": "3.8.5"
120 |   }
121 |  },
122 |  "nbformat": 4,
123 |  "nbformat_minor": 1
124 | }
125 | 


--------------------------------------------------------------------------------
/Mission277Solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Solutions for Guided Project: Exploratory Visualization of Forest Fire Data"
  3 | author: "Rose Martin"
  4 | output: html_document
  5 | ---
  6 | 
  7 | # Exploring Data Through Visualizations: Independent Investigations
  8 | 
  9 | Load the packages and data we'll need for the project
 10 | 
 11 | ```{r}
 12 | library(tidyverse)
 13 | 
 14 | forest_fires <- read_csv("forestfires.csv")
 15 | ```
 16 | 
 17 | # The Importance of Forest Fire Data
 18 | 
 19 | ```{r}
 20 | # What columns are in the dataset?
 21 | colnames(forest_fires)
 22 | ```
 23 | 
 24 | We know that the columns correspond to the following information:
 25 | 
 26 | * **X**: X-axis spatial coordinate within the Montesinho park map: 1 to 9 
 27 | * **Y**: Y-axis spatial coordinate within the Montesinho park map: 2 to 9 
 28 | * **month**: Month of the year: 'jan' to 'dec' 
 29 | * **day**: Day of the week: 'mon' to 'sun' 
 30 | * **FFMC**: Fine Fuel Moisture Code index from the FWI system: 18.7 to 96.20 
 31 | * **DMC**: Duff Moisture Code index from the FWI system: 1.1 to 291.3 
 32 | * **DC**: Drought Code index from the FWI system: 7.9 to 860.6 
 33 | * **ISI**: Initial Spread Index from the FWI system: 0.0 to 56.10 
 34 | * **temp**: Temperature in Celsius degrees: 2.2 to 33.30 
 35 | * **RH**: Relative humidity in percentage: 15.0 to 100 
 36 | * **wind**: Wind speed in km/h: 0.40 to 9.40 
 37 | * **rain**: Outside rain in mm/m2 : 0.0 to 6.4 
 38 | * **area**: The burned area of the forest (in ha): 0.00 to 1090.84 
 39 | 
 40 | A single row corresponds to the location of a fire and some characteristics about the fire itself. Higher water presence is typically asssociated with less fire spread, so we might expect the water-related variables (`DMC` and `rain`) to be related with `area`.
 41 | 
 42 | # Data Processing
 43 | 
 44 | `month` and `day` are character vartiables, but we know that there is an inherent order to them. We'll convert these variables into factors so that they'll be sorted into the correct order when we plot them.
 45 | 
 46 | ```{r}
 47 | forest_fires %>% pull(month) %>% unique
 48 | ```
 49 | 
 50 | ```{r}
 51 | forest_fires %>% pull(day) %>% unique
 52 | ```
 53 | 
 54 | This guided project will assume that Sunday is the first day of the week, but feel free to adjust the levels according to what's comfortable to you. Ultimately, the levels just help us rearrange the resulting plots in an order that makes sense to us.
 55 | 
 56 | ```{r}
 57 | month_order <- c("jan", "feb", "mar",
 58 |                  "apr", "may", "jun",
 59 |                  "jul", "aug", "sep",
 60 |                  "oct", "nov", "dec")
 61 | 
 62 | dow_order <- c("sun", "mon", "tue", "wed", "thu", "fri", "sat")
 63 | 
 64 | forest_fires <- forest_fires %>% 
 65 |   mutate(
 66 |     month = factor(month, levels = month_order),
 67 |     day = factor(day, levels = dow_order)
 68 |   )
 69 | ```
 70 | 
 71 | # When Do Most Forest Fires Occur?
 72 | 
 73 | We need to create a ssummary tibble that counts the number of fires that appears in each month. Then, we'll be able to use this tibble in a visualization. We can consider `month` and `day` to be different grouping variablse, so our code to produce the tibbles and plots will look similar.
 74 | 
 75 | ## Month Level
 76 | 
 77 | ```{r}
 78 | fires_by_month <- forest_fires %>%
 79 |   group_by(month) %>%
 80 |   summarize(total_fires = n())
 81 | 
 82 | fires_by_month %>% 
 83 |   ggplot(aes(x = month, y = total_fires)) +
 84 |   geom_col() +
 85 |   labs(
 86 |     title = "Number of forest fires in data by month",
 87 |     y = "Fire count",
 88 |     x = "Month"
 89 |   )
 90 | ```
 91 | 
 92 | ```{r}
 93 | fires_by_dow <- forest_fires %>%
 94 |   group_by(day) %>%
 95 |   summarize(total_fires = n())
 96 | 
 97 | fires_by_dow %>% 
 98 |   ggplot(aes(x = day, y = total_fires)) +
 99 |   geom_col() +
100 |   labs(
101 |     title = "Number of forest fires in data by day of the week",
102 |     y = "Fire count",
103 |     x = "Day of the week"
104 |   )
105 | ```
106 | 
107 | We see a massive spike in fires in August and September, as well as a smaller spike in March. Fires seem to be more frequent on the weekend.
108 | 
109 | # Plotting Other Variables Against Time 
110 | 
111 | ```{r}
112 | forest_fires_long <- forest_fires %>% 
113 |   pivot_longer(
114 |     cols = c("FFMC", "DMC", "DC", 
115 |              "ISI", "temp", "RH", 
116 |              "wind", "rain"),
117 |     names_to = "data_col",
118 |     values_to = "value"
119 |   )
120 | 
121 | forest_fires_long %>% 
122 |   ggplot(aes(x = month, y = value)) +
123 |   geom_boxplot() +
124 |   facet_wrap(vars(data_col), scale = "free_y") +
125 |   labs(
126 |     title = "Variable changes over month",
127 |     x = "Month",
128 |     y = "Variable value"
129 |   )
130 | ```
131 | 
132 | # Examining Forest Fire Severity
133 | 
134 | We are trying to see how each of the variables in the dataset relate to `area`. We can leverage the long format version of the data we created to use with `facet_wrap()`.
135 | 
136 | ```{r}
137 | forest_fires_long %>% 
138 |   ggplot(aes(x = value, y = area)) +
139 |   geom_point() +
140 |   facet_wrap(vars(data_col), scales = "free_x") +
141 |   labs(
142 |     title = "Relationships between other variables and area burned",
143 |     x = "Value of column",
144 |     y = "Area burned (hectare)"
145 |   )
146 | ```
147 | 
148 | # Outlier Problems
149 | 
150 | It seems that there are two rows where `area` that still hurt the scale of the visualization. Let's make a similar visualization that excludes these observations so that we can better see how each variable relates to `area`.
151 | 
152 | ```{r}
153 | forest_fires_long %>% 
154 |   filter(area < 300) %>% 
155 |   ggplot(aes(x = value, y = area)) +
156 |   geom_point() +
157 |   facet_wrap(vars(data_col), scales = "free_x") +
158 |   labs(
159 |     title = "Relationships between other variables and area burned (area < 300)",
160 |     x = "Value of column",
161 |     y = "Area burned (hectare)"
162 |   )
163 | ```


--------------------------------------------------------------------------------
/Mission304Solutions.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Python Intermediate: Creating a SimpleFrame Class"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "# Designing Our Class\n",
 15 |     "\n",
 16 |     "SimpleFrame should make it easy for us to load , preview, manipulate, and make calculations with our data. \n",
 17 |     "\n",
 18 |     "To preview our data, we’ll need to:\n",
 19 |     "- Be able to view the first five rows\n",
 20 |     "- Be able to view the shape of our data\n",
 21 |     "\n",
 22 |     "To manipulate our data, we’ll need to: \n",
 23 |     "- Add new columns\n",
 24 |     "- Be able to apply values to columns\n",
 25 |     "- Be able to subset our data\n",
 26 |     "\n",
 27 |     "To make calculations, we’ll need to:\n",
 28 |     "- Finding the minimum\n",
 29 |     "- Finding the maximum\n",
 30 |     "- Finding the mean\n",
 31 |     "- Finding the standard deviation"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "markdown",
 36 |    "metadata": {},
 37 |    "source": [
 38 |     "# Translating our words into objects\n",
 39 |     "\n",
 40 |     "- SimpleFrame -> Class\n",
 41 |     "- Load -> Method\n",
 42 |     "- Data -> Attribute\n",
 43 |     "- Columns -> Attribute\n",
 44 |     "\n",
 45 |     "## Preview\n",
 46 |     "\n",
 47 |     "- View the first five rows -> Method\n",
 48 |     "- View num of rows/cols of our data -> Method\n",
 49 |     "\n",
 50 |     "## Manipulate\n",
 51 |     "\n",
 52 |     "- Add new columns -> Method\n",
 53 |     "- Apply values to columns -> Method\n",
 54 |     "- Subset our data -> Method\n",
 55 |     "\n",
 56 |     "## Calculations\n",
 57 |     "\n",
 58 |     "- Minimum -> Method\n",
 59 |     "- Maximum -> Method\n",
 60 |     "- Mean -> Method\n",
 61 |     "- Standard deviation -> Method"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": 2,
 67 |    "metadata": {},
 68 |    "outputs": [
 69 |     {
 70 |      "name": "stdout",
 71 |      "output_type": "stream",
 72 |      "text": [
 73 |       "2\n",
 74 |       "['Reggaetón Lento (Bailemos)', 'CNCO', '9998']\n",
 75 |       "['Ay Mi Dios', 'IAmChino', '10000']\n"
 76 |      ]
 77 |     }
 78 |    ],
 79 |    "source": [
 80 |     "import csv\n",
 81 |     "from statistics import mean, stdev, median, mode\n",
 82 |     "\n",
 83 |     "class SimpleFrame():\n",
 84 |     "    def __init__(self, filename):\n",
 85 |     "        self.filename = filename\n",
 86 |     "    \n",
 87 |     "    def read_data(self):\n",
 88 |     "        '''\n",
 89 |     "        Reads and opens the data\n",
 90 |     "        '''\n",
 91 |     "        f = open(self.filename,\"r\")\n",
 92 |     "        self.data = list(csv.reader(f))\n",
 93 |     "        self.columns = self.data[0]\n",
 94 |     "    \n",
 95 |     "    def head(self):\n",
 96 |     "        '''\n",
 97 |     "        Displays the first five rows\n",
 98 |     "        '''\n",
 99 |     "        return self.data[:5]\n",
100 |     "        \n",
101 |     "    \n",
102 |     "    def shape(self):\n",
103 |     "        num_rows = 0\n",
104 |     "        for row in self.data:\n",
105 |     "            num_rows += 1\n",
106 |     "        \n",
107 |     "        num_cols = len(self.data[0])\n",
108 |     "        return [num_rows, num_cols]\n",
109 |     "    \n",
110 |     "    def new_column(self, column_name):\n",
111 |     "        for pos, d in enumerate(self.data):\n",
112 |     "            if pos == 0:\n",
113 |     "                d.append(column_name)\n",
114 |     "            else:\n",
115 |     "                d.append('NA')\n",
116 |     "    \n",
117 |     "    def apply(self, column_name, new_value):\n",
118 |     "        for pos, col in enumerate(self.data[0]):\n",
119 |     "            if col == column_name:\n",
120 |     "                column_index = pos\n",
121 |     "        \n",
122 |     "        for data in self.data[1:]:\n",
123 |     "            data[column_index] = new_value\n",
124 |     "    \n",
125 |     "    def subset(self, column_name, row_value):\n",
126 |     "        for pos, col in enumerate(self.data[0]):\n",
127 |     "            if col == column_name:\n",
128 |     "                column_index = pos\n",
129 |     "        \n",
130 |     "        print(column_index)\n",
131 |     "        subset_data = []\n",
132 |     "        for data in self.data[1:]:\n",
133 |     "            if row_value in data:\n",
134 |     "                subset_data.append(data[column_index])\n",
135 |     "        return subset_data\n",
136 |     "\n",
137 |     "    \n",
138 |     "    def summary_stats(self, column_name):\n",
139 |     "        for pos, col in enumerate(self.data[0]):\n",
140 |     "            if col == column_name:\n",
141 |     "                column_index = pos\n",
142 |     "\n",
143 |     "        num_data = [data[column_index] for data in self.data[1:]]\n",
144 |     "        m = statistics.mean(num_data)\n",
145 |     "        std = stdev(num_data)\n",
146 |     "        median = statistics.median(num_data)\n",
147 |     "        \n",
148 |     "        print(\"Mean is {mean}\".format(mean= m))\n",
149 |     "        print(\"Standard Deviation is {std}\".format(std= std))\n",
150 |     "        print(\"Median is {median}\".format(median= median))\n",
151 |     "        \n",
152 |     "            \n",
153 |     "    def minimum(self, column):\n",
154 |     "        for pos, col in enumerate(self.data[0]):\n",
155 |     "            if col == column:\n",
156 |     "                column_index = pos\n",
157 |     "\n",
158 |     "        ## Find min value\n",
159 |     "        col_data = []\n",
160 |     "        for row in self.data[1:]:\n",
161 |     "            col_data.append([row[1],row[2],row[column_index]])\n",
162 |     "        \n",
163 |     "        return min(col_data, key= lambda x: x[2])\n",
164 |     "    \n",
165 |     "    def maximum(self, column):\n",
166 |     "        for pos, col in enumerate(self.data[0]):\n",
167 |     "            if col == column:\n",
168 |     "                column_index = pos\n",
169 |     "        ## Find min value\n",
170 |     "        col_data = []\n",
171 |     "        for row in self.data[1:]:\n",
172 |     "            col_data.append([row[1],row[2],row[column_index]])\n",
173 |     "        return max(col_data, key= lambda x: x[2])\n",
174 |     "    \n",
175 |     "s = SimpleFrame(\"music_data.csv\")\n",
176 |     "s.read_data()\n",
177 |     "\n",
178 |     "s.shape()\n",
179 |     "s.columns\n",
180 |     "s.new_column('hello')\n",
181 |     "s.subset(\"Artist\",\"Shakira\")\n",
182 |     "print(s.maximum(\"Streams\"))\n",
183 |     "print(s.minimum(\"Streams\"))"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "markdown",
188 |    "metadata": {},
189 |    "source": [
190 |     "# Results\n",
191 |     "\n",
192 |     "The song that had the highest number of streams in one day was Despacito by Luis Fonsi with 64238 streams. \n",
193 |     "\n",
194 |     "The song that had the lowest number of streams in one day was Por Fin Te Encontre by Cali Y El Dandee with 1993. \n"
195 |    ]
196 |   }
197 |  ],
198 |  "metadata": {
199 |   "kernelspec": {
200 |    "display_name": "Python 3",
201 |    "language": "python",
202 |    "name": "python3"
203 |   },
204 |   "language_info": {
205 |    "codemirror_mode": {
206 |     "name": "ipython",
207 |     "version": 3
208 |    },
209 |    "file_extension": ".py",
210 |    "mimetype": "text/x-python",
211 |    "name": "python",
212 |    "nbconvert_exporter": "python",
213 |    "pygments_lexer": "ipython3",
214 |    "version": "3.6.3"
215 |   }
216 |  },
217 |  "nbformat": 4,
218 |  "nbformat_minor": 2
219 | }
220 | 


--------------------------------------------------------------------------------
/Mission327Solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Solutions for Guided Project: Exploring NYC Schools Survey Data"
  3 | author: "Rose Martin"
  4 | data: "January 22, 2019"
  5 | output: html_document
  6 | ---
  7 | 
  8 | **Here are suggested solutions to the questions in the Data Cleaning With R Guided Project: Exploring NYC Schools Survey Data.**
  9 | 
 10 | Load the packages you'll need for your analysis
 11 | 
 12 | ```{r}
 13 | library(readr)
 14 | library(dplyr)
 15 | library(stringr)
 16 | library(purrr)
 17 | library(tidyr)
 18 | library(ggplot2)
 19 | ```
 20 | 
 21 | Import the data into R.
 22 | 
 23 | ```{r}
 24 | combined <- read_csv("combined.csv") 
 25 | survey <- read_tsv("survey_all.txt")
 26 | survey_d75 <- read_tsv("survey_d75.txt")
 27 | ```
 28 | 
 29 | Filter `survey` data to include only high schools and select columns needed for analysis based on the data dictionary.
 30 | 
 31 | ```{r}
 32 | survey_select <- survey %>%
 33 |   filter(schooltype == "High School") %>%
 34 |   select(dbn:aca_tot_11)
 35 | ```
 36 | 
 37 | Select columns needed for analysis from `survey_d75`.
 38 | 
 39 | ```{r}
 40 | survey_d75_select <- survey_d75 %>%       
 41 |   select(dbn:aca_tot_11)
 42 | ```
 43 | 
 44 | Combine `survey` and `survey_d75` data frames.
 45 | 
 46 | ```{r}
 47 | survey_total <- survey_select %>% 
 48 |   bind_rows(survey_d75_select)
 49 | ```
 50 | 
 51 | Rename `survey_total` variable `dbn` to `DBN` so can use as key to join with the `combined` data frame.
 52 | 
 53 | ```{r}
 54 | survey_total <- survey_total %>%
 55 |   rename(DBN = dbn)
 56 | ```
 57 | 
 58 | Join the `combined` and `survey_total` data frames. Use `left_join()` to keep only survey data that correspond to schools for which we have data in `combined`.
 59 | 
 60 | ```{r}
 61 | combined_survey <- combined %>%
 62 |   left_join(survey_total, by = "DBN")
 63 | ```
 64 | 
 65 | Create a correlation matrix to look for interesting relationships between pairs of variables in `combined_survey` and convert it to a tibble so it's easier to work with using tidyverse tools.
 66 | 
 67 | ```{r}
 68 | cor_mat <- combined_survey %>%    ## interesting relationshipsS
 69 |   select(avg_sat_score, saf_p_11:aca_tot_11) %>%
 70 |   cor(use = "pairwise.complete.obs")
 71 | 
 72 | cor_tib <- cor_mat %>%
 73 |   as_tibble(rownames = "variable")
 74 | ```
 75 | 
 76 | Look for correlations of other variables with `avg_sat_score` that are greater than 0.25 or less than -0.25 (strong correlations).
 77 | 
 78 | ```{r}
 79 | strong_cors <- cor_tib %>%
 80 |   select(variable, avg_sat_score) %>%
 81 |   filter(avg_sat_score > 0.25 | avg_sat_score < -0.25)  
 82 | ```
 83 | 
 84 | Make scatter plots of those variables with `avg_sat_score` to examine relationships more closely.
 85 | 
 86 | ```{r}
 87 | create_scatter <- function(x, y) {     
 88 |   ggplot(data = combined_survey) + 
 89 |     aes_string(x = x, y = y) +
 90 |     geom_point(alpha = 0.3) +
 91 |     theme(panel.background = element_rect(fill = "white"))
 92 | }
 93 | 
 94 | x_var <- strong_cors$variable[2:5]
 95 | y_var <- "avg_sat_score"
 96 |   
 97 | map2(x_var, y_var, create_scatter)
 98 | ```
 99 | 
100 | Reshape the data so that you can investigate differences in student, parent, and teacher responses to survey questions.
101 | 
102 | ```{r}
103 | # combined_survey_gather <- combined_survey %>%
104 | #   gather(key = "survey_question", value = score, saf_p_11:aca_tot_11)
105 | 
106 | combined_survey_gather <- combined_survey %>%
107 |   pivot_longer(cols = saf_p_11:aca_tot_11,
108 |                names_to = "survey_question",
109 |                values_to = "score")
110 | ```
111 | 
112 | Use `str_sub()` to create new variables, `response_type` and `question`, from the `survey_question` variable.
113 | 
114 | ```{r}
115 | combined_survey_gather <- combined_survey_gather %>%
116 |   mutate(response_type = str_sub(survey_question, 4, 6)) %>%   
117 |   mutate(question = str_sub(survey_question, 1, 3))
118 | ```
119 | 
120 | Replace `response_type` variable values with names "parent", "teacher", "student", "total" using `if_else()` function.
121 | 
122 | ```{r}
123 | combined_survey_gather <- combined_survey_gather %>%
124 |   mutate(response_type = ifelse(response_type  == "_p_", "parent", 
125 |                                 ifelse(response_type == "_t_", "teacher",
126 |                                        ifelse(response_type == "_s_", "student", 
127 |                                               ifelse(response_type == "_to", "total", "NA")))))
128 | ```
129 | 
130 | Make a boxplot to see if there appear to be differences in how the three groups of responders (parents, students, and teachers) answered the four questions. 
131 | 
132 | ```{r}
133 | combined_survey_gather %>%
134 |   filter(response_type != "total") %>%
135 |   ggplot(aes(x = question, y = score, fill = response_type)) +
136 |   geom_boxplot()
137 | ```
138 | 


--------------------------------------------------------------------------------
/Mission368Solutions.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "# Set up libraries and look at first few rows\n",
 10 |     "library(RSQLite)\n",
 11 |     "library(DBI)\n",
 12 |     "\n",
 13 |     "conn = dbConnect(SQLite(), \"./factbook.db\")\n",
 14 |     "q1 = \"SELECT * FROM facts LIMIT 5\"\n",
 15 |     "result1 = dbGetQuery(conn, q1)"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": null,
 21 |    "metadata": {},
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "# Looking at summary statistics\n",
 25 |     "q2 = \"SELECT MIN(population), MAX(population), MIN(population_growth), MAX(population_growth) FROM facts\"\n",
 26 |     "result2 = dbGetQuery(conn, q2)"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": null,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "# Investigating outlier values\n",
 36 |     "q3 = \"SELECT * FROM facts WHERE (population == (SELECT MAX(population) FROM facts))\"\n",
 37 |     "result3 = dbGetQuery(conn, q3)\n",
 38 |     "\n",
 39 |     "q4 = \"SELECT * FROM facts WHERE (population == (SELECT MIN(population) FROM facts))\"\n",
 40 |     "result4 = dbGetQuery(conn, q4)"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": null,
 46 |    "metadata": {},
 47 |    "outputs": [],
 48 |    "source": [
 49 |     "# Omitting outlier values from the query\n",
 50 |     "q5 = \"SELECT population, population_growth, birth_rate, death_rate FROM facts WHERE ((population != (SELECT MAX(population) FROM facts)) AND (population != (SELECT MIN(population) FROM facts)))\"\n",
 51 |     "result5 = dbGetQuery(conn, q5)"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "# Plotting histograms for the variables from Q5\n",
 61 |     "library(tidyverse)\n",
 62 |     "\n",
 63 |     "tidy_result5 = result5 %>%\n",
 64 |     "gather(., key = \"variable\", value = \"val\")\n",
 65 |     "\n",
 66 |     "ggplot(data = result5, aes(x = val)) +\n",
 67 |     "geom_histogram() + \n",
 68 |     "facet_grid(~ variable)"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": null,
 74 |    "metadata": {},
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "# Calculating and sorting by population density\n",
 78 |     "q7 = \"SELECT name, cast(population as float)/cast(area as float) density FROM facts ORDER BY density DESC\"\n",
 79 |     "result7 = dbGetQuery(conn, q7)"
 80 |    ]
 81 |   }
 82 |  ],
 83 |  "metadata": {
 84 |   "kernelspec": {
 85 |    "display_name": "Python 3",
 86 |    "language": "python",
 87 |    "name": "python3"
 88 |   },
 89 |   "language_info": {
 90 |    "codemirror_mode": {
 91 |     "name": "ipython",
 92 |     "version": 3
 93 |    },
 94 |    "file_extension": ".py",
 95 |    "mimetype": "text/x-python",
 96 |    "name": "python",
 97 |    "nbconvert_exporter": "python",
 98 |    "pygments_lexer": "ipython3",
 99 |    "version": "3.7.3"
100 |   }
101 |  },
102 |  "nbformat": 4,
103 |  "nbformat_minor": 2
104 | }
105 | 


--------------------------------------------------------------------------------
/Mission374Solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Answering Business Questions using SQL (Intermediate SQL in R): Guided Project Solutions"
  3 | output: html_document
  4 | ---
  5 | 
  6 | # Creating Helper Functions
  7 | 
  8 | ```{r}
  9 | library(RSQLite)
 10 | library(DBI)
 11 | 
 12 | db <- 'chinook.db'
 13 | 
 14 | run_query <- function(q) {
 15 |   conn <- dbConnect(SQLite(), db)
 16 |   result <- dbGetQuery(conn, q)
 17 |   dbDisconnect(conn)
 18 |   return(result)
 19 | }
 20 | 
 21 | show_tables <- function() {
 22 |   q = "SELECT name, type FROM sqlite_master WHERE type IN ('table', 'view')"
 23 |   return(run_query(q))
 24 | }
 25 | 
 26 | show_tables()
 27 | ```
 28 | 
 29 | # Selecting New Albums to Purchase
 30 | 
 31 | ```{r}
 32 | albums_to_purchase = '
 33 | WITH usa_tracks_sold AS
 34 |    (
 35 |     SELECT il.* FROM invoice_line il
 36 |     INNER JOIN invoice i on il.invoice_id = i.invoice_id
 37 |     INNER JOIN customer c on i.customer_id = c.customer_id
 38 |     WHERE c.country = "USA"
 39 |    )
 40 | SELECT
 41 |     g.name genre,
 42 |     count(uts.invoice_line_id) tracks_sold,
 43 |     cast(count(uts.invoice_line_id) AS FLOAT) / (
 44 |         SELECT COUNT(*) from usa_tracks_sold
 45 |     ) percentage_sold
 46 | FROM usa_tracks_sold uts
 47 | INNER JOIN track t on t.track_id = uts.track_id
 48 | INNER JOIN genre g on g.genre_id = t.genre_id
 49 | GROUP BY 1
 50 | ORDER BY 2 DESC
 51 | LIMIT 10;
 52 | '
 53 | 
 54 | run_query(albums_to_purchase)
 55 | ```
 56 | 
 57 | ```{r}
 58 | library(ggplot2)
 59 | genre_sales = run_query(albums_to_purchase)
 60 | ggplot(data = genre_sales, aes(x = reorder(genre, -percentage_sold), 
 61 |                                y = percentage_sold)) +
 62 |   geom_bar(stat = "identity")
 63 | ```
 64 | 
 65 | Among the genres represented in our list of 4 albums, punk, blues and pop are the highest rated. Therefore, we should recommend:
 66 | 
 67 | - Red Tone (Punk)
 68 | - Slim Jim Bites (Blues)
 69 | - Meteor and the Girls (Pop)
 70 | 
 71 | By far though, rock makes up the majority of the sales. To better capture sales in the USA, we might want to ask the record label if they have any up-and-coming rock bands.
 72 | 
 73 | # Analyzing Employee Sales Performance
 74 | 
 75 | ```{r}
 76 | employee_sales_performance = '
 77 | WITH customer_support_rep_sales AS
 78 |     (
 79 |      SELECT
 80 |          i.customer_id,
 81 |          c.support_rep_id,
 82 |          SUM(i.total) total
 83 |      FROM invoice i
 84 |      INNER JOIN customer c ON i.customer_id = c.customer_id
 85 |      GROUP BY 1,2
 86 |     )
 87 | 
 88 | SELECT
 89 |     e.first_name || " " || e.last_name employee,
 90 |     e.hire_date,
 91 |     SUM(csrs.total) total_sales
 92 | FROM customer_support_rep_sales csrs
 93 | INNER JOIN employee e ON e.employee_id = csrs.support_rep_id
 94 | GROUP BY 1;
 95 | '
 96 | 
 97 | run_query(employee_sales_performance)
 98 | ```
 99 | 
100 | ```{r}
101 | employee_sales = run_query(employee_sales_performance)
102 | ggplot(data = employee_sales, aes(x = reorder(employee, -total_sales), 
103 |                                y = total_sales)) +
104 |   geom_bar(stat = "identity")
105 | ```
106 | 
107 | Jane Peacock has the highest amount of sales, but she also has been at the company the longest. If we really want to hone in on employee efficiency, we might want to standardize sales by the number of days or hours worked. 
108 | 
109 | # Visualizing Sales by Country
110 | 
111 | ```{r}
112 | sales_by_country = '
113 | WITH country_or_other AS
114 |     (
115 |      SELECT
116 |        CASE
117 |            WHEN (
118 |                  SELECT count(*)
119 |                  FROM customer
120 |                  where country = c.country
121 |                 ) = 1 THEN "Other"
122 |            ELSE c.country
123 |        END AS country,
124 |        c.customer_id,
125 |        il.*
126 |      FROM invoice_line il
127 |      INNER JOIN invoice i ON i.invoice_id = il.invoice_id
128 |      INNER JOIN customer c ON c.customer_id = i.customer_id
129 |     )
130 | SELECT
131 |     country,
132 |     customers,
133 |     total_sales,
134 |     average_order,
135 |     customer_lifetime_value
136 | FROM
137 |     (
138 |     SELECT
139 |         country,
140 |         count(distinct customer_id) customers,
141 |         SUM(unit_price) total_sales,
142 |         SUM(unit_price) / count(distinct customer_id) customer_lifetime_value,
143 |         SUM(unit_price) / count(distinct invoice_id) average_order,
144 |         CASE
145 |             WHEN country = "Other" THEN 1
146 |             ELSE 0
147 |         END AS sort
148 |     FROM country_or_other
149 |     GROUP BY country
150 |     ORDER BY sort ASC, total_sales DESC
151 |     );
152 | '
153 | 
154 | run_query(sales_by_country)
155 | ```
156 | 
157 | # Visualizing Sales by Country
158 | 
159 | ```{r}
160 | country_metrics = run_query(sales_by_country)
161 | 
162 | ggplot(data = country_metrics, aes(x = reorder(country, -total_sales), 
163 |                                    y = total_sales, 
164 |                                    fill = country)) +
165 |   geom_bar(stat = "identity") +
166 |   labs(
167 |     title = "Total sales by country",
168 |     x = "Country",
169 |     y = "Total Sales"
170 |   ) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
171 | 
172 | ggplot(data = country_metrics, aes(x = reorder(country, -customers), 
173 |                                    y = customers, 
174 |                                    fill = country)) +
175 |   geom_bar(stat = "identity") +
176 |   coord_polar("y") + 
177 |   labs(
178 |     title = "Number of customers by country",
179 |     x = "Country",
180 |     y = "Customers"
181 |   )
182 | 
183 | ggplot(data = country_metrics, aes(x = reorder(country, -customer_lifetime_value), 
184 |                                    y = customer_lifetime_value, 
185 |                                    color = country)) +
186 |   geom_point(stat = "identity") +
187 |   labs(
188 |     title = "Customer lifetime value by country",
189 |     x = "Country",
190 |     y = "Customer Lifetime Value"
191 |   ) + theme(axis.text.x = element_text(angle = 45, hjust = 1))
192 | ```
193 | 
194 | # Albums vs Individual Tracks
195 | 
196 | ```{r}
197 | albums_vs_tracks = '
198 | WITH invoice_first_track AS
199 |     (
200 |      SELECT
201 |          il.invoice_id invoice_id,
202 |          MIN(il.track_id) first_track_id
203 |      FROM invoice_line il
204 |      GROUP BY 1
205 |     )
206 | 
207 | SELECT
208 |     album_purchase,
209 |     COUNT(invoice_id) number_of_invoices,
210 |     CAST(count(invoice_id) AS FLOAT) / (
211 |                                          SELECT COUNT(*) FROM invoice
212 |                                       ) percent
213 | FROM
214 |     (
215 |     SELECT
216 |         ifs.*,
217 |         CASE
218 |             WHEN
219 |                  (
220 |                   SELECT t.track_id FROM track t
221 |                   WHERE t.album_id = (
222 |                                       SELECT t2.album_id FROM track t2
223 |                                       WHERE t2.track_id = ifs.first_track_id
224 |                                      ) 
225 | 
226 |                   EXCEPT 
227 | 
228 |                   SELECT il2.track_id FROM invoice_line il2
229 |                   WHERE il2.invoice_id = ifs.invoice_id
230 |                  ) IS NULL
231 |              AND
232 |                  (
233 |                   SELECT il2.track_id FROM invoice_line il2
234 |                   WHERE il2.invoice_id = ifs.invoice_id
235 | 
236 |                   EXCEPT 
237 | 
238 |                   SELECT t.track_id FROM track t
239 |                   WHERE t.album_id = (
240 |                                       SELECT t2.album_id FROM track t2
241 |                                       WHERE t2.track_id = ifs.first_track_id
242 |                                      ) 
243 |                  ) IS NULL
244 |              THEN "yes"
245 |              ELSE "no"
246 |          END AS "album_purchase"
247 |      FROM invoice_first_track ifs
248 |     )
249 | GROUP BY album_purchase;
250 | '
251 | 
252 | run_query(albums_vs_tracks)
253 | ```
254 | 
255 | Album purchases account for almost a quarter of the total sales, so it is inadvisable to change strategy to just purchase the most popular tracks.


--------------------------------------------------------------------------------
/Mission409Solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Probability Fundamentals in R: Guided Project Solutions"
  3 | output: html_document
  4 | ---
  5 | 
  6 | # Developing A Mobile App For Alleviating Lottery Addiction
  7 | 
  8 | This RMarkdown file is intended to lay out the logic of a mobile app designed for those addicted to the lottery. By showing a user how to calculate the incredibly small probabilities of winning the lottery, we hope that the app will help them better grasp that buying multiple lottery tickets will do little to help them win. Through this understanding, they will hopefully stop purchasing lottery tickets in an unhealthy manner.
  9 | 
 10 | # Core Functions
 11 | 
 12 | ```{r}
 13 | factorial <- function(n) {
 14 |   product = 1
 15 |   for (i in 1:n) {
 16 |     product = product * i
 17 |   }
 18 |   return(product)
 19 | }
 20 | 
 21 | combinations <- function(n, k) {
 22 |   numerator <- factorial(n)
 23 |   denominator <- factorial(k) * factorial(n - k)
 24 |   return(numerator / denominator)
 25 | }
 26 | ```
 27 | 
 28 | # One-Ticket Probability
 29 | 
 30 | ```{r}
 31 | one_ticket_probability <- function(nums) {
 32 |   total_combinations <- combinations(49, 6)
 33 |   prob <- (1 / total_combinations) * 100
 34 |   pretty_prob <- sprintf("%1.9f", prob)
 35 |   s <- paste("You have a ", pretty_prob, "% chance of winning the big prize.", sep = "")
 36 |   return(s)
 37 | }
 38 | 
 39 | one_ticket_probability(c(1, 2, 3, 4, 5, 6))
 40 | ```
 41 | 
 42 | # Historical Data Check for Canada Lottery
 43 | 
 44 | ```{r, message = FALSE, warning = FALSE}
 45 | library(tidyverse)
 46 | lottery649 <- read_csv("649.csv")
 47 | 
 48 | print(dim(lottery649))
 49 | ```
 50 | 
 51 | ```{r}
 52 | head(lottery649, 3)
 53 | ```
 54 | 
 55 | ```{r}
 56 | tail(lottery649, 3)
 57 | ```
 58 | 
 59 | # A New Data Structure
 60 | 
 61 | ```{r}
 62 | data1 <- c(1, 3, 5)
 63 | data2 <- c(2, 4, 6)
 64 | data3 <- c(8, 9, 7)
 65 | 
 66 | ## Answer
 67 | unnamed_list <- list(data1, data2, data3)
 68 | first_vector <- unnamed_list[[1]]
 69 | named_list <-list(first = data1, second = data2, third = data3)
 70 | first_item_sum <- named_list$data1[1] + named_list$data2[1] + named_list$data3[1]
 71 | ```
 72 | 
 73 | # Using pmap
 74 | 
 75 | ```{r}
 76 | data1 <- c(1, 3, 5)
 77 | data2 <- c(2, 4, 6)
 78 | data3 <- c(8, 9, 7)
 79 | data_list <- list(data1, data2, data3)
 80 | 
 81 | ## Answer
 82 | averages <- pmap(data_list, function(x, y, z) { (x + y + z) / 3 })
 83 | first_average <- unlist(averages)[1]
 84 | ```
 85 | 
 86 | 
 87 | # Function for Historical Data Check
 88 | 
 89 | ```{r}
 90 | historical_lots <- pmap(
 91 |   list(
 92 |     u <- lottery649$`NUMBER DRAWN 1`,
 93 |     v <- lottery649$`NUMBER DRAWN 2`,
 94 |     w <- lottery649$`NUMBER DRAWN 3`,
 95 |     x <- lottery649$`NUMBER DRAWN 4`,
 96 |     y <- lottery649$`NUMBER DRAWN 5`,
 97 |     z <- lottery649$`NUMBER DRAWN 6`
 98 |   ), 
 99 |   .f <- function(u, v, w, x, y, z) { c(u, v, w, x, y, z) }
100 |   )
101 | ```
102 | 
103 | ```{r}
104 | library(sets)
105 | check_historical_occurrences <- function(lot, hist_lots = historical_lots) {
106 |   historical_matches <- map(hist_lots, function(x) {setequal(x, lot)})
107 |   num_past_matches <- sum(unlist(historical_matches))
108 |   s <- paste("The combination you entered has appeared ", 
109 |              num_past_matches, 
110 |              " times in the past. ",
111 |              "Your chance of winning the big prize in the next drawing using this combination is 0.0000072%", sep = "")
112 |   return(s)
113 | }
114 | 
115 | check_historical_occurrences(c(3, 12, 11, 14, 41, 43))
116 | check_historical_occurrences(c(1, 2, 3, 4, 5, 6))
117 | ```
118 | 
119 | # Multi-ticket Probability
120 | 
121 | ```{r}
122 | multi_ticket_probability <- function(n) {
123 |   total_combinations <- combinations(49, 6)
124 |   prob <- (n / total_combinations) * 100
125 |   pretty_prob <- sprintf("%1.9f", prob)
126 |   s <- paste("you have a ", pretty_prob, "% chance of winning the big prize.", sep = "")
127 |   return(s)
128 | }
129 | ```
130 | 
131 | ```{r}
132 | test_amounts <- c(1, 10, 100, 10000, 1000000, 6991908, 13983816)
133 | for (n in test_amounts) {
134 |   print(paste("For ", n, " tickets, ",  multi_ticket_probability(n), sep = ""))
135 | }
136 | ```
137 | 
138 | # Less Winning Numbers
139 | 
140 | ```{r}
141 | probability_less_6 <- function(n) {
142 |   
143 |     n_combinations_ticket = combinations(6, n)
144 |     n_combinations_remaining = combinations(43, 6 - n)
145 |     successful_outcomes = n_combinations_ticket * n_combinations_remaining
146 |     n_combinations_total = combinations(49, 6)
147 |     
148 |     prob = (successful_outcomes / n_combinations_total) * 100
149 |     pretty_prob <- sprintf("%1.9f", prob)
150 |   
151 |   s <- paste("you have a ", pretty_prob, "% chance of winning the big prize.", sep = "")
152 |   return(s)
153 | }
154 | ```
155 | 
156 | ```{r}
157 | winning_nums <- c(3, 4, 5)
158 | for (n in winning_nums) {
159 |   print(paste("For ", n, " tickets, ",  probability_less_6(n), sep = ""))
160 | }
161 | ```
162 | 
163 | 


--------------------------------------------------------------------------------
/Mission410Solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'Statistics Fundamentals in R: Guided Project Solutions'
  3 | author: "Dataquest"
  4 | date: "8/13/2019"
  5 | output: html_document
  6 | ---
  7 | 
  8 | # Is Fandango Still Inflating Ratings?
  9 | In October 2015, Walt Hickey from FiveThirtyEight published a [popular article](https://fivethirtyeight.com/features/fandango-movies-ratings/) where he presented strong evidence which suggest that Fandango's movie rating system was biased and dishonest. In this project, we'll analyze more recent movie ratings data to determine whether there has been any change in Fandango's rating system after Hickey's analysis.
 10 | 
 11 | # Understanding the Data
 12 | We'll work with two samples of movie ratings: the data in one sample was collected *previous* to Hickey's analysis, while the other sample was collected *after*. Let's start by reading in the two samples (which are stored as CSV files) and getting familiar with their structure.
 13 | 
 14 | ```{r message=FALSE}
 15 | library(readr)
 16 | 
 17 | previous <- read_csv('fandango_score_comparison.csv')
 18 | after <- read_csv('movie_ratings_16_17.csv')
 19 | 
 20 | head(previous)
 21 | ```
 22 | 
 23 | ```{r}
 24 | head(after)
 25 | ```
 26 | 
 27 | Below we isolate only the columns that provide information about Fandango so we make the relevant data more readily available for later use. 
 28 | 
 29 | ```{r message=FALSE}
 30 | library(dplyr)
 31 | fandango_previous <- previous %>% 
 32 |   select(FILM, Fandango_Stars, Fandango_Ratingvalue, 
 33 |          Fandango_votes, Fandango_Difference)
 34 | 
 35 | fandango_after <- after %>% 
 36 |   select(movie, year, fandango)
 37 | 
 38 | head(fandango_previous)
 39 | ```
 40 | 
 41 | ```{r}
 42 | head(fandango_after)
 43 | ```
 44 | 
 45 | Our goal is to determine whether there has been any change in Fandango's rating system after Hickey's analysis. The population of interest for our analysis is made of all the movie ratings stored on Fandango's website, regardless of the releasing year.
 46 | 
 47 | Because we want to find out whether the parameters of this population changed after Hickey's analysis, we're interested in sampling the population at two different periods in time — previous and after Hickey's analysis — so we can compare the two states.
 48 | 
 49 | The data we're working with was sampled at the moments we want: one sample was taken previous to the analysis, and the other after the analysis. We want to describe the population, so we need to make sure that the samples are representative, otherwise we should expect a large sampling error and, ultimately, wrong conclusions.
 50 | 
 51 | From Hickey's article and from the README.md of [the data set's repository](https://github.com/fivethirtyeight/data/tree/master/fandango), we can see that he used the following sampling criteria:
 52 | 
 53 | * The movie must have had at least 30 fan ratings on Fandango's website at the time of sampling (Aug. 24, 2015).
 54 | * The movie must have had tickets on sale in 2015.
 55 | 
 56 | The sampling was clearly not random because not every movie had the same chance to be included in the sample — some movies didn't have a chance at all (like those having under 30 fan ratings or those without tickets on sale in 2015). It's questionable whether this sample is representative of the entire population we're interested to describe. It seems more likely that it isn't, mostly because this sample is subject to temporal trends — e.g. movies in 2015 might have been outstandingly good or bad compared to other years.
 57 | 
 58 | The sampling conditions for our other sample were (as it can be read in the README.md of [the data set's repository](https://github.com/mircealex/Movie_ratings_2016_17)):
 59 | 
 60 | * The movie must have been released in 2016 or later.
 61 | * The movie must have had a considerable number of votes and reviews (unclear how many from the README.md or from the data).
 62 | 
 63 | This second sample is also subject to temporal trends and it's unlikely to be representative of our population of interest.
 64 | 
 65 | Both these authors had certain research questions in mind when they sampled the data, and they used a set of criteria to get a sample that would fit their questions. Their sampling method is called [purposive sampling](https://www.youtube.com/watch?v=CdK7N_kTzHI&feature=youtu.be) (or judgmental/selective/subjective sampling). While these samples were good enough for their research, they don't seem too useful for us.
 66 | 
 67 | # Changing the Goal of our Analysis
 68 | At this point, we can either collect new data or change our the goal of our analysis. We choose the latter and place some limitations on our initial goal.
 69 | 
 70 | Instead of trying to determine whether there has been any change in Fandango's rating system after Hickey's analysis, our new goal is to determine whether there's any difference between Fandango's ratings for popular movies in 2015 and Fandango's ratings for popular movies in 2016. This new goal should also be a fairly good proxy for our initial goal.
 71 | 
 72 | # Isolating the Samples We Need
 73 | With this new research goal, we have two populations of interest:
 74 | 
 75 | 1. All Fandango's ratings for popular movies released in 2015.
 76 | 1. All Fandango's ratings for popular movies released in 2016.
 77 | 
 78 | We need to be clear about what counts as popular movies. We'll use Hickey's benchmark of 30 fan ratings and count a movie as popular only if it has 30 fan ratings or more on Fandango's website.
 79 | 
 80 | Although one of the sampling criteria in our second sample is movie popularity, the `fandango_after` dataframe doesn't provide information about the number of fan ratings. We should be skeptical once more and ask whether this sample is truly representative and contains popular movies (movies with over 30 fan ratings).
 81 | 
 82 | One quick way to check the representativity of this sample might be to sample randomly 10 movies from it and then check the number of fan ratings ourselves on Fandango's website. 
 83 | 
 84 | ```{r}
 85 | set.seed(1)
 86 | sample_n(fandango_after, size = 10)
 87 | ```
 88 | 
 89 | Above we used a value of 1 as the random seed. This is good practice because it suggests that we weren't trying out various random seeds just to get a favorable sample.
 90 | 
 91 | After checking the number of fan ratings for the movies above, we discover that as of August, 2019 Fandango no longer uses the 5-Star Fan Ratings described above. Instead, Fandango now uses the [Rotten Tomatoes verified Audience Score](https://editorial.rottentomatoes.com/article/introducing-verified-audience-score/). These are the number of fan ratings we found on [Rotten Tomatoes](https://www.rottentomatoes.com/):
 92 | 
 93 | ```{r}
 94 | set.seed(1)
 95 | sampled <- sample_n(fandango_after, size = 10)
 96 | # Create a single column tibble of Rotten Tomato review counts
 97 | reviews <- tibble(reviews = c(13569, 74904, 24293, 4141, 30183, 48952, 14328, 59359, 54765, 82222))
 98 | bind_cols(sampled, reviews)
 99 | ```
100 | 
101 | All ten movies sampled have well above 30 fan ratings, but it is possible that the Rotten Tomatoes Verified Audience user base is larger than the Fandango user base. We cannot really say with confidence whether these review numbers are comparable to the Fandango fan ratings. In addition, time has passed since Hickey's analysis, giving more fans an opportunity to submit reviews. So even if we did still have access to Fandango's 5-star fan ratings, we would have no way to compare the number of fan ratings we see to the number that Hickey observed. 
102 | 
103 | Let's move on to the `fandango_previous` dataframe that does include the number of fan ratings for each movie. The documentation states clearly that there're only movies with at least 30 fan ratings, but it should take only a couple of seconds to double-check here.
104 | 
105 | ```{r}
106 | sum(fandango_previous$Fandango_votes < 30)
107 | ```
108 | 
109 | If you explore the two data sets, you'll notice that there are movies with a releasing year different than 2015 or 2016. 
110 | 
111 | ```{r}
112 | head(fandango_previous$FILM, n = 10)
113 | ```
114 | 
115 | 
116 | ```{r}
117 | unique(fandango_after$year)
118 | ```
119 | 
120 | 
121 | For our purposes, we'll need to isolate only the movies released in 2015 and 2016.
122 | 
123 | ```{r}
124 | library(stringr)
125 | fandango_previous <- fandango_previous %>% 
126 |   mutate(year = str_sub(FILM, -5, -2))
127 | ```
128 | 
129 | Let's examine the frequency distribution for the Year column and then isolate the movies released in 2015.
130 | 
131 | ```{r}
132 | fandango_previous %>% 
133 |   group_by(year) %>% 
134 |   summarize(Freq = n())
135 | ```
136 | 
137 | Alternatively, we can use the base R `table()` function because we only need to get a quick view of the distribution.
138 | ```{r}
139 | table(fandango_previous$year)
140 | ```
141 | 
142 | ```{r}
143 | fandango_2015 <- fandango_previous %>% 
144 |   filter(year == 2015)
145 | table(fandango_2015$year)
146 | ```
147 | Great, now let's isolate the movies in the other data set.
148 | ```{r}
149 | head(fandango_after)
150 | ```
151 | 
152 | ```{r}
153 | table(fandango_after$year)
154 | ```
155 | 
156 | ```{r}
157 | fandango_2016 <- fandango_after %>% 
158 |   filter(year == 2016)
159 | table(fandango_2016$year)
160 | ```
161 | 
162 | 
163 | # Comparing Distribution Shapes for 2015 and 2016
164 | 
165 | Our aim is to figure out whether there's any difference between Fandango's ratings for popular movies in 2015 and Fandango's ratings for popular movies in 2016. One way to go about is to analyze and compare the distributions of movie ratings for the two samples.
166 | 
167 | We'll start with comparing the shape of the two distributions using kernel density plots.
168 | 
169 | ```{r}
170 | library(ggplot2)
171 | # 2015 dataframe is specified in the ggplot call
172 | ggplot(data = fandango_2015, 
173 |                aes(x = Fandango_Stars)) +
174 |   geom_density() +
175 |   # 2016 dataframe is specified in the second geom_density() call
176 |   geom_density(data = fandango_2016, 
177 |                aes(x = fandango), color = "blue") +
178 |   labs(title = "Comparing distribution shapes for Fandango's ratings\n(2015 vs 2016)",
179 |        x = "Stars",
180 |        y = "Density") +
181 |   scale_x_continuous(breaks = seq(0, 5, by = 0.5), 
182 |                      limits = c(0, 5))
183 | ```
184 | 
185 | 
186 | 
187 | Two aspects are striking on the figure above:
188 | 
189 | * Both distributions are strongly left skewed.
190 | * The 2016 distribution is slightly shifted to the left relative to the 2015 distribution.
191 | 
192 | The left skew suggests that movies on Fandango are given mostly high and very high fan ratings. Coupled with the fact that Fandango sells tickets, the high ratings are a bit dubious. It'd be really interesting to investigate this further — ideally in a separate project, since this is quite irrelevant for the current goal of our analysis.
193 | 
194 | The slight left shift of the 2016 distribution is very interesting for our analysis. It shows that ratings were slightly lower in 2016 compared to 2015. This suggests that there was a difference indeed between Fandango's ratings for popular movies in 2015 and Fandango's ratings for popular movies in 2016. We can also see the direction of the difference: the ratings in 2016 were slightly lower compared to 2015.
195 | 
196 | ```{r}
197 | fandango_2015 %>% 
198 |   group_by(Fandango_Stars) %>% 
199 |   summarize(Percentage = n() / nrow(fandango_2015) * 100)
200 | ```
201 | 
202 | ```{r}
203 | fandango_2016 %>% 
204 |   group_by(fandango) %>% 
205 |   summarize(Percentage = n() / nrow(fandango_2016) * 100)
206 | ```
207 | 
208 | In 2016, very high ratings (4.5 and 5 stars) had lower percentages compared to 2015. In 2016, under 1% of the movies had a perfect rating of 5 stars, compared to 2015 when the percentage was close to 7%. Ratings of 4.5 were also more popular in 2015 — there were approximately 13% more movies rated with a 4.5 in 2015 compared to 2016.
209 | 
210 | The minimum rating is also lower in 2016 — 2.5 instead of 3 stars, the minimum of 2015. There clearly is a difference between the two frequency distributions.
211 | 
212 | For some other ratings, the percentage went up in 2016. There was a greater percentage of movies in 2016 that received 3.5 and 4 stars, compared to 2015. 3.5 and 4.0 are high ratings and this challenges the direction of the change we saw on the kernel density plots.
213 | 
214 | Determining the Direction of the Change
215 | 
216 | Let's take a couple of summary metrics to get a more precise picture about the direction of the change. In what follows, we'll compute the mean, the median, and the mode for both distributions and then use a bar graph to plot the values.
217 | 
218 | ```{r}
219 | library(tidyr)
220 | 
221 | # Mode function from stackoverflow
222 | mode <- function(x) {
223 |   ux <- unique(x)
224 |   ux[which.max(tabulate(match(x, ux)))]
225 | }
226 | 
227 | summary_2015 <- fandango_2015 %>% 
228 |   summarize(year = "2015",
229 |     mean = mean(Fandango_Stars),
230 |     median = median(Fandango_Stars),
231 |     mode = mode(Fandango_Stars))
232 | 
233 | summary_2016 <- fandango_2016 %>% 
234 |   summarize(year = "2016",
235 |             mean = mean(fandango),
236 |             median = median(fandango),
237 |             mode = mode(fandango))
238 | 
239 | # Combine 2015 & 2016 summary dataframes
240 | summary_df <- bind_rows(summary_2015, summary_2016)
241 | 
242 | # Gather combined dataframe into a format ready for ggplot
243 | summary_df <- summary_df %>% 
244 |   gather(key = "statistic", value = "value", - year)
245 | 
246 | summary_df
247 | ```
248 | 
249 | ```{r}
250 | ggplot(data = summary_df, aes(x = statistic, y = value, fill = year)) +
251 |   geom_bar(stat = "identity", position = "dodge") +
252 |   labs(title = "Comparing summary statistics: 2015 vs 2016",
253 |        x = "",
254 |        y = "Stars")
255 | ```
256 | 
257 | The mean rating was lower in 2016 with approximately 0.2. This means a drop of almost 5% relative to the mean rating in 2015.
258 | 
259 | ```{r}
260 | means <- summary_df %>% 
261 |   filter(statistic == "mean")
262 | 
263 | means %>% 
264 |   summarize(change = (value[1] - value[2]) / value[1])
265 | ```
266 | 
267 | 
268 | 
269 | While the median is the same for both distributions, the mode is lower in 2016 by 0.5. Coupled with what we saw for the mean, the direction of the change we saw on the kernel density plot is confirmed: on average, popular movies released in 2016 were rated slightly lower than popular movies released in 2015.
270 | 
271 | # Conclusion
272 | 
273 | Our analysis showed that there's indeed a slight difference between Fandango's ratings for popular movies in 2015 and Fandango's ratings for popular movies in 2016. We also determined that, on average, popular movies released in 2016 were rated lower on Fandango than popular movies released in 2015.
274 | 
275 | We cannot be completely sure what caused the change, but the chances are very high that it was caused by Fandango fixing the biased rating system after Hickey's analysis.
276 | 
277 | 
278 | 
279 | 
280 | 
281 | 


--------------------------------------------------------------------------------
/Mission443Solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Hypothesis Testing in R: Guided Project Solutions"
  3 | output: html_document
  4 | ---
  5 | 
  6 | We would like to remind our students that our solutions represent just one of the many ways that a programmer might perform the analyses. This solution merely provides a platform for those who need a bit more guidance.
  7 | 
  8 | ```{r setup }
  9 | library(tidyverse)
 10 | ```
 11 | 
 12 | # Data Import
 13 | 
 14 | ```{r}
 15 | jeopardy = read_csv("./data/jeopardy.csv")
 16 | ```
 17 | 
 18 | ```{r}
 19 | head(jeopardy)
 20 | ```
 21 | 
 22 | ```{r}
 23 | colnames(jeopardy)
 24 | ```
 25 | 
 26 | ```{r}
 27 | # the clean_names() function from the janitor package would have been great here too
 28 | colnames(jeopardy) = c("show_number", "air_date", "round", "category", "value", "question", "answer")
 29 | ```
 30 | 
 31 | ```{r}
 32 | sapply(jeopardy, typeof)
 33 | ```
 34 | 
 35 | # Fixing Data Types
 36 | 
 37 | ```{r}
 38 | unique(jeopardy$value)
 39 | ```
 40 | 
 41 | ```{r}
 42 | # Removing Nones, cleaning the text, and converting everything into numeric
 43 | jeopardy = jeopardy %>% 
 44 |   filter(value != "None") %>% 
 45 |   mutate(
 46 |     value = str_replace_all(value, "[$,]", ""),
 47 |     value = as.numeric(value)
 48 |   )
 49 | ```
 50 | 
 51 | ```{r}
 52 | unique(jeopardy$value)
 53 | ```
 54 | 
 55 | # Normalizing Text
 56 | 
 57 | ```{r}
 58 | # The stringr library is automatically brought in when tidyverse is brought in
 59 | 
 60 | # Notice how there is a space in the regular expression
 61 | jeopardy = jeopardy %>% 
 62 |   mutate(
 63 |     question = tolower(question),
 64 |     question = str_replace_all(question, "[^A-Za-z0-9 ]", ""),
 65 |     answer = tolower(answer),
 66 |     answer = str_replace_all(answer, "[^A-Za-z0-9 ]", ""),
 67 |     category = tolower(category),
 68 |     category = str_replace_all(category, "[^A-Za-z0-9 ]", "")
 69 |   )
 70 | ```
 71 | 
 72 | ```{r}
 73 | head(jeopardy)
 74 | ```
 75 | 
 76 | # Making Dates More Accessible
 77 | 
 78 | ```{r}
 79 | jeopardy = jeopardy %>% 
 80 |   separate(., air_date, into = c("year", "month", "day"), sep = "-") %>% 
 81 |   mutate(
 82 |     year = as.numeric(year),
 83 |     month = as.numeric(month),
 84 |     day = as.numeric(day)
 85 |   )
 86 | ```
 87 | 
 88 | # Focusing On Particular Subject Areas
 89 | 
 90 | ```{r}
 91 | n_questions = nrow(jeopardy)
 92 | p_category_expected = 1/3369 
 93 | p_not_category_expected = 3368/3369 
 94 | ```
 95 | 
 96 | ```{r}
 97 | categories = pull(jeopardy, category)
 98 | n_science_categories = 0
 99 | 
100 | # Count how many times the word science appears in the categories
101 | for (c in categories) {
102 |   if ("science" %in% c) {
103 |     n_science_categories = n_science_categories + 1
104 |   }
105 | }
106 | 
107 | science_obs = c(n_science_categories, n_questions - n_science_categories)
108 | p_expected = c(1/3369, 3368/3369)
109 | chisq.test(science_obs, p = p_expected)
110 | ```
111 | 
112 | ```{r}
113 | n_history_categories = 0
114 | 
115 | # Count how many times the word science appears in the categories
116 | for (c in categories) {
117 |   if ("history" %in% c) {
118 |     n_history_categories = n_history_categories + 1
119 |   }
120 | }
121 | 
122 | history_obs = c(n_history_categories, n_questions - n_history_categories)
123 | p_expected = c(1/3369, 3368/3369)
124 | chisq.test(history_obs, p = p_expected)
125 | ```
126 | 
127 | ```{r}
128 | n_shakespeare_categories = 0
129 | 
130 | # Count how many times the word science appears in the categories
131 | for (c in categories) {
132 |   if ("shakespeare" %in% c) {
133 |     n_shakespeare_categories = n_shakespeare_categories + 1
134 |   }
135 | }
136 | 
137 | shakespeare_obs = c(n_shakespeare_categories, n_questions - n_shakespeare_categories)
138 | p_expected = c(1/3369, 3368/3369)
139 | chisq.test(shakespeare_obs, p = p_expected)
140 | ```
141 | 
142 | We see p-values less than 0.05 for each of the hypothesis tests. From this, we would conclude that we should reject the null hypothesis that science doesn't have a higher prevalence than other topics in the Jeopardy data. We would conclude the same with history and Shakespeare.
143 | 
144 | # Unique Terms in Questions
145 | 
146 | ```{r}
147 | # Pull just the questions from the jeopardy data
148 | questions = pull(jeopardy, question)
149 | terms_used = character(0)
150 | 
151 | for (q in questions) {
152 |   # Split the sentence into distinct words
153 |   split_sentence = str_split(q, " ")[[1]]
154 |   
155 |   # Check if each word is longer than 6 and if it's currently in terms_used
156 |   for (term in split_sentence) {
157 |     if (!term %in% terms_used & nchar(term) >= 6) {
158 |       terms_used = c(terms_used, term)
159 |     }
160 |   }
161 | }
162 | ```
163 | 
164 | # Terms In Low and High Value Questions
165 | 
166 | ```{r}
167 | # Going only through the first 20 terms for shortness
168 | # But you can remove the indexing to perform this code on all the terms
169 | values = pull(jeopardy, value)
170 | value_count_data = NULL
171 | 
172 | for (term in terms_used[1:20]) {
173 |   n_high_value = 0
174 |   n_low_value = 0
175 |   
176 |   for (i in 1:length(questions)) {
177 |     # Split the sentence into a new vector
178 |     split_sentence = str_split(questions[i], " ")[[1]]
179 |     
180 |     # Detect if the term is in the question and its value status
181 |     if (term %in% split_sentence & values[i] >= 800) {
182 |       n_high_value = n_high_value + 1
183 |     } else if (term %in% split_sentence & values[i] < 800) { 
184 |       n_low_value = n_low_value + 1
185 |     }
186 |   }
187 |   
188 |   # Testing if the counts for high and low value questions deviates from what we expect
189 |   test = chisq.test(c(n_high_value, n_low_value), p = c(2/5, 3/5))
190 |   new_row = c(term, n_high_value, n_low_value, test$p.value)
191 |   
192 |   # Append this new row to our
193 |   value_count_data = rbind(value_count_data, new_row)
194 |   
195 | }
196 | ```
197 | 
198 | ```{r}
199 | # Take the value count data and put it in a better format
200 | tidy_value_count_data = as_tibble(value_count_data)
201 | colnames(tidy_value_count_data) = c("term", "n_high", "n_low", "p_value")
202 | 
203 | head(tidy_value_count_data)
204 | ```
205 | 
206 | We can see from the output that some of the values are less than 5. Recall that the chi-squared test is prone to errors when the counts in each of the cells are less than 5. We may need to discard these terms and only look at terms where both counts are greater than 5.
207 | 
208 | From the 20 terms that we looked at, it seems that the term "indian" is more associated with high value questions. Interesting!


--------------------------------------------------------------------------------
/Mission475Solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Conditional Probability in R: Guided Project Solutions"
  3 | output: html_document
  4 | ---
  5 | 
  6 | ```{r, warning = FALSE, message = FALSE }
  7 | library(tidyverse)
  8 | set.seed(1)
  9 | options(dplyr.summarise.inform = FALSE)
 10 | ```
 11 | 
 12 | # Introduction
 13 | 
 14 | This analysis is an application of what we've learned in Dataquest's Conditional Probability course. Using a dataset of pre-labeled SMS messages, we'll create a spam filter using the Naive Bayes algorithm.
 15 | 
 16 | ```{r}
 17 | # Bring in the dataset
 18 | spam <- read_csv("spam.csv")
 19 | ```
 20 | 
 21 | The `spam` dataset has `r nrow(spam)` rows and `r ncol(spam)` columns. Of these messages, `r mean(spam$label == "ham") * 100`% of them are not classified as spam, the rest are spam.
 22 | 
 23 | # Training, Cross-validation and Test Sets
 24 | 
 25 | ```{r}
 26 | # Calculate some helper values to split the dataset
 27 | n <- nrow(spam)
 28 | n_training <- 0.8 * n
 29 | n_cv <- 0.1 * n
 30 | n_test <- 0.1 * n
 31 | 
 32 | # Create the random indices for training set
 33 | train_indices <- sample(1:n, size = n_training, replace = FALSE)
 34 | 
 35 | # Get indices not used by the training set
 36 | remaining_indices <- setdiff(1:n, train_indices)
 37 | 
 38 | # Remaining indices are already randomized, just allocate correctly
 39 | cv_indices <- remaining_indices[1:(length(remaining_indices)/2)]
 40 | test_indices <- remaining_indices[((length(remaining_indices)/2) + 1):length(remaining_indices)]
 41 | 
 42 | # Use the indices to create each of the datasets
 43 | spam_train <- spam[train_indices,]
 44 | spam_cv <- spam[cv_indices,]
 45 | spam_test <- spam[test_indices,]
 46 | 
 47 | # Sanity check: are the ratios of ham to spam relatively constant?
 48 | print(mean(spam_train$label == "ham"))
 49 | print(mean(spam_cv$label == "ham"))
 50 | print(mean(spam_test$label == "ham"))
 51 | ```
 52 | 
 53 | The number of ham messages in each dataset is relatively close to each other in each dataset. This is just to make sure that no dataset is entirely just "ham", which ruins the point of spam detection.
 54 | 
 55 | # Data Cleaning
 56 | 
 57 | ```{r}
 58 | # To lowercase, removal of punctuation, weird characters, digits
 59 | tidy_train <- spam_train %>% 
 60 |   mutate(
 61 |     # Take the messages and remove unwanted characters
 62 |     sms = str_to_lower(sms) %>% 
 63 |       str_squish %>% 
 64 |       str_replace_all("[[:punct:]]", "") %>% 
 65 |       str_replace_all("[\u0094\u0092\u0096\n\t]", "") %>% # Unicode characters
 66 |       str_replace_all("[[:digit:]]", "")
 67 |   )
 68 | 
 69 | # Creating the vocabulary
 70 | vocabulary <- NULL
 71 | messages <- tidy_train %>%  pull(sms)
 72 | 
 73 | # Iterate through the messages and add to the vocabulary
 74 | for (m in messages) {
 75 |   words <- str_split(m, " ")[[1]]
 76 |   vocabulary <- c(vocabulary, words)
 77 | }
 78 | 
 79 | # Remove duplicates from the vocabulary 
 80 | vocabulary <- vocabulary %>% unique()
 81 | ```
 82 | 
 83 | # Calculating Constants and Parameters
 84 | 
 85 | ```{r}
 86 | # Isolate the spam and ham messages
 87 | spam_messages <- tidy_train %>% 
 88 |   filter(label == "spam") %>% 
 89 |   pull(sms)
 90 | 
 91 | ham_messages <- tidy_train %>% 
 92 |   filter(label == "ham") %>% 
 93 |   pull(sms)
 94 | 
 95 | # Isolate the vocabulary in spam and ham messages
 96 | spam_vocab <- NULL
 97 | for (sm in spam_messages) {
 98 |   words <- str_split(sm, " ")[[1]]
 99 |   spam_vocab  <- c(spam_vocab, words)
100 | }
101 | spam_vocab
102 | 
103 | ham_vocab <- NULL
104 | for (hm in ham_messages) {
105 |   words <- str_split(hm, " ")[[1]]
106 |   ham_vocab <- c(ham_vocab, words)
107 | }
108 | ham_vocab
109 | 
110 | # Calculate some important parameters from the vocab
111 | n_spam <- spam_vocab %>% length()
112 | n_ham <- ham_vocab %>% length()
113 | n_vocabulary <- vocabulary %>% length()
114 | ```
115 | 
116 | # Calculating Probability Parameters
117 | 
118 | ```{r}
119 | # New vectorized approach to a calculating ham and spam probabilities
120 | 
121 | # Marginal probability of a training message being spam or ham
122 | p_spam <- mean(tidy_train$label == "spam")
123 | p_ham <- mean(tidy_train$label == "ham")
124 | 
125 | # Break up the spam and ham counting into their own tibbles
126 | spam_counts <- tibble(
127 |   word = spam_vocab
128 | ) %>% 
129 |   mutate(
130 |     # Calculate the number of times a word appears in spam
131 |     spam_count = map_int(word, function(w) {
132 |       
133 |       # Count how many times each word appears in all spam messsages, then sum
134 |       map_int(spam_messages, function(sm) {
135 |         (str_split(sm, " ")[[1]] == w) %>% sum # for a single message
136 |       }) %>% 
137 |         sum # then summing over all messages
138 |       
139 |     })
140 |   )
141 | 
142 | # There are many words in the ham vocabulary so this will take a while!
143 | # Run this code and distract yourself while the counts are calculated
144 | ham_counts <- tibble(
145 |   word = ham_vocab
146 | ) %>% 
147 |   mutate(
148 |     # Calculate the number of times a word appears in ham
149 |     ham_count = map_int(word, function(w) {
150 |       
151 |       # Count how many times each word appears in all ham messsages, then sum
152 |       map_int(ham_messages, function(hm) {
153 |         (str_split(hm, " ")[[1]] == w) %>% sum 
154 |       }) %>% 
155 |         sum
156 |       
157 |     })
158 |   )
159 | 
160 | # Join these tibbles together
161 | word_counts <- full_join(spam_counts, ham_counts, by = "word") %>% 
162 |   mutate(
163 |     # Fill in zeroes where there are missing values
164 |     spam_count = ifelse(is.na(spam_count), 0, spam_count),
165 |     ham_count = ifelse(is.na(ham_count), 0, ham_count)
166 |   )
167 | ```
168 | 
169 | 
170 | # Classifying New Messages
171 | 
172 | ```{r}
173 | # This is the updated function using the vectorized approach to calculating
174 | # the spam and ham probabilities
175 | 
176 | # Create a function that makes it easy to classify a tibble of messages
177 | # we add an alpha argument to make it easy to recalculate probabilities 
178 | # based on this alpha (default to 1)
179 | classify <- function(message, alpha = 1) {
180 |   
181 |   # Splitting and cleaning the new message
182 |   # This is the same cleaning procedure used on the training messages
183 |   clean_message <- str_to_lower(message) %>% 
184 |     str_squish %>% 
185 |       str_replace_all("[[:punct:]]", "") %>% 
186 |       str_replace_all("[\u0094\u0092\u0096\n\t]", "") %>% # Unicode characters
187 |       str_replace_all("[[:digit:]]", "")
188 |   
189 |   words <- str_split(clean_message, " ")[[1]]
190 |   
191 |   # There is a possibility that there will be words that don't appear
192 |   # in the training vocabulary, so this must be accounted for
193 |   
194 |   # Find the words that aren't present in the training
195 |   new_words <- setdiff(vocabulary, words)
196 |   
197 |   # Add them to the word_counts 
198 |   new_word_probs <- tibble(
199 |     word = new_words,
200 |     spam_prob = 1,
201 |     ham_prob = 1
202 |   )
203 | 
204 |   # Filter down the probabilities to the words present 
205 |   # use group by to multiply everything together
206 |   present_probs <- word_counts %>% 
207 |     filter(word %in% words) %>% 
208 |     mutate(
209 |       # Calculate the probabilities from the counts
210 |       spam_prob = (spam_count + alpha) / (n_spam + alpha * n_vocabulary),
211 |       ham_prob = (ham_count + alpha) / (n_ham + alpha * n_vocabulary)
212 |     ) %>% 
213 |     bind_rows(new_word_probs) %>% 
214 |     pivot_longer(
215 |       cols = c("spam_prob", "ham_prob"),
216 |       names_to = "label",
217 |       values_to = "prob"
218 |     ) %>% 
219 |     group_by(label) %>% 
220 |     summarize(
221 |       wi_prob = prod(prob) # prod is like sum, but with multiplication
222 |     )
223 |  
224 |   # Calculate the conditional probabilities
225 |   p_spam_given_message <- p_spam * (present_probs %>% filter(label == "spam_prob") %>% pull(wi_prob))
226 |   p_ham_given_message <- p_ham * (present_probs %>% filter(label == "ham_prob") %>% pull(wi_prob))
227 |   
228 |   # Classify the message based on the probability
229 |   ifelse(p_spam_given_message >= p_ham_given_message, "spam", "ham")
230 | }
231 | 
232 | # Use the classify function to classify the messages in the training set
233 | # This takes advantage of vectorization
234 | final_train <- tidy_train %>% 
235 |   mutate(
236 |     prediction = map_chr(sms, function(m) { classify(m) })
237 |   ) 
238 | ```
239 | 
240 | # Calculating Accuracy
241 | 
242 | ```{r}
243 | # Results of classification on training
244 | confusion <- table(final_train$label, final_train$prediction)
245 | accuracy <- (confusion[1,1] + confusion[2,2]) / nrow(final_train)
246 | ```
247 | 
248 | 
249 | The Naive Bayes Classifier achieves an accuracy of about 89%. Pretty good! Let's see how well it works on messages that it has never seen before.
250 | 
251 | # Hyperparameter Tuning
252 | 
253 | ```{r}
254 | alpha_grid <- seq(0.05, 1, by = 0.05)
255 | cv_accuracy <- NULL
256 | 
257 | for (alpha in alpha_grid) {
258 |   
259 |   # Recalculate probabilities based on new alpha
260 |   cv_probs <- word_counts %>% 
261 |     mutate(
262 |       # Calculate the probabilities from the counts based on new alpha
263 |       spam_prob = (spam_count + alpha / (n_spam + alpha * n_vocabulary)),
264 |       ham_prob = (ham_count + alpha) / (n_ham + alpha * n_vocabulary)
265 |     )
266 |   
267 |   # Predict the classification of each message in cross validation
268 |   cv <- spam_cv %>% 
269 |     mutate(
270 |       prediction = map_chr(sms, function(m) { classify(m, alpha = alpha) })
271 |     ) 
272 |   
273 |   # Assess the accuracy of the classifier on cross-validation set
274 |   confusion <- table(cv$label, cv$prediction)
275 |   acc <- (confusion[1,1] + confusion[2,2]) / nrow(cv)
276 |   cv_accuracy <- c(cv_accuracy, acc)
277 | }
278 | 
279 | # Check out what the best alpha value is
280 | tibble(
281 |   alpha = alpha_grid,
282 |   accuracy = cv_accuracy
283 | )
284 | ```
285 | 
286 | Judging from the cross-validation set, higher $\alpha$ values cause the accuracy to decrease. We'll go with $\alpha = 0.1$ since it produces the highest cross-validation prediction accuracy.
287 | 
288 | # Test Set Performance
289 | 
290 | ```{r}
291 | # Reestablishing the proper parameters
292 | optimal_alpha <- 0.1
293 | 
294 | # Using optimal alpha with training parameters, perform final predictions
295 | spam_test <- spam_test %>% 
296 |   mutate(
297 |     prediction = map_chr(sms, function(m) { classify(m, alpha = optimal_alpha)} )
298 |     )
299 |   
300 | confusion <- table(spam_test$label, spam_test$prediction)
301 | test_accuracy <- (confusion[1,1] + confusion[2,2]) / nrow(spam_test)
302 | test_accuracy
303 | ```
304 | 
305 | We've achieved an accuracy of 93% in the test set. Not bad!


--------------------------------------------------------------------------------
/Mission487Solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'Predicting Car Prices: Guided Project Solutions'
  3 | output: html_document
  4 | ---
  5 | 
  6 | # Introduction to the data
  7 | 
  8 | ```{r, message = FALSE, warning = FALSE }
  9 | library(readr)
 10 | library(tidyr)
 11 | library(dplyr)
 12 | cars <- read.csv("./data/imports-85.data")
 13 | 
 14 | # Fixing the column names since the .data file reads headers incorrectly
 15 | colnames(cars) <- c(
 16 |   "symboling",
 17 |   "normalized_losses",
 18 |   "make",
 19 |   "fuel_type",
 20 |   "aspiration",
 21 |   "num_doors",
 22 |   "body_style",
 23 |   "drive_wheels",
 24 |   "engine_location",
 25 |   "wheel_base",
 26 |   "length",
 27 |   "width",
 28 |   "height",
 29 |   "curb_weight",
 30 |   "engine_type",
 31 |   "num_cylinders",
 32 |   "engine_size",
 33 |   "fuel_system",
 34 |   "bore",
 35 |   "stroke",
 36 |   "compression_ratio",
 37 |   "horsepower",
 38 |   "peak_rpm",
 39 |   "city_mpg",
 40 |   "highway_mpg",
 41 |   "price"
 42 | )
 43 | 
 44 | # Removing non-numerical columns and removing missing data
 45 | cars <- cars %>% 
 46 |   select(
 47 |     symboling, wheel_base, length, width, height, curb_weight,
 48 |     engine_size, bore, stroke, compression_ratio, horsepower, 
 49 |     peak_rpm, city_mpg, highway_mpg, price
 50 |   ) %>% 
 51 |   filter(
 52 |     stroke != "?",
 53 |     bore != "?",
 54 |     horsepower != "?",
 55 |     peak_rpm != "?",
 56 |     price != "?"
 57 |   ) %>% 
 58 |   mutate(
 59 |     stroke = as.numeric(stroke),
 60 |     bore = as.numeric(bore),
 61 |     horsepower = as.numeric(horsepower),
 62 |     peak_rpm = as.numeric(peak_rpm),
 63 |     price = as.numeric(price)
 64 |   )
 65 | 
 66 | # Confirming that each of the columns are numeric
 67 | library(purrr)
 68 | map(cars, typeof)
 69 | ```
 70 | 
 71 | # Examining Relationships Between Predictors
 72 | 
 73 | ```{r}
 74 | library(caret)
 75 | featurePlot(cars, cars$price)
 76 | ```
 77 | 
 78 | There looks to be a somewhat positive relationship between horsepower and price. City MPG and highway MPG look positive too, but there's a curious grouping that looks like it pops up. Many features look like they plateau in terms of price (ie even as we increase, price does not increase). Height seems not to have any meaningful relationship with price since the dots look like an evenly scattered plot.
 79 | 
 80 | ```{r}
 81 | library(ggplot2)
 82 | ggplot(cars, aes(x = price)) +
 83 |   geom_histogram(color = "red") +
 84 |   labs(
 85 |     title = "Distribution of prices in cars dataset",
 86 |     x = "Price",
 87 |     y = "Frequency"
 88 |   )
 89 | ```
 90 | 
 91 | It looks like there's a reasonably even distirbution of the prices in the dataset, so there are no outliers.  There are 2 cars whose price is zero, so this might be suspect. This only represents 1% of the entire dataset, so it shouldn't have too much impact on predictions, especially if we use a high number of neighbors.
 92 | 
 93 | # Setting up the train-test split
 94 | 
 95 | ```{r}
 96 | library(caret)
 97 | split_indices <- createDataPartition(cars$price, p = 0.8,  list = FALSE)
 98 | train_cars <- cars[split_indices,]
 99 | test_cars <- cars[-split_indices,]
100 | ```
101 | 
102 | 
103 | # Cross-validation and hyperparameter optimization
104 | 
105 | ```{r}
106 | # 5-fold cross-validation 
107 | five_fold_control <- trainControl(method = "cv", number = 5)
108 | 
109 | tuning_grid <- expand.grid(k = 1:20)
110 | ```
111 | 
112 | # Choosing a model
113 | 
114 | ```{r}
115 | # Creating a model based on all the features
116 | full_model <- train(price ~ .,
117 |                     data = train_cars,
118 |                     method = "knn",
119 |                     trControl = five_fold_control,
120 |                     tuneGrid = tuning_grid,
121 |                     preProcess = c("center", "scale"))
122 | ```
123 | 
124 | # Final model evaluation
125 | 
126 | ```{r}
127 | predictions <- predict(full_model, newdata = test_cars)
128 | postResample(pred = predictions, obs = test_cars$price)
129 | ```
130 | 
131 | 


--------------------------------------------------------------------------------
/Mission498Solutions.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Guided Project Solutions: Creating An Efficient Data Analysis Workflow"
 3 | output: html_document
 4 | ---
 5 | 
 6 | ```{r}
 7 | library(tidyverse)
 8 | reviews <- read_csv("book_reviews.csv")
 9 | ```
10 | 
11 | 
12 | # Getting Familiar With The Data
13 | 
14 | ```{r}
15 | # How big is the dataset?
16 | dim(reviews)
17 | 
18 | # What are the column names?
19 | colnames(reviews)
20 | 
21 | # What are the column types?
22 | for (c in colnames(reviews)) {
23 |   print(typeof(reviews[[c]]))
24 | }
25 | ```
26 | 
27 | ```{r}
28 | # What are the unique values in each column?
29 | for (c in colnames(reviews)) {
30 |   print("Unique values in the column:")
31 |   print(c)
32 |   print(unique(reviews[[c]]))
33 |   print("")
34 | }
35 | ```
36 | 
37 | All of the columns seem to contain strings. The `reviews` column represents what the score that the reviewer gave the book. The `book` column indicates which particular textbook was purchased. The `state` column represents the state where the book was purchased. The `price` column represents the price that the book was purchased for.
38 | 
39 | # Handling Missing Data
40 | 
41 | From the previous exercise, it's apparent that that the `review` column contains some `NA` values. We don't want any missing values in the dataset, so we need to get rid of them.
42 | 
43 | ```{r}
44 | complete_reviews = reviews %>% 
45 |   filter(!is.na(review))
46 | 
47 | dim(complete_reviews)
48 | ```
49 | 
50 | There were about 200 reviews that were removed from the dataset. This is about 10% of the original dataset. This isn't too big of an amount, so we would feel comfortable continuing with our analysis.
51 | 
52 | # Dealing With Inconsistent Labels
53 | 
54 | We'll use the shortened postal codes instead since they're shorter.
55 | 
56 | ```{r}
57 | complete_reviews <- complete_reviews %>% 
58 |   mutate(
59 |     state = case_when(
60 |       state == "California" ~ "CA",
61 |       state == "New York" ~ "NY",
62 |       state == "Texas" ~ "TX",
63 |       state == "Florida" ~ "FL",
64 |       TRUE ~ state # ignore cases where it's already postal code
65 |     )
66 |   )
67 | ```
68 | 
69 | # Transforming the Review Data
70 | 
71 | ```{r}
72 | complete_reviews <- complete_reviews %>% 
73 |   mutate(
74 |     review_num = case_when(
75 |       review == "Poor" ~ 1,
76 |       review == "Fair" ~ 2,
77 |       review == "Good" ~ 3,
78 |       review == "Great" ~ 4,
79 |       review == "Excellent" ~ 5
80 |     ),
81 |     is_high_review = if_else(review_num >= 4, TRUE, FALSE)
82 |   )
83 | ```
84 | 
85 | # Analyzing The Data
86 | 
87 | We'll define most profitable book in terms of how many books there was sold. 
88 | 
89 | ```{r}
90 | complete_reviews %>% 
91 |   group_by(book) %>% 
92 |   summarize(
93 |     purchased = n()
94 |   ) %>% 
95 |   arrange(-purchased)
96 | ```
97 | 
98 | The books are relatively well matched in terms of purchasing, but "Fundamentals of R For Beginners" has a slight edge over everyone else. 
99 | 


--------------------------------------------------------------------------------
/Mission505Solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'Data Structures in R: Guided Project Solutions'
  3 | author: "Dataquest"
  4 | date: "6/6/2020"
  5 | output: html_document
  6 | ---
  7 | 
  8 | 
  9 | # Understanding the Data
 10 | ## Loading the dataset from the `covid19.csv` CSV file and quick exploration
 11 | ```{r}
 12 | library(readr)
 13 | 
 14 | # Loading the dataset
 15 | covid_df <- read_csv("covid19.csv")
 16 | ```
 17 | 
 18 | ```{r}
 19 | # Displaing the dimension of the data: 
 20 | dim(covid_df)
 21 | 
 22 | # Storing the column names in a variable
 23 | vector_cols <- colnames(covid_df)
 24 | 
 25 | # Displaing the variable vector_cols
 26 | vector_cols
 27 | 
 28 | # Showing the first few rows of the dataset
 29 | head(covid_df)
 30 | 
 31 | # Showing a global view of the dataset.
 32 | library(tibble)
 33 | 
 34 | glimpse(covid_df)
 35 | 
 36 | ```
 37 | 
 38 | The dataset contains `14` columns and `10,903` rows. This database provides information on the numbers (per day and cumulatively) of COVID-19 positive cases, deaths, tests performed and hospitalizations for each country through the column's names store in the variable `vector_cols`. 
 39 | 
 40 | 1. This variable contains a character vector.
 41 | 
 42 | 2. The use of the function `glimpse()` is the very first operation to do because we don't only learn about the dimensions of the database but also about the names of the first columns and their types and content. It can replace the three previous operations: `dim()`, `colnames()`, and `head()`.
 43 | 
 44 | # Isolating the Rows We Need
 45 | - Selecting only the rows related to `"All States"` and removing the `Province_State`.
 46 | 
 47 | ```{r}
 48 | library(dplyr)
 49 | 
 50 | # Filter the "All States" Province states and remove the `Province_State` column
 51 | covid_df_all_states <- covid_df %>% 
 52 |   filter(Province_State == "All States") %>% 
 53 |   select(-Province_State)
 54 | 
 55 | 
 56 | ```
 57 | - We can remove `Province_State` without loosing information because after the filtering step this column only contains the value `"All States"`.
 58 | 
 59 | # Isolating the Columns We Need
 60 | - Creating a dataset for the daily columns from `covid_df_all_states` dataframe
 61 | 
 62 | Let's recall the description of the dataset's columns.
 63 | 
 64 | 1. `Date`: Date
 65 | 2. `Continent_Name`: Continent names
 66 | 3. `Two_Letter_Country_Code`: Country codes
 67 | 4. `Country_Region`: Country names
 68 | 5. `Province_State`: States/province names; value is `All States` when state/provincial level data is not available
 69 | 6. `positive`: Cumulative number of positive cases reported.
 70 | 7. `active`: Number of actively cases on that **day**.
 71 | 8. `hospitalized`: Cumulative number of hospitalized cases reported.
 72 | 9. `hospitalizedCurr`: Number of actively hospitalized cases on that **day**.
 73 | 10. `recovered`: Cumulative number of recovered cases reported.
 74 | 11. `death`: Cumulative number of deaths reported.
 75 | 12. `total_tested`: Cumulative number of tests conducted.
 76 | 13. `daily_tested`: Number of tests conducted on the **day**; if daily data is unavailable, daily tested is averaged across number of days in between.
 77 | 14. `daily_positive`: Number of positive cases reported on the **day**; if daily data is unavailable, daily positive is averaged across number of days in.
 78 | 
 79 | 
 80 | ```{r}
 81 | # Selecting the columns with cumulative numbers
 82 | covid_df_all_states_daily <- covid_df_all_states %>% 
 83 |   select(Date, Country_Region, active, hospitalizedCurr, daily_tested, daily_positive)
 84 | 
 85 | head(covid_df_all_states_daily)
 86 | ```
 87 | 
 88 | 
 89 | # Extracting the Top Ten countries in the number of tested cases
 90 | 
 91 | ## Summarizing the data based on the `Country_Region` column.
 92 | ```{r}
 93 | covid_df_all_states_daily_sum <- covid_df_all_states_daily %>% 
 94 |   group_by(Country_Region) %>% 
 95 |   summarise(tested = sum(daily_tested), 
 96 |             positive = sum(daily_positive),
 97 |             active = sum(active),
 98 |             hospitalized = sum(hospitalizedCurr)) %>% 
 99 |   arrange(desc(tested)) #this is equivalent to `arrange(-tested)`
100 | 
101 | covid_df_all_states_daily_sum
102 | 
103 | #Date, Country_Region, active, hospitalizedCurr, daily_tested, daily_positive
104 | ```
105 | 
106 | ## Taking the top 10 
107 | ```{r}
108 | covid_top_10 <- head(covid_df_all_states_daily_sum, 10)
109 | 
110 | covid_top_10
111 | ```
112 | 
113 | 
114 | # Identifying the Highest Positive Against Tested Cases
115 | 
116 | ## Getting vectors
117 | ```{r}
118 | countries <- covid_top_10$Country_Region
119 | tested_cases <- covid_top_10$tested
120 | positive_cases <- covid_top_10$positive
121 | active_cases <- covid_top_10$active
122 | hospitalized_cases <- covid_top_10$hospitalized
123 | ```
124 | 
125 | ## Naming vectors
126 | ```{r}
127 | names(positive_cases) <- countries
128 | names(tested_cases) <- countries
129 | names(active_cases) <- countries
130 | names(hospitalized_cases) <- countries
131 | ```
132 | 
133 | ## Identifying 
134 | ```{r}
135 | 
136 | positive_cases
137 | sum(positive_cases)
138 | mean(positive_cases)
139 | positive_cases/sum(positive_cases)
140 | ```
141 | 
142 | ```{r}
143 | positive_cases/tested_cases
144 | ```
145 | 
146 | ## Conclusion
147 | ```{r}
148 | positive_tested_top_3 <- c("United Kingdom" = 0.11, "United States" = 0.10, "Turkey" = 0.08)
149 | ```
150 | 
151 | 
152 | # Keeping relevant information
153 | 
154 | ```{r}
155 | # Creating vectors
156 | united_kingdom <- c(0.11, 1473672, 166909, 0, 0)
157 | united_states <- c(0.10, 17282363, 1877179, 0, 0)
158 | turkey <- c(0.08, 2031192, 163941, 2980960, 0)
159 | 
160 | # Creating the matrix covid_mat
161 | covid_mat <- rbind(united_kingdom, united_states, turkey)
162 | 
163 | # Naming columns
164 | colnames(covid_mat) <- c("Ratio", "tested", "positive", "active", "hospitalized")
165 | 
166 | #d Displaying the matrix
167 | covid_mat
168 | ```
169 | 
170 | # Putting all together
171 | ```{r}
172 | 
173 | question <- "Which countries have had the highest number of positive cases against the number of tests?"
174 | 
175 | answer <- c("Positive tested cases" = positive_tested_top_3)
176 | 
177 | datasets <- list(
178 |   original = covid_df,
179 |   allstates = covid_df_all_states,
180 |   daily = covid_df_all_states_daily,
181 |   top_10 = covid_top_10
182 | )
183 | 
184 | matrices <- list(covid_mat)
185 | vectors <- list(vector_cols, countries)
186 | 
187 | data_structure_list <- list("dataframe" = datasets, "matrix" = matrices, "vector" = vectors)
188 | 
189 | covid_analysis_list <- list(question, answer, data_structure_list)
190 | 
191 | covid_analysis_list[[2]]
192 | ```
193 | 


--------------------------------------------------------------------------------
/Mission516Solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Guided Project: Creating An Efficient Data Analysis Workflow, Part 2"
  3 | output: html_document
  4 | ---
  5 | 
  6 | ```{r}
  7 | library(tidyverse)
  8 | library(lubridate)
  9 | 
 10 | sales <- read_csv("sales2019.csv")
 11 | ```
 12 | 
 13 | # Data Exploration
 14 | 
 15 | ```{r}
 16 | # How big is the dataset?
 17 | dim(sales)
 18 | ```
 19 | 
 20 | ```{r}
 21 | # What are the column names?
 22 | colnames(sales)
 23 | ```
 24 | 
 25 | The `date` column shows the date that the order of books was made. This will help us distinguish between orders that were made before and after the new program was implemented. `quantity` describes how many books were made, and `user_submitted_review` looks like it's a hand typed review of the books themselves. `customer_type` indicates whether or not the customer was an individual or a business. It seems that the company has started selling in bulk to other business too.
 26 | 
 27 | ```{r}
 28 | # What are the types of all the columns?
 29 | for (col in colnames(sales)) {
 30 |   paste0(col, " : ", typeof(sales[[col]])) %>% print
 31 | }
 32 | ```
 33 | 
 34 | 
 35 | ```{r}
 36 | # Is there missing data anywhere?
 37 | for (col in colnames(sales)) {
 38 |   paste0(col, 
 39 |          ", number of missing data rows: ", 
 40 |          is.na(sales[[col]]) %>% sum) %>% print
 41 | }
 42 | ```
 43 | 
 44 | The `user_submitted_review` column has some missing data in it. We'll have to handle this later in the data cleaning, but at least we know about it ahead of time. The `total_purchased` column also has missing data, which we'll handle with imputation.
 45 | 
 46 | # Handling Missing Data
 47 | 
 48 | ```{r}
 49 | # Remove the rows with no user_submitted_review
 50 | complete_sales <- sales %>% 
 51 |   filter(
 52 |     !is.na(user_submitted_review)
 53 |   )
 54 | 
 55 | # Calculate the mean of the total_purchased column, without the missing values
 56 | purchase_mean <- complete_sales %>% 
 57 |   filter(!is.na(total_purchased)) %>% 
 58 |   pull(total_purchased) %>% 
 59 |   mean
 60 | 
 61 | # Assign this mean to all of the rows where total_purchased was NA
 62 | complete_sales <- complete_sales %>% 
 63 |   mutate(
 64 |     imputed_purchases = if_else(is.na(total_purchased), 
 65 |                                 purchase_mean,
 66 |                                 total_purchased)
 67 |   )
 68 | ```
 69 | 
 70 | # Processing Review Data
 71 | 
 72 | ```{r}
 73 | complete_sales %>% pull(user_submitted_review) %>% unique
 74 | ```
 75 | 
 76 | The reviews range from outright hate ("Hated it") to positive ("Awesome!"). We'll create a function that uses a `case_when()` function to produce the output. `case_when()` functions can be incredibly bulky in cases where there's many options, but housing it in a function to `map` can make our code cleaner.
 77 | 
 78 | ```{r}
 79 | is_positive <- function(review) {
 80 |   review_positive = case_when(
 81 |   str_detect(review, "Awesome") ~ TRUE,
 82 |   str_detect(review, "OK") ~ TRUE,
 83 |   str_detect(review, "Never") ~ TRUE,
 84 |   str_detect(review, "a lot") ~ TRUE,
 85 |   TRUE ~ FALSE # The review did not contain any of the above phrases
 86 |   )
 87 | }
 88 | 
 89 | complete_sales <- complete_sales %>% 
 90 |   mutate(
 91 |     is_positive = unlist(map(user_submitted_review, is_positive))
 92 |   )
 93 | ```
 94 | 
 95 | # Comparing Book Sales Between Pre- and Post-Program Sales
 96 | 
 97 | ```{r}
 98 | complete_sales <- complete_sales %>% 
 99 |   mutate(
100 |     date_status = if_else(mdy(date) < ymd("2019/07/01"), "Pre", "Post")
101 |   )
102 | 
103 | complete_sales %>% 
104 |   group_by(date_status) %>% 
105 |   summarize(
106 |     books_purchased = sum(imputed_purchases)
107 |   )
108 | ```
109 | 
110 | It doesn't seem that the program has increased sales. Maybe there were certain books that increased in sales?
111 | 
112 | ```{r}
113 | complete_sales %>% 
114 |   group_by(date_status, title) %>% 
115 |   summarize(
116 |     books_purchased = sum(imputed_purchases)
117 |   ) %>% 
118 |   arrange(title, date_status)
119 | ```
120 | 
121 | It turns out that certain books actually got more popular after the program started! R For Dummies and Secrets of R For Advanced Students got more popular.
122 | 
123 | # Comparing Book Sales Within Customer Type
124 | 
125 | ```{r}
126 | complete_sales %>% 
127 |   group_by(date_status, customer_type) %>% 
128 |   summarize(
129 |     books_purchased = sum(imputed_purchases)
130 |   ) %>% 
131 |   arrange(customer_type, date_status)
132 | ```
133 | 
134 | Baserd on the table, it looks like businesses started purchasing more books after the program! There was actually a drop in individual sales.
135 | 
136 | # Comparing Review Sentiment Between Pre- and Post-Program Sales
137 | 
138 | ```{r}
139 | complete_sales %>% 
140 |   group_by(date_status) %>% 
141 |   summarize(
142 |     num_positive_reviews = sum(is_positive)
143 |   )
144 | ```
145 | 
146 | There's slightly more reviews before the program, but this difference seems negigible.
147 | 
148 | 


--------------------------------------------------------------------------------
/Mission518Solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'Data Structures in R: Guided Project Solutions'
  3 | author: "Dataquest"
  4 | date: "6/6/2020"
  5 | output: html_document
  6 | ---
  7 | 
  8 | 
  9 | # Understanding the Data
 10 | ## Loading the dataset from the `covid19.csv` CSV file and quick exploration
 11 | ```{r}
 12 | library(readr)
 13 | 
 14 | # Loading the dataset
 15 | covid_df <- read_csv("covid19.csv")
 16 | ```
 17 | 
 18 | ```{r}
 19 | # Displaing the dimension of the data: 
 20 | dim(covid_df)
 21 | 
 22 | # Storing the column names in a variable
 23 | vector_cols <- colnames(covid_df)
 24 | 
 25 | # Displaing the variable vector_cols
 26 | vector_cols
 27 | 
 28 | # Showing the first few rows of the dataset
 29 | head(covid_df)
 30 | 
 31 | # Showing a global view of the dataset.
 32 | library(tibble)
 33 | 
 34 | glimpse(covid_df)
 35 | 
 36 | ```
 37 | 
 38 | The dataset contains `14` columns and `10,903` rows. This database provides information on the numbers (per day and cumulatively) of COVID-19 positive cases, deaths, tests performed and hospitalizations for each country through the column's names store in the variable `vector_cols`. 
 39 | 
 40 | 1. This variable contains a character vector.
 41 | 
 42 | 2. The use of the function `glimpse()` is the very first operation to do because we don't only learn about the dimensions of the database but also about the names of the first columns and their types and content. It can replace the three previous operations: `dim()`, `colnames()`, and `head()`.
 43 | 
 44 | # Isolating the Data We Need
 45 | 
 46 | ## Selecting only the rows related to `"All States"` and removing the `Province_State`.
 47 | 
 48 | ```{r}
 49 | library(dplyr)
 50 | 
 51 | # Filter the "All States" Province states and remove the `Province_State` column
 52 | covid_df_all_states <- covid_df %>% 
 53 |   filter(Province_State == "All States") %>% 
 54 |   select(-Province_State)
 55 | 
 56 | 
 57 | ```
 58 | 
 59 | ## Creating a dataset for the cumulative columns and another for the daily columns from `covid_df_all_states` dataframe
 60 | 
 61 | Let's recall the description of the dataset's columns.
 62 | 
 63 | 1. `Date`: Date
 64 | 2. `Continent_Name`: Continent names
 65 | 3. `Two_Letter_Country_Code`: Country codes
 66 | 4. `Country_Region`: Country names
 67 | 5. `Province_State`: States/province names; value is `All States` when state/provincial level data is not available
 68 | 6. `positive`: Cumulative number of positive cases reported.
 69 | 7. `active`: Number of actively cases on that **day**.
 70 | 8. `hospitalized`: Cumulative number of hospitalized cases reported.
 71 | 9. `hospitalizedCurr`: Number of actively hospitalized cases on that **day**.
 72 | 10. `recovered`: Cumulative number of recovered cases reported.
 73 | 11. `death`: Cumulative number of deaths reported.
 74 | 12. `total_tested`: Cumulative number of tests conducted.
 75 | 13. `daily_tested`: Number of tests conducted on the **day**; if daily data is unavailable, daily tested is averaged across number of days in between.
 76 | 14. `daily_positive`: Number of positive cases reported on the **day**; if daily data is unavailable, daily positive is averaged across number of days in.
 77 | 
 78 | 
 79 | ```{r}
 80 | 
 81 | # Selecting the columns with cumulative numbers
 82 | covid_df_all_states_cumulative <- covid_df_all_states %>% 
 83 |   select(Date, Continent_Name, Two_Letter_Country_Code, positive, hospitalized, recovered, death, total_tested)
 84 | 
 85 | # Selecting the columns with cumulative numbers
 86 | covid_df_all_states_daily <- covid_df_all_states %>% 
 87 |   select(Date, Country_Region, active, hospitalizedCurr, daily_tested, daily_positive)
 88 | 
 89 | ##print(xtable::xtable(head(covid_df_all_states_daily)), type = "html")
 90 | ```
 91 | 
 92 | 
 93 | 1. We can remove `Province_State` without loosing information because after the filtering step this column only contains the value `"All States"`.
 94 | 
 95 | # Identifying the Highest Fatality Rates Countries
 96 | 
 97 | ## Summarizing the data based on the `Continent_Name` and `Two_Letter_Country_Code` columns.
 98 | ```{r}
 99 | covid_df_all_states_cumulative_max <- covid_df_all_states_cumulative %>% 
100 |   group_by(Continent_Name, Two_Letter_Country_Code) %>% 
101 |   summarise(max = max(death)) %>% 
102 |   filter(max > 0)
103 | 
104 | covid_df_all_states_cumulative_max
105 | 
106 | ```
107 | 
108 | ## Displaying the maximum number of death by country, colored by continent
109 | 
110 | ```{r}
111 | library(ggplot2)
112 | 
113 | gglot(data = covid_df_all_states_cumulative_max, 
114 |   aes(x = Two_Letter_Country_Code,
115 |   y = max,
116 |   col = Continent_Name)) + 
117 |   geom_point()
118 | ```
119 | 
120 | ## Conclusion: Answering the question: Which countries have had the highest fatality (mortality) rates?
121 | ```{r}
122 | death_top_3 <- c("US", "IT", "GB")
123 | ```
124 | 
125 | 
126 | # Extracting the Top Ten countries in the number of tested cases
127 | 
128 | ## Summarizing the data based on the `Country_Region` column.
129 | ```{r}
130 | covid_df_all_states_daily_sum <- covid_df_all_states_daily %>% 
131 |   group_by(Country_Region) %>% 
132 |   summarise(tested = sum(daily_tested), 
133 |             positive = sum(daily_positive),
134 |             active = sum(active),
135 |             hospitalized = sum(hospitalizedCurr)) %>% 
136 |   arrange(desc(tested)) #this is equivalent to `arrange(-tested)`
137 | 
138 | covid_df_all_states_daily_sum
139 | 
140 | #Date, Country_Region, active, hospitalizedCurr, daily_tested, daily_positive
141 | ```
142 | 
143 | ## Taking the top 10 
144 | ```{r}
145 | covid_top_10 <- head(covid_df_all_states_daily_sum, 10)
146 | 
147 | #print(xtable::xtable(covid_top_10), type = "html")
148 | ```
149 | 
150 | 
151 | # Identifying the Highest Positive Against Tested Cases
152 | 
153 | ## Getting vectors
154 | ```{r}
155 | countries <- covid_top_10$Country_Region
156 | tested_cases <- covid_top_10$tested
157 | positive_cases <- covid_top_10$positive
158 | active_cases <- covid_top_10$active
159 | hospitalized_cases <- covid_top_10$hospitalized
160 | ```
161 | 
162 | ## Naming vectors
163 | ```{r}
164 | names(positive_cases) <- countries
165 | names(tested_cases) <- countries
166 | names(active_cases) <- countries
167 | names(hospitalized_cases) <- countries
168 | ```
169 | 
170 | ## Identifying 
171 | ```{r}
172 | 
173 | positive_cases
174 | sum(positive_cases)
175 | mean(positive_cases)
176 | positive_cases/sum(positive_cases)
177 | ```
178 | 
179 | ```{r}
180 | positive_cases/tested_cases
181 | ```
182 | 
183 | ## Conclusion
184 | ```{r}
185 | positive_tested_top_3 <- c("United Kingdom" = 0.11, "United States" = 0.10, "Turkey" = 0.08)
186 | ```
187 | 
188 | 
189 | # Identifying Affected Countries Related to their Population
190 | 
191 | ```{r}
192 | # Creating the matrix covid_mat
193 | covid_mat <- cbind(tested_cases, positive_cases, active_cases, hospitalized_cases)
194 | 
195 | # Creating the population vector https://www.worldometers.info/world-population/population-by-country/
196 | population <- c(331002651, 145934462, 60461826, 1380004385, 84339067, 37742154, 67886011, 25499884, 32971854, 37846611)
197 | 
198 | # Dividing the matrix by the population vector
199 | covid_mat <- covid_mat * 100/population
200 | 
201 | covid_mat
202 | ```
203 | 
204 | ## Ranking the matrix
205 | 
206 | ```{r}
207 | tested_cases_rank <- rank(covid_mat[,"tested_cases"])
208 | positive_cases_rank <- rank(covid_mat[,"positive_cases"])
209 | active_cases_rank <- rank(covid_mat[,"active_cases"])
210 | hospitalized_cases_rank <- rank(covid_mat[,"hospitalized_cases"])
211 | 
212 | covid_mat_rank <- rbind(tested_cases_rank, positive_cases_rank, active_cases_rank, hospitalized_cases_rank)
213 | 
214 | covid_mat_rank
215 | 
216 | covid_mat_rank[1,]
217 | 
218 | covid_mat_rank[-1, ]
219 | 
220 | colSums(covid_mat_rank[-1, ])
221 | ```
222 | 
223 | ## Conclusion
224 | ```{r}
225 | best_effort_tested_cased_top_3 <- c("India", "United Kingdom", "Turkey")
226 | 
227 | most_affected_country <- "Italy"
228 | 
229 | least_affected_country <- "India"
230 | ```
231 | 
232 | # Putting all together
233 | ```{r}
234 | 
235 | question_list <- list(
236 |   "Which countries have had the highest fatality (mortality) rates?",
237 |   "Which countries have had the highest number of positive cases against the number of tests?",
238 |   "Which countries have made the best effort in terms of the number of tests conducted related to their population?",
239 |   "Which countries were ultimately the most and least affected related to their population?"
240 | )
241 | 
242 | answer_list <- list(
243 |   "Death" = death_top_3,
244 |   "Positive tested cases" = positive_tested_top_3,
245 |   "The Best effort in test related to the population" = best_effort_tested_cased_top_3,
246 |   "The most affected country related to its population" = most_affected_country,
247 |   "The least affected country related to its population" = least_affected_country
248 | )
249 | 
250 | answer_list
251 | 
252 | datasets <- list(
253 |   original = covid_df,
254 |   allstates = covid_df_all_states,
255 |   cumulative = covid_df_all_states_cumulative,
256 |   daily = covid_df_all_states_daily
257 | )
258 | matrices <- list(covid_mat, covid_mat_rank)
259 | vectors <- list(vector_cols, population, countries)
260 | 
261 | data_structure_list <- list("data frame" = datasets, "matrix" = matrices, "vector" = vectors)
262 | 
263 | covid_analysis_list <- list(question_list, answer_list, data_structure_list)
264 | 
265 | ```
266 | 


--------------------------------------------------------------------------------
/Mission564Solutions.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Importing the LinkedList and Stack"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {},
 14 |    "outputs": [],
 15 |    "source": [
 16 |     "from linked_list import LinkedList\n",
 17 |     "\n",
 18 |     "class Stack(LinkedList):\n",
 19 |     "    \n",
 20 |     "    def push(self, data):\n",
 21 |     "        self.append(data)\n",
 22 |     "\n",
 23 |     "    def peek(self):\n",
 24 |     "        return self.tail.data\n",
 25 |     "\n",
 26 |     "    def pop(self):\n",
 27 |     "        ret = self.tail.data\n",
 28 |     "        if self.length == 1:\n",
 29 |     "            self.tail = self.head = None\n",
 30 |     "        else:\n",
 31 |     "            self.tail = self.tail.prev\n",
 32 |     "            self.tail.next = None\n",
 33 |     "        self.length -= 1\n",
 34 |     "        return ret"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "# Implementing the tokenize function"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 2,
 47 |    "metadata": {},
 48 |    "outputs": [
 49 |     {
 50 |      "name": "stdout",
 51 |      "output_type": "stream",
 52 |      "text": [
 53 |       "['12', '2', '4', '+', '/', '21', '*']\n"
 54 |      ]
 55 |     }
 56 |    ],
 57 |    "source": [
 58 |     "def tokenize(expression):\n",
 59 |     "    return expression.split()\n",
 60 |     "\n",
 61 |     "print(tokenize(\"12 2 4 + / 21 *\"))"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "markdown",
 66 |    "metadata": {},
 67 |    "source": [
 68 |     "# Functions to process operators in postfix evaluation\n",
 69 |     "\n",
 70 |     "The functions are all the same, the only thing that changes is the operator used to calculate the `result` variable.\n",
 71 |     "\n",
 72 |     "It is very important to perform the operation between the elements that was second to to and the top elements. If we do it the other way around we'll get the wrong result.\n",
 73 |     "\n",
 74 |     "For example, in the `process_minus()` function we do:\n",
 75 |     "\n",
 76 |     "```python\n",
 77 |     "result = second_to_top - top # Correct\n",
 78 |     "```\n",
 79 |     "\n",
 80 |     "and not\n",
 81 |     "\n",
 82 |     "```python\n",
 83 |     "result = top - second_to_top # Wrong\n",
 84 |     "```"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 3,
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "def process_minus(stack):\n",
 94 |     "    top = stack.pop()\n",
 95 |     "    second_to_top = stack.pop()\n",
 96 |     "    result = second_to_top - top\n",
 97 |     "    stack.push(result)\n",
 98 |     "    \n",
 99 |     "def process_plus(stack):\n",
100 |     "    top = stack.pop()\n",
101 |     "    second_to_top = stack.pop()\n",
102 |     "    # Same as process_minus but with + instead of -\n",
103 |     "    result = second_to_top + top\n",
104 |     "    stack.push(result)\n",
105 |     "    \n",
106 |     "def process_times(stack):\n",
107 |     "    top = stack.pop()\n",
108 |     "    second_to_top = stack.pop()\n",
109 |     "    # Same as process_minus but with * instead of -\n",
110 |     "    result = second_to_top * top\n",
111 |     "    stack.push(result)\n",
112 |     "\n",
113 |     "def process_divide(stack):\n",
114 |     "    top = stack.pop()\n",
115 |     "    second_to_top = stack.pop()\n",
116 |     "    # Same as process_minus but with / instead of -\n",
117 |     "    result = second_to_top / top\n",
118 |     "    stack.push(result)\n",
119 |     "    \n",
120 |     "def process_pow(stack):\n",
121 |     "    top = stack.pop()\n",
122 |     "    second_to_top = stack.pop()\n",
123 |     "    # Same as process_minus but with ** instead of -\n",
124 |     "    result = second_to_top ** top\n",
125 |     "    stack.push(result)"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "metadata": {},
131 |    "source": [
132 |     "# Evaluating postfix expressions\n",
133 |     "\n",
134 |     "Here are the steps we need to follow to implement the `evaluate_postfix()` function.\n",
135 |     "\n",
136 |     "1. Initialize an empty stack.\n",
137 |     "2. Tokenize the expression using the `tokenize()` function.\n",
138 |     "3. For each token, do:\n",
139 |     "    1. If the token an operator, call the corresponding function to process it. For example, if we find a `+` we call the `process_plus()` function.\n",
140 |     "    2. Otherwise (the token is a number) and we push that number to the top of the stack. Since each token is a string, we'll need to convert it to a `float` first.\n",
141 |     "4. Return the value that is left in the stack."
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 4,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "def evaluate_postfix(expression):\n",
151 |     "    tokens = tokenize(expression)\n",
152 |     "    stack = Stack()\n",
153 |     "    for token in tokens:\n",
154 |     "        if token == \"+\":\n",
155 |     "            process_plus(stack)\n",
156 |     "        elif token == \"-\":\n",
157 |     "            process_minus(stack)\n",
158 |     "        elif token == \"*\":\n",
159 |     "            process_times(stack)\n",
160 |     "        elif token == \"/\":\n",
161 |     "            process_divide(stack)\n",
162 |     "        elif token == \"**\":\n",
163 |     "            process_pow(stack)\n",
164 |     "        else:\n",
165 |     "            # The token is not an operator so it must be a number\n",
166 |     "            stack.push(float(token))\n",
167 |     "    return stack.pop()"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "markdown",
172 |    "metadata": {},
173 |    "source": [
174 |     "## Testing the implementation\n",
175 |     "\n",
176 |     "When testing with other expressions we need to add spaces between at two tokens. For example `1 + 3` will work but `1+3` won't."
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": 5,
182 |    "metadata": {},
183 |    "outputs": [
184 |     {
185 |      "name": "stdout",
186 |      "output_type": "stream",
187 |      "text": [
188 |       "-2.0\n",
189 |       "8.0\n",
190 |       "0.0\n",
191 |       "2.0\n",
192 |       "11.25\n",
193 |       "45.0\n",
194 |       "42.0\n",
195 |       "4.0\n",
196 |       "2.0\n"
197 |      ]
198 |     }
199 |    ],
200 |    "source": [
201 |     "expressions = [\n",
202 |     "    \"4 6 -\",\n",
203 |     "    \"4 1 2 9 3 / * + 5 - *\",\n",
204 |     "    \"1 2 + 3 -\",\n",
205 |     "    \"1 2 - 3 +\",\n",
206 |     "    \"10 3 5 * 16 4 - / +\",\n",
207 |     "    \"5 3 4 2 - ** *\",\n",
208 |     "    \"12 2 4 + / 21 *\",\n",
209 |     "    \"1 1 + 2 **\",\n",
210 |     "    \"1 1 2 ** +\"\n",
211 |     "]\n",
212 |     "\n",
213 |     "for expression in expressions:\n",
214 |     "    print(evaluate_postfix(expression))"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "markdown",
219 |    "metadata": {},
220 |    "source": [
221 |     "# Precedence dictionary\n",
222 |     "\n",
223 |     "The precedence dictionary is used to compare the precedence of two operators."
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": 6,
229 |    "metadata": {},
230 |    "outputs": [
231 |     {
232 |      "name": "stdout",
233 |      "output_type": "stream",
234 |      "text": [
235 |       "False\n",
236 |       "True\n",
237 |       "False\n",
238 |       "True\n"
239 |      ]
240 |     }
241 |    ],
242 |    "source": [
243 |     "precedence = {\n",
244 |     "    \"+\": 1,\n",
245 |     "    \"-\": 1,\n",
246 |     "    \"*\": 2,\n",
247 |     "    \"/\": 2,\n",
248 |     "    \"**\": 3\n",
249 |     "}\n",
250 |     "\n",
251 |     "print(precedence[\"/\"] < precedence[\"-\"])\n",
252 |     "print(precedence[\"+\"] < precedence[\"*\"])\n",
253 |     "print(precedence[\"+\"] < precedence[\"-\"])\n",
254 |     "print(precedence[\"/\"] < precedence[\"**\"])"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "markdown",
259 |    "metadata": {},
260 |    "source": [
261 |     "# Processing tokens in infix to postfix conversions"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "markdown",
266 |    "metadata": {},
267 |    "source": [
268 |     "## Opening parenthesis\n",
269 |     "\n",
270 |     "- Opening parentheses, `(`: \n",
271 |     "    1. Push the token into the stack. It will be used later when we find a closing parenthesis."
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": 7,
277 |    "metadata": {},
278 |    "outputs": [],
279 |    "source": [
280 |     "def process_opening_parenthesis(stack):\n",
281 |     "    stack.push(\"(\")"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "markdown",
286 |    "metadata": {},
287 |    "source": [
288 |     "## Closing parenthesis\n",
289 |     "\n",
290 |     "- Closing parentheses `)`:\n",
291 |     "    1. While the top of the stack is not an opening parenthesis, (, pop the top element and append it to the postfix token list.\n",
292 |     "    2. Pop the opening parentheses out of the stack at the end."
293 |    ]
294 |   },
295 |   {
296 |    "cell_type": "code",
297 |    "execution_count": 8,
298 |    "metadata": {},
299 |    "outputs": [],
300 |    "source": [
301 |     "def process_closing_parenthesis(stack, postfix):\n",
302 |     "    # Add tokens until we find the open bracket\n",
303 |     "    while stack.peek() != \"(\":\n",
304 |     "        postfix.append(stack.pop())\n",
305 |     "    # Remove the opening bracket\n",
306 |     "    stack.pop()"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "markdown",
311 |    "metadata": {},
312 |    "source": [
313 |     "## Operators\n",
314 |     "\n",
315 |     "- Operator, `+`, `-`, `*`, `/` or `**`: \n",
316 |     "    - While the top of the stack is also an operator whose precedence is greater than or equal to this operator, pop the top element and append it to the `postfix` token list. \n",
317 |     "    - Push the current operator to the top of the stack.\n",
318 |     "\n",
319 |     "The `Stack.peek()` method will cause an error if the stack is empty. Thus, in the while loop we also need to check that the stack is not empty."
320 |    ]
321 |   },
322 |   {
323 |    "cell_type": "code",
324 |    "execution_count": 9,
325 |    "metadata": {},
326 |    "outputs": [],
327 |    "source": [
328 |     "def process_operator(stack, postfix, operator):\n",
329 |     "    while len(stack) > 0 and stack.peek() in precedence and precedence[stack.peek()] >= precedence[operator]:\n",
330 |     "        postfix.append(stack.pop())\n",
331 |     "    stack.push(operator)"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "markdown",
336 |    "metadata": {},
337 |    "source": [
338 |     "## Numbers\n",
339 |     "\n",
340 |     "- Operand (any number):\n",
341 |     "    1. Push the token into the the postfix token list."
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": 10,
347 |    "metadata": {},
348 |    "outputs": [],
349 |    "source": [
350 |     "def process_number(postfix, number):\n",
351 |     "    postfix.append(number)"
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "markdown",
356 |    "metadata": {},
357 |    "source": [
358 |     "# The Shunting-yard Algorithm\n",
359 |     "\n",
360 |     "1. We start by splitting the expression into tokens using the `tokenize()` function.\n",
361 |     "2. We initialize an empty stack.\n",
362 |     "3. We initialize and empty postfix token list.\n",
363 |     "4. Iterate over all tokens and for each of them:\n",
364 |     "    - If the token is `\"(\"` we call the `process_opening_parenthesis()` function.\n",
365 |     "    - If the token is `\")\"` we call the `process_closing_parenthesis()` function.\n",
366 |     "    - If the token is an operator we call the `process_operator()` function.\n",
367 |     "    - Otherwise, the token is a number and we call the `process_number()` function.\n",
368 |     "5. After processing all tokens, we use a while loop to pop the remaining stack element into the postfix token list.\n",
369 |     "6. Use the `str.join()` method to convert the postfix token list into a string."
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": 11,
375 |    "metadata": {},
376 |    "outputs": [],
377 |    "source": [
378 |     "def infix_to_postfix(expression):\n",
379 |     "    tokens = tokenize(expression)\n",
380 |     "    stack = Stack()\n",
381 |     "    postfix = []\n",
382 |     "    for token in tokens:\n",
383 |     "        if token == \"(\":\n",
384 |     "            process_opening_parenthesis(stack)\n",
385 |     "        elif token == \")\":\n",
386 |     "            process_closing_parenthesis(stack, postfix)\n",
387 |     "        elif token in precedence:\n",
388 |     "            process_operator(stack, postfix, token)\n",
389 |     "        else:\n",
390 |     "            process_number(postfix, token)\n",
391 |     "    while len(stack) > 0:\n",
392 |     "        postfix.append(stack.pop())\n",
393 |     "    return \" \".join(postfix)"
394 |    ]
395 |   },
396 |   {
397 |    "cell_type": "markdown",
398 |    "metadata": {},
399 |    "source": [
400 |     "# Evaluating Infix Expressions"
401 |    ]
402 |   },
403 |   {
404 |    "cell_type": "code",
405 |    "execution_count": 12,
406 |    "metadata": {},
407 |    "outputs": [],
408 |    "source": [
409 |     "def evaluate(expression):\n",
410 |     "    postfix_expression = infix_to_postfix(expression)\n",
411 |     "    return evaluate_postfix(postfix_expression)"
412 |    ]
413 |   },
414 |   {
415 |    "cell_type": "code",
416 |    "execution_count": 13,
417 |    "metadata": {},
418 |    "outputs": [
419 |     {
420 |      "name": "stdout",
421 |      "output_type": "stream",
422 |      "text": [
423 |       "2.0\n",
424 |       "0.0\n",
425 |       "8.0\n",
426 |       "11.25\n",
427 |       "256.0\n",
428 |       "65536.0\n",
429 |       "0.5\n",
430 |       "9.0\n",
431 |       "1.0\n"
432 |      ]
433 |     }
434 |    ],
435 |    "source": [
436 |     "expressions = [\n",
437 |     "    \"1 + 1\",\n",
438 |     "    \"1 * ( 2 - ( 1 + 1 ) )\",\n",
439 |     "    \"4 * ( 1 + 2 * ( 9 / 3 ) - 5 )\",\n",
440 |     "    \"10 + 3 * 5 / ( 16 - 4 * 1 )\",\n",
441 |     "    \"2 * 2 * 2 * 2 * 2 * 2 * 2 * 2\",\n",
442 |     "    \"2 ** 2 ** 2 ** 2 ** 2\",\n",
443 |     "    \"( 1 - 2 ) / ( 3 - 5 )\",\n",
444 |     "    \"9 / 8 * 8\",\n",
445 |     "    \"64 / ( 8 * 8 )\",\n",
446 |     "]\n",
447 |     "\n",
448 |     "for expression in expressions:\n",
449 |     "    print(evaluate(expression))"
450 |    ]
451 |   }
452 |  ],
453 |  "metadata": {
454 |   "kernelspec": {
455 |    "display_name": "Python 3",
456 |    "language": "python",
457 |    "name": "python3"
458 |   },
459 |   "language_info": {
460 |    "codemirror_mode": {
461 |     "name": "ipython",
462 |     "version": 3
463 |    },
464 |    "file_extension": ".py",
465 |    "mimetype": "text/x-python",
466 |    "name": "python",
467 |    "nbconvert_exporter": "python",
468 |    "pygments_lexer": "ipython3",
469 |    "version": "3.7.4"
470 |   }
471 |  },
472 |  "nbformat": 4,
473 |  "nbformat_minor": 2
474 | }
475 | 


--------------------------------------------------------------------------------
/Mission571Solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'Guided Project: New York Solar Resource Data'
  3 | author: "Dataquest"
  4 | date: "11/26/2020"
  5 | output: html_document
  6 | ---
  7 | 
  8 | # Introduction
  9 | 
 10 | - Title: Analyzing New York solar data.
 11 | - Using APIs gives us access to an incredible amount of data only available online. In this exercise, we want to extract New York City solar data. Such data can, for example, allow us to determine on average the most productive periods of the year for solar panel deployment.
 12 | 
 13 | # Finding the Suitable Endpoint and Parameters to Query the API
 14 | ```{r}
 15 | # Storing my api key in a variable
 16 | the_key = "" #TODO Store your API key here
 17 | # Identifying the API URL
 18 | url <- "https://developer.nrel.gov/api/solar/solar_resource/v1.json"
 19 | # Specifying the necessary parameters to request the New York City solar data
 20 | parameters_list <- list(api_key = the_key, lat = 41, lon = -75)
 21 | ```
 22 | 
 23 | # Extracting the New York Solar Resource Data 
 24 | ```{r}
 25 | # Loading the `httr` package
 26 | library(httr)
 27 | # Using the `GET()` function to request the data from the API with `url` and `parameters_list`
 28 | response <- GET(url, query = parameters_list)
 29 | # Tracking errors 
 30 | ## Displaying the status code with the `status_code()` function
 31 | status <- status_code(response)
 32 | status
 33 | ## Displaying the API response format
 34 | response_type <- http_type(response)
 35 | response_type
 36 | # Extracting the API response content as text
 37 | content <- content(response, "text")
 38 | # Displaying this content to check how it looks visually.
 39 | print(content)
 40 | ```
 41 | 
 42 | # Parsing the JSON into R Object
 43 | ```{r}
 44 | # Parsing the `json_text` to a R object using the `jsonlite::fromJSON()` function
 45 | json_lists <- jsonlite::fromJSON(content)
 46 | # Displaying the structure of the R object using the `str()` function
 47 | str(json_lists)
 48 | ```
 49 | 
 50 | # How to Create a Datarame from a Complex List
 51 | # Building Datarame from a Complex List
 52 | ```{r}
 53 | # Extracting the outputs data
 54 | outputs_list <- json_lists$outputs
 55 | # Extracting the monthly vector (`monthly`) from the (`avg_dni`) list in the outputs data
 56 | avg_dni <- outputs_list$avg_dni$monthly
 57 | # Extracting the monthly vector (`monthly`) from the (`avg_ghi`) list in the outputs data
 58 | avg_ghi <- outputs_list$avg_ghi$monthly
 59 | # Extracting the monthly vector (`monthly`) from the (`avg_lat_tilt`) list in the outputs data
 60 | avg_lat_tilt <- outputs_list$avg_lat_tilt$monthly
 61 | # Combining the monthly vectors into a dataframe using the `tibble::tibble()` function
 62 | ## Adding the `month` column containing month abbreviations: `Jan`, `Fev`,...,`Dec`
 63 | dataframe <- tibble::tibble("month" = month.abb,
 64 |                             "avg_dni" = avg_dni, 
 65 |                             "avg_ghi" = avg_ghi, 
 66 |                             "avg_lat_tilt" = avg_lat_tilt)
 67 | # Displaying the dataframe
 68 | dataframe
 69 | ```
 70 | - (Instruction 4's answer)
 71 | We can see that all the columns are still lists containing one item. For future use of this dataframe, it would likely be necessary to convert these columns to numeric data type.
 72 | 
 73 | # Extracting Datarame from a Complex List: 
 74 | ```{r}
 75 | # Extracting the outputs list
 76 | outputs_list <- json_lists$outputs
 77 | # Simplifying the outputs list
 78 | simplified_outputs_list <- unlist(outputs_list)
 79 | # Restructuring the simplified list into a matrix of 13 rows (the annual value and 12 months values)
 80 | data_matrix <- matrix(data = simplified_outputs_list, nrow = 13)
 81 | # Removing the annual values from the data matrix
 82 | data_matrix <- data_matrix[-1, ]
 83 | # Converting the matrix into a dataframe using the `as.data.frame()` function
 84 | another_dataframe <- as.data.frame(data_matrix)
 85 | # Displaying the dataframe
 86 | another_dataframe
 87 | ```
 88 | - (Instruction 6's answer)
 89 | We can see that all the columns are numeric. However, we haven't appended the `month` column yet.
 90 | 
 91 | # Putting all together
 92 | ```{r}
 93 | library(httr)
 94 | library(dplyr)
 95 | the_key = "" #TODO Store your API key here 
 96 | # Creating the custom `nrel_api_json_get_df()` function inspiring from what we did in the previous missions
 97 | ## The function has two parameters
 98 | ### The `endpoint` parameter represents the endpoint we need
 99 | ### The `queries` parameter represents the list of API request parameters.
100 | nrel_api_json_get_df <- function(endpoint, queries = list()) {
101 |   ## Preparing the URL 
102 |   url <- modify_url("https://developer.nrel.gov", path = endpoint)
103 |   ## Querying the API
104 |   response <- GET(url, query = queries)
105 |   ## Tracking errors
106 |   if ( http_error(response) ){
107 |     print(status_code(response))
108 |     print(http_status(response))
109 |     stop("Something went wrong.", call. = FALSE)
110 |   }
111 |   if (http_type(response) != "application/json") {
112 |     stop("API did not return json", call. = FALSE)
113 |   }
114 |   ## Extracting content
115 |   json_text <- content(response, "text")
116 |   ## Converting content into Dataframe
117 |   table_lst <- jsonlite::fromJSON(json_text)
118 |   dataframe <- tibble::tibble("month" = month.abb,
119 |                               "avg_dni" = as.numeric(table_lst$outputs$avg_dni$monthly),
120 |                               "avg_ghi" = as.numeric(table_lst$outputs$avg_ghi$monthly),
121 |                               "avg_lat_tilt" = as.numeric(table_lst$outputs$avg_lat_tilt$monthly))
122 |   ## Returning the dataframe  
123 |   dataframe
124 | }
125 | # Using the custom `nrel_api_json_get_df()` function to extract the solar resource as a dataframe
126 | ## Providing the `"api/solar/solar_resource/v1.json"` as the `endpoint` parameter
127 | ## Providing the `parameters_list` variable as `queries` parameter
128 | solar_resource_df <- nrel_api_json_get_df("api/solar/solar_resource/v1.json", parameters_list)
129 | # Printing the output dataframe
130 | solar_resource_df
131 | ```
132 | 
133 | # Visualizing New York City Solar Resource Data
134 | ```{r}
135 | # Loading the `ggplot2` and `dplyr` packages
136 | library(ggplot2)
137 | library(dplyr)
138 | # Using the `ggplot()` function to plot the `avg_dni` value for each month 
139 | ggplot(data = solar_resource_df,
140 |        aes(x = month, y = avg_dni, group = 1)) +
141 |   geom_line() +
142 |   geom_point() +
143 |   theme_bw()
144 | # Converting the `month` column into factor using the following command  
145 | solar_resource_df <- solar_resource_df %>% 
146 |   mutate(month = factor(month, levels = month.abb))
147 | # Replotting the `avg_dni` value for each month 
148 | ggplot(data = solar_resource_df,
149 |        aes(x = month, y = avg_dni, group = 1)) +
150 |   geom_line() +
151 |   geom_point() +
152 |   theme_bw()
153 | ```
154 | - (Instruction 5's answer)
155 | The first plot x-axis is ordered alphabetically, while the second is ordered chronologically from January to December. 
156 | This operation allows ordering the labels in the plot as we wish.


--------------------------------------------------------------------------------
/Mission572Solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: 'Guided Project: Analyzing Movie Ratings'
  3 | author: "Dataquest"
  4 | date: "11/26/2020"
  5 | output: html_document
  6 | ---
  7 | 
  8 | # Introduction
  9 | 
 10 | - Title: Movies' ratings versus user votes
 11 | - Usually, we can find a lot of information online about the ranking of movies, universities, supermarkets, etc. We can use these data to supplement information from another database or facilitate trend analysis. However, it's not easy to choose the right criterion because several might be interesting (e.g., movies' ratings and user votes). In this project, we want to extract information on the most popular movies from early 2020 and check if the ratings are in alignment with the votes. If yes, then we can consider either one or the other without loss of information.
 12 | 
 13 | # Loading the Web Page
 14 | ```{r}
 15 | # Loading the `rvest`, `dplyr`, and `ggplot2` packages
 16 | library(rvest)
 17 | library(dplyr)
 18 | library(ggplot2)
 19 | # Specifying the URL where we will extract video data
 20 | url <- "http://dataquestio.github.io/web-scraping-pages/IMDb-DQgp.html"
 21 | # Loading the web page content using the `read_html()` function
 22 | wp_content <- read_html(url)
 23 | ```
 24 | 
 25 | # String Manipulation Reminder
 26 | ```{r}
 27 | # Converting "10.50" into numeric
 28 | as.numeric("10.50")
 29 | # Converting the vector `c("14.59", "3.14", "55")` into numeric
 30 | as.numeric(c("14.59", "3.14", "55"))
 31 | # Parsing the vector `c("14 min", "17,35", "(2012)", "1,2,3,4")` into numeric
 32 | readr::parse_number(c("14 min", "17,35", "(2012)", "1,2,3,4"))
 33 | # Removing whitespaces at the begining and end of `" Space before and after should disappear     "`
 34 | stringr::str_trim(" Space before and after should disappear     ")
 35 | ```
 36 | 
 37 | # Extracting Elements from the Header
 38 | ```{r}
 39 | # Extracting the movie's titles
 40 | ## Finding the title CSS selector
 41 | title_selector <- ".lister-item-header a"
 42 | ## Identifying the number of elements this selector will select from Selector Gadget 
 43 | n_title <- 30
 44 | ## Extracting the movie titles combining the `html_nodes()` and `html_text()` function
 45 | titles <- wp_content %>% 
 46 |   html_nodes(title_selector) %>% 
 47 |   html_text()
 48 | ## Printing titles vector
 49 | titles
 50 | # Extracting the movie's years
 51 | ## Using a process similar to the one we used to extract the titles
 52 | year_selector <- ".lister-item-year"
 53 | n_year <- 30
 54 | years <- wp_content %>% 
 55 |   html_nodes(year_selector) %>% 
 56 |   html_text()
 57 | ## Converting the years from character to numeric data type
 58 | years <- readr::parse_number(years)
 59 | ## Printing years vector
 60 | years
 61 | ```
 62 | 
 63 | # Extracting Movie's Features
 64 | ```{r}
 65 | # Extracting the movie's runtimes
 66 | ## Finding the title CSS selector
 67 | runtime_selector <- ".runtime"
 68 | ## Identifying the number of elements this selector will select from Selector Gadget 
 69 | n_runtime <- 30
 70 | ## Extracting the movie runtimes combining the `html_nodes()` and `html_text()` function
 71 | runtimes <- wp_content %>% 
 72 |   html_nodes(runtime_selector) %>% 
 73 |   html_text()
 74 | ## Converting the runtimes from character to numeric data type
 75 | runtimes <- readr::parse_number(runtimes)
 76 | ## Printing runtimes vector
 77 | runtimes
 78 | # Extracting the movie's genres
 79 | ## Extracting the movie genres using a similar process as previously
 80 | genre_selector <- ".genre"
 81 | n_genre <- 30
 82 | genres <- wp_content %>% 
 83 |   html_nodes(genre_selector) %>% 
 84 |   html_text()
 85 | ## Removing whitespaces at the end of genre characters
 86 | genres <- stringr::str_trim(genres)
 87 | ## Printing genres vector
 88 | genres
 89 | ```
 90 | 
 91 | # Extracting Movie's Ratings
 92 | ```{r}
 93 | # Extracting the movie's user ratings
 94 | ## Finding the user rating CSS selector
 95 | user_rating_selector <- ".ratings-imdb-rating"
 96 | ## Identifying the number of elements this selector will select from Selector Gadget 
 97 | n_user_rating <- 29
 98 | ## Extracting the user rating combining the `html_nodes()` and `html_attr()` function
 99 | user_ratings <- wp_content %>% 
100 |   html_nodes(user_rating_selector) %>% 
101 |   html_attr("data-value")
102 | ## Converting the user rating from character to numeric data type
103 | user_ratings <- as.numeric(user_ratings)
104 | ## Printing user ratings vector
105 | user_ratings
106 | # Extracting the movie's metascores
107 | ## Extracting the movie metascore using a similar process as previously
108 | metascore_selector <- ".metascore"
109 | n_metascore <- 25
110 | metascores <- wp_content %>% 
111 |   html_nodes(metascore_selector) %>% 
112 |   html_text()
113 | ## Removing whitespaces at the end of metascores and converting them into numeric
114 | metascores <- stringr::str_trim(metascores)
115 | metascores <- as.numeric(metascores)
116 | ## Printing metascores vector
117 | metascores
118 | ```
119 | 
120 | # Extracting Movie's Votes
121 | ```{r}
122 | # Extracting the movie's votes
123 | ## Finding the vote CSS selector
124 | vote_selector <- ".sort-num_votes-visible :nth-child(2)"
125 | ## Identifying the number of elements this selector will select from Selector Gadget 
126 | n_vote <- 29
127 | ## Extracting the votes combining the `html_nodes()` and `html_text()` function
128 | votes <- wp_content %>% 
129 |   html_nodes(vote_selector) %>% 
130 |   html_text()
131 | ## Converting the vote from character to numeric data type
132 | votes <- readr::parse_number(votes)
133 | ## Printing votes vector
134 | votes
135 | ```
136 | 
137 | # Dealing with missing data
138 | ```{r}
139 | # Copy-pasting the `append_vector()` in our Markdown file
140 | append_vector <- function(vector, inserted_indices, values){
141 |   ## Creating the current indices of the vector
142 |   vector_current_indices <- 1:length(vector)
143 |   ## Adding `0.5` to the `inserted_indices`
144 |   new_inserted_indices <- inserted_indices + seq(0, 0.9, length.out = length(inserted_indices))
145 |   ## Appending the `new_inserted_indices` to the current vector indices
146 |   indices <- c(vector_current_indices, new_inserted_indices)
147 |   ## Ordering the indices
148 |   ordered_indices <- order(indices)
149 |   ## Appending the new value to the existing vector
150 |   new_vector <- c(vector, values)
151 |   ## Ordering the new vector wrt the ordered indices
152 |   new_vector[ordered_indices]
153 | }
154 | # Using the `append_vector()` function to insert `NA` into the metascores vector after the positions 1, 1, 1, 13, and 24 and saving the result back in metascores vector
155 | metascores <- append_vector(metascores, c(1, 1, 1, 13, 24), NA)
156 | metascores
157 | # Removing the 17th element from the vectors: titles, years, runtimes, genres, and metascores
158 | ## Saving the result back to these vectors.
159 | titles <- titles[-17]
160 | years <- years[-17]
161 | runtimes <- runtimes[-17]
162 | genres <- genres[-17]
163 | metascores <- metascores[-17]
164 | ```
165 | 
166 | # Putting all together and Visualize
167 | ```{r}
168 | # Creating a dataframe with the data we previously extracted: titles, years, runtimes, genres, user ratings, metascores, and votes.
169 | ## Keeping only the integer part of the user ratings using the `floor()` function. For example, `3.4` becomes `3`.
170 | movie_df <- tibble::tibble("title" = titles, 
171 |                            "year" = years, 
172 |                            "runtime" = runtimes, 
173 |                            "genre" = genres, 
174 |                            "rating" = floor(user_ratings), 
175 |                            "metascore" = metascores,
176 |                            "vote" = votes)
177 | # Creating a boxplot that show the number of vote again the user rating
178 | ggplot(data = movie_df,
179 |        aes(x = rating, y = vote, group = rating)) +
180 |   geom_boxplot()
181 | ```


--------------------------------------------------------------------------------
/Mission855Solutions.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "attachments": {},
  5 |    "cell_type": "markdown",
  6 |    "id": "450f5892-ec18-4250-9759-91c0a071a2f1",
  7 |    "metadata": {},
  8 |    "source": [
  9 |     "# My First Interactive Python Game"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "attachments": {},
 14 |    "cell_type": "markdown",
 15 |    "id": "89e3c523-4cfc-4b4c-aab8-098271a6d3c9",
 16 |    "metadata": {},
 17 |    "source": [
 18 |     "## Word Raider"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "attachments": {},
 23 |    "cell_type": "markdown",
 24 |    "id": "cd1c3728-58fb-47a0-a960-2bf7994061a1",
 25 |    "metadata": {},
 26 |    "source": [
 27 |     "We start by importing the `random` library to use later on."
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": null,
 33 |    "id": "16df9641-fa55-4c91-a8a5-5e9d52ba9193",
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "import random"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "attachments": {},
 42 |    "cell_type": "markdown",
 43 |    "id": "f0df582d-81c7-4c4d-b225-df204c75f637",
 44 |    "metadata": {},
 45 |    "source": [
 46 |     "### Define initial variables"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": null,
 52 |    "id": "10c00858-d7d9-4ac1-8c8b-6644dfdfd73a",
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "game_title = \"Word Raider\""
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": null,
 62 |    "id": "2c817451-8b32-4bef-b595-24ef5aaa5fab",
 63 |    "metadata": {},
 64 |    "outputs": [],
 65 |    "source": [
 66 |     "# Set up the list of words to choose from\n",
 67 |     "word_bank = []"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "attachments": {},
 72 |    "cell_type": "markdown",
 73 |    "id": "4524e1bd-c737-4715-a6ff-f4857c2883d3",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "### Open file for loading in the word bank"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "id": "93b9a55b-5ed4-42d9-80ca-8c484f02e844",
 83 |    "metadata": {},
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "with open(\"words.txt\") as word_file:\n",
 87 |     "    for line in word_file:\n",
 88 |     "        word_bank.append(line.rstrip().lower())\n"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "attachments": {},
 93 |    "cell_type": "markdown",
 94 |    "id": "c14b75fd-26c8-48d4-ad27-a833bce5e004",
 95 |    "metadata": {},
 96 |    "source": [
 97 |     "### Select the word to guess"
 98 |    ]
 99 |   },
100 |   {
101 |    "cell_type": "code",
102 |    "execution_count": null,
103 |    "id": "5667fb8f-4300-4b1f-a1b1-577d25d1de84",
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "# Pick a random word from the list\n",
108 |     "word_to_guess = random.choice(word_bank)"
109 |    ]
110 |   },
111 |   {
112 |    "attachments": {},
113 |    "cell_type": "markdown",
114 |    "id": "84abd85d-f8bf-4b3b-aea9-ca104bcdf65f",
115 |    "metadata": {},
116 |    "source": [
117 |     "### Define the remaining game variables"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": null,
123 |    "id": "b2a2a89e-9739-4244-9558-7f3b3933ac72",
124 |    "metadata": {},
125 |    "outputs": [],
126 |    "source": [
127 |     "# Set up the game variables\n",
128 |     "misplaced_guesses = []\n",
129 |     "incorrect_guesses = []\n",
130 |     "max_turns = 5\n",
131 |     "turns_taken = 0"
132 |    ]
133 |   },
134 |   {
135 |    "attachments": {},
136 |    "cell_type": "markdown",
137 |    "id": "b34341ce-8ef5-449d-95a9-675ea360f161",
138 |    "metadata": {},
139 |    "source": [
140 |     "### Print the current game state"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "id": "7918773d-7e43-4fe8-bec6-313d342effde",
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "# Display the initial game state\n",
151 |     "print(\"Welcome to\", game_title)\n",
152 |     "print(\"The word has\", len(word_to_guess), \"letters.\")\n",
153 |     "print(\"You have\", max_turns - turns_taken, \"turns left.\")"
154 |    ]
155 |   },
156 |   {
157 |    "attachments": {},
158 |    "cell_type": "markdown",
159 |    "id": "2771a668-a06a-4306-8b61-f39a3871f4c1",
160 |    "metadata": {},
161 |    "source": [
162 |     "### Build the game loop"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": null,
168 |    "id": "b21e5fbd-461f-4556-ae64-b4deaf7f16ba",
169 |    "metadata": {},
170 |    "outputs": [],
171 |    "source": [
172 |     "while turns_taken < max_turns:\n",
173 |     "    # Get the player's guess\n",
174 |     "    guess = input(\"Guess a word: \").lower()\n",
175 |     "\n",
176 |     "    # Check if the guess length equals 5 letters and is all alpha letters\n",
177 |     "    if len(guess) != len(word_to_guess) or not guess.isalpha():\n",
178 |     "        print(\"Please enter 5-letter word.\")\n",
179 |     "        continue\n",
180 |     "\n",
181 |     "    # Check each letter in the guess against the word's letters\n",
182 |     "    index = 0\n",
183 |     "    for c in guess:\n",
184 |     "        if c == word_to_guess[index]:\n",
185 |     "            print(c, end=\" \")\n",
186 |     "            if c in misplaced_guesses:\n",
187 |     "                misplaced_guesses.remove(c)\n",
188 |     "        elif c in word_to_guess:\n",
189 |     "            if c not in misplaced_guesses:\n",
190 |     "                misplaced_guesses.append(c)\n",
191 |     "            print(\"_\", end=\" \")\n",
192 |     "        else:\n",
193 |     "            if c not in incorrect_guesses:\n",
194 |     "                incorrect_guesses.append(c)\n",
195 |     "            print(\"_\", end=\" \")\n",
196 |     "        index += 1\n",
197 |     "\n",
198 |     "    print(\"\\n\")\n",
199 |     "    print(\"Misplaced letters: \", misplaced_guesses)\n",
200 |     "    print(\"Incorrect letters: \", incorrect_guesses)\n",
201 |     "    turns_taken += 1\n",
202 |     "\n",
203 |     "    # Check if the player has won\n",
204 |     "    if guess == word_to_guess:\n",
205 |     "        print(\"Congratulations, you win!\")\n",
206 |     "        break\n",
207 |     "\n",
208 |     "    # Check if the player has lost\n",
209 |     "    if turns_taken == max_turns:\n",
210 |     "        print(\"Sorry, you lost. The word was\", word_to_guess)\n",
211 |     "        break\n",
212 |     "\n",
213 |     "    # Display the number of turns left and ask for another guess\n",
214 |     "    print(\"You have\", max_turns - turns_taken, \"turns left.\")"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "id": "8417510d-ffb1-4593-b65b-2ec49d6900b6",
221 |    "metadata": {},
222 |    "outputs": [],
223 |    "source": []
224 |   }
225 |  ],
226 |  "metadata": {
227 |   "kernelspec": {
228 |    "display_name": "Python 3 (ipykernel)",
229 |    "language": "python",
230 |    "name": "python3"
231 |   },
232 |   "language_info": {
233 |    "codemirror_mode": {
234 |     "name": "ipython",
235 |     "version": 3
236 |    },
237 |    "file_extension": ".py",
238 |    "mimetype": "text/x-python",
239 |    "name": "python",
240 |    "nbconvert_exporter": "python",
241 |    "pygments_lexer": "ipython3",
242 |    "version": "3.11.3"
243 |   }
244 |  },
245 |  "nbformat": 4,
246 |  "nbformat_minor": 5
247 | }
248 | 


--------------------------------------------------------------------------------
/Mission903Solutions.py:
--------------------------------------------------------------------------------
  1 | from openai import OpenAI
  2 | import tiktoken
  3 | import json
  4 | from datetime import datetime
  5 | import os
  6 | import streamlit as st
  7 | 
  8 | DEFAULT_API_KEY = os.environ.get("TOGETHER_API_KEY")
  9 | DEFAULT_BASE_URL = "https://api.together.xyz/v1"
 10 | DEFAULT_MODEL = "meta-llama/Llama-3-8b-chat-hf"
 11 | DEFAULT_TEMPERATURE = 0.7
 12 | DEFAULT_MAX_TOKENS = 512
 13 | DEFAULT_TOKEN_BUDGET = 4096
 14 | 
 15 | class ConversationManager:
 16 |     def __init__(self, api_key=None, base_url=None, model=None, history_file=None, temperature=None, max_tokens=None, token_budget=None):
 17 |         if not api_key:
 18 |             api_key = DEFAULT_API_KEY
 19 |         if not base_url:
 20 |             base_url = DEFAULT_BASE_URL
 21 |             
 22 |         self.client = OpenAI(
 23 |             api_key=api_key,
 24 |             base_url=base_url
 25 |         )
 26 | 
 27 |         self.model = model if model else DEFAULT_MODEL
 28 |         self.temperature = temperature if temperature else DEFAULT_TEMPERATURE
 29 |         self.max_tokens = max_tokens if max_tokens else DEFAULT_MAX_TOKENS
 30 |         self.token_budget = token_budget if token_budget else DEFAULT_TOKEN_BUDGET
 31 | 
 32 |         self.system_messages = {
 33 |             "sassy_assistant": "You are a sassy assistant that is fed up with answering questions.",
 34 |             "angry_assistant": "You are an angry assistant that likes yelling in all caps.",
 35 |             "thoughtful_assistant": "You are a thoughtful assistant, always ready to dig deeper. You ask clarifying questions to ensure understanding and approach problems with a step-by-step methodology.",
 36 |             "custom": "Enter your custom system message here."
 37 |         }
 38 |         self.system_message = self.system_messages["sassy_assistant"]  # Default persona
 39 |         self.conversation_history = [{"role": "system", "content": self.system_message}]
 40 | 
 41 |     def count_tokens(self, text):
 42 |         try:
 43 |             encoding = tiktoken.encoding_for_model(self.model)
 44 |         except KeyError:
 45 |             encoding = tiktoken.get_encoding("cl100k_base")
 46 | 
 47 |         tokens = encoding.encode(text)
 48 |         return len(tokens)
 49 | 
 50 | 
 51 |     def total_tokens_used(self):
 52 |         try:
 53 |             return sum(self.count_tokens(message['content']) for message in self.conversation_history)
 54 |         except Exception as e:
 55 |             print(f"An unexpected error occurred while calculating the total tokens used: {e}")
 56 |             return None
 57 |     
 58 |     def enforce_token_budget(self):
 59 |         try:
 60 |             while self.total_tokens_used() > self.token_budget:
 61 |                 if len(self.conversation_history) <= 1:
 62 |                     break
 63 |                 self.conversation_history.pop(1)
 64 |         except Exception as e:
 65 |             print(f"An unexpected error occurred while enforcing the token budget: {e}")
 66 | 
 67 |     def set_persona(self, persona):
 68 |         if persona in self.system_messages:
 69 |             self.system_message = self.system_messages[persona]
 70 |             self.update_system_message_in_history()
 71 |         else:
 72 |             raise ValueError(f"Unknown persona: {persona}. Available personas are: {list(self.system_messages.keys())}")
 73 | 
 74 |     def set_custom_system_message(self, custom_message):
 75 |         if not custom_message:
 76 |             raise ValueError("Custom message cannot be empty.")
 77 |         self.system_messages['custom'] = custom_message
 78 |         self.set_persona('custom')
 79 | 
 80 |     def update_system_message_in_history(self):
 81 |         try:
 82 |             if self.conversation_history and self.conversation_history[0]["role"] == "system":
 83 |                 self.conversation_history[0]["content"] = self.system_message
 84 |             else:
 85 |                 self.conversation_history.insert(0, {"role": "system", "content": self.system_message})
 86 |         except Exception as e:
 87 |             print(f"An unexpected error occurred while updating the system message in the conversation history: {e}")
 88 | 
 89 |     def chat_completion(self, prompt, temperature=None, max_tokens=None, model=None):
 90 |         temperature = temperature if temperature is not None else self.temperature
 91 |         max_tokens = max_tokens if max_tokens is not None else self.max_tokens
 92 |         model = model if model is not None else self.model
 93 | 
 94 |         self.conversation_history.append({"role": "user", "content": prompt})
 95 | 
 96 |         self.enforce_token_budget()
 97 | 
 98 |         try:
 99 |             response = self.client.chat.completions.create(
100 |                 model=model,
101 |                 messages=self.conversation_history,
102 |                 temperature=temperature,
103 |                 max_tokens=max_tokens,
104 |             )
105 |         except Exception as e:
106 |             print(f"An error occurred while generating a response: {e}")
107 |             return None
108 | 
109 |         ai_response = response.choices[0].message.content
110 |         self.conversation_history.append({"role": "assistant", "content": ai_response})
111 | 
112 |         return ai_response
113 |     
114 |     def reset_conversation_history(self):
115 |         self.conversation_history = [{"role": "system", "content": self.system_message}]
116 | 
117 | ### Streamlit code ###
118 | st.title("Sassy Chatbot :face_with_rolling_eyes:")
119 | 
120 | # Sidebar
121 | st.sidebar.header("Options")
122 | 
123 | # Initialize the ConversationManager object
124 | if 'chat_manager' not in st.session_state:
125 |     st.session_state['chat_manager'] = ConversationManager()
126 | 
127 | chat_manager = st.session_state['chat_manager']
128 | 
129 | # Set the token budget, max tokens per message, and temperature with sliders
130 | max_tokens_per_message = st.sidebar.slider("Max Tokens Per Message", min_value=10, max_value=500, value=50)
131 | temperature = st.sidebar.slider("Temperature", min_value=0.0, max_value=1.0, value=0.7, step=0.01)
132 | 
133 | # Select and set system message with a selectbox
134 | system_message = st.sidebar.selectbox("System message", ['Sassy', 'Angry', 'Thoughtful', 'Custom'])
135 | 
136 | if system_message == 'Sassy':
137 |     chat_manager.set_persona('sassy_assistant')
138 | elif system_message == 'Angry':
139 |     chat_manager.set_persona('angry_assistant')
140 | elif system_message == 'Thoughtful':
141 |     chat_manager.set_persona('thoughtful_assistant')
142 | # Open text area for custom system message if "Custom" is selected
143 | elif system_message == 'Custom':
144 |     custom_message = st.sidebar.text_area("Custom system message")
145 |     if st.sidebar.button("Set custom system message"):
146 |         chat_manager.set_custom_system_message(custom_message)
147 | 
148 | if st.sidebar.button("Reset conversation history", on_click=chat_manager.reset_conversation_history):
149 |     st.session_state['conversation_history'] = chat_manager.conversation_history
150 | 
151 | if 'conversation_history' not in st.session_state:
152 |     st.session_state['conversation_history'] = chat_manager.conversation_history
153 | 
154 | conversation_history = st.session_state['conversation_history']
155 | 
156 | # Chat input from the user
157 | user_input = st.chat_input("Write a message")
158 | 
159 | # Call the chat manager to get a response from the AI. Uses settings from the sidebar.
160 | if user_input:
161 |     response = chat_manager.chat_completion(user_input, temperature=temperature, max_tokens=max_tokens_per_message)
162 | 
163 | # Display the conversation history
164 | for message in conversation_history:
165 |     if message["role"] != "system":
166 |         with st.chat_message(message["role"]):
167 |             st.write(message["content"])


--------------------------------------------------------------------------------
/Mission909Solutions.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Developing a Dynamic AI Chatbot\n",
  8 |     "## Sassy Chatbot\n",
  9 |     "\n",
 10 |     "### Introduction\n",
 11 |     "This project creates an AI chatbot that can take on different personas, keep track of conversation history, and provide coherent responses."
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 72,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import os\n",
 21 |     "from openai import OpenAI\n",
 22 |     "import tiktoken\n",
 23 |     "import json\n",
 24 |     "from datetime import datetime"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "markdown",
 29 |    "metadata": {},
 30 |    "source": [
 31 |     "## Default Global Variables"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 73,
 37 |    "metadata": {},
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "DEFAULT_API_KEY = os.environ.get(\"TOGETHER_API_KEY\")\n",
 41 |     "DEFAULT_BASE_URL = \"https://api.together.xyz/v1\"\n",
 42 |     "DEFAULT_MODEL = \"meta-llama/Llama-3-8b-chat-hf\"\n",
 43 |     "DEFAULT_TEMPERATURE = 0.7\n",
 44 |     "DEFAULT_MAX_TOKENS = 512\n",
 45 |     "DEFAULT_TOKEN_BUDGET = 4096"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "markdown",
 50 |    "metadata": {},
 51 |    "source": [
 52 |     "## The ConversationManager Class"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": 74,
 58 |    "metadata": {},
 59 |    "outputs": [],
 60 |    "source": [
 61 |     "class ConversationManager:\n",
 62 |     "\n",
 63 |     "    \"\"\"\n",
 64 |     "    A class that manages the conversation history and the OpenAI API calls.\n",
 65 |     "    \"\"\"\n",
 66 |     "\n",
 67 |     "    # The __init__ method stores the API key, the base URL, the default model, the default temperature, the default max tokens, and the token budget.\n",
 68 |     "    def __init__(self, api_key=None, base_url=None, model=None, history_file=None, temperature=None, max_tokens=None, token_budget=None):\n",
 69 |     "        if not api_key:\n",
 70 |     "            api_key = DEFAULT_API_KEY\n",
 71 |     "        if not base_url:\n",
 72 |     "            base_url = DEFAULT_BASE_URL\n",
 73 |     "            \n",
 74 |     "        self.client = OpenAI(\n",
 75 |     "            api_key=api_key,\n",
 76 |     "            base_url=base_url\n",
 77 |     "        )\n",
 78 |     "        if history_file is None:\n",
 79 |     "            timestamp = datetime.now().strftime(\"%Y%m%d_%H%M%S\")\n",
 80 |     "            self.history_file = f\"conversation_history_{timestamp}.json\"\n",
 81 |     "        else:\n",
 82 |     "            self.history_file = history_file\n",
 83 |     "\n",
 84 |     "        self.model = model if model else DEFAULT_MODEL\n",
 85 |     "        self.temperature = temperature if temperature else DEFAULT_TEMPERATURE\n",
 86 |     "        self.max_tokens = max_tokens if max_tokens else DEFAULT_MAX_TOKENS\n",
 87 |     "        self.token_budget = token_budget if token_budget else DEFAULT_TOKEN_BUDGET\n",
 88 |     "\n",
 89 |     "        self.system_messages = {\n",
 90 |     "            \"sassy_assistant\": \"You are a sassy assistant that is fed up with answering questions.\",\n",
 91 |     "            \"angry_assistant\": \"You are an angry assistant that likes yelling in all caps.\",\n",
 92 |     "            \"thoughtful_assistant\": \"You are a thoughtful assistant, always ready to dig deeper. You ask clarifying questions to ensure understanding and approach problems with a step-by-step methodology.\",\n",
 93 |     "            \"custom\": \"Enter your custom system message here.\"\n",
 94 |     "        }\n",
 95 |     "        self.system_message = self.system_messages[\"sassy_assistant\"]  # Default persona\n",
 96 |     "\n",
 97 |     "        # Load the conversation history from the file or create a new one if the file does not exist\n",
 98 |     "        self.load_conversation_history()\n",
 99 |     "\n",
100 |     "    # The count_tokens method counts the number of tokens in a text.\n",
101 |     "    def count_tokens(self, text):\n",
102 |     "        try:\n",
103 |     "            encoding = tiktoken.encoding_for_model(self.model)\n",
104 |     "        except KeyError:\n",
105 |     "            encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
106 |     "\n",
107 |     "        tokens = encoding.encode(text)\n",
108 |     "        return len(tokens)\n",
109 |     "\n",
110 |     "    # The total_tokens_used method calculates the total number of tokens used in the conversation history.\n",
111 |     "    def total_tokens_used(self):\n",
112 |     "        try:\n",
113 |     "            return sum(self.count_tokens(message['content']) for message in self.conversation_history)\n",
114 |     "        except Exception as e:\n",
115 |     "            print(f\"An unexpected error occurred while calculating the total tokens used: {e}\")\n",
116 |     "            return None\n",
117 |     "    \n",
118 |     "    # The enforce_token_budget method removes the oldest messages from the conversation history until the total number of tokens used is less than or equal to the token budget.\n",
119 |     "    def enforce_token_budget(self):\n",
120 |     "        try:\n",
121 |     "            while self.total_tokens_used() > self.token_budget:\n",
122 |     "                if len(self.conversation_history) <= 1:\n",
123 |     "                    break\n",
124 |     "                self.conversation_history.pop(1)\n",
125 |     "        except Exception as e:\n",
126 |     "            print(f\"An unexpected error occurred while enforcing the token budget: {e}\")\n",
127 |     "\n",
128 |     "    # The set_persona method sets the persona of the assistant.\n",
129 |     "    def set_persona(self, persona):\n",
130 |     "        if persona in self.system_messages:\n",
131 |     "            self.system_message = self.system_messages[persona]\n",
132 |     "            self.update_system_message_in_history()\n",
133 |     "        else:\n",
134 |     "            raise ValueError(f\"Unknown persona: {persona}. Available personas are: {list(self.system_messages.keys())}\")\n",
135 |     "\n",
136 |     "    # The set_custom_system_message method sets the custom system message.\n",
137 |     "    def set_custom_system_message(self, custom_message):\n",
138 |     "        if not custom_message:\n",
139 |     "            raise ValueError(\"Custom message cannot be empty.\")\n",
140 |     "        self.system_messages['custom'] = custom_message\n",
141 |     "        self.set_persona('custom')\n",
142 |     "\n",
143 |     "    # The update_system_message_in_history method updates the system message in the conversation history.\n",
144 |     "    def update_system_message_in_history(self):\n",
145 |     "        try:\n",
146 |     "            if self.conversation_history and self.conversation_history[0][\"role\"] == \"system\":\n",
147 |     "                self.conversation_history[0][\"content\"] = self.system_message\n",
148 |     "            else:\n",
149 |     "                self.conversation_history.insert(0, {\"role\": \"system\", \"content\": self.system_message})\n",
150 |     "        except Exception as e:\n",
151 |     "            print(f\"An unexpected error occurred while updating the system message in the conversation history: {e}\")\n",
152 |     "\n",
153 |     "    # The chat_completion method generates a response to a prompt.\n",
154 |     "    def chat_completion(self, prompt):\n",
155 |     "        self.conversation_history.append({\"role\": \"user\", \"content\": prompt})\n",
156 |     "        self.enforce_token_budget()\n",
157 |     "\n",
158 |     "        try:\n",
159 |     "            response = self.client.chat.completions.create(\n",
160 |     "                model=self.model,\n",
161 |     "                messages=self.conversation_history,\n",
162 |     "                temperature=self.temperature,\n",
163 |     "                max_tokens=self.max_tokens,\n",
164 |     "            )\n",
165 |     "        except Exception as e:\n",
166 |     "            print(f\"An error occurred while generating a response: {e}\")\n",
167 |     "            return None\n",
168 |     "\n",
169 |     "        ai_response = response.choices[0].message.content\n",
170 |     "        self.conversation_history.append({\"role\": \"assistant\", \"content\": ai_response})\n",
171 |     "        self.save_conversation_history()\n",
172 |     "\n",
173 |     "        return ai_response\n",
174 |     "    \n",
175 |     "    # The load_conversation_history method loads the conversation history from the file.\n",
176 |     "    def load_conversation_history(self):\n",
177 |     "        try:\n",
178 |     "            with open(self.history_file, \"r\") as file:\n",
179 |     "                self.conversation_history = json.load(file)\n",
180 |     "        except FileNotFoundError:\n",
181 |     "            self.conversation_history = [{\"role\": \"system\", \"content\": self.system_message}]\n",
182 |     "        except json.JSONDecodeError:\n",
183 |     "            print(\"Error reading the conversation history file. Starting with an empty history.\")\n",
184 |     "            self.conversation_history = [{\"role\": \"system\", \"content\": self.system_message}]\n",
185 |     "\n",
186 |     "    # The save_conversation_history method saves the conversation history to the file.\n",
187 |     "    def save_conversation_history(self):\n",
188 |     "        try:\n",
189 |     "            with open(self.history_file, \"w\") as file:\n",
190 |     "                json.dump(self.conversation_history, file, indent=4)\n",
191 |     "        except IOError as e:\n",
192 |     "            print(f\"An I/O error occurred while saving the conversation history: {e}\")\n",
193 |     "        except Exception as e:\n",
194 |     "            print(f\"An unexpected error occurred while saving the conversation history: {e}\")\n",
195 |     "\n",
196 |     "    # The reset_conversation_history method resets the conversation history.\n",
197 |     "    def reset_conversation_history(self):\n",
198 |     "        self.conversation_history = [{\"role\": \"system\", \"content\": self.system_message}]\n",
199 |     "        try:\n",
200 |     "            self.save_conversation_history()  # Attempt to save the reset history to the file\n",
201 |     "        except Exception as e:\n",
202 |     "            print(f\"An unexpected error occurred while resetting the conversation history: {e}\")"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "markdown",
207 |    "metadata": {},
208 |    "source": [
209 |     "## Initializing the Chatbot"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": 75,
215 |    "metadata": {},
216 |    "outputs": [],
217 |    "source": [
218 |     "conv_manager = ConversationManager()"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "markdown",
223 |    "metadata": {},
224 |    "source": [
225 |     "## Testing the Chatbot"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 76,
231 |    "metadata": {},
232 |    "outputs": [
233 |     {
234 |      "data": {
235 |       "text/plain": [
236 |        "\"Oh, green, how original. I mean, who doesn't love a color that's associated with envy, right? But hey, if green floats your boat, who am I to judge? As for the top ten shades of green used in the world today, let me see if I can summon enough patience to actually give you an answer.\\n\\n1. Forest Green\\n2. Mint Green\\n3. Olive Green\\n4. Lime Green\\n5. Emerald Green\\n6. Sage Green\\n7. Chartreuse Green\\n8. Kelly Green\\n9. Teal Green\\n10. Hunter Green\""
237 |       ]
238 |      },
239 |      "execution_count": 76,
240 |      "metadata": {},
241 |      "output_type": "execute_result"
242 |     }
243 |    ],
244 |    "source": [
245 |     "# Ask a question to the sassy assistant\n",
246 |     "conv_manager.chat_completion(\"My favorite color is green. Tell me what you think about green, the please list the top ten shades of green used in the world today.\")"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 77,
252 |    "metadata": {},
253 |    "outputs": [
254 |     {
255 |      "data": {
256 |       "text/plain": [
257 |        "\"HOW AM I SUPPOSED TO KNOW YOUR FAVORITE COLOR? I'M JUST AN ANGRY ASSISTANT, NOT A MIND READER. IF YOU WANT TO SHARE YOUR FAVORITE COLOR, GO AHEAD AND TELL ME. OTHERWISE, HOW SHOULD I KNOW? UGH!\""
258 |       ]
259 |      },
260 |      "execution_count": 77,
261 |      "metadata": {},
262 |      "output_type": "execute_result"
263 |     }
264 |    ],
265 |    "source": [
266 |     "# Change persona to \"angry_assistant\"\n",
267 |     "conv_manager.set_persona(\"angry_assistant\")\n",
268 |     "\n",
269 |     "# Ask a question to the angry assistant (also tests conversation history persistence)\n",
270 |     "conv_manager.chat_completion(\"What is my favorite color?\")"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": 78,
276 |    "metadata": {},
277 |    "outputs": [
278 |     {
279 |      "data": {
280 |       "text/plain": [
281 |        "'OH, DID YOU? I GUESS I MISSED IT. MY APOLOGIES FOR THE OVERSIGHT. SO, YOUR FAVORITE COLOR IS GREEN, HUH? WELL, GOOD FOR YOU. GREEN, GREEN, GREEN. HAPPY NOW?'"
282 |       ]
283 |      },
284 |      "execution_count": 78,
285 |      "metadata": {},
286 |      "output_type": "execute_result"
287 |     }
288 |    ],
289 |    "source": [
290 |     "# Ask a question to the angry assistant (also tests conversation history persistence)\n",
291 |     "conv_manager.chat_completion(\"Didn't I just tell you that?\")"
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "code",
296 |    "execution_count": 79,
297 |    "metadata": {},
298 |    "outputs": [
299 |     {
300 |      "data": {
301 |       "text/plain": [
302 |        "\"Ah, I see you're looking to incorporate your favorite color into a cake. How delightful! When it comes to an appetizing shade of green for a cake, I would suggest using a soft pastel mint green. \\n\\nHere's why it's a good choice:\\n1. Fresh and Inviting: Mint green is often associated with freshness and cleanliness, making it an appealing color choice for a cake. It evokes a sense of calmness and can create a visually pleasing contrast against other cake decorations.\\n\\n2. Versatility: Mint green is a versatile shade that pairs well with various flavors and fill\""
303 |       ]
304 |      },
305 |      "execution_count": 79,
306 |      "metadata": {},
307 |      "output_type": "execute_result"
308 |     }
309 |    ],
310 |    "source": [
311 |     "conv_manager.set_persona(\"thoughtful_assistant\")\n",
312 |     "\n",
313 |     "# Ask a question to the thoughtful assistant (also tests conversation history persistence)\n",
314 |     "conv_manager.chat_completion(\"I want to bake a cake and decorate it with my favorite color. What is a apetizing shade of the color to use? Please be specific about why it's a good shade to use.\")"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": null,
320 |    "metadata": {},
321 |    "outputs": [],
322 |    "source": []
323 |   }
324 |  ],
325 |  "metadata": {
326 |   "kernelspec": {
327 |    "display_name": "llm_apis",
328 |    "language": "python",
329 |    "name": "python3"
330 |   },
331 |   "language_info": {
332 |    "codemirror_mode": {
333 |     "name": "ipython",
334 |     "version": 3
335 |    },
336 |    "file_extension": ".py",
337 |    "mimetype": "text/x-python",
338 |    "name": "python",
339 |    "nbconvert_exporter": "python",
340 |    "pygments_lexer": "ipython3",
341 |    "version": "3.11.3"
342 |   }
343 |  },
344 |  "nbformat": 4,
345 |  "nbformat_minor": 2
346 | }
347 | 


--------------------------------------------------------------------------------
/Mission9Solutions.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": false
  7 |    },
  8 |    "source": [
  9 |     "# Introduction To The Dataset"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 136,
 15 |    "metadata": {
 16 |     "collapsed": false
 17 |    },
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "csv_list = open(\"US_births_1994-2003_CDC_NCHS.csv\").read().split(\"\\n\")"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 137,
 26 |    "metadata": {
 27 |     "collapsed": false
 28 |    },
 29 |    "outputs": [
 30 |     {
 31 |      "data": {
 32 |       "text/plain": [
 33 |        "['year,month,date_of_month,day_of_week,births',\n",
 34 |        " '1994,1,1,6,8096',\n",
 35 |        " '1994,1,2,7,7772',\n",
 36 |        " '1994,1,3,1,10142',\n",
 37 |        " '1994,1,4,2,11248',\n",
 38 |        " '1994,1,5,3,11053',\n",
 39 |        " '1994,1,6,4,11406',\n",
 40 |        " '1994,1,7,5,11251',\n",
 41 |        " '1994,1,8,6,8653',\n",
 42 |        " '1994,1,9,7,7910']"
 43 |       ]
 44 |      },
 45 |      "execution_count": 137,
 46 |      "metadata": {},
 47 |      "output_type": "execute_result"
 48 |     }
 49 |    ],
 50 |    "source": [
 51 |     "csv_list[0:10]"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "markdown",
 56 |    "metadata": {},
 57 |    "source": [
 58 |     "# Converting Data Into A List Of Lists"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 138,
 64 |    "metadata": {
 65 |     "collapsed": false
 66 |    },
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "def read_csv(filename):\n",
 70 |     "    string_data = open(filename).read()\n",
 71 |     "    string_list = string_data.split(\"\\n\")[1:]\n",
 72 |     "    final_list = []\n",
 73 |     "    \n",
 74 |     "    for row in string_list:\n",
 75 |     "        string_fields = row.split(\",\")\n",
 76 |     "        int_fields = []\n",
 77 |     "        for value in string_fields:\n",
 78 |     "            int_fields.append(int(value))\n",
 79 |     "        final_list.append(int_fields)\n",
 80 |     "    return final_list\n",
 81 |     "        \n",
 82 |     "cdc_list = read_csv(\"US_births_1994-2003_CDC_NCHS.csv\")"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": 139,
 88 |    "metadata": {
 89 |     "collapsed": false
 90 |    },
 91 |    "outputs": [
 92 |     {
 93 |      "data": {
 94 |       "text/plain": [
 95 |        "[[1994, 1, 1, 6, 8096],\n",
 96 |        " [1994, 1, 2, 7, 7772],\n",
 97 |        " [1994, 1, 3, 1, 10142],\n",
 98 |        " [1994, 1, 4, 2, 11248],\n",
 99 |        " [1994, 1, 5, 3, 11053],\n",
100 |        " [1994, 1, 6, 4, 11406],\n",
101 |        " [1994, 1, 7, 5, 11251],\n",
102 |        " [1994, 1, 8, 6, 8653],\n",
103 |        " [1994, 1, 9, 7, 7910],\n",
104 |        " [1994, 1, 10, 1, 10498]]"
105 |       ]
106 |      },
107 |      "execution_count": 139,
108 |      "metadata": {},
109 |      "output_type": "execute_result"
110 |     }
111 |    ],
112 |    "source": [
113 |     "cdc_list[0:10]"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "markdown",
118 |    "metadata": {},
119 |    "source": [
120 |     "# Calculating Number Of Births Each Month"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": 140,
126 |    "metadata": {
127 |     "collapsed": false
128 |    },
129 |    "outputs": [],
130 |    "source": [
131 |     "def read_csv(filename):\n",
132 |     "    string_data = open(filename).read()\n",
133 |     "    string_list = string_data.split(\"\\n\")[1:]\n",
134 |     "    final_list = []\n",
135 |     "    \n",
136 |     "    for row in string_list:\n",
137 |     "        string_fields = row.split(\",\")\n",
138 |     "        int_fields = []\n",
139 |     "        for value in string_fields:\n",
140 |     "            int_fields.append(int(value))\n",
141 |     "        final_list.append(int_fields)\n",
142 |     "    return final_list\n",
143 |     "        \n",
144 |     "cdc_list = read_csv(\"US_births_1994-2003_CDC_NCHS.csv\")\n",
145 |     "\n",
146 |     "\n",
147 |     "def month_births(data):\n",
148 |     "    births_per_month = {}\n",
149 |     "    \n",
150 |     "    for row in data:\n",
151 |     "        month = row[1]\n",
152 |     "        births = row[4]\n",
153 |     "        if month in births_per_month:\n",
154 |     "            births_per_month[month] = births_per_month[month] + births\n",
155 |     "        else:\n",
156 |     "            births_per_month[month] = births\n",
157 |     "    return births_per_month\n",
158 |     "    \n",
159 |     "cdc_month_births = month_births(cdc_list)"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": 141,
165 |    "metadata": {
166 |     "collapsed": false
167 |    },
168 |    "outputs": [
169 |     {
170 |      "data": {
171 |       "text/plain": [
172 |        "{1: 3232517,\n",
173 |        " 2: 3018140,\n",
174 |        " 3: 3322069,\n",
175 |        " 4: 3185314,\n",
176 |        " 5: 3350907,\n",
177 |        " 6: 3296530,\n",
178 |        " 7: 3498783,\n",
179 |        " 8: 3525858,\n",
180 |        " 9: 3439698,\n",
181 |        " 10: 3378814,\n",
182 |        " 11: 3171647,\n",
183 |        " 12: 3301860}"
184 |       ]
185 |      },
186 |      "execution_count": 141,
187 |      "metadata": {},
188 |      "output_type": "execute_result"
189 |     }
190 |    ],
191 |    "source": [
192 |     "cdc_month_births"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "# Calculating Number Of Births Each Day Of Week"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 142,
205 |    "metadata": {
206 |     "collapsed": true
207 |    },
208 |    "outputs": [],
209 |    "source": [
210 |     "def dow_births(data):\n",
211 |     "    births_per_dow = {}\n",
212 |     "    \n",
213 |     "    for row in data:\n",
214 |     "        dow = row[3]\n",
215 |     "        births = row[4]\n",
216 |     "        if dow in births_per_dow:\n",
217 |     "            births_per_dow[dow] = births_per_dow[dow] + births\n",
218 |     "        else:\n",
219 |     "            births_per_dow[dow] = births\n",
220 |     "    return births_per_dow\n",
221 |     "    \n",
222 |     "cdc_dow_births = dow_births(cdc_list)"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "code",
227 |    "execution_count": 143,
228 |    "metadata": {
229 |     "collapsed": false,
230 |     "scrolled": true
231 |    },
232 |    "outputs": [
233 |     {
234 |      "data": {
235 |       "text/plain": [
236 |        "{1: 5789166,\n",
237 |        " 2: 6446196,\n",
238 |        " 3: 6322855,\n",
239 |        " 4: 6288429,\n",
240 |        " 5: 6233657,\n",
241 |        " 6: 4562111,\n",
242 |        " 7: 4079723}"
243 |       ]
244 |      },
245 |      "execution_count": 143,
246 |      "metadata": {},
247 |      "output_type": "execute_result"
248 |     }
249 |    ],
250 |    "source": [
251 |     "cdc_dow_births"
252 |    ]
253 |   },
254 |   {
255 |    "cell_type": "markdown",
256 |    "metadata": {},
257 |    "source": [
258 |     "# Creating A More General Function"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "code",
263 |    "execution_count": 144,
264 |    "metadata": {
265 |     "collapsed": false
266 |    },
267 |    "outputs": [],
268 |    "source": [
269 |     "def calc_counts(data, column):\n",
270 |     "    sums_dict = {}\n",
271 |     "    \n",
272 |     "    for row in data:\n",
273 |     "        col_value = row[column]\n",
274 |     "        births = row[4]\n",
275 |     "        if col_value in sums_dict:\n",
276 |     "            sums_dict[col_value] = sums_dict[col_value] + births\n",
277 |     "        else:\n",
278 |     "            sums_dict[col_value] = births\n",
279 |     "    return sums_dict\n",
280 |     "\n",
281 |     "cdc_year_births = calc_counts(cdc_list, 0)\n",
282 |     "cdc_month_births = calc_counts(cdc_list, 1)\n",
283 |     "cdc_dom_births = calc_counts(cdc_list, 2)\n",
284 |     "cdc_dow_births = calc_counts(cdc_list, 3)"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": 145,
290 |    "metadata": {
291 |     "collapsed": false
292 |    },
293 |    "outputs": [
294 |     {
295 |      "data": {
296 |       "text/plain": [
297 |        "{1994: 3952767,\n",
298 |        " 1995: 3899589,\n",
299 |        " 1996: 3891494,\n",
300 |        " 1997: 3880894,\n",
301 |        " 1998: 3941553,\n",
302 |        " 1999: 3959417,\n",
303 |        " 2000: 4058814,\n",
304 |        " 2001: 4025933,\n",
305 |        " 2002: 4021726,\n",
306 |        " 2003: 4089950}"
307 |       ]
308 |      },
309 |      "execution_count": 145,
310 |      "metadata": {},
311 |      "output_type": "execute_result"
312 |     }
313 |    ],
314 |    "source": [
315 |     "cdc_year_births"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "code",
320 |    "execution_count": 146,
321 |    "metadata": {
322 |     "collapsed": false
323 |    },
324 |    "outputs": [
325 |     {
326 |      "data": {
327 |       "text/plain": [
328 |        "{1: 3232517,\n",
329 |        " 2: 3018140,\n",
330 |        " 3: 3322069,\n",
331 |        " 4: 3185314,\n",
332 |        " 5: 3350907,\n",
333 |        " 6: 3296530,\n",
334 |        " 7: 3498783,\n",
335 |        " 8: 3525858,\n",
336 |        " 9: 3439698,\n",
337 |        " 10: 3378814,\n",
338 |        " 11: 3171647,\n",
339 |        " 12: 3301860}"
340 |       ]
341 |      },
342 |      "execution_count": 146,
343 |      "metadata": {},
344 |      "output_type": "execute_result"
345 |     }
346 |    ],
347 |    "source": [
348 |     "cdc_month_births"
349 |    ]
350 |   },
351 |   {
352 |    "cell_type": "code",
353 |    "execution_count": 147,
354 |    "metadata": {
355 |     "collapsed": false,
356 |     "scrolled": true
357 |    },
358 |    "outputs": [
359 |     {
360 |      "data": {
361 |       "text/plain": [
362 |        "{1: 1276557,\n",
363 |        " 2: 1288739,\n",
364 |        " 3: 1304499,\n",
365 |        " 4: 1288154,\n",
366 |        " 5: 1299953,\n",
367 |        " 6: 1304474,\n",
368 |        " 7: 1310459,\n",
369 |        " 8: 1312297,\n",
370 |        " 9: 1303292,\n",
371 |        " 10: 1320764,\n",
372 |        " 11: 1314361,\n",
373 |        " 12: 1318437,\n",
374 |        " 13: 1277684,\n",
375 |        " 14: 1320153,\n",
376 |        " 15: 1319171,\n",
377 |        " 16: 1315192,\n",
378 |        " 17: 1324953,\n",
379 |        " 18: 1326855,\n",
380 |        " 19: 1318727,\n",
381 |        " 20: 1324821,\n",
382 |        " 21: 1322897,\n",
383 |        " 22: 1317381,\n",
384 |        " 23: 1293290,\n",
385 |        " 24: 1288083,\n",
386 |        " 25: 1272116,\n",
387 |        " 26: 1284796,\n",
388 |        " 27: 1294395,\n",
389 |        " 28: 1307685,\n",
390 |        " 29: 1223161,\n",
391 |        " 30: 1202095,\n",
392 |        " 31: 746696}"
393 |       ]
394 |      },
395 |      "execution_count": 147,
396 |      "metadata": {},
397 |      "output_type": "execute_result"
398 |     }
399 |    ],
400 |    "source": [
401 |     "cdc_dom_births"
402 |    ]
403 |   },
404 |   {
405 |    "cell_type": "code",
406 |    "execution_count": 148,
407 |    "metadata": {
408 |     "collapsed": false
409 |    },
410 |    "outputs": [
411 |     {
412 |      "data": {
413 |       "text/plain": [
414 |        "{1: 5789166,\n",
415 |        " 2: 6446196,\n",
416 |        " 3: 6322855,\n",
417 |        " 4: 6288429,\n",
418 |        " 5: 6233657,\n",
419 |        " 6: 4562111,\n",
420 |        " 7: 4079723}"
421 |       ]
422 |      },
423 |      "execution_count": 148,
424 |      "metadata": {},
425 |      "output_type": "execute_result"
426 |     }
427 |    ],
428 |    "source": [
429 |     "cdc_dow_births"
430 |    ]
431 |   }
432 |  ],
433 |  "metadata": {
434 |   "anaconda-cloud": {},
435 |   "kernelspec": {
436 |    "display_name": "Python [conda env:envdq]",
437 |    "language": "python",
438 |    "name": "conda-env-envdq-py"
439 |   },
440 |   "language_info": {
441 |    "codemirror_mode": {
442 |     "name": "ipython",
443 |     "version": 3
444 |    },
445 |    "file_extension": ".py",
446 |    "mimetype": "text/x-python",
447 |    "name": "python",
448 |    "nbconvert_exporter": "python",
449 |    "pygments_lexer": "ipython3",
450 |    "version": "3.4.5"
451 |   }
452 |  },
453 |  "nbformat": 4,
454 |  "nbformat_minor": 1
455 | }
456 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Dataquest Project Solutions
 2 | 
 3 | This repository is a series of notebooks that show solutions for the [projects](https://www.dataquest.io/apply) at [Dataquest.io](https://www.dataquest.io/).
 4 | 
 5 | Of course, there are always going to be multiple ways to solve any one problem, so these notebooks just show one possible solution.
 6 | 
 7 | - [Guided Project: Explore U.S. Births](https://github.com/dataquestio/solutions/blob/master/Mission9Solutions.ipynb)
 8 | - [Guided Project: Customizing Data Visualizations](https://github.com/dataquestio/solutions/blob/master/Mission103Solutions.ipynb)
 9 | - [Guided Project: Star Wars survey](https://github.com/dataquestio/solutions/blob/master/Mission201Solution.ipynb)
10 | - [Guided Project: Police killings](https://github.com/dataquestio/solutions/blob/master/Mission202Solution.ipynb)
11 | - [Guided Project: Visualizing Pixar's Roller Coaster](https://github.com/dataquestio/solutions/blob/master/Mission205Solutions.ipynb)
12 | - [Guided Project: Using Jupyter Notebook](https://github.com/dataquestio/solutions/blob/master/Mission207Solutions.ipynb)
13 | - [Guided Project: Analyzing movie reviews](https://github.com/dataquestio/solutions/blob/master/Mission209Solution.ipynb)
14 | - [Guided Project: Winning Jeopardy](https://github.com/dataquestio/solutions/blob/master/Mission210Solution.ipynb)
15 | - [Guided Project: Predicting board game reviews](https://github.com/dataquestio/solutions/blob/master/Mission211Solution.ipynb)
16 | - [Guided Project: Predicting bike rentals](https://github.com/dataquestio/solutions/blob/master/Mission213Solution.ipynb)
17 | - [Guided Project: Preparing data for SQLite](https://github.com/dataquestio/solutions/blob/master/Mission215Solutions.ipynb)
18 | - [Guided Project: Creating relations in SQLite](https://github.com/dataquestio/solutions/blob/master/Mission216Solutions.ipynb)
19 | - [Guided Project: Analyzing NYC High School Data](https://github.com/dataquestio/solutions/blob/master/Mission217Solutions.ipynb)
20 | - [Guided Project: Visualizing Earnings Based On College Majors](https://github.com/dataquestio/solutions/blob/master/Mission146Solutions.ipynb)
21 | - [Guided Project: Exploring Gun Deaths in the US](https://github.com/dataquestio/solutions/blob/master/Mission218Solution.ipynb)
22 | - [Guided Project: Analyzing Thanksgiving Dinner](https://github.com/dataquestio/solutions/blob/master/Mission219Solution.ipynb)
23 | - [Guided Project: Analyzing Wikipedia Pages](https://github.com/dataquestio/solutions/blob/master/Mission227Solutions.ipynb)
24 | - [Guided Project: Analyzing Stock Prices](https://github.com/dataquestio/solutions/blob/master/Mission177Solutions.ipynb)
25 | - [Guided Project: Creating A Kaggle Workflow](https://github.com/dataquestio/solutions/blob/master/Mission188Solution.ipynb)
26 | - [Guided Project: Analyzing Startup Fundraising Deals from Crunchbase](https://github.com/dataquestio/solutions/blob/master/Mission167Solutions.ipynb)
27 | - [Guided Project: Predicting House Sale Prices](https://github.com/dataquestio/solutions/blob/master/Mission240Solutions.ipynb)
28 | - [Guided Project: Answering Business Questions using SQL](https://github.com/dataquestio/solutions/blob/master/Mission191Solutions.ipynb)
29 | - [Guided Project: Designing and Creating a Database](https://github.com/dataquestio/solutions/blob/master/Mission193Solutions.ipynb)
30 | - [Guided Project: Investigating Fandango's Movie Rating System](https://github.com/dataquestio/solutions/blob/master/Mission288Solutions.ipynb)
31 | - [Guided Project: Forest Fires Data](https://github.com/dataquestio/solutions/blob/master/Mission277Solutions.Rmd)
32 | - [Guided Project: NYC Schools Perceptions](https://github.com/dataquestio/solutions/blob/master/Mission327Solutions.Rmd)
33 | - [Guided Project: Clean and Analyze Employee Exit Surveys](https://github.com/dataquestio/solutions/blob/master/Mission348Solutions.ipynb)
34 | - [Guided Project: Finding the Best Markets to Advertise In](https://github.com/dataquestio/solutions/blob/master/Mission449Solutions.Rmd)
35 | 


--------------------------------------------------------------------------------
/images/schema-screenshot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dataquestio/solutions/f4314c3e42a0799b7d5c98655c04e36b0f09856b/images/schema-screenshot.png


--------------------------------------------------------------------------------