├── .gitignore
├── README.md
├── chapter-01
├── .ipynb_checkpoints
│   ├── Exercise 01 — Test scores-checkpoint.ipynb
│   ├── Exercise 01b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 02 — Scaling test scores-checkpoint.ipynb
│   ├── Exercise 02b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 03 — Counting 10s digits-checkpoint.ipynb
│   ├── Exercise 03b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 04 — Descriptive statistics-checkpoint.ipynb
│   ├── Exercise 04b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 05 — Monday temperatures-checkpoint.ipynb
│   ├── Exercise 05b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 06 — Passenger frequency-checkpoint.ipynb
│   ├── Exercise 06b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 07 — Long, medium, and short rides-checkpoint.ipynb
│   └── Exercise 07b — Beyond the exercise-checkpoint.ipynb
├── Exercise 01 — Test scores.ipynb
├── Exercise 01b — Beyond the exercise.ipynb
├── Exercise 02 — Scaling test scores.ipynb
├── Exercise 02b — Beyond the exercise.ipynb
├── Exercise 03 — Counting 10s digits.ipynb
├── Exercise 03b — Beyond the exercise.ipynb
├── Exercise 04 — Descriptive statistics.ipynb
├── Exercise 04b — Beyond the exercise.ipynb
├── Exercise 05 — Monday temperatures.ipynb
├── Exercise 05b — Beyond the exercise.ipynb
├── Exercise 06 — Passenger frequency.ipynb
├── Exercise 06b — Beyond the exercise.ipynb
├── Exercise 07 — Long, medium, and short rides.ipynb
└── Exercise 07b — Beyond the exercise.ipynb
├── chapter-02
├── .ipynb_checkpoints
│   ├── Exercise 08 — Net revenue-checkpoint.ipynb
│   ├── Exercise 08b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 09 — Tax planning-checkpoint.ipynb
│   ├── Exercise 09b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 10 — Adding products-checkpoint.ipynb
│   ├── Exercise 10b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 11 — Best sellers-checkpoint.ipynb
│   ├── Exercise 11b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 12 — Finding outliers-checkpoint.ipynb
│   ├── Exercise 12b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 13 — Interpolation-checkpoint.ipynb
│   ├── Exercise 13b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 14 — Selective updating-checkpoint.ipynb
│   └── Exercise 14b — Beyond the exercise-checkpoint.ipynb
├── Exercise 08 — Net revenue.ipynb
├── Exercise 08b — Beyond the exercise.ipynb
├── Exercise 09 — Tax planning.ipynb
├── Exercise 09b — Beyond the exercise.ipynb
├── Exercise 10 — Adding products.ipynb
├── Exercise 10b — Beyond the exercise.ipynb
├── Exercise 11 — Best sellers.ipynb
├── Exercise 11b — Beyond the exercise.ipynb
├── Exercise 12 — Finding outliers.ipynb
├── Exercise 12b — Beyond the exercise.ipynb
├── Exercise 13 — Interpolation.ipynb
├── Exercise 13b — Beyond the exercise.ipynb
├── Exercise 14 — Selective updating.ipynb
└── Exercise 14b — Beyond the exercise.ipynb
├── chapter-03
├── .ipynb_checkpoints
│   ├── Exercise 15 — Weird taxi rides-checkpoint.ipynb
│   ├── Exercise 15b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 16 — Pandemic taxis-checkpoint.ipynb
│   ├── Exercise 16b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 17 — Setting column types-checkpoint.ipynb
│   ├── Exercise 17b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 18 — passwd to df-checkpoint.ipynb
│   ├── Exercise 18b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 19 — Bitcoin values-checkpoint.ipynb
│   ├── Exercise 19b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 20 — Big cities-checkpoint.ipynb
│   └── Exercise 20b — Beyond the exercise-checkpoint.ipynb
├── Exercise 15 — Weird taxi rides.ipynb
├── Exercise 15b — Beyond the exercise.ipynb
├── Exercise 16 — Pandemic taxis.ipynb
├── Exercise 16b — Beyond the exercise.ipynb
├── Exercise 17 — Setting column types.ipynb
├── Exercise 17b — Beyond the exercise.ipynb
├── Exercise 18 — passwd to df.ipynb
├── Exercise 18b — Beyond the exercise.ipynb
├── Exercise 19 — Bitcoin values.ipynb
├── Exercise 19b — Beyond the exercise.ipynb
├── Exercise 20 — Big cities.ipynb
└── Exercise 20b — Beyond the exercise.ipynb
├── chapter-04
├── .ipynb_checkpoints
│   ├── Exercise 21 — Parking tickets-checkpoint.ipynb
│   ├── Exercise 21b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 22 — SAT scores-checkpoint.ipynb
│   ├── Exercise 22b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 23 — Olympic games-checkpoint.ipynb
│   ├── Exercise 23b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 24 — More Olympic stats-checkpoint.ipynb
│   ├── Exercise 24b — Beyond the exercise-checkpoint.ipynb
│   └── Sandbox-checkpoint.ipynb
├── Exercise 21 — Parking tickets.ipynb
├── Exercise 21b — Beyond the exercise.ipynb
├── Exercise 22 — SAT scores.ipynb
├── Exercise 22b — Beyond the exercise.ipynb
├── Exercise 23 — Olympic games.ipynb
├── Exercise 23b — Beyond the exercise.ipynb
├── Exercise 24 — More Olympic stats.ipynb
├── Exercise 24b — Beyond the exercise.ipynb
└── Sandbox.ipynb
├── chapter-05
├── .ipynb_checkpoints
│   ├── Exercise 25 — Parking cleanup-checkpoint.ipynb
│   ├── Exercise 25b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 26 — Celebrity deaths-checkpoint.ipynb
│   ├── Exercise 26b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 27 — Titanic interpolation-checkpoint.ipynb
│   ├── Exercise 27b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 28 — Inconsistent data-checkpoint.ipynb
│   └── Exercise 28b — Beyond the exercise-checkpoint.ipynb
├── Exercise 25 — Parking cleanup.ipynb
├── Exercise 25b — Beyond the exercise.ipynb
├── Exercise 26 — Celebrity deaths.ipynb
├── Exercise 26b — Beyond the exercise.ipynb
├── Exercise 27 — Titanic interpolation.ipynb
├── Exercise 27b — Beyond the exercise.ipynb
├── Exercise 28 — Inconsistent data.ipynb
└── Exercise 28b — Beyond the exercise.ipynb
├── chapter-06
├── .ipynb_checkpoints
│   ├── Exercise 29 — Longest taxi rides-checkpoint.ipynb
│   ├── Exercise 29b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 30 — Taxi rides per passenger count-checkpoint.ipynb
│   ├── Exercise 30b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 31 — Tourist spending-checkpoint.ipynb
│   ├── Exercise 31b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 32 — Multi-city temperatures-checkpoint.ipynb
│   ├── Exercise 32b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 33 — SAT scores per state-checkpoint.ipynb
│   ├── Exercise 33b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 34 — Rainy, snowy cities-checkpoint.ipynb
│   ├── Exercise 34b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 35 — Outer joins-checkpoint.ipynb
│   ├── Exercise 35b — Beyond the exercise-checkpoint.ipynb
│   └── Joining sidebar-checkpoint.ipynb
├── Exercise 29 — Longest taxi rides.ipynb
├── Exercise 29b — Beyond the exercise.ipynb
├── Exercise 30 — Taxi rides per passenger count.ipynb
├── Exercise 30b — Beyond the exercise.ipynb
├── Exercise 31 — Tourist spending.ipynb
├── Exercise 31b — Beyond the exercise.ipynb
└── Joining sidebar.ipynb
├── chapter-07
├── Exercise 32 — Multi-city temperatures.ipynb
├── Exercise 32b — Beyond the exercise.ipynb
├── Exercise 33 — SAT scores per state.ipynb
├── Exercise 33b — Beyond the exercise.ipynb
├── Exercise 34 — Rainy, snowy cities.ipynb
├── Exercise 34b — Beyond the exercise.ipynb
├── Exercise 35 — Outer joins.ipynb
└── Exercise 35b — Beyond the exercise.ipynb
├── chapter-08
├── .ipynb_checkpoints
│   └── Chapter 7 — project-checkpoint.ipynb
└── Chapter 7 — project.ipynb
├── chapter-09
├── .ipynb_checkpoints
│   ├── Exercise 36 — Analyzing Alice-checkpoint.ipynb
│   ├── Exercise 36b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 37 — Wine words-checkpoint.ipynb
│   ├── Exercise 37b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 38 — Programming languages-checkpoint.ipynb
│   └── Exercise 38b — Beyond the exercise-checkpoint.ipynb
├── Exercise 36 — Analyzing Alice.ipynb
├── Exercise 36b — Beyond the exercise.ipynb
├── Exercise 37 — Wine words.ipynb
├── Exercise 37b — Beyond the exercise.ipynb
├── Exercise 38 — Programming languages.ipynb
└── Exercise 38b — Beyond the exercise.ipynb
├── chapter-10
├── .ipynb_checkpoints
│   ├── Exercise 39 — Long taxi rides-checkpoint.ipynb
│   ├── Exercise 39b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 40 — Writing dates, reading dates-checkpoint.ipynb
│   ├── Exercise 40b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 41 — Oil prices-checkpoint.ipynb
│   ├── Exercise 41b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 42 — Best tippers-checkpoint.ipynb
│   ├── Exercise 42b — Beyond the exercise-checkpoint.ipynb
│   └── Untitled-checkpoint.ipynb
├── Exercise 39 — Long taxi rides.ipynb
├── Exercise 39b — Beyond the exercise.ipynb
├── Exercise 40 — Writing dates, reading dates.ipynb
├── Exercise 40b — Beyond the exercise.ipynb
├── Exercise 41 — Oil prices.ipynb
├── Exercise 41b — Beyond the exercise.ipynb
├── Exercise 42 — Best tippers.ipynb
├── Exercise 42b — Beyond the exercise.ipynb
└── Untitled.ipynb
├── chapter-11
├── .ipynb_checkpoints
│   ├── Exercise 43 — Cities-checkpoint.ipynb
│   ├── Exercise 43b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 44 — Boxplotting weather-checkpoint.ipynb
│   ├── Exercise 44b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 45 — Taxi fare parts-checkpoint.ipynb
│   ├── Exercise 45b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 46 — Cars, Oil, and ice cream-checkpoint.ipynb
│   ├── Exercise 46b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 47 — Seaborn taxi plots-checkpoint.ipynb
│   ├── Exercise 47a — Seaborn sidebar-checkpoint.ipynb
│   ├── Exercise 47b — Beyond the exercise-checkpoint.ipynb
│   └── Untitled-checkpoint.ipynb
├── Exercise 43 — Cities.ipynb
├── Exercise 43b — Beyond the exercise.ipynb
├── Exercise 44 — Boxplotting weather.ipynb
├── Exercise 44b — Beyond the exercise.ipynb
├── Exercise 45 — Taxi fare parts.ipynb
├── Exercise 45b — Beyond the exercise.ipynb
├── Exercise 46 — Cars, Oil, and ice cream.ipynb
├── Exercise 46b — Beyond the exercise.ipynb
├── Exercise 47 — Seaborn taxi plots.ipynb
├── Exercise 47a — Seaborn sidebar.ipynb
├── Exercise 47b — Beyond the exercise.ipynb
└── Untitled.ipynb
├── chapter-12
├── .gitignore
├── .ipynb_checkpoints
│   ├── Exercise 48 — Categories-checkpoint.ipynb
│   ├── Exercise 48b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 49 — Faster reading and writing-checkpoint.ipynb
│   ├── Exercise 49b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 50 — Faster queries with query-checkpoint.ipynb
│   └── Exercise 50b — Beyond the exercise-checkpoint.ipynb
├── Exercise 48 — Categories.ipynb
├── Exercise 48b — Beyond the exercise.ipynb
├── Exercise 
49 — Faster reading and writing.ipynb ├── Exercise 49b — Beyond the exercise.ipynb ├── Exercise 50 — Faster queries with query.ipynb ├── Exercise 50b — Beyond the exercise.ipynb └── dask-worker-space │ ├── global.lock │ └── purge.lock └── chapter-13 ├── .ipynb_checkpoints └── Final project-checkpoint.ipynb └── Final project.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | *.zip 2 | *~ 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Jupyter notebooks for Pandas Workout (https://PandasWorkout.com/) 2 | 3 | These are the Jupyter notebooks for my book, Pandas Workout. The data for the exercises are stored separately, at https://files.lerner.co.il/pandas-workout-data.zip . 4 | 5 | Did you like the book? Please help me to spread the word, by reviewing it on Amazon (https://www.amazon.com/Pandas-Workout-Reuven-Lerner/dp/1617299723). 6 | 7 | Other links that might be of interest to you: 8 | 9 | - "Better developers," with new articles about Python each week: https://BetterDevelopersWeekly.com/ 10 | - "Bamboo Weekly," with Pandas puzzles based on current events: https://BambooWeekly.com 11 | - My previous book, "Python Workout": https://PythonWorkout.com/ 12 | - My YouTube channel: https://YouTube.com/reuvenlerner 13 | - My Twitter feed: https://Twitter.com/reuvenmlerner 14 | - My online courses: https://LernerPython.com 15 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 01 — Test scores-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "Sep 96\n", 23 | "Oct 89\n", 24 | "Nov 85\n", 25 | "Dec 78\n", 26 | "Jan 79\n", 27 | "Feb 71\n", 28 | "Mar 72\n", 29 | "Apr 70\n", 30 | "May 75\n", 31 | "Jun 95\n", 32 | "dtype: int64" 33 | ] 34 | }, 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "g = np.random.default_rng(0)\n", 42 | "months = 'Sep Oct Nov Dec Jan Feb Mar Apr May Jun'.split()\n", 43 | "\n", 44 | "s = Series(g.integers(70, 101, 10),\n", 45 | " index=months)\n", 46 | "s" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "Entire year average: 81.0\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "print(f'Entire year average: {s.mean()}')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": { 70 | "scrolled": true 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "Yearly average: 81.0\n", 78 | "First half average: 85.4\n", 79 | "Second half average: 76.6\n", 80 | "Improvement: -8.800000000000011\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "first_half_average = s['Sep':'Jan'].mean()\n", 86 | "second_half_average = s['Feb':'Jun'].mean()\n", 87 | "\n", 88 | "print(f'Yearly average: 
{s.mean()}')\n", 89 | "\n", 90 | "print(f'First half average: {first_half_average}')\n", 91 | "print(f'Second half average: {second_half_average}')\n", 92 | "\n", 93 | "print(f'Improvement: {second_half_average - first_half_average}')" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3 (ipykernel)", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.11.6" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 4 125 | } 126 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 01b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "g = np.random.default_rng(0)\n", 14 | "months = 'Sep Oct Nov Dec Jan Feb Mar Apr May Jun'.split()\n", 15 | "\n", 16 | "s = Series(g.integers(70, 100, 10),\n", 17 | " index=months)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Beyond 1\n", 25 | "\n", 26 | "In which month did this student get their highest score? Note that there are at least two ways to accomplish this: You can sort the values, taking the largest one, or you can use a boolean (\"mask\") index to find those rows that match the value of `s.max()`, the highest value." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "'Sep'" 38 | ] 39 | }, 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "# Option 1\n", 47 | "s.sort_values(ascending=False).index[0]" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "'Sep'" 59 | ] 60 | }, 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "# Option 2\n", 68 | "s[s==s.max()].index[0]" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "'Sep'" 80 | ] 81 | }, 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "# Option 3\n", 89 | "s.idxmax()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "# Beyond 2\n", 97 | "\n", 98 | "What were this student's five highest scores in the year?" 
99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "Sep 95\n", 110 | "Jun 94\n", 111 | "Oct 89\n", 112 | "Nov 85\n", 113 | "Jan 79\n", 114 | "dtype: int64" 115 | ] 116 | }, 117 | "execution_count": 5, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "s.sort_values(ascending=False).head(5)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "# Beyond 3\n", 131 | "\n", 132 | "Round the student's scores to the nearest 10. So a score of 82 would be rounded down to 80, but a score of 87 would be rounded up to 90." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 6, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "Sep 100\n", 144 | "Oct 90\n", 145 | "Nov 80\n", 146 | "Dec 80\n", 147 | "Jan 80\n", 148 | "Feb 70\n", 149 | "Mar 70\n", 150 | "Apr 70\n", 151 | "May 80\n", 152 | "Jun 90\n", 153 | "dtype: int64" 154 | ] 155 | }, 156 | "execution_count": 6, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "# The \"round\" method, when given a positive integer argument, rounds numbers after the\n", 163 | "# decimal point. When given a negative integer argument, it rounds numbers *before* the decimal point!\n", 164 | "\n", 165 | "s.round(-1) " 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3 (ipykernel)", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.11.6" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 4 197 | } 198 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 02 — Scaling test scores-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "Sep 57\n", 23 | "Oct 52\n", 24 | "Nov 50\n", 25 | "Dec 45\n", 26 | "Jan 46\n", 27 | "Feb 40\n", 28 | "Mar 41\n", 29 | "Apr 40\n", 30 | "May 43\n", 31 | "Jun 56\n", 32 | "dtype: int64" 33 | ] 34 | }, 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "g = np.random.default_rng(0)\n", 42 | "months = 'Sep Oct Nov Dec Jan Feb Mar Apr May Jun'.split()\n", 43 | "\n", 44 | "s = Series(g.integers(40, 60, 10),\n", 45 | " index=months)\n", 46 | "s" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "Sep 90.0\n", 58 | "Oct 85.0\n", 59 | "Nov 83.0\n", 60 | 
"Dec 78.0\n", 61 | "Jan 79.0\n", 62 | "Feb 73.0\n", 63 | "Mar 74.0\n", 64 | "Apr 73.0\n", 65 | "May 76.0\n", 66 | "Jun 89.0\n", 67 | "dtype: float64" 68 | ] 69 | }, 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "s + (80 - s.mean())" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": "Python 3 (ipykernel)", 90 | "language": "python", 91 | "name": "python3" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 3 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.11.6" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 4 108 | } 109 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 03 — Counting 10s digits-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0 85\n", 12 | "1 63\n", 13 | "2 51\n", 14 | "3 26\n", 15 | "4 30\n", 16 | "5 4\n", 17 | "6 7\n", 18 | "7 1\n", 19 | "8 17\n", 20 | "9 81\n", 21 | "dtype: int64" 22 | ] 23 | }, 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "output_type": "execute_result" 27 | } 28 | ], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import pandas as pd\n", 32 | "from pandas import Series, DataFrame\n", 33 | "\n", 34 | "g = np.random.default_rng(0)\n", 35 | "s = Series(g.integers(0, 100, 10))\n", 36 | "s" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "0 8\n", 48 | "1 6\n", 49 | "2 5\n", 50 | "3 2\n", 51 | "4 3\n", 52 | "5 0\n", 53 | "6 0\n", 54 | "7 0\n", 55 | "8 1\n", 56 | "9 8\n", 57 | "dtype: int8" 58 | ] 59 | }, 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "# solution 1, using /\n", 67 | "(s / 10).astype(np.int8)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "0 8\n", 79 | "1 6\n", 80 | "2 5\n", 81 | "3 2\n", 82 | "4 3\n", 83 | "5 0\n", 84 | "6 0\n", 85 | "7 0\n", 86 | "8 1\n", 87 | "9 8\n", 88 | "dtype: int64" 89 | ] 90 | }, 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "# solution 2, using //\n", 98 | "(s // 10)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 4, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "0 8\n", 110 | "1 6\n", 111 | "2 5\n", 112 | "3 2\n", 113 | "4 3\n", 114 | "5 0\n", 115 | "6 0\n", 116 | "7 0\n", 117 | "8 1\n", 118 | "9 8\n", 119 | "dtype: object" 120 | ] 121 | }, 122 | "execution_count": 4, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "# solution 3, partial\n", 129 | "s.astype(str).str.get(-2).fillna('0')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 5, 135 | "metadata": {}, 136 
| "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "0 8\n", 141 | "1 6\n", 142 | "2 5\n", 143 | "3 2\n", 144 | "4 3\n", 145 | "5 0\n", 146 | "6 0\n", 147 | "7 0\n", 148 | "8 1\n", 149 | "9 8\n", 150 | "dtype: int8" 151 | ] 152 | }, 153 | "execution_count": 5, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "# solution 3, complete\n", 160 | "s.astype(str).str.get(-2).fillna('0').astype(np.int8)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 6, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "0 8\n", 172 | "1 6\n", 173 | "2 5\n", 174 | "3 2\n", 175 | "4 3\n", 176 | "5 0\n", 177 | "6 0\n", 178 | "7 0\n", 179 | "8 1\n", 180 | "9 8\n", 181 | "dtype: int8" 182 | ] 183 | }, 184 | "execution_count": 6, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "(\n", 191 | " s\n", 192 | " .astype(str) # get a series based on s, with dtype str\n", 193 | " .str.get(-2) # retrieve the second-to-last character\n", 194 | " .fillna('0') # replace NaN with '0'\n", 195 | " .astype(np.int8) # get a new series back dtype int8\n", 196 | ")\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [] 205 | } 206 | ], 207 | "metadata": { 208 | "kernelspec": { 209 | "display_name": "Python 3 (ipykernel)", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "codemirror_mode": { 215 | "name": "ipython", 216 | "version": 3 217 | }, 218 | "file_extension": ".py", 219 | "mimetype": "text/x-python", 220 | "name": "python", 221 | "nbconvert_exporter": "python", 222 | "pygments_lexer": "ipython3", 223 | "version": "3.11.6" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 4 228 | } 229 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 04 — Descriptive statistics-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0 12.573022\n", 12 | "1 -13.210486\n", 13 | "2 64.042265\n", 14 | "3 10.490012\n", 15 | "4 -53.566937\n", 16 | " ... 
\n", 17 | "99995 -91.667135\n", 18 | "99996 -231.480500\n", 19 | "99997 -0.028179\n", 20 | "99998 -109.645051\n", 21 | "99999 -49.541294\n", 22 | "Length: 100000, dtype: float64" 23 | ] 24 | }, 25 | "execution_count": 5, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from pandas import Series, DataFrame\n", 34 | "\n", 35 | "g = np.random.default_rng(0)\n", 36 | "s = Series(g.normal(0, 100, 100_000))\n", 37 | "s" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 6, 43 | "metadata": { 44 | "scrolled": true 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "count 100000.000000\n", 51 | "mean -0.090825\n", 52 | "std 100.013350\n", 53 | "min -449.411704\n", 54 | "25% -67.292120\n", 55 | "50% -0.414699\n", 56 | "75% 67.636542\n", 57 | "max 473.195769\n", 58 | "dtype: float64" 59 | ] 60 | }, 61 | "execution_count": 6, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "s.describe()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 7, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "s.loc[s == s.min()] = 5*s.max()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": { 83 | "scrolled": true 84 | }, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "count 100000.000000\n", 90 | "mean -0.062671\n", 91 | "std 100.282770\n", 92 | "min -402.315865\n", 93 | "25% -67.288054\n", 94 | "50% -0.409289\n", 95 | "75% 67.640758\n", 96 | "max 2365.978844\n", 97 | "dtype: float64" 98 | ] 99 | }, 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "s.describe()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3 (ipykernel)", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.11.6" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 4 138 | } 139 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 04b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0 12.573022\n", 12 | "1 -13.210486\n", 13 | "2 64.042265\n", 14 | "3 10.490012\n", 15 | "4 -53.566937\n", 16 | " ... 
\n", 17 | "99995 -91.667135\n", 18 | "99996 -231.480500\n", 19 | "99997 -0.028179\n", 20 | "99998 -109.645051\n", 21 | "99999 -49.541294\n", 22 | "Length: 100000, dtype: float64" 23 | ] 24 | }, 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from pandas import Series, DataFrame\n", 34 | "\n", 35 | "g = np.random.default_rng(0)\n", 36 | "s = Series(g.normal(0, 100, 100_000))\n", 37 | "s" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "# Beyond 1\n", 45 | "\n", 46 | "Demonstrate that 68%, 95%, and 99.7% of the values in `s` are indeed within 1, 2, and 3 standard distributions of the mean." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "0.68396" 58 | ] 59 | }, 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "# within one standard deviation\n", 67 | "s[(s > s.mean() - s.std()) &\n", 68 | " (s < s.mean() + s.std())].count() / s.count()" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "0.95461" 80 | ] 81 | }, 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "# within two standard deviations\n", 89 | "s[(s > s.mean() - 2*s.std()) &\n", 90 | " (s < s.mean() + 2*s.std())].count() / s.count()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "0.99708" 102 | ] 103 | }, 104 | "execution_count": 4, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "# within three standard deviations\n", 111 | "s[(s > s.mean() - 3*s.std()) &\n", 112 | " (s < s.mean() + 3*s.std())].count() / s.count()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "# Beyond 2\n", 120 | "\n", 121 | " Calculate the mean of numbers greater than `s.mean()`. Then calculate the mean of numbers less than `s.mean()`. Is the average of these two numbers the same as `s.mean()`?" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "0.12941477214831565" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "(s[s < s.mean()].mean() + s[s > s.mean()].mean() ) / 2" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 6, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "-0.09082507731206121" 153 | ] 154 | }, 155 | "execution_count": 6, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "# They're pretty close!\n", 162 | "s.mean()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "# Beyond 3\n", 170 | "\n", 171 | "What is the mean of the numbers beyond 3 standard deviations?" 
172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 7, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "-11.606040282602287" 183 | ] 184 | }, 185 | "execution_count": 7, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "# A pretty complex combination of mask indexes,\n", 192 | "# but the result is still a series, on which we can run mean()\n", 193 | "s[(s < s.mean() - 3*s.std()) | \n", 194 | " (s > s.mean() + 3*s.std()) ].mean()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3 (ipykernel)", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.11.6" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 4 226 | } 227 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 05 — Monday temperatures-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "days = 'Sun Mon Tue Wed Thu Fri Sat'.split()\n", 14 | "\n", 15 | "g = np.random.default_rng(0)\n", 16 | "s = Series(g.normal(20, 5, 28),\n", 17 | " index=days*4).round().astype(np.int8)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/plain": [ 28 | "Sun 21\n", 29 | "Mon 19\n", 30 | "Tue 23\n", 31 | "Wed 21\n", 32 | "Thu 17\n", 33 | "Fri 22\n", 34 | "Sat 27\n", 35 | "Sun 25\n", 36 | "Mon 16\n", 37 | "Tue 14\n", 38 | "Wed 17\n", 39 | "Thu 20\n", 40 | "Fri 8\n", 41 | "Sat 19\n", 42 | "Sun 14\n", 43 | "Mon 16\n", 44 | "Tue 17\n", 45 | "Wed 18\n", 46 | "Thu 22\n", 47 | "Fri 25\n", 48 | "Sat 19\n", 49 | "Sun 27\n", 50 | "Mon 17\n", 51 | "Tue 22\n", 52 | "Wed 25\n", 53 | "Thu 20\n", 54 | "Fri 16\n", 55 | "Sat 15\n", 56 | "dtype: int8" 57 | ] 58 | }, 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "s" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "17.0" 77 | ] 78 | }, 79 | "execution_count": 3, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "s.loc['Mon'].mean()" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": "Python 3 (ipykernel)", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | 
"nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.11.6" 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 4 117 | } 118 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 05b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "days = 'Sun Mon Tue Wed Thu Fri Sat'.split()\n", 14 | "\n", 15 | "g = np.random.default_rng(0)\n", 16 | "s = Series(g.normal(20, 5, 28),\n", 17 | " index=days*4).round().astype(np.int8)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Beyond 1\n", 25 | "\n", 26 | "What was the average temperature on weekends (i.e., Saturdays and Sundays)?" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "20.875" 38 | ] 39 | }, 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "s[['Sun', 'Sat']].mean()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "# Beyond 2\n", 54 | "\n", 55 | "How many times will the change in temperature from the previous day be greater than 2 degrees?" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "Tue 23\n", 67 | "Fri 22\n", 68 | "Sat 27\n", 69 | "Wed 17\n", 70 | "Thu 20\n", 71 | "Sat 19\n", 72 | "Thu 22\n", 73 | "Fri 25\n", 74 | "Sun 27\n", 75 | "Tue 22\n", 76 | "Wed 25\n", 77 | "dtype: int8" 78 | ] 79 | }, 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "# by default, the \"diff\" method compares with the previous element\n", 87 | "s[s.diff() > 2]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "# Beyond 3\n", 95 | "\n", 96 | "What are the two most common temperatures, and how often does each appear?" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 4, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "17 4\n", 108 | "19 3\n", 109 | "Name: count, dtype: int64" 110 | ] 111 | }, 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "# value_counts returns a series in which the values from s are \n", 119 | "# the index, the number of appearances is the value, and the\n", 120 | "# items are ordered from most common to least common. 
We can\n", 121 | "# then use \"head\" to get only the 2 most common values.\n", 122 | "s.value_counts().head(2)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3 (ipykernel)", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.11.6" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } 155 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 06 — Passenger frequency-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0\n", 12 | "1 0.720772\n", 13 | "6 0.036904\n", 14 | "Name: proportion, dtype: float64" 15 | ] 16 | }, 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "output_type": "execute_result" 20 | } 21 | ], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "from pandas import Series, DataFrame\n", 26 | "\n", 27 | "s = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()\n", 28 | "\n", 29 | "s.value_counts(normalize=True)[[1, 6]]" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "0\n", 41 | "1 7207\n", 42 | "2 1313\n", 43 | "5 520\n", 44 | "3 406\n", 45 | "6 369\n", 46 | "4 182\n", 47 | "0 2\n", 48 | "Name: count, dtype: int64" 49 | ] 50 | }, 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "s.value_counts()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "0\n", 69 | "1 0.720772\n", 70 | "6 0.036904\n", 71 | "Name: proportion, dtype: float64" 72 | ] 73 | }, 74 | "execution_count": 3, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "s.value_counts(normalize=True)[[1,6]]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "Python 3 (ipykernel)", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.11.6" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 4 112 | } 113 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 06b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | 
"execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Beyond 1\n", 21 | "\n", 22 | "What are the 25%, 50% (median), and 75% quantiles for this data set? Can you guess the results before you execute the code?" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "0.25 1.0\n", 34 | "0.50 1.0\n", 35 | "0.75 2.0\n", 36 | "Name: 0, dtype: float64" 37 | ] 38 | }, 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "output_type": "execute_result" 42 | } 43 | ], 44 | "source": [ 45 | "# Since 1-passenger rides are 72% of the values, we can\n", 46 | "# guess that the 25% and 50% marks will be 1, whereas \n", 47 | "# the 75% mark will be 2 or 3, depending on how common those are.\n", 48 | "s.quantile([.25, .50, .75])" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# Beyond 2\n", 56 | "\n", 57 | "What proportion of taxi rides are for 3, 4, 5, or 6 passengers?" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "0.1477147714771477" 69 | ] 70 | }, 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "s.value_counts(normalize=True)[[3,4,5,6]].sum()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# Beyond 3\n", 85 | "\n", 86 | "Consider that you're in charge of vehicle licensing for New York taxis. Given these numbers, would more people benefit from smaller taxis that can take only one or two passengers, or larger taxis that can take five or six passengers?" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Given that a huge proportion of rides are for 1 or 2 passengers, licensing more small taxis would seem to match the needs." 
94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3 (ipykernel)", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.11.6" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 4 125 | } 126 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 07 — Long, medium, and short rides-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/taxi-distance.csv', header=None).squeeze()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "0\n", 25 | "short 5890\n", 26 | "medium 3402\n", 27 | "long 707\n", 28 | "Name: count, dtype: int64" 29 | ] 30 | }, 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "output_type": "execute_result" 34 | } 35 | ], 36 | "source": [ 37 | "pd.cut(s, \n", 38 | " bins=[0, 2, 10, s.max()], \n", 39 | " include_lowest=True,\n", 40 | " labels=['short', 'medium', 'long']).value_counts()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [] 49 | } 50 | ], 51 | "metadata": { 52 | "kernelspec": { 53 | "display_name": "Python 3 (ipykernel)", 54 | "language": "python", 55 | "name": "python3" 56 | }, 57 | "language_info": { 58 | "codemirror_mode": { 59 | "name": "ipython", 60 | "version": 3 61 | }, 62 | "file_extension": ".py", 63 | "mimetype": "text/x-python", 64 | "name": "python", 65 | "nbconvert_exporter": "python", 66 | "pygments_lexer": "ipython3", 67 | "version": "3.11.6" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 4 72 | } 73 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 07b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/taxi-distance.csv', header=None).squeeze()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Beyond 1\n", 21 | "\n", 22 | "Compare the mean and median trip distances. What does that tell you about the distribution of our data?" 
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": { 29 | "scrolled": true 30 | }, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "count 9999.000000\n", 36 | "mean 3.158511\n", 37 | "std 4.037516\n", 38 | "min 0.000000\n", 39 | "25% 1.000000\n", 40 | "50% 1.700000\n", 41 | "75% 3.300000\n", 42 | "max 64.600000\n", 43 | "Name: 0, dtype: float64" 44 | ] 45 | }, 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "output_type": "execute_result" 49 | } 50 | ], 51 | "source": [ 52 | "s.describe()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "Because the mean is significantly higher than the median, it would seem that there are some *very* long trips in our data set that are pulling the mean up. And sure enough, we see that the standard deviation is 4, but that we have at least one trip > 64 miles in length." 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "# Beyond 2\n", 67 | "\n", 68 | "How many short, medium, and long trips were there for trips that had only one passenger? Note that data for passenger count and trip length are from the same data set, meaning that the indexes are the same." 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "0\n", 80 | "short 4333\n", 81 | "medium 2387\n", 82 | "long 487\n", 83 | "Name: count, dtype: int64" 84 | ] 85 | }, 86 | "execution_count": 3, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "passenger_count = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()\n", 93 | "\n", 94 | "pd.cut(s[passenger_count == 1], \n", 95 | " bins=[s.min(), 2, 10, s.max()], \n", 96 | " include_lowest=True,\n", 97 | " labels=['short', 'medium', 'long']).value_counts()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "# Beyond 3\n", 105 | "\n", 106 | "What happens if we don't pass explicit intervals, and instead ask `pd.cut` to just create 3 bins, with `bins=3`?" 
107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "array([-0.0646 , 21.53333333, 43.06666667, 64.6 ])" 118 | ] 119 | }, 120 | "execution_count": 4, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "passenger_count = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()\n", 127 | "\n", 128 | "pd.cut(s[passenger_count == 1], \n", 129 | " bins=3,\n", 130 | " labels=['short', 'medium', 'long'], retbins=True)[-1]" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 5, 136 | "metadata": { 137 | "scrolled": true 138 | }, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "0\n", 144 | "short 7179\n", 145 | "medium 26\n", 146 | "long 2\n", 147 | "Name: count, dtype: int64" 148 | ] 149 | }, 150 | "execution_count": 5, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "pd.cut(s[passenger_count == 1], \n", 157 | " bins=3,\n", 158 | " labels=['short', 'medium', 'long']).value_counts()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "`pd.cut` took the interval from `s.min()` to `s.max()`, divided it into three equal parts, and assigned those to be `short`, `medium`, and `long`. We can see, though, that this meant our `long` category is from 43 miles to 64.6 miles -- numerically one-third of the values' interval, but only including a handful of values!" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3 (ipykernel)", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.11.6" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 4 190 | } 191 | -------------------------------------------------------------------------------- /chapter-01/Exercise 01 — Test scores.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "Sep 96\n", 23 | "Oct 89\n", 24 | "Nov 85\n", 25 | "Dec 78\n", 26 | "Jan 79\n", 27 | "Feb 71\n", 28 | "Mar 72\n", 29 | "Apr 70\n", 30 | "May 75\n", 31 | "Jun 95\n", 32 | "dtype: int64" 33 | ] 34 | }, 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "g = np.random.default_rng(0)\n", 42 | "months = 'Sep Oct Nov Dec Jan Feb Mar Apr May Jun'.split()\n", 43 | "\n", 44 | "s = Series(g.integers(70, 101, 10),\n", 45 | " index=months)\n", 46 | "s" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "Entire year average: 81.0\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "print(f'Entire 
year average: {s.mean()}')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": { 70 | "scrolled": true 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "Yearly average: 81.0\n", 78 | "First half average: 85.4\n", 79 | "Second half average: 76.6\n", 80 | "Improvement: -8.800000000000011\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "first_half_average = s['Sep':'Jan'].mean()\n", 86 | "second_half_average = s['Feb':'Jun'].mean()\n", 87 | "\n", 88 | "print(f'Yearly average: {s.mean()}')\n", 89 | "\n", 90 | "print(f'First half average: {first_half_average}')\n", 91 | "print(f'Second half average: {second_half_average}')\n", 92 | "\n", 93 | "print(f'Improvement: {second_half_average - first_half_average}')" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3 (ipykernel)", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.11.6" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 4 125 | } 126 | -------------------------------------------------------------------------------- /chapter-01/Exercise 01b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "g = np.random.default_rng(0)\n", 14 | "months = 'Sep Oct Nov Dec Jan Feb Mar Apr May Jun'.split()\n", 15 | "\n", 16 | "s = Series(g.integers(70, 100, 10),\n", 17 | " index=months)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Beyond 1\n", 25 | "\n", 26 | "In which month did this student get their highest score? Note that there are at least two ways to accomplish this: You can sort the values, taking the largest one, or you can use a boolean (\"mask\") index to find those rows that match the value of `s.max()`, the highest value." 
27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "'Sep'" 38 | ] 39 | }, 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "# Option 1\n", 47 | "s.sort_values(ascending=False).index[0]" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "'Sep'" 59 | ] 60 | }, 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "# Option 2\n", 68 | "s[s==s.max()].index[0]" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "'Sep'" 80 | ] 81 | }, 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "# Option 3\n", 89 | "s.idxmax()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "# Beyond 2\n", 97 | "\n", 98 | "What were this student's five highest scores in the year?" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "Sep 95\n", 110 | "Jun 94\n", 111 | "Oct 89\n", 112 | "Nov 85\n", 113 | "Jan 79\n", 114 | "dtype: int64" 115 | ] 116 | }, 117 | "execution_count": 5, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "s.sort_values(ascending=False).head(5)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "# Beyond 3\n", 131 | "\n", 132 | "Round the student's scores to the nearest 10. So a score of 82 would be rounded down to 80, but a score of 87 would be rounded up to 90." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 6, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "Sep 100\n", 144 | "Oct 90\n", 145 | "Nov 80\n", 146 | "Dec 80\n", 147 | "Jan 80\n", 148 | "Feb 70\n", 149 | "Mar 70\n", 150 | "Apr 70\n", 151 | "May 80\n", 152 | "Jun 90\n", 153 | "dtype: int64" 154 | ] 155 | }, 156 | "execution_count": 6, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "# The \"round\" method, when given a positive integer argument, rounds numbers after the\n", 163 | "# decimal point. 
When given a negative integer argument, it rounds numbers *before* the decimal point!\n", 164 | "\n", 165 | "s.round(-1) " 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3 (ipykernel)", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.11.6" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 4 197 | } 198 | -------------------------------------------------------------------------------- /chapter-01/Exercise 02 — Scaling test scores.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "Sep 57\n", 23 | "Oct 52\n", 24 | "Nov 50\n", 25 | "Dec 45\n", 26 | "Jan 46\n", 27 | "Feb 40\n", 28 | "Mar 41\n", 29 | "Apr 40\n", 30 | "May 43\n", 31 | "Jun 56\n", 32 | "dtype: int64" 33 | ] 34 | }, 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "g = np.random.default_rng(0)\n", 42 | "months = 'Sep Oct Nov Dec Jan Feb Mar Apr May Jun'.split()\n", 43 | "\n", 44 | "s = Series(g.integers(40, 60, 10),\n", 45 | " index=months)\n", 46 | "s" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "Sep 90.0\n", 58 | "Oct 85.0\n", 59 | "Nov 83.0\n", 60 | "Dec 78.0\n", 61 | "Jan 79.0\n", 62 | "Feb 73.0\n", 63 | "Mar 74.0\n", 64 | "Apr 73.0\n", 65 | "May 76.0\n", 66 | "Jun 89.0\n", 67 | "dtype: float64" 68 | ] 69 | }, 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "s + (80 - s.mean())" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": "Python 3 (ipykernel)", 90 | "language": "python", 91 | "name": "python3" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 3 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.11.6" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 4 108 | } 109 | -------------------------------------------------------------------------------- /chapter-01/Exercise 03 — Counting 10s digits.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0 85\n", 12 | "1 63\n", 13 | "2 51\n", 14 | "3 26\n", 
15 | "4 30\n", 16 | "5 4\n", 17 | "6 7\n", 18 | "7 1\n", 19 | "8 17\n", 20 | "9 81\n", 21 | "dtype: int64" 22 | ] 23 | }, 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "output_type": "execute_result" 27 | } 28 | ], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import pandas as pd\n", 32 | "from pandas import Series, DataFrame\n", 33 | "\n", 34 | "g = np.random.default_rng(0)\n", 35 | "s = Series(g.integers(0, 100, 10))\n", 36 | "s" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "0 8\n", 48 | "1 6\n", 49 | "2 5\n", 50 | "3 2\n", 51 | "4 3\n", 52 | "5 0\n", 53 | "6 0\n", 54 | "7 0\n", 55 | "8 1\n", 56 | "9 8\n", 57 | "dtype: int8" 58 | ] 59 | }, 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "# solution 1, using /\n", 67 | "(s / 10).astype(np.int8)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "0 8\n", 79 | "1 6\n", 80 | "2 5\n", 81 | "3 2\n", 82 | "4 3\n", 83 | "5 0\n", 84 | "6 0\n", 85 | "7 0\n", 86 | "8 1\n", 87 | "9 8\n", 88 | "dtype: int64" 89 | ] 90 | }, 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "# solution 2, using //\n", 98 | "(s // 10)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 4, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "0 8\n", 110 | "1 6\n", 111 | "2 5\n", 112 | "3 2\n", 113 | "4 3\n", 114 | "5 0\n", 115 | "6 0\n", 116 | "7 0\n", 117 | "8 1\n", 118 | "9 8\n", 119 | "dtype: object" 120 | ] 121 | }, 122 | "execution_count": 4, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "# solution 3, partial\n", 129 | "s.astype(str).str.get(-2).fillna('0')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 5, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "0 8\n", 141 | "1 6\n", 142 | "2 5\n", 143 | "3 2\n", 144 | "4 3\n", 145 | "5 0\n", 146 | "6 0\n", 147 | "7 0\n", 148 | "8 1\n", 149 | "9 8\n", 150 | "dtype: int8" 151 | ] 152 | }, 153 | "execution_count": 5, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "# solution 3, complete\n", 160 | "s.astype(str).str.get(-2).fillna('0').astype(np.int8)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 6, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "0 8\n", 172 | "1 6\n", 173 | "2 5\n", 174 | "3 2\n", 175 | "4 3\n", 176 | "5 0\n", 177 | "6 0\n", 178 | "7 0\n", 179 | "8 1\n", 180 | "9 8\n", 181 | "dtype: int8" 182 | ] 183 | }, 184 | "execution_count": 6, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "(\n", 191 | " s\n", 192 | " .astype(str) # get a series based on s, with dtype str\n", 193 | " .str.get(-2) # retrieve the second-to-last character\n", 194 | " .fillna('0') # replace NaN with '0'\n", 195 | " .astype(np.int8) # get a new series back dtype int8\n", 196 | ")\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [] 205 | } 206 | ], 207 | "metadata": { 208 | "kernelspec": { 
209 | "display_name": "Python 3 (ipykernel)", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "codemirror_mode": { 215 | "name": "ipython", 216 | "version": 3 217 | }, 218 | "file_extension": ".py", 219 | "mimetype": "text/x-python", 220 | "name": "python", 221 | "nbconvert_exporter": "python", 222 | "pygments_lexer": "ipython3", 223 | "version": "3.11.6" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 4 228 | } 229 | -------------------------------------------------------------------------------- /chapter-01/Exercise 04 — Descriptive statistics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0 12.573022\n", 12 | "1 -13.210486\n", 13 | "2 64.042265\n", 14 | "3 10.490012\n", 15 | "4 -53.566937\n", 16 | " ... \n", 17 | "99995 -91.667135\n", 18 | "99996 -231.480500\n", 19 | "99997 -0.028179\n", 20 | "99998 -109.645051\n", 21 | "99999 -49.541294\n", 22 | "Length: 100000, dtype: float64" 23 | ] 24 | }, 25 | "execution_count": 5, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from pandas import Series, DataFrame\n", 34 | "\n", 35 | "g = np.random.default_rng(0)\n", 36 | "s = Series(g.normal(0, 100, 100_000))\n", 37 | "s" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 6, 43 | "metadata": { 44 | "scrolled": true 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "count 100000.000000\n", 51 | "mean -0.090825\n", 52 | "std 100.013350\n", 53 | "min -449.411704\n", 54 | "25% -67.292120\n", 55 | "50% -0.414699\n", 56 | "75% 67.636542\n", 57 | "max 473.195769\n", 58 | "dtype: float64" 59 | ] 60 | }, 61 | "execution_count": 6, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "s.describe()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 7, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "s.loc[s == s.min()] = 5*s.max()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": { 83 | "scrolled": true 84 | }, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "count 100000.000000\n", 90 | "mean -0.062671\n", 91 | "std 100.282770\n", 92 | "min -402.315865\n", 93 | "25% -67.288054\n", 94 | "50% -0.409289\n", 95 | "75% 67.640758\n", 96 | "max 2365.978844\n", 97 | "dtype: float64" 98 | ] 99 | }, 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "s.describe()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3 (ipykernel)", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.11.6" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 4 138 | } 139 | 
-------------------------------------------------------------------------------- /chapter-01/Exercise 04b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0         12.573022\n", 12 | "1        -13.210486\n", 13 | "2         64.042265\n", 14 | "3         10.490012\n", 15 | "4        -53.566937\n", 16 | "            ...    \n", 17 | "99995    -91.667135\n", 18 | "99996   -231.480500\n", 19 | "99997     -0.028179\n", 20 | "99998   -109.645051\n", 21 | "99999    -49.541294\n", 22 | "Length: 100000, dtype: float64" 23 | ] 24 | }, 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from pandas import Series, DataFrame\n", 34 | "\n", 35 | "g = np.random.default_rng(0)\n", 36 | "s = Series(g.normal(0, 100, 100_000))\n", 37 | "s" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "# Beyond 1\n", 45 | "\n", 46 | "Demonstrate that 68%, 95%, and 99.7% of the values in `s` are indeed within 1, 2, and 3 standard deviations of the mean." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "0.68396" 58 | ] 59 | }, 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "# within one standard deviation\n", 67 | "s[(s > s.mean() - s.std()) &\n", 68 | "  (s < s.mean() + s.std())].count() / s.count()" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "0.95461" 80 | ] 81 | }, 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "# within two standard deviations\n", 89 | "s[(s > s.mean() - 2*s.std()) &\n", 90 | "  (s < s.mean() + 2*s.std())].count() / s.count()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "0.99708" 102 | ] 103 | }, 104 | "execution_count": 4, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "# within three standard deviations\n", 111 | "s[(s > s.mean() - 3*s.std()) &\n", 112 | "  (s < s.mean() + 3*s.std())].count() / s.count()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "# Beyond 2\n", 120 | "\n", 121 | " Calculate the mean of numbers greater than `s.mean()`. Then calculate the mean of numbers less than `s.mean()`. Is the average of these two numbers the same as `s.mean()`?"
122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "0.12941477214831565" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "(s[s < s.mean()].mean() + s[s > s.mean()].mean() ) / 2" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 6, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "-0.09082507731206121" 153 | ] 154 | }, 155 | "execution_count": 6, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "# They're pretty close!\n", 162 | "s.mean()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "# Beyond 3\n", 170 | "\n", 171 | "What is the mean of the numbers beyond 3 standard deviations?" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 7, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "-11.606040282602287" 183 | ] 184 | }, 185 | "execution_count": 7, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "# A pretty complex combination of mask indexes,\n", 192 | "# but the result is still a series, on which we can run mean()\n", 193 | "s[(s < s.mean() - 3*s.std()) | \n", 194 | " (s > s.mean() + 3*s.std()) ].mean()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3 (ipykernel)", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.11.6" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 4 226 | } 227 | -------------------------------------------------------------------------------- /chapter-01/Exercise 05 — Monday temperatures.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "days = 'Sun Mon Tue Wed Thu Fri Sat'.split()\n", 14 | "\n", 15 | "g = np.random.default_rng(0)\n", 16 | "s = Series(g.normal(20, 5, 28),\n", 17 | " index=days*4).round().astype(np.int8)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/plain": [ 28 | "Sun 21\n", 29 | "Mon 19\n", 30 | "Tue 23\n", 31 | "Wed 21\n", 32 | "Thu 17\n", 33 | "Fri 22\n", 34 | "Sat 27\n", 35 | "Sun 25\n", 36 | "Mon 16\n", 37 | "Tue 14\n", 38 | "Wed 17\n", 39 | "Thu 20\n", 40 | "Fri 8\n", 41 | "Sat 19\n", 42 | "Sun 14\n", 43 | "Mon 16\n", 44 | "Tue 17\n", 45 | "Wed 18\n", 46 | "Thu 22\n", 47 | "Fri 25\n", 48 | "Sat 19\n", 49 | "Sun 27\n", 50 | "Mon 17\n", 51 | "Tue 22\n", 52 | "Wed 25\n", 53 | "Thu 20\n", 54 | "Fri 16\n", 55 | "Sat 15\n", 56 | "dtype: int8" 57 | ] 58 | }, 59 | 
"execution_count": 2, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "s" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "17.0" 77 | ] 78 | }, 79 | "execution_count": 3, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "s.loc['Mon'].mean()" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": "Python 3 (ipykernel)", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | "nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.11.6" 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 4 117 | } 118 | -------------------------------------------------------------------------------- /chapter-01/Exercise 05b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "days = 'Sun Mon Tue Wed Thu Fri Sat'.split()\n", 14 | "\n", 15 | "g = np.random.default_rng(0)\n", 16 | "s = Series(g.normal(20, 5, 28),\n", 17 | " index=days*4).round().astype(np.int8)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Beyond 1\n", 25 | "\n", 26 | "What was the average temperature on weekends (i.e., Saturdays and Sundays)?" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "20.875" 38 | ] 39 | }, 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "s[['Sun', 'Sat']].mean()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "# Beyond 2\n", 54 | "\n", 55 | "How many times will the change in temperature from the previous day be greater than 2 degrees?" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "Tue 23\n", 67 | "Fri 22\n", 68 | "Sat 27\n", 69 | "Wed 17\n", 70 | "Thu 20\n", 71 | "Sat 19\n", 72 | "Thu 22\n", 73 | "Fri 25\n", 74 | "Sun 27\n", 75 | "Tue 22\n", 76 | "Wed 25\n", 77 | "dtype: int8" 78 | ] 79 | }, 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "# by default, the \"diff\" method compares with the previous element\n", 87 | "s[s.diff() > 2]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "# Beyond 3\n", 95 | "\n", 96 | "What are the two most common temperatures, and how often does each appear?" 
97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 4, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "17 4\n", 108 | "19 3\n", 109 | "Name: count, dtype: int64" 110 | ] 111 | }, 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "# value_counts returns a series in which the values from s are \n", 119 | "# the index, the number of appearances is the value, and the\n", 120 | "# items are ordered from most common to least common. We can\n", 121 | "# then use \"head\" to get only the 2 most common values.\n", 122 | "s.value_counts().head(2)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3 (ipykernel)", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.11.6" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } 155 | -------------------------------------------------------------------------------- /chapter-01/Exercise 06 — Passenger frequency.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0\n", 12 | "1 0.720772\n", 13 | "6 0.036904\n", 14 | "Name: proportion, dtype: float64" 15 | ] 16 | }, 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "output_type": "execute_result" 20 | } 21 | ], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "from pandas import Series, DataFrame\n", 26 | "\n", 27 | "s = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()\n", 28 | "\n", 29 | "s.value_counts(normalize=True)[[1, 6]]" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "0\n", 41 | "1 7207\n", 42 | "2 1313\n", 43 | "5 520\n", 44 | "3 406\n", 45 | "6 369\n", 46 | "4 182\n", 47 | "0 2\n", 48 | "Name: count, dtype: int64" 49 | ] 50 | }, 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "s.value_counts()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "0\n", 69 | "1 0.720772\n", 70 | "6 0.036904\n", 71 | "Name: proportion, dtype: float64" 72 | ] 73 | }, 74 | "execution_count": 3, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "s.value_counts(normalize=True)[[1,6]]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "Python 3 (ipykernel)", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": 
".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.11.6" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 4 112 | } 113 | -------------------------------------------------------------------------------- /chapter-01/Exercise 06b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Beyond 1\n", 21 | "\n", 22 | "What are the 25%, 50% (median), and 75% quantiles for this data set? Can you guess the results before you execute the code?" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "0.25 1.0\n", 34 | "0.50 1.0\n", 35 | "0.75 2.0\n", 36 | "Name: 0, dtype: float64" 37 | ] 38 | }, 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "output_type": "execute_result" 42 | } 43 | ], 44 | "source": [ 45 | "# Since 1-passenger rides are 72% of the values, we can\n", 46 | "# guess that the 25% and 50% marks will be 1, whereas \n", 47 | "# the 75% mark will be 2 or 3, depending on how common those are.\n", 48 | "s.quantile([.25, .50, .75])" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# Beyond 2\n", 56 | "\n", 57 | "What proportion of taxi rides are for 3, 4, 5, or 6 passengers?" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "0.1477147714771477" 69 | ] 70 | }, 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "s.value_counts(normalize=True)[[3,4,5,6]].sum()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# Beyond 3\n", 85 | "\n", 86 | "Consider that you're in charge of vehicle licensing for New York taxis. Given these numbers, would more people benefit from smaller taxis that can take only one or two passengers, or larger taxis that can take five or six passengers?" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Given that a huge proportion of rides are for 1 or 2 passengers, licensing more small taxis would seem to match the needs." 
94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3 (ipykernel)", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.11.6" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 4 125 | } 126 | -------------------------------------------------------------------------------- /chapter-01/Exercise 07 — Long, medium, and short rides.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/taxi-distance.csv', header=None).squeeze()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "0\n", 25 | "short 5890\n", 26 | "medium 3402\n", 27 | "long 707\n", 28 | "Name: count, dtype: int64" 29 | ] 30 | }, 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "output_type": "execute_result" 34 | } 35 | ], 36 | "source": [ 37 | "pd.cut(s, \n", 38 | " bins=[0, 2, 10, s.max()], \n", 39 | " include_lowest=True,\n", 40 | " labels=['short', 'medium', 'long']).value_counts()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [] 49 | } 50 | ], 51 | "metadata": { 52 | "kernelspec": { 53 | "display_name": "Python 3 (ipykernel)", 54 | "language": "python", 55 | "name": "python3" 56 | }, 57 | "language_info": { 58 | "codemirror_mode": { 59 | "name": "ipython", 60 | "version": 3 61 | }, 62 | "file_extension": ".py", 63 | "mimetype": "text/x-python", 64 | "name": "python", 65 | "nbconvert_exporter": "python", 66 | "pygments_lexer": "ipython3", 67 | "version": "3.11.6" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 4 72 | } 73 | -------------------------------------------------------------------------------- /chapter-01/Exercise 07b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/taxi-distance.csv', header=None).squeeze()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Beyond 1\n", 21 | "\n", 22 | "Compare the mean and median trip distances. What does that tell you about the distribution of our data?" 
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": { 29 | "scrolled": true 30 | }, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "count 9999.000000\n", 36 | "mean 3.158511\n", 37 | "std 4.037516\n", 38 | "min 0.000000\n", 39 | "25% 1.000000\n", 40 | "50% 1.700000\n", 41 | "75% 3.300000\n", 42 | "max 64.600000\n", 43 | "Name: 0, dtype: float64" 44 | ] 45 | }, 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "output_type": "execute_result" 49 | } 50 | ], 51 | "source": [ 52 | "s.describe()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "Because the mean is significantly higher than the median, it would seem that there are some *very* long trips in our data set that are pulling the mean up. And sure enough, we see that the standard deviation is 4, but that we have at least one trip > 64 miles in length." 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "# Beyond 2\n", 67 | "\n", 68 | "How many short, medium, and long trips were there for trips that had only one passenger? Note that data for passenger count and trip length are from the same data set, meaning that the indexes are the same." 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "0\n", 80 | "short 4333\n", 81 | "medium 2387\n", 82 | "long 487\n", 83 | "Name: count, dtype: int64" 84 | ] 85 | }, 86 | "execution_count": 3, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "passenger_count = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()\n", 93 | "\n", 94 | "pd.cut(s[passenger_count == 1], \n", 95 | " bins=[s.min(), 2, 10, s.max()], \n", 96 | " include_lowest=True,\n", 97 | " labels=['short', 'medium', 'long']).value_counts()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "# Beyond 3\n", 105 | "\n", 106 | "What happens if we don't pass explicit intervals, and instead ask `pd.cut` to just create 3 bins, with `bins=3`?" 
107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "array([-0.0646 , 21.53333333, 43.06666667, 64.6 ])" 118 | ] 119 | }, 120 | "execution_count": 4, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "passenger_count = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()\n", 127 | "\n", 128 | "pd.cut(s[passenger_count == 1], \n", 129 | " bins=3,\n", 130 | " labels=['short', 'medium', 'long'], retbins=True)[-1]" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 5, 136 | "metadata": { 137 | "scrolled": true 138 | }, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "0\n", 144 | "short 7179\n", 145 | "medium 26\n", 146 | "long 2\n", 147 | "Name: count, dtype: int64" 148 | ] 149 | }, 150 | "execution_count": 5, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "pd.cut(s[passenger_count == 1], \n", 157 | " bins=3,\n", 158 | " labels=['short', 'medium', 'long']).value_counts()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "`pd.cut` took the interval from `s.min()` to `s.max()`, divided it into three equal parts, and assigned those to be `short`, `medium`, and `long`. We can see, though, that this meant our `long` category is from 43 miles to 64.6 miles -- numerically one-third of the values' interval, but only including a handful of values!" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3 (ipykernel)", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.12.1" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 4 190 | } 191 | -------------------------------------------------------------------------------- /chapter-02/.ipynb_checkpoints/Exercise 08 — Net revenue-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "from pandas import Series, DataFrame" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "
\n", 25 | "\n", 38 | "\n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | "
product_idnamewholesale_priceretail_pricesales
023computer500.01000100
196Python Workout35.0751000
297Pandas Workout35.075500
315banana0.51200
487sandwich3.05300
\n", 92 | "
" 93 | ], 94 | "text/plain": [ 95 | " product_id name wholesale_price retail_price sales\n", 96 | "0 23 computer 500.0 1000 100\n", 97 | "1 96 Python Workout 35.0 75 1000\n", 98 | "2 97 Pandas Workout 35.0 75 500\n", 99 | "3 15 banana 0.5 1 200\n", 100 | "4 87 sandwich 3.0 5 300" 101 | ] 102 | }, 103 | "execution_count": 2, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "df = DataFrame([{'product_id':23, 'name':'computer', 'wholesale_price': 500, \n", 110 | " 'retail_price':1000, 'sales':100},\n", 111 | " {'product_id':96, 'name':'Python Workout', 'wholesale_price': 35,\n", 112 | " 'retail_price':75, 'sales':1000},\n", 113 | " {'product_id':97, 'name':'Pandas Workout', 'wholesale_price': 35,\n", 114 | " 'retail_price':75, 'sales':500},\n", 115 | " {'product_id':15, 'name':'banana', 'wholesale_price': 0.5,\n", 116 | " 'retail_price':1, 'sales':200},\n", 117 | " {'product_id':87, 'name':'sandwich', 'wholesale_price': 3,\n", 118 | " 'retail_price':5, 'sales':300},\n", 119 | " ])\n", 120 | "\n", 121 | "df" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 3, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "110700.0" 133 | ] 134 | }, 135 | "execution_count": 3, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "((df['retail_price'] - df['wholesale_price']) * df['sales']).sum()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Python 3 (ipykernel)", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.11.6" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 4 173 | } 174 | -------------------------------------------------------------------------------- /chapter-02/.ipynb_checkpoints/Exercise 13 — Interpolation-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/nyc-temps.txt').squeeze()\n", 14 | "df = DataFrame({'temp': s, \n", 15 | " 'hour': [0,3,6,9,12,15,18,21] * 91})" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "count 728.000000\n", 27 | "mean -1.050824\n", 28 | "std 5.026357\n", 29 | "min -14.000000\n", 30 | "25% -4.000000\n", 31 | "50% 0.000000\n", 32 | "75% 2.000000\n", 33 | "max 12.000000\n", 34 | "Name: temp, dtype: float64" 35 | ] 36 | }, 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "# Get the mean + median for baseline data\n", 44 | "df['temp'].describe()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Set temp at 3 and 6 a.m. 
to NaN\n", 54 | "df.loc[\n", 55 | " df['hour'].isin([3, 6]),\n", 56 | " 'temp'\n", 57 | "] = np.NaN" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# Interpolate!\n", 67 | "df = df.interpolate()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 5, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "count 728.000000\n", 79 | "mean -1.050824\n", 80 | "std 5.026357\n", 81 | "min -14.000000\n", 82 | "25% -4.000000\n", 83 | "50% 0.000000\n", 84 | "75% 2.000000\n", 85 | "max 12.000000\n", 86 | "Name: temp, dtype: float64" 87 | ] 88 | }, 89 | "execution_count": 5, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "# Get the mean + median for when we're missing 3 and 6 a.m.\n", 96 | "df['temp'].describe()" 97 | ] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "Python 3 (ipykernel)", 103 | "language": "python", 104 | "name": "python3" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.11.6" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 4 121 | } 122 | -------------------------------------------------------------------------------- /chapter-02/.ipynb_checkpoints/Exercise 14 — Selective updating-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "s = pd.read_csv('../data/nyc-temps.txt').squeeze()\n", 13 | "df = DataFrame({'temp': s, \n", 14 | " 'hour': [0,3,6,9,12,15,18,21] * 91})" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "df.loc[\n", 24 | " df['temp'] < 0, \n", 25 | " 'temp'\n", 26 | "] = 0" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/html": [ 37 | "
\n", 38 | "\n", 51 | "\n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | "
temphour
count728.000000728.000000
mean1.43131910.500000
std2.3784246.878589
min0.0000000.000000
25%0.0000005.250000
50%0.00000010.500000
75%2.00000015.750000
max12.00000021.000000
\n", 102 | "
" 103 | ], 104 | "text/plain": [ 105 | " temp hour\n", 106 | "count 728.000000 728.000000\n", 107 | "mean 1.431319 10.500000\n", 108 | "std 2.378424 6.878589\n", 109 | "min 0.000000 0.000000\n", 110 | "25% 0.000000 5.250000\n", 111 | "50% 0.000000 10.500000\n", 112 | "75% 2.000000 15.750000\n", 113 | "max 12.000000 21.000000" 114 | ] 115 | }, 116 | "execution_count": 3, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "df.describe()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3 (ipykernel)", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.11.6" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } 155 | -------------------------------------------------------------------------------- /chapter-02/Exercise 08 — Net revenue.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "from pandas import Series, DataFrame" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "
\n", 25 | "\n", 38 | "\n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | "
product_idnamewholesale_priceretail_pricesales
023computer500.01000100
196Python Workout35.0751000
297Pandas Workout35.075500
315banana0.51200
487sandwich3.05300
\n", 92 | "
" 93 | ], 94 | "text/plain": [ 95 | " product_id name wholesale_price retail_price sales\n", 96 | "0 23 computer 500.0 1000 100\n", 97 | "1 96 Python Workout 35.0 75 1000\n", 98 | "2 97 Pandas Workout 35.0 75 500\n", 99 | "3 15 banana 0.5 1 200\n", 100 | "4 87 sandwich 3.0 5 300" 101 | ] 102 | }, 103 | "execution_count": 2, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "df = DataFrame([{'product_id':23, 'name':'computer', 'wholesale_price': 500, \n", 110 | " 'retail_price':1000, 'sales':100},\n", 111 | " {'product_id':96, 'name':'Python Workout', 'wholesale_price': 35,\n", 112 | " 'retail_price':75, 'sales':1000},\n", 113 | " {'product_id':97, 'name':'Pandas Workout', 'wholesale_price': 35,\n", 114 | " 'retail_price':75, 'sales':500},\n", 115 | " {'product_id':15, 'name':'banana', 'wholesale_price': 0.5,\n", 116 | " 'retail_price':1, 'sales':200},\n", 117 | " {'product_id':87, 'name':'sandwich', 'wholesale_price': 3,\n", 118 | " 'retail_price':5, 'sales':300},\n", 119 | " ])\n", 120 | "\n", 121 | "df" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 3, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "110700.0" 133 | ] 134 | }, 135 | "execution_count": 3, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "((df['retail_price'] - df['wholesale_price']) * df['sales']).sum()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Python 3 (ipykernel)", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.11.6" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 4 173 | } 174 | -------------------------------------------------------------------------------- /chapter-02/Exercise 13 — Interpolation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/nyc-temps.txt').squeeze()\n", 14 | "df = DataFrame({'temp': s, \n", 15 | " 'hour': [0,3,6,9,12,15,18,21] * 91})" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "count 728.000000\n", 27 | "mean -1.050824\n", 28 | "std 5.026357\n", 29 | "min -14.000000\n", 30 | "25% -4.000000\n", 31 | "50% 0.000000\n", 32 | "75% 2.000000\n", 33 | "max 12.000000\n", 34 | "Name: temp, dtype: float64" 35 | ] 36 | }, 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "# Get the mean + median for baseline data\n", 44 | "df['temp'].describe()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Set temp at 3 and 6 a.m. 
to NaN\n", 54 | "df.loc[\n", 55 | " df['hour'].isin([3, 6]),\n", 56 | " 'temp'\n", 57 | "] = np.NaN" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# Interpolate!\n", 67 | "df = df.interpolate()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 5, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "count 728.000000\n", 79 | "mean -1.050824\n", 80 | "std 5.026357\n", 81 | "min -14.000000\n", 82 | "25% -4.000000\n", 83 | "50% 0.000000\n", 84 | "75% 2.000000\n", 85 | "max 12.000000\n", 86 | "Name: temp, dtype: float64" 87 | ] 88 | }, 89 | "execution_count": 5, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "# Get the mean + median for when we're missing 3 and 6 a.m.\n", 96 | "df['temp'].describe()" 97 | ] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "Python 3 (ipykernel)", 103 | "language": "python", 104 | "name": "python3" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.11.6" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 4 121 | } 122 | -------------------------------------------------------------------------------- /chapter-02/Exercise 14 — Selective updating.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "s = pd.read_csv('../data/nyc-temps.txt').squeeze()\n", 13 | "df = DataFrame({'temp': s, \n", 14 | " 'hour': [0,3,6,9,12,15,18,21] * 91})" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "df.loc[\n", 24 | " df['temp'] < 0, \n", 25 | " 'temp'\n", 26 | "] = 0" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/html": [ 37 | "
\n", 38 | "\n", 51 | "\n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | "
temphour
count728.000000728.000000
mean1.43131910.500000
std2.3784246.878589
min0.0000000.000000
25%0.0000005.250000
50%0.00000010.500000
75%2.00000015.750000
max12.00000021.000000
\n", 102 | "
" 103 | ], 104 | "text/plain": [ 105 | " temp hour\n", 106 | "count 728.000000 728.000000\n", 107 | "mean 1.431319 10.500000\n", 108 | "std 2.378424 6.878589\n", 109 | "min 0.000000 0.000000\n", 110 | "25% 0.000000 5.250000\n", 111 | "50% 0.000000 10.500000\n", 112 | "75% 2.000000 15.750000\n", 113 | "max 12.000000 21.000000" 114 | ] 115 | }, 116 | "execution_count": 3, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "df.describe()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3 (ipykernel)", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.11.6" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } 155 | -------------------------------------------------------------------------------- /chapter-03/.ipynb_checkpoints/Exercise 15 — Weird taxi rides-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "df = pd.read_csv('../data/nyc_taxi_2019-01.csv',\n", 13 | " usecols=['passenger_count', 'trip_distance',\n", 14 | " 'total_amount', 'payment_type'])" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "9" 26 | ] 27 | }, 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "# How many taxi rides had more than 8 passengers?\n", 35 | "df.loc[df['passenger_count'] > 8, 'passenger_count'].count()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "117381" 47 | ] 48 | }, 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "# How many taxi rides had zero passengers?\n", 56 | "df.loc[\n", 57 | " df['passenger_count'] == 0, 'passenger_count'\n", 58 | "].count()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "5" 70 | ] 71 | }, 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "# How many taxi rides were paid for in cash, and cost more than $1,000?\n", 79 | "df.loc[\n", 80 | " (df['payment_type'] == 2) & (df['total_amount'] > 1000), \n", 81 | " 'passenger_count'\n", 82 | "].count()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "7131" 94 | ] 95 | }, 96 | "execution_count": 5, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "# How many rides cost less than 0?\n", 103 | "df.loc[\n", 104 | " 
df['total_amount'] < 0, \n", 105 | " 'total_amount'\n", 106 | "].count()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 6, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "411255" 118 | ] 119 | }, 120 | "execution_count": 6, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "# How many rides traveled a below-average distance, but cost an above-average amount?\n", 127 | "df.loc[((df['trip_distance'] < df['trip_distance'].mean()) &\n", 128 | " (df['total_amount'] > df['total_amount'].mean())), \n", 129 | " 'trip_distance'].count()" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3 (ipykernel)", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.11.6" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } 155 | -------------------------------------------------------------------------------- /chapter-03/.ipynb_checkpoints/Exercise 15b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "df = pd.read_csv('../data/nyc_taxi_2019-01.csv',\n", 13 | " usecols=['passenger_count', 'trip_distance',\n", 14 | " 'total_amount', 'payment_type'])" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# Beyond 1\n", 22 | "\n", 23 | "Repeat this exercise, but using the `query` method rather than a boolean index." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "9" 35 | ] 36 | }, 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "# How many taxi rides had more than 8 passengers (query version)\n", 44 | "df.query('passenger_count > 8')['passenger_count'].count()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/plain": [ 55 | "117381" 56 | ] 57 | }, 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "output_type": "execute_result" 61 | } 62 | ], 63 | "source": [ 64 | "# How many taxi rides had zero passengers (query version)\n", 65 | "df.query('passenger_count == 0')['passenger_count'].count()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "5" 77 | ] 78 | }, 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "# How many taxi rides were paid for in cash, and cost more than $1,000? 
(query version)\n", 86 | "df.query('payment_type == 2 & total_amount > 1000')['payment_type'].count()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "7131" 98 | ] 99 | }, 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "# How many rides cost less than 0? (query version)\n", 107 | "df.query('total_amount < 0')['total_amount'].count()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "411255" 119 | ] 120 | }, 121 | "execution_count": 6, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "# How many rides traveled a below-average distance, but cost an above-average amount?\n", 128 | "# (query version)\n", 129 | "df.query('trip_distance < trip_distance.mean() & total_amount > total_amount.mean()')['trip_distance'].count()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "# Beyond 2\n", 137 | "\n", 138 | "How many of the rides that cost less than 0 were indeed for either a dispute (`payment_type` of 4) or a voided trip (`payment_type` of 6)?" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 7, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "2666" 150 | ] 151 | }, 152 | "execution_count": 7, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "df.loc[(df['total_amount'] < 0) & \n", 159 | " ((df['payment_type'] == 4) | \n", 160 | " (df['payment_type'] == 6)), 'total_amount'].count()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "# Beyond 3\n", 168 | "\n", 169 | "I stated above that most people pay for their taxi rides using a credit card. Show this, and find what percentages normally pay in cash vs. a credit card." 
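A brief aside on Beyond 3 (a sketch under stated assumptions, not part of the notebook: it assumes the same `../data/nyc_taxi_2019-01.csv` file and the TLC convention that `payment_type` 1 means credit card and 2 means cash): mapping the numeric codes to labels before counting can make the answer easier to read.

```python
import pandas as pd

# Assumed path, matching the notebooks in this chapter
df = pd.read_csv('../data/nyc_taxi_2019-01.csv', usecols=['payment_type'])

# Map the TLC payment codes to labels; codes other than 1 and 2 become NaN
labels = df['payment_type'].map({1: 'credit card', 2: 'cash'})

# Share of rides per label; NaN (the other codes) is dropped by default
print(labels.value_counts(normalize=True))
```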
170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/plain": [ 180 | "payment_type\n", 181 | "1 0.715464\n", 182 | "2 0.278752\n", 183 | "Name: proportion, dtype: float64" 184 | ] 185 | }, 186 | "execution_count": 8, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "# 1 == credit card\n", 193 | "# 2 == cash\n", 194 | "\n", 195 | "df['payment_type'].value_counts(normalize=True)[[1,2]]" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "Python 3 (ipykernel)", 209 | "language": "python", 210 | "name": "python3" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.11.6" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 4 227 | } 228 | -------------------------------------------------------------------------------- /chapter-03/.ipynb_checkpoints/Exercise 16 — Pandemic taxis-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "df_2019_jul = pd.read_csv('../data/nyc_taxi_2019-07.csv',\n", 13 | " usecols=['passenger_count', \n", 14 | " 'total_amount', 'payment_type'])\n", 15 | "df_2019_jul['year'] = 2019\n", 16 | "\n", 17 | "df_2020_jul = pd.read_csv('../data/nyc_taxi_2020-07.csv',\n", 18 | " usecols=['passenger_count', \n", 19 | " 'total_amount', 'payment_type'])\n", 20 | "df_2020_jul['year'] = 2020\n", 21 | "\n", 22 | "df = pd.concat([df_2019_jul, df_2020_jul])" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "5510007" 34 | ] 35 | }, 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "# How many rides were taken in 2019 vs. 2020?\n", 43 | "(\n", 44 | " df.loc[df['year'] == 2019, 'total_amount'].count() -\n", 45 | " df.loc[df['year'] == 2020, 'total_amount'].count()\n", 46 | ")\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "108848979.24000001" 58 | ] 59 | }, 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "# How much money was collected in 2019 vs. 
2020?\n", 67 | "(\n", 68 | " df.loc[df['year'] == 2019, 'total_amount'].sum() -\n", 69 | " df.loc[df['year'] == 2020, 'total_amount'].sum()\n", 70 | ")\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "0.2833900000955953" 82 | ] 83 | }, 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "# Did the proportion of trips with more than passenger change dramatically?\n", 91 | "df.loc[(df['year'] == 2019) & \n", 92 | " (df['passenger_count'] > 1), 'passenger_count'].count() / df.loc[df['year'] == 2019, 'payment_type'].count()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "0.2061513222563435" 104 | ] 105 | }, 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "df.loc[(df['year'] == 2020) & \n", 113 | " (df['passenger_count'] > 1), 'passenger_count'].count() / df.loc[df['year'] == 2020, 'payment_type'].count()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "0.2870595845428793" 125 | ] 126 | }, 127 | "execution_count": 6, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "# Did people use cash less in 2019 or 2020?\n", 134 | "df.loc[(df['year'] == 2019) & \n", 135 | " (df['payment_type'] == 2), 'payment_type'].count() / df.loc[df['year'] == 2019, 'payment_type'].count()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 7, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "0.320558865998251" 147 | ] 148 | }, 149 | "execution_count": 7, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "df.loc[(df['year'] == 2020) & \n", 156 | " (df['payment_type'] == 2), 'payment_type'].count() / df.loc[df['year'] == 2020, 'payment_type'].count()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3 (ipykernel)", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.11.6" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 4 188 | } 189 | -------------------------------------------------------------------------------- /chapter-03/.ipynb_checkpoints/Exercise 20 — Big cities-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "filename = '../data/cities.json'\n", 13 | "df = pd.read_json(filename)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | 
"outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "mean 131132.443\n", 25 | "50% 68207.000\n", 26 | "Name: population, dtype: float64" 27 | ] 28 | }, 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "# What are the mean and median populations for these 1,000 largest cities?\n", 36 | "# What does that tell us?\n", 37 | "\n", 38 | "df['population'].describe()[['mean', '50%']]" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "mean 87027.387368\n", 50 | "50% 65796.000000\n", 51 | "Name: population, dtype: float64" 52 | ] 53 | }, 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "# Along these lines: If we remove the 50 most populous cities, \n", 61 | "# what happens to the mean population? What happens to the median?\n", 62 | "df.loc[50:, 'population'].describe()[['mean', '50%']]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/html": [ 73 | "
[stripped HTML table markup omitted; it rendered the same single-row result (Anchorage, Alaska, rank 63) shown in the text/plain output that follows]
" 106 | ], 107 | "text/plain": [ 108 | " city state rank\n", 109 | "62 Anchorage Alaska 63" 110 | ] 111 | }, 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "# What is the northernmost city, and where does it rank?\n", 119 | "df.loc[df['latitude'] == df['latitude'].max(), ['city', 'state', 'rank']]" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "state\n", 131 | "California 212\n", 132 | "Name: count, dtype: int64" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "# Which state has the largest number of cities in this list?\n", 142 | "df['state'].value_counts().head(1)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/plain": [ 153 | "state\n", 154 | "Alaska 1\n", 155 | "Hawaii 1\n", 156 | "District of Columbia 1\n", 157 | "Maine 1\n", 158 | "Vermont 1\n", 159 | "Name: count, dtype: int64" 160 | ] 161 | }, 162 | "execution_count": 6, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "# Which state has the smallest number of cities in this list?\n", 169 | "df['state'].value_counts().tail(5)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3 (ipykernel)", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.11.6" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 4 201 | } 202 | -------------------------------------------------------------------------------- /chapter-03/Exercise 15 — Weird taxi rides.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "df = pd.read_csv('../data/nyc_taxi_2019-01.csv',\n", 13 | " usecols=['passenger_count', 'trip_distance',\n", 14 | " 'total_amount', 'payment_type'])" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "9" 26 | ] 27 | }, 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "# How many taxi rides had more than 8 passengers?\n", 35 | "df.loc[df['passenger_count'] > 8, 'passenger_count'].count()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "117381" 47 | ] 48 | }, 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "# How many taxi rides had zero passengers?\n", 56 | "df.loc[\n", 57 | " 
df['passenger_count'] == 0, 'passenger_count'\n", 58 | "].count()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "5" 70 | ] 71 | }, 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "# How many taxi rides were paid for in cash, and cost more than $1,000?\n", 79 | "df.loc[\n", 80 | " (df['payment_type'] == 2) & (df['total_amount'] > 1000), \n", 81 | " 'passenger_count'\n", 82 | "].count()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "7131" 94 | ] 95 | }, 96 | "execution_count": 5, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "# How many rides cost less than 0?\n", 103 | "df.loc[\n", 104 | " df['total_amount'] < 0, \n", 105 | " 'total_amount'\n", 106 | "].count()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 6, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "411255" 118 | ] 119 | }, 120 | "execution_count": 6, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "# How many rides traveled a below-average distance, but cost an above-average amount?\n", 127 | "df.loc[((df['trip_distance'] < df['trip_distance'].mean()) &\n", 128 | " (df['total_amount'] > df['total_amount'].mean())), \n", 129 | " 'trip_distance'].count()" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3 (ipykernel)", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.11.6" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } 155 | -------------------------------------------------------------------------------- /chapter-03/Exercise 15b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "df = pd.read_csv('../data/nyc_taxi_2019-01.csv',\n", 13 | " usecols=['passenger_count', 'trip_distance',\n", 14 | " 'total_amount', 'payment_type'])" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# Beyond 1\n", 22 | "\n", 23 | "Repeat this exercise, but using the `query` method rather than a boolean index." 
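Before the query-based answers below, a minimal side-by-side sketch (assuming the same four columns loaded above) may help show that `query` and boolean indexing agree; `query` can also reference Python variables with an `@` prefix:

```python
import pandas as pd

df = pd.read_csv('../data/nyc_taxi_2019-01.csv',
                 usecols=['passenger_count', 'trip_distance',
                          'total_amount', 'payment_type'])

threshold = 8  # hypothetical cutoff, matching the exercise's "more than 8 passengers"

# Boolean-index version
count_loc = df.loc[df['passenger_count'] > threshold, 'passenger_count'].count()

# query version; @threshold refers to the Python variable defined above
count_query = df.query('passenger_count > @threshold')['passenger_count'].count()

assert count_loc == count_query
print(count_loc)
```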
24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "9" 35 | ] 36 | }, 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "# How many taxi rides had more than 8 passengers (query version)\n", 44 | "df.query('passenger_count > 8')['passenger_count'].count()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/plain": [ 55 | "117381" 56 | ] 57 | }, 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "output_type": "execute_result" 61 | } 62 | ], 63 | "source": [ 64 | "# How many taxi rides had zero passengers (query version)\n", 65 | "df.query('passenger_count == 0')['passenger_count'].count()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "5" 77 | ] 78 | }, 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "# How many taxi rides were paid for in cash, and cost more than $1,000? (query version)\n", 86 | "df.query('payment_type == 2 & total_amount > 1000')['payment_type'].count()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "7131" 98 | ] 99 | }, 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "# How many rides cost less than 0? (query version)\n", 107 | "df.query('total_amount < 0')['total_amount'].count()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "411255" 119 | ] 120 | }, 121 | "execution_count": 6, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "# How many rides traveled a below-average distance, but cost an above-average amount?\n", 128 | "# (query version)\n", 129 | "df.query('trip_distance < trip_distance.mean() & total_amount > total_amount.mean()')['trip_distance'].count()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "# Beyond 2\n", 137 | "\n", 138 | "How many of the rides that cost less than 0 were indeed for either a dispute (`payment_type` of 4) or a voided trip (`payment_type` of 6)?" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 7, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "2666" 150 | ] 151 | }, 152 | "execution_count": 7, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "df.loc[(df['total_amount'] < 0) & \n", 159 | " ((df['payment_type'] == 4) | \n", 160 | " (df['payment_type'] == 6)), 'total_amount'].count()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "# Beyond 3\n", 168 | "\n", 169 | "I stated above that most people pay for their taxi rides using a credit card. Show this, and find what percentages normally pay in cash vs. a credit card." 
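As a supplementary sketch for Beyond 3 (an assumption-based variation, not the notebook's own solution), the proportions can be expressed as percentages, and the claim that credit cards dominate can be checked programmatically:

```python
import pandas as pd

df = pd.read_csv('../data/nyc_taxi_2019-01.csv', usecols=['payment_type'])

# Percentage of rides per payment_type code, sorted most common first
pct = df['payment_type'].value_counts(normalize=True).mul(100).round(2)
print(pct)

# value_counts sorts by frequency, so the first index entry is the most common code;
# in the TLC data dictionary, code 1 is credit card
assert pct.index[0] == 1
```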
170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/plain": [ 180 | "payment_type\n", 181 | "1 0.715464\n", 182 | "2 0.278752\n", 183 | "Name: proportion, dtype: float64" 184 | ] 185 | }, 186 | "execution_count": 8, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "# 1 == credit card\n", 193 | "# 2 == cash\n", 194 | "\n", 195 | "df['payment_type'].value_counts(normalize=True)[[1,2]]" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "Python 3 (ipykernel)", 209 | "language": "python", 210 | "name": "python3" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.11.6" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 4 227 | } 228 | -------------------------------------------------------------------------------- /chapter-03/Exercise 16 — Pandemic taxis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "df_2019_jul = pd.read_csv('../data/nyc_taxi_2019-07.csv',\n", 13 | " usecols=['passenger_count', \n", 14 | " 'total_amount', 'payment_type'])\n", 15 | "df_2019_jul['year'] = 2019\n", 16 | "\n", 17 | "df_2020_jul = pd.read_csv('../data/nyc_taxi_2020-07.csv',\n", 18 | " usecols=['passenger_count', \n", 19 | " 'total_amount', 'payment_type'])\n", 20 | "df_2020_jul['year'] = 2020\n", 21 | "\n", 22 | "df = pd.concat([df_2019_jul, df_2020_jul])" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "5510007" 34 | ] 35 | }, 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "# How many rides were taken in 2019 vs. 2020?\n", 43 | "(\n", 44 | " df.loc[df['year'] == 2019, 'total_amount'].count() -\n", 45 | " df.loc[df['year'] == 2020, 'total_amount'].count()\n", 46 | ")\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "108848979.24000001" 58 | ] 59 | }, 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "# How much money was collected in 2019 vs. 
2020?\n", 67 | "(\n", 68 | " df.loc[df['year'] == 2019, 'total_amount'].sum() -\n", 69 | " df.loc[df['year'] == 2020, 'total_amount'].sum()\n", 70 | ")\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "0.2833900000955953" 82 | ] 83 | }, 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "# Did the proportion of trips with more than passenger change dramatically?\n", 91 | "df.loc[(df['year'] == 2019) & \n", 92 | " (df['passenger_count'] > 1), 'passenger_count'].count() / df.loc[df['year'] == 2019, 'payment_type'].count()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "0.2061513222563435" 104 | ] 105 | }, 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "df.loc[(df['year'] == 2020) & \n", 113 | " (df['passenger_count'] > 1), 'passenger_count'].count() / df.loc[df['year'] == 2020, 'payment_type'].count()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "0.2870595845428793" 125 | ] 126 | }, 127 | "execution_count": 6, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "# Did people use cash less in 2019 or 2020?\n", 134 | "df.loc[(df['year'] == 2019) & \n", 135 | " (df['payment_type'] == 2), 'payment_type'].count() / df.loc[df['year'] == 2019, 'payment_type'].count()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 7, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "0.320558865998251" 147 | ] 148 | }, 149 | "execution_count": 7, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "df.loc[(df['year'] == 2020) & \n", 156 | " (df['payment_type'] == 2), 'payment_type'].count() / df.loc[df['year'] == 2020, 'payment_type'].count()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3 (ipykernel)", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.11.6" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 4 188 | } 189 | -------------------------------------------------------------------------------- /chapter-03/Exercise 17b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Beyond 1\n", 19 | "\n", 20 | "Create a data frame from four other columns (`VendorID`, 
`trip_distance`, `tip_amount`, and `total_amount`), specifying the `dtype` for each. What types are most appropriate? Can you use them directly, or must you first clean the data?" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 10, 26 | "metadata": { 27 | "scrolled": true 28 | }, 29 | "outputs": [ 30 | { 31 | "name": "stderr", 32 | "output_type": "stream", 33 | "text": [ 34 | "/var/folders/rr/0mnyyv811fs5vyp22gf4fxk00000gn/T/ipykernel_72055/3862543684.py:9: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", 35 | " df.loc['VendorID'] = df['VendorID'].astype(np.int8)\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "df = pd.read_csv('../data/nyc_taxi_2020-01.csv',\n", 41 | " usecols=['VendorID', 'trip_distance', 'tip_amount', 'total_amount'],\n", 42 | " dtype={'VendorID':np.float32,\n", 43 | " 'trip_distance':np.float32, \n", 44 | " 'tip_amount':np.float32,\n", 45 | " 'total_amount':np.float32})\n", 46 | "\n", 47 | "df = df.dropna().copy()\n", 48 | "df.loc['VendorID'] = df['VendorID'].astype(np.int8)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# Beyond 2\n", 56 | "\n", 57 | "Instead of removing `NaN` values from the `VendorID` column, set it to a new value, 3. How does that affect your specifications and cleaning of the data?" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 16, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "df = pd.read_csv('../data/nyc_taxi_2020-01.csv',\n", 67 | " usecols=['VendorID', 'trip_distance', 'tip_amount', 'total_amount'],\n", 68 | " dtype={'VendorID':np.float32, \n", 69 | " 'trip_distance':np.float32,\n", 70 | " 'tip_amount':np.float32,\n", 71 | " 'total_amount':np.float32})\n", 72 | "\n", 73 | "df['VendorID'] = df['VendorID'].fillna(3)\n", 74 | "df['VendorID'] = df['VendorID'].astype(np.int8)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "# Beyond 3\n", 82 | "\n", 83 | "We'll talk more about this in future chapters, but the `memory_usage` method allows you to see how much memory is being used by each column in a data frame. It returns a series of integers, in which the index lists the columns and the values represent the memory used by each column. Compare the memory used by the data frame with `float16` (which you've already used) and when you use `float64` instead for the final three columns." 
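The cells that follow actually load these columns as `float32`; as a hedged sketch of the `float16` vs. `float64` comparison the question asks about (it assumes re-reading `../data/nyc_taxi_2020-01.csv` twice is acceptable, which is slow but keeps the comparison simple):

```python
import numpy as np
import pandas as pd

cols = ['VendorID', 'trip_distance', 'tip_amount', 'total_amount']

def total_memory(dtype):
    # Load every requested column with the given floating-point dtype
    df = pd.read_csv('../data/nyc_taxi_2020-01.csv',
                     usecols=cols,
                     dtype={c: dtype for c in cols})
    # memory_usage returns bytes per column (plus the index); sum to get one number
    return df.memory_usage().sum()

small = total_memory(np.float16)
large = total_memory(np.float64)

# With four float columns, float64 (8 bytes) should use roughly 4x float16 (2 bytes)
print(small, large, large / small)
```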
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 17, 89 | "metadata": { 90 | "scrolled": true 91 | }, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "83265236" 97 | ] 98 | }, 99 | "execution_count": 17, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "# Memory usage with float16\n", 106 | "df.memory_usage().sum() " 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 19, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "df = pd.read_csv('../data/nyc_taxi_2020-01.csv',\n", 116 | " usecols=['VendorID', 'trip_distance', 'tip_amount', 'total_amount'],\n", 117 | " dtype={'VendorID':np.float32, \n", 118 | " 'trip_distance':np.float32,\n", 119 | " 'tip_amount':np.float32,\n", 120 | " 'total_amount':np.float32})\n", 121 | "\n", 122 | "df['VendorID'] = df['VendorID'].fillna(3)\n", 123 | "df['VendorID'] = df['VendorID'].astype(np.int8)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "scrolled": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "# Memory usage with float64\n", 135 | "df.memory_usage().sum()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# float64 uses about 3.5x the memory as float16!\n", 145 | "160125328 / 44835184" 146 | ] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Python 3 (ipykernel)", 152 | "language": "python", 153 | "name": "python3" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | "nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.11.6" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 4 170 | } 171 | -------------------------------------------------------------------------------- /chapter-03/Exercise 20 — Big cities.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "filename = '../data/cities.json'\n", 13 | "df = pd.read_json(filename)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "mean 131132.443\n", 25 | "50% 68207.000\n", 26 | "Name: population, dtype: float64" 27 | ] 28 | }, 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "# What are the mean and median populations for these 1,000 largest cities?\n", 36 | "# What does that tell us?\n", 37 | "\n", 38 | "df['population'].describe()[['mean', '50%']]" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "mean 87027.387368\n", 50 | "50% 65796.000000\n", 51 | "Name: population, dtype: float64" 52 | ] 53 | }, 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "# Along these lines: If we remove the 50 most populous cities, \n", 61 | "# what happens to the 
mean population? What happens to the median?\n", 62 | "df.loc[50:, 'population'].describe()[['mean', '50%']]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/html": [ 73 | "
[stripped HTML table markup omitted; it rendered the same single-row result (Anchorage, Alaska, rank 63) shown in the text/plain output that follows]
" 106 | ], 107 | "text/plain": [ 108 | " city state rank\n", 109 | "62 Anchorage Alaska 63" 110 | ] 111 | }, 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "# What is the northernmost city, and where does it rank?\n", 119 | "df.loc[df['latitude'] == df['latitude'].max(), ['city', 'state', 'rank']]" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "state\n", 131 | "California 212\n", 132 | "Name: count, dtype: int64" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "# Which state has the largest number of cities in this list?\n", 142 | "df['state'].value_counts().head(1)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/plain": [ 153 | "state\n", 154 | "Alaska 1\n", 155 | "Hawaii 1\n", 156 | "District of Columbia 1\n", 157 | "Maine 1\n", 158 | "Vermont 1\n", 159 | "Name: count, dtype: int64" 160 | ] 161 | }, 162 | "execution_count": 6, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "# Which state has the smallest number of cities in this list?\n", 169 | "df['state'].value_counts().tail(5)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3 (ipykernel)", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.11.6" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 4 201 | } 202 | -------------------------------------------------------------------------------- /chapter-04/.ipynb_checkpoints/Exercise 21 — Parking tickets-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/plain": [ 21 | "Index(['Plate ID', 'Registration State', 'Issue Date', 'Vehicle Make',\n", 22 | " 'Street Name', 'Date First Observed', 'Vehicle Color'],\n", 23 | " dtype='object')" 24 | ] 25 | }, 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 33 | "\n", 34 | "df = pd.read_csv(filename,\n", 35 | " usecols=['Date First Observed', 'Plate ID', 'Registration State',\n", 36 | " 'Issue Date', 'Vehicle Make', 'Street Name', 'Vehicle Color'])\n", 37 | "df.columns" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# set the index to the \"Issue Date\" column\n", 47 | "df = 
df.set_index('Issue Date')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "Vehicle Make\n", 59 | "TOYOT 3829\n", 60 | "HONDA 3593\n", 61 | "FORD 3164\n", 62 | "Name: count, dtype: int64" 63 | ] 64 | }, 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "# what three vehicle makes were most likely to be ticketed on January 2nd?\n", 72 | "df.loc['01/02/2020 12:00:00 AM', 'Vehicle Make'].value_counts().head(3)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "Street Name\n", 84 | "WB CONDUIT BLVD @ LI 1068\n", 85 | "SB WEST ST @ LEROY S 335\n", 86 | "EB HORACE HARDING EX 273\n", 87 | "EB QUEENS BLVD @ 82N 245\n", 88 | "WB ATLANTIC AVE @ CL 229\n", 89 | "Name: count, dtype: int64" 90 | ] 91 | }, 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "# On what five streets were cars most likely to be ticketed on June 1st, 2020?\n", 99 | "df.loc['06/01/2020 12:00:00 AM', 'Street Name'].value_counts().head(5)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 6, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# change the index to be \"Vehicle Color\"\n", 109 | "df = df.reset_index().set_index('Vehicle Color')" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 7, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "Vehicle Make\n", 121 | "HONDA 39353\n", 122 | "Name: count, dtype: int64" 123 | ] 124 | }, 125 | "execution_count": 7, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "# What was the most common make of ticketed cars that were either blue or red?\n", 132 | "df.loc[['BLUE', 'RED'], 'Vehicle Make'].value_counts().head(1)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3 (ipykernel)", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.11.6" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 4 164 | } 165 | -------------------------------------------------------------------------------- /chapter-04/.ipynb_checkpoints/Sandbox-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /chapter-04/Exercise 21 — Parking tickets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 
| "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/plain": [ 21 | "Index(['Plate ID', 'Registration State', 'Issue Date', 'Vehicle Make',\n", 22 | " 'Street Name', 'Date First Observed', 'Vehicle Color'],\n", 23 | " dtype='object')" 24 | ] 25 | }, 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 33 | "\n", 34 | "df = pd.read_csv(filename,\n", 35 | " usecols=['Date First Observed', 'Plate ID', 'Registration State',\n", 36 | " 'Issue Date', 'Vehicle Make', 'Street Name', 'Vehicle Color'])\n", 37 | "df.columns" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# set the index to the \"Issue Date\" column\n", 47 | "df = df.set_index('Issue Date')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "Vehicle Make\n", 59 | "TOYOT 3829\n", 60 | "HONDA 3593\n", 61 | "FORD 3164\n", 62 | "Name: count, dtype: int64" 63 | ] 64 | }, 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "# what three vehicle makes were most likely to be ticketed on January 2nd?\n", 72 | "df.loc['01/02/2020 12:00:00 AM', 'Vehicle Make'].value_counts().head(3)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "Street Name\n", 84 | "WB CONDUIT BLVD @ LI 1068\n", 85 | "SB WEST ST @ LEROY S 335\n", 86 | "EB HORACE HARDING EX 273\n", 87 | "EB QUEENS BLVD @ 82N 245\n", 88 | "WB ATLANTIC AVE @ CL 229\n", 89 | "Name: count, dtype: int64" 90 | ] 91 | }, 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "# On what five streets were cars most likely to be ticketed on June 1st, 2020?\n", 99 | "df.loc['06/01/2020 12:00:00 AM', 'Street Name'].value_counts().head(5)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 6, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# change the index to be \"Vehicle Color\"\n", 109 | "df = df.reset_index().set_index('Vehicle Color')" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 7, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "Vehicle Make\n", 121 | "HONDA 39353\n", 122 | "Name: count, dtype: int64" 123 | ] 124 | }, 125 | "execution_count": 7, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "# What was the most common make of ticketed cars that were either blue or red?\n", 132 | "df.loc[['BLUE', 'RED'], 'Vehicle Make'].value_counts().head(1)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3 (ipykernel)", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": 
"ipython3", 159 | "version": "3.11.6" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 4 164 | } 165 | -------------------------------------------------------------------------------- /chapter-06/.ipynb_checkpoints/Joining sidebar-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /chapter-09/.ipynb_checkpoints/Exercise 36b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "0 The\n", 23 | "1 Project\n", 24 | "2 Gutenberg\n", 25 | "3 EBook\n", 26 | "4 of\n", 27 | "dtype: object" 28 | ] 29 | }, 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "filename = '../data/alice-in-wonderland.txt'\n", 37 | "\n", 38 | "s = Series(open(filename).read().split())\n", 39 | "s.head()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "# Beyond 1\n", 47 | "\n", 48 | "What is the mean of all integers in Alice?" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "8030.851851851852" 60 | ] 61 | }, 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "import string\n", 69 | "\n", 70 | "(\n", 71 | " s\n", 72 | " .str\n", 73 | " .strip(string.punctuation)\n", 74 | " .loc[lambda s_: s_.str.isdigit()]\n", 75 | " .astype(int)\n", 76 | " .mean()\n", 77 | ")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# Beyond 2\n", 85 | "\n", 86 | "What words in Alice don't appear in the dictionary? Which are the five most common such words?" 
87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "words = {one_word.strip() for one_word in open('../data/words.txt')}" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 7, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "Project 83\n", 107 | "She 36\n", 108 | "Rabbit 28\n", 109 | "Queen 27\n", 110 | "Gutenberg 27\n", 111 | " ..\n", 112 | "reasons 1\n", 113 | "knocked 1\n", 114 | "curls 1\n", 115 | "From 1\n", 116 | "includes 1\n", 117 | "Name: count, Length: 758, dtype: int64" 118 | ] 119 | }, 120 | "execution_count": 7, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "(\n", 127 | " s\n", 128 | " .str.strip(string.punctuation) # Strip punctuation\n", 129 | " .loc[lambda s_: s_.str.isalpha()] # Keep only those with letters\n", 130 | " .loc[lambda s_: ~s_.isin(words)] # Now keep those *not* in the dictionary, and find the most common ones\n", 131 | " .value_counts()\n", 132 | ")" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "# Beyond 3\n", 140 | "\n", 141 | "What is the mean number of words per paragraph?" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "count 393.000000\n", 153 | "mean 32.475827\n", 154 | "std 32.428415\n", 155 | "min 0.000000\n", 156 | "25% 7.000000\n", 157 | "50% 22.000000\n", 158 | "75% 48.000000\n", 159 | "max 169.000000\n", 160 | "dtype: float64" 161 | ] 162 | }, 163 | "execution_count": 8, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "# Read the file into a series by paragraph\n", 170 | "s = Series(open(filename).read().split('\\n\\n'))\n", 171 | "\n", 172 | "# Just use describe to get min, max, and everything else\n", 173 | "(\n", 174 | " s\n", 175 | " .str\n", 176 | " .split()\n", 177 | " .str\n", 178 | " .len()\n", 179 | " .describe() \n", 180 | ")" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [] 189 | } 190 | ], 191 | "metadata": { 192 | "kernelspec": { 193 | "display_name": "Python 3 (ipykernel)", 194 | "language": "python", 195 | "name": "python3" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 3 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython3", 207 | "version": "3.11.7" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 4 212 | } 213 | -------------------------------------------------------------------------------- /chapter-09/Exercise 36b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "0 The\n", 23 | "1 Project\n", 24 | "2 Gutenberg\n", 25 | "3 EBook\n", 26 | "4 of\n", 27 | "dtype: 
object" 28 | ] 29 | }, 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "filename = '../data/alice-in-wonderland.txt'\n", 37 | "\n", 38 | "s = Series(open(filename).read().split())\n", 39 | "s.head()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "# Beyond 1\n", 47 | "\n", 48 | "What is the mean of all integers in Alice?" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "8030.851851851852" 60 | ] 61 | }, 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "import string\n", 69 | "\n", 70 | "(\n", 71 | " s\n", 72 | " .str\n", 73 | " .strip(string.punctuation)\n", 74 | " .loc[lambda s_: s_.str.isdigit()]\n", 75 | " .astype(int)\n", 76 | " .mean()\n", 77 | ")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# Beyond 2\n", 85 | "\n", 86 | "What words in Alice don't appear in the dictionary? Which are the five most common such words?" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "words = {one_word.strip() for one_word in open('../data/words.txt')}" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 7, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "Project 83\n", 107 | "She 36\n", 108 | "Rabbit 28\n", 109 | "Queen 27\n", 110 | "Gutenberg 27\n", 111 | " ..\n", 112 | "reasons 1\n", 113 | "knocked 1\n", 114 | "curls 1\n", 115 | "From 1\n", 116 | "includes 1\n", 117 | "Name: count, Length: 758, dtype: int64" 118 | ] 119 | }, 120 | "execution_count": 7, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "(\n", 127 | " s\n", 128 | " .str.strip(string.punctuation) # Strip punctuation\n", 129 | " .loc[lambda s_: s_.str.isalpha()] # Keep only those with letters\n", 130 | " .loc[lambda s_: ~s_.isin(words)] # Now keep those *not* in the dictionary, and find the most common ones\n", 131 | " .value_counts()\n", 132 | ")" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "# Beyond 3\n", 140 | "\n", 141 | "What is the mean number of words per paragraph?" 
142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "count 393.000000\n", 153 | "mean 32.475827\n", 154 | "std 32.428415\n", 155 | "min 0.000000\n", 156 | "25% 7.000000\n", 157 | "50% 22.000000\n", 158 | "75% 48.000000\n", 159 | "max 169.000000\n", 160 | "dtype: float64" 161 | ] 162 | }, 163 | "execution_count": 8, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "# Read the file into a series by paragraph\n", 170 | "s = Series(open(filename).read().split('\\n\\n'))\n", 171 | "\n", 172 | "# Just use describe to get min, max, and everything else\n", 173 | "(\n", 174 | " s\n", 175 | " .str\n", 176 | " .split()\n", 177 | " .str\n", 178 | " .len()\n", 179 | " .describe() \n", 180 | ")" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [] 189 | } 190 | ], 191 | "metadata": { 192 | "kernelspec": { 193 | "display_name": "Python 3 (ipykernel)", 194 | "language": "python", 195 | "name": "python3" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 3 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython3", 207 | "version": "3.11.7" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 4 212 | } 213 | -------------------------------------------------------------------------------- /chapter-10/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /chapter-10/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f6b49bb8", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 3, 18 | "id": "720ae110", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "
\n", 25 | "\n", 38 | "\n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | "
xyz
a725660
b293711
c541336
d942379
\n", 74 | "
" 75 | ], 76 | "text/plain": [ 77 | " x y z\n", 78 | "a 72 56 60\n", 79 | "b 29 37 11\n", 80 | "c 54 13 36\n", 81 | "d 94 23 79" 82 | ] 83 | }, 84 | "execution_count": 3, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "df = DataFrame(np.random.randint(0, 100, [4,3]),\n", 91 | " index=list('abcd'),\n", 92 | " columns=list('xyz'))\n", 93 | "df" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 4, 99 | "id": "ae5daa0d", 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "x 249\n", 106 | "y 129\n", 107 | "z 186\n", 108 | "dtype: int64" 109 | ] 110 | }, 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "df.sum()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "id": "f017d8b2", 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "a 188\n", 130 | "b 77\n", 131 | "c 103\n", 132 | "d 196\n", 133 | "dtype: int64" 134 | ] 135 | }, 136 | "execution_count": 5, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "df.sum(axis='columns')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "id": "a4ac793a", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "a 188\n", 155 | "b 77\n", 156 | "c 103\n", 157 | "d 196\n", 158 | "dtype: int64" 159 | ] 160 | }, 161 | "execution_count": 6, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "df.T.sum()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "5b0e6ab5", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3 (ipykernel)", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.11.2" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 5 200 | } 201 | -------------------------------------------------------------------------------- /chapter-11/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /chapter-12/.gitignore: -------------------------------------------------------------------------------- 1 | parking-violations.csv 2 | parking-violations.feather 3 | parking-violations.json 4 | -------------------------------------------------------------------------------- /chapter-12/.ipynb_checkpoints/Exercise 49 — Faster reading and writing-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "75417ef5", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame\n", 13 | "import time" 14 | ] 15 | }, 16 | { 17 | 
"cell_type": "code", 18 | "execution_count": 2, 19 | "id": "2dbd4a89", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Read the full NYC parking-violations CSV file into memory.\n", 24 | "\n", 25 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 26 | "df = pd.read_csv(filename, low_memory=False)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "id": "76aa5472", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "root = 'parking-violations'\n", 37 | "write_methods = {'JSON': df.to_json,\n", 38 | " 'CSV': df.to_csv,\n", 39 | " 'feather': df.to_feather\n", 40 | " }" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "id": "0c0bb56a", 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Saving in JSON\n", 54 | "\tWriting JSON: total_time=47.94986385299126\n", 55 | "Saving in CSV\n", 56 | "\tWriting CSV: total_time=84.28116728103487\n", 57 | "Saving in feather\n", 58 | "\tWriting feather: total_time=10.2521946990164\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "for one_format, method in write_methods.items():\n", 64 | " print(f'Saving in {one_format}')\n", 65 | " start_time = time.perf_counter()\n", 66 | " write_methods[one_format](f'parking-violations.{one_format.lower()}')\n", 67 | " end_time = time.perf_counter()\n", 68 | "\n", 69 | " total_time = end_time - start_time\n", 70 | " print(f'\\tWriting {one_format}: {total_time=}') " 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 6, 76 | "id": "e20e3ab7", 77 | "metadata": { 78 | "scrolled": true 79 | }, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "parking-violations.json : 8,820,247,015\n", 86 | "parking-violations.csv : 2,440,860,181\n", 87 | "parking-violations.feather : 1,466,536,058\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "# How big are the files you've created?\n", 93 | "import glob\n", 94 | "import os\n", 95 | "\n", 96 | "for one_filename in glob.glob(f'{root}*'):\n", 97 | " print(f'{one_filename:27}: {os.stat(one_filename).st_size:,}')" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 7, 103 | "id": "258f10b7", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "read_methods = {'JSON': pd.read_json,\n", 108 | " 'CSV': pd.read_csv,\n", 109 | " 'feather': pd.read_feather\n", 110 | " }" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 8, 116 | "id": "05175935", 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "Reading from JSON\n", 124 | "\tReading JSON: total_time=512.0497572919703\n", 125 | "Reading from CSV\n" 126 | ] 127 | }, 128 | { 129 | "name": "stderr", 130 | "output_type": "stream", 131 | "text": [ 132 | "/var/folders/rr/0mnyyv811fs5vyp22gf4fxk00000gn/T/ipykernel_52668/1751173684.py:4: DtypeWarning: Columns (19,30,39,40) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", 133 | " df = read_methods[one_format](f'parking-violations.{one_format.lower()}')\n" 134 | ] 135 | }, 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "\tReading CSV: total_time=44.657161787035875\n", 141 | "Reading from feather\n", 142 | "\tReading feather: total_time=13.85696751094656\n" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "for one_format, method in read_methods.items():\n", 148 | " print(f'Reading from {one_format}')\n", 149 | " start_time = time.perf_counter()\n", 150 | " df = read_methods[one_format](f'parking-violations.{one_format.lower()}')\n", 151 | " end_time = time.perf_counter()\n", 152 | "\n", 153 | " total_time = end_time - start_time\n", 154 | " print(f'\\tReading {one_format}: {total_time=}') " 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "404fa135", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3 (ipykernel)", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.12.1" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 5 187 | } 188 | -------------------------------------------------------------------------------- /chapter-12/.ipynb_checkpoints/Exercise 49b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0bb1cbb2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame\n", 13 | "import time" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "33611a51", 19 | "metadata": {}, 20 | "source": [ 21 | "# Beyond 1\n", 22 | "\n", 23 | "If we read the CSV file using the \"pyarrow\" engine, do we see any speedup? That is, can we read CSV files into memory any faster if we use a different engine?" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "id": "b95a0fc2", 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "Reading via pyarrow engine, total_time=9.923564148019068\n" 37 | ] 38 | } 39 | ], 40 | "source": [ 41 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 42 | "start_time = time.perf_counter()\n", 43 | "df = pd.read_csv(filename, engine='pyarrow')\n", 44 | "end_time = time.perf_counter()\n", 45 | "total_time = end_time - start_time\n", 46 | "print(f'Reading via pyarrow engine, {total_time=}') " 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "54c0002c", 52 | "metadata": {}, 53 | "source": [ 54 | "# Beyond 2\n", 55 | "\n", 56 | "If we specify the dtypes when reading from a CSV file, do we save any time?" 
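Aside on Beyond 2: the answer cell that follows passes dict(df.dtypes) back into read_csv. df.dtypes is a Series mapping column name to dtype, and dict() turns it into the {column: dtype} mapping that read_csv's dtype= parameter expects, so type inference can be skipped on the second read. A minimal self-contained sketch with a tiny stand-in CSV (the columns here are invented for illustration, not the real parking-violations schema):

import io
import pandas as pd

csv_data = io.StringIO("plate,feet\nABC123,2\nXYZ789,0\n")   # stand-in for the real file
df_small = pd.read_csv(csv_data)

dtypes = dict(df_small.dtypes)        # e.g. {'plate': dtype('O'), 'feet': dtype('int64')}

csv_data.seek(0)                      # rewind and re-read with the dtypes pinned
df_small = pd.read_csv(csv_data, dtype=dtypes)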
57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "id": "87edccd4", 63 | "metadata": { 64 | "scrolled": true 65 | }, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "63.521172957960516" 71 | ] 72 | }, 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "start_time = time.perf_counter()\n", 80 | "df = pd.read_csv(filename, low_memory=False,\n", 81 | " dtype=dict(df.dtypes))\n", 82 | "end_time = time.perf_counter()\n", 83 | "\n", 84 | "total_time = end_time - start_time\n", 85 | "total_time" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "id": "261622c5", 91 | "metadata": {}, 92 | "source": [ 93 | "# Beyond 3\n", 94 | "\n", 95 | "How much memory does our data frame take in as a `pandas` data frame? How much memory does it require as an Arrow table?" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "id": "14842cf1", 102 | "metadata": { 103 | "scrolled": true 104 | }, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "16,789,335,057\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "# Pandas table\n", 116 | "n = df.memory_usage(deep=True).sum()\n", 117 | "print(f'{n:,}')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "id": "947e968a", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# Arrow table\n", 128 | "import pyarrow.feather as feather\n", 129 | "read_arrow = feather.read_table('parking-violations.feather')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 6, 135 | "id": "a0d4be71", 136 | "metadata": { 137 | "scrolled": true 138 | }, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "4,309,680,899\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "n = read_arrow.nbytes\n", 150 | "print(f'{n:,}')" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "5d110443-4287-47dc-b3ef-117930957cbf", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 3 (ipykernel)", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.12.1" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 5 183 | } 184 | -------------------------------------------------------------------------------- /chapter-12/.ipynb_checkpoints/Exercise 50b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0bb1cbb2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "7edd172f", 19 | "metadata": { 20 | "scrolled": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 25 | "df = pd.read_csv(filename,\n", 26 | " usecols=['Plate ID', 'Registration 
State', 'Plate Type', 'Feet From Curb',\n", 27 | " 'Vehicle Make', 'Vehicle Color'])\n", 28 | "df.columns = ['pid', 'state', 'ptype', 'make', 'color', 'feet']" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "ed955dcd", 34 | "metadata": {}, 35 | "source": [ 36 | "# Beyond 1\n", 37 | "\n", 38 | "In `df.query`, we can use the words `and` and `or`, rather than the symbols `&` and `|`, thanks to the `numexpr` library. Rewrite our final query using the words. Does this change the speed at all?" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "id": "2a1b21d5", 45 | "metadata": { 46 | "scrolled": true 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "914 ms ± 7.43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "%timeit df.query('state == \"NY\" and ptype == \"PAS\" and color == \"WHITE\" and feet > 1 and make == \"TOYOT\"')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "33611a51", 64 | "metadata": {}, 65 | "source": [ 66 | "# Beyond 2\n", 67 | "\n", 68 | "I prefer measuring distance in meters, rather than in feet. I thus want to find all of the cars that were ticketed when they were more than 1 meter from the curb. Perform this query using the traditional `df.loc` and also using `df.query`. Which one runs faster?" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "id": "b95a0fc2", 75 | "metadata": { 76 | "scrolled": true 77 | }, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "63.2 ms ± 2.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "%timeit df.loc[(df['feet'] * 0.3048) > 1]" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "id": "d3a6a8c0", 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "84.4 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "%timeit df.query('(feet * 0.3048) > 1')" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "261622c5", 112 | "metadata": {}, 113 | "source": [ 114 | "# Beyond 3\n", 115 | "\n", 116 | "What if we modify our query, such that we look for cars that are > 1 meter from the curb and the state is New York? Which query runs faster, and by how much?" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 6, 122 | "id": "f51ddb07", 123 | "metadata": { 124 | "scrolled": true 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "507 ms ± 1.65 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "%timeit df.loc[((df['feet'] * 0.3048) > 1) & (df['state'] == 'NY')]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 7, 142 | "id": "5ebb17b3", 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "314 ms ± 4.27 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "%timeit df.query('(feet * 0.3048) > 1 and state == \"NY\" ')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "ada77895", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3 (ipykernel)", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.12.1" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 5 187 | } 188 | -------------------------------------------------------------------------------- /chapter-12/Exercise 49 — Faster reading and writing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "75417ef5", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame\n", 13 | "import time" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "2dbd4a89", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Read the full NYC parking-violations CSV file into memory.\n", 24 | "\n", 25 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 26 | "df = pd.read_csv(filename, low_memory=False)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "id": "76aa5472", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "root = 'parking-violations'\n", 37 | "write_methods = {'JSON': df.to_json,\n", 38 | " 'CSV': df.to_csv,\n", 39 | " 'feather': df.to_feather\n", 40 | " }" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "id": "0c0bb56a", 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Saving in JSON\n", 54 | "\tWriting JSON: total_time=47.94986385299126\n", 55 | "Saving in CSV\n", 56 | "\tWriting CSV: total_time=84.28116728103487\n", 57 | "Saving in feather\n", 58 | "\tWriting feather: total_time=10.2521946990164\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "for one_format, method in write_methods.items():\n", 64 | " print(f'Saving in {one_format}')\n", 65 | " start_time = time.perf_counter()\n", 66 | " write_methods[one_format](f'parking-violations.{one_format.lower()}')\n", 67 | " end_time = time.perf_counter()\n", 68 | "\n", 69 | " total_time = end_time - start_time\n", 70 | " print(f'\\tWriting {one_format}: {total_time=}') " 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 6, 76 | "id": "e20e3ab7", 77 | "metadata": { 78 | "scrolled": true 79 | }, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "parking-violations.json : 8,820,247,015\n", 86 | "parking-violations.csv : 2,440,860,181\n", 87 | "parking-violations.feather : 1,466,536,058\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "# How big are the files you've created?\n", 93 | "import glob\n", 94 | "import os\n", 95 | "\n", 96 | "for one_filename in glob.glob(f'{root}*'):\n", 97 | " print(f'{one_filename:27}: {os.stat(one_filename).st_size:,}')" 
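Aside: Exercise 49 above wraps every read and write in the same start_time / end_time / perf_counter pattern. A minimal sketch of a small context manager that factors that pattern out; the helper name timed is an invention for illustration and does not appear in the original notebooks:

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    """Print how long the wrapped block took, using a monotonic clock."""
    start = time.perf_counter()
    try:
        yield
    finally:
        print(f'{label}: {time.perf_counter() - start:.3f}s')

# Usage, for example around one of the notebook's writes:
# with timed('Writing feather'):
#     df.to_feather('parking-violations.feather')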
98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 7, 103 | "id": "258f10b7", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "read_methods = {'JSON': pd.read_json,\n", 108 | " 'CSV': pd.read_csv,\n", 109 | " 'feather': pd.read_feather\n", 110 | " }" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 8, 116 | "id": "05175935", 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "Reading from JSON\n", 124 | "\tReading JSON: total_time=512.0497572919703\n", 125 | "Reading from CSV\n" 126 | ] 127 | }, 128 | { 129 | "name": "stderr", 130 | "output_type": "stream", 131 | "text": [ 132 | "/var/folders/rr/0mnyyv811fs5vyp22gf4fxk00000gn/T/ipykernel_52668/1751173684.py:4: DtypeWarning: Columns (19,30,39,40) have mixed types. Specify dtype option on import or set low_memory=False.\n", 133 | " df = read_methods[one_format](f'parking-violations.{one_format.lower()}')\n" 134 | ] 135 | }, 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "\tReading CSV: total_time=44.657161787035875\n", 141 | "Reading from feather\n", 142 | "\tReading feather: total_time=13.85696751094656\n" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "for one_format, method in read_methods.items():\n", 148 | " print(f'Reading from {one_format}')\n", 149 | " start_time = time.perf_counter()\n", 150 | " df = read_methods[one_format](f'parking-violations.{one_format.lower()}')\n", 151 | " end_time = time.perf_counter()\n", 152 | "\n", 153 | " total_time = end_time - start_time\n", 154 | " print(f'\\tReading {one_format}: {total_time=}') " 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "404fa135", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3 (ipykernel)", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.12.1" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 5 187 | } 188 | -------------------------------------------------------------------------------- /chapter-12/Exercise 49b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0bb1cbb2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame\n", 13 | "import time" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "33611a51", 19 | "metadata": {}, 20 | "source": [ 21 | "# Beyond 1\n", 22 | "\n", 23 | "If we read the CSV file using the \"pyarrow\" engine, do we see any speedup? That is, can we read CSV files into memory any faster if we use a different engine?" 
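Aside on Beyond 1: the pyarrow engine used in the answer below requires the pyarrow package to be installed; it reads the CSV with Arrow's multithreaded parser and then converts the result to regular NumPy-backed columns. On pandas 2.0 or later it can also be combined with dtype_backend='pyarrow' to keep Arrow-backed columns instead. A minimal sketch, assuming pandas >= 2.0 and pyarrow are available:

import pandas as pd

filename = '../data/nyc-parking-violations-2020.csv'   # path taken from the exercise

df_numpy = pd.read_csv(filename, engine='pyarrow')     # Arrow parser, NumPy-backed result

df_arrow = pd.read_csv(filename, engine='pyarrow',     # Arrow parser, Arrow-backed dtypes
                       dtype_backend='pyarrow')        # (pandas >= 2.0 only)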
24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "id": "b95a0fc2", 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "Reading via pyarrow engine, total_time=9.923564148019068\n" 37 | ] 38 | } 39 | ], 40 | "source": [ 41 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 42 | "start_time = time.perf_counter()\n", 43 | "df = pd.read_csv(filename, engine='pyarrow')\n", 44 | "end_time = time.perf_counter()\n", 45 | "total_time = end_time - start_time\n", 46 | "print(f'Reading via pyarrow engine, {total_time=}') " 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "54c0002c", 52 | "metadata": {}, 53 | "source": [ 54 | "# Beyond 2\n", 55 | "\n", 56 | "If we specify the dtypes when reading from a CSV file, do we save any time?" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "id": "87edccd4", 63 | "metadata": { 64 | "scrolled": true 65 | }, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "63.521172957960516" 71 | ] 72 | }, 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "start_time = time.perf_counter()\n", 80 | "df = pd.read_csv(filename, low_memory=False,\n", 81 | " dtype=dict(df.dtypes))\n", 82 | "end_time = time.perf_counter()\n", 83 | "\n", 84 | "total_time = end_time - start_time\n", 85 | "total_time" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "id": "261622c5", 91 | "metadata": {}, 92 | "source": [ 93 | "# Beyond 3\n", 94 | "\n", 95 | "How much memory does our data frame take in as a `pandas` data frame? How much memory does it require as an Arrow table?" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "id": "14842cf1", 102 | "metadata": { 103 | "scrolled": true 104 | }, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "16,789,335,057\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "# Pandas table\n", 116 | "n = df.memory_usage(deep=True).sum()\n", 117 | "print(f'{n:,}')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "id": "947e968a", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# Arrow table\n", 128 | "import pyarrow.feather as feather\n", 129 | "read_arrow = feather.read_table('parking-violations.feather')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 6, 135 | "id": "a0d4be71", 136 | "metadata": { 137 | "scrolled": true 138 | }, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "4,309,680,899\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "n = read_arrow.nbytes\n", 150 | "print(f'{n:,}')" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "5d110443-4287-47dc-b3ef-117930957cbf", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 3 (ipykernel)", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.12.1" 179 | } 180 | }, 181 | "nbformat": 4, 182 | 
"nbformat_minor": 5 183 | } 184 | -------------------------------------------------------------------------------- /chapter-12/Exercise 50b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0bb1cbb2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "7edd172f", 19 | "metadata": { 20 | "scrolled": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 25 | "df = pd.read_csv(filename,\n", 26 | " usecols=['Plate ID', 'Registration State', 'Plate Type', 'Feet From Curb',\n", 27 | " 'Vehicle Make', 'Vehicle Color'])\n", 28 | "df.columns = ['pid', 'state', 'ptype', 'make', 'color', 'feet']" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "ed955dcd", 34 | "metadata": {}, 35 | "source": [ 36 | "# Beyond 1\n", 37 | "\n", 38 | "In `df.query`, we can use the words `and` and `or`, rather than the symbols `&` and `|`, thanks to the `numexpr` library. Rewrite our final query using the words. Does this change the speed at all?" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "id": "2a1b21d5", 45 | "metadata": { 46 | "scrolled": true 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "914 ms ± 7.43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "%timeit df.query('state == \"NY\" and ptype == \"PAS\" and color == \"WHITE\" and feet > 1 and make == \"TOYOT\"')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "33611a51", 64 | "metadata": {}, 65 | "source": [ 66 | "# Beyond 2\n", 67 | "\n", 68 | "I prefer measuring distance in meters, rather than in feet. I thus want to find all of the cars that were ticketed when they were more than 1 meter from the curb. Perform this query using the traditional `df.loc` and also using `df.query`. Which one runs faster?" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "id": "b95a0fc2", 75 | "metadata": { 76 | "scrolled": true 77 | }, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "63.2 ms ± 2.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "%timeit df.loc[(df['feet'] * 0.3048) > 1]" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "id": "d3a6a8c0", 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "84.4 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "%timeit df.query('(feet * 0.3048) > 1')" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "261622c5", 112 | "metadata": {}, 113 | "source": [ 114 | "# Beyond 3\n", 115 | "\n", 116 | "What if we modify our query, such that we look for cars that are > 1 meter from the curb and the state is New York? Which query runs faster, and by how much?" 
117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 6, 122 | "id": "f51ddb07", 123 | "metadata": { 124 | "scrolled": true 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "507 ms ± 1.65 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "%timeit df.loc[((df['feet'] * 0.3048) > 1) & (df['state'] == 'NY')]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 7, 142 | "id": "5ebb17b3", 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "314 ms ± 4.27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "%timeit df.query('(feet * 0.3048) > 1 and state == \"NY\" ')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "ada77895", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3 (ipykernel)", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.12.1" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 5 187 | } 188 | -------------------------------------------------------------------------------- /chapter-12/dask-worker-space/global.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reuven/pandas-workout/e79d3429f194fb05bfae4aa48f9970bd254a89b6/chapter-12/dask-worker-space/global.lock -------------------------------------------------------------------------------- /chapter-12/dask-worker-space/purge.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reuven/pandas-workout/e79d3429f194fb05bfae4aa48f9970bd254a89b6/chapter-12/dask-worker-space/purge.lock --------------------------------------------------------------------------------