├── .gitignore
├── README.md
├── chapter-01
├── .ipynb_checkpoints
│   ├── Exercise 01 — Test scores-checkpoint.ipynb
│   ├── Exercise 01b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 02 — Scaling test scores-checkpoint.ipynb
│   ├── Exercise 02b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 03 — Counting 10s digits-checkpoint.ipynb
│   ├── Exercise 03b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 04 — Descriptive statistics-checkpoint.ipynb
│   ├── Exercise 04b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 05 — Monday temperatures-checkpoint.ipynb
│   ├── Exercise 05b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 06 — Passenger frequency-checkpoint.ipynb
│   ├── Exercise 06b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 07 — Long, medium, and short rides-checkpoint.ipynb
│   └── Exercise 07b — Beyond the exercise-checkpoint.ipynb
├── Exercise 01 — Test scores.ipynb
├── Exercise 01b — Beyond the exercise.ipynb
├── Exercise 02 — Scaling test scores.ipynb
├── Exercise 02b — Beyond the exercise.ipynb
├── Exercise 03 — Counting 10s digits.ipynb
├── Exercise 03b — Beyond the exercise.ipynb
├── Exercise 04 — Descriptive statistics.ipynb
├── Exercise 04b — Beyond the exercise.ipynb
├── Exercise 05 — Monday temperatures.ipynb
├── Exercise 05b — Beyond the exercise.ipynb
├── Exercise 06 — Passenger frequency.ipynb
├── Exercise 06b — Beyond the exercise.ipynb
├── Exercise 07 — Long, medium, and short rides.ipynb
└── Exercise 07b — Beyond the exercise.ipynb
├── chapter-02
├── .ipynb_checkpoints
│   ├── Exercise 08 — Net revenue-checkpoint.ipynb
│   ├── Exercise 08b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 09 — Tax planning-checkpoint.ipynb
│   ├── Exercise 09b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 10 — Adding products-checkpoint.ipynb
│   ├── Exercise 10b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 11 — Best sellers-checkpoint.ipynb
│   ├── Exercise 11b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 12 — Finding outliers-checkpoint.ipynb
│   ├── Exercise 12b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 13 — Interpolation-checkpoint.ipynb
│   ├── Exercise 13b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 14 — Selective updating-checkpoint.ipynb
│   └── Exercise 14b — Beyond the exercise-checkpoint.ipynb
├── Exercise 08 — Net revenue.ipynb
├── Exercise 08b — Beyond the exercise.ipynb
├── Exercise 09 — Tax planning.ipynb
├── Exercise 09b — Beyond the exercise.ipynb
├── Exercise 10 — Adding products.ipynb
├── Exercise 10b — Beyond the exercise.ipynb
├── Exercise 11 — Best sellers.ipynb
├── Exercise 11b — Beyond the exercise.ipynb
├── Exercise 12 — Finding outliers.ipynb
├── Exercise 12b — Beyond the exercise.ipynb
├── Exercise 13 — Interpolation.ipynb
├── Exercise 13b — Beyond the exercise.ipynb
├── Exercise 14 — Selective updating.ipynb
└── Exercise 14b — Beyond the exercise.ipynb
├── chapter-03
├── .ipynb_checkpoints
│   ├── Exercise 15 — Weird taxi rides-checkpoint.ipynb
│   ├── Exercise 15b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 16 — Pandemic taxis-checkpoint.ipynb
│   ├── Exercise 16b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 17 — Setting column types-checkpoint.ipynb
│   ├── Exercise 17b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 18 — passwd to df-checkpoint.ipynb
│   ├── Exercise 18b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 19 — Bitcoin values-checkpoint.ipynb
│   ├── Exercise 19b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 20 — Big cities-checkpoint.ipynb
│   └── Exercise 20b — Beyond the exercise-checkpoint.ipynb
├── Exercise 15 — Weird taxi rides.ipynb
├── Exercise 15b — Beyond the exercise.ipynb
├── Exercise 16 — Pandemic taxis.ipynb
├── Exercise 16b — Beyond the exercise.ipynb
├── Exercise 17 — Setting column types.ipynb
├── Exercise 17b — Beyond the exercise.ipynb
├── Exercise 18 — passwd to df.ipynb
├── Exercise 18b — Beyond the exercise.ipynb
├── Exercise 19 — Bitcoin values.ipynb
├── Exercise 19b — Beyond the exercise.ipynb
├── Exercise 20 — Big cities.ipynb
└── Exercise 20b — Beyond the exercise.ipynb
├── chapter-04
├── .ipynb_checkpoints
│   ├── Exercise 21 — Parking tickets-checkpoint.ipynb
│   ├── Exercise 21b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 22 — SAT scores-checkpoint.ipynb
│   ├── Exercise 22b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 23 — Olympic games-checkpoint.ipynb
│   ├── Exercise 23b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 24 — More Olympic stats-checkpoint.ipynb
│   ├── Exercise 24b — Beyond the exercise-checkpoint.ipynb
│   └── Sandbox-checkpoint.ipynb
├── Exercise 21 — Parking tickets.ipynb
├── Exercise 21b — Beyond the exercise.ipynb
├── Exercise 22 — SAT scores.ipynb
├── Exercise 22b — Beyond the exercise.ipynb
├── Exercise 23 — Olympic games.ipynb
├── Exercise 23b — Beyond the exercise.ipynb
├── Exercise 24 — More Olympic stats.ipynb
├── Exercise 24b — Beyond the exercise.ipynb
└── Sandbox.ipynb
├── chapter-05
├── .ipynb_checkpoints
│   ├── Exercise 25 — Parking cleanup-checkpoint.ipynb
│   ├── Exercise 25b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 26 — Celebrity deaths-checkpoint.ipynb
│   ├── Exercise 26b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 27 — Titanic interpolation-checkpoint.ipynb
│   ├── Exercise 27b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 28 — Inconsistent data-checkpoint.ipynb
│   └── Exercise 28b — Beyond the exercise-checkpoint.ipynb
├── Exercise 25 — Parking cleanup.ipynb
├── Exercise 25b — Beyond the exercise.ipynb
├── Exercise 26 — Celebrity deaths.ipynb
├── Exercise 26b — Beyond the exercise.ipynb
├── Exercise 27 — Titanic interpolation.ipynb
├── Exercise 27b — Beyond the exercise.ipynb
├── Exercise 28 — Inconsistent data.ipynb
└── Exercise 28b — Beyond the exercise.ipynb
├── chapter-06
├── .ipynb_checkpoints
│   ├── Exercise 29 — Longest taxi rides-checkpoint.ipynb
│   ├── Exercise 29b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 30 — Taxi rides per passenger count-checkpoint.ipynb
│   ├── Exercise 30b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 31 — Tourist spending-checkpoint.ipynb
│   ├── Exercise 31b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 32 — Multi-city temperatures-checkpoint.ipynb
│   ├── Exercise 32b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 33 — SAT scores per state-checkpoint.ipynb
│   ├── Exercise 33b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 34 — Rainy, snowy cities-checkpoint.ipynb
│   ├── Exercise 34b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 35 — Outer joins-checkpoint.ipynb
│   ├── Exercise 35b — Beyond the exercise-checkpoint.ipynb
│   └── Joining sidebar-checkpoint.ipynb
├── Exercise 29 — Longest taxi rides.ipynb
├── Exercise 29b — Beyond the exercise.ipynb
├── Exercise 30 — Taxi rides per passenger count.ipynb
├── Exercise 30b — Beyond the exercise.ipynb
├── Exercise 31 — Tourist spending.ipynb
├── Exercise 31b — Beyond the exercise.ipynb
└── Joining sidebar.ipynb
├── chapter-07
├── Exercise 32 — Multi-city temperatures.ipynb
├── Exercise 32b — Beyond the exercise.ipynb
├── Exercise 33 — SAT scores per state.ipynb
├── Exercise 33b — Beyond the exercise.ipynb
├── Exercise 34 — Rainy, snowy cities.ipynb
├── Exercise 34b — Beyond the exercise.ipynb
├── Exercise 35 — Outer joins.ipynb
└── Exercise 35b — Beyond the exercise.ipynb
├── chapter-08
├── .ipynb_checkpoints
│   └── Chapter 7 — project-checkpoint.ipynb
└── Chapter 7 — project.ipynb
├── chapter-09
├── .ipynb_checkpoints
│   ├── Exercise 36 — Analyzing Alice-checkpoint.ipynb
│   ├── Exercise 36b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 37 — Wine words-checkpoint.ipynb
│   ├── Exercise 37b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 38 — Programming languages-checkpoint.ipynb
│   └── Exercise 38b — Beyond the exercise-checkpoint.ipynb
├── Exercise 36 — Analyzing Alice.ipynb
├── Exercise 36b — Beyond the exercise.ipynb
├── Exercise 37 — Wine words.ipynb
├── Exercise 37b — Beyond the exercise.ipynb
├── Exercise 38 — Programming languages.ipynb
└── Exercise 38b — Beyond the exercise.ipynb
├── chapter-10
├── .ipynb_checkpoints
│   ├── Exercise 39 — Long taxi rides-checkpoint.ipynb
│   ├── Exercise 39b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 40 — Writing dates, reading dates-checkpoint.ipynb
│   ├── Exercise 40b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 41 — Oil prices-checkpoint.ipynb
│   ├── Exercise 41b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 42 — Best tippers-checkpoint.ipynb
│   ├── Exercise 42b — Beyond the exercise-checkpoint.ipynb
│   └── Untitled-checkpoint.ipynb
├── Exercise 39 — Long taxi rides.ipynb
├── Exercise 39b — Beyond the exercise.ipynb
├── Exercise 40 — Writing dates, reading dates.ipynb
├── Exercise 40b — Beyond the exercise.ipynb
├── Exercise 41 — Oil prices.ipynb
├── Exercise 41b — Beyond the exercise.ipynb
├── Exercise 42 — Best tippers.ipynb
├── Exercise 42b — Beyond the exercise.ipynb
└── Untitled.ipynb
├── chapter-11
├── .ipynb_checkpoints
│   ├── Exercise 43 — Cities-checkpoint.ipynb
│   ├── Exercise 43b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 44 — Boxplotting weather-checkpoint.ipynb
│   ├── Exercise 44b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 45 — Taxi fare parts-checkpoint.ipynb
│   ├── Exercise 45b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 46 — Cars, Oil, and ice cream-checkpoint.ipynb
│   ├── Exercise 46b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 47 — Seaborn taxi plots-checkpoint.ipynb
│   ├── Exercise 47a — Seaborn sidebar-checkpoint.ipynb
│   ├── Exercise 47b — Beyond the exercise-checkpoint.ipynb
│   └── Untitled-checkpoint.ipynb
├── Exercise 43 — Cities.ipynb
├── Exercise 43b — Beyond the exercise.ipynb
├── Exercise 44 — Boxplotting weather.ipynb
├── Exercise 44b — Beyond the exercise.ipynb
├── Exercise 45 — Taxi fare parts.ipynb
├── Exercise 45b — Beyond the exercise.ipynb
├── Exercise 46 — Cars, Oil, and ice cream.ipynb
├── Exercise 46b — Beyond the exercise.ipynb
├── Exercise 47 — Seaborn taxi plots.ipynb
├── Exercise 47a — Seaborn sidebar.ipynb
├── Exercise 47b — Beyond the exercise.ipynb
└── Untitled.ipynb
├── chapter-12
├── .gitignore
├── .ipynb_checkpoints
│   ├── Exercise 48 — Categories-checkpoint.ipynb
│   ├── Exercise 48b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 49 — Faster reading and writing-checkpoint.ipynb
│   ├── Exercise 49b — Beyond the exercise-checkpoint.ipynb
│   ├── Exercise 50 — Faster queries with query-checkpoint.ipynb
│   └── Exercise 50b — Beyond the exercise-checkpoint.ipynb
├── Exercise 48 — Categories.ipynb
├── Exercise 48b — Beyond the exercise.ipynb
├── Exercise 
49 — Faster reading and writing.ipynb ├── Exercise 49b — Beyond the exercise.ipynb ├── Exercise 50 — Faster queries with query.ipynb ├── Exercise 50b — Beyond the exercise.ipynb └── dask-worker-space │ ├── global.lock │ └── purge.lock └── chapter-13 ├── .ipynb_checkpoints └── Final project-checkpoint.ipynb └── Final project.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | *.zip 2 | *~ 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Jupyter notebooks for Pandas Workout (https://PandasWorkout.com/) 2 | 3 | These are the Jupyter notebooks for my book, Pandas Workout. The data for the exercises are stored separately, at https://files.lerner.co.il/pandas-workout-data.zip . 4 | 5 | Did you like the book? Please help me to spread the word, by reviewing it on Amazon (https://www.amazon.com/Pandas-Workout-Reuven-Lerner/dp/1617299723). 6 | 7 | Other links that might be of interest to you: 8 | 9 | - "Better developers," with new articles about Python each week: https://BetterDevelopersWeekly.com/ 10 | - "Bamboo Weekly," with Pandas puzzles based on current events: https://BambooWeekly.com 11 | - My previous book, "Python Workout": https://PythonWorkout.com/ 12 | - My YouTube channel: https://YouTube.com/reuvenlerner 13 | - My Twitter feed: https://Twitter.com/reuvenmlerner 14 | - My online courses: https://LernerPython.com 15 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 01 — Test scores-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "Sep 96\n", 23 | "Oct 89\n", 24 | "Nov 85\n", 25 | "Dec 78\n", 26 | "Jan 79\n", 27 | "Feb 71\n", 28 | "Mar 72\n", 29 | "Apr 70\n", 30 | "May 75\n", 31 | "Jun 95\n", 32 | "dtype: int64" 33 | ] 34 | }, 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "g = np.random.default_rng(0)\n", 42 | "months = 'Sep Oct Nov Dec Jan Feb Mar Apr May Jun'.split()\n", 43 | "\n", 44 | "s = Series(g.integers(70, 101, 10),\n", 45 | " index=months)\n", 46 | "s" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "Entire year average: 81.0\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "print(f'Entire year average: {s.mean()}')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": { 70 | "scrolled": true 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "Yearly average: 81.0\n", 78 | "First half average: 85.4\n", 79 | "Second half average: 76.6\n", 80 | "Improvement: -8.800000000000011\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "first_half_average = s['Sep':'Jan'].mean()\n", 86 | "second_half_average = s['Feb':'Jun'].mean()\n", 87 | "\n", 88 | "print(f'Yearly average: 
{s.mean()}')\n", 89 | "\n", 90 | "print(f'First half average: {first_half_average}')\n", 91 | "print(f'Second half average: {second_half_average}')\n", 92 | "\n", 93 | "print(f'Improvement: {second_half_average - first_half_average}')" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3 (ipykernel)", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.11.6" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 4 125 | } 126 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 01b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "g = np.random.default_rng(0)\n", 14 | "months = 'Sep Oct Nov Dec Jan Feb Mar Apr May Jun'.split()\n", 15 | "\n", 16 | "s = Series(g.integers(70, 100, 10),\n", 17 | " index=months)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Beyond 1\n", 25 | "\n", 26 | "In which month did this student get their highest score? Note that there are at least two ways to accomplish this: You can sort the values, taking the largest one, or you can use a boolean (\"mask\") index to find those rows that match the value of `s.max()`, the highest value." 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "'Sep'" 38 | ] 39 | }, 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "# Option 1\n", 47 | "s.sort_values(ascending=False).index[0]" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "'Sep'" 59 | ] 60 | }, 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "# Option 2\n", 68 | "s[s==s.max()].index[0]" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "'Sep'" 80 | ] 81 | }, 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "# Option 3\n", 89 | "s.idxmax()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "# Beyond 2\n", 97 | "\n", 98 | "What were this student's five highest scores in the year?" 
99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "Sep 95\n", 110 | "Jun 94\n", 111 | "Oct 89\n", 112 | "Nov 85\n", 113 | "Jan 79\n", 114 | "dtype: int64" 115 | ] 116 | }, 117 | "execution_count": 5, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "s.sort_values(ascending=False).head(5)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "# Beyond 3\n", 131 | "\n", 132 | "Round the student's scores to the nearest 10. So a score of 82 would be rounded down to 80, but a score of 87 would be rounded up to 90." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 6, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "Sep 100\n", 144 | "Oct 90\n", 145 | "Nov 80\n", 146 | "Dec 80\n", 147 | "Jan 80\n", 148 | "Feb 70\n", 149 | "Mar 70\n", 150 | "Apr 70\n", 151 | "May 80\n", 152 | "Jun 90\n", 153 | "dtype: int64" 154 | ] 155 | }, 156 | "execution_count": 6, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "# The \"round\" method, when given a positive integer argument, rounds numbers after the\n", 163 | "# decimal point. When given a negative integer argument, it rounds numbers *before* the decimal point!\n", 164 | "\n", 165 | "s.round(-1) " 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3 (ipykernel)", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.11.6" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 4 197 | } 198 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 02 — Scaling test scores-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "Sep 57\n", 23 | "Oct 52\n", 24 | "Nov 50\n", 25 | "Dec 45\n", 26 | "Jan 46\n", 27 | "Feb 40\n", 28 | "Mar 41\n", 29 | "Apr 40\n", 30 | "May 43\n", 31 | "Jun 56\n", 32 | "dtype: int64" 33 | ] 34 | }, 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "g = np.random.default_rng(0)\n", 42 | "months = 'Sep Oct Nov Dec Jan Feb Mar Apr May Jun'.split()\n", 43 | "\n", 44 | "s = Series(g.integers(40, 60, 10),\n", 45 | " index=months)\n", 46 | "s" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "Sep 90.0\n", 58 | "Oct 85.0\n", 59 | "Nov 83.0\n", 60 | 
"Dec 78.0\n", 61 | "Jan 79.0\n", 62 | "Feb 73.0\n", 63 | "Mar 74.0\n", 64 | "Apr 73.0\n", 65 | "May 76.0\n", 66 | "Jun 89.0\n", 67 | "dtype: float64" 68 | ] 69 | }, 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "s + (80 - s.mean())" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": "Python 3 (ipykernel)", 90 | "language": "python", 91 | "name": "python3" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 3 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.11.6" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 4 108 | } 109 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 03 — Counting 10s digits-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0 85\n", 12 | "1 63\n", 13 | "2 51\n", 14 | "3 26\n", 15 | "4 30\n", 16 | "5 4\n", 17 | "6 7\n", 18 | "7 1\n", 19 | "8 17\n", 20 | "9 81\n", 21 | "dtype: int64" 22 | ] 23 | }, 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "output_type": "execute_result" 27 | } 28 | ], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import pandas as pd\n", 32 | "from pandas import Series, DataFrame\n", 33 | "\n", 34 | "g = np.random.default_rng(0)\n", 35 | "s = Series(g.integers(0, 100, 10))\n", 36 | "s" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "0 8\n", 48 | "1 6\n", 49 | "2 5\n", 50 | "3 2\n", 51 | "4 3\n", 52 | "5 0\n", 53 | "6 0\n", 54 | "7 0\n", 55 | "8 1\n", 56 | "9 8\n", 57 | "dtype: int8" 58 | ] 59 | }, 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "# solution 1, using /\n", 67 | "(s / 10).astype(np.int8)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "0 8\n", 79 | "1 6\n", 80 | "2 5\n", 81 | "3 2\n", 82 | "4 3\n", 83 | "5 0\n", 84 | "6 0\n", 85 | "7 0\n", 86 | "8 1\n", 87 | "9 8\n", 88 | "dtype: int64" 89 | ] 90 | }, 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "# solution 2, using //\n", 98 | "(s // 10)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 4, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "0 8\n", 110 | "1 6\n", 111 | "2 5\n", 112 | "3 2\n", 113 | "4 3\n", 114 | "5 0\n", 115 | "6 0\n", 116 | "7 0\n", 117 | "8 1\n", 118 | "9 8\n", 119 | "dtype: object" 120 | ] 121 | }, 122 | "execution_count": 4, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "# solution 3, partial\n", 129 | "s.astype(str).str.get(-2).fillna('0')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 5, 135 | "metadata": {}, 136 
| "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "0 8\n", 141 | "1 6\n", 142 | "2 5\n", 143 | "3 2\n", 144 | "4 3\n", 145 | "5 0\n", 146 | "6 0\n", 147 | "7 0\n", 148 | "8 1\n", 149 | "9 8\n", 150 | "dtype: int8" 151 | ] 152 | }, 153 | "execution_count": 5, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "# solution 3, complete\n", 160 | "s.astype(str).str.get(-2).fillna('0').astype(np.int8)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 6, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "0 8\n", 172 | "1 6\n", 173 | "2 5\n", 174 | "3 2\n", 175 | "4 3\n", 176 | "5 0\n", 177 | "6 0\n", 178 | "7 0\n", 179 | "8 1\n", 180 | "9 8\n", 181 | "dtype: int8" 182 | ] 183 | }, 184 | "execution_count": 6, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "(\n", 191 | " s\n", 192 | " .astype(str) # get a series based on s, with dtype str\n", 193 | " .str.get(-2) # retrieve the second-to-last character\n", 194 | " .fillna('0') # replace NaN with '0'\n", 195 | " .astype(np.int8) # get a new series back dtype int8\n", 196 | ")\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [] 205 | } 206 | ], 207 | "metadata": { 208 | "kernelspec": { 209 | "display_name": "Python 3 (ipykernel)", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "codemirror_mode": { 215 | "name": "ipython", 216 | "version": 3 217 | }, 218 | "file_extension": ".py", 219 | "mimetype": "text/x-python", 220 | "name": "python", 221 | "nbconvert_exporter": "python", 222 | "pygments_lexer": "ipython3", 223 | "version": "3.11.6" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 4 228 | } 229 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 04 — Descriptive statistics-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0 12.573022\n", 12 | "1 -13.210486\n", 13 | "2 64.042265\n", 14 | "3 10.490012\n", 15 | "4 -53.566937\n", 16 | " ... 
\n", 17 | "99995 -91.667135\n", 18 | "99996 -231.480500\n", 19 | "99997 -0.028179\n", 20 | "99998 -109.645051\n", 21 | "99999 -49.541294\n", 22 | "Length: 100000, dtype: float64" 23 | ] 24 | }, 25 | "execution_count": 5, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from pandas import Series, DataFrame\n", 34 | "\n", 35 | "g = np.random.default_rng(0)\n", 36 | "s = Series(g.normal(0, 100, 100_000))\n", 37 | "s" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 6, 43 | "metadata": { 44 | "scrolled": true 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "count 100000.000000\n", 51 | "mean -0.090825\n", 52 | "std 100.013350\n", 53 | "min -449.411704\n", 54 | "25% -67.292120\n", 55 | "50% -0.414699\n", 56 | "75% 67.636542\n", 57 | "max 473.195769\n", 58 | "dtype: float64" 59 | ] 60 | }, 61 | "execution_count": 6, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "s.describe()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 7, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "s.loc[s == s.min()] = 5*s.max()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": { 83 | "scrolled": true 84 | }, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "count 100000.000000\n", 90 | "mean -0.062671\n", 91 | "std 100.282770\n", 92 | "min -402.315865\n", 93 | "25% -67.288054\n", 94 | "50% -0.409289\n", 95 | "75% 67.640758\n", 96 | "max 2365.978844\n", 97 | "dtype: float64" 98 | ] 99 | }, 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "s.describe()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3 (ipykernel)", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.11.6" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 4 138 | } 139 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 04b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0 12.573022\n", 12 | "1 -13.210486\n", 13 | "2 64.042265\n", 14 | "3 10.490012\n", 15 | "4 -53.566937\n", 16 | " ... 
\n", 17 | "99995 -91.667135\n", 18 | "99996 -231.480500\n", 19 | "99997 -0.028179\n", 20 | "99998 -109.645051\n", 21 | "99999 -49.541294\n", 22 | "Length: 100000, dtype: float64" 23 | ] 24 | }, 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from pandas import Series, DataFrame\n", 34 | "\n", 35 | "g = np.random.default_rng(0)\n", 36 | "s = Series(g.normal(0, 100, 100_000))\n", 37 | "s" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "# Beyond 1\n", 45 | "\n", 46 | "Demonstrate that 68%, 95%, and 99.7% of the values in `s` are indeed within 1, 2, and 3 standard distributions of the mean." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "0.68396" 58 | ] 59 | }, 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "# within one standard deviation\n", 67 | "s[(s > s.mean() - s.std()) &\n", 68 | " (s < s.mean() + s.std())].count() / s.count()" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "0.95461" 80 | ] 81 | }, 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "# within two standard deviations\n", 89 | "s[(s > s.mean() - 2*s.std()) &\n", 90 | " (s < s.mean() + 2*s.std())].count() / s.count()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "0.99708" 102 | ] 103 | }, 104 | "execution_count": 4, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "# within three standard deviations\n", 111 | "s[(s > s.mean() - 3*s.std()) &\n", 112 | " (s < s.mean() + 3*s.std())].count() / s.count()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "# Beyond 2\n", 120 | "\n", 121 | " Calculate the mean of numbers greater than `s.mean()`. Then calculate the mean of numbers less than `s.mean()`. Is the average of these two numbers the same as `s.mean()`?" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "0.12941477214831565" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "(s[s < s.mean()].mean() + s[s > s.mean()].mean() ) / 2" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 6, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "-0.09082507731206121" 153 | ] 154 | }, 155 | "execution_count": 6, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "# They're pretty close!\n", 162 | "s.mean()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "# Beyond 3\n", 170 | "\n", 171 | "What is the mean of the numbers beyond 3 standard deviations?" 
172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 7, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "-11.606040282602287" 183 | ] 184 | }, 185 | "execution_count": 7, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "# A pretty complex combination of mask indexes,\n", 192 | "# but the result is still a series, on which we can run mean()\n", 193 | "s[(s < s.mean() - 3*s.std()) | \n", 194 | " (s > s.mean() + 3*s.std()) ].mean()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3 (ipykernel)", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.11.6" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 4 226 | } 227 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 05 — Monday temperatures-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "days = 'Sun Mon Tue Wed Thu Fri Sat'.split()\n", 14 | "\n", 15 | "g = np.random.default_rng(0)\n", 16 | "s = Series(g.normal(20, 5, 28),\n", 17 | " index=days*4).round().astype(np.int8)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/plain": [ 28 | "Sun 21\n", 29 | "Mon 19\n", 30 | "Tue 23\n", 31 | "Wed 21\n", 32 | "Thu 17\n", 33 | "Fri 22\n", 34 | "Sat 27\n", 35 | "Sun 25\n", 36 | "Mon 16\n", 37 | "Tue 14\n", 38 | "Wed 17\n", 39 | "Thu 20\n", 40 | "Fri 8\n", 41 | "Sat 19\n", 42 | "Sun 14\n", 43 | "Mon 16\n", 44 | "Tue 17\n", 45 | "Wed 18\n", 46 | "Thu 22\n", 47 | "Fri 25\n", 48 | "Sat 19\n", 49 | "Sun 27\n", 50 | "Mon 17\n", 51 | "Tue 22\n", 52 | "Wed 25\n", 53 | "Thu 20\n", 54 | "Fri 16\n", 55 | "Sat 15\n", 56 | "dtype: int8" 57 | ] 58 | }, 59 | "execution_count": 2, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "s" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "17.0" 77 | ] 78 | }, 79 | "execution_count": 3, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "s.loc['Mon'].mean()" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": "Python 3 (ipykernel)", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | 
"nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.11.6" 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 4 117 | } 118 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 05b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "days = 'Sun Mon Tue Wed Thu Fri Sat'.split()\n", 14 | "\n", 15 | "g = np.random.default_rng(0)\n", 16 | "s = Series(g.normal(20, 5, 28),\n", 17 | " index=days*4).round().astype(np.int8)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Beyond 1\n", 25 | "\n", 26 | "What was the average temperature on weekends (i.e., Saturdays and Sundays)?" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "20.875" 38 | ] 39 | }, 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "s[['Sun', 'Sat']].mean()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "# Beyond 2\n", 54 | "\n", 55 | "How many times will the change in temperature from the previous day be greater than 2 degrees?" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "Tue 23\n", 67 | "Fri 22\n", 68 | "Sat 27\n", 69 | "Wed 17\n", 70 | "Thu 20\n", 71 | "Sat 19\n", 72 | "Thu 22\n", 73 | "Fri 25\n", 74 | "Sun 27\n", 75 | "Tue 22\n", 76 | "Wed 25\n", 77 | "dtype: int8" 78 | ] 79 | }, 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "# by default, the \"diff\" method compares with the previous element\n", 87 | "s[s.diff() > 2]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "# Beyond 3\n", 95 | "\n", 96 | "What are the two most common temperatures, and how often does each appear?" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 4, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "17 4\n", 108 | "19 3\n", 109 | "Name: count, dtype: int64" 110 | ] 111 | }, 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "# value_counts returns a series in which the values from s are \n", 119 | "# the index, the number of appearances is the value, and the\n", 120 | "# items are ordered from most common to least common. 
We can\n", 121 | "# then use \"head\" to get only the 2 most common values.\n", 122 | "s.value_counts().head(2)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3 (ipykernel)", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.11.6" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } 155 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 06 — Passenger frequency-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0\n", 12 | "1 0.720772\n", 13 | "6 0.036904\n", 14 | "Name: proportion, dtype: float64" 15 | ] 16 | }, 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "output_type": "execute_result" 20 | } 21 | ], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "from pandas import Series, DataFrame\n", 26 | "\n", 27 | "s = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()\n", 28 | "\n", 29 | "s.value_counts(normalize=True)[[1, 6]]" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "0\n", 41 | "1 7207\n", 42 | "2 1313\n", 43 | "5 520\n", 44 | "3 406\n", 45 | "6 369\n", 46 | "4 182\n", 47 | "0 2\n", 48 | "Name: count, dtype: int64" 49 | ] 50 | }, 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "s.value_counts()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "0\n", 69 | "1 0.720772\n", 70 | "6 0.036904\n", 71 | "Name: proportion, dtype: float64" 72 | ] 73 | }, 74 | "execution_count": 3, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "s.value_counts(normalize=True)[[1,6]]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "Python 3 (ipykernel)", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": ".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.11.6" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 4 112 | } 113 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 06b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | 
"execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Beyond 1\n", 21 | "\n", 22 | "What are the 25%, 50% (median), and 75% quantiles for this data set? Can you guess the results before you execute the code?" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "0.25 1.0\n", 34 | "0.50 1.0\n", 35 | "0.75 2.0\n", 36 | "Name: 0, dtype: float64" 37 | ] 38 | }, 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "output_type": "execute_result" 42 | } 43 | ], 44 | "source": [ 45 | "# Since 1-passenger rides are 72% of the values, we can\n", 46 | "# guess that the 25% and 50% marks will be 1, whereas \n", 47 | "# the 75% mark will be 2 or 3, depending on how common those are.\n", 48 | "s.quantile([.25, .50, .75])" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# Beyond 2\n", 56 | "\n", 57 | "What proportion of taxi rides are for 3, 4, 5, or 6 passengers?" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "0.1477147714771477" 69 | ] 70 | }, 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "s.value_counts(normalize=True)[[3,4,5,6]].sum()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# Beyond 3\n", 85 | "\n", 86 | "Consider that you're in charge of vehicle licensing for New York taxis. Given these numbers, would more people benefit from smaller taxis that can take only one or two passengers, or larger taxis that can take five or six passengers?" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Given that a huge proportion of rides are for 1 or 2 passengers, licensing more small taxis would seem to match the needs." 
94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3 (ipykernel)", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.11.6" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 4 125 | } 126 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 07 — Long, medium, and short rides-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/taxi-distance.csv', header=None).squeeze()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "0\n", 25 | "short 5890\n", 26 | "medium 3402\n", 27 | "long 707\n", 28 | "Name: count, dtype: int64" 29 | ] 30 | }, 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "output_type": "execute_result" 34 | } 35 | ], 36 | "source": [ 37 | "pd.cut(s, \n", 38 | " bins=[0, 2, 10, s.max()], \n", 39 | " include_lowest=True,\n", 40 | " labels=['short', 'medium', 'long']).value_counts()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [] 49 | } 50 | ], 51 | "metadata": { 52 | "kernelspec": { 53 | "display_name": "Python 3 (ipykernel)", 54 | "language": "python", 55 | "name": "python3" 56 | }, 57 | "language_info": { 58 | "codemirror_mode": { 59 | "name": "ipython", 60 | "version": 3 61 | }, 62 | "file_extension": ".py", 63 | "mimetype": "text/x-python", 64 | "name": "python", 65 | "nbconvert_exporter": "python", 66 | "pygments_lexer": "ipython3", 67 | "version": "3.11.6" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 4 72 | } 73 | -------------------------------------------------------------------------------- /chapter-01/.ipynb_checkpoints/Exercise 07b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/taxi-distance.csv', header=None).squeeze()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Beyond 1\n", 21 | "\n", 22 | "Compare the mean and median trip distances. What does that tell you about the distribution of our data?" 
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": { 29 | "scrolled": true 30 | }, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "count 9999.000000\n", 36 | "mean 3.158511\n", 37 | "std 4.037516\n", 38 | "min 0.000000\n", 39 | "25% 1.000000\n", 40 | "50% 1.700000\n", 41 | "75% 3.300000\n", 42 | "max 64.600000\n", 43 | "Name: 0, dtype: float64" 44 | ] 45 | }, 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "output_type": "execute_result" 49 | } 50 | ], 51 | "source": [ 52 | "s.describe()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "Because the mean is significantly higher than the median, it would seem that there are some *very* long trips in our data set that are pulling the mean up. And sure enough, we see that the standard deviation is 4, but that we have at least one trip > 64 miles in length." 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "# Beyond 2\n", 67 | "\n", 68 | "How many short, medium, and long trips were there for trips that had only one passenger? Note that data for passenger count and trip length are from the same data set, meaning that the indexes are the same." 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "0\n", 80 | "short 4333\n", 81 | "medium 2387\n", 82 | "long 487\n", 83 | "Name: count, dtype: int64" 84 | ] 85 | }, 86 | "execution_count": 3, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "passenger_count = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()\n", 93 | "\n", 94 | "pd.cut(s[passenger_count == 1], \n", 95 | " bins=[s.min(), 2, 10, s.max()], \n", 96 | " include_lowest=True,\n", 97 | " labels=['short', 'medium', 'long']).value_counts()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "# Beyond 3\n", 105 | "\n", 106 | "What happens if we don't pass explicit intervals, and instead ask `pd.cut` to just create 3 bins, with `bins=3`?" 
107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "array([-0.0646 , 21.53333333, 43.06666667, 64.6 ])" 118 | ] 119 | }, 120 | "execution_count": 4, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "passenger_count = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()\n", 127 | "\n", 128 | "pd.cut(s[passenger_count == 1], \n", 129 | " bins=3,\n", 130 | " labels=['short', 'medium', 'long'], retbins=True)[-1]" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 5, 136 | "metadata": { 137 | "scrolled": true 138 | }, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "0\n", 144 | "short 7179\n", 145 | "medium 26\n", 146 | "long 2\n", 147 | "Name: count, dtype: int64" 148 | ] 149 | }, 150 | "execution_count": 5, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "pd.cut(s[passenger_count == 1], \n", 157 | " bins=3,\n", 158 | " labels=['short', 'medium', 'long']).value_counts()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "`pd.cut` took the interval from `s.min()` to `s.max()`, divided it into three equal parts, and assigned those to be `short`, `medium`, and `long`. We can see, though, that this meant our `long` category is from 43 miles to 64.6 miles -- numerically one-third of the values' interval, but only including a handful of values!" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3 (ipykernel)", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.11.6" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 4 190 | } 191 | -------------------------------------------------------------------------------- /chapter-01/Exercise 01 — Test scores.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "Sep 96\n", 23 | "Oct 89\n", 24 | "Nov 85\n", 25 | "Dec 78\n", 26 | "Jan 79\n", 27 | "Feb 71\n", 28 | "Mar 72\n", 29 | "Apr 70\n", 30 | "May 75\n", 31 | "Jun 95\n", 32 | "dtype: int64" 33 | ] 34 | }, 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "g = np.random.default_rng(0)\n", 42 | "months = 'Sep Oct Nov Dec Jan Feb Mar Apr May Jun'.split()\n", 43 | "\n", 44 | "s = Series(g.integers(70, 101, 10),\n", 45 | " index=months)\n", 46 | "s" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "Entire year average: 81.0\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "print(f'Entire 
year average: {s.mean()}')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": { 70 | "scrolled": true 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "Yearly average: 81.0\n", 78 | "First half average: 85.4\n", 79 | "Second half average: 76.6\n", 80 | "Improvement: -8.800000000000011\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "first_half_average = s['Sep':'Jan'].mean()\n", 86 | "second_half_average = s['Feb':'Jun'].mean()\n", 87 | "\n", 88 | "print(f'Yearly average: {s.mean()}')\n", 89 | "\n", 90 | "print(f'First half average: {first_half_average}')\n", 91 | "print(f'Second half average: {second_half_average}')\n", 92 | "\n", 93 | "print(f'Improvement: {second_half_average - first_half_average}')" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3 (ipykernel)", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.11.6" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 4 125 | } 126 | -------------------------------------------------------------------------------- /chapter-01/Exercise 01b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "g = np.random.default_rng(0)\n", 14 | "months = 'Sep Oct Nov Dec Jan Feb Mar Apr May Jun'.split()\n", 15 | "\n", 16 | "s = Series(g.integers(70, 100, 10),\n", 17 | " index=months)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Beyond 1\n", 25 | "\n", 26 | "In which month did this student get their highest score? Note that there are at least two ways to accomplish this: You can sort the values, taking the largest one, or you can use a boolean (\"mask\") index to find those rows that match the value of `s.max()`, the highest value." 
27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "'Sep'" 38 | ] 39 | }, 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "# Option 1\n", 47 | "s.sort_values(ascending=False).index[0]" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "'Sep'" 59 | ] 60 | }, 61 | "execution_count": 3, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "# Option 2\n", 68 | "s[s==s.max()].index[0]" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "'Sep'" 80 | ] 81 | }, 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "# Option 3\n", 89 | "s.idxmax()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "# Beyond 2\n", 97 | "\n", 98 | "What were this student's five highest scores in the year?" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "Sep 95\n", 110 | "Jun 94\n", 111 | "Oct 89\n", 112 | "Nov 85\n", 113 | "Jan 79\n", 114 | "dtype: int64" 115 | ] 116 | }, 117 | "execution_count": 5, 118 | "metadata": {}, 119 | "output_type": "execute_result" 120 | } 121 | ], 122 | "source": [ 123 | "s.sort_values(ascending=False).head(5)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "# Beyond 3\n", 131 | "\n", 132 | "Round the student's scores to the nearest 10. So a score of 82 would be rounded down to 80, but a score of 87 would be rounded up to 90." 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 6, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "Sep 100\n", 144 | "Oct 90\n", 145 | "Nov 80\n", 146 | "Dec 80\n", 147 | "Jan 80\n", 148 | "Feb 70\n", 149 | "Mar 70\n", 150 | "Apr 70\n", 151 | "May 80\n", 152 | "Jun 90\n", 153 | "dtype: int64" 154 | ] 155 | }, 156 | "execution_count": 6, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "# The \"round\" method, when given a positive integer argument, rounds numbers after the\n", 163 | "# decimal point. 
When given a negative integer argument, it rounds numbers *before* the decimal point!\n", 164 | "\n", 165 | "s.round(-1) " 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3 (ipykernel)", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "language_info": { 183 | "codemirror_mode": { 184 | "name": "ipython", 185 | "version": 3 186 | }, 187 | "file_extension": ".py", 188 | "mimetype": "text/x-python", 189 | "name": "python", 190 | "nbconvert_exporter": "python", 191 | "pygments_lexer": "ipython3", 192 | "version": "3.11.6" 193 | } 194 | }, 195 | "nbformat": 4, 196 | "nbformat_minor": 4 197 | } 198 | -------------------------------------------------------------------------------- /chapter-01/Exercise 02 — Scaling test scores.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "Sep 57\n", 23 | "Oct 52\n", 24 | "Nov 50\n", 25 | "Dec 45\n", 26 | "Jan 46\n", 27 | "Feb 40\n", 28 | "Mar 41\n", 29 | "Apr 40\n", 30 | "May 43\n", 31 | "Jun 56\n", 32 | "dtype: int64" 33 | ] 34 | }, 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "output_type": "execute_result" 38 | } 39 | ], 40 | "source": [ 41 | "g = np.random.default_rng(0)\n", 42 | "months = 'Sep Oct Nov Dec Jan Feb Mar Apr May Jun'.split()\n", 43 | "\n", 44 | "s = Series(g.integers(40, 60, 10),\n", 45 | " index=months)\n", 46 | "s" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "Sep 90.0\n", 58 | "Oct 85.0\n", 59 | "Nov 83.0\n", 60 | "Dec 78.0\n", 61 | "Jan 79.0\n", 62 | "Feb 73.0\n", 63 | "Mar 74.0\n", 64 | "Apr 73.0\n", 65 | "May 76.0\n", 66 | "Jun 89.0\n", 67 | "dtype: float64" 68 | ] 69 | }, 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "s + (80 - s.mean())" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": "Python 3 (ipykernel)", 90 | "language": "python", 91 | "name": "python3" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 3 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.11.6" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 4 108 | } 109 | -------------------------------------------------------------------------------- /chapter-01/Exercise 03 — Counting 10s digits.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0 85\n", 12 | "1 63\n", 13 | "2 51\n", 14 | "3 26\n", 
15 | "4 30\n", 16 | "5 4\n", 17 | "6 7\n", 18 | "7 1\n", 19 | "8 17\n", 20 | "9 81\n", 21 | "dtype: int64" 22 | ] 23 | }, 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "output_type": "execute_result" 27 | } 28 | ], 29 | "source": [ 30 | "import numpy as np\n", 31 | "import pandas as pd\n", 32 | "from pandas import Series, DataFrame\n", 33 | "\n", 34 | "g = np.random.default_rng(0)\n", 35 | "s = Series(g.integers(0, 100, 10))\n", 36 | "s" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "0 8\n", 48 | "1 6\n", 49 | "2 5\n", 50 | "3 2\n", 51 | "4 3\n", 52 | "5 0\n", 53 | "6 0\n", 54 | "7 0\n", 55 | "8 1\n", 56 | "9 8\n", 57 | "dtype: int8" 58 | ] 59 | }, 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "# solution 1, using /\n", 67 | "(s / 10).astype(np.int8)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "0 8\n", 79 | "1 6\n", 80 | "2 5\n", 81 | "3 2\n", 82 | "4 3\n", 83 | "5 0\n", 84 | "6 0\n", 85 | "7 0\n", 86 | "8 1\n", 87 | "9 8\n", 88 | "dtype: int64" 89 | ] 90 | }, 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "# solution 2, using //\n", 98 | "(s // 10)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 4, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "0 8\n", 110 | "1 6\n", 111 | "2 5\n", 112 | "3 2\n", 113 | "4 3\n", 114 | "5 0\n", 115 | "6 0\n", 116 | "7 0\n", 117 | "8 1\n", 118 | "9 8\n", 119 | "dtype: object" 120 | ] 121 | }, 122 | "execution_count": 4, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "# solution 3, partial\n", 129 | "s.astype(str).str.get(-2).fillna('0')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 5, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "0 8\n", 141 | "1 6\n", 142 | "2 5\n", 143 | "3 2\n", 144 | "4 3\n", 145 | "5 0\n", 146 | "6 0\n", 147 | "7 0\n", 148 | "8 1\n", 149 | "9 8\n", 150 | "dtype: int8" 151 | ] 152 | }, 153 | "execution_count": 5, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "# solution 3, complete\n", 160 | "s.astype(str).str.get(-2).fillna('0').astype(np.int8)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 6, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "0 8\n", 172 | "1 6\n", 173 | "2 5\n", 174 | "3 2\n", 175 | "4 3\n", 176 | "5 0\n", 177 | "6 0\n", 178 | "7 0\n", 179 | "8 1\n", 180 | "9 8\n", 181 | "dtype: int8" 182 | ] 183 | }, 184 | "execution_count": 6, 185 | "metadata": {}, 186 | "output_type": "execute_result" 187 | } 188 | ], 189 | "source": [ 190 | "(\n", 191 | " s\n", 192 | " .astype(str) # get a series based on s, with dtype str\n", 193 | " .str.get(-2) # retrieve the second-to-last character\n", 194 | " .fillna('0') # replace NaN with '0'\n", 195 | " .astype(np.int8) # get a new series back dtype int8\n", 196 | ")\n" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [] 205 | } 206 | ], 207 | "metadata": { 208 | "kernelspec": { 
209 | "display_name": "Python 3 (ipykernel)", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "codemirror_mode": { 215 | "name": "ipython", 216 | "version": 3 217 | }, 218 | "file_extension": ".py", 219 | "mimetype": "text/x-python", 220 | "name": "python", 221 | "nbconvert_exporter": "python", 222 | "pygments_lexer": "ipython3", 223 | "version": "3.11.6" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 4 228 | } 229 | -------------------------------------------------------------------------------- /chapter-01/Exercise 04 — Descriptive statistics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0 12.573022\n", 12 | "1 -13.210486\n", 13 | "2 64.042265\n", 14 | "3 10.490012\n", 15 | "4 -53.566937\n", 16 | " ... \n", 17 | "99995 -91.667135\n", 18 | "99996 -231.480500\n", 19 | "99997 -0.028179\n", 20 | "99998 -109.645051\n", 21 | "99999 -49.541294\n", 22 | "Length: 100000, dtype: float64" 23 | ] 24 | }, 25 | "execution_count": 5, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from pandas import Series, DataFrame\n", 34 | "\n", 35 | "g = np.random.default_rng(0)\n", 36 | "s = Series(g.normal(0, 100, 100_000))\n", 37 | "s" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 6, 43 | "metadata": { 44 | "scrolled": true 45 | }, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/plain": [ 50 | "count 100000.000000\n", 51 | "mean -0.090825\n", 52 | "std 100.013350\n", 53 | "min -449.411704\n", 54 | "25% -67.292120\n", 55 | "50% -0.414699\n", 56 | "75% 67.636542\n", 57 | "max 473.195769\n", 58 | "dtype: float64" 59 | ] 60 | }, 61 | "execution_count": 6, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "s.describe()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 7, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "s.loc[s == s.min()] = 5*s.max()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 4, 82 | "metadata": { 83 | "scrolled": true 84 | }, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "count 100000.000000\n", 90 | "mean -0.062671\n", 91 | "std 100.282770\n", 92 | "min -402.315865\n", 93 | "25% -67.288054\n", 94 | "50% -0.409289\n", 95 | "75% 67.640758\n", 96 | "max 2365.978844\n", 97 | "dtype: float64" 98 | ] 99 | }, 100 | "execution_count": 4, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "s.describe()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3 (ipykernel)", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.11.6" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 4 138 | } 139 | 
-------------------------------------------------------------------------------- /chapter-01/Exercise 04b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0         12.573022\n", 12 | "1        -13.210486\n", 13 | "2         64.042265\n", 14 | "3         10.490012\n", 15 | "4        -53.566937\n", 16 | "            ...    \n", 17 | "99995    -91.667135\n", 18 | "99996   -231.480500\n", 19 | "99997     -0.028179\n", 20 | "99998   -109.645051\n", 21 | "99999    -49.541294\n", 22 | "Length: 100000, dtype: float64" 23 | ] 24 | }, 25 | "execution_count": 1, 26 | "metadata": {}, 27 | "output_type": "execute_result" 28 | } 29 | ], 30 | "source": [ 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "from pandas import Series, DataFrame\n", 34 | "\n", 35 | "g = np.random.default_rng(0)\n", 36 | "s = Series(g.normal(0, 100, 100_000))\n", 37 | "s" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "# Beyond 1\n", 45 | "\n", 46 | "Demonstrate that 68%, 95%, and 99.7% of the values in `s` are indeed within 1, 2, and 3 standard deviations of the mean." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "0.68396" 58 | ] 59 | }, 60 | "execution_count": 2, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "# within one standard deviation\n", 67 | "s[(s > s.mean() - s.std()) &\n", 68 | "  (s < s.mean() + s.std())].count() / s.count()" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "0.95461" 80 | ] 81 | }, 82 | "execution_count": 3, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "# within two standard deviations\n", 89 | "s[(s > s.mean() - 2*s.std()) &\n", 90 | "  (s < s.mean() + 2*s.std())].count() / s.count()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 4, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "0.99708" 102 | ] 103 | }, 104 | "execution_count": 4, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "# within three standard deviations\n", 111 | "s[(s > s.mean() - 3*s.std()) &\n", 112 | "  (s < s.mean() + 3*s.std())].count() / s.count()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "# Beyond 2\n", 120 | "\n", 121 | " Calculate the mean of numbers greater than `s.mean()`. Then calculate the mean of numbers less than `s.mean()`. Is the average of these two numbers the same as `s.mean()`?"
122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "0.12941477214831565" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "(s[s < s.mean()].mean() + s[s > s.mean()].mean() ) / 2" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 6, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "-0.09082507731206121" 153 | ] 154 | }, 155 | "execution_count": 6, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "# They're pretty close!\n", 162 | "s.mean()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "# Beyond 3\n", 170 | "\n", 171 | "What is the mean of the numbers beyond 3 standard deviations?" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 7, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "-11.606040282602287" 183 | ] 184 | }, 185 | "execution_count": 7, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "# A pretty complex combination of mask indexes,\n", 192 | "# but the result is still a series, on which we can run mean()\n", 193 | "s[(s < s.mean() - 3*s.std()) | \n", 194 | " (s > s.mean() + 3*s.std()) ].mean()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3 (ipykernel)", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.11.6" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 4 226 | } 227 | -------------------------------------------------------------------------------- /chapter-01/Exercise 05 — Monday temperatures.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "days = 'Sun Mon Tue Wed Thu Fri Sat'.split()\n", 14 | "\n", 15 | "g = np.random.default_rng(0)\n", 16 | "s = Series(g.normal(20, 5, 28),\n", 17 | " index=days*4).round().astype(np.int8)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [ 25 | { 26 | "data": { 27 | "text/plain": [ 28 | "Sun 21\n", 29 | "Mon 19\n", 30 | "Tue 23\n", 31 | "Wed 21\n", 32 | "Thu 17\n", 33 | "Fri 22\n", 34 | "Sat 27\n", 35 | "Sun 25\n", 36 | "Mon 16\n", 37 | "Tue 14\n", 38 | "Wed 17\n", 39 | "Thu 20\n", 40 | "Fri 8\n", 41 | "Sat 19\n", 42 | "Sun 14\n", 43 | "Mon 16\n", 44 | "Tue 17\n", 45 | "Wed 18\n", 46 | "Thu 22\n", 47 | "Fri 25\n", 48 | "Sat 19\n", 49 | "Sun 27\n", 50 | "Mon 17\n", 51 | "Tue 22\n", 52 | "Wed 25\n", 53 | "Thu 20\n", 54 | "Fri 16\n", 55 | "Sat 15\n", 56 | "dtype: int8" 57 | ] 58 | }, 59 | 
"execution_count": 2, 60 | "metadata": {}, 61 | "output_type": "execute_result" 62 | } 63 | ], 64 | "source": [ 65 | "s" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "17.0" 77 | ] 78 | }, 79 | "execution_count": 3, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "s.loc['Mon'].mean()" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [] 94 | } 95 | ], 96 | "metadata": { 97 | "kernelspec": { 98 | "display_name": "Python 3 (ipykernel)", 99 | "language": "python", 100 | "name": "python3" 101 | }, 102 | "language_info": { 103 | "codemirror_mode": { 104 | "name": "ipython", 105 | "version": 3 106 | }, 107 | "file_extension": ".py", 108 | "mimetype": "text/x-python", 109 | "name": "python", 110 | "nbconvert_exporter": "python", 111 | "pygments_lexer": "ipython3", 112 | "version": "3.11.6" 113 | } 114 | }, 115 | "nbformat": 4, 116 | "nbformat_minor": 4 117 | } 118 | -------------------------------------------------------------------------------- /chapter-01/Exercise 05b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "days = 'Sun Mon Tue Wed Thu Fri Sat'.split()\n", 14 | "\n", 15 | "g = np.random.default_rng(0)\n", 16 | "s = Series(g.normal(20, 5, 28),\n", 17 | " index=days*4).round().astype(np.int8)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Beyond 1\n", 25 | "\n", 26 | "What was the average temperature on weekends (i.e., Saturdays and Sundays)?" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "20.875" 38 | ] 39 | }, 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "s[['Sun', 'Sat']].mean()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "# Beyond 2\n", 54 | "\n", 55 | "How many times will the change in temperature from the previous day be greater than 2 degrees?" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "Tue 23\n", 67 | "Fri 22\n", 68 | "Sat 27\n", 69 | "Wed 17\n", 70 | "Thu 20\n", 71 | "Sat 19\n", 72 | "Thu 22\n", 73 | "Fri 25\n", 74 | "Sun 27\n", 75 | "Tue 22\n", 76 | "Wed 25\n", 77 | "dtype: int8" 78 | ] 79 | }, 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "# by default, the \"diff\" method compares with the previous element\n", 87 | "s[s.diff() > 2]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "# Beyond 3\n", 95 | "\n", 96 | "What are the two most common temperatures, and how often does each appear?" 
97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 4, 102 | "metadata": {}, 103 | "outputs": [ 104 | { 105 | "data": { 106 | "text/plain": [ 107 | "17 4\n", 108 | "19 3\n", 109 | "Name: count, dtype: int64" 110 | ] 111 | }, 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "# value_counts returns a series in which the values from s are \n", 119 | "# the index, the number of appearances is the value, and the\n", 120 | "# items are ordered from most common to least common. We can\n", 121 | "# then use \"head\" to get only the 2 most common values.\n", 122 | "s.value_counts().head(2)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3 (ipykernel)", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.11.6" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } 155 | -------------------------------------------------------------------------------- /chapter-01/Exercise 06 — Passenger frequency.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/plain": [ 11 | "0\n", 12 | "1 0.720772\n", 13 | "6 0.036904\n", 14 | "Name: proportion, dtype: float64" 15 | ] 16 | }, 17 | "execution_count": 1, 18 | "metadata": {}, 19 | "output_type": "execute_result" 20 | } 21 | ], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "from pandas import Series, DataFrame\n", 26 | "\n", 27 | "s = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()\n", 28 | "\n", 29 | "s.value_counts(normalize=True)[[1, 6]]" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "metadata": {}, 36 | "outputs": [ 37 | { 38 | "data": { 39 | "text/plain": [ 40 | "0\n", 41 | "1 7207\n", 42 | "2 1313\n", 43 | "5 520\n", 44 | "3 406\n", 45 | "6 369\n", 46 | "4 182\n", 47 | "0 2\n", 48 | "Name: count, dtype: int64" 49 | ] 50 | }, 51 | "execution_count": 2, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "s.value_counts()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "0\n", 69 | "1 0.720772\n", 70 | "6 0.036904\n", 71 | "Name: proportion, dtype: float64" 72 | ] 73 | }, 74 | "execution_count": 3, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "s.value_counts(normalize=True)[[1,6]]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [] 89 | } 90 | ], 91 | "metadata": { 92 | "kernelspec": { 93 | "display_name": "Python 3 (ipykernel)", 94 | "language": "python", 95 | "name": "python3" 96 | }, 97 | "language_info": { 98 | "codemirror_mode": { 99 | "name": "ipython", 100 | "version": 3 101 | }, 102 | "file_extension": 
".py", 103 | "mimetype": "text/x-python", 104 | "name": "python", 105 | "nbconvert_exporter": "python", 106 | "pygments_lexer": "ipython3", 107 | "version": "3.11.6" 108 | } 109 | }, 110 | "nbformat": 4, 111 | "nbformat_minor": 4 112 | } 113 | -------------------------------------------------------------------------------- /chapter-01/Exercise 06b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Beyond 1\n", 21 | "\n", 22 | "What are the 25%, 50% (median), and 75% quantiles for this data set? Can you guess the results before you execute the code?" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "0.25 1.0\n", 34 | "0.50 1.0\n", 35 | "0.75 2.0\n", 36 | "Name: 0, dtype: float64" 37 | ] 38 | }, 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "output_type": "execute_result" 42 | } 43 | ], 44 | "source": [ 45 | "# Since 1-passenger rides are 72% of the values, we can\n", 46 | "# guess that the 25% and 50% marks will be 1, whereas \n", 47 | "# the 75% mark will be 2 or 3, depending on how common those are.\n", 48 | "s.quantile([.25, .50, .75])" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# Beyond 2\n", 56 | "\n", 57 | "What proportion of taxi rides are for 3, 4, 5, or 6 passengers?" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "0.1477147714771477" 69 | ] 70 | }, 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "s.value_counts(normalize=True)[[3,4,5,6]].sum()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# Beyond 3\n", 85 | "\n", 86 | "Consider that you're in charge of vehicle licensing for New York taxis. Given these numbers, would more people benefit from smaller taxis that can take only one or two passengers, or larger taxis that can take five or six passengers?" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Given that a huge proportion of rides are for 1 or 2 passengers, licensing more small taxis would seem to match the needs." 
94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 3 (ipykernel)", 107 | "language": "python", 108 | "name": "python3" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.11.6" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 4 125 | } 126 | -------------------------------------------------------------------------------- /chapter-01/Exercise 07 — Long, medium, and short rides.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/taxi-distance.csv', header=None).squeeze()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "0\n", 25 | "short 5890\n", 26 | "medium 3402\n", 27 | "long 707\n", 28 | "Name: count, dtype: int64" 29 | ] 30 | }, 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "output_type": "execute_result" 34 | } 35 | ], 36 | "source": [ 37 | "pd.cut(s, \n", 38 | " bins=[0, 2, 10, s.max()], \n", 39 | " include_lowest=True,\n", 40 | " labels=['short', 'medium', 'long']).value_counts()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [] 49 | } 50 | ], 51 | "metadata": { 52 | "kernelspec": { 53 | "display_name": "Python 3 (ipykernel)", 54 | "language": "python", 55 | "name": "python3" 56 | }, 57 | "language_info": { 58 | "codemirror_mode": { 59 | "name": "ipython", 60 | "version": 3 61 | }, 62 | "file_extension": ".py", 63 | "mimetype": "text/x-python", 64 | "name": "python", 65 | "nbconvert_exporter": "python", 66 | "pygments_lexer": "ipython3", 67 | "version": "3.11.6" 68 | } 69 | }, 70 | "nbformat": 4, 71 | "nbformat_minor": 4 72 | } 73 | -------------------------------------------------------------------------------- /chapter-01/Exercise 07b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/taxi-distance.csv', header=None).squeeze()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "# Beyond 1\n", 21 | "\n", 22 | "Compare the mean and median trip distances. What does that tell you about the distribution of our data?" 
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": { 29 | "scrolled": true 30 | }, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "count 9999.000000\n", 36 | "mean 3.158511\n", 37 | "std 4.037516\n", 38 | "min 0.000000\n", 39 | "25% 1.000000\n", 40 | "50% 1.700000\n", 41 | "75% 3.300000\n", 42 | "max 64.600000\n", 43 | "Name: 0, dtype: float64" 44 | ] 45 | }, 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "output_type": "execute_result" 49 | } 50 | ], 51 | "source": [ 52 | "s.describe()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "Because the mean is significantly higher than the median, it would seem that there are some *very* long trips in our data set that are pulling the mean up. And sure enough, we see that the standard deviation is 4, but that we have at least one trip > 64 miles in length." 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "# Beyond 2\n", 67 | "\n", 68 | "How many short, medium, and long trips were there for trips that had only one passenger? Note that data for passenger count and trip length are from the same data set, meaning that the indexes are the same." 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "0\n", 80 | "short 4333\n", 81 | "medium 2387\n", 82 | "long 487\n", 83 | "Name: count, dtype: int64" 84 | ] 85 | }, 86 | "execution_count": 3, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "passenger_count = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()\n", 93 | "\n", 94 | "pd.cut(s[passenger_count == 1], \n", 95 | " bins=[s.min(), 2, 10, s.max()], \n", 96 | " include_lowest=True,\n", 97 | " labels=['short', 'medium', 'long']).value_counts()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "# Beyond 3\n", 105 | "\n", 106 | "What happens if we don't pass explicit intervals, and instead ask `pd.cut` to just create 3 bins, with `bins=3`?" 
107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "array([-0.0646 , 21.53333333, 43.06666667, 64.6 ])" 118 | ] 119 | }, 120 | "execution_count": 4, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "passenger_count = pd.read_csv('../data/taxi-passenger-count.csv', header=None).squeeze()\n", 127 | "\n", 128 | "pd.cut(s[passenger_count == 1], \n", 129 | " bins=3,\n", 130 | " labels=['short', 'medium', 'long'], retbins=True)[-1]" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 5, 136 | "metadata": { 137 | "scrolled": true 138 | }, 139 | "outputs": [ 140 | { 141 | "data": { 142 | "text/plain": [ 143 | "0\n", 144 | "short 7179\n", 145 | "medium 26\n", 146 | "long 2\n", 147 | "Name: count, dtype: int64" 148 | ] 149 | }, 150 | "execution_count": 5, 151 | "metadata": {}, 152 | "output_type": "execute_result" 153 | } 154 | ], 155 | "source": [ 156 | "pd.cut(s[passenger_count == 1], \n", 157 | " bins=3,\n", 158 | " labels=['short', 'medium', 'long']).value_counts()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "`pd.cut` took the interval from `s.min()` to `s.max()`, divided it into three equal parts, and assigned those to be `short`, `medium`, and `long`. We can see, though, that this meant our `long` category is from 43 miles to 64.6 miles -- numerically one-third of the values' interval, but only including a handful of values!" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 3 (ipykernel)", 172 | "language": "python", 173 | "name": "python3" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 3 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython3", 185 | "version": "3.12.1" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 4 190 | } 191 | -------------------------------------------------------------------------------- /chapter-02/.ipynb_checkpoints/Exercise 08 — Net revenue-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "from pandas import Series, DataFrame" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "
\n", 25 | "\n", 38 | "\n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | "
product_idnamewholesale_priceretail_pricesales
023computer500.01000100
196Python Workout35.0751000
297Pandas Workout35.075500
315banana0.51200
487sandwich3.05300
\n", 92 | "
" 93 | ], 94 | "text/plain": [ 95 | " product_id name wholesale_price retail_price sales\n", 96 | "0 23 computer 500.0 1000 100\n", 97 | "1 96 Python Workout 35.0 75 1000\n", 98 | "2 97 Pandas Workout 35.0 75 500\n", 99 | "3 15 banana 0.5 1 200\n", 100 | "4 87 sandwich 3.0 5 300" 101 | ] 102 | }, 103 | "execution_count": 2, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "df = DataFrame([{'product_id':23, 'name':'computer', 'wholesale_price': 500, \n", 110 | " 'retail_price':1000, 'sales':100},\n", 111 | " {'product_id':96, 'name':'Python Workout', 'wholesale_price': 35,\n", 112 | " 'retail_price':75, 'sales':1000},\n", 113 | " {'product_id':97, 'name':'Pandas Workout', 'wholesale_price': 35,\n", 114 | " 'retail_price':75, 'sales':500},\n", 115 | " {'product_id':15, 'name':'banana', 'wholesale_price': 0.5,\n", 116 | " 'retail_price':1, 'sales':200},\n", 117 | " {'product_id':87, 'name':'sandwich', 'wholesale_price': 3,\n", 118 | " 'retail_price':5, 'sales':300},\n", 119 | " ])\n", 120 | "\n", 121 | "df" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 3, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "110700.0" 133 | ] 134 | }, 135 | "execution_count": 3, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "((df['retail_price'] - df['wholesale_price']) * df['sales']).sum()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Python 3 (ipykernel)", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.11.6" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 4 173 | } 174 | -------------------------------------------------------------------------------- /chapter-02/.ipynb_checkpoints/Exercise 13 — Interpolation-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/nyc-temps.txt').squeeze()\n", 14 | "df = DataFrame({'temp': s, \n", 15 | " 'hour': [0,3,6,9,12,15,18,21] * 91})" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "count 728.000000\n", 27 | "mean -1.050824\n", 28 | "std 5.026357\n", 29 | "min -14.000000\n", 30 | "25% -4.000000\n", 31 | "50% 0.000000\n", 32 | "75% 2.000000\n", 33 | "max 12.000000\n", 34 | "Name: temp, dtype: float64" 35 | ] 36 | }, 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "# Get the mean + median for baseline data\n", 44 | "df['temp'].describe()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Set temp at 3 and 6 a.m. 
to NaN\n", 54 | "df.loc[\n", 55 | " df['hour'].isin([3, 6]),\n", 56 | " 'temp'\n", 57 | "] = np.NaN" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# Interpolate!\n", 67 | "df = df.interpolate()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 5, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "count 728.000000\n", 79 | "mean -1.050824\n", 80 | "std 5.026357\n", 81 | "min -14.000000\n", 82 | "25% -4.000000\n", 83 | "50% 0.000000\n", 84 | "75% 2.000000\n", 85 | "max 12.000000\n", 86 | "Name: temp, dtype: float64" 87 | ] 88 | }, 89 | "execution_count": 5, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "# Get the mean + median for when we're missing 3 and 6 a.m.\n", 96 | "df['temp'].describe()" 97 | ] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "Python 3 (ipykernel)", 103 | "language": "python", 104 | "name": "python3" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.11.6" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 4 121 | } 122 | -------------------------------------------------------------------------------- /chapter-02/.ipynb_checkpoints/Exercise 14 — Selective updating-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "s = pd.read_csv('../data/nyc-temps.txt').squeeze()\n", 13 | "df = DataFrame({'temp': s, \n", 14 | " 'hour': [0,3,6,9,12,15,18,21] * 91})" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "df.loc[\n", 24 | " df['temp'] < 0, \n", 25 | " 'temp'\n", 26 | "] = 0" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/html": [ 37 | "
\n", 38 | "\n", 51 | "\n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | "
temphour
count728.000000728.000000
mean1.43131910.500000
std2.3784246.878589
min0.0000000.000000
25%0.0000005.250000
50%0.00000010.500000
75%2.00000015.750000
max12.00000021.000000
\n", 102 | "
" 103 | ], 104 | "text/plain": [ 105 | " temp hour\n", 106 | "count 728.000000 728.000000\n", 107 | "mean 1.431319 10.500000\n", 108 | "std 2.378424 6.878589\n", 109 | "min 0.000000 0.000000\n", 110 | "25% 0.000000 5.250000\n", 111 | "50% 0.000000 10.500000\n", 112 | "75% 2.000000 15.750000\n", 113 | "max 12.000000 21.000000" 114 | ] 115 | }, 116 | "execution_count": 3, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "df.describe()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3 (ipykernel)", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.11.6" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } 155 | -------------------------------------------------------------------------------- /chapter-02/Exercise 08 — Net revenue.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import numpy as np\n", 12 | "import pandas as pd\n", 13 | "from pandas import Series, DataFrame" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "
\n", 25 | "\n", 38 | "\n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | "
product_idnamewholesale_priceretail_pricesales
023computer500.01000100
196Python Workout35.0751000
297Pandas Workout35.075500
315banana0.51200
487sandwich3.05300
\n", 92 | "
" 93 | ], 94 | "text/plain": [ 95 | " product_id name wholesale_price retail_price sales\n", 96 | "0 23 computer 500.0 1000 100\n", 97 | "1 96 Python Workout 35.0 75 1000\n", 98 | "2 97 Pandas Workout 35.0 75 500\n", 99 | "3 15 banana 0.5 1 200\n", 100 | "4 87 sandwich 3.0 5 300" 101 | ] 102 | }, 103 | "execution_count": 2, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "df = DataFrame([{'product_id':23, 'name':'computer', 'wholesale_price': 500, \n", 110 | " 'retail_price':1000, 'sales':100},\n", 111 | " {'product_id':96, 'name':'Python Workout', 'wholesale_price': 35,\n", 112 | " 'retail_price':75, 'sales':1000},\n", 113 | " {'product_id':97, 'name':'Pandas Workout', 'wholesale_price': 35,\n", 114 | " 'retail_price':75, 'sales':500},\n", 115 | " {'product_id':15, 'name':'banana', 'wholesale_price': 0.5,\n", 116 | " 'retail_price':1, 'sales':200},\n", 117 | " {'product_id':87, 'name':'sandwich', 'wholesale_price': 3,\n", 118 | " 'retail_price':5, 'sales':300},\n", 119 | " ])\n", 120 | "\n", 121 | "df" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 3, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/plain": [ 132 | "110700.0" 133 | ] 134 | }, 135 | "execution_count": 3, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "((df['retail_price'] - df['wholesale_price']) * df['sales']).sum()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Python 3 (ipykernel)", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.11.6" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 4 173 | } 174 | -------------------------------------------------------------------------------- /chapter-02/Exercise 13 — Interpolation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame\n", 12 | "\n", 13 | "s = pd.read_csv('../data/nyc-temps.txt').squeeze()\n", 14 | "df = DataFrame({'temp': s, \n", 15 | " 'hour': [0,3,6,9,12,15,18,21] * 91})" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "count 728.000000\n", 27 | "mean -1.050824\n", 28 | "std 5.026357\n", 29 | "min -14.000000\n", 30 | "25% -4.000000\n", 31 | "50% 0.000000\n", 32 | "75% 2.000000\n", 33 | "max 12.000000\n", 34 | "Name: temp, dtype: float64" 35 | ] 36 | }, 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "# Get the mean + median for baseline data\n", 44 | "df['temp'].describe()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Set temp at 3 and 6 a.m. 
to NaN\n", 54 | "df.loc[\n", 55 | " df['hour'].isin([3, 6]),\n", 56 | " 'temp'\n", 57 | "] = np.NaN" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# Interpolate!\n", 67 | "df = df.interpolate()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 5, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "data": { 77 | "text/plain": [ 78 | "count 728.000000\n", 79 | "mean -1.050824\n", 80 | "std 5.026357\n", 81 | "min -14.000000\n", 82 | "25% -4.000000\n", 83 | "50% 0.000000\n", 84 | "75% 2.000000\n", 85 | "max 12.000000\n", 86 | "Name: temp, dtype: float64" 87 | ] 88 | }, 89 | "execution_count": 5, 90 | "metadata": {}, 91 | "output_type": "execute_result" 92 | } 93 | ], 94 | "source": [ 95 | "# Get the mean + median for when we're missing 3 and 6 a.m.\n", 96 | "df['temp'].describe()" 97 | ] 98 | } 99 | ], 100 | "metadata": { 101 | "kernelspec": { 102 | "display_name": "Python 3 (ipykernel)", 103 | "language": "python", 104 | "name": "python3" 105 | }, 106 | "language_info": { 107 | "codemirror_mode": { 108 | "name": "ipython", 109 | "version": 3 110 | }, 111 | "file_extension": ".py", 112 | "mimetype": "text/x-python", 113 | "name": "python", 114 | "nbconvert_exporter": "python", 115 | "pygments_lexer": "ipython3", 116 | "version": "3.11.6" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 4 121 | } 122 | -------------------------------------------------------------------------------- /chapter-02/Exercise 14 — Selective updating.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "s = pd.read_csv('../data/nyc-temps.txt').squeeze()\n", 13 | "df = DataFrame({'temp': s, \n", 14 | " 'hour': [0,3,6,9,12,15,18,21] * 91})" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "df.loc[\n", 24 | " df['temp'] < 0, \n", 25 | " 'temp'\n", 26 | "] = 0" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/html": [ 37 | "
\n", 38 | "\n", 51 | "\n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | "
temphour
count728.000000728.000000
mean1.43131910.500000
std2.3784246.878589
min0.0000000.000000
25%0.0000005.250000
50%0.00000010.500000
75%2.00000015.750000
max12.00000021.000000
\n", 102 | "
" 103 | ], 104 | "text/plain": [ 105 | " temp hour\n", 106 | "count 728.000000 728.000000\n", 107 | "mean 1.431319 10.500000\n", 108 | "std 2.378424 6.878589\n", 109 | "min 0.000000 0.000000\n", 110 | "25% 0.000000 5.250000\n", 111 | "50% 0.000000 10.500000\n", 112 | "75% 2.000000 15.750000\n", 113 | "max 12.000000 21.000000" 114 | ] 115 | }, 116 | "execution_count": 3, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "df.describe()" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3 (ipykernel)", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.11.6" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } 155 | -------------------------------------------------------------------------------- /chapter-03/.ipynb_checkpoints/Exercise 15 — Weird taxi rides-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "df = pd.read_csv('../data/nyc_taxi_2019-01.csv',\n", 13 | " usecols=['passenger_count', 'trip_distance',\n", 14 | " 'total_amount', 'payment_type'])" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "9" 26 | ] 27 | }, 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "# How many taxi rides had more than 8 passengers?\n", 35 | "df.loc[df['passenger_count'] > 8, 'passenger_count'].count()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "117381" 47 | ] 48 | }, 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "# How many taxi rides had zero passengers?\n", 56 | "df.loc[\n", 57 | " df['passenger_count'] == 0, 'passenger_count'\n", 58 | "].count()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "5" 70 | ] 71 | }, 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "# How many taxi rides were paid for in cash, and cost more than $1,000?\n", 79 | "df.loc[\n", 80 | " (df['payment_type'] == 2) & (df['total_amount'] > 1000), \n", 81 | " 'passenger_count'\n", 82 | "].count()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "7131" 94 | ] 95 | }, 96 | "execution_count": 5, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "# How many rides cost less than 0?\n", 103 | "df.loc[\n", 104 | " 
df['total_amount'] < 0, \n", 105 | " 'total_amount'\n", 106 | "].count()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 6, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "411255" 118 | ] 119 | }, 120 | "execution_count": 6, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "# How many rides traveled a below-average distance, but cost an above-average amount?\n", 127 | "df.loc[((df['trip_distance'] < df['trip_distance'].mean()) &\n", 128 | " (df['total_amount'] > df['total_amount'].mean())), \n", 129 | " 'trip_distance'].count()" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3 (ipykernel)", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.11.6" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } 155 | -------------------------------------------------------------------------------- /chapter-03/.ipynb_checkpoints/Exercise 15b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "df = pd.read_csv('../data/nyc_taxi_2019-01.csv',\n", 13 | " usecols=['passenger_count', 'trip_distance',\n", 14 | " 'total_amount', 'payment_type'])" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# Beyond 1\n", 22 | "\n", 23 | "Repeat this exercise, but using the `query` method rather than a boolean index." 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "9" 35 | ] 36 | }, 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "# How many taxi rides had more than 8 passengers (query version)\n", 44 | "df.query('passenger_count > 8')['passenger_count'].count()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/plain": [ 55 | "117381" 56 | ] 57 | }, 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "output_type": "execute_result" 61 | } 62 | ], 63 | "source": [ 64 | "# How many taxi rides had zero passengers (query version)\n", 65 | "df.query('passenger_count == 0')['passenger_count'].count()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "5" 77 | ] 78 | }, 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "# How many taxi rides were paid for in cash, and cost more than $1,000? 
(query version)\n", 86 | "df.query('payment_type == 2 & total_amount > 1000')['payment_type'].count()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "7131" 98 | ] 99 | }, 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "# How many rides cost less than 0? (query version)\n", 107 | "df.query('total_amount < 0')['total_amount'].count()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "411255" 119 | ] 120 | }, 121 | "execution_count": 6, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "# How many rides traveled a below-average distance, but cost an above-average amount?\n", 128 | "# (query version)\n", 129 | "df.query('trip_distance < trip_distance.mean() & total_amount > total_amount.mean()')['trip_distance'].count()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "# Beyond 2\n", 137 | "\n", 138 | "How many of the rides that cost less than 0 were indeed for either a dispute (`payment_type` of 4) or a voided trip (`payment_type` of 6)?" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 7, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "2666" 150 | ] 151 | }, 152 | "execution_count": 7, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "df.loc[(df['total_amount'] < 0) & \n", 159 | " ((df['payment_type'] == 4) | \n", 160 | " (df['payment_type'] == 6)), 'total_amount'].count()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "# Beyond 3\n", 168 | "\n", 169 | "I stated above that most people pay for their taxi rides using a credit card. Show this, and find what percentages normally pay in cash vs. a credit card." 
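A brief aside on Beyond 3 (a sketch under stated assumptions, not part of the notebook: it assumes the same `../data/nyc_taxi_2019-01.csv` file and the TLC convention that `payment_type` 1 means credit card and 2 means cash): mapping the numeric codes to labels before counting can make the answer easier to read.

```python
import pandas as pd

# Assumed path, matching the notebooks in this chapter
df = pd.read_csv('../data/nyc_taxi_2019-01.csv', usecols=['payment_type'])

# Map the TLC payment codes to labels; codes other than 1 and 2 become NaN
labels = df['payment_type'].map({1: 'credit card', 2: 'cash'})

# Share of rides per label; NaN (the other codes) is dropped by default
print(labels.value_counts(normalize=True))
```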
170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/plain": [ 180 | "payment_type\n", 181 | "1 0.715464\n", 182 | "2 0.278752\n", 183 | "Name: proportion, dtype: float64" 184 | ] 185 | }, 186 | "execution_count": 8, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "# 1 == credit card\n", 193 | "# 2 == cash\n", 194 | "\n", 195 | "df['payment_type'].value_counts(normalize=True)[[1,2]]" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "Python 3 (ipykernel)", 209 | "language": "python", 210 | "name": "python3" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.11.6" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 4 227 | } 228 | -------------------------------------------------------------------------------- /chapter-03/.ipynb_checkpoints/Exercise 16 — Pandemic taxis-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "df_2019_jul = pd.read_csv('../data/nyc_taxi_2019-07.csv',\n", 13 | " usecols=['passenger_count', \n", 14 | " 'total_amount', 'payment_type'])\n", 15 | "df_2019_jul['year'] = 2019\n", 16 | "\n", 17 | "df_2020_jul = pd.read_csv('../data/nyc_taxi_2020-07.csv',\n", 18 | " usecols=['passenger_count', \n", 19 | " 'total_amount', 'payment_type'])\n", 20 | "df_2020_jul['year'] = 2020\n", 21 | "\n", 22 | "df = pd.concat([df_2019_jul, df_2020_jul])" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "5510007" 34 | ] 35 | }, 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "# How many rides were taken in 2019 vs. 2020?\n", 43 | "(\n", 44 | " df.loc[df['year'] == 2019, 'total_amount'].count() -\n", 45 | " df.loc[df['year'] == 2020, 'total_amount'].count()\n", 46 | ")\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "108848979.24000001" 58 | ] 59 | }, 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "# How much money was collected in 2019 vs. 
2020?\n", 67 | "(\n", 68 | " df.loc[df['year'] == 2019, 'total_amount'].sum() -\n", 69 | " df.loc[df['year'] == 2020, 'total_amount'].sum()\n", 70 | ")\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "0.2833900000955953" 82 | ] 83 | }, 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "# Did the proportion of trips with more than passenger change dramatically?\n", 91 | "df.loc[(df['year'] == 2019) & \n", 92 | " (df['passenger_count'] > 1), 'passenger_count'].count() / df.loc[df['year'] == 2019, 'payment_type'].count()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "0.2061513222563435" 104 | ] 105 | }, 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "df.loc[(df['year'] == 2020) & \n", 113 | " (df['passenger_count'] > 1), 'passenger_count'].count() / df.loc[df['year'] == 2020, 'payment_type'].count()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "0.2870595845428793" 125 | ] 126 | }, 127 | "execution_count": 6, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "# Did people use cash less in 2019 or 2020?\n", 134 | "df.loc[(df['year'] == 2019) & \n", 135 | " (df['payment_type'] == 2), 'payment_type'].count() / df.loc[df['year'] == 2019, 'payment_type'].count()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 7, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "0.320558865998251" 147 | ] 148 | }, 149 | "execution_count": 7, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "df.loc[(df['year'] == 2020) & \n", 156 | " (df['payment_type'] == 2), 'payment_type'].count() / df.loc[df['year'] == 2020, 'payment_type'].count()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3 (ipykernel)", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.11.6" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 4 188 | } 189 | -------------------------------------------------------------------------------- /chapter-03/.ipynb_checkpoints/Exercise 20 — Big cities-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "filename = '../data/cities.json'\n", 13 | "df = pd.read_json(filename)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | 
"outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "mean 131132.443\n", 25 | "50% 68207.000\n", 26 | "Name: population, dtype: float64" 27 | ] 28 | }, 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "# What are the mean and median populations for these 1,000 largest cities?\n", 36 | "# What does that tell us?\n", 37 | "\n", 38 | "df['population'].describe()[['mean', '50%']]" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "mean 87027.387368\n", 50 | "50% 65796.000000\n", 51 | "Name: population, dtype: float64" 52 | ] 53 | }, 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "# Along these lines: If we remove the 50 most populous cities, \n", 61 | "# what happens to the mean population? What happens to the median?\n", 62 | "df.loc[50:, 'population'].describe()[['mean', '50%']]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/html": [ 73 | "
[stripped HTML table markup omitted; it rendered the same single-row result (Anchorage, Alaska, rank 63) shown in the text/plain output that follows]
" 106 | ], 107 | "text/plain": [ 108 | " city state rank\n", 109 | "62 Anchorage Alaska 63" 110 | ] 111 | }, 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "# What is the northernmost city, and where does it rank?\n", 119 | "df.loc[df['latitude'] == df['latitude'].max(), ['city', 'state', 'rank']]" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "state\n", 131 | "California 212\n", 132 | "Name: count, dtype: int64" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "# Which state has the largest number of cities in this list?\n", 142 | "df['state'].value_counts().head(1)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/plain": [ 153 | "state\n", 154 | "Alaska 1\n", 155 | "Hawaii 1\n", 156 | "District of Columbia 1\n", 157 | "Maine 1\n", 158 | "Vermont 1\n", 159 | "Name: count, dtype: int64" 160 | ] 161 | }, 162 | "execution_count": 6, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "# Which state has the smallest number of cities in this list?\n", 169 | "df['state'].value_counts().tail(5)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3 (ipykernel)", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.11.6" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 4 201 | } 202 | -------------------------------------------------------------------------------- /chapter-03/Exercise 15 — Weird taxi rides.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "df = pd.read_csv('../data/nyc_taxi_2019-01.csv',\n", 13 | " usecols=['passenger_count', 'trip_distance',\n", 14 | " 'total_amount', 'payment_type'])" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "9" 26 | ] 27 | }, 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "# How many taxi rides had more than 8 passengers?\n", 35 | "df.loc[df['passenger_count'] > 8, 'passenger_count'].count()" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/plain": [ 46 | "117381" 47 | ] 48 | }, 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "# How many taxi rides had zero passengers?\n", 56 | "df.loc[\n", 57 | " 
df['passenger_count'] == 0, 'passenger_count'\n", 58 | "].count()" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "5" 70 | ] 71 | }, 72 | "execution_count": 4, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "# How many taxi rides were paid for in cash, and cost more than $1,000?\n", 79 | "df.loc[\n", 80 | " (df['payment_type'] == 2) & (df['total_amount'] > 1000), \n", 81 | " 'passenger_count'\n", 82 | "].count()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "7131" 94 | ] 95 | }, 96 | "execution_count": 5, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "# How many rides cost less than 0?\n", 103 | "df.loc[\n", 104 | " df['total_amount'] < 0, \n", 105 | " 'total_amount'\n", 106 | "].count()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 6, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/plain": [ 117 | "411255" 118 | ] 119 | }, 120 | "execution_count": 6, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "# How many rides traveled a below-average distance, but cost an above-average amount?\n", 127 | "df.loc[((df['trip_distance'] < df['trip_distance'].mean()) &\n", 128 | " (df['total_amount'] > df['total_amount'].mean())), \n", 129 | " 'trip_distance'].count()" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3 (ipykernel)", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.11.6" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 4 154 | } 155 | -------------------------------------------------------------------------------- /chapter-03/Exercise 15b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "df = pd.read_csv('../data/nyc_taxi_2019-01.csv',\n", 13 | " usecols=['passenger_count', 'trip_distance',\n", 14 | " 'total_amount', 'payment_type'])" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "# Beyond 1\n", 22 | "\n", 23 | "Repeat this exercise, but using the `query` method rather than a boolean index." 
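Before the query-based answers below, a minimal side-by-side sketch (assuming the same four columns loaded above) may help show that `query` and boolean indexing agree; `query` can also reference Python variables with an `@` prefix:

```python
import pandas as pd

df = pd.read_csv('../data/nyc_taxi_2019-01.csv',
                 usecols=['passenger_count', 'trip_distance',
                          'total_amount', 'payment_type'])

threshold = 8  # hypothetical cutoff, matching the exercise's "more than 8 passengers"

# Boolean-index version
count_loc = df.loc[df['passenger_count'] > threshold, 'passenger_count'].count()

# query version; @threshold refers to the Python variable defined above
count_query = df.query('passenger_count > @threshold')['passenger_count'].count()

assert count_loc == count_query
print(count_loc)
```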
24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "data": { 33 | "text/plain": [ 34 | "9" 35 | ] 36 | }, 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "output_type": "execute_result" 40 | } 41 | ], 42 | "source": [ 43 | "# How many taxi rides had more than 8 passengers (query version)\n", 44 | "df.query('passenger_count > 8')['passenger_count'].count()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/plain": [ 55 | "117381" 56 | ] 57 | }, 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "output_type": "execute_result" 61 | } 62 | ], 63 | "source": [ 64 | "# How many taxi rides had zero passengers (query version)\n", 65 | "df.query('passenger_count == 0')['passenger_count'].count()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "5" 77 | ] 78 | }, 79 | "execution_count": 4, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "# How many taxi rides were paid for in cash, and cost more than $1,000? (query version)\n", 86 | "df.query('payment_type == 2 & total_amount > 1000')['payment_type'].count()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "data": { 96 | "text/plain": [ 97 | "7131" 98 | ] 99 | }, 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "output_type": "execute_result" 103 | } 104 | ], 105 | "source": [ 106 | "# How many rides cost less than 0? (query version)\n", 107 | "df.query('total_amount < 0')['total_amount'].count()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "411255" 119 | ] 120 | }, 121 | "execution_count": 6, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "# How many rides traveled a below-average distance, but cost an above-average amount?\n", 128 | "# (query version)\n", 129 | "df.query('trip_distance < trip_distance.mean() & total_amount > total_amount.mean()')['trip_distance'].count()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "# Beyond 2\n", 137 | "\n", 138 | "How many of the rides that cost less than 0 were indeed for either a dispute (`payment_type` of 4) or a voided trip (`payment_type` of 6)?" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 7, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "2666" 150 | ] 151 | }, 152 | "execution_count": 7, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "df.loc[(df['total_amount'] < 0) & \n", 159 | " ((df['payment_type'] == 4) | \n", 160 | " (df['payment_type'] == 6)), 'total_amount'].count()" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "# Beyond 3\n", 168 | "\n", 169 | "I stated above that most people pay for their taxi rides using a credit card. Show this, and find what percentages normally pay in cash vs. a credit card." 
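As a supplementary sketch for Beyond 3 (an assumption-based variation, not the notebook's own solution), the proportions can be expressed as percentages, and the claim that credit cards dominate can be checked programmatically:

```python
import pandas as pd

df = pd.read_csv('../data/nyc_taxi_2019-01.csv', usecols=['payment_type'])

# Percentage of rides per payment_type code, sorted most common first
pct = df['payment_type'].value_counts(normalize=True).mul(100).round(2)
print(pct)

# value_counts sorts by frequency, so the first index entry is the most common code;
# in the TLC data dictionary, code 1 is credit card
assert pct.index[0] == 1
```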
170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 8, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/plain": [ 180 | "payment_type\n", 181 | "1 0.715464\n", 182 | "2 0.278752\n", 183 | "Name: proportion, dtype: float64" 184 | ] 185 | }, 186 | "execution_count": 8, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "# 1 == credit card\n", 193 | "# 2 == cash\n", 194 | "\n", 195 | "df['payment_type'].value_counts(normalize=True)[[1,2]]" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "Python 3 (ipykernel)", 209 | "language": "python", 210 | "name": "python3" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.11.6" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 4 227 | } 228 | -------------------------------------------------------------------------------- /chapter-03/Exercise 16 — Pandemic taxis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "df_2019_jul = pd.read_csv('../data/nyc_taxi_2019-07.csv',\n", 13 | " usecols=['passenger_count', \n", 14 | " 'total_amount', 'payment_type'])\n", 15 | "df_2019_jul['year'] = 2019\n", 16 | "\n", 17 | "df_2020_jul = pd.read_csv('../data/nyc_taxi_2020-07.csv',\n", 18 | " usecols=['passenger_count', \n", 19 | " 'total_amount', 'payment_type'])\n", 20 | "df_2020_jul['year'] = 2020\n", 21 | "\n", 22 | "df = pd.concat([df_2019_jul, df_2020_jul])" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "data": { 32 | "text/plain": [ 33 | "5510007" 34 | ] 35 | }, 36 | "execution_count": 2, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "# How many rides were taken in 2019 vs. 2020?\n", 43 | "(\n", 44 | " df.loc[df['year'] == 2019, 'total_amount'].count() -\n", 45 | " df.loc[df['year'] == 2020, 'total_amount'].count()\n", 46 | ")\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 3, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "108848979.24000001" 58 | ] 59 | }, 60 | "execution_count": 3, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "# How much money was collected in 2019 vs. 
2020?\n", 67 | "(\n", 68 | " df.loc[df['year'] == 2019, 'total_amount'].sum() -\n", 69 | " df.loc[df['year'] == 2020, 'total_amount'].sum()\n", 70 | ")\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | "0.2833900000955953" 82 | ] 83 | }, 84 | "execution_count": 4, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "# Did the proportion of trips with more than passenger change dramatically?\n", 91 | "df.loc[(df['year'] == 2019) & \n", 92 | " (df['passenger_count'] > 1), 'passenger_count'].count() / df.loc[df['year'] == 2019, 'payment_type'].count()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "0.2061513222563435" 104 | ] 105 | }, 106 | "execution_count": 5, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "df.loc[(df['year'] == 2020) & \n", 113 | " (df['passenger_count'] > 1), 'passenger_count'].count() / df.loc[df['year'] == 2020, 'payment_type'].count()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "0.2870595845428793" 125 | ] 126 | }, 127 | "execution_count": 6, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "# Did people use cash less in 2019 or 2020?\n", 134 | "df.loc[(df['year'] == 2019) & \n", 135 | " (df['payment_type'] == 2), 'payment_type'].count() / df.loc[df['year'] == 2019, 'payment_type'].count()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 7, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "data": { 145 | "text/plain": [ 146 | "0.320558865998251" 147 | ] 148 | }, 149 | "execution_count": 7, 150 | "metadata": {}, 151 | "output_type": "execute_result" 152 | } 153 | ], 154 | "source": [ 155 | "df.loc[(df['year'] == 2020) & \n", 156 | " (df['payment_type'] == 2), 'payment_type'].count() / df.loc[df['year'] == 2020, 'payment_type'].count()" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3 (ipykernel)", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.11.6" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 4 188 | } 189 | -------------------------------------------------------------------------------- /chapter-03/Exercise 17b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# Beyond 1\n", 19 | "\n", 20 | "Create a data frame from four other columns (`VendorID`, 
`trip_distance`, `tip_amount`, and `total_amount`), specifying the `dtype` for each. What types are most appropriate? Can you use them directly, or must you first clean the data?" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 10, 26 | "metadata": { 27 | "scrolled": true 28 | }, 29 | "outputs": [ 30 | { 31 | "name": "stderr", 32 | "output_type": "stream", 33 | "text": [ 34 | "/var/folders/rr/0mnyyv811fs5vyp22gf4fxk00000gn/T/ipykernel_72055/3862543684.py:9: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", 35 | " df.loc['VendorID'] = df['VendorID'].astype(np.int8)\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "df = pd.read_csv('../data/nyc_taxi_2020-01.csv',\n", 41 | " usecols=['VendorID', 'trip_distance', 'tip_amount', 'total_amount'],\n", 42 | " dtype={'VendorID':np.float32,\n", 43 | " 'trip_distance':np.float32, \n", 44 | " 'tip_amount':np.float32,\n", 45 | " 'total_amount':np.float32})\n", 46 | "\n", 47 | "df = df.dropna().copy()\n", 48 | "df.loc['VendorID'] = df['VendorID'].astype(np.int8)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "# Beyond 2\n", 56 | "\n", 57 | "Instead of removing `NaN` values from the `VendorID` column, set it to a new value, 3. How does that affect your specifications and cleaning of the data?" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 16, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "df = pd.read_csv('../data/nyc_taxi_2020-01.csv',\n", 67 | " usecols=['VendorID', 'trip_distance', 'tip_amount', 'total_amount'],\n", 68 | " dtype={'VendorID':np.float32, \n", 69 | " 'trip_distance':np.float32,\n", 70 | " 'tip_amount':np.float32,\n", 71 | " 'total_amount':np.float32})\n", 72 | "\n", 73 | "df['VendorID'] = df['VendorID'].fillna(3)\n", 74 | "df['VendorID'] = df['VendorID'].astype(np.int8)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "# Beyond 3\n", 82 | "\n", 83 | "We'll talk more about this in future chapters, but the `memory_usage` method allows you to see how much memory is being used by each column in a data frame. It returns a series of integers, in which the index lists the columns and the values represent the memory used by each column. Compare the memory used by the data frame with `float16` (which you've already used) and when you use `float64` instead for the final three columns." 
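The cells that follow actually load these columns as `float32`; as a hedged sketch of the `float16` vs. `float64` comparison the question asks about (it assumes re-reading `../data/nyc_taxi_2020-01.csv` twice is acceptable, which is slow but keeps the comparison simple):

```python
import numpy as np
import pandas as pd

cols = ['VendorID', 'trip_distance', 'tip_amount', 'total_amount']

def total_memory(dtype):
    # Load every requested column with the given floating-point dtype
    df = pd.read_csv('../data/nyc_taxi_2020-01.csv',
                     usecols=cols,
                     dtype={c: dtype for c in cols})
    # memory_usage returns bytes per column (plus the index); sum to get one number
    return df.memory_usage().sum()

small = total_memory(np.float16)
large = total_memory(np.float64)

# With four float columns, float64 (8 bytes) should use roughly 4x float16 (2 bytes)
print(small, large, large / small)
```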
84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 17, 89 | "metadata": { 90 | "scrolled": true 91 | }, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "83265236" 97 | ] 98 | }, 99 | "execution_count": 17, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "# Memory usage with float16\n", 106 | "df.memory_usage().sum() " 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 19, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "df = pd.read_csv('../data/nyc_taxi_2020-01.csv',\n", 116 | " usecols=['VendorID', 'trip_distance', 'tip_amount', 'total_amount'],\n", 117 | " dtype={'VendorID':np.float32, \n", 118 | " 'trip_distance':np.float32,\n", 119 | " 'tip_amount':np.float32,\n", 120 | " 'total_amount':np.float32})\n", 121 | "\n", 122 | "df['VendorID'] = df['VendorID'].fillna(3)\n", 123 | "df['VendorID'] = df['VendorID'].astype(np.int8)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "scrolled": true 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "# Memory usage with float64\n", 135 | "df.memory_usage().sum()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "# float64 uses about 3.5x the memory as float16!\n", 145 | "160125328 / 44835184" 146 | ] 147 | } 148 | ], 149 | "metadata": { 150 | "kernelspec": { 151 | "display_name": "Python 3 (ipykernel)", 152 | "language": "python", 153 | "name": "python3" 154 | }, 155 | "language_info": { 156 | "codemirror_mode": { 157 | "name": "ipython", 158 | "version": 3 159 | }, 160 | "file_extension": ".py", 161 | "mimetype": "text/x-python", 162 | "name": "python", 163 | "nbconvert_exporter": "python", 164 | "pygments_lexer": "ipython3", 165 | "version": "3.11.6" 166 | } 167 | }, 168 | "nbformat": 4, 169 | "nbformat_minor": 4 170 | } 171 | -------------------------------------------------------------------------------- /chapter-03/Exercise 20 — Big cities.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame\n", 11 | "\n", 12 | "filename = '../data/cities.json'\n", 13 | "df = pd.read_json(filename)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "mean 131132.443\n", 25 | "50% 68207.000\n", 26 | "Name: population, dtype: float64" 27 | ] 28 | }, 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "# What are the mean and median populations for these 1,000 largest cities?\n", 36 | "# What does that tell us?\n", 37 | "\n", 38 | "df['population'].describe()[['mean', '50%']]" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "mean 87027.387368\n", 50 | "50% 65796.000000\n", 51 | "Name: population, dtype: float64" 52 | ] 53 | }, 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "output_type": "execute_result" 57 | } 58 | ], 59 | "source": [ 60 | "# Along these lines: If we remove the 50 most populous cities, \n", 61 | "# what happens to the 
mean population? What happens to the median?\n", 62 | "df.loc[50:, 'population'].describe()[['mean', '50%']]" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/html": [ 73 | "
[stripped HTML table markup omitted; it rendered the same single-row result (Anchorage, Alaska, rank 63) shown in the text/plain output that follows]
" 106 | ], 107 | "text/plain": [ 108 | " city state rank\n", 109 | "62 Anchorage Alaska 63" 110 | ] 111 | }, 112 | "execution_count": 4, 113 | "metadata": {}, 114 | "output_type": "execute_result" 115 | } 116 | ], 117 | "source": [ 118 | "# What is the northernmost city, and where does it rank?\n", 119 | "df.loc[df['latitude'] == df['latitude'].max(), ['city', 'state', 'rank']]" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 5, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "state\n", 131 | "California 212\n", 132 | "Name: count, dtype: int64" 133 | ] 134 | }, 135 | "execution_count": 5, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "# Which state has the largest number of cities in this list?\n", 142 | "df['state'].value_counts().head(1)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/plain": [ 153 | "state\n", 154 | "Alaska 1\n", 155 | "Hawaii 1\n", 156 | "District of Columbia 1\n", 157 | "Maine 1\n", 158 | "Vermont 1\n", 159 | "Name: count, dtype: int64" 160 | ] 161 | }, 162 | "execution_count": 6, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "# Which state has the smallest number of cities in this list?\n", 169 | "df['state'].value_counts().tail(5)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [] 178 | } 179 | ], 180 | "metadata": { 181 | "kernelspec": { 182 | "display_name": "Python 3 (ipykernel)", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.11.6" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 4 201 | } 202 | -------------------------------------------------------------------------------- /chapter-04/.ipynb_checkpoints/Exercise 21 — Parking tickets-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/plain": [ 21 | "Index(['Plate ID', 'Registration State', 'Issue Date', 'Vehicle Make',\n", 22 | " 'Street Name', 'Date First Observed', 'Vehicle Color'],\n", 23 | " dtype='object')" 24 | ] 25 | }, 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 33 | "\n", 34 | "df = pd.read_csv(filename,\n", 35 | " usecols=['Date First Observed', 'Plate ID', 'Registration State',\n", 36 | " 'Issue Date', 'Vehicle Make', 'Street Name', 'Vehicle Color'])\n", 37 | "df.columns" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# set the index to the \"Issue Date\" column\n", 47 | "df = 
df.set_index('Issue Date')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "Vehicle Make\n", 59 | "TOYOT 3829\n", 60 | "HONDA 3593\n", 61 | "FORD 3164\n", 62 | "Name: count, dtype: int64" 63 | ] 64 | }, 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "# what three vehicle makes were most likely to be ticketed on January 2nd?\n", 72 | "df.loc['01/02/2020 12:00:00 AM', 'Vehicle Make'].value_counts().head(3)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "Street Name\n", 84 | "WB CONDUIT BLVD @ LI 1068\n", 85 | "SB WEST ST @ LEROY S 335\n", 86 | "EB HORACE HARDING EX 273\n", 87 | "EB QUEENS BLVD @ 82N 245\n", 88 | "WB ATLANTIC AVE @ CL 229\n", 89 | "Name: count, dtype: int64" 90 | ] 91 | }, 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "# On what five streets were cars most likely to be ticketed on June 1st, 2020?\n", 99 | "df.loc['06/01/2020 12:00:00 AM', 'Street Name'].value_counts().head(5)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 6, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# change the index to be \"Vehicle Color\"\n", 109 | "df = df.reset_index().set_index('Vehicle Color')" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 7, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "Vehicle Make\n", 121 | "HONDA 39353\n", 122 | "Name: count, dtype: int64" 123 | ] 124 | }, 125 | "execution_count": 7, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "# What was the most common make of ticketed cars that were either blue or red?\n", 132 | "df.loc[['BLUE', 'RED'], 'Vehicle Make'].value_counts().head(1)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3 (ipykernel)", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.11.6" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 4 164 | } 165 | -------------------------------------------------------------------------------- /chapter-04/.ipynb_checkpoints/Sandbox-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /chapter-04/Exercise 21 — Parking tickets.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas import Series, DataFrame" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 
| "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "data": { 20 | "text/plain": [ 21 | "Index(['Plate ID', 'Registration State', 'Issue Date', 'Vehicle Make',\n", 22 | " 'Street Name', 'Date First Observed', 'Vehicle Color'],\n", 23 | " dtype='object')" 24 | ] 25 | }, 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 33 | "\n", 34 | "df = pd.read_csv(filename,\n", 35 | " usecols=['Date First Observed', 'Plate ID', 'Registration State',\n", 36 | " 'Issue Date', 'Vehicle Make', 'Street Name', 'Vehicle Color'])\n", 37 | "df.columns" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# set the index to the \"Issue Date\" column\n", 47 | "df = df.set_index('Issue Date')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 4, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "Vehicle Make\n", 59 | "TOYOT 3829\n", 60 | "HONDA 3593\n", 61 | "FORD 3164\n", 62 | "Name: count, dtype: int64" 63 | ] 64 | }, 65 | "execution_count": 4, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "# what three vehicle makes were most likely to be ticketed on January 2nd?\n", 72 | "df.loc['01/02/2020 12:00:00 AM', 'Vehicle Make'].value_counts().head(3)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "Street Name\n", 84 | "WB CONDUIT BLVD @ LI 1068\n", 85 | "SB WEST ST @ LEROY S 335\n", 86 | "EB HORACE HARDING EX 273\n", 87 | "EB QUEENS BLVD @ 82N 245\n", 88 | "WB ATLANTIC AVE @ CL 229\n", 89 | "Name: count, dtype: int64" 90 | ] 91 | }, 92 | "execution_count": 5, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "# On what five streets were cars most likely to be ticketed on June 1st, 2020?\n", 99 | "df.loc['06/01/2020 12:00:00 AM', 'Street Name'].value_counts().head(5)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 6, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "# change the index to be \"Vehicle Color\"\n", 109 | "df = df.reset_index().set_index('Vehicle Color')" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 7, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "data": { 119 | "text/plain": [ 120 | "Vehicle Make\n", 121 | "HONDA 39353\n", 122 | "Name: count, dtype: int64" 123 | ] 124 | }, 125 | "execution_count": 7, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "# What was the most common make of ticketed cars that were either blue or red?\n", 132 | "df.loc[['BLUE', 'RED'], 'Vehicle Make'].value_counts().head(1)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3 (ipykernel)", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": 
"ipython3", 159 | "version": "3.11.6" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 4 164 | } 165 | -------------------------------------------------------------------------------- /chapter-06/.ipynb_checkpoints/Joining sidebar-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /chapter-09/.ipynb_checkpoints/Exercise 36b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "0 The\n", 23 | "1 Project\n", 24 | "2 Gutenberg\n", 25 | "3 EBook\n", 26 | "4 of\n", 27 | "dtype: object" 28 | ] 29 | }, 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "filename = '../data/alice-in-wonderland.txt'\n", 37 | "\n", 38 | "s = Series(open(filename).read().split())\n", 39 | "s.head()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "# Beyond 1\n", 47 | "\n", 48 | "What is the mean of all integers in Alice?" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "8030.851851851852" 60 | ] 61 | }, 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "import string\n", 69 | "\n", 70 | "(\n", 71 | " s\n", 72 | " .str\n", 73 | " .strip(string.punctuation)\n", 74 | " .loc[lambda s_: s_.str.isdigit()]\n", 75 | " .astype(int)\n", 76 | " .mean()\n", 77 | ")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# Beyond 2\n", 85 | "\n", 86 | "What words in Alice don't appear in the dictionary? Which are the five most common such words?" 
87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "words = {one_word.strip() for one_word in open('../data/words.txt')}" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 7, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "Project 83\n", 107 | "She 36\n", 108 | "Rabbit 28\n", 109 | "Queen 27\n", 110 | "Gutenberg 27\n", 111 | " ..\n", 112 | "reasons 1\n", 113 | "knocked 1\n", 114 | "curls 1\n", 115 | "From 1\n", 116 | "includes 1\n", 117 | "Name: count, Length: 758, dtype: int64" 118 | ] 119 | }, 120 | "execution_count": 7, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "(\n", 127 | " s\n", 128 | " .str.strip(string.punctuation) # Strip punctuation\n", 129 | " .loc[lambda s_: s_.str.isalpha()] # Keep only those with letters\n", 130 | " .loc[lambda s_: ~s_.isin(words)] # Now keep those *not* in the dictionary, and find the most common ones\n", 131 | " .value_counts()\n", 132 | ")" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "# Beyond 3\n", 140 | "\n", 141 | "What is the mean number of words per paragraph?" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "count 393.000000\n", 153 | "mean 32.475827\n", 154 | "std 32.428415\n", 155 | "min 0.000000\n", 156 | "25% 7.000000\n", 157 | "50% 22.000000\n", 158 | "75% 48.000000\n", 159 | "max 169.000000\n", 160 | "dtype: float64" 161 | ] 162 | }, 163 | "execution_count": 8, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "# Read the file into a series by paragraph\n", 170 | "s = Series(open(filename).read().split('\\n\\n'))\n", 171 | "\n", 172 | "# Just use describe to get min, max, and everything else\n", 173 | "(\n", 174 | " s\n", 175 | " .str\n", 176 | " .split()\n", 177 | " .str\n", 178 | " .len()\n", 179 | " .describe() \n", 180 | ")" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [] 189 | } 190 | ], 191 | "metadata": { 192 | "kernelspec": { 193 | "display_name": "Python 3 (ipykernel)", 194 | "language": "python", 195 | "name": "python3" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 3 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython3", 207 | "version": "3.11.7" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 4 212 | } 213 | -------------------------------------------------------------------------------- /chapter-09/Exercise 36b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "from pandas import Series, DataFrame" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/plain": [ 22 | "0 The\n", 23 | "1 Project\n", 24 | "2 Gutenberg\n", 25 | "3 EBook\n", 26 | "4 of\n", 27 | "dtype: 
object" 28 | ] 29 | }, 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "filename = '../data/alice-in-wonderland.txt'\n", 37 | "\n", 38 | "s = Series(open(filename).read().split())\n", 39 | "s.head()" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "# Beyond 1\n", 47 | "\n", 48 | "What is the mean of all integers in Alice?" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/plain": [ 59 | "8030.851851851852" 60 | ] 61 | }, 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "output_type": "execute_result" 65 | } 66 | ], 67 | "source": [ 68 | "import string\n", 69 | "\n", 70 | "(\n", 71 | " s\n", 72 | " .str\n", 73 | " .strip(string.punctuation)\n", 74 | " .loc[lambda s_: s_.str.isdigit()]\n", 75 | " .astype(int)\n", 76 | " .mean()\n", 77 | ")" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "# Beyond 2\n", 85 | "\n", 86 | "What words in Alice don't appear in the dictionary? Which are the five most common such words?" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "words = {one_word.strip() for one_word in open('../data/words.txt')}" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 7, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "Project 83\n", 107 | "She 36\n", 108 | "Rabbit 28\n", 109 | "Queen 27\n", 110 | "Gutenberg 27\n", 111 | " ..\n", 112 | "reasons 1\n", 113 | "knocked 1\n", 114 | "curls 1\n", 115 | "From 1\n", 116 | "includes 1\n", 117 | "Name: count, Length: 758, dtype: int64" 118 | ] 119 | }, 120 | "execution_count": 7, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "(\n", 127 | " s\n", 128 | " .str.strip(string.punctuation) # Strip punctuation\n", 129 | " .loc[lambda s_: s_.str.isalpha()] # Keep only those with letters\n", 130 | " .loc[lambda s_: ~s_.isin(words)] # Now keep those *not* in the dictionary, and find the most common ones\n", 131 | " .value_counts()\n", 132 | ")" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "# Beyond 3\n", 140 | "\n", 141 | "What is the mean number of words per paragraph?" 
142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 8, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "count 393.000000\n", 153 | "mean 32.475827\n", 154 | "std 32.428415\n", 155 | "min 0.000000\n", 156 | "25% 7.000000\n", 157 | "50% 22.000000\n", 158 | "75% 48.000000\n", 159 | "max 169.000000\n", 160 | "dtype: float64" 161 | ] 162 | }, 163 | "execution_count": 8, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "# Read the file into a series by paragraph\n", 170 | "s = Series(open(filename).read().split('\\n\\n'))\n", 171 | "\n", 172 | "# Just use describe to get min, max, and everything else\n", 173 | "(\n", 174 | " s\n", 175 | " .str\n", 176 | " .split()\n", 177 | " .str\n", 178 | " .len()\n", 179 | " .describe() \n", 180 | ")" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [] 189 | } 190 | ], 191 | "metadata": { 192 | "kernelspec": { 193 | "display_name": "Python 3 (ipykernel)", 194 | "language": "python", 195 | "name": "python3" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 3 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython3", 207 | "version": "3.11.7" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 4 212 | } 213 | -------------------------------------------------------------------------------- /chapter-10/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /chapter-10/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "f6b49bb8", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 3, 18 | "id": "720ae110", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/html": [ 24 | "
\n", 25 | "\n", 38 | "\n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | "
xyz
a725660
b293711
c541336
d942379
\n", 74 | "
" 75 | ], 76 | "text/plain": [ 77 | " x y z\n", 78 | "a 72 56 60\n", 79 | "b 29 37 11\n", 80 | "c 54 13 36\n", 81 | "d 94 23 79" 82 | ] 83 | }, 84 | "execution_count": 3, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "df = DataFrame(np.random.randint(0, 100, [4,3]),\n", 91 | " index=list('abcd'),\n", 92 | " columns=list('xyz'))\n", 93 | "df" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 4, 99 | "id": "ae5daa0d", 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "x 249\n", 106 | "y 129\n", 107 | "z 186\n", 108 | "dtype: int64" 109 | ] 110 | }, 111 | "execution_count": 4, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "df.sum()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "id": "f017d8b2", 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "a 188\n", 130 | "b 77\n", 131 | "c 103\n", 132 | "d 196\n", 133 | "dtype: int64" 134 | ] 135 | }, 136 | "execution_count": 5, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "df.sum(axis='columns')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "id": "a4ac793a", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "a 188\n", 155 | "b 77\n", 156 | "c 103\n", 157 | "d 196\n", 158 | "dtype: int64" 159 | ] 160 | }, 161 | "execution_count": 6, 162 | "metadata": {}, 163 | "output_type": "execute_result" 164 | } 165 | ], 166 | "source": [ 167 | "df.T.sum()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "id": "5b0e6ab5", 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | } 178 | ], 179 | "metadata": { 180 | "kernelspec": { 181 | "display_name": "Python 3 (ipykernel)", 182 | "language": "python", 183 | "name": "python3" 184 | }, 185 | "language_info": { 186 | "codemirror_mode": { 187 | "name": "ipython", 188 | "version": 3 189 | }, 190 | "file_extension": ".py", 191 | "mimetype": "text/x-python", 192 | "name": "python", 193 | "nbconvert_exporter": "python", 194 | "pygments_lexer": "ipython3", 195 | "version": "3.11.2" 196 | } 197 | }, 198 | "nbformat": 4, 199 | "nbformat_minor": 5 200 | } 201 | -------------------------------------------------------------------------------- /chapter-11/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /chapter-12/.gitignore: -------------------------------------------------------------------------------- 1 | parking-violations.csv 2 | parking-violations.feather 3 | parking-violations.json 4 | -------------------------------------------------------------------------------- /chapter-12/.ipynb_checkpoints/Exercise 49 — Faster reading and writing-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "75417ef5", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame\n", 13 | "import time" 14 | ] 15 | }, 16 | { 17 | 
"cell_type": "code", 18 | "execution_count": 2, 19 | "id": "2dbd4a89", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Read the full NYC parking-violations CSV file into memory.\n", 24 | "\n", 25 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 26 | "df = pd.read_csv(filename, low_memory=False)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "id": "76aa5472", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "root = 'parking-violations'\n", 37 | "write_methods = {'JSON': df.to_json,\n", 38 | " 'CSV': df.to_csv,\n", 39 | " 'feather': df.to_feather\n", 40 | " }" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "id": "0c0bb56a", 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Saving in JSON\n", 54 | "\tWriting JSON: total_time=47.94986385299126\n", 55 | "Saving in CSV\n", 56 | "\tWriting CSV: total_time=84.28116728103487\n", 57 | "Saving in feather\n", 58 | "\tWriting feather: total_time=10.2521946990164\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "for one_format, method in write_methods.items():\n", 64 | " print(f'Saving in {one_format}')\n", 65 | " start_time = time.perf_counter()\n", 66 | " write_methods[one_format](f'parking-violations.{one_format.lower()}')\n", 67 | " end_time = time.perf_counter()\n", 68 | "\n", 69 | " total_time = end_time - start_time\n", 70 | " print(f'\\tWriting {one_format}: {total_time=}') " 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 6, 76 | "id": "e20e3ab7", 77 | "metadata": { 78 | "scrolled": true 79 | }, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "parking-violations.json : 8,820,247,015\n", 86 | "parking-violations.csv : 2,440,860,181\n", 87 | "parking-violations.feather : 1,466,536,058\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "# How big are the files you've created?\n", 93 | "import glob\n", 94 | "import os\n", 95 | "\n", 96 | "for one_filename in glob.glob(f'{root}*'):\n", 97 | " print(f'{one_filename:27}: {os.stat(one_filename).st_size:,}')" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 7, 103 | "id": "258f10b7", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "read_methods = {'JSON': pd.read_json,\n", 108 | " 'CSV': pd.read_csv,\n", 109 | " 'feather': pd.read_feather\n", 110 | " }" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 8, 116 | "id": "05175935", 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "Reading from JSON\n", 124 | "\tReading JSON: total_time=512.0497572919703\n", 125 | "Reading from CSV\n" 126 | ] 127 | }, 128 | { 129 | "name": "stderr", 130 | "output_type": "stream", 131 | "text": [ 132 | "/var/folders/rr/0mnyyv811fs5vyp22gf4fxk00000gn/T/ipykernel_52668/1751173684.py:4: DtypeWarning: Columns (19,30,39,40) have mixed types. 
Specify dtype option on import or set low_memory=False.\n", 133 | " df = read_methods[one_format](f'parking-violations.{one_format.lower()}')\n" 134 | ] 135 | }, 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "\tReading CSV: total_time=44.657161787035875\n", 141 | "Reading from feather\n", 142 | "\tReading feather: total_time=13.85696751094656\n" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "for one_format, method in read_methods.items():\n", 148 | " print(f'Reading from {one_format}')\n", 149 | " start_time = time.perf_counter()\n", 150 | " df = read_methods[one_format](f'parking-violations.{one_format.lower()}')\n", 151 | " end_time = time.perf_counter()\n", 152 | "\n", 153 | " total_time = end_time - start_time\n", 154 | " print(f'\\tReading {one_format}: {total_time=}') " 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "404fa135", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3 (ipykernel)", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.12.1" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 5 187 | } 188 | -------------------------------------------------------------------------------- /chapter-12/.ipynb_checkpoints/Exercise 49b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0bb1cbb2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame\n", 13 | "import time" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "33611a51", 19 | "metadata": {}, 20 | "source": [ 21 | "# Beyond 1\n", 22 | "\n", 23 | "If we read the CSV file using the \"pyarrow\" engine, do we see any speedup? That is, can we read CSV files into memory any faster if we use a different engine?" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "id": "b95a0fc2", 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "Reading via pyarrow engine, total_time=9.923564148019068\n" 37 | ] 38 | } 39 | ], 40 | "source": [ 41 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 42 | "start_time = time.perf_counter()\n", 43 | "df = pd.read_csv(filename, engine='pyarrow')\n", 44 | "end_time = time.perf_counter()\n", 45 | "total_time = end_time - start_time\n", 46 | "print(f'Reading via pyarrow engine, {total_time=}') " 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "54c0002c", 52 | "metadata": {}, 53 | "source": [ 54 | "# Beyond 2\n", 55 | "\n", 56 | "If we specify the dtypes when reading from a CSV file, do we save any time?" 
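Aside on Beyond 2: the answer cell that follows passes dict(df.dtypes) back into read_csv. df.dtypes is a Series mapping column name to dtype, and dict() turns it into the {column: dtype} mapping that read_csv's dtype= parameter expects, so type inference can be skipped on the second read. A minimal self-contained sketch with a tiny stand-in CSV (the columns here are invented for illustration, not the real parking-violations schema):

import io
import pandas as pd

csv_data = io.StringIO("plate,feet\nABC123,2\nXYZ789,0\n")   # stand-in for the real file
df_small = pd.read_csv(csv_data)

dtypes = dict(df_small.dtypes)        # e.g. {'plate': dtype('O'), 'feet': dtype('int64')}

csv_data.seek(0)                      # rewind and re-read with the dtypes pinned
df_small = pd.read_csv(csv_data, dtype=dtypes)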
57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "id": "87edccd4", 63 | "metadata": { 64 | "scrolled": true 65 | }, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "63.521172957960516" 71 | ] 72 | }, 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "start_time = time.perf_counter()\n", 80 | "df = pd.read_csv(filename, low_memory=False,\n", 81 | " dtype=dict(df.dtypes))\n", 82 | "end_time = time.perf_counter()\n", 83 | "\n", 84 | "total_time = end_time - start_time\n", 85 | "total_time" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "id": "261622c5", 91 | "metadata": {}, 92 | "source": [ 93 | "# Beyond 3\n", 94 | "\n", 95 | "How much memory does our data frame take in as a `pandas` data frame? How much memory does it require as an Arrow table?" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "id": "14842cf1", 102 | "metadata": { 103 | "scrolled": true 104 | }, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "16,789,335,057\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "# Pandas table\n", 116 | "n = df.memory_usage(deep=True).sum()\n", 117 | "print(f'{n:,}')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "id": "947e968a", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# Arrow table\n", 128 | "import pyarrow.feather as feather\n", 129 | "read_arrow = feather.read_table('parking-violations.feather')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 6, 135 | "id": "a0d4be71", 136 | "metadata": { 137 | "scrolled": true 138 | }, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "4,309,680,899\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "n = read_arrow.nbytes\n", 150 | "print(f'{n:,}')" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "5d110443-4287-47dc-b3ef-117930957cbf", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 3 (ipykernel)", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.12.1" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 5 183 | } 184 | -------------------------------------------------------------------------------- /chapter-12/.ipynb_checkpoints/Exercise 50b — Beyond the exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0bb1cbb2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "7edd172f", 19 | "metadata": { 20 | "scrolled": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 25 | "df = pd.read_csv(filename,\n", 26 | " usecols=['Plate ID', 'Registration 
State', 'Plate Type', 'Feet From Curb',\n", 27 | " 'Vehicle Make', 'Vehicle Color'])\n", 28 | "df.columns = ['pid', 'state', 'ptype', 'make', 'color', 'feet']" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "ed955dcd", 34 | "metadata": {}, 35 | "source": [ 36 | "# Beyond 1\n", 37 | "\n", 38 | "In `df.query`, we can use the words `and` and `or`, rather than the symbols `&` and `|`, thanks to the `numexpr` library. Rewrite our final query using the words. Does this change the speed at all?" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "id": "2a1b21d5", 45 | "metadata": { 46 | "scrolled": true 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "914 ms ± 7.43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "%timeit df.query('state == \"NY\" and ptype == \"PAS\" and color == \"WHITE\" and feet > 1 and make == \"TOYOT\"')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "33611a51", 64 | "metadata": {}, 65 | "source": [ 66 | "# Beyond 2\n", 67 | "\n", 68 | "I prefer measuring distance in meters, rather than in feet. I thus want to find all of the cars that were ticketed when they were more than 1 meter from the curb. Perform this query using the traditional `df.loc` and also using `df.query`. Which one runs faster?" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "id": "b95a0fc2", 75 | "metadata": { 76 | "scrolled": true 77 | }, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "63.2 ms ± 2.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "%timeit df.loc[(df['feet'] * 0.3048) > 1]" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "id": "d3a6a8c0", 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "84.4 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "%timeit df.query('(feet * 0.3048) > 1')" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "261622c5", 112 | "metadata": {}, 113 | "source": [ 114 | "# Beyond 3\n", 115 | "\n", 116 | "What if we modify our query, such that we look for cars that are > 1 meter from the curb and the state is New York? Which query runs faster, and by how much?" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 6, 122 | "id": "f51ddb07", 123 | "metadata": { 124 | "scrolled": true 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "507 ms ± 1.65 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "%timeit df.loc[((df['feet'] * 0.3048) > 1) & (df['state'] == 'NY')]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 7, 142 | "id": "5ebb17b3", 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "314 ms ± 4.27 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "%timeit df.query('(feet * 0.3048) > 1 and state == \"NY\" ')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "ada77895", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3 (ipykernel)", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.12.1" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 5 187 | } 188 | -------------------------------------------------------------------------------- /chapter-12/Exercise 49 — Faster reading and writing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "75417ef5", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame\n", 13 | "import time" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "2dbd4a89", 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Read the full NYC parking-violations CSV file into memory.\n", 24 | "\n", 25 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 26 | "df = pd.read_csv(filename, low_memory=False)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 3, 32 | "id": "76aa5472", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "root = 'parking-violations'\n", 37 | "write_methods = {'JSON': df.to_json,\n", 38 | " 'CSV': df.to_csv,\n", 39 | " 'feather': df.to_feather\n", 40 | " }" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "id": "0c0bb56a", 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "Saving in JSON\n", 54 | "\tWriting JSON: total_time=47.94986385299126\n", 55 | "Saving in CSV\n", 56 | "\tWriting CSV: total_time=84.28116728103487\n", 57 | "Saving in feather\n", 58 | "\tWriting feather: total_time=10.2521946990164\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "for one_format, method in write_methods.items():\n", 64 | " print(f'Saving in {one_format}')\n", 65 | " start_time = time.perf_counter()\n", 66 | " write_methods[one_format](f'parking-violations.{one_format.lower()}')\n", 67 | " end_time = time.perf_counter()\n", 68 | "\n", 69 | " total_time = end_time - start_time\n", 70 | " print(f'\\tWriting {one_format}: {total_time=}') " 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 6, 76 | "id": "e20e3ab7", 77 | "metadata": { 78 | "scrolled": true 79 | }, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "parking-violations.json : 8,820,247,015\n", 86 | "parking-violations.csv : 2,440,860,181\n", 87 | "parking-violations.feather : 1,466,536,058\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "# How big are the files you've created?\n", 93 | "import glob\n", 94 | "import os\n", 95 | "\n", 96 | "for one_filename in glob.glob(f'{root}*'):\n", 97 | " print(f'{one_filename:27}: {os.stat(one_filename).st_size:,}')" 
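Aside: Exercise 49 above wraps every read and write in the same start_time / end_time / perf_counter pattern. A minimal sketch of a small context manager that factors that pattern out; the helper name timed is an invention for illustration and does not appear in the original notebooks:

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    """Print how long the wrapped block took, using a monotonic clock."""
    start = time.perf_counter()
    try:
        yield
    finally:
        print(f'{label}: {time.perf_counter() - start:.3f}s')

# Usage, for example around one of the notebook's writes:
# with timed('Writing feather'):
#     df.to_feather('parking-violations.feather')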
98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 7, 103 | "id": "258f10b7", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "read_methods = {'JSON': pd.read_json,\n", 108 | " 'CSV': pd.read_csv,\n", 109 | " 'feather': pd.read_feather\n", 110 | " }" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 8, 116 | "id": "05175935", 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "Reading from JSON\n", 124 | "\tReading JSON: total_time=512.0497572919703\n", 125 | "Reading from CSV\n" 126 | ] 127 | }, 128 | { 129 | "name": "stderr", 130 | "output_type": "stream", 131 | "text": [ 132 | "/var/folders/rr/0mnyyv811fs5vyp22gf4fxk00000gn/T/ipykernel_52668/1751173684.py:4: DtypeWarning: Columns (19,30,39,40) have mixed types. Specify dtype option on import or set low_memory=False.\n", 133 | " df = read_methods[one_format](f'parking-violations.{one_format.lower()}')\n" 134 | ] 135 | }, 136 | { 137 | "name": "stdout", 138 | "output_type": "stream", 139 | "text": [ 140 | "\tReading CSV: total_time=44.657161787035875\n", 141 | "Reading from feather\n", 142 | "\tReading feather: total_time=13.85696751094656\n" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "for one_format, method in read_methods.items():\n", 148 | " print(f'Reading from {one_format}')\n", 149 | " start_time = time.perf_counter()\n", 150 | " df = read_methods[one_format](f'parking-violations.{one_format.lower()}')\n", 151 | " end_time = time.perf_counter()\n", 152 | "\n", 153 | " total_time = end_time - start_time\n", 154 | " print(f'\\tReading {one_format}: {total_time=}') " 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "404fa135", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3 (ipykernel)", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.12.1" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 5 187 | } 188 | -------------------------------------------------------------------------------- /chapter-12/Exercise 49b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0bb1cbb2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame\n", 13 | "import time" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "33611a51", 19 | "metadata": {}, 20 | "source": [ 21 | "# Beyond 1\n", 22 | "\n", 23 | "If we read the CSV file using the \"pyarrow\" engine, do we see any speedup? That is, can we read CSV files into memory any faster if we use a different engine?" 
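Aside on Beyond 1: the pyarrow engine used in the answer below requires the pyarrow package to be installed; it reads the CSV with Arrow's multithreaded parser and then converts the result to regular NumPy-backed columns. On pandas 2.0 or later it can also be combined with dtype_backend='pyarrow' to keep Arrow-backed columns instead. A minimal sketch, assuming pandas >= 2.0 and pyarrow are available:

import pandas as pd

filename = '../data/nyc-parking-violations-2020.csv'   # path taken from the exercise

df_numpy = pd.read_csv(filename, engine='pyarrow')     # Arrow parser, NumPy-backed result

df_arrow = pd.read_csv(filename, engine='pyarrow',     # Arrow parser, Arrow-backed dtypes
                       dtype_backend='pyarrow')        # (pandas >= 2.0 only)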
24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "id": "b95a0fc2", 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "name": "stdout", 34 | "output_type": "stream", 35 | "text": [ 36 | "Reading via pyarrow engine, total_time=9.923564148019068\n" 37 | ] 38 | } 39 | ], 40 | "source": [ 41 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 42 | "start_time = time.perf_counter()\n", 43 | "df = pd.read_csv(filename, engine='pyarrow')\n", 44 | "end_time = time.perf_counter()\n", 45 | "total_time = end_time - start_time\n", 46 | "print(f'Reading via pyarrow engine, {total_time=}') " 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "54c0002c", 52 | "metadata": {}, 53 | "source": [ 54 | "# Beyond 2\n", 55 | "\n", 56 | "If we specify the dtypes when reading from a CSV file, do we save any time?" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 3, 62 | "id": "87edccd4", 63 | "metadata": { 64 | "scrolled": true 65 | }, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "63.521172957960516" 71 | ] 72 | }, 73 | "execution_count": 3, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "start_time = time.perf_counter()\n", 80 | "df = pd.read_csv(filename, low_memory=False,\n", 81 | " dtype=dict(df.dtypes))\n", 82 | "end_time = time.perf_counter()\n", 83 | "\n", 84 | "total_time = end_time - start_time\n", 85 | "total_time" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "id": "261622c5", 91 | "metadata": {}, 92 | "source": [ 93 | "# Beyond 3\n", 94 | "\n", 95 | "How much memory does our data frame take in as a `pandas` data frame? How much memory does it require as an Arrow table?" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 4, 101 | "id": "14842cf1", 102 | "metadata": { 103 | "scrolled": true 104 | }, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "16,789,335,057\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "# Pandas table\n", 116 | "n = df.memory_usage(deep=True).sum()\n", 117 | "print(f'{n:,}')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "id": "947e968a", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# Arrow table\n", 128 | "import pyarrow.feather as feather\n", 129 | "read_arrow = feather.read_table('parking-violations.feather')" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 6, 135 | "id": "a0d4be71", 136 | "metadata": { 137 | "scrolled": true 138 | }, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "4,309,680,899\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "n = read_arrow.nbytes\n", 150 | "print(f'{n:,}')" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "5d110443-4287-47dc-b3ef-117930957cbf", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 3 (ipykernel)", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.12.1" 179 | } 180 | }, 181 | "nbformat": 4, 182 | 
"nbformat_minor": 5 183 | } 184 | -------------------------------------------------------------------------------- /chapter-12/Exercise 50b — Beyond the exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0bb1cbb2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np\n", 11 | "import pandas as pd\n", 12 | "from pandas import Series, DataFrame" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 2, 18 | "id": "7edd172f", 19 | "metadata": { 20 | "scrolled": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "filename = '../data/nyc-parking-violations-2020.csv'\n", 25 | "df = pd.read_csv(filename,\n", 26 | " usecols=['Plate ID', 'Registration State', 'Plate Type', 'Feet From Curb',\n", 27 | " 'Vehicle Make', 'Vehicle Color'])\n", 28 | "df.columns = ['pid', 'state', 'ptype', 'make', 'color', 'feet']" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "ed955dcd", 34 | "metadata": {}, 35 | "source": [ 36 | "# Beyond 1\n", 37 | "\n", 38 | "In `df.query`, we can use the words `and` and `or`, rather than the symbols `&` and `|`, thanks to the `numexpr` library. Rewrite our final query using the words. Does this change the speed at all?" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "id": "2a1b21d5", 45 | "metadata": { 46 | "scrolled": true 47 | }, 48 | "outputs": [ 49 | { 50 | "name": "stdout", 51 | "output_type": "stream", 52 | "text": [ 53 | "914 ms ± 7.43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "%timeit df.query('state == \"NY\" and ptype == \"PAS\" and color == \"WHITE\" and feet > 1 and make == \"TOYOT\"')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "id": "33611a51", 64 | "metadata": {}, 65 | "source": [ 66 | "# Beyond 2\n", 67 | "\n", 68 | "I prefer measuring distance in meters, rather than in feet. I thus want to find all of the cars that were ticketed when they were more than 1 meter from the curb. Perform this query using the traditional `df.loc` and also using `df.query`. Which one runs faster?" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 4, 74 | "id": "b95a0fc2", 75 | "metadata": { 76 | "scrolled": true 77 | }, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "63.2 ms ± 2.4 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "%timeit df.loc[(df['feet'] * 0.3048) > 1]" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "id": "d3a6a8c0", 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "84.4 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "%timeit df.query('(feet * 0.3048) > 1')" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "id": "261622c5", 112 | "metadata": {}, 113 | "source": [ 114 | "# Beyond 3\n", 115 | "\n", 116 | "What if we modify our query, such that we look for cars that are > 1 meter from the curb and the state is New York? Which query runs faster, and by how much?" 
117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 6, 122 | "id": "f51ddb07", 123 | "metadata": { 124 | "scrolled": true 125 | }, 126 | "outputs": [ 127 | { 128 | "name": "stdout", 129 | "output_type": "stream", 130 | "text": [ 131 | "507 ms ± 1.65 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 132 | ] 133 | } 134 | ], 135 | "source": [ 136 | "%timeit df.loc[((df['feet'] * 0.3048) > 1) & (df['state'] == 'NY')]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 7, 142 | "id": "5ebb17b3", 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "314 ms ± 4.27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "%timeit df.query('(feet * 0.3048) > 1 and state == \"NY\" ')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "ada77895", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [] 164 | } 165 | ], 166 | "metadata": { 167 | "kernelspec": { 168 | "display_name": "Python 3 (ipykernel)", 169 | "language": "python", 170 | "name": "python3" 171 | }, 172 | "language_info": { 173 | "codemirror_mode": { 174 | "name": "ipython", 175 | "version": 3 176 | }, 177 | "file_extension": ".py", 178 | "mimetype": "text/x-python", 179 | "name": "python", 180 | "nbconvert_exporter": "python", 181 | "pygments_lexer": "ipython3", 182 | "version": "3.12.1" 183 | } 184 | }, 185 | "nbformat": 4, 186 | "nbformat_minor": 5 187 | } 188 | -------------------------------------------------------------------------------- /chapter-12/dask-worker-space/global.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reuven/pandas-workout/e79d3429f194fb05bfae4aa48f9970bd254a89b6/chapter-12/dask-worker-space/global.lock -------------------------------------------------------------------------------- /chapter-12/dask-worker-space/purge.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/reuven/pandas-workout/e79d3429f194fb05bfae4aa48f9970bd254a89b6/chapter-12/dask-worker-space/purge.lock --------------------------------------------------------------------------------