├── .gitignore ├── requirements.txt ├── README.md ├── movielens.py ├── usbaby.py ├── appb.ipynb ├── bitly_usagov.py └── appa.ipynb
/.gitignore: --------------------------------------------------------------------------------
1 | .ipynb_checkpoints 2 | 3 | #ignore data in the datasets folder 4 | datasets/
--------------------------------------------------------------------------------
/requirements.txt: --------------------------------------------------------------------------------
1 | numpy 2 | pandas 3 | matplotlib 4 | lxml 5 | seaborn 6 | statsmodels 7 | scipy 8 | patsy 9 | scikit-learn 10 | beautifulsoup4 11 |
--------------------------------------------------------------------------------
/README.md: --------------------------------------------------------------------------------
1 | # Python for Data Analysis, 3rd Edition 2 | 3 | My practice code for the five case studies in Chapter 13. 4 | 5 | Materials and IPython notebooks for "Python for Data Analysis, 3rd 6 | Edition" by Wes McKinney, published by O'Reilly Media. 7 | 8 | 9 | 10 | ### Code 11 | 12 | The code in this repository, including the code samples in the notebooks and 13 | scripts, is released under the [MIT license](LICENSE-CODE). Read more at the 14 | [Open Source Initiative](https://opensource.org/licenses/MIT). 15 |
--------------------------------------------------------------------------------
/movielens.py: --------------------------------------------------------------------------------
1 | import pandas as pd 2 | import numpy as np 3 | import seaborn as sns 4 | 5 | mnames = ['movie_id', 'title', 'genre'] 6 | movies = pd.read_table('datasets/movielens/movies.dat', sep='::', names=mnames, header=None, engine='python') 7 | 8 | unames = ['user_id', 'gender', 'age', 'occupation', 'zipcode'] 9 | users = pd.read_table('datasets/movielens/users.dat', sep='::', names=unames, header=None, engine='python') 10 | 11 | rnames = ['user_id', 'movie_id', 'rating', 'timestamp'] 12 | ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::', names=rnames, header=None, engine='python') 13 | 14 | data = pd.merge(users, pd.merge(ratings, movies)) 15 | 16 | mean_rating = data.pivot_table('rating', index = "title", columns = "gender", aggfunc = 'mean') 17 | 18 | rating_by_title = data.groupby('title').size() 19 | active_title = rating_by_title[rating_by_title >= 250] # titles rated at least 250 times 20 | 21 | mean_rating_active = mean_rating.loc[active_title.index] 22 | top_rating_female = mean_rating_active.sort_values(by='F', ascending=False) 23 | 24 | # rating difference between female and male viewers 25 | mean_rating_active['dif'] = mean_rating_active['F'] - mean_rating_active['M'] 26 | top_rating_dif = mean_rating_active.sort_values(by='dif', ascending=False)
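27 | 
28 | # A further step from the chapter, sketched here: measure rating disagreement by the
29 | # standard deviation of ratings per title, again restricted to the actively rated titles.
30 | rating_std_by_title = data.groupby('title')['rating'].std()
31 | rating_std_active = rating_std_by_title.loc[active_title.index]
32 | print(rating_std_active.sort_values(ascending=False)[:10])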
--------------------------------------------------------------------------------
/usbaby.py: --------------------------------------------------------------------------------
1 | import numpy as np 2 | import pandas as pd 3 | import seaborn as sns 4 | 5 | years = range(1880, 2011) 6 | columns = ['name', 'gender', 'number'] 7 | 8 | baby_data = [] 9 | for year in years: 10 | frame = pd.read_table('datasets/babynames/yob{}.txt'.format(year), sep=',', names=columns) 11 | frame['year'] = year 12 | baby_data.append(frame) 13 | baby_data = pd.concat(baby_data, ignore_index=True) 14 | 15 | total_birth = baby_data.pivot_table('number', index='year', columns='gender', aggfunc='sum') 16 | total_birth.plot(title='male and female births by year') 17 | 18 | def prop(group): 19 | group['prop'] = group.number/group.number.sum() 20 | return group 21 | 22 | names = baby_data.groupby(['year','gender'], group_keys=False).apply(prop) 23 | 24 | def get_top1000(group): 25 | return group.sort_values(by='number', ascending=False)[:1000] 26 | 27 | grouped = names.groupby(['year', 'gender']) 28 | top1000 = grouped.apply(get_top1000) 29 | # Drop the group index, not needed 30 | top1000.reset_index(inplace=True, drop=True) 31 | 32 | # name trends by year 33 | name_boy = top1000[top1000.gender == 'M'] 34 | name_girl = top1000[top1000.gender == 'F'] 35 | 36 | total_name = top1000.pivot_table('prop', index = 'year', columns='name', aggfunc = 'sum') 37 | 38 | sub_set = total_name[['Mary', 'John', 'Harry', 'Duke', 'Jane']] 39 | sub_set.plot(subplots=True, title = 'name proportion by year') 40 | 41 | table = top1000.pivot_table('prop', index='year', columns='gender', aggfunc='sum') 42 | table.plot(title='proportion of births in the top 1000 names') 43 | 44 | total_birth.plot(title = 'total births') 45 | 46 | # number of distinct names needed to cover quantile q of births (naming diversity) 47 | def group_cumsum(group, q=0.5): 48 | group = group.sort_values(by='prop', ascending=False) 49 | return group.prop.cumsum().values.searchsorted(q) + 1 50 | 51 | diversity = top1000.groupby(['year', 'gender']).apply(group_cumsum).unstack('gender') 52 | diversity.plot() 53 | 
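54 | # A further exploration sketched from the chapter: the distribution of final letters,
55 | # aggregated for a few representative years, gives another view of long-run naming trends.
56 | last_letters = baby_data.name.map(lambda x: x[-1])
57 | letter_table = baby_data.pivot_table('number', index=last_letters, columns=['gender', 'year'], aggfunc='sum')
58 | subtable = letter_table.reindex(columns=[1910, 1960, 2010], level='year')
59 | letter_prop = subtable / subtable.sum()
60 | letter_prop['M'].plot(kind='bar', rot=0, title='proportion of boy names ending in each letter')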
"nbformat_minor": 4 83 | } -------------------------------------------------------------------------------- /bitly_usagov.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | import json 6 | from collections import defaultdict, Counter 7 | 8 | path = "datasets/bitly_usagov/example.txt" 9 | records = [json.loads(line) for line in open(path)] 10 | 11 | time_zones = [rec['tz'] for rec in records if 'tz' in rec] 12 | 13 | def get_counts1(sequence): 14 | counts = {} 15 | for i in sequence: 16 | if i in counts: 17 | counts[i] += 1 18 | else: 19 | counts[i] = 1 20 | return counts 21 | 22 | def get_counts2(sequence): 23 | counts = defaultdict(int) 24 | for i in sequence: 25 | counts[i] += 1 26 | return counts 27 | 28 | def top_counts1(count_dict, n = 10): 29 | counts = get_counts1(count_dict) 30 | pairs = [(tz, i) for tz, i in counts.items()] 31 | return pairs[:n] 32 | 33 | def top_counts2(count_dict, n = 10): 34 | counts = Counter(count_dict) 35 | pairs = counts.most_common(n) 36 | return pairs 37 | 38 | tz_count = get_counts1(time_zones) 39 | top_10 = top_counts1(time_zones) 40 | print(tz_count['America/New_York']) 41 | print(top_10) 42 | 43 | #using pandas 44 | frame = pd.DataFrame(records) 45 | time_zones_pd = frame['tz'].value_counts() 46 | print(time_zones_pd[:10]) 47 | 48 | subset = time_zones_pd[:10] 49 | sns.barplot(y = subset.index, x = subset.values) 50 | 51 | result = pd.Series([a.split()[0] for a in frame.a.dropna()]) 52 | browser_count = result.value_counts() 53 | print(browser_count[:10]) 54 | 55 | cframe = frame[frame.a.notnull()] 56 | cframe['os'] = np.where(cframe.a.str.contains('Windows'), 'Windows', 'Not Windows') 57 | os_count = cframe['os'].value_counts() 58 | print(os_count) 59 | 60 | by_tz_os = cframe.groupby(['tz', 'os']) 61 | agg_counts = by_tz_os.size().unstack().fillna(0) 62 | print(agg_counts.head()) 63 | 64 | indexer = agg_counts.sum(1).argsort() 65 | print(indexer.head()) 66 | 67 | count_subset = agg_counts.take(indexer[-10:]) 68 | count_subset = count_subset.stack() 69 | count_subset.name = 'total' 70 | count_subset = count_subset.reset_index() 71 | print(count_subset[:10]) 72 | sns.barplot(data=count_subset, y='tz', x='total', hue='os') 73 | 74 | def norm_total(group): 75 | group['normed_total'] = group.total / group.total.sum() 76 | return group 77 | 78 | results = count_subset.groupby('tz').apply(norm_total) 79 | 80 | sns.barplot(x='normed_total', y='tz', hue='os', data=results) 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /appa.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "plt.rc('figure', figsize=(10, 6))\n", 13 | "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n", 14 | "pd.options.display.max_columns = 20\n", 15 | "pd.options.display.max_rows = 20\n", 16 | "pd.options.display.max_colwidth = 80\n", 17 | "np.set_printoptions(precision=4, suppress=True)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "rng = np.random.default_rng(seed=12345)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | 
"execution_count": 3, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "np.ones((10, 5)).shape" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 4, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "np.ones((3, 4, 5), dtype=np.float64).strides" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 5, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "ints = np.ones(10, dtype=np.uint16)\n", 54 | "floats = np.ones(10, dtype=np.float32)\n", 55 | "np.issubdtype(ints.dtype, np.integer)\n", 56 | "np.issubdtype(floats.dtype, np.floating)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 6, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "np.float64.mro()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 7, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "np.issubdtype(ints.dtype, np.number)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 8, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "arr = np.arange(8)\n", 84 | "arr\n", 85 | "arr.reshape((4, 2))" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 9, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "arr.reshape((4, 2)).reshape((2, 4))" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 10, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "arr = np.arange(15)\n", 104 | "arr.reshape((5, -1))" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 11, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "other_arr = np.ones((3, 5))\n", 114 | "other_arr.shape\n", 115 | "arr.reshape(other_arr.shape)" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 12, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "arr = np.arange(15).reshape((5, 3))\n", 125 | "arr\n", 126 | "arr.ravel()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 13, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "arr.flatten()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 14, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "arr = np.arange(12).reshape((3, 4))\n", 145 | "arr\n", 146 | "arr.ravel()\n", 147 | "arr.ravel('F')" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 15, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "arr1 = np.array([[1, 2, 3], [4, 5, 6]])\n", 157 | "arr2 = np.array([[7, 8, 9], [10, 11, 12]])\n", 158 | "np.concatenate([arr1, arr2], axis=0)\n", 159 | "np.concatenate([arr1, arr2], axis=1)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 16, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "np.vstack((arr1, arr2))\n", 169 | "np.hstack((arr1, arr2))" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 17, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "arr = rng.standard_normal((5, 2))\n", 179 | "arr\n", 180 | "first, second, third = np.split(arr, [1, 3])\n", 181 | "first\n", 182 | "second\n", 183 | "third" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 18, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "arr = np.arange(6)\n", 193 | "arr1 = arr.reshape((3, 2))\n", 194 | "arr2 = rng.standard_normal((3, 2))\n", 195 | "np.r_[arr1, arr2]\n", 
196 | "np.c_[np.r_[arr1, arr2], arr]" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 19, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "np.c_[1:6, -10:-5]" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 20, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "arr = np.arange(3)\n", 215 | "arr\n", 216 | "arr.repeat(3)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 21, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "arr.repeat([2, 3, 4])" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 22, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "arr = rng.standard_normal((2, 2))\n", 235 | "arr\n", 236 | "arr.repeat(2, axis=0)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 23, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "arr.repeat([2, 3], axis=0)\n", 246 | "arr.repeat([2, 3], axis=1)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 24, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "arr\n", 256 | "np.tile(arr, 2)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 25, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "arr\n", 266 | "np.tile(arr, (2, 1))\n", 267 | "np.tile(arr, (3, 2))" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 26, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "arr = np.arange(10) * 100\n", 277 | "inds = [7, 1, 2, 6]\n", 278 | "arr[inds]" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 27, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "arr.take(inds)\n", 288 | "arr.put(inds, 42)\n", 289 | "arr\n", 290 | "arr.put(inds, [40, 41, 42, 43])\n", 291 | "arr" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 28, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "inds = [2, 0, 2, 1]\n", 301 | "arr = rng.standard_normal((2, 4))\n", 302 | "arr\n", 303 | "arr.take(inds, axis=1)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 29, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "arr = np.arange(5)\n", 313 | "arr\n", 314 | "arr * 4" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 30, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "arr = rng.standard_normal((4, 3))\n", 324 | "arr.mean(0)\n", 325 | "demeaned = arr - arr.mean(0)\n", 326 | "demeaned\n", 327 | "demeaned.mean(0)" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 31, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "arr\n", 337 | "row_means = arr.mean(1)\n", 338 | "row_means.shape\n", 339 | "row_means.reshape((4, 1))\n", 340 | "demeaned = arr - row_means.reshape((4, 1))\n", 341 | "demeaned.mean(1)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 32, 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "arr - arr.mean(1)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 33, 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "arr - arr.mean(1).reshape((4, 1))" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 34, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "arr = np.zeros((4, 4))\n", 
369 | "arr_3d = arr[:, np.newaxis, :]\n", 370 | "arr_3d.shape\n", 371 | "arr_1d = rng.standard_normal(3)\n", 372 | "arr_1d[:, np.newaxis]\n", 373 | "arr_1d[np.newaxis, :]" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": 35, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "arr = rng.standard_normal((3, 4, 5))\n", 383 | "depth_means = arr.mean(2)\n", 384 | "depth_means\n", 385 | "depth_means.shape\n", 386 | "demeaned = arr - depth_means[:, :, np.newaxis]\n", 387 | "demeaned.mean(2)" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 36, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "arr = np.zeros((4, 3))\n", 397 | "arr[:] = 5\n", 398 | "arr" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 37, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "col = np.array([1.28, -0.42, 0.44, 1.6])\n", 408 | "arr[:] = col[:, np.newaxis]\n", 409 | "arr\n", 410 | "arr[:2] = [[-1.37], [0.509]]\n", 411 | "arr" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 38, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "arr = np.arange(10)\n", 421 | "np.add.reduce(arr)\n", 422 | "arr.sum()" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 39, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "my_rng = np.random.default_rng(12346) # for reproducibility\n", 432 | "arr = my_rng.standard_normal((5, 5))\n", 433 | "arr\n", 434 | "arr[::2].sort(1) # sort a few rows\n", 435 | "arr[:, :-1] < arr[:, 1:]\n", 436 | "np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 40, 442 | "metadata": {}, 443 | "outputs": [], 444 | "source": [ 445 | "arr = np.arange(15).reshape((3, 5))\n", 446 | "np.add.accumulate(arr, axis=1)" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 41, 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [ 455 | "arr = np.arange(3).repeat([1, 2, 2])\n", 456 | "arr\n", 457 | "np.multiply.outer(arr, np.arange(5))" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": 42, 463 | "metadata": {}, 464 | "outputs": [], 465 | "source": [ 466 | "x, y = rng.standard_normal((3, 4)), rng.standard_normal(5)\n", 467 | "result = np.subtract.outer(x, y)\n", 468 | "result.shape" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 43, 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "arr = np.arange(10)\n", 478 | "np.add.reduceat(arr, [0, 5, 8])" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 44, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "arr = np.multiply.outer(np.arange(4), np.arange(5))\n", 488 | "arr\n", 489 | "np.add.reduceat(arr, [0, 2, 4], axis=1)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 45, 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "def add_elements(x, y):\n", 499 | " return x + y\n", 500 | "add_them = np.frompyfunc(add_elements, 2, 1)\n", 501 | "add_them(np.arange(8), np.arange(8))" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 46, 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [ 510 | "add_them = np.vectorize(add_elements, otypes=[np.float64])\n", 511 | "add_them(np.arange(8), np.arange(8))" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | 
"execution_count": 47, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "arr = rng.standard_normal(10000)\n", 521 | "%timeit add_them(arr, arr)\n", 522 | "%timeit np.add(arr, arr)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 48, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "dtype = [('x', np.float64), ('y', np.int32)]\n", 532 | "sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)\n", 533 | "sarr" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 49, 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "sarr[0]\n", 543 | "sarr[0]['y']" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 50, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "sarr['x']" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 51, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "dtype = [('x', np.int64, 3), ('y', np.int32)]\n", 562 | "arr = np.zeros(4, dtype=dtype)\n", 563 | "arr" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": 52, 569 | "metadata": {}, 570 | "outputs": [], 571 | "source": [ 572 | "arr[0]['x']" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 53, 578 | "metadata": {}, 579 | "outputs": [], 580 | "source": [ 581 | "arr['x']" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 54, 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]\n", 591 | "data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)\n", 592 | "data['x']\n", 593 | "data['y']\n", 594 | "data['x']['a']" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 55, 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [ 603 | "arr = rng.standard_normal(6)\n", 604 | "arr.sort()\n", 605 | "arr" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": 56, 611 | "metadata": {}, 612 | "outputs": [], 613 | "source": [ 614 | "arr = rng.standard_normal((3, 5))\n", 615 | "arr\n", 616 | "arr[:, 0].sort() # Sort first column values in place\n", 617 | "arr" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 57, 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "arr = rng.standard_normal(5)\n", 627 | "arr\n", 628 | "np.sort(arr)\n", 629 | "arr" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 58, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "arr = rng.standard_normal((3, 5))\n", 639 | "arr\n", 640 | "arr.sort(axis=1)\n", 641 | "arr" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 59, 647 | "metadata": {}, 648 | "outputs": [], 649 | "source": [ 650 | "arr[:, ::-1]" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": 60, 656 | "metadata": {}, 657 | "outputs": [], 658 | "source": [ 659 | "values = np.array([5, 0, 1, 3, 2])\n", 660 | "indexer = values.argsort()\n", 661 | "indexer\n", 662 | "values[indexer]" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": 61, 668 | "metadata": {}, 669 | "outputs": [], 670 | "source": [ 671 | "arr = rng.standard_normal((3, 5))\n", 672 | "arr[0] = values\n", 673 | "arr\n", 674 | "arr[:, arr[0].argsort()]" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": 62, 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [ 683 | 
"first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])\n", 684 | "last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])\n", 685 | "sorter = np.lexsort((first_name, last_name))\n", 686 | "sorter\n", 687 | "list(zip(last_name[sorter], first_name[sorter]))" 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": 63, 693 | "metadata": {}, 694 | "outputs": [], 695 | "source": [ 696 | "values = np.array(['2:first', '2:second', '1:first', '1:second',\n", 697 | " '1:third'])\n", 698 | "key = np.array([2, 2, 1, 1, 1])\n", 699 | "indexer = key.argsort(kind='mergesort')\n", 700 | "indexer\n", 701 | "values.take(indexer)" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 64, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "rng = np.random.default_rng(12345)\n", 711 | "arr = rng.standard_normal(20)\n", 712 | "arr\n", 713 | "np.partition(arr, 3)" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": 65, 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "indices = np.argpartition(arr, 3)\n", 723 | "indices\n", 724 | "arr.take(indices)" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": 66, 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [ 733 | "arr = np.array([0, 1, 7, 12, 15])\n", 734 | "arr.searchsorted(9)" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": 67, 740 | "metadata": {}, 741 | "outputs": [], 742 | "source": [ 743 | "arr.searchsorted([0, 8, 11, 16])" 744 | ] 745 | }, 746 | { 747 | "cell_type": "code", 748 | "execution_count": 68, 749 | "metadata": {}, 750 | "outputs": [], 751 | "source": [ 752 | "arr = np.array([0, 0, 0, 1, 1, 1, 1])\n", 753 | "arr.searchsorted([0, 1])\n", 754 | "arr.searchsorted([0, 1], side='right')" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": 69, 760 | "metadata": {}, 761 | "outputs": [], 762 | "source": [ 763 | "data = np.floor(rng.uniform(0, 10000, size=50))\n", 764 | "bins = np.array([0, 100, 1000, 5000, 10000])\n", 765 | "data" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": 70, 771 | "metadata": {}, 772 | "outputs": [], 773 | "source": [ 774 | "labels = bins.searchsorted(data)\n", 775 | "labels" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": 71, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [ 784 | "pd.Series(data).groupby(labels).mean()" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": 72, 790 | "metadata": {}, 791 | "outputs": [], 792 | "source": [ 793 | "import numpy as np\n", 794 | "\n", 795 | "def mean_distance(x, y):\n", 796 | " nx = len(x)\n", 797 | " result = 0.0\n", 798 | " count = 0\n", 799 | " for i in range(nx):\n", 800 | " result += x[i] - y[i]\n", 801 | " count += 1\n", 802 | " return result / count" 803 | ] 804 | }, 805 | { 806 | "cell_type": "code", 807 | "execution_count": 73, 808 | "metadata": {}, 809 | "outputs": [], 810 | "source": [ 811 | "mmap = np.memmap('mymmap', dtype='float64', mode='w+',\n", 812 | " shape=(10000, 10000))\n", 813 | "mmap" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": 74, 819 | "metadata": {}, 820 | "outputs": [], 821 | "source": [ 822 | "section = mmap[:5]" 823 | ] 824 | }, 825 | { 826 | "cell_type": "code", 827 | "execution_count": 75, 828 | "metadata": {}, 829 | "outputs": [], 830 | "source": [ 831 | "section[:] = rng.standard_normal((5, 10000))\n", 832 | 
"mmap.flush()\n", 833 | "mmap\n", 834 | "del mmap" 835 | ] 836 | }, 837 | { 838 | "cell_type": "code", 839 | "execution_count": 76, 840 | "metadata": {}, 841 | "outputs": [], 842 | "source": [ 843 | "mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))\n", 844 | "mmap" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 77, 850 | "metadata": {}, 851 | "outputs": [], 852 | "source": [ 853 | "%xdel mmap\n", 854 | "!rm mymmap" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": 78, 860 | "metadata": {}, 861 | "outputs": [], 862 | "source": [ 863 | "arr_c = np.ones((100, 10000), order='C')\n", 864 | "arr_f = np.ones((100, 10000), order='F')\n", 865 | "arr_c.flags\n", 866 | "arr_f.flags\n", 867 | "arr_f.flags.f_contiguous" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": 79, 873 | "metadata": {}, 874 | "outputs": [], 875 | "source": [ 876 | "%timeit arr_c.sum(1)\n", 877 | "%timeit arr_f.sum(1)" 878 | ] 879 | }, 880 | { 881 | "cell_type": "code", 882 | "execution_count": 80, 883 | "metadata": {}, 884 | "outputs": [], 885 | "source": [ 886 | "arr_f.copy('C').flags" 887 | ] 888 | }, 889 | { 890 | "cell_type": "code", 891 | "execution_count": 81, 892 | "metadata": {}, 893 | "outputs": [], 894 | "source": [ 895 | "arr_c[:50].flags.contiguous\n", 896 | "arr_c[:, :50].flags" 897 | ] 898 | }, 899 | { 900 | "cell_type": "code", 901 | "execution_count": 82, 902 | "metadata": {}, 903 | "outputs": [], 904 | "source": [ 905 | "%xdel arr_c\n", 906 | "%xdel arr_f" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": 83, 912 | "metadata": {}, 913 | "outputs": [], 914 | "source": [] 915 | }, 916 | { 917 | "cell_type": "code", 918 | "execution_count": 84, 919 | "metadata": {}, 920 | "outputs": [], 921 | "source": [ 922 | "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" 923 | ] 924 | } 925 | ], 926 | "metadata": { 927 | "kernelspec": { 928 | "display_name": "Python 3", 929 | "language": "python", 930 | "name": "python3" 931 | }, 932 | "language_info": { 933 | "codemirror_mode": { 934 | "name": "ipython", 935 | "version": 3 936 | }, 937 | "file_extension": ".py", 938 | "mimetype": "text/x-python", 939 | "name": "python", 940 | "nbconvert_exporter": "python", 941 | "pygments_lexer": "ipython3", 942 | "version": "3.7.6" 943 | } 944 | }, 945 | "nbformat": 4, 946 | "nbformat_minor": 4 947 | } --------------------------------------------------------------------------------