├── .gitignore
├── requirements.txt
├── README.md
├── movielens.py
├── usbaby.py
├── appb.ipynb
├── bitly_usagov.py
└── appa.ipynb


/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | 
3 | #ignore data in the datasets folder
4 | datasets/


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy
 2 | pandas
 3 | matplotlib
 4 | lxml
 5 | seaborn
 6 | statsmodels
 7 | scipy
 8 | patsy
 9 | scikit-learn
10 | beautifulsoup4
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Python for Data Analysis, 3rd Edition
 2 | 
 3 | My practice on 5 case studies in the chapter 13.
 4 | 
 5 | Materials and IPython notebooks for "Python for Data Analysis, 3rd
 6 | Edition" by Wes McKinney, published by O'Reilly Media. 
 7 | 
 8 | 
 9 | 
10 | ### Code
11 | 
12 | The code in this repository, including all code samples in the notebooks listed
13 | above, is released under the [MIT license](LICENSE-CODE). Read more at the
14 | [Open Source Initiative](https://opensource.org/licenses/MIT).
15 | 
16 | [1]: https://amzn.to/3DyLaJc
17 | [2]: https://github.com/wesm/pydata-book/tree/1st-edition
18 | [5]: https://github.com/wesm/pydata-book/tree/2nd-edition
19 | [6]: https://wesmckinney.com/book/
20 | 


--------------------------------------------------------------------------------
/movielens.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import seaborn as sns
 4 | 
 5 | mnames = ['movie_id', 'title', 'genre']
 6 | movies = pd.read_table('datasets/movielens/movies.dat', sep='::', names=mnames, header=None)
 7 | 
 8 | unames = ['user_id', 'gender', 'age', 'occupation', 'zipcode']
 9 | users = pd.read_table('datasets/movielens/users.dat', sep='::', names=unames, header=None)
10 | 
11 | rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
12 | ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::', names=rnames, header=None)
13 | 
14 | data = pd.merge(users, pd.merge(ratings, movies))
15 | 
16 | mean_rating = data.pivot_table('rating', index = "title", columns = "gender", aggfunc = 'mean')
17 | 
18 | rating_by_title = data.groupby('title').size()
19 | active_title = rating_by_title[rating_by_title >= 250]
20 | 
21 | mean_rating_active = mean_rating.iloc[active_title]
22 | top_rating_female = mean_rating_active.sort_values(by='F',ascending=False)
23 | 
24 | #rating difference between men and women
25 | mean_rating_active['dif'] = mean_rating_active['F'] - mean_rating_active['M']
26 | top_rating_dif = mean_rating_active.sort_values(by='dif', ascending=False)


--------------------------------------------------------------------------------
/usbaby.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import seaborn as sns
 4 | 
 5 | years = range(1880, 2011)
 6 | columns = ['name', 'gender', 'number']
 7 | 
 8 | baby_data = []
 9 | for year in years:
10 |     frame = pd.read_table('datasets/babynames/yob{}.txt'.format(year), sep=',', names=columns)
11 |     frame['year'] = year
12 |     baby_data.append(frame)
13 | baby_data = pd.concat(baby_data, ignore_index=True)
14 | 
15 | total_birth = baby_data.pivot_table('number', index='year', columns='gender', aggfunc='sum')
16 | total_birth.plot(title = 'male and female birth along the year')
17 | 
18 | def prop(group):
19 |     group['prop'] = group.number/group.number.sum()
20 |     return group
21 | 
22 | names = baby_data.groupby(['year','gender']).apply(prop)
23 | 
24 | def get_top1000(group):
25 |     return group.sort_values(by='number', ascending=False)[:1000]
26 | 
27 | grouped = names.groupby(['year', 'gender'])
28 | top1000 = grouped.apply(get_top1000)
29 | # Drop the group index, not needed
30 | top1000.reset_index(inplace=True, drop=True)
31 | 
32 | #name trend according to year
33 | name_boy = top1000[top1000.gender == 'M']
34 | name_girl = top1000[top1000.gender == 'F']
35 | 
36 | total_name = top1000.pivot_table('prop', index = 'year', columns='name', aggfunc = 'sum')
37 | 
38 | sub_set = total_name[['Mary', 'John', 'Harry', 'Duke', 'Jane']]
39 | sub_set.plot(subplots=True, title = 'name proportions according to year')
40 | 
41 | table = top1000.pivot_table('prop', index='year', columns='gender', aggfunc='sum')
42 | table.plot(title='top1000 name proportion')
43 | 
44 | total_birth.plot(title = 'total birth')
45 | 
46 | def group_cumsum(group, q=0.5):
47 |     group.sort_values(by='prop', ascending=False)
48 |     return group.prop.cumsum().values.searchsorted(q)
49 | 
50 | diversity = top1000.groupby(['year', 'gender']).apply(group_cumsum).unstack('gender')
51 | diversity.plot()
52 | 
53 | 
54 | 


--------------------------------------------------------------------------------
/appb.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 1,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": [
 9 |     "import numpy as np\n",
10 |     "import pandas as pd\n",
11 |     "import matplotlib.pyplot as plt\n",
12 |     "plt.rc('figure', figsize=(10, 6))\n",
13 |     "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
14 |     "pd.options.display.max_columns = 20\n",
15 |     "pd.options.display.max_rows = 20\n",
16 |     "pd.options.display.max_colwidth = 80\n",
17 |     "np.set_printoptions(precision=4, suppress=True)"
18 |    ]
19 |   },
20 |   {
21 |    "cell_type": "code",
22 |    "execution_count": 2,
23 |    "metadata": {},
24 |    "outputs": [],
25 |    "source": [
26 |     "# a very large list of strings\n",
27 |     "strings = ['foo', 'foobar', 'baz', 'qux',\n",
28 |     "           'python', 'Guido Van Rossum'] * 100000\n",
29 |     "\n",
30 |     "method1 = [x for x in strings if x.startswith('foo')]\n",
31 |     "\n",
32 |     "method2 = [x for x in strings if x[:3] == 'foo']"
33 |    ]
34 |   },
35 |   {
36 |    "cell_type": "code",
37 |    "execution_count": 3,
38 |    "metadata": {},
39 |    "outputs": [],
40 |    "source": [
41 |     "%time method1 = [x for x in strings if x.startswith('foo')]\n",
42 |     "%time method2 = [x for x in strings if x[:3] == 'foo']"
43 |    ]
44 |   },
45 |   {
46 |    "cell_type": "code",
47 |    "execution_count": 4,
48 |    "metadata": {},
49 |    "outputs": [],
50 |    "source": []
51 |   },
52 |   {
53 |    "cell_type": "code",
54 |    "execution_count": 5,
55 |    "metadata": {},
56 |    "outputs": [],
57 |    "source": [
58 |     "pd.options.display.max_rows = PREVIOUS_MAX_ROWS"
59 |    ]
60 |   }
61 |  ],
62 |  "metadata": {
63 |   "kernelspec": {
64 |    "display_name": "Python 3",
65 |    "language": "python",
66 |    "name": "python3"
67 |   },
68 |   "language_info": {
69 |    "codemirror_mode": {
70 |     "name": "ipython",
71 |     "version": 3
72 |    },
73 |    "file_extension": ".py",
74 |    "mimetype": "text/x-python",
75 |    "name": "python",
76 |    "nbconvert_exporter": "python",
77 |    "pygments_lexer": "ipython3",
78 |    "version": "3.7.6"
79 |   }
80 |  },
81 |  "nbformat": 4,
82 |  "nbformat_minor": 4
83 | }


--------------------------------------------------------------------------------
/bitly_usagov.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import matplotlib.pyplot as plt
 4 | import seaborn as sns
 5 | import json
 6 | from collections import defaultdict, Counter
 7 | 
 8 | path = "datasets/bitly_usagov/example.txt"
 9 | records = [json.loads(line) for line in open(path)]
10 | 
11 | time_zones = [rec['tz'] for rec in records if 'tz' in rec]
12 | 
13 | def get_counts1(sequence):
14 |     counts = {}
15 |     for i in sequence:
16 |         if i in counts:
17 |             counts[i] += 1
18 |         else:
19 |             counts[i] = 1
20 |     return counts
21 | 
22 | def get_counts2(sequence):
23 |     counts = defaultdict(int)
24 |     for i in sequence:
25 |         counts[i] += 1
26 |     return counts
27 | 
28 | def top_counts1(count_dict, n = 10):
29 |     counts = get_counts1(count_dict)
30 |     pairs = [(tz, i) for tz, i in counts.items()]
31 |     return pairs[:n]
32 | 
33 | def top_counts2(count_dict, n = 10):
34 |     counts = Counter(count_dict)
35 |     pairs = counts.most_common(n)
36 |     return pairs
37 | 
38 | tz_count = get_counts1(time_zones)
39 | top_10 = top_counts1(time_zones)
40 | print(tz_count['America/New_York'])
41 | print(top_10)
42 | 
43 | #using pandas
44 | frame = pd.DataFrame(records)
45 | time_zones_pd = frame['tz'].value_counts()
46 | print(time_zones_pd[:10])
47 | 
48 | subset = time_zones_pd[:10]
49 | sns.barplot(y = subset.index, x = subset.values)
50 | 
51 | result = pd.Series([a.split()[0] for a in frame.a.dropna()])
52 | browser_count = result.value_counts()
53 | print(browser_count[:10])
54 | 
55 | cframe = frame[frame.a.notnull()]
56 | cframe['os'] = np.where(cframe.a.str.contains('Windows'), 'Windows', 'Not Windows')
57 | os_count = cframe['os'].value_counts()
58 | print(os_count)
59 | 
60 | by_tz_os = cframe.groupby(['tz', 'os'])
61 | agg_counts = by_tz_os.size().unstack().fillna(0)
62 | print(agg_counts.head())
63 | 
64 | indexer = agg_counts.sum(1).argsort()
65 | print(indexer.head())
66 | 
67 | count_subset = agg_counts.take(indexer[-10:])
68 | count_subset = count_subset.stack()
69 | count_subset.name = 'total'
70 | count_subset = count_subset.reset_index()
71 | print(count_subset[:10])
72 | sns.barplot(data=count_subset, y='tz', x='total', hue='os')
73 | 
74 | def norm_total(group):
75 |     group['normed_total'] = group.total / group.total.sum()
76 |     return group
77 | 
78 | results = count_subset.groupby('tz').apply(norm_total)
79 | 
80 | sns.barplot(x='normed_total', y='tz', hue='os', data=results)
81 | 
82 | 
83 | 
84 | 
85 | 


--------------------------------------------------------------------------------
/appa.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import numpy as np\n",
 10 |     "import pandas as pd\n",
 11 |     "import matplotlib.pyplot as plt\n",
 12 |     "plt.rc('figure', figsize=(10, 6))\n",
 13 |     "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n",
 14 |     "pd.options.display.max_columns = 20\n",
 15 |     "pd.options.display.max_rows = 20\n",
 16 |     "pd.options.display.max_colwidth = 80\n",
 17 |     "np.set_printoptions(precision=4, suppress=True)"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": 2,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "rng = np.random.default_rng(seed=12345)"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": 3,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "np.ones((10, 5)).shape"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": 4,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "np.ones((3, 4, 5), dtype=np.float64).strides"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 5,
 50 |    "metadata": {},
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "ints = np.ones(10, dtype=np.uint16)\n",
 54 |     "floats = np.ones(10, dtype=np.float32)\n",
 55 |     "np.issubdtype(ints.dtype, np.integer)\n",
 56 |     "np.issubdtype(floats.dtype, np.floating)"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": 6,
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "np.float64.mro()"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": 7,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "np.issubdtype(ints.dtype, np.number)"
 75 |    ]
 76 |   },
 77 |   {
 78 |    "cell_type": "code",
 79 |    "execution_count": 8,
 80 |    "metadata": {},
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "arr = np.arange(8)\n",
 84 |     "arr\n",
 85 |     "arr.reshape((4, 2))"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": 9,
 91 |    "metadata": {},
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "arr.reshape((4, 2)).reshape((2, 4))"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": 10,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": [
103 |     "arr = np.arange(15)\n",
104 |     "arr.reshape((5, -1))"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 11,
110 |    "metadata": {},
111 |    "outputs": [],
112 |    "source": [
113 |     "other_arr = np.ones((3, 5))\n",
114 |     "other_arr.shape\n",
115 |     "arr.reshape(other_arr.shape)"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 12,
121 |    "metadata": {},
122 |    "outputs": [],
123 |    "source": [
124 |     "arr = np.arange(15).reshape((5, 3))\n",
125 |     "arr\n",
126 |     "arr.ravel()"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 13,
132 |    "metadata": {},
133 |    "outputs": [],
134 |    "source": [
135 |     "arr.flatten()"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 14,
141 |    "metadata": {},
142 |    "outputs": [],
143 |    "source": [
144 |     "arr = np.arange(12).reshape((3, 4))\n",
145 |     "arr\n",
146 |     "arr.ravel()\n",
147 |     "arr.ravel('F')"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": 15,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "arr1 = np.array([[1, 2, 3], [4, 5, 6]])\n",
157 |     "arr2 = np.array([[7, 8, 9], [10, 11, 12]])\n",
158 |     "np.concatenate([arr1, arr2], axis=0)\n",
159 |     "np.concatenate([arr1, arr2], axis=1)"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": 16,
165 |    "metadata": {},
166 |    "outputs": [],
167 |    "source": [
168 |     "np.vstack((arr1, arr2))\n",
169 |     "np.hstack((arr1, arr2))"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 17,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "arr = rng.standard_normal((5, 2))\n",
179 |     "arr\n",
180 |     "first, second, third = np.split(arr, [1, 3])\n",
181 |     "first\n",
182 |     "second\n",
183 |     "third"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": 18,
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "arr = np.arange(6)\n",
193 |     "arr1 = arr.reshape((3, 2))\n",
194 |     "arr2 = rng.standard_normal((3, 2))\n",
195 |     "np.r_[arr1, arr2]\n",
196 |     "np.c_[np.r_[arr1, arr2], arr]"
197 |    ]
198 |   },
199 |   {
200 |    "cell_type": "code",
201 |    "execution_count": 19,
202 |    "metadata": {},
203 |    "outputs": [],
204 |    "source": [
205 |     "np.c_[1:6, -10:-5]"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": 20,
211 |    "metadata": {},
212 |    "outputs": [],
213 |    "source": [
214 |     "arr = np.arange(3)\n",
215 |     "arr\n",
216 |     "arr.repeat(3)"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": 21,
222 |    "metadata": {},
223 |    "outputs": [],
224 |    "source": [
225 |     "arr.repeat([2, 3, 4])"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 22,
231 |    "metadata": {},
232 |    "outputs": [],
233 |    "source": [
234 |     "arr = rng.standard_normal((2, 2))\n",
235 |     "arr\n",
236 |     "arr.repeat(2, axis=0)"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": 23,
242 |    "metadata": {},
243 |    "outputs": [],
244 |    "source": [
245 |     "arr.repeat([2, 3], axis=0)\n",
246 |     "arr.repeat([2, 3], axis=1)"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 24,
252 |    "metadata": {},
253 |    "outputs": [],
254 |    "source": [
255 |     "arr\n",
256 |     "np.tile(arr, 2)"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "code",
261 |    "execution_count": 25,
262 |    "metadata": {},
263 |    "outputs": [],
264 |    "source": [
265 |     "arr\n",
266 |     "np.tile(arr, (2, 1))\n",
267 |     "np.tile(arr, (3, 2))"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": 26,
273 |    "metadata": {},
274 |    "outputs": [],
275 |    "source": [
276 |     "arr = np.arange(10) * 100\n",
277 |     "inds = [7, 1, 2, 6]\n",
278 |     "arr[inds]"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": 27,
284 |    "metadata": {},
285 |    "outputs": [],
286 |    "source": [
287 |     "arr.take(inds)\n",
288 |     "arr.put(inds, 42)\n",
289 |     "arr\n",
290 |     "arr.put(inds, [40, 41, 42, 43])\n",
291 |     "arr"
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "code",
296 |    "execution_count": 28,
297 |    "metadata": {},
298 |    "outputs": [],
299 |    "source": [
300 |     "inds = [2, 0, 2, 1]\n",
301 |     "arr = rng.standard_normal((2, 4))\n",
302 |     "arr\n",
303 |     "arr.take(inds, axis=1)"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": 29,
309 |    "metadata": {},
310 |    "outputs": [],
311 |    "source": [
312 |     "arr = np.arange(5)\n",
313 |     "arr\n",
314 |     "arr * 4"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": 30,
320 |    "metadata": {},
321 |    "outputs": [],
322 |    "source": [
323 |     "arr = rng.standard_normal((4, 3))\n",
324 |     "arr.mean(0)\n",
325 |     "demeaned = arr - arr.mean(0)\n",
326 |     "demeaned\n",
327 |     "demeaned.mean(0)"
328 |    ]
329 |   },
330 |   {
331 |    "cell_type": "code",
332 |    "execution_count": 31,
333 |    "metadata": {},
334 |    "outputs": [],
335 |    "source": [
336 |     "arr\n",
337 |     "row_means = arr.mean(1)\n",
338 |     "row_means.shape\n",
339 |     "row_means.reshape((4, 1))\n",
340 |     "demeaned = arr - row_means.reshape((4, 1))\n",
341 |     "demeaned.mean(1)"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": 32,
347 |    "metadata": {},
348 |    "outputs": [],
349 |    "source": [
350 |     "arr - arr.mean(1)"
351 |    ]
352 |   },
353 |   {
354 |    "cell_type": "code",
355 |    "execution_count": 33,
356 |    "metadata": {},
357 |    "outputs": [],
358 |    "source": [
359 |     "arr - arr.mean(1).reshape((4, 1))"
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "code",
364 |    "execution_count": 34,
365 |    "metadata": {},
366 |    "outputs": [],
367 |    "source": [
368 |     "arr = np.zeros((4, 4))\n",
369 |     "arr_3d = arr[:, np.newaxis, :]\n",
370 |     "arr_3d.shape\n",
371 |     "arr_1d = rng.standard_normal(3)\n",
372 |     "arr_1d[:, np.newaxis]\n",
373 |     "arr_1d[np.newaxis, :]"
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "code",
378 |    "execution_count": 35,
379 |    "metadata": {},
380 |    "outputs": [],
381 |    "source": [
382 |     "arr = rng.standard_normal((3, 4, 5))\n",
383 |     "depth_means = arr.mean(2)\n",
384 |     "depth_means\n",
385 |     "depth_means.shape\n",
386 |     "demeaned = arr - depth_means[:, :, np.newaxis]\n",
387 |     "demeaned.mean(2)"
388 |    ]
389 |   },
390 |   {
391 |    "cell_type": "code",
392 |    "execution_count": 36,
393 |    "metadata": {},
394 |    "outputs": [],
395 |    "source": [
396 |     "arr = np.zeros((4, 3))\n",
397 |     "arr[:] = 5\n",
398 |     "arr"
399 |    ]
400 |   },
401 |   {
402 |    "cell_type": "code",
403 |    "execution_count": 37,
404 |    "metadata": {},
405 |    "outputs": [],
406 |    "source": [
407 |     "col = np.array([1.28, -0.42, 0.44, 1.6])\n",
408 |     "arr[:] = col[:, np.newaxis]\n",
409 |     "arr\n",
410 |     "arr[:2] = [[-1.37], [0.509]]\n",
411 |     "arr"
412 |    ]
413 |   },
414 |   {
415 |    "cell_type": "code",
416 |    "execution_count": 38,
417 |    "metadata": {},
418 |    "outputs": [],
419 |    "source": [
420 |     "arr = np.arange(10)\n",
421 |     "np.add.reduce(arr)\n",
422 |     "arr.sum()"
423 |    ]
424 |   },
425 |   {
426 |    "cell_type": "code",
427 |    "execution_count": 39,
428 |    "metadata": {},
429 |    "outputs": [],
430 |    "source": [
431 |     "my_rng = np.random.default_rng(12346)  # for reproducibility\n",
432 |     "arr = my_rng.standard_normal((5, 5))\n",
433 |     "arr\n",
434 |     "arr[::2].sort(1) # sort a few rows\n",
435 |     "arr[:, :-1] < arr[:, 1:]\n",
436 |     "np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)"
437 |    ]
438 |   },
439 |   {
440 |    "cell_type": "code",
441 |    "execution_count": 40,
442 |    "metadata": {},
443 |    "outputs": [],
444 |    "source": [
445 |     "arr = np.arange(15).reshape((3, 5))\n",
446 |     "np.add.accumulate(arr, axis=1)"
447 |    ]
448 |   },
449 |   {
450 |    "cell_type": "code",
451 |    "execution_count": 41,
452 |    "metadata": {},
453 |    "outputs": [],
454 |    "source": [
455 |     "arr = np.arange(3).repeat([1, 2, 2])\n",
456 |     "arr\n",
457 |     "np.multiply.outer(arr, np.arange(5))"
458 |    ]
459 |   },
460 |   {
461 |    "cell_type": "code",
462 |    "execution_count": 42,
463 |    "metadata": {},
464 |    "outputs": [],
465 |    "source": [
466 |     "x, y = rng.standard_normal((3, 4)), rng.standard_normal(5)\n",
467 |     "result = np.subtract.outer(x, y)\n",
468 |     "result.shape"
469 |    ]
470 |   },
471 |   {
472 |    "cell_type": "code",
473 |    "execution_count": 43,
474 |    "metadata": {},
475 |    "outputs": [],
476 |    "source": [
477 |     "arr = np.arange(10)\n",
478 |     "np.add.reduceat(arr, [0, 5, 8])"
479 |    ]
480 |   },
481 |   {
482 |    "cell_type": "code",
483 |    "execution_count": 44,
484 |    "metadata": {},
485 |    "outputs": [],
486 |    "source": [
487 |     "arr = np.multiply.outer(np.arange(4), np.arange(5))\n",
488 |     "arr\n",
489 |     "np.add.reduceat(arr, [0, 2, 4], axis=1)"
490 |    ]
491 |   },
492 |   {
493 |    "cell_type": "code",
494 |    "execution_count": 45,
495 |    "metadata": {},
496 |    "outputs": [],
497 |    "source": [
498 |     "def add_elements(x, y):\n",
499 |     "    return x + y\n",
500 |     "add_them = np.frompyfunc(add_elements, 2, 1)\n",
501 |     "add_them(np.arange(8), np.arange(8))"
502 |    ]
503 |   },
504 |   {
505 |    "cell_type": "code",
506 |    "execution_count": 46,
507 |    "metadata": {},
508 |    "outputs": [],
509 |    "source": [
510 |     "add_them = np.vectorize(add_elements, otypes=[np.float64])\n",
511 |     "add_them(np.arange(8), np.arange(8))"
512 |    ]
513 |   },
514 |   {
515 |    "cell_type": "code",
516 |    "execution_count": 47,
517 |    "metadata": {},
518 |    "outputs": [],
519 |    "source": [
520 |     "arr = rng.standard_normal(10000)\n",
521 |     "%timeit add_them(arr, arr)\n",
522 |     "%timeit np.add(arr, arr)"
523 |    ]
524 |   },
525 |   {
526 |    "cell_type": "code",
527 |    "execution_count": 48,
528 |    "metadata": {},
529 |    "outputs": [],
530 |    "source": [
531 |     "dtype = [('x', np.float64), ('y', np.int32)]\n",
532 |     "sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)\n",
533 |     "sarr"
534 |    ]
535 |   },
536 |   {
537 |    "cell_type": "code",
538 |    "execution_count": 49,
539 |    "metadata": {},
540 |    "outputs": [],
541 |    "source": [
542 |     "sarr[0]\n",
543 |     "sarr[0]['y']"
544 |    ]
545 |   },
546 |   {
547 |    "cell_type": "code",
548 |    "execution_count": 50,
549 |    "metadata": {},
550 |    "outputs": [],
551 |    "source": [
552 |     "sarr['x']"
553 |    ]
554 |   },
555 |   {
556 |    "cell_type": "code",
557 |    "execution_count": 51,
558 |    "metadata": {},
559 |    "outputs": [],
560 |    "source": [
561 |     "dtype = [('x', np.int64, 3), ('y', np.int32)]\n",
562 |     "arr = np.zeros(4, dtype=dtype)\n",
563 |     "arr"
564 |    ]
565 |   },
566 |   {
567 |    "cell_type": "code",
568 |    "execution_count": 52,
569 |    "metadata": {},
570 |    "outputs": [],
571 |    "source": [
572 |     "arr[0]['x']"
573 |    ]
574 |   },
575 |   {
576 |    "cell_type": "code",
577 |    "execution_count": 53,
578 |    "metadata": {},
579 |    "outputs": [],
580 |    "source": [
581 |     "arr['x']"
582 |    ]
583 |   },
584 |   {
585 |    "cell_type": "code",
586 |    "execution_count": 54,
587 |    "metadata": {},
588 |    "outputs": [],
589 |    "source": [
590 |     "dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]\n",
591 |     "data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)\n",
592 |     "data['x']\n",
593 |     "data['y']\n",
594 |     "data['x']['a']"
595 |    ]
596 |   },
597 |   {
598 |    "cell_type": "code",
599 |    "execution_count": 55,
600 |    "metadata": {},
601 |    "outputs": [],
602 |    "source": [
603 |     "arr = rng.standard_normal(6)\n",
604 |     "arr.sort()\n",
605 |     "arr"
606 |    ]
607 |   },
608 |   {
609 |    "cell_type": "code",
610 |    "execution_count": 56,
611 |    "metadata": {},
612 |    "outputs": [],
613 |    "source": [
614 |     "arr = rng.standard_normal((3, 5))\n",
615 |     "arr\n",
616 |     "arr[:, 0].sort()  # Sort first column values in place\n",
617 |     "arr"
618 |    ]
619 |   },
620 |   {
621 |    "cell_type": "code",
622 |    "execution_count": 57,
623 |    "metadata": {},
624 |    "outputs": [],
625 |    "source": [
626 |     "arr = rng.standard_normal(5)\n",
627 |     "arr\n",
628 |     "np.sort(arr)\n",
629 |     "arr"
630 |    ]
631 |   },
632 |   {
633 |    "cell_type": "code",
634 |    "execution_count": 58,
635 |    "metadata": {},
636 |    "outputs": [],
637 |    "source": [
638 |     "arr = rng.standard_normal((3, 5))\n",
639 |     "arr\n",
640 |     "arr.sort(axis=1)\n",
641 |     "arr"
642 |    ]
643 |   },
644 |   {
645 |    "cell_type": "code",
646 |    "execution_count": 59,
647 |    "metadata": {},
648 |    "outputs": [],
649 |    "source": [
650 |     "arr[:, ::-1]"
651 |    ]
652 |   },
653 |   {
654 |    "cell_type": "code",
655 |    "execution_count": 60,
656 |    "metadata": {},
657 |    "outputs": [],
658 |    "source": [
659 |     "values = np.array([5, 0, 1, 3, 2])\n",
660 |     "indexer = values.argsort()\n",
661 |     "indexer\n",
662 |     "values[indexer]"
663 |    ]
664 |   },
665 |   {
666 |    "cell_type": "code",
667 |    "execution_count": 61,
668 |    "metadata": {},
669 |    "outputs": [],
670 |    "source": [
671 |     "arr = rng.standard_normal((3, 5))\n",
672 |     "arr[0] = values\n",
673 |     "arr\n",
674 |     "arr[:, arr[0].argsort()]"
675 |    ]
676 |   },
677 |   {
678 |    "cell_type": "code",
679 |    "execution_count": 62,
680 |    "metadata": {},
681 |    "outputs": [],
682 |    "source": [
683 |     "first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])\n",
684 |     "last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])\n",
685 |     "sorter = np.lexsort((first_name, last_name))\n",
686 |     "sorter\n",
687 |     "list(zip(last_name[sorter], first_name[sorter]))"
688 |    ]
689 |   },
690 |   {
691 |    "cell_type": "code",
692 |    "execution_count": 63,
693 |    "metadata": {},
694 |    "outputs": [],
695 |    "source": [
696 |     "values = np.array(['2:first', '2:second', '1:first', '1:second',\n",
697 |     "                   '1:third'])\n",
698 |     "key = np.array([2, 2, 1, 1, 1])\n",
699 |     "indexer = key.argsort(kind='mergesort')\n",
700 |     "indexer\n",
701 |     "values.take(indexer)"
702 |    ]
703 |   },
704 |   {
705 |    "cell_type": "code",
706 |    "execution_count": 64,
707 |    "metadata": {},
708 |    "outputs": [],
709 |    "source": [
710 |     "rng = np.random.default_rng(12345)\n",
711 |     "arr = rng.standard_normal(20)\n",
712 |     "arr\n",
713 |     "np.partition(arr, 3)"
714 |    ]
715 |   },
716 |   {
717 |    "cell_type": "code",
718 |    "execution_count": 65,
719 |    "metadata": {},
720 |    "outputs": [],
721 |    "source": [
722 |     "indices = np.argpartition(arr, 3)\n",
723 |     "indices\n",
724 |     "arr.take(indices)"
725 |    ]
726 |   },
727 |   {
728 |    "cell_type": "code",
729 |    "execution_count": 66,
730 |    "metadata": {},
731 |    "outputs": [],
732 |    "source": [
733 |     "arr = np.array([0, 1, 7, 12, 15])\n",
734 |     "arr.searchsorted(9)"
735 |    ]
736 |   },
737 |   {
738 |    "cell_type": "code",
739 |    "execution_count": 67,
740 |    "metadata": {},
741 |    "outputs": [],
742 |    "source": [
743 |     "arr.searchsorted([0, 8, 11, 16])"
744 |    ]
745 |   },
746 |   {
747 |    "cell_type": "code",
748 |    "execution_count": 68,
749 |    "metadata": {},
750 |    "outputs": [],
751 |    "source": [
752 |     "arr = np.array([0, 0, 0, 1, 1, 1, 1])\n",
753 |     "arr.searchsorted([0, 1])\n",
754 |     "arr.searchsorted([0, 1], side='right')"
755 |    ]
756 |   },
757 |   {
758 |    "cell_type": "code",
759 |    "execution_count": 69,
760 |    "metadata": {},
761 |    "outputs": [],
762 |    "source": [
763 |     "data = np.floor(rng.uniform(0, 10000, size=50))\n",
764 |     "bins = np.array([0, 100, 1000, 5000, 10000])\n",
765 |     "data"
766 |    ]
767 |   },
768 |   {
769 |    "cell_type": "code",
770 |    "execution_count": 70,
771 |    "metadata": {},
772 |    "outputs": [],
773 |    "source": [
774 |     "labels = bins.searchsorted(data)\n",
775 |     "labels"
776 |    ]
777 |   },
778 |   {
779 |    "cell_type": "code",
780 |    "execution_count": 71,
781 |    "metadata": {},
782 |    "outputs": [],
783 |    "source": [
784 |     "pd.Series(data).groupby(labels).mean()"
785 |    ]
786 |   },
787 |   {
788 |    "cell_type": "code",
789 |    "execution_count": 72,
790 |    "metadata": {},
791 |    "outputs": [],
792 |    "source": [
793 |     "import numpy as np\n",
794 |     "\n",
795 |     "def mean_distance(x, y):\n",
796 |     "    nx = len(x)\n",
797 |     "    result = 0.0\n",
798 |     "    count = 0\n",
799 |     "    for i in range(nx):\n",
800 |     "        result += x[i] - y[i]\n",
801 |     "        count += 1\n",
802 |     "    return result / count"
803 |    ]
804 |   },
805 |   {
806 |    "cell_type": "code",
807 |    "execution_count": 73,
808 |    "metadata": {},
809 |    "outputs": [],
810 |    "source": [
811 |     "mmap = np.memmap('mymmap', dtype='float64', mode='w+',\n",
812 |     "                 shape=(10000, 10000))\n",
813 |     "mmap"
814 |    ]
815 |   },
816 |   {
817 |    "cell_type": "code",
818 |    "execution_count": 74,
819 |    "metadata": {},
820 |    "outputs": [],
821 |    "source": [
822 |     "section = mmap[:5]"
823 |    ]
824 |   },
825 |   {
826 |    "cell_type": "code",
827 |    "execution_count": 75,
828 |    "metadata": {},
829 |    "outputs": [],
830 |    "source": [
831 |     "section[:] = rng.standard_normal((5, 10000))\n",
832 |     "mmap.flush()\n",
833 |     "mmap\n",
834 |     "del mmap"
835 |    ]
836 |   },
837 |   {
838 |    "cell_type": "code",
839 |    "execution_count": 76,
840 |    "metadata": {},
841 |    "outputs": [],
842 |    "source": [
843 |     "mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))\n",
844 |     "mmap"
845 |    ]
846 |   },
847 |   {
848 |    "cell_type": "code",
849 |    "execution_count": 77,
850 |    "metadata": {},
851 |    "outputs": [],
852 |    "source": [
853 |     "%xdel mmap\n",
854 |     "!rm mymmap"
855 |    ]
856 |   },
857 |   {
858 |    "cell_type": "code",
859 |    "execution_count": 78,
860 |    "metadata": {},
861 |    "outputs": [],
862 |    "source": [
863 |     "arr_c = np.ones((100, 10000), order='C')\n",
864 |     "arr_f = np.ones((100, 10000), order='F')\n",
865 |     "arr_c.flags\n",
866 |     "arr_f.flags\n",
867 |     "arr_f.flags.f_contiguous"
868 |    ]
869 |   },
870 |   {
871 |    "cell_type": "code",
872 |    "execution_count": 79,
873 |    "metadata": {},
874 |    "outputs": [],
875 |    "source": [
876 |     "%timeit arr_c.sum(1)\n",
877 |     "%timeit arr_f.sum(1)"
878 |    ]
879 |   },
880 |   {
881 |    "cell_type": "code",
882 |    "execution_count": 80,
883 |    "metadata": {},
884 |    "outputs": [],
885 |    "source": [
886 |     "arr_f.copy('C').flags"
887 |    ]
888 |   },
889 |   {
890 |    "cell_type": "code",
891 |    "execution_count": 81,
892 |    "metadata": {},
893 |    "outputs": [],
894 |    "source": [
895 |     "arr_c[:50].flags.contiguous\n",
896 |     "arr_c[:, :50].flags"
897 |    ]
898 |   },
899 |   {
900 |    "cell_type": "code",
901 |    "execution_count": 82,
902 |    "metadata": {},
903 |    "outputs": [],
904 |    "source": [
905 |     "%xdel arr_c\n",
906 |     "%xdel arr_f"
907 |    ]
908 |   },
909 |   {
910 |    "cell_type": "code",
911 |    "execution_count": 83,
912 |    "metadata": {},
913 |    "outputs": [],
914 |    "source": []
915 |   },
916 |   {
917 |    "cell_type": "code",
918 |    "execution_count": 84,
919 |    "metadata": {},
920 |    "outputs": [],
921 |    "source": [
922 |     "pd.options.display.max_rows = PREVIOUS_MAX_ROWS"
923 |    ]
924 |   }
925 |  ],
926 |  "metadata": {
927 |   "kernelspec": {
928 |    "display_name": "Python 3",
929 |    "language": "python",
930 |    "name": "python3"
931 |   },
932 |   "language_info": {
933 |    "codemirror_mode": {
934 |     "name": "ipython",
935 |     "version": 3
936 |    },
937 |    "file_extension": ".py",
938 |    "mimetype": "text/x-python",
939 |    "name": "python",
940 |    "nbconvert_exporter": "python",
941 |    "pygments_lexer": "ipython3",
942 |    "version": "3.7.6"
943 |   }
944 |  },
945 |  "nbformat": 4,
946 |  "nbformat_minor": 4
947 | }


--------------------------------------------------------------------------------