├── pandas 09 - Pandas and Python Data Structure.ipynb ├── pandas 10 - Best Practices in Data Analysis.ipynb ├── Pandas 00 - Intro.ipynb ├── pandas 08 - Speed up with pandas.ipynb ├── pandas 04 - Data IO.ipynb ├── Pandas 02 - Series.ipynb └── Pandas 01 - Basics of Pandas.ipynb /pandas 09 - Pandas and Python Data Structure.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# pandas 09 - Pandas and Python Data Structure\n", 8 | "\n", 9 | "by Nova@Douban\n", 10 | "\n", 11 | "The video record of this session is here: https://zoom.us/recording/share/-AyhhqiRKrw42R8xjEWHfXKDs-w2-IGS_NLh01a9q5SwIumekTziMw\n", 12 | "\n", 13 | "---\n", 14 | "\n", 15 | "## 9.1 Series and dict" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": { 22 | "ExecuteTime": { 23 | "end_time": "2019-01-25T12:05:46.484050Z", 24 | "start_time": "2019-01-25T12:05:45.138406Z" 25 | } 26 | }, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "a 1\n", 32 | "b 2\n", 33 | "c 9\n", 34 | "dtype: int64" 35 | ] 36 | }, 37 | "metadata": {}, 38 | "output_type": "display_data" 39 | }, 40 | { 41 | "data": { 42 | "text/plain": [ 43 | "{'a': 1, 'b': 2, 'c': 9}" 44 | ] 45 | }, 46 | "metadata": {}, 47 | "output_type": "display_data" 48 | } 49 | ], 50 | "source": [ 51 | "# Conversion between a pandas Series and a dict\n", 52 | "import pandas as pd\n", 53 | "import numpy as np\n", 54 | "\n", 55 | "dd = {'a': 1, 'b': 2, 'c': 9}\n", 56 | "series_dd = pd.Series(dd)\n", 57 | "display(series_dd, series_dd.to_dict())" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## 9.2 Series and array" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 2, 70 | "metadata": { 71 | "ExecuteTime": { 72 | "end_time": "2019-01-22T09:21:37.535074Z", 73 | "start_time": 
"2019-01-22T09:21:37.287231Z" 74 | } 75 | }, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "0 -2.845180\n", 81 | "1 -2.469059\n", 82 | "2 -0.156874\n", 83 | "3 -0.290911\n", 84 | "4 0.876318\n", 85 | "5 -0.104034\n", 86 | "dtype: float64" 87 | ] 88 | }, 89 | "metadata": {}, 90 | "output_type": "display_data" 91 | }, 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "a -2.845180\n", 96 | "a -2.469059\n", 97 | "a -0.156874\n", 98 | "a -0.290911\n", 99 | "a 0.876318\n", 100 | "a -0.104034\n", 101 | "dtype: float64" 102 | ] 103 | }, 104 | "metadata": {}, 105 | "output_type": "display_data" 106 | }, 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "\n", 111 | "array([-2.84518 , -2.469059, -0.156874, -0.290911, 0.876318, -0.104034])\n", 112 | "Coordinates:\n", 113 | " * index (index) object 'a' 'a' 'a' 'a' 'a' 'a'" 114 | ] 115 | }, 116 | "metadata": {}, 117 | "output_type": "display_data" 118 | } 119 | ], 120 | "source": [ 121 | "# The default index is int\n", 122 | "aray = np.random.randn(6)\n", 123 | "srs = pd.Series(aray)\n", 124 | "display(srs)\n", 125 | "\n", 126 | "# We can set repeated non-int labels to index\n", 127 | "ind = ['a'] * 6\n", 128 | "srs.index = ind\n", 129 | "display(srs, srs.to_xarray())" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## 9.3 Series and list\n", 137 | "\n", 138 | "It is `pd.Series.tolist()`, not `pd.Series.to_list()`." 
139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 3, 144 | "metadata": { 145 | "ExecuteTime": { 146 | "end_time": "2019-01-22T09:21:37.577756Z", 147 | "start_time": "2019-01-22T09:21:37.549862Z" 148 | } 149 | }, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "0 0.725138\n", 155 | "1 0.543878\n", 156 | "2 0.226283\n", 157 | "3 1.267045\n", 158 | "4 -0.495132\n", 159 | "5 -0.192349\n", 160 | "dtype: float64" 161 | ] 162 | }, 163 | "metadata": {}, 164 | "output_type": "display_data" 165 | }, 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "[0.725138, 0.543878, 0.226283, 1.267045, -0.495132, -0.192349]" 170 | ] 171 | }, 172 | "metadata": {}, 173 | "output_type": "display_data" 174 | } 175 | ], 176 | "source": [ 177 | "lst = [0.725138, 0.543878, 0.226283, 1.267045, -0.495132, -0.192349]\n", 178 | "srs = pd.Series(lst)\n", 179 | "lst = srs.tolist()\n", 180 | "\n", 181 | "display(srs, lst)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## 9.4 DataFrame and dict" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 3, 194 | "metadata": { 195 | "ExecuteTime": { 196 | "end_time": "2019-01-25T12:05:51.701915Z", 197 | "start_time": "2019-01-25T12:05:51.628849Z" 198 | } 199 | }, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/html": [ 204 | "
\n", 205 | "\n", 218 | "\n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | "
abc
01829
1896
2023
\n", 248 | "
" 249 | ], 250 | "text/plain": [ 251 | " a b c\n", 252 | "0 1 82 9\n", 253 | "1 8 9 6\n", 254 | "2 0 2 3" 255 | ] 256 | }, 257 | "metadata": {}, 258 | "output_type": "display_data" 259 | }, 260 | { 261 | "data": { 262 | "text/plain": [ 263 | "{'a': {0: 1, 1: 8, 2: 0}, 'b': {0: 82, 1: 9, 2: 2}, 'c': {0: 9, 1: 6, 2: 3}}" 264 | ] 265 | }, 266 | "metadata": {}, 267 | "output_type": "display_data" 268 | } 269 | ], 270 | "source": [ 271 | "dd = {'a': [1, 8, 0], 'b': [82, 9, 2], 'c': [9, 6, 3]}\n", 272 | "df_dd = pd.DataFrame.from_dict(dd)\n", 273 | "display(df_dd, df_dd.to_dict())" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.7.2" 301 | }, 302 | "toc": { 303 | "base_numbering": 1, 304 | "nav_menu": {}, 305 | "number_sections": false, 306 | "sideBar": true, 307 | "skip_h1_title": false, 308 | "title_cell": "Table of Contents", 309 | "title_sidebar": "Contents", 310 | "toc_cell": false, 311 | "toc_position": {}, 312 | "toc_section_display": true, 313 | "toc_window_display": false 314 | } 315 | }, 316 | "nbformat": 4, 317 | "nbformat_minor": 2 318 | } 319 | -------------------------------------------------------------------------------- /pandas 10 - Best Practices in Data Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# pandas 10 - Best Practices in Data Analysis\n", 8 | "\n", 9 | "by 
Nova@Douban\n", 10 | "\n", 11 | "The video record of this session is here: https://zoom.us/recording/share/-AyhhqiRKrw42R8xjEWHfXKDs-w2-IGS_NLh01a9q5SwIumekTziMw\n", 12 | "\n", 13 | "\n", 14 | "---\n", 15 | "\n", 16 | "This sharing focuses on some best practices I gained during the past years working as an NLP engineer and data scientist.\n", 17 | "\n", 18 | "## 10.1 Version control\n", 19 | "\n", 20 | "Version control is an important practice in software development, and we shall follow this best practice to reduce errors, such as removing code by mistake.\n", 21 | "\n", 22 | "### 10.1.1 Git\n", 23 | "\n", 24 | "Git is one of the most popular version control tools, and we can find main version control platforms supporting this protocol. We can choose from GitHub, GitLab or Bitbucket.\n", 25 | "\n", 26 | "Git has many workflows to follow; we can start with gitflow, which is simple to follow. \n", 27 | "\n", 28 | "\n", 29 | "\n", 30 | "### 10.1.2 Git LFS\n", 31 | "\n", 32 | "Git is suitable to store code, but not for big files. Therefore, another version control protocol for big files emerged -- Git LFS. We can use it to store our datasets.\n", 33 | "\n", 34 | "### 10.1.3 Version control your code, not your data\n", 35 | "\n", 36 | "Some of us may be used to storing temporary / intermediate datasets on hard disk. However, we do not recommend this way; instead, we recommend version controlling your code, and committing as frequently as possible.\n", 37 | "\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "---\n", 45 | "\n", 46 | "## 10.2 Folder structure\n", 47 | "\n", 48 | "1. If our code folder is managed by git, it can be an individual folder only for code;\n", 49 | "2. At the same level, we can have another two folders for raw data and results. Therefore, we can use git LFS to manage these two folders.\n", 50 | "3. 
We can use virtualenv to control the package environment.\n", 51 | "\n", 52 | "\n", 53 | "" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "---\n", 61 | "\n", 62 | "## 10.3 Jupyter Notebook and IPython\n", 63 | "\n", 64 | "### 10.3.1 Jupyter Notebook\n", 65 | "\n", 66 | "1. Jupyter Notebook is a convenient tool for interactive data analysis.\n", 67 | "2. We can put code, documents, and visualisations in a single Jupyter notebook.\n", 68 | "3. We can experiment / draft / benchmark in Jupyter notebook.\n", 69 | "4. We can export to markdown / python code / PDF, etc.\n", 70 | "5. We can install extensions to make Jupyter Notebook easier to use. See [this](https://towardsdatascience.com/jupyter-notebook-extensions-517fa69d2231?gi=e865fc4d7033).\n", 71 | "6. All outlines of this course are created with Jupyter Notebook.\n", 72 | "\n", 73 | "### 10.3.2 IPython\n", 74 | "\n", 75 | "1. IPython is the backend of Jupyter Notebook.\n", 76 | "2. Jupyter Notebook needs browser support. For example, if we need to fix something on a server, and we can only access the server via SSH, then IPython is the only choice.\n", 77 | "3. If you are familiar with Jupyter Notebook, then IPython is fairly easy to use." 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 10, 83 | "metadata": { 84 | "ExecuteTime": { 85 | "end_time": "2019-01-25T12:25:27.954978Z", 86 | "start_time": "2019-01-25T12:25:27.739130Z" 87 | } 88 | }, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "The autoreload extension is already loaded. To reload it, use:\n", 95 | " %reload_ext autoreload\n" 96 | ] 97 | }, 98 | { 99 | "data": { 100 | "text/html": [ 101 | "
\n", 102 | "\n", 115 | "\n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | "
OpenClose
02498.7700202485.739990
12498.9399412506.850098
22476.9599612510.030029
32491.9199222447.889893
42474.3300782531.939941
\n", 151 | "
" 152 | ], 153 | "text/plain": [ 154 | " Open Close\n", 155 | "0 2498.770020 2485.739990\n", 156 | "1 2498.939941 2506.850098\n", 157 | "2 2476.959961 2510.030029\n", 158 | "3 2491.919922 2447.889893\n", 159 | "4 2474.330078 2531.939941" 160 | ] 161 | }, 162 | "metadata": {}, 163 | "output_type": "display_data" 164 | } 165 | ], 166 | "source": [ 167 | "%load_ext autoreload\n", 168 | "%autoreload 2\n", 169 | "\n", 170 | "from test_script import fast_read\n", 171 | "\n", 172 | "data = fast_read('../data/gspc.csv', ['Open', 'Close'])\n", 173 | "display(data)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "---\n", 181 | "\n", 182 | "## 10.4 Data file format\n", 183 | "\n", 184 | "Please refer to pandas 04 - pandas IO.\n", 185 | "\n", 186 | "1. For data serializaion, we can choose from JSON / Parquet / Arrow / HDF5.\n", 187 | "2. When we use JSON, try using line-based JSON (JSONL) in case for possible stream processing.\n", 188 | "3. NEVER use Python pickle!" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "---\n", 196 | "\n", 197 | "## 10.5 Script structure\n", 198 | "\n", 199 | "1. Collect all frequently-used snippets to a single script, and we can import these snippets to other scripts.\n", 200 | "2. Build core functions with those snippets, and put them into another script.\n", 201 | "3. Build a script of wrappers to call core functions." 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "---\n", 209 | "\n", 210 | "## 10.6 How to write functions?\n", 211 | "\n", 212 | "1. Make your functions readable and pure;\n", 213 | "2. Make the function name simple to remember;\n", 214 | "3. Always add docstring and comments;\n", 215 | "4. Always abstract your functions to be used repeatedly;\n", 216 | "5. Always refactor your code;\n", 217 | "6. If a problem is to difficult to solve, try writing pseudo code first. 
" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "## 10.7 Exercises\n", 225 | "\n", 226 | "1. Read [Cookiecutter Data Science — Organize your Projects — Atom and Jupyter](https://medium.com/@rrfd/cookiecutter-data-science-organize-your-projects-atom-and-jupyter-2be7862f487e)\n", 227 | "2. Read [Gitflow introduction](https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow) by Bitbucket" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [] 234 | } 235 | ], 236 | "metadata": { 237 | "kernelspec": { 238 | "display_name": "Python 3", 239 | "language": "python", 240 | "name": "python3" 241 | }, 242 | "language_info": { 243 | "codemirror_mode": { 244 | "name": "ipython", 245 | "version": 3 246 | }, 247 | "file_extension": ".py", 248 | "mimetype": "text/x-python", 249 | "name": "python", 250 | "nbconvert_exporter": "python", 251 | "pygments_lexer": "ipython3", 252 | "version": "3.7.2" 253 | }, 254 | "toc": { 255 | "base_numbering": 1, 256 | "nav_menu": {}, 257 | "number_sections": false, 258 | "sideBar": true, 259 | "skip_h1_title": false, 260 | "title_cell": "Table of Contents", 261 | "title_sidebar": "Contents", 262 | "toc_cell": false, 263 | "toc_position": {}, 264 | "toc_section_display": true, 265 | "toc_window_display": false 266 | } 267 | }, 268 | "nbformat": 4, 269 | "nbformat_minor": 2 270 | } 271 | -------------------------------------------------------------------------------- /Pandas 00 - Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas 00 - Intro\n", 8 | "\n", 9 | "by Nova@Douban\n", 10 | "\n", 11 | "The video record of this session is here: https://zoom.us/recording/share/rDS-o_BWuPyBYIbswQ6bKJ5QGeFzY50BVFnBnw4t7pOwIumekTziMw?startTime=1545565951000\n", 12 | "\n", 13 | "---\n", 14 | "\n", 15 | 
"## 0.1 Course overview\n", 16 | "\n", 17 | "\n", 18 | "\n", 19 | "----" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## 0.2 How to learn pandas?\n", 27 | "\n", 28 | "1. __Code, Code, and Code!__\n", 29 | "2. Read [pandas documentation](http://pandas.pydata.org/pandas-docs/stable/)\n", 30 | "3. Check [StackOverflow](http://stackoverflow.com)\n", 31 | "4. Check reference books\n", 32 | " 1. _Python for Data Analysis_\n", 33 | " 2. _Learning Pandas - Python Data Discovery and Analysis Made Easy_\n", 34 | "5. Check blogs\n", 35 | " 1. [pandas's Author Wes McKinney](http://wesmckinney.com/archives.html)\n", 36 | " 2. [Dataquest](https://www.dataquest.io/blog/)\n", 37 | " 3. [Introduction to Pandas by Ritchie Ng](https://www.ritchieng.com/tag_pandas/)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "---" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## 0.3 A Brief Overview of pandas\n", 52 | " \n", 53 | "### 0.3.1 When to use pandas?\n", 54 | "\n", 55 | "1. If the dataset can fit in your local machine / single server, use pandas;\n", 56 | "2. If you want to speed up Python computing, use pandas;\n", 57 | "3. If the computing logic is too complicated to simple SQL queries, use pandas;\n", 58 | "4. If you want to convert data file format, use pandas;\n", 59 | "\n", 60 | "### 0.3.2 How to use pandas?\n", 61 | "\n", 62 | "1. If you use pandas, use it in pandas way;\n", 63 | "2. If you use pandas, use it as a framework.\n", 64 | "3. If you use pandas, track the code, not the data.\n", 65 | "\n", 66 | "\n", 67 | "### 0.3.3 Some basic principles of pandas\n", 68 | "\n", 69 | "1. There are multiple ways to finish a task in pandas.\n", 70 | "\n", 71 | "2. If there are multiple ways to write in pandas, we shall choose the most suitable way.\n", 72 | "\n", 73 | "3. 
If we use pandas, take it as a framework, not just a tool.\n", 74 | "\n", 75 | "---" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "## 0.4 Examples\n", 83 | "\n", 84 | "### 0.4.1 Multiple ways to drop a column" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 1, 90 | "metadata": { 91 | "ExecuteTime": { 92 | "end_time": "2019-01-26T08:32:14.654893Z", 93 | "start_time": "2019-01-26T08:32:13.755566Z" 94 | } 95 | }, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/html": [ 100 | "
\n", 101 | "\n", 114 | "\n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | "
DateOpenHighLowCloseAdj CloseVolume
02018-11-236919.5200206987.8901376919.1601566938.9799806938.979980958950000
12018-11-267026.5000007083.9301767003.1201177081.8500987081.8500982011180000
22018-11-277041.2299807105.1401377014.3598637082.7001957082.7001952067360000
32018-11-287135.0800787292.7099617090.9799807291.5898447291.5898442390260000
42018-11-297267.3701177319.9599617217.6899417273.0800787273.0800781983460000
\n", 180 | "
" 181 | ], 182 | "text/plain": [ 183 | " Date Open High Low Close \\\n", 184 | "0 2018-11-23 6919.520020 6987.890137 6919.160156 6938.979980 \n", 185 | "1 2018-11-26 7026.500000 7083.930176 7003.120117 7081.850098 \n", 186 | "2 2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 \n", 187 | "3 2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 \n", 188 | "4 2018-11-29 7267.370117 7319.959961 7217.689941 7273.080078 \n", 189 | "\n", 190 | " Adj Close Volume \n", 191 | "0 6938.979980 958950000 \n", 192 | "1 7081.850098 2011180000 \n", 193 | "2 7082.700195 2067360000 \n", 194 | "3 7291.589844 2390260000 \n", 195 | "4 7273.080078 1983460000 " 196 | ] 197 | }, 198 | "execution_count": 1, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "# Download Nasdaq dataset: https://finance.yahoo.com/quote/%5EIXIC/history?p=%5EIXIC\n", 205 | "\n", 206 | "import pandas as pd\n", 207 | "\n", 208 | "in_file = '../data/nasdaq.csv'\n", 209 | "df = pd.read_csv(in_file, engine='c')\n", 210 | "df.head()\n", 211 | "# df.describe()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 2, 217 | "metadata": { 218 | "ExecuteTime": { 219 | "end_time": "2019-01-26T08:32:14.667953Z", 220 | "start_time": "2019-01-26T08:32:14.658978Z" 221 | } 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "def drop_col_1(df, col):\n", 226 | " '''\n", 227 | " using drop\n", 228 | " '''\n", 229 | " df1 = df.copy()\n", 230 | " df1.drop(col, axis=1, inplace=True)\n", 231 | " return df1\n", 232 | "\n", 233 | "def drop_col_2(df, col):\n", 234 | " '''\n", 235 | " using del\n", 236 | " '''\n", 237 | " df2 = df.copy()\n", 238 | " del(df2[col])\n", 239 | " return df2\n", 240 | "\n", 241 | "def drop_col_3(df, col):\n", 242 | " '''\n", 243 | " using boolean selection\n", 244 | " '''\n", 245 | " df3 = df.copy()\n", 246 | " \n", 247 | " cols = list(df3.columns)\n", 248 | " cols.remove(col)\n", 249 | " \n", 250 | " df3 = df3[cols]\n", 
251 | " return df3" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "## 0.4.2 Choose the most suitable way from multiple ways\n", 259 | "\n", 260 | "We can use different profiling tools to benchmark the performence of different ways:\n", 261 | "\n", 262 | "1. %timeit for speed\n", 263 | "2. %memit for memory consumption" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 3, 269 | "metadata": { 270 | "ExecuteTime": { 271 | "end_time": "2019-01-26T08:32:39.807971Z", 272 | "start_time": "2019-01-26T08:32:14.672425Z" 273 | } 274 | }, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "1.08 ms ± 97.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n", 281 | "510 µs ± 22.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n", 282 | "1 ms ± 124 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "%timeit r1 = drop_col_1(df, ['Open'])\n", 288 | "%timeit r2 = drop_col_2(df, 'Open')\n", 289 | "%timeit r3 = drop_col_3(df, 'Open')" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 4, 295 | "metadata": { 296 | "ExecuteTime": { 297 | "end_time": "2019-01-26T08:32:40.455867Z", 298 | "start_time": "2019-01-26T08:32:39.812272Z" 299 | } 300 | }, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "peak memory: 76.87 MiB, increment: 0.45 MiB\n", 307 | "peak memory: 76.88 MiB, increment: 0.00 MiB\n", 308 | "peak memory: 76.88 MiB, increment: 0.00 MiB\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "%load_ext memory_profiler\n", 314 | "%memit r1 = drop_col_1(df, ['Open'])\n", 315 | "%memit r2 = drop_col_2(df, 'Open')\n", 316 | "%memit r2 = drop_col_3(df, 'Open')" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "__Conclusion__\n", 324 | "\n", 325 
| "1. `drop_col_2` used the least time among the three methods;\n", 326 | "2. Three methods consumed same memory;\n", 327 | "3. `drop_col_1` is the most pandas way, and `drop_col_3` is the least readable\n", 328 | "\n", 329 | "Therefore, we choose `drop_col_2` to finish this task" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "## 0.4.3 Use pandas a framework\n", 337 | "\n", 338 | "Suppose we have a task as following:\n", 339 | "\n", 340 | "1. read data from a CSV file;\n", 341 | "2. calculate some results according to requirements;\n", 342 | "3. output results to a Json and an Excel file.\n", 343 | "\n", 344 | "These jobs can be easily handled by pandas." 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 5, 350 | "metadata": { 351 | "ExecuteTime": { 352 | "end_time": "2019-01-26T08:32:40.475927Z", 353 | "start_time": "2019-01-26T08:32:40.462048Z" 354 | } 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "def process_nasdaq(in_csv, out_json, out_excel):\n", 359 | " # read from CSV\n", 360 | " df = pd.read_csv(in_csv, engine='c')\n", 361 | "\n", 362 | " # Clean data\n", 363 | " df.rename(columns={'Adj Close': 'Adj_close'}, inplace=True)\n", 364 | "\n", 365 | " # Calcualtion\n", 366 | " df['Max_diff'] = df['High'] - df['Low']\n", 367 | " df['Open_close_diff'] = df['Close'] - df['Open']\n", 368 | "\n", 369 | " # Output to Json\n", 370 | " df.to_json(out_json, lines=True, orient='records')\n", 371 | " \n", 372 | " # Output to Excel\n", 373 | " writer = pd.ExcelWriter(out_excel)\n", 374 | " df.to_excel(writer,'Sheet1')\n", 375 | " writer.save()" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 6, 381 | "metadata": { 382 | "ExecuteTime": { 383 | "end_time": "2019-01-26T08:32:40.815973Z", 384 | "start_time": "2019-01-26T08:32:40.479655Z" 385 | } 386 | }, 387 | "outputs": [], 388 | "source": [ 389 | "in_csv = '../data/nasdaq.csv'\n", 390 | "out_json = 
'../data/nasdaq.json'\n", 391 | "out_excel = '../data/nasdaq.xlsx'\n", 392 | "process_nasdaq(in_csv, out_json, out_excel)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "---\n", 400 | "\n", 401 | "To the rest sessions (outlines and video records), please scan the QR code below to pay.\n", 402 | "\n", 403 | "1. The price is 799 RMB.\n", 404 | "2. Please leave your email address in the __payment comment__, so I will send you the links of the rest sessions.\n", 405 | "\n", 406 | "\n", 407 | "" 408 | ] 409 | } 410 | ], 411 | "metadata": { 412 | "kernelspec": { 413 | "display_name": "Python 3", 414 | "language": "python", 415 | "name": "python3" 416 | }, 417 | "language_info": { 418 | "codemirror_mode": { 419 | "name": "ipython", 420 | "version": 3 421 | }, 422 | "file_extension": ".py", 423 | "mimetype": "text/x-python", 424 | "name": "python", 425 | "nbconvert_exporter": "python", 426 | "pygments_lexer": "ipython3", 427 | "version": "3.7.2" 428 | }, 429 | "toc": { 430 | "base_numbering": 1, 431 | "nav_menu": {}, 432 | "number_sections": false, 433 | "sideBar": true, 434 | "skip_h1_title": false, 435 | "title_cell": "Table of Contents", 436 | "title_sidebar": "Contents", 437 | "toc_cell": false, 438 | "toc_position": {}, 439 | "toc_section_display": true, 440 | "toc_window_display": false 441 | } 442 | }, 443 | "nbformat": 4, 444 | "nbformat_minor": 2 445 | } 446 | -------------------------------------------------------------------------------- /pandas 08 - Speed up with pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# pandas 08 - Speed up with pandas\n", 8 | "\n", 9 | "by Nova@Douban\n", 10 | "\n", 11 | "The video record of this session is here: https://zoom.us/recording/share/1ZCrNvlQG3rchlnczSpwCdq89ZZR12SZ75a2QvcI19WwIumekTziMw\n", 12 | "\n", 13 | "---\n", 14 | "\n", 
15 | "In this tutotial, we will share how to process big datasets with pandas. Here, the big datasets mean those datasets are too big for a single machine.\n", 16 | "\n", 17 | "## 8.1 Stream processing\n", 18 | "\n", 19 | "pandas is an efficient tool to process data, but when the dataset cannot be fit in memory, using pandas could be a little bit tricky. If the dataset is big enough to take all of the memorys, the pandas task will get stuck there.\n", 20 | "\n", 21 | "One way to deal this problem is to apply stream processing to pandas with `chunksize` parameter in `pd.read_csv()`, `pd.read_table()`, `pd.read_json(lines=True)` \n", 22 | "\n", 23 | "---\n", 24 | "\n", 25 | "### 8.1.1 An example of stream processing" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": { 32 | "ExecuteTime": { 33 | "end_time": "2019-01-20T12:51:33.558058Z", 34 | "start_time": "2019-01-20T12:51:33.546775Z" 35 | } 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "def preprocess_patent(in_f, out_f):\n", 40 | " '''\n", 41 | " normal read and write\n", 42 | " '''\n", 43 | " df = pd.read_table(in_f, sep='##')\n", 44 | " df.columns = ['id0', 'id1', 'ref']\n", 45 | " result = df[(df['ref'].str.contains('^[a-zA-Z]+')) & (df.ref['ref'].len() > 80)]\n", 46 | " result.to_csv(out_f, index=False, header=False, mode='w')\n", 47 | "\n", 48 | "def preprocess_patent(in_f, out_f, size):\n", 49 | " '''\n", 50 | " read a chunk,\n", 51 | " process a chunk,\n", 52 | " write a chunk,\n", 53 | " then repeat\n", 54 | " '''\n", 55 | " reader = pd.read_table(in_f, sep='##', chunksize=size)\n", 56 | " for chunk in reader:\n", 57 | " chunk.columns = ['id0', 'id1', 'ref']\n", 58 | " result = chunk[(chunk['ref'].str.contains('^[a-zA-Z]+')) & (chunk['ref'].str.len() > 80)]\n", 59 | " result.to_csv(out_f, index=False, header=False, mode='a')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "Some aspects are worth paying attetion 
to:\n", 67 | "\n", 68 | "1. The `chunksize` should not be too small. If it is too small, the IO cost will be high to overcome the benefit. For example, if we have a file with one million lines, we did a little experiment:\n", 69 | "\n", 70 | "| Chunksize | Memory (MiB) | Time (s) |\n", 71 | "|-----------|--------------|----------|\n", 72 | "| 100 | 142.13 | 36.9 |\n", 73 | "| 1,000 | 141.38 | 13.8 |\n", 74 | "| 10,000 | 141.38 | 12.1 |\n", 75 | "| 100,000 | 209.88 | 12.7 |\n", 76 | "| 200,000 | 312.15 | 12.5 |\n", 77 | "\n", 78 | "In our main task, we set `chunksize` as 200,000, and it used 211.22MiB memory to process the 10G+ dataset with 9min 54s.\n", 79 | "\n", 80 | "2. the `pandas.DataFrame.to_csv()` mode should be set as 'a' to append chunk results to a single file; otherwise, only the last chunk will be saved.\n", 81 | "\n", 82 | "### 8.1.2 Be Careful with the Index\n", 83 | "\n", 84 | "Once, I had a strange with above stream processing logic:" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 2, 90 | "metadata": { 91 | "ExecuteTime": { 92 | "end_time": "2019-01-20T12:51:35.512216Z", 93 | "start_time": "2019-01-20T12:51:33.562070Z" 94 | } 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stderr", 99 | "output_type": "stream", 100 | "text": [ 101 | "WARNING: Not updating worker name since `setproctitle` is not installed. 
Install this with `pip install setproctitle` (or ray[debug]) to enable monitoring of worker processes.\n", 102 | "Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-01-20_20-51-34_41781/logs.\n", 103 | "Waiting for redis server at 127.0.0.1:59136 to respond...\n", 104 | "Waiting for redis server at 127.0.0.1:53672 to respond...\n", 105 | "Starting the Plasma object store with 6.871947672999999 GB memory using /tmp.\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "import pandas as pd\n", 111 | "import modin.pandas as mp\n", 112 | "\n", 113 | "def stream_process(IN_FILE, OUT_FILE):\n", 114 | " reader = pd.read_csv(IN_FILE, chunksize = 1000, engine='c')\n", 115 | " for chunk in reader:\n", 116 | " result = []\n", 117 | " for line in chunk.itertuples():\n", 118 | " temp = complicated_process(chunk) # this involves a very complicated processing, so here is just a simplified version\n", 119 | " result.append(temp)\n", 120 | " chunk['new_series'] = pd.Series(result)\n", 121 | " chunk.to_csv(OUT_FILE, index=False, mode='a')" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "---\n", 129 | "\n", 130 | "I can confirm that in each loop, `result` is not empty. But only in the first iteration of the loop does the line `chunk['new_series'] = pd.Series(result)` produce values, and the rest are empty. Therefore, only the first chunk of the output contains new_series, and the rest are empty.\n", 131 | "\n", 132 | "When we tracked the index of each chunk, we found that they are not independent. We assumed that each chunk would start the index from 0, but in reality, it is NOT. The index of each chunk is a subset of the whole CSV in this situation, so their index derives from the CSV. This is what caused the problem. 
In our initial logic, the `pandas.to_csv` writes only the result of the first chunk, instead of the last chunk.\n", 133 | "\n", 134 | "Therefore, a better solution would be rebuild index for each chunk, and concatenating it with result." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 3, 140 | "metadata": { 141 | "ExecuteTime": { 142 | "end_time": "2019-01-20T12:51:35.528613Z", 143 | "start_time": "2019-01-20T12:51:35.518159Z" 144 | } 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "def stream_process(IN_FILE, OUT_FILE):\n", 149 | " reader = pd.read_csv(IN_FILE, chunksize = 1000, engine='c')\n", 150 | " for chunk in reader:\n", 151 | " result = []\n", 152 | " for line in chunk.tolist():\n", 153 | " temp = complicated_process(chunk) # this involves a very complicated processing, so here is just a simplified version\n", 154 | " result.append(temp)\n", 155 | " new_chunk = chunk.reindex() # key solver\n", 156 | " new_chunk = new_chunk.assign(new_series=result)\n", 157 | " new_chunk.to_csv(OUT_TILE, index=False, mode='a')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## 8.2 Reduce memory usage with pandas\n", 165 | "\n", 166 | "### 8.2.1 Reduce with categorical type\n", 167 | "\n", 168 | "Often, some columns of data are catagorical, but they are saved as non-catagorical types. 
Therefore, we can convert them to categorical type with `pd.Series.astype('category')`" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 4, 174 | "metadata": { 175 | "ExecuteTime": { 176 | "end_time": "2019-01-20T12:51:36.014380Z", 177 | "start_time": "2019-01-20T12:51:35.532740Z" 178 | } 179 | }, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "\n", 186 | "RangeIndex: 321792 entries, 0 to 321791\n", 187 | "Data columns (total 3 columns):\n", 188 | "MSISDN_SEG 321792 non-null int64\n", 189 | "AREA_CODE 321792 non-null int64\n", 190 | "ASP 321792 non-null int64\n", 191 | "dtypes: int64(3)\n", 192 | "memory usage: 7.4 MB\n" 193 | ] 194 | }, 195 | { 196 | "data": { 197 | "text/html": [ 198 | "
\n", 199 | "\n", 212 | "\n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | "
MSISDN_SEGAREA_CODEASP
01451091102
11451092102
21451093102
31451094102
41451095102
\n", 254 | "
" 255 | ], 256 | "text/plain": [ 257 | " MSISDN_SEG AREA_CODE ASP\n", 258 | "0 1451091 10 2\n", 259 | "1 1451092 10 2\n", 260 | "2 1451093 10 2\n", 261 | "3 1451094 10 2\n", 262 | "4 1451095 10 2" 263 | ] 264 | }, 265 | "metadata": {}, 266 | "output_type": "display_data" 267 | }, 268 | { 269 | "data": { 270 | "text/plain": [ 271 | "None" 272 | ] 273 | }, 274 | "metadata": {}, 275 | "output_type": "display_data" 276 | }, 277 | { 278 | "data": { 279 | "text/plain": [ 280 | "'2.46 MB'" 281 | ] 282 | }, 283 | "metadata": {}, 284 | "output_type": "display_data" 285 | }, 286 | { 287 | "data": { 288 | "text/plain": [ 289 | "'0.63 MB'" 290 | ] 291 | }, 292 | "metadata": {}, 293 | "output_type": "display_data" 294 | } 295 | ], 296 | "source": [ 297 | "def mem_usage(pandas_obj):\n", 298 | " '''\n", 299 | " We will use this to check memory usage\n", 300 | " '''\n", 301 | " if isinstance(pandas_obj,pd.DataFrame):\n", 302 | " usage_b = pandas_obj.memory_usage(deep=True).sum()\n", 303 | " else: # we assume if not a df it's a series\n", 304 | " usage_b = pandas_obj.memory_usage(deep=True)\n", 305 | " usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes\n", 306 | " return \"{:03.2f} MB\".format(usage_mb)\n", 307 | "\n", 308 | "df = pd.read_csv('../data/mobile_phone.csv')\n", 309 | "display(df.head(), df.info())\n", 310 | "display(mem_usage(df['AREA_CODE']), mem_usage(df['AREA_CODE'].astype('category')))" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "### 8.2.2 Choose the right subtypes\n", 318 | "\n", 319 | "pandas often chooses a safer dtype to store data; however, this may additional memory usage. 
For example, we can downcast numerical to `unsigned int` to save space for positive-only integers.\n", 320 | "\n", 321 | "\n" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 5, 327 | "metadata": { 328 | "ExecuteTime": { 329 | "end_time": "2019-01-20T12:51:36.383063Z", 330 | "start_time": "2019-01-20T12:51:36.018784Z" 331 | } 332 | }, 333 | "outputs": [ 334 | { 335 | "name": "stdout", 336 | "output_type": "stream", 337 | "text": [ 338 | "\n", 339 | "RangeIndex: 321792 entries, 0 to 321791\n", 340 | "Data columns (total 3 columns):\n", 341 | "MSISDN_SEG 321792 non-null uint32\n", 342 | "AREA_CODE 321792 non-null uint16\n", 343 | "ASP 321792 non-null uint8\n", 344 | "dtypes: uint16(1), uint32(1), uint8(1)\n", 345 | "memory usage: 2.1 MB\n" 346 | ] 347 | }, 348 | { 349 | "data": { 350 | "text/html": [ 351 | "
\n", 352 | "\n", 365 | "\n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | "
MSISDN_SEGAREA_CODEASP
01451091102
11451092102
21451093102
31451094102
41451095102
\n", 407 | "
" 408 | ], 409 | "text/plain": [ 410 | " MSISDN_SEG AREA_CODE ASP\n", 411 | "0 1451091 10 2\n", 412 | "1 1451092 10 2\n", 413 | "2 1451093 10 2\n", 414 | "3 1451094 10 2\n", 415 | "4 1451095 10 2" 416 | ] 417 | }, 418 | "metadata": {}, 419 | "output_type": "display_data" 420 | }, 421 | { 422 | "data": { 423 | "text/plain": [ 424 | "None" 425 | ] 426 | }, 427 | "metadata": {}, 428 | "output_type": "display_data" 429 | }, 430 | { 431 | "data": { 432 | "text/plain": [ 433 | "'7.37 MB'" 434 | ] 435 | }, 436 | "metadata": {}, 437 | "output_type": "display_data" 438 | }, 439 | { 440 | "data": { 441 | "text/plain": [ 442 | "'2.15 MB'" 443 | ] 444 | }, 445 | "metadata": {}, 446 | "output_type": "display_data" 447 | } 448 | ], 449 | "source": [ 450 | "df_int = df.select_dtypes(include=['int'])\n", 451 | "converted_int = df_int.apply(pd.to_numeric, downcast='unsigned')\n", 452 | "\n", 453 | "display(df_int.head(), converted_int.info())\n", 454 | "display(mem_usage(df_int), mem_usage(converted_int))" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "### 8.2.3 Read data with clarifying dtype of each column\n", 462 | "\n", 463 | "When we use pandas to read date, we can set `dtype` for each column, so pandas doesn't have to guess, and also saves space." 
464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 6, 469 | "metadata": { 470 | "ExecuteTime": { 471 | "end_time": "2019-01-20T12:51:36.890337Z", 472 | "start_time": "2019-01-20T12:51:36.390905Z" 473 | } 474 | }, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "CPU times: user 123 ms, sys: 51.9 ms, total: 175 ms\n", 481 | "Wall time: 299 ms\n", 482 | "CPU times: user 111 ms, sys: 21.2 ms, total: 132 ms\n", 483 | "Wall time: 133 ms\n", 484 | "\n", 485 | "RangeIndex: 321792 entries, 0 to 321791\n", 486 | "Data columns (total 3 columns):\n", 487 | "MSISDN_SEG 321792 non-null int64\n", 488 | "AREA_CODE 321792 non-null int64\n", 489 | "ASP 321792 non-null int64\n", 490 | "dtypes: int64(3)\n", 491 | "memory usage: 7.4 MB\n", 492 | "\n", 493 | "RangeIndex: 321792 entries, 0 to 321791\n", 494 | "Data columns (total 3 columns):\n", 495 | "MSISDN_SEG 321792 non-null uint32\n", 496 | "AREA_CODE 321792 non-null category\n", 497 | "ASP 321792 non-null category\n", 498 | "dtypes: category(2), uint32(1)\n", 499 | "memory usage: 2.2 MB\n" 500 | ] 501 | }, 502 | { 503 | "data": { 504 | "text/plain": [ 505 | "None" 506 | ] 507 | }, 508 | "metadata": {}, 509 | "output_type": "display_data" 510 | }, 511 | { 512 | "data": { 513 | "text/plain": [ 514 | "None" 515 | ] 516 | }, 517 | "metadata": {}, 518 | "output_type": "display_data" 519 | } 520 | ], 521 | "source": [ 522 | "%time df1 = pd.read_csv('../data/mobile_phone.csv', engine='c')\n", 523 | "\n", 524 | "column_type = {'MSISDN_SEG': 'uint32', 'AREA_CODE': 'category', 'ASP': 'category'}\n", 525 | "%time df2 = pd.read_csv('../data/mobile_phone.csv', dtype=column_type, engine='c')\n", 526 | "\n", 527 | "display(df1.info(), df2.info())" 528 | ] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "metadata": {}, 533 | "source": [ 534 | "## 8.3 Cython\n", 535 | "\n", 536 | "Cython is a C implementation of Python, and pandas can work nicely with Cython. 
If you don't know C, you can just import Cython to gain speed; moreover, if you know C, you can modify your code to Cython syntax to gain extra speed." 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 7, 542 | "metadata": { 543 | "ExecuteTime": { 544 | "end_time": "2019-01-20T12:51:36.914987Z", 545 | "start_time": "2019-01-20T12:51:36.893911Z" 546 | } 547 | }, 548 | "outputs": [ 549 | { 550 | "name": "stdout", 551 | "output_type": "stream", 552 | "text": [ 553 | "CPU times: user 5.89 ms, sys: 2.62 ms, total: 8.51 ms\n", 554 | "Wall time: 8.62 ms\n" 555 | ] 556 | } 557 | ], 558 | "source": [ 559 | "def demo_calc(series):\n", 560 | " return series * 67 - 89 / 45\n", 561 | "\n", 562 | "%time series1 = demo_calc(df1['MSISDN_SEG'])" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 8, 568 | "metadata": { 569 | "ExecuteTime": { 570 | "end_time": "2019-01-20T12:51:37.879341Z", 571 | "start_time": "2019-01-20T12:51:36.923659Z" 572 | } 573 | }, 574 | "outputs": [ 575 | { 576 | "name": "stdout", 577 | "output_type": "stream", 578 | "text": [ 579 | "CPU times: user 2.09 ms, sys: 853 µs, total: 2.94 ms\n", 580 | "Wall time: 2.95 ms\n" 581 | ] 582 | } 583 | ], 584 | "source": [ 585 | "%load_ext cython\n", 586 | "\n", 587 | "%time series2 = demo_calc(df1['MSISDN_SEG'])" 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": {}, 593 | "source": [ 594 | "## 8.4 Modin\n", 595 | "\n", 596 | "Modin is a DataFrame library that allows you to speed up your pandas workflows by changing one line of code. 
" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 9, 602 | "metadata": { 603 | "ExecuteTime": { 604 | "end_time": "2019-01-20T12:51:38.090576Z", 605 | "start_time": "2019-01-20T12:51:37.882419Z" 606 | } 607 | }, 608 | "outputs": [ 609 | { 610 | "name": "stdout", 611 | "output_type": "stream", 612 | "text": [ 613 | "CPU times: user 89.7 ms, sys: 29.9 ms, total: 120 ms\n", 614 | "Wall time: 118 ms\n", 615 | "CPU times: user 8.75 ms, sys: 2.44 ms, total: 11.2 ms\n", 616 | "Wall time: 81.7 ms\n" 617 | ] 618 | } 619 | ], 620 | "source": [ 621 | "%time df1 = pd.read_csv('../data/mobile_phone.csv')\n", 622 | "%time df2 = mp.read_csv('../data/mobile_phone.csv')" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 10, 628 | "metadata": { 629 | "ExecuteTime": { 630 | "end_time": "2019-01-20T12:51:38.249991Z", 631 | "start_time": "2019-01-20T12:51:38.094025Z" 632 | } 633 | }, 634 | "outputs": [ 635 | { 636 | "name": "stdout", 637 | "output_type": "stream", 638 | "text": [ 639 | "CPU times: user 7.7 ms, sys: 2.33 ms, total: 10 ms\n", 640 | "Wall time: 6.94 ms\n", 641 | "CPU times: user 113 ms, sys: 11.6 ms, total: 124 ms\n", 642 | "Wall time: 136 ms\n" 643 | ] 644 | } 645 | ], 646 | "source": [ 647 | "def demo_calc(series):\n", 648 | " return series * 67 - 89 / 45\n", 649 | "\n", 650 | "def demo_calc2(series):\n", 651 | " return mp.Series(series) * 67 - 89 / 45\n", 652 | "\n", 653 | "%time series1 = demo_calc(df1['MSISDN_SEG'])\n", 654 | "%time series2 = demo_calc2(df2['MSISDN_SEG'])" 655 | ] 656 | }, 657 | { 658 | "cell_type": "markdown", 659 | "metadata": {}, 660 | "source": [ 661 | "---\n", 662 | "\n", 663 | "## 8.5 Exercises\n", 664 | "\n", 665 | "1. Read [Enhancing Performance](https://pandas.pydata.org/pandas-docs/stable/enhancingperf.html) by pandas\n", 666 | "2. Read this post [Tutorial: Using pandas with Large Data Sets](https://www.dataquest.io/blog/pandas-big-data/)\n", 667 | "3. 
We showed using `chunksize` with `pd.read_csv`; can you try this with `pd.read_json`?
supports reading and writing these commonly-used file formats:
If we want to keep header and index, we can set `header` and `index` as `True`, and vice versa." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 1, 79 | "metadata": { 80 | "ExecuteTime": { 81 | "end_time": "2019-01-06T11:08:31.180777Z", 82 | "start_time": "2019-01-06T11:08:30.639715Z" 83 | } 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "import pandas as pd\n", 88 | "%load_ext memory_profiler\n", 89 | "in_csv = '../data/first_count.csv'" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 2, 95 | "metadata": { 96 | "ExecuteTime": { 97 | "end_time": "2019-01-06T11:08:31.598577Z", 98 | "start_time": "2019-01-06T11:08:31.184404Z" 99 | } 100 | }, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "peak memory: 108.51 MiB, increment: 34.71 MiB\n", 107 | "CPU times: user 210 ms, sys: 75 ms, total: 285 ms\n", 108 | "Wall time: 407 ms\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "%time %memit first_name = pd.read_csv(in_csv, engine='c')" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 3, 119 | "metadata": { 120 | "ExecuteTime": { 121 | "end_time": "2019-01-06T11:08:33.201264Z", 122 | "start_time": "2019-01-06T11:08:31.604767Z" 123 | } 124 | }, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "peak memory: 173.65 MiB, increment: 67.99 MiB\n", 131 | "CPU times: user 1.35 s, sys: 105 ms, total: 1.45 s\n", 132 | "Wall time: 1.59 s\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "%time %memit first_name = pd.read_csv(in_csv, engine='python')" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 4, 143 | "metadata": { 144 | "ExecuteTime": { 145 | "end_time": "2019-01-06T11:08:33.236176Z", 146 | "start_time": "2019-01-06T11:08:33.204439Z" 147 | } 148 | }, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "\n", 
155 | "RangeIndex: 321792 entries, 0 to 321791\n", 156 | "Data columns (total 3 columns):\n", 157 | "MSISDN_SEG 321792 non-null int64\n", 158 | "AREA_CODE 321792 non-null int64\n", 159 | "ASP 321792 non-null int64\n", 160 | "dtypes: int64(3)\n", 161 | "memory usage: 7.4 MB\n" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "first_name.info()" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 5, 172 | "metadata": { 173 | "ExecuteTime": { 174 | "end_time": "2019-01-06T11:08:34.941681Z", 175 | "start_time": "2019-01-06T11:08:33.239012Z" 176 | } 177 | }, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "peak memory: 125.99 MiB, increment: 14.78 MiB\n", 184 | "CPU times: user 1.48 s, sys: 59.4 ms, total: 1.54 s\n", 185 | "Wall time: 1.7 s\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "out_csv = '../data/first_count.csv'\n", 191 | "%time %memit first_name.to_csv(out_csv, header=True, index=False)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "---\n", 199 | "\n", 200 | "## 4.2 JSON\n", 201 | "\n", 202 | "JSON has gain more popularity recently. It has more controls on data, but it is not very human-friendly. JSON has different orients: `split`, `records`, `index`, `columns` or `values`. \n", 203 | "\n", 204 | "_Screenshot of JSON columns file_\n", 205 | "\n", 206 | "\n", 207 | "\n", 208 | "_Screenshot of JSON index file_\n", 209 | "\n", 210 | "\n", 211 | "\n", 212 | "_Screenshot of JSON split file_\n", 213 | "\n", 214 | "\n", 215 | "\n", 216 | "_Screenshot of JSON values file\n", 217 | "\n", 218 | "\n", 219 | "\n", 220 | "_Screenshot of JSON lines file\n", 221 | "\n", 222 | "" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "### 4.2.1 Read JSON to DataFrame\n", 230 | "\n", 231 | "Because it has a number of orients, it is quite easy to get confused. 
Therefore, when we use Pandas to read a JSON file, we have to specify the orient. \n", 232 | "\n", 233 | "__Moreover, it the file is line-based, we can set `lines` as `True`.__\n", 234 | "\n", 235 | "`pd.read_json(in_file, orient='records', lines=False)`\n", 236 | "\n", 237 | "### 4.2.2 Write DataFrame to JSON\n", 238 | "\n", 239 | "Always save Json as `lines`\n", 240 | "\n", 241 | "`pd.DataFrame.to_json(out_file, orient='records', lines=False)`" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 6, 247 | "metadata": { 248 | "ExecuteTime": { 249 | "end_time": "2019-01-06T11:08:35.173713Z", 250 | "start_time": "2019-01-06T11:08:34.945172Z" 251 | } 252 | }, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "peak memory: 123.61 MiB, increment: 0.23 MiB\n", 259 | "CPU times: user 68.2 ms, sys: 37.2 ms, total: 105 ms\n", 260 | "Wall time: 219 ms\n" 261 | ] 262 | } 263 | ], 264 | "source": [ 265 | "in_json = '../data/nasdaq.json'\n", 266 | "%time %memit nasdaq = pd.read_json(in_json, orient='records', lines=True)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 7, 272 | "metadata": { 273 | "ExecuteTime": { 274 | "end_time": "2019-01-06T11:08:35.547768Z", 275 | "start_time": "2019-01-06T11:08:35.177696Z" 276 | } 277 | }, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "peak memory: 180.29 MiB, increment: 56.68 MiB\n", 284 | "CPU times: user 180 ms, sys: 62.8 ms, total: 243 ms\n", 285 | "Wall time: 361 ms\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "out_json = '../data/first_count.json'\n", 291 | "%time %memit first_name.to_json(out_json, orient='records', lines=True)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "### 4.2.3 Swiss knife for JSON\n", 299 | "\n", 300 | "Sometimes, a JSON file can be very nasty, and we just couldn't figure out how to 
read it. Luckily, `pandas` has a Swiss knife for this task -- `pd.io.json.json_normalize`.\n", 301 | "\n", 302 | "The example in ths screenshot is an Unserialized JSON file generated by `request` lib. This file cannot be read by `pd.read_json`, so we used `pd.io.json.json_normalize` instead. \n", 303 | "\n", 304 | "" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 8, 310 | "metadata": { 311 | "ExecuteTime": { 312 | "end_time": "2019-01-06T11:08:35.801634Z", 313 | "start_time": "2019-01-06T11:08:35.551407Z" 314 | } 315 | }, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "peak memory: 175.99 MiB, increment: 0.04 MiB\n", 322 | "CPU times: user 55.8 ms, sys: 30.7 ms, total: 86.5 ms\n", 323 | "Wall time: 199 ms\n" 324 | ] 325 | }, 326 | { 327 | "data": { 328 | "text/html": [ 329 | "
\n", 330 | "\n", 343 | "\n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | "
_geoloc.lat_geoloc.lng_highlightResult.region.hascadvertiser.avatar_urladvertiser.categoryadvertiser.idadvertiser.nameadvertiser.phoneadvertiser.phone_full.phoneadvertiser.phone_full.status...rental_yieldsale_priceslugstationtitle.entitle.jatitle.rutitle.thtokentransaction
013.723916100.566902[{'value': 'TH', 'matchLevel': 'none', 'matche...https://files.hipcdn.com/avatars/53faa8bd93164...Agent559636cd70726f2451000094Findbangkokroom.com099-095-5...099-095-5535ok...NoneNonebangkok-condo[509ea305d2af11286e000ace, 509ea305d2af11286e0...For Rent 5 Beds Condo in Khlong Toei, Bangkok,...For Rent 5 Beds コンド in Khlong Toei, Bangkok, T...В аренду: Кондо с 5 спальнями в районе Khlong ...ให้เช่า คอนโด 5 ห้องนอน คลองเตย กรุงเทพฯAAFBALPC[rent]
\n", 397 | "

1 rows × 75 columns

\n", 398 | "
" 399 | ], 400 | "text/plain": [ 401 | " _geoloc.lat _geoloc.lng \\\n", 402 | "0 13.723916 100.566902 \n", 403 | "\n", 404 | " _highlightResult.region.hasc \\\n", 405 | "0 [{'value': 'TH', 'matchLevel': 'none', 'matche... \n", 406 | "\n", 407 | " advertiser.avatar_url advertiser.category \\\n", 408 | "0 https://files.hipcdn.com/avatars/53faa8bd93164... Agent \n", 409 | "\n", 410 | " advertiser.id advertiser.name advertiser.phone \\\n", 411 | "0 559636cd70726f2451000094 Findbangkokroom.com 099-095-5... \n", 412 | "\n", 413 | " advertiser.phone_full.phone advertiser.phone_full.status ... \\\n", 414 | "0 099-095-5535 ok ... \n", 415 | "\n", 416 | " rental_yield sale_price slug \\\n", 417 | "0 None None bangkok-condo \n", 418 | "\n", 419 | " station \\\n", 420 | "0 [509ea305d2af11286e000ace, 509ea305d2af11286e0... \n", 421 | "\n", 422 | " title.en \\\n", 423 | "0 For Rent 5 Beds Condo in Khlong Toei, Bangkok,... \n", 424 | "\n", 425 | " title.ja \\\n", 426 | "0 For Rent 5 Beds コンド in Khlong Toei, Bangkok, T... \n", 427 | "\n", 428 | " title.ru \\\n", 429 | "0 В аренду: Кондо с 5 спальнями в районе Khlong ... 
\n", 430 | "\n", 431 | " title.th token transaction \n", 432 | "0 ให้เช่า คอนโด 5 ห้องนอน คลองเตย กรุงเทพฯ AAFBALPC [rent] \n", 433 | "\n", 434 | "[1 rows x 75 columns]" 435 | ] 436 | }, 437 | "execution_count": 8, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "from pandas.io.json import json_normalize\n", 444 | "\n", 445 | "in_json = '../data/AAFBALPC.json'\n", 446 | "\n", 447 | "def convert_json(in_file):\n", 448 | " with open(in_file) as json_data:\n", 449 | " data = json.load(json_data)\n", 450 | " del data['formatted']\n", 451 | " df = json_normalize(data)\n", 452 | " return df\n", 453 | "\n", 454 | "%time %memit df = convert_json(in_json)\n", 455 | "df.head()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "---\n", 463 | "\n", 464 | "## 4.3 HDF5\n", 465 | "\n", 466 | "HDF5 is a unique file format. We can include multiple other-format files into a single HDF5 file, and used a key to index them. Therefore, we can save space and reading speed of multiple files." 
467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 9, 472 | "metadata": { 473 | "ExecuteTime": { 474 | "end_time": "2019-01-06T11:08:35.812367Z", 475 | "start_time": "2019-01-06T11:08:35.805229Z" 476 | } 477 | }, 478 | "outputs": [], 479 | "source": [ 480 | "def hdf2df(in_hdf, hdf_keys):\n", 481 | " \"\"\"\n", 482 | " Read a hdf5 file and return all dfs\n", 483 | " :param in_hdf: a hdf5 file\n", 484 | " :param hdf_keys:\n", 485 | " :return a dict of df\n", 486 | " \"\"\"\n", 487 | " return {i: pd.read_hdf(in_hdf, i) for i in hdf_keys}\n", 488 | "\n", 489 | "\n", 490 | "def df2hdf(out_hdf, dfs, hdf_keys, mode='a'):\n", 491 | " \"\"\"\n", 492 | " Store single or multiple dfs to one hdf5 file\n", 493 | " :param dfs: single of multiple dfs\n", 494 | " :param out_hdf: the output file\n", 495 | " :param hdf_keys: [key for hdf]\n", 496 | " \"\"\"\n", 497 | " for j, k in zip(dfs, hdf_keys):\n", 498 | " j.to_hdf(out_hdf, k, table=True, mode=mode)" 499 | ] 500 | }, 501 | { 502 | "cell_type": "markdown", 503 | "metadata": {}, 504 | "source": [ 505 | "---\n", 506 | "\n", 507 | "## 4.4 MySQL\n", 508 | "\n", 509 | "MySQL is one of the most popular databases, and `pandas` can easily read the data from it with the help of another Python library `sqlalchemy`.\n", 510 | "\n", 511 | "### 4.4.1 Read MySQL table to DataFrame\n", 512 | "\n", 513 | "1. use `sqlalchemy` to make a MySQL connection.\n", 514 | "\n", 515 | "2. give a SQL query to pandas, and query from the created connection.\n", 516 | "\n", 517 | "\n", 518 | "### 4.4.2 Write DataFrame to MySQL\n", 519 | "\n", 520 | "1. make MySQL connection,\n", 521 | "2. 
write DataFrame to MySQL" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 10, 527 | "metadata": { 528 | "ExecuteTime": { 529 | "end_time": "2019-01-06T11:08:35.940990Z", 530 | "start_time": "2019-01-06T11:08:35.817447Z" 531 | } 532 | }, 533 | "outputs": [], 534 | "source": [ 535 | "from sqlalchemy import create_engine\n", 536 | "def connect_db(host):\n", 537 | " return create_engine(host)\n", 538 | "\n", 539 | "def mysql2df(sql, con):\n", 540 | " \"\"\"\n", 541 | " pull data from SQl to dataframe\n", 542 | " :param sql: sql query\n", 543 | " :param con: sql connection\n", 544 | " :return: df\n", 545 | " \"\"\"\n", 546 | " return pd.read_sql_query(sql=sql, con=con)\n", 547 | " \n", 548 | " \n", 549 | "def df2mysql(df, table_name, con, if_exist):\n", 550 | " \"\"\"\n", 551 | " save df to sql\n", 552 | " :param df:\n", 553 | " :param table_name: sql table name\n", 554 | " :param con: sql connection\n", 555 | " :param if_exist: append if existed\n", 556 | " :return:\n", 557 | " \"\"\"\n", 558 | " df.to_sql(table_name, con, if_exists=if_exist, index=False) " 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "---\n", 566 | "\n", 567 | "## 4.5 Excel\n", 568 | "\n", 569 | "Excel is one of the most common data file formats, and pandas can handle it as well." 
570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 11, 575 | "metadata": { 576 | "ExecuteTime": { 577 | "end_time": "2019-01-06T11:08:36.413028Z", 578 | "start_time": "2019-01-06T11:08:35.943521Z" 579 | } 580 | }, 581 | "outputs": [ 582 | { 583 | "name": "stdout", 584 | "output_type": "stream", 585 | "text": [ 586 | "peak memory: 189.64 MiB, increment: 9.37 MiB\n", 587 | "CPU times: user 245 ms, sys: 66.8 ms, total: 312 ms\n", 588 | "Wall time: 458 ms\n" 589 | ] 590 | } 591 | ], 592 | "source": [ 593 | "def df2excel(df, out_excel):\n", 594 | " writer = pd.ExcelWriter(out_excel)\n", 595 | " df.to_excel(writer,'Sheet1')\n", 596 | " writer.save()\n", 597 | "\n", 598 | "out_excel = '../data/test.xlsx'\n", 599 | "%time %memit df2excel(nasdaq, out_excel)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "markdown", 604 | "metadata": {}, 605 | "source": [ 606 | "---\n", 607 | "\n", 608 | "## 4.6 Benchmark of reading / writing large files with pandas\n", 609 | "\n", 610 | "This benchmark was run on a Google full name count file: 2 columns * 25,891,901 rows.\n", 611 | "\n", 612 | "\n", 613 | "\n", 614 | "---\n", 615 | "\n", 616 | "The conclusion is that __Parquet__ uses the least time to read and write, requires least time to read, and the output size is the smallest, although it requires the most time to write.\n", 617 | "\n", 618 | "### 4.6.1 Converting between DataFrame and Arrow table\n", 619 | "\n", 620 | "> Apache Arrow is a cross-language development platform for in-memory data. \n", 621 | "\n", 622 | "> It specifies a standardized language-independent columnar memory format for flat and hierarchical data, organized for efficient analytic operations on modern hardware. \n", 623 | "\n", 624 | "> It also provides computational libraries and zero-copy streaming messaging and interprocess communication. 
" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 12, 630 | "metadata": { 631 | "ExecuteTime": { 632 | "end_time": "2019-01-06T11:08:36.937389Z", 633 | "start_time": "2019-01-06T11:08:36.418748Z" 634 | } 635 | }, 636 | "outputs": [ 637 | { 638 | "name": "stdout", 639 | "output_type": "stream", 640 | "text": [ 641 | "peak memory: 192.28 MiB, increment: 0.39 MiB\n", 642 | "CPU times: user 58.8 ms, sys: 46.3 ms, total: 105 ms\n", 643 | "Wall time: 224 ms\n", 644 | "peak memory: 184.56 MiB, increment: -7.65 MiB\n", 645 | "CPU times: user 67.5 ms, sys: 87 ms, total: 154 ms\n", 646 | "Wall time: 246 ms\n" 647 | ] 648 | } 649 | ], 650 | "source": [ 651 | "import pyarrow as pa\n", 652 | "\n", 653 | "%time %memit table = pa.Table.from_pandas(first_name)\n", 654 | "%time %memit df_new = table.to_pandas()" 655 | ] 656 | }, 657 | { 658 | "cell_type": "markdown", 659 | "metadata": {}, 660 | "source": [ 661 | "---\n", 662 | "\n", 663 | "### 4.6.2 Fastest way to write a DataFrame to disk\n", 664 | "\n", 665 | "Write a DataFrame to Parquet without compression with the `pyarrow` lib:\n", 666 | "\n", 667 | "1. convert DataFrame to Arrow table\n", 668 | "2. 
write table to Parquet on disk" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 13, 674 | "metadata": { 675 | "ExecuteTime": { 676 | "end_time": "2019-01-06T11:08:38.676088Z", 677 | "start_time": "2019-01-06T11:08:36.942137Z" 678 | } 679 | }, 680 | "outputs": [ 681 | { 682 | "name": "stdout", 683 | "output_type": "stream", 684 | "text": [ 685 | "peak memory: 170.10 MiB, increment: 2.57 MiB\n", 686 | "CPU times: user 1.47 s, sys: 71.8 ms, total: 1.54 s\n", 687 | "Wall time: 1.73 s\n" 688 | ] 689 | } 690 | ], 691 | "source": [ 692 | "%time %memit first_name.to_csv(out_csv, header=True, index=False)" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": 14, 698 | "metadata": { 699 | "ExecuteTime": { 700 | "end_time": "2019-01-06T11:08:39.051013Z", 701 | "start_time": "2019-01-06T11:08:38.679447Z" 702 | } 703 | }, 704 | "outputs": [ 705 | { 706 | "name": "stdout", 707 | "output_type": "stream", 708 | "text": [ 709 | "peak memory: 150.27 MiB, increment: 6.93 MiB\n", 710 | "CPU times: user 161 ms, sys: 69.6 ms, total: 231 ms\n", 711 | "Wall time: 351 ms\n" 712 | ] 713 | } 714 | ], 715 | "source": [ 716 | "import pyarrow.parquet as pq\n", 717 | "out_pq = '../data/test.pq'\n", 718 | "\n", 719 | "def df_parquet(df, out_pq):\n", 720 | " table = pa.Table.from_pandas(df)\n", 721 | " pq.write_table(table, out_pq, compression='none')\n", 722 | "\n", 723 | "\n", 724 | "%time %memit df_parquet(first_name, out_pq)" 725 | ] 726 | }, 727 | { 728 | "cell_type": "markdown", 729 | "metadata": {}, 730 | "source": [ 731 | "---\n", 732 | "\n", 733 | "### 4.6.3 Fastest way to read a file to a DataFrame\n", 734 | "\n", 735 | "Read an uncompressed Parquet file to a DataFrame with the `pyarrow` lib:\n", 736 | "\n", 737 | "1. read Parquet file to Arrow table\n", 738 | "2. 
convert table to pandas DataFrame" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": 15, 744 | "metadata": { 745 | "ExecuteTime": { 746 | "end_time": "2019-01-06T11:08:39.510881Z", 747 | "start_time": "2019-01-06T11:08:39.055755Z" 748 | } 749 | }, 750 | "outputs": [ 751 | { 752 | "name": "stdout", 753 | "output_type": "stream", 754 | "text": [ 755 | "peak memory: 160.75 MiB, increment: 14.10 MiB\n", 756 | "CPU times: user 230 ms, sys: 87.7 ms, total: 318 ms\n", 757 | "Wall time: 446 ms\n" 758 | ] 759 | } 760 | ], 761 | "source": [ 762 | "%time %memit first_name = pd.read_csv(in_csv, engine='c')" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": 16, 768 | "metadata": { 769 | "ExecuteTime": { 770 | "end_time": "2019-01-06T11:08:39.771942Z", 771 | "start_time": "2019-01-06T11:08:39.519861Z" 772 | } 773 | }, 774 | "outputs": [ 775 | { 776 | "name": "stdout", 777 | "output_type": "stream", 778 | "text": [ 779 | "peak memory: 162.25 MiB, increment: 13.16 MiB\n", 780 | "CPU times: user 97.6 ms, sys: 90.9 ms, total: 188 ms\n", 781 | "Wall time: 239 ms\n" 782 | ] 783 | } 784 | ], 785 | "source": [ 786 | "def parquet_df(in_pq):\n", 787 | " table = pq.read_table(in_pq)\n", 788 | " return table.to_pandas()\n", 789 | "\n", 790 | "in_pq = '../data/test.pq'\n", 791 | "\n", 792 | "%time %memit parquet_df(in_pq)" 793 | ] 794 | }, 795 | { 796 | "cell_type": "markdown", 797 | "metadata": {}, 798 | "source": [ 799 | "## 4.7 Exercise\n", 800 | "\n", 801 | "1. Read the comprehensive introduction of Pandas IO tools [here](http://pandas.pydata.org/pandas-docs/stable/io.html).\n", 802 | "\n", 803 | "2. Find out how to read an Excel file to pandas DataFrame.\n", 804 | "\n", 805 | "3. Test all solutions in the benchmark table." 
806 | ] 807 | } 808 | ], 809 | "metadata": { 810 | "kernelspec": { 811 | "display_name": "Python 3", 812 | "language": "python", 813 | "name": "python3" 814 | }, 815 | "language_info": { 816 | "codemirror_mode": { 817 | "name": "ipython", 818 | "version": 3 819 | }, 820 | "file_extension": ".py", 821 | "mimetype": "text/x-python", 822 | "name": "python", 823 | "nbconvert_exporter": "python", 824 | "pygments_lexer": "ipython3", 825 | "version": "3.7.2" 826 | }, 827 | "toc": { 828 | "base_numbering": 1, 829 | "nav_menu": {}, 830 | "number_sections": false, 831 | "sideBar": true, 832 | "skip_h1_title": false, 833 | "title_cell": "Table of Contents", 834 | "title_sidebar": "Contents", 835 | "toc_cell": false, 836 | "toc_position": {}, 837 | "toc_section_display": true, 838 | "toc_window_display": false 839 | } 840 | }, 841 | "nbformat": 4, 842 | "nbformat_minor": 2 843 | } 844 | -------------------------------------------------------------------------------- /Pandas 02 - Series.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas 02 - Series\n", 8 | "\n", 9 | "by Nova@Douban\n", 10 | "\n", 11 | "The video record of this session is here: https://zoom.us/recording/share/hMxWmW7CRemL7wT8495JdIDXFCiyU6TAkMO4fL7J9GOwIumekTziMw?startTime=1546170816000\n", 12 | "\n", 13 | "---\n", 14 | "\n", 15 | "## 2.1 The Series object\n", 16 | "\n", 17 | "### 2.1.1 Concept\n", 18 | "\n", 19 | "pandas Series:\n", 20 | "\n", 21 | "1. represents a one-dimensional labeled indexed array;\n", 22 | "\n", 23 | "2. 
deviates from NumPy arrays by adding an index.\n", 24 | "\n", 25 | "### 2.1.2 Examples of pandas Series" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": { 32 | "ExecuteTime": { 33 | "end_time": "2018-12-31T01:54:56.702203Z", 34 | "start_time": "2018-12-31T01:54:56.063833Z" 35 | } 36 | }, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "array([ 0.46676643, -0.15545763, 0.44000794, 1.88418346, 1.35743695,\n", 42 | " -0.92247118])" 43 | ] 44 | }, 45 | "execution_count": 1, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "import numpy as np\n", 52 | "import pandas as pd\n", 53 | "\n", 54 | "aray = np.random.randn(6)\n", 55 | "aray" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "metadata": { 62 | "ExecuteTime": { 63 | "end_time": "2018-12-31T01:54:56.759244Z", 64 | "start_time": "2018-12-31T01:54:56.707515Z" 65 | } 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "m 0.466766\n", 72 | "a -0.155458\n", 73 | "f 0.440008\n", 74 | "9 1.884183\n", 75 | "h 1.357437\n", 76 | "l -0.922471\n", 77 | "dtype: float64" 78 | ] 79 | }, 80 | "execution_count": 2, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "srs = pd.Series(aray, index = ['m', 'a', 'f', '9', 'h', 'l'])\n", 87 | "srs" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 | "metadata": { 94 | "ExecuteTime": { 95 | "end_time": "2018-12-31T01:54:56.784106Z", 96 | "start_time": "2018-12-31T01:54:56.765979Z" 97 | } 98 | }, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "[ 0.46676643 -0.15545763 0.44000794 1.88418346 1.35743695 -0.92247118]\n", 105 | "Index(['m', 'a', 'f', '9', 'h', 'l'], dtype='object')\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "print(srs.values)\n", 111 | "print(srs.index)" 112 | ] 113 | }, 114 | { 115 | 
"cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "---\n", 119 | "\n", 120 | "\n", 121 | "## 2.2 Creating Series\n", 122 | "\n", 123 | "### 2.2.1 Creating from other data structures\n", 124 | "\n", 125 | "A Series can be created and initialized by passing \n", 126 | "\n", 127 | "1. a scalar value, \n", 128 | "2. a NumPy ndarray,\n", 129 | "3. a Python list, \n", 130 | "4. a Python Dict,\n", 131 | "\n", 132 | "### 2.2.2 Examples of creating Series" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 4, 138 | "metadata": { 139 | "ExecuteTime": { 140 | "end_time": "2018-12-31T01:54:56.797762Z", 141 | "start_time": "2018-12-31T01:54:56.786810Z" 142 | } 143 | }, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | " 1.278977 a\n", 149 | "-1.642189 a\n", 150 | " 0.300328 a\n", 151 | " 1.407208 a\n", 152 | " 1.018008 a\n", 153 | " 0.101313 a\n", 154 | "dtype: object" 155 | ] 156 | }, 157 | "execution_count": 4, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "# from a scaler value\n", 164 | "ind = np.random.randn(6)\n", 165 | "pd.Series('a', index=ind)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 5, 171 | "metadata": { 172 | "ExecuteTime": { 173 | "end_time": "2018-12-31T01:54:56.828669Z", 174 | "start_time": "2018-12-31T01:54:56.807395Z" 175 | } 176 | }, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/plain": [ 181 | "0 -1.180448\n", 182 | "1 0.274229\n", 183 | "2 -1.972792\n", 184 | "3 -0.108837\n", 185 | "4 -0.255784\n", 186 | "5 1.419131\n", 187 | "dtype: float64" 188 | ] 189 | }, 190 | "execution_count": 5, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "# from a numpy ndarray\n", 197 | "aray = np.random.randn(6)\n", 198 | "pd.Series(aray)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 6, 204 | "metadata": { 205 | "ExecuteTime": 
{ 206 | "end_time": "2018-12-31T01:54:56.844589Z", 207 | "start_time": "2018-12-31T01:54:56.835249Z" 208 | } 209 | }, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "0 0\n", 215 | "1 1\n", 216 | "2 3\n", 217 | "3 89\n", 218 | "dtype: int64" 219 | ] 220 | }, 221 | "execution_count": 6, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | } 225 | ], 226 | "source": [ 227 | "# from a list\n", 228 | "lst = [0, 1, 3, 89]\n", 229 | "pd.Series(lst)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 7, 235 | "metadata": { 236 | "ExecuteTime": { 237 | "end_time": "2018-12-31T01:54:56.859083Z", 238 | "start_time": "2018-12-31T01:54:56.848838Z" 239 | } 240 | }, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "a 9.1\n", 246 | "i 0.0\n", 247 | "dtype: float64" 248 | ] 249 | }, 250 | "execution_count": 7, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "# from a dict\n", 257 | "dic = {'a': 9.1, 'i': 0}\n", 258 | "pd.Series(dic)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "---\n", 266 | "\n", 267 | "### 2.2.3 Index and values of Series\n", 268 | "\n", 269 | "1. By default, the Series object will construct an index automatically using integer values.\n", 270 | "\n", 271 | "\n", 272 | "2. To specify the index, use the index parameter of the constructor.\n", 273 | "\n", 274 | "\n", 275 | "3. 
A Series created with scaler value allows you to apply an operation and a single value across all elements of a Series.\n", 276 | "\n", 277 | "### 2.2.4 Examples of index and values of Series" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 8, 283 | "metadata": { 284 | "ExecuteTime": { 285 | "end_time": "2018-12-31T01:54:56.889714Z", 286 | "start_time": "2018-12-31T01:54:56.862232Z" 287 | } 288 | }, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/plain": [ 293 | " 0.284722 0\n", 294 | "-0.761072 1\n", 295 | "-1.104793 3\n", 296 | " 0.730485 89\n", 297 | "dtype: int64" 298 | ] 299 | }, 300 | "execution_count": 8, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "# Set the index when creating the Series\n", 307 | "srs = pd.Series(lst, index = np.random.randn(4))\n", 308 | "srs" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 9, 314 | "metadata": { 315 | "ExecuteTime": { 316 | "end_time": "2018-12-31T01:54:56.916903Z", 317 | "start_time": "2018-12-31T01:54:56.897158Z" 318 | } 319 | }, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "array([ 0, 1, 3, 89])" 325 | ] 326 | }, 327 | "execution_count": 9, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "# get the values of srs\n", 334 | "srs.values" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 10, 340 | "metadata": { 341 | "ExecuteTime": { 342 | "end_time": "2018-12-31T01:54:56.946243Z", 343 | "start_time": "2018-12-31T01:54:56.921324Z" 344 | } 345 | }, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | " 0.284722 5\n", 352 | "-0.761072 5\n", 353 | "-1.104793 5\n", 354 | " 0.730485 5\n", 355 | "dtype: int64\n", 356 | "\n", 357 | " 0.284722 0\n", 358 | "-0.761072 5\n", 359 | "-1.104793 15\n", 360 | " 0.730485 445\n", 361 | "dtype: int64\n", 362 | "\n", 
363 | " 0.284722 0\n", 364 | "-0.761072 5\n", 365 | "-1.104793 15\n", 366 | " 0.730485 445\n", 367 | "dtype: int64\n" 368 | ] 369 | } 370 | ], 371 | "source": [ 372 | "# A Series created from a scaler value is useful\n", 373 | "scaler = pd.Series(5, index=srs.index)\n", 374 | "print(scaler)\n", 375 | "print()\n", 376 | "print(srs * scaler)\n", 377 | "print()\n", 378 | "print(srs * 5)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "---\n", 386 | "\n", 387 | "## 2.3 Accessing Series\n", 388 | "\n", 389 | "1. `pd.Series.size()`: return the number of elements in the underlying data;\n", 390 | "2. `pd.Series.shape`: return a tuple of the shape of the underlying data;\n", 391 | "3. `pd.Series.unique()`: return unique values of Series object;\n", 392 | "4. `pd.Series.count()`: return number of non-NA/null observations in the Series;\n", 393 | "5. `pd.Series.head()`: return the first `n` rows;\n", 394 | "6. `pd.Series.tail()`: return the last `n` rows;\n", 395 | "7. 
`pd.Series.take()`: return the elements in the given *positional* indices along an axis;" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 11, 401 | "metadata": { 402 | "ExecuteTime": { 403 | "end_time": "2018-12-31T01:54:56.962751Z", 404 | "start_time": "2018-12-31T01:54:56.951383Z" 405 | } 406 | }, 407 | "outputs": [ 408 | { 409 | "data": { 410 | "text/plain": [ 411 | "4" 412 | ] 413 | }, 414 | "execution_count": 11, 415 | "metadata": {}, 416 | "output_type": "execute_result" 417 | } 418 | ], 419 | "source": [ 420 | "srs.size" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 12, 426 | "metadata": { 427 | "ExecuteTime": { 428 | "end_time": "2018-12-31T01:54:56.988077Z", 429 | "start_time": "2018-12-31T01:54:56.967448Z" 430 | } 431 | }, 432 | "outputs": [ 433 | { 434 | "data": { 435 | "text/plain": [ 436 | "(4,)" 437 | ] 438 | }, 439 | "execution_count": 12, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "srs.shape" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 13, 451 | "metadata": { 452 | "ExecuteTime": { 453 | "end_time": "2018-12-31T01:54:57.025706Z", 454 | "start_time": "2018-12-31T01:54:56.995758Z" 455 | } 456 | }, 457 | "outputs": [ 458 | { 459 | "data": { 460 | "text/plain": [ 461 | "array([ 0, 1, 3, 89])" 462 | ] 463 | }, 464 | "execution_count": 13, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "srs.unique()" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 14, 476 | "metadata": { 477 | "ExecuteTime": { 478 | "end_time": "2018-12-31T01:54:57.053575Z", 479 | "start_time": "2018-12-31T01:54:57.030985Z" 480 | } 481 | }, 482 | "outputs": [ 483 | { 484 | "data": { 485 | "text/plain": [ 486 | "4" 487 | ] 488 | }, 489 | "execution_count": 14, 490 | "metadata": {}, 491 | "output_type": "execute_result" 492 | } 493 | ], 494 | "source": [ 495 | 
"srs.count()" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 15, 501 | "metadata": { 502 | "ExecuteTime": { 503 | "end_time": "2018-12-31T01:54:57.074672Z", 504 | "start_time": "2018-12-31T01:54:57.061985Z" 505 | } 506 | }, 507 | "outputs": [ 508 | { 509 | "data": { 510 | "text/plain": [ 511 | " 0.284722 0\n", 512 | "-0.761072 1\n", 513 | "dtype: int64" 514 | ] 515 | }, 516 | "execution_count": 15, 517 | "metadata": {}, 518 | "output_type": "execute_result" 519 | } 520 | ], 521 | "source": [ 522 | "srs.head(2)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 16, 528 | "metadata": { 529 | "ExecuteTime": { 530 | "end_time": "2018-12-31T01:54:57.107392Z", 531 | "start_time": "2018-12-31T01:54:57.085274Z" 532 | } 533 | }, 534 | "outputs": [ 535 | { 536 | "data": { 537 | "text/plain": [ 538 | "-1.104793 3\n", 539 | " 0.730485 89\n", 540 | "dtype: int64" 541 | ] 542 | }, 543 | "execution_count": 16, 544 | "metadata": {}, 545 | "output_type": "execute_result" 546 | } 547 | ], 548 | "source": [ 549 | "srs.tail(2)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 17, 555 | "metadata": { 556 | "ExecuteTime": { 557 | "end_time": "2018-12-31T01:54:57.129753Z", 558 | "start_time": "2018-12-31T01:54:57.117178Z" 559 | } 560 | }, 561 | "outputs": [ 562 | { 563 | "data": { 564 | "text/plain": [ 565 | " 0.284722 0\n", 566 | "-0.761072 1\n", 567 | "dtype: int64" 568 | ] 569 | }, 570 | "execution_count": 17, 571 | "metadata": {}, 572 | "output_type": "execute_result" 573 | } 574 | ], 575 | "source": [ 576 | "srs.take([0, 1], axis=0)" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": {}, 582 | "source": [ 583 | "---\n", 584 | "\n", 585 | "## 2.4 More about alignment\n", 586 | "\n", 587 | "### 2.4.1 Always start with alignment\n", 588 | "\n", 589 | "The computing between multiple Series always start with alignment." 
590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 18, 595 | "metadata": { 596 | "ExecuteTime": { 597 | "end_time": "2018-12-31T01:55:13.395623Z", 598 | "start_time": "2018-12-31T01:54:57.135720Z" 599 | } 600 | }, 601 | "outputs": [ 602 | { 603 | "name": "stdout", 604 | "output_type": "stream", 605 | "text": [ 606 | "97.1 µs ± 16.2 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", 607 | "102 µs ± 27.4 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" 608 | ] 609 | } 610 | ], 611 | "source": [ 612 | "# A Series * scaler values VS. vectorization\n", 613 | "scaler = pd.Series(5, index=srs.index)\n", 614 | "%timeit srs * scaler\n", 615 | "%timeit srs * 5" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "---\n", 623 | "\n", 624 | "### 2.4.2 Repeated labels in index\n", 625 | "\n", 626 | "If there are repeated labels in the index, the result will be surprising.\n", 627 | "\n", 628 | " Cartesian product: an index having duplicate labels will result in a number of index labels equivalent to the products of the number of the labels in each Series." 
629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 19, 634 | "metadata": { 635 | "ExecuteTime": { 636 | "end_time": "2018-12-31T01:55:13.423295Z", 637 | "start_time": "2018-12-31T01:55:13.401712Z" 638 | } 639 | }, 640 | "outputs": [ 641 | { 642 | "name": "stdout", 643 | "output_type": "stream", 644 | "text": [ 645 | "1 1.026273\n", 646 | "2 1.339302\n", 647 | "2 1.551478\n", 648 | "2 0.327764\n", 649 | "2 0.539940\n", 650 | "3 3.797827\n", 651 | "dtype: float64\n", 652 | "\n", 653 | "1 0.007499\n", 654 | "2 0.788075\n", 655 | "2 -0.223463\n", 656 | "3 2.548161\n", 657 | "dtype: float64\n", 658 | "\n", 659 | "{1: 0.007498765264798723, 2: -0.22346329852722358, 3: 2.548161210676347}\n" 660 | ] 661 | } 662 | ], 663 | "source": [ 664 | "ind = [1, 2, 2, 3]\n", 665 | "s1 = pd.Series(np.random.randn(4), index=ind)\n", 666 | "s2 = pd.Series(np.random.randn(4), index=reversed(ind))\n", 667 | "print(s1+s2)\n", 668 | "print()\n", 669 | "print(s1)\n", 670 | "print()\n", 671 | "print(s1.to_dict())" 672 | ] 673 | }, 674 | { 675 | "cell_type": "markdown", 676 | "metadata": {}, 677 | "source": [ 678 | "---\n", 679 | "\n", 680 | "## 2.5 Boolean selection\n", 681 | "\n", 682 | "\n", 683 | "1. Boolean selection produces a new Series with a copy of index and value for the selected rows.\n", 684 | "\n", 685 | "\n", 686 | "2. With the `[]` operator, Boolean selection can get the values of the original Series.\n", 687 | "\n", 688 | "\n", 689 | "3. Chain selection does not work with Series, instead, put parentheses around logical conditions and use '|' and '&'." 
690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": 20, 695 | "metadata": { 696 | "ExecuteTime": { 697 | "end_time": "2018-12-31T01:55:13.448878Z", 698 | "start_time": "2018-12-31T01:55:13.432424Z" 699 | } 700 | }, 701 | "outputs": [ 702 | { 703 | "name": "stdout", 704 | "output_type": "stream", 705 | "text": [ 706 | "1 True\n", 707 | "2 True\n", 708 | "2 False\n", 709 | "3 True\n", 710 | "dtype: bool\n", 711 | "\n" 712 | ] 713 | } 714 | ], 715 | "source": [ 716 | "ss = (s1 > 0)\n", 717 | "print(ss)\n", 718 | "print(type(ss))" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 21, 724 | "metadata": { 725 | "ExecuteTime": { 726 | "end_time": "2018-12-31T01:55:13.466712Z", 727 | "start_time": "2018-12-31T01:55:13.456102Z" 728 | } 729 | }, 730 | "outputs": [ 731 | { 732 | "data": { 733 | "text/plain": [ 734 | "1 0.007499\n", 735 | "2 0.788075\n", 736 | "3 2.548161\n", 737 | "dtype: float64" 738 | ] 739 | }, 740 | "execution_count": 21, 741 | "metadata": {}, 742 | "output_type": "execute_result" 743 | } 744 | ], 745 | "source": [ 746 | "s1[(s1 > 0)]" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": 22, 752 | "metadata": { 753 | "ExecuteTime": { 754 | "end_time": "2018-12-31T01:55:13.484357Z", 755 | "start_time": "2018-12-31T01:55:13.470986Z" 756 | } 757 | }, 758 | "outputs": [ 759 | { 760 | "data": { 761 | "text/plain": [ 762 | "False" 763 | ] 764 | }, 765 | "execution_count": 22, 766 | "metadata": {}, 767 | "output_type": "execute_result" 768 | } 769 | ], 770 | "source": [ 771 | "s1[(s1 > 0)]._is_view" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": 23, 777 | "metadata": { 778 | "ExecuteTime": { 779 | "end_time": "2018-12-31T01:55:13.511880Z", 780 | "start_time": "2018-12-31T01:55:13.488409Z" 781 | } 782 | }, 783 | "outputs": [ 784 | { 785 | "data": { 786 | "text/plain": [ 787 | "1 0.007499\n", 788 | "dtype: float64" 789 | ] 790 | }, 791 | "execution_count": 23, 
792 | "metadata": {}, 793 | "output_type": "execute_result" 794 | } 795 | ], 796 | "source": [ 797 | "# s1[(0.5 > s1 > 0)]\n", 798 | "\n", 799 | "s1[(0.5 > s1)&(s1 > 0)]" 800 | ] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": 24, 805 | "metadata": { 806 | "ExecuteTime": { 807 | "end_time": "2018-12-31T01:55:13.532020Z", 808 | "start_time": "2018-12-31T01:55:13.517926Z" 809 | } 810 | }, 811 | "outputs": [ 812 | { 813 | "data": { 814 | "text/plain": [ 815 | "1 0.007499\n", 816 | "2 0.788075\n", 817 | "3 2.548161\n", 818 | "dtype: float64" 819 | ] 820 | }, 821 | "execution_count": 24, 822 | "metadata": {}, 823 | "output_type": "execute_result" 824 | } 825 | ], 826 | "source": [ 827 | "s1[(0.5 < s1)|(s1 > 0)]" 828 | ] 829 | }, 830 | { 831 | "cell_type": "markdown", 832 | "metadata": {}, 833 | "source": [ 834 | "---\n", 835 | "\n", 836 | "## 2.6 Slicing a Series\n", 837 | "\n", 838 | "Slicing a Series is similar to slicing a list, and the result is a view, instead of a copy.\n", 839 | "\n", 840 | "If the series has n elements, then negative values for the start and end of the slice represent elements n + start through and not including n + end." 
841 | ] 842 | }, 843 | { 844 | "cell_type": "code", 845 | "execution_count": 25, 846 | "metadata": { 847 | "ExecuteTime": { 848 | "end_time": "2018-12-31T01:55:13.546357Z", 849 | "start_time": "2018-12-31T01:55:13.535770Z" 850 | } 851 | }, 852 | "outputs": [ 853 | { 854 | "name": "stdout", 855 | "output_type": "stream", 856 | "text": [ 857 | "1 0.007499\n", 858 | "2 0.788075\n", 859 | "2 -0.223463\n", 860 | "3 2.548161\n", 861 | "dtype: float64\n", 862 | "\n", 863 | "1 0.007499\n", 864 | "2 0.788075\n", 865 | "2 -0.223463\n", 866 | "3 2.548161\n", 867 | "dtype: float64\n" 868 | ] 869 | } 870 | ], 871 | "source": [ 872 | "print(s1)\n", 873 | "print()\n", 874 | "s3 = s1[0:]\n", 875 | "print(s3)" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": 26, 881 | "metadata": { 882 | "ExecuteTime": { 883 | "end_time": "2018-12-31T01:55:13.575589Z", 884 | "start_time": "2018-12-31T01:55:13.554323Z" 885 | } 886 | }, 887 | "outputs": [ 888 | { 889 | "name": "stdout", 890 | "output_type": "stream", 891 | "text": [ 892 | "True\n", 893 | "\n", 894 | "False\n" 895 | ] 896 | } 897 | ], 898 | "source": [ 899 | "print(s3._is_view)\n", 900 | "print()\n", 901 | "print(s3.copy()._is_view)" 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": 27, 907 | "metadata": { 908 | "ExecuteTime": { 909 | "end_time": "2018-12-31T01:55:13.595081Z", 910 | "start_time": "2018-12-31T01:55:13.580618Z" 911 | } 912 | }, 913 | "outputs": [ 914 | { 915 | "data": { 916 | "text/plain": [ 917 | "3 2.548161\n", 918 | "dtype: float64" 919 | ] 920 | }, 921 | "execution_count": 27, 922 | "metadata": {}, 923 | "output_type": "execute_result" 924 | } 925 | ], 926 | "source": [ 927 | "s3 = s1[-1:]\n", 928 | "s3" 929 | ] 930 | }, 931 | { 932 | "cell_type": "code", 933 | "execution_count": 28, 934 | "metadata": { 935 | "ExecuteTime": { 936 | "end_time": "2018-12-31T01:55:13.614137Z", 937 | "start_time": "2018-12-31T01:55:13.604408Z" 938 | } 939 | }, 940 | "outputs": [ 941 
| { 942 | "name": "stdout", 943 | "output_type": "stream", 944 | "text": [ 945 | "1 0.007499\n", 946 | "2 0.788075\n", 947 | "2 -0.223463\n", 948 | "3 2.548161\n", 949 | "dtype: float64\n", 950 | "\n", 951 | "3 2.548161\n", 952 | "2 -0.223463\n", 953 | "2 0.788075\n", 954 | "1 0.007499\n", 955 | "dtype: float64\n" 956 | ] 957 | } 958 | ], 959 | "source": [ 960 | "print(s1)\n", 961 | "print()\n", 962 | "print(s1[::-1])" 963 | ] 964 | }, 965 | { 966 | "cell_type": "markdown", 967 | "metadata": {}, 968 | "source": [ 969 | "---\n", 970 | "\n", 971 | "## 2.7 Sorting and ranking\n", 972 | "\n", 973 | "Sorting a Series can be based on indices or values, and pandas provides both solutions:\n", 974 | "\n", 975 | "`pd.Series.sort_index()`: sort a Series by row indices, and returns a new, sorted object\n", 976 | "\n", 977 | "`pd.Series.sort_values()`: sort a Series by its values\n", 978 | "\n", 979 | "\n", 980 | "Please note: any missing values are sorted to the end of the Series by default" 981 | ] 982 | }, 983 | { 984 | "cell_type": "code", 985 | "execution_count": 29, 986 | "metadata": { 987 | "ExecuteTime": { 988 | "end_time": "2018-12-31T01:55:13.647511Z", 989 | "start_time": "2018-12-31T01:55:13.619834Z" 990 | } 991 | }, 992 | "outputs": [ 993 | { 994 | "data": { 995 | "text/plain": [ 996 | "1 -0.329720\n", 997 | "2 0.083856\n", 998 | "2 1.009812\n", 999 | "9 0.325275\n", 1000 | "dtype: float64" 1001 | ] 1002 | }, 1003 | "execution_count": 29, 1004 | "metadata": {}, 1005 | "output_type": "execute_result" 1006 | } 1007 | ], 1008 | "source": [ 1009 | "ind = [9, 2, 2, 1]\n", 1010 | "s1 = pd.Series(np.random.randn(4), index=ind)\n", 1011 | "s1.sort_index()" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": 30, 1017 | "metadata": { 1018 | "ExecuteTime": { 1019 | "end_time": "2018-12-31T01:55:13.673645Z", 1020 | "start_time": "2018-12-31T01:55:13.649936Z" 1021 | } 1022 | }, 1023 | "outputs": [ 1024 | { 1025 | "data": { 1026 | "text/plain": [ 1027 
| "1 -0.329720\n", 1028 | "2 0.083856\n", 1029 | "9 0.325275\n", 1030 | "2 1.009812\n", 1031 | "dtype: float64" 1032 | ] 1033 | }, 1034 | "execution_count": 30, 1035 | "metadata": {}, 1036 | "output_type": "execute_result" 1037 | } 1038 | ], 1039 | "source": [ 1040 | "s1.sort_values()" 1041 | ] 1042 | }, 1043 | { 1044 | "cell_type": "code", 1045 | "execution_count": 31, 1046 | "metadata": { 1047 | "ExecuteTime": { 1048 | "end_time": "2018-12-31T01:55:13.704324Z", 1049 | "start_time": "2018-12-31T01:55:13.683345Z" 1050 | } 1051 | }, 1052 | "outputs": [ 1053 | { 1054 | "data": { 1055 | "text/plain": [ 1056 | "5 1.0\n", 1057 | "2 2.0\n", 1058 | "4 2.0\n", 1059 | "0 9.0\n", 1060 | "1 NaN\n", 1061 | "3 NaN\n", 1062 | "dtype: float64" 1063 | ] 1064 | }, 1065 | "execution_count": 31, 1066 | "metadata": {}, 1067 | "output_type": "execute_result" 1068 | } 1069 | ], 1070 | "source": [ 1071 | "s3 = pd.Series([9, None, 2, None, 2, 1])\n", 1072 | "s3.sort_values()" 1073 | ] 1074 | }, 1075 | { 1076 | "cell_type": "code", 1077 | "execution_count": 32, 1078 | "metadata": { 1079 | "ExecuteTime": { 1080 | "end_time": "2018-12-31T01:55:13.720857Z", 1081 | "start_time": "2018-12-31T01:55:13.711894Z" 1082 | } 1083 | }, 1084 | "outputs": [ 1085 | { 1086 | "name": "stdout", 1087 | "output_type": "stream", 1088 | "text": [ 1089 | "0 9.0\n", 1090 | "1 NaN\n", 1091 | "2 2.0\n", 1092 | "3 NaN\n", 1093 | "4 2.0\n", 1094 | "5 1.0\n", 1095 | "dtype: float64\n", 1096 | "\n", 1097 | "0 4.0\n", 1098 | "1 NaN\n", 1099 | "2 2.5\n", 1100 | "3 NaN\n", 1101 | "4 2.5\n", 1102 | "5 1.0\n", 1103 | "dtype: float64\n" 1104 | ] 1105 | } 1106 | ], 1107 | "source": [ 1108 | "print(s3)\n", 1109 | "print()\n", 1110 | "print(s3.rank())" 1111 | ] 1112 | }, 1113 | { 1114 | "cell_type": "markdown", 1115 | "metadata": {}, 1116 | "source": [ 1117 | "---\n", 1118 | "\n", 1119 | "## 2.8 Copy VS view\n", 1120 | "\n", 1121 | "This warning often occurs when we write pandas functions:\n", 1122 | "\n", 1123 | "> 
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/ipykernel/__main__.py:1: SettingWithCopyWarning: \n", 1124 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1125 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1126 | "\n", 1127 | "\n", 1128 | "The fundamental cause of this warning is that we used chain indexing in pandas, which is a taboo. To solve this problem, we need to clarify copy and view in pandas first.\n", 1129 | "\n", 1130 | "View: \n", 1131 | "\n", 1132 | " 1. can be regarded as a reference to the original DataFrame / Series;\n", 1133 | " 2. the modification on the view affects the original DataFrame / Series.\n", 1134 | "\n", 1135 | "\n", 1136 | "Copy: \n", 1137 | "\n", 1138 | " 1. a new DataFrame / Series based on the original DataFrame / Series;\n", 1139 | " 2. the modification doesn't affect the original DataFrame / Series.\n", 1140 | "\n", 1141 | "\n", 1142 | "\n", 1143 | "---\n", 1144 | "\n", 1145 | "\n", 1146 | "\n", 1147 | "\n", 1148 | "---\n", 1149 | "\n", 1150 | "The chain indexing may introduce views and copies at the same time, so the original DataFrame might be affected without noticing. 
This is super dangerous!\n", 1151 | "\n", 1152 | "We can avoid chain indexing by using `pd.DataFrame.loc()` / `pd.DataFrame.iloc()`" 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "execution_count": 33, 1158 | "metadata": { 1159 | "ExecuteTime": { 1160 | "end_time": "2018-12-31T01:55:13.738121Z", 1161 | "start_time": "2018-12-31T01:55:13.723858Z" 1162 | } 1163 | }, 1164 | "outputs": [ 1165 | { 1166 | "data": { 1167 | "text/plain": [ 1168 | "False" 1169 | ] 1170 | }, 1171 | "execution_count": 33, 1172 | "metadata": {}, 1173 | "output_type": "execute_result" 1174 | } 1175 | ], 1176 | "source": [ 1177 | "s3._is_view" 1178 | ] 1179 | }, 1180 | { 1181 | "cell_type": "code", 1182 | "execution_count": 34, 1183 | "metadata": { 1184 | "ExecuteTime": { 1185 | "end_time": "2018-12-31T01:55:13.751490Z", 1186 | "start_time": "2018-12-31T01:55:13.743247Z" 1187 | } 1188 | }, 1189 | "outputs": [ 1190 | { 1191 | "data": { 1192 | "text/plain": [ 1193 | "False" 1194 | ] 1195 | }, 1196 | "execution_count": 34, 1197 | "metadata": {}, 1198 | "output_type": "execute_result" 1199 | } 1200 | ], 1201 | "source": [ 1202 | "# Converting a view to a copy\n", 1203 | "\n", 1204 | "s4 = s3.copy()\n", 1205 | "s4._is_view" 1206 | ] 1207 | }, 1208 | { 1209 | "cell_type": "code", 1210 | "execution_count": 35, 1211 | "metadata": { 1212 | "ExecuteTime": { 1213 | "end_time": "2018-12-31T01:55:13.769675Z", 1214 | "start_time": "2018-12-31T01:55:13.753811Z" 1215 | } 1216 | }, 1217 | "outputs": [ 1218 | { 1219 | "data": { 1220 | "text/plain": [ 1221 | "True" 1222 | ] 1223 | }, 1224 | "execution_count": 35, 1225 | "metadata": {}, 1226 | "output_type": "execute_result" 1227 | } 1228 | ], 1229 | "source": [ 1230 | "s5 = s4.view()\n", 1231 | "s5._is_view" 1232 | ] 1233 | }, 1234 | { 1235 | "cell_type": "markdown", 1236 | "metadata": {}, 1237 | "source": [ 1238 | "## 2.9 Exercises\n", 1239 | "\n", 1240 | "1. 
For a detailed answer of chain indexing warning, please read [this great post](https://www.dataquest.io/blog/settingwithcopywarning/)\n", 1241 | "\n", 1242 | "2. Find the parameter settings of following pandas functions:\n", 1243 | "\n", 1244 | "`pd.Series.reindex()`\n", 1245 | "\n", 1246 | "`pd.Series.sort_values()`\n", 1247 | "\n", 1248 | "`pd.Series.sort_index()`\n", 1249 | "\n", 1250 | "`pd.Series.loc()`\n", 1251 | "\n", 1252 | "`pd.Series.iloc()`\n", 1253 | "\n", 1254 | "3. Check the result of the following functions to see if they return a copy or a view?\n", 1255 | "\n", 1256 | "`pd.Series.reindex()`\n", 1257 | "\n", 1258 | "`pd.Series.sort_values()`\n", 1259 | "\n", 1260 | "`pd.Series.sort_index()`\n", 1261 | "\n", 1262 | "`pd.Series.loc()`\n", 1263 | "\n", 1264 | "`pd.Series.iloc()`" 1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "markdown", 1269 | "metadata": {}, 1270 | "source": [ 1271 | "---\n", 1272 | "\n", 1273 | "To the rest sessions (outlines and video records), please scan the QR code below to pay.\n", 1274 | "\n", 1275 | "1. The price is 799 RMB.\n", 1276 | "2. 
Please leave your email address in the __payment comment__, so I will send you the links of the rest sessions.\n", 1277 | "\n", 1278 | "\n", 1279 | "" 1280 | ] 1281 | } 1282 | ], 1283 | "metadata": { 1284 | "kernelspec": { 1285 | "display_name": "Python 3", 1286 | "language": "python", 1287 | "name": "python3" 1288 | }, 1289 | "language_info": { 1290 | "codemirror_mode": { 1291 | "name": "ipython", 1292 | "version": 3 1293 | }, 1294 | "file_extension": ".py", 1295 | "mimetype": "text/x-python", 1296 | "name": "python", 1297 | "nbconvert_exporter": "python", 1298 | "pygments_lexer": "ipython3", 1299 | "version": "3.7.2" 1300 | }, 1301 | "toc": { 1302 | "base_numbering": "2", 1303 | "nav_menu": {}, 1304 | "number_sections": false, 1305 | "sideBar": true, 1306 | "skip_h1_title": false, 1307 | "title_cell": "Table of Contents", 1308 | "title_sidebar": "Contents", 1309 | "toc_cell": false, 1310 | "toc_position": { 1311 | "height": "calc(100% - 180px)", 1312 | "left": "10px", 1313 | "top": "150px", 1314 | "width": "295.3333435058594px" 1315 | }, 1316 | "toc_section_display": true, 1317 | "toc_window_display": false 1318 | } 1319 | }, 1320 | "nbformat": 4, 1321 | "nbformat_minor": 2 1322 | } 1323 | -------------------------------------------------------------------------------- /Pandas 01 - Basics of Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas 01 - Basics of Pandas\n", 8 | "\n", 9 | "by Nova@Douban\n", 10 | "\n", 11 | "The video record of this session is here: https://zoom.us/recording/share/L9Jwofdbg3CX2L4wLoPAVrHYyi0F0ok2_58ozScsXsmwIumekTziMw\n", 12 | "\n", 13 | "\n", 14 | "---\n", 15 | "\n", 16 | "## 1.1 Data Structure of pandas\n", 17 | "\n", 18 | "`pandas` significantly simplies data structures. 
If you used `R` or a relational database, you will find `pandas` very similar.\n", 19 | "\n", 20 | "### 1.1.1 Three primary data structures in pandas\n", 21 | "\n", 22 | "1. `Series` (a column):\n", 23 | "\n", 24 | "    1. A one-dimensional array-like object containing an array of data.\n", 25 | "    \n", 26 | "    2. A fixed-length, __ordered dict__.\n", 27 | "    \n", 28 | "    3. Automatically aligns differently-indexed data in operations\n", 29 | "    \n", 30 | "    4. The column returned when indexing a DataFrame is a view, not a copy.\n", 31 | "\n", 32 | "\n", 33 | "2. `DataFrame` (a collection of columns): \n", 34 | "\n", 35 | "    1. A tabular, spreadsheet-like data structure containing an ordered collection of columns;\n", 36 | "    \n", 37 | "    2. __A collection of Series__.\n", 38 | "    \n", 39 | "    \n", 40 | "3. `index`:\n", 41 | "    \n", 42 | "    1. an Index also functions as __a fixed-size set__\n", 43 | "    \n", 44 | "    2. Index objects are __immutable__ and thus can’t be modified by the user\n", 45 | "    \n", 46 | "    3. It is a class in pandas, more complicated than the one in RDS.\n", 47 | "    \n", 48 | "     a. Identification: Indices are used to locate Series / rows / items in a DataFrame. \n", 49 | "     \n", 50 | "     b. Alignment: pandas will always align with index automatically first.\n", 51 | "    \n", 52 | "     c. 
Selection: using index to select relevant columns/rows.\n", 53 | " \n", 54 | "\n", 55 | "### 1.1.2 Example of DataFrame, Series and index" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 1, 61 | "metadata": { 62 | "ExecuteTime": { 63 | "end_time": "2018-12-28T11:47:52.640421Z", 64 | "start_time": "2018-12-28T11:47:52.039021Z" 65 | } 66 | }, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | " Date Open High Low Close \\\n", 73 | "0 2018-11-23 6919.520020 6987.890137 6919.160156 6938.979980 \n", 74 | "1 2018-11-26 7026.500000 7083.930176 7003.120117 7081.850098 \n", 75 | "2 2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 \n", 76 | "3 2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 \n", 77 | "4 2018-11-29 7267.370117 7319.959961 7217.689941 7273.080078 \n", 78 | "\n", 79 | " Adj Close Volume \n", 80 | "0 6938.979980 958950000 \n", 81 | "1 7081.850098 2011180000 \n", 82 | "2 7082.700195 2067360000 \n", 83 | "3 7291.589844 2390260000 \n", 84 | "4 7273.080078 1983460000 \n", 85 | "\n", 86 | "\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "# Download Nasdaq dataset: https://finance.yahoo.com/quote/%5EIXIC/history?p=%5EIXIC\n", 92 | "\n", 93 | "import pandas as pd\n", 94 | "\n", 95 | "in_file = '../data/nasdaq.csv'\n", 96 | "df = pd.read_csv(in_file, engine='c')\n", 97 | "\n", 98 | "print(df.head())\n", 99 | "print()\n", 100 | "print(type(df))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 2, 106 | "metadata": { 107 | "ExecuteTime": { 108 | "end_time": "2018-12-28T11:47:52.656310Z", 109 | "start_time": "2018-12-28T11:47:52.644446Z" 110 | } 111 | }, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "0 2018-11-23\n", 118 | "1 2018-11-26\n", 119 | "2 2018-11-27\n", 120 | "3 2018-11-28\n", 121 | "4 2018-11-29\n", 122 | "Name: Date, dtype: object\n", 123 | "\n", 124 | "\n", 125 | "\n" 126 | ] 127 | } 
128 | ], 129 | "source": [ 130 | "print(df['Date'].head())\n", 131 | "print()\n", 132 | "print(type(df['Date']))\n", 133 | "print()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 3, 139 | "metadata": { 140 | "ExecuteTime": { 141 | "end_time": "2018-12-28T11:47:52.678080Z", 142 | "start_time": "2018-12-28T11:47:52.661673Z" 143 | } 144 | }, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "RangeIndex(start=0, stop=20, step=1)" 150 | ] 151 | }, 152 | "execution_count": 3, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "df.index" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 4, 164 | "metadata": { 165 | "ExecuteTime": { 166 | "end_time": "2018-12-28T11:47:52.703980Z", 167 | "start_time": "2018-12-28T11:47:52.681052Z" 168 | } 169 | }, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", 175 | " 17, 18, 19])" 176 | ] 177 | }, 178 | "execution_count": 4, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "df.index.values" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "---\n", 192 | "\n", 193 | "### 1.1.3 Two other data structure in pandas\n", 194 | "\n", 195 | "1. items:\n", 196 | "\n", 197 | " 1. The smallest unit in pandas.\n", 198 | " \n", 199 | " \n", 200 | "2. rows:\n", 201 | "\n", 202 | " 1. 
Row is not a primary data structure in pandas\n", 203 | " \n", 204 | "### 1.1.4 Examples of items and rows" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 5, 210 | "metadata": { 211 | "ExecuteTime": { 212 | "end_time": "2018-12-28T11:47:52.724236Z", 213 | "start_time": "2018-12-28T11:47:52.710937Z" 214 | } 215 | }, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "'2018-11-23'" 221 | ] 222 | }, 223 | "execution_count": 5, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "df['Date'][0]" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 6, 235 | "metadata": { 236 | "ExecuteTime": { 237 | "end_time": "2018-12-28T11:47:52.761309Z", 238 | "start_time": "2018-12-28T11:47:52.731362Z" 239 | } 240 | }, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/html": [ 245 | "
\n", 246 | "\n", 259 | "\n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | "
DateOpenHighLowCloseAdj CloseVolume
02018-11-236919.520026987.8901376919.1601566938.979986938.97998958950000
\n", 285 | "
" 286 | ], 287 | "text/plain": [ 288 | " Date Open High Low Close Adj Close \\\n", 289 | "0 2018-11-23 6919.52002 6987.890137 6919.160156 6938.97998 6938.97998 \n", 290 | "\n", 291 | " Volume \n", 292 | "0 958950000 " 293 | ] 294 | }, 295 | "execution_count": 6, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "df.loc[0:0]" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "---\n", 309 | "\n", 310 | "## 1.2 Functions based on pandas\n", 311 | "\n", 312 | "\n", 313 | "\n", 314 | "\n", 315 | "\n", 316 | "### 1.2.1 Three levels of functions\n", 317 | "\n", 318 | "Each level of function only handles ite related levels of problems.\n", 319 | "\n", 320 | "1. DataFrame-level functions\n", 321 | "\n", 322 | "2. Series-level functions\n", 323 | "\n", 324 | "3. Item-level functions\n", 325 | "\n", 326 | "### 1.2.1 Example of different levels of pandas functions\n", 327 | "\n", 328 | "The following is a sample script to analyse logs:\n", 329 | "\n", 330 | "1. `prepare_overall_chat` is the overall wrapper;\n", 331 | "2. `clean_chat_log` is a DataFrame-level funtion;\n", 332 | "3. `get_all_mentions` and `count_active_user` are Series-level functions." 
333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 7, 338 | "metadata": { 339 | "ExecuteTime": { 340 | "end_time": "2018-12-28T11:47:52.786553Z", 341 | "start_time": "2018-12-28T11:47:52.765005Z" 342 | } 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "def prepare_overall_chat(chat_base, time, day):\n", 347 | " '''\n", 348 | " an overall wrapper\n", 349 | " '''\n", 350 | " # 'reading starts'\n", 351 | " overall = csv2pd(chat_base, time, day, HEAD_ANALYSIS, sep=',', engine='c')\n", 352 | " \n", 353 | " # 'overall'\n", 354 | " clean_overall = clean_chat_log(overall)\n", 355 | " \n", 356 | " # 'cleaned kom records'\n", 357 | " all_mentions = get_all_mentions(clean_overall, switch=False)\n", 358 | " \n", 359 | " # 'all_mentions'\n", 360 | " active_user_count = count_active_user(overall, overall['RoomName'], colname='Name', header=HEAD_ACTIVE)\n", 361 | " return clean_overall, all_mentions, active_user_count\n", 362 | "\n", 363 | " \n", 364 | "def clean_chat_log(df):\n", 365 | " '''\n", 366 | " at Dataframe level\n", 367 | " '''\n", 368 | " # 'remove null or duplicates'\n", 369 | " df = df[df.TextMsg.notnull()]\n", 370 | " df = df[df.Name.notnull()]\n", 371 | " df = df.drop_duplicates()\n", 372 | "\n", 373 | " # replace ?? 
or ** in data\n", 374 | " df.Name = df.loc[:, 'Name'].str.replace('\\?\\?|\\*\\*', '?#')\n", 375 | " df.TextMsg = df.loc[:, 'TextMsg'].str.replace('\\?\\?|\\*\\*', '?#')\n", 376 | "\n", 377 | " # 'to_uni' and 'strip new lines'\n", 378 | " df = batch_to_uni(df, col_list=['Name', 'TextMsg', 'RoomName'])\n", 379 | " df = batch_strip(df, col_list=['Name', 'RoomName'], strip_str='\\n\\r ')\n", 380 | " return df\n", 381 | "\n", 382 | " \n", 383 | "def get_all_mentions(df, switch=True):\n", 384 | " '''\n", 385 | " at Series level\n", 386 | " '''\n", 387 | " all_mentions = df[df['TextMsg'].str.contains('@')]\n", 388 | " all_mentions.MsgTime = pd.to_datetime(pd.Series(all_mentions.MsgTime)) # todo fix .loc\n", 389 | " if switch:\n", 390 | " cleaned_mentions = pd.DataFrame.copy(all_mentions)\n", 391 | " cleaned_mentions = batch_replace(cleaned_mentions, 'TextMsg', CN_PUNCS, '')\n", 392 | " return all_mentions, cleaned_mentions\n", 393 | " else:\n", 394 | " return all_mentions\n", 395 | " \n", 396 | "def count_active_user(df, col1, colname, header):\n", 397 | " '''\n", 398 | " at Series level\n", 399 | " '''\n", 400 | " active_user_count = df.groupby([col1])[colname].unique().apply(len)\n", 401 | " active_user_count = active_user_count.subtract(1) # Exclude 班长\n", 402 | " active_user_count = active_user_count.reset_index()\n", 403 | " active_user_count.columns = header\n", 404 | " active_user_count = batch_to_uni(active_user_count, ['RoomName'])\n", 405 | " return active_user_count" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "---" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "## 1.3 Summarizing and computing descriptive statistics\n", 420 | "\n", 421 | "\n", 422 | "### 1.3.1 Take a glance at the dataset\n", 423 | "\n", 424 | "__Be careful if the function should include parentheses:)__\n", 425 | "\n", 426 | "1. 
`DataFrame.describe()`: provide descriptive stats of the dataset\n", 427 | "2. `DataFrame.values`: access values of the dataset\n", 428 | "3. `DataFrame.head()`: access the head of the dataset\n", 429 | "4. `DataFrame.tail()`: access the tail of the dataset\n", 430 | "5. `DataFrame.shape`: provide the length and width of the dataset\n", 431 | "6. `DataFrame.size`: provide the product of the length and width of the dataset\n", 432 | "7. `DataFrame.columns`: provide the column names of the dataset\n", 433 | "8. `DataFrame.index`: provide the row index of the dataset\n", 434 | "9. `DataFrame.axes`: provide the column names and row index of the dataset\n", 435 | "\n", 436 | "### 1.3.2 Examples of a glance at the dataset" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 8, 442 | "metadata": { 443 | "ExecuteTime": { 444 | "end_time": "2018-12-28T11:47:52.848115Z", 445 | "start_time": "2018-12-28T11:47:52.798310Z" 446 | } 447 | }, 448 | "outputs": [ 449 | { 450 | "data": { 451 | "text/html": [ 452 | "<div>
\n", 453 | "\n", 466 | "\n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | "
OpenHighLowCloseAdj CloseVolume
count20.00000020.00000020.00000020.00000020.0000002.000000e+01
mean7036.4339847094.1150396932.4030036996.1860356996.1860352.492132e+09
std236.773607234.200045268.133301278.415894278.4158946.665222e+08
min6573.4902346586.6801766304.6298836333.0000006333.0000009.589500e+08
25%6911.2550056973.8701176842.6701666878.9726566878.9726562.186262e+09
50%7033.8649907117.4851086983.6748057051.0800787051.0800782.443730e+09
75%7142.3323977227.2050787092.3750007165.8875747165.8875742.643168e+09
max7486.1298837486.5097667392.2202157441.5097667441.5097664.534120e+09
\n", 553 | "
" 554 | ], 555 | "text/plain": [ 556 | " Open High Low Close Adj Close \\\n", 557 | "count 20.000000 20.000000 20.000000 20.000000 20.000000 \n", 558 | "mean 7036.433984 7094.115039 6932.403003 6996.186035 6996.186035 \n", 559 | "std 236.773607 234.200045 268.133301 278.415894 278.415894 \n", 560 | "min 6573.490234 6586.680176 6304.629883 6333.000000 6333.000000 \n", 561 | "25% 6911.255005 6973.870117 6842.670166 6878.972656 6878.972656 \n", 562 | "50% 7033.864990 7117.485108 6983.674805 7051.080078 7051.080078 \n", 563 | "75% 7142.332397 7227.205078 7092.375000 7165.887574 7165.887574 \n", 564 | "max 7486.129883 7486.509766 7392.220215 7441.509766 7441.509766 \n", 565 | "\n", 566 | " Volume \n", 567 | "count 2.000000e+01 \n", 568 | "mean 2.492132e+09 \n", 569 | "std 6.665222e+08 \n", 570 | "min 9.589500e+08 \n", 571 | "25% 2.186262e+09 \n", 572 | "50% 2.443730e+09 \n", 573 | "75% 2.643168e+09 \n", 574 | "max 4.534120e+09 " 575 | ] 576 | }, 577 | "execution_count": 8, 578 | "metadata": {}, 579 | "output_type": "execute_result" 580 | } 581 | ], 582 | "source": [ 583 | "df.describe()" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 9, 589 | "metadata": { 590 | "ExecuteTime": { 591 | "end_time": "2018-12-28T11:47:52.858589Z", 592 | "start_time": "2018-12-28T11:47:52.850909Z" 593 | } 594 | }, 595 | "outputs": [ 596 | { 597 | "data": { 598 | "text/plain": [ 599 | "array([['2018-11-23', 6919.52002, 6987.890137, 6919.160156, 6938.97998,\n", 600 | " 6938.97998, 958950000],\n", 601 | " ['2018-11-26', 7026.5, 7083.930176000001, 7003.120117,\n", 602 | " 7081.850098000001, 7081.850098000001, 2011180000],\n", 603 | " ['2018-11-27', 7041.22998, 7105.140137, 7014.359863, 7082.700195,\n", 604 | " 7082.700195, 2067360000],\n", 605 | " ['2018-11-28', 7135.080078, 7292.709961, 7090.97998, 7291.589844,\n", 606 | " 7291.589844, 2390260000],\n", 607 | " ['2018-11-29', 7267.370117, 7319.959961, 7217.689941, 7273.080078,\n", 608 | " 7273.080078, 
1983460000],\n", 609 | " ['2018-11-30', 7279.299805, 7332.790039, 7255.680176000001,\n", 610 | " 7330.540039, 7330.540039, 2542820000],\n", 611 | " ['2018-12-03', 7486.129883, 7486.509765999999, 7392.220215,\n", 612 | " 7441.509765999999, 7441.509765999999, 2621020000],\n", 613 | " ['2018-12-04', 7407.950195, 7421.109863, 7150.109863,\n", 614 | " 7158.430176000001, 7158.430176000001, 2635810000],\n", 615 | " ['2018-12-06', 7017.049805, 7189.52002, 6984.339844,\n", 616 | " 7188.259765999999, 7188.259765999999, 2833870000],\n", 617 | " ['2018-12-07', 7163.490234000001, 7205.370117, 6945.27002,\n", 618 | " 6969.25, 6969.25, 2475160000],\n", 619 | " ['2018-12-10', 6959.629883, 7047.620117, 6878.990234000001,\n", 620 | " 7020.52002, 7020.52002, 2367560000],\n", 621 | " ['2018-12-11', 7121.660156, 7129.830078, 6983.009765999999,\n", 622 | " 7031.830078, 7031.830078, 2246060000],\n", 623 | " ['2018-12-12', 7127.0, 7197.290039, 7096.560059, 7098.310059,\n", 624 | " 7098.310059, 2412300000],\n", 625 | " ['2018-12-13', 7135.279785, 7154.640137, 7034.819823999999,\n", 626 | " 7070.330078, 7070.330078, 2143520000],\n", 627 | " ['2018-12-14', 6986.370117, 7027.169922, 6898.990234000001,\n", 628 | " 6910.660156, 6910.660156, 2200510000],\n", 629 | " ['2018-12-17', 6886.459961, 6931.810059, 6710.009765999999,\n", 630 | " 6753.72998, 6753.72998, 2665240000],\n", 631 | " ['2018-12-18', 6809.819823999999, 6847.27002, 6733.709961,\n", 632 | " 6783.910156, 6783.910156, 2595400000],\n", 633 | " ['2018-12-19', 6777.589844, 6868.859863, 6586.5, 6636.830078,\n", 634 | " 6636.830078, 2899950000],\n", 635 | " ['2018-12-20', 6607.759765999999, 6666.200195, 6447.910156,\n", 636 | " 6528.410156, 6528.410156, 3258090000],\n", 637 | " ['2018-12-21', 6573.490234000001, 6586.680176, 6304.629883,\n", 638 | " 6333.0, 6333.0, 4534120000]], dtype=object)" 639 | ] 640 | }, 641 | "execution_count": 9, 642 | "metadata": {}, 643 | "output_type": "execute_result" 644 | } 645 | ], 646 | "source": [ 647 | 
"df.values" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": 10, 653 | "metadata": { 654 | "ExecuteTime": { 655 | "end_time": "2018-12-28T11:47:52.878993Z", 656 | "start_time": "2018-12-28T11:47:52.861182Z" 657 | } 658 | }, 659 | "outputs": [ 660 | { 661 | "data": { 662 | "text/html": [ 663 | "
\n", 664 | "\n", 677 | "\n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | "
DateOpenHighLowCloseAdj CloseVolume
02018-11-236919.5200206987.8901376919.1601566938.9799806938.979980958950000
12018-11-267026.5000007083.9301767003.1201177081.8500987081.8500982011180000
22018-11-277041.2299807105.1401377014.3598637082.7001957082.7001952067360000
32018-11-287135.0800787292.7099617090.9799807291.5898447291.5898442390260000
42018-11-297267.3701177319.9599617217.6899417273.0800787273.0800781983460000
\n", 743 | "
" 744 | ], 745 | "text/plain": [ 746 | " Date Open High Low Close \\\n", 747 | "0 2018-11-23 6919.520020 6987.890137 6919.160156 6938.979980 \n", 748 | "1 2018-11-26 7026.500000 7083.930176 7003.120117 7081.850098 \n", 749 | "2 2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 \n", 750 | "3 2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 \n", 751 | "4 2018-11-29 7267.370117 7319.959961 7217.689941 7273.080078 \n", 752 | "\n", 753 | " Adj Close Volume \n", 754 | "0 6938.979980 958950000 \n", 755 | "1 7081.850098 2011180000 \n", 756 | "2 7082.700195 2067360000 \n", 757 | "3 7291.589844 2390260000 \n", 758 | "4 7273.080078 1983460000 " 759 | ] 760 | }, 761 | "execution_count": 10, 762 | "metadata": {}, 763 | "output_type": "execute_result" 764 | } 765 | ], 766 | "source": [ 767 | "df.head()" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": 11, 773 | "metadata": { 774 | "ExecuteTime": { 775 | "end_time": "2018-12-28T11:47:52.903141Z", 776 | "start_time": "2018-12-28T11:47:52.882368Z" 777 | } 778 | }, 779 | "outputs": [ 780 | { 781 | "data": { 782 | "text/html": [ 783 | "
\n", 784 | "\n", 797 | "\n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | "
DateOpenHighLowCloseAdj CloseVolume
152018-12-176886.4599616931.8100596710.0097666753.7299806753.7299802665240000
162018-12-186809.8198246847.2700206733.7099616783.9101566783.9101562595400000
172018-12-196777.5898446868.8598636586.5000006636.8300786636.8300782899950000
182018-12-206607.7597666666.2001956447.9101566528.4101566528.4101563258090000
192018-12-216573.4902346586.6801766304.6298836333.0000006333.0000004534120000
\n", 863 | "
" 864 | ], 865 | "text/plain": [ 866 | " Date Open High Low Close \\\n", 867 | "15 2018-12-17 6886.459961 6931.810059 6710.009766 6753.729980 \n", 868 | "16 2018-12-18 6809.819824 6847.270020 6733.709961 6783.910156 \n", 869 | "17 2018-12-19 6777.589844 6868.859863 6586.500000 6636.830078 \n", 870 | "18 2018-12-20 6607.759766 6666.200195 6447.910156 6528.410156 \n", 871 | "19 2018-12-21 6573.490234 6586.680176 6304.629883 6333.000000 \n", 872 | "\n", 873 | " Adj Close Volume \n", 874 | "15 6753.729980 2665240000 \n", 875 | "16 6783.910156 2595400000 \n", 876 | "17 6636.830078 2899950000 \n", 877 | "18 6528.410156 3258090000 \n", 878 | "19 6333.000000 4534120000 " 879 | ] 880 | }, 881 | "execution_count": 11, 882 | "metadata": {}, 883 | "output_type": "execute_result" 884 | } 885 | ], 886 | "source": [ 887 | "df.tail()" 888 | ] 889 | }, 890 | { 891 | "cell_type": "code", 892 | "execution_count": 12, 893 | "metadata": { 894 | "ExecuteTime": { 895 | "end_time": "2018-12-28T11:47:52.936981Z", 896 | "start_time": "2018-12-28T11:47:52.915987Z" 897 | } 898 | }, 899 | "outputs": [ 900 | { 901 | "data": { 902 | "text/plain": [ 903 | "(20, 7)" 904 | ] 905 | }, 906 | "execution_count": 12, 907 | "metadata": {}, 908 | "output_type": "execute_result" 909 | } 910 | ], 911 | "source": [ 912 | "df.shape" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": 13, 918 | "metadata": { 919 | "ExecuteTime": { 920 | "end_time": "2018-12-28T11:47:52.949585Z", 921 | "start_time": "2018-12-28T11:47:52.941610Z" 922 | } 923 | }, 924 | "outputs": [ 925 | { 926 | "data": { 927 | "text/plain": [ 928 | "140" 929 | ] 930 | }, 931 | "execution_count": 13, 932 | "metadata": {}, 933 | "output_type": "execute_result" 934 | } 935 | ], 936 | "source": [ 937 | "df.size" 938 | ] 939 | }, 940 | { 941 | "cell_type": "code", 942 | "execution_count": 14, 943 | "metadata": { 944 | "ExecuteTime": { 945 | "end_time": "2018-12-28T11:47:52.960181Z", 946 | "start_time": 
"2018-12-28T11:47:52.952861Z" 947 | } 948 | }, 949 | "outputs": [ 950 | { 951 | "data": { 952 | "text/plain": [ 953 | "Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')" 954 | ] 955 | }, 956 | "execution_count": 14, 957 | "metadata": {}, 958 | "output_type": "execute_result" 959 | } 960 | ], 961 | "source": [ 962 | "df.columns" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": 15, 968 | "metadata": { 969 | "ExecuteTime": { 970 | "end_time": "2018-12-28T11:47:52.971291Z", 971 | "start_time": "2018-12-28T11:47:52.964021Z" 972 | } 973 | }, 974 | "outputs": [ 975 | { 976 | "data": { 977 | "text/plain": [ 978 | "RangeIndex(start=0, stop=20, step=1)" 979 | ] 980 | }, 981 | "execution_count": 15, 982 | "metadata": {}, 983 | "output_type": "execute_result" 984 | } 985 | ], 986 | "source": [ 987 | "df.index" 988 | ] 989 | }, 990 | { 991 | "cell_type": "code", 992 | "execution_count": 16, 993 | "metadata": { 994 | "ExecuteTime": { 995 | "end_time": "2018-12-28T11:47:52.984913Z", 996 | "start_time": "2018-12-28T11:47:52.973669Z" 997 | } 998 | }, 999 | "outputs": [ 1000 | { 1001 | "data": { 1002 | "text/plain": [ 1003 | "[RangeIndex(start=0, stop=20, step=1),\n", 1004 | " Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')]" 1005 | ] 1006 | }, 1007 | "execution_count": 16, 1008 | "metadata": {}, 1009 | "output_type": "execute_result" 1010 | } 1011 | ], 1012 | "source": [ 1013 | "df.axes" 1014 | ] 1015 | }, 1016 | { 1017 | "cell_type": "code", 1018 | "execution_count": 35, 1019 | "metadata": { 1020 | "ExecuteTime": { 1021 | "end_time": "2018-12-28T12:34:54.088767Z", 1022 | "start_time": "2018-12-28T12:34:54.045968Z" 1023 | } 1024 | }, 1025 | "outputs": [ 1026 | { 1027 | "name": "stdout", 1028 | "output_type": "stream", 1029 | "text": [ 1030 | "\n", 1031 | "RangeIndex: 20 entries, 0 to 19\n", 1032 | "Data columns (total 7 columns):\n", 1033 | "Date 20 non-null object\n", 1034 | 
"Open 20 non-null float64\n", 1035 | "High 20 non-null float64\n", 1036 | "Low 20 non-null float64\n", 1037 | "Close 20 non-null float64\n", 1038 | "Adj Close 20 non-null float64\n", 1039 | "Volume 20 non-null int64\n", 1040 | "dtypes: float64(5), int64(1), object(1)\n", 1041 | "memory usage: 1.2+ KB\n" 1042 | ] 1043 | } 1044 | ], 1045 | "source": [ 1046 | "df.info()" 1047 | ] 1048 | }, 1049 | { 1050 | "cell_type": "markdown", 1051 | "metadata": {}, 1052 | "source": [ 1053 | "---\n", 1054 | "\n", 1055 | "## 1.4 Dive into index\n", 1056 | "\n", 1057 | "### 1.4.1 Index labels\n", 1058 | "\n", 1059 | "Index labels:\n", 1060 | " \n", 1061 | "1. do not need to be integers;\n", 1062 | "\n", 1063 | "2. can have repeated labels (__Be careful, this is different from dict__);\n", 1064 | "\n", 1065 | "3. can have hierarchical sets of labels.\n", 1066 | "\n", 1067 | "### 1.4.2 Examples of index labels" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "code", 1072 | "execution_count": 17, 1073 | "metadata": { 1074 | "ExecuteTime": { 1075 | "end_time": "2018-12-28T11:47:53.010408Z", 1076 | "start_time": "2018-12-28T11:47:52.990897Z" 1077 | } 1078 | }, 1079 | "outputs": [ 1080 | { 1081 | "name": "stdout", 1082 | "output_type": "stream", 1083 | "text": [ 1084 | "0 -0.281210\n", 1085 | "1 0.770726\n", 1086 | "2 -0.176266\n", 1087 | "3 -1.612378\n", 1088 | "4 -1.868139\n", 1089 | "5 -0.496955\n", 1090 | "dtype: float64\n", 1091 | "\n", 1092 | "a -0.281210\n", 1093 | "a 0.770726\n", 1094 | "a -0.176266\n", 1095 | "a -1.612378\n", 1096 | "a -1.868139\n", 1097 | "a -0.496955\n", 1098 | "dtype: float64\n", 1099 | "\n", 1100 | "MultiIndex(levels=[['a', 'b'], [-1.1125657480588875, -0.17357278036727714, -0.14227622735236414, 0.04140226853409916, 0.923176857116319, 1.2940772084264573]],\n", 1101 | " labels=[[0, 0, 0, 1, 1, 1], [4, 1, 3, 0, 5, 2]],\n", 1102 | " names=['letter', 'float'])\n", 1103 | "\n", 1104 | "letter float \n", 1105 | "a 0.923177 -0.281210\n", 1106 | " -0.173573 
0.770726\n", 1107 | " 0.041402 -0.176266\n", 1108 | "b -1.112566 -1.612378\n", 1109 | " 1.294077 -1.868139\n", 1110 | " -0.142276 -0.496955\n", 1111 | "dtype: float64\n" 1112 | ] 1113 | } 1114 | ], 1115 | "source": [ 1116 | "import numpy as np\n", 1117 | "import pandas as pd\n", 1118 | "\n", 1119 | "# The default index is int\n", 1120 | "aray = np.random.randn(6)\n", 1121 | "srs = pd.Series(aray)\n", 1122 | "print(srs)\n", 1123 | "print()\n", 1124 | "\n", 1125 | "# We can set repeated non-int labels to index\n", 1126 | "ind = ['a'] * 6\n", 1127 | "srs.index = ind\n", 1128 | "print(srs)\n", 1129 | "print()\n", 1130 | "\n", 1131 | "# We can set multi-level labels to index\n", 1132 | "ind = zip(['a'] * 3 + ['b'] * 3, np.random.randn(6))\n", 1133 | "ind = pd.MultiIndex.from_tuples(ind, names=['letter', 'float'])\n", 1134 | "srs.index = ind\n", 1135 | "print(ind)\n", 1136 | "print()\n", 1137 | "print(srs)" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "code", 1142 | "execution_count": 37, 1143 | "metadata": { 1144 | "ExecuteTime": { 1145 | "end_time": "2018-12-28T12:40:36.060328Z", 1146 | "start_time": "2018-12-28T12:40:36.027835Z" 1147 | } 1148 | }, 1149 | "outputs": [ 1150 | { 1151 | "data": { 1152 | "text/plain": [ 1153 | "a -0.281210\n", 1154 | "a 0.770726\n", 1155 | "a -0.176266\n", 1156 | "a -1.612378\n", 1157 | "a -1.868139\n", 1158 | "a -0.496955\n", 1159 | "dtype: float64" 1160 | ] 1161 | }, 1162 | "execution_count": 37, 1163 | "metadata": {}, 1164 | "output_type": "execute_result" 1165 | } 1166 | ], 1167 | "source": [ 1168 | "sr1 = pd.Series(aray, index=['a'] * 6)\n", 1169 | "sr1" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "markdown", 1174 | "metadata": {}, 1175 | "source": [ 1176 | "---\n", 1177 | "\n", 1178 | "### 1.4.3 Three major usages\n", 1179 | "\n", 1180 | "a. Identification: Indices are used to locate Series / rows / items in a DataFrame. \n", 1181 | "\n", 1182 | "b. 
Alignment: pandas will always align with index automatically first.\n", 1183 | "\n", 1184 | "c. Selection: using index to select relevant columns/rows.\n", 1185 | "\n", 1186 | "### 1.4.4 Examples of identification" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "code", 1191 | "execution_count": 38, 1192 | "metadata": { 1193 | "ExecuteTime": { 1194 | "end_time": "2018-12-28T12:44:22.736405Z", 1195 | "start_time": "2018-12-28T12:44:22.689162Z" 1196 | } 1197 | }, 1198 | "outputs": [ 1199 | { 1200 | "data": { 1201 | "text/html": [ 1202 | "
\n", 1203 | "\n", 1216 | "\n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | "
DateOpenHighLowCloseAdj CloseVolume
142018-12-146986.3701177027.1699226898.9902346910.6601566910.6601562200510000
\n", 1242 | "
" 1243 | ], 1244 | "text/plain": [ 1245 | " Date Open High Low Close \\\n", 1246 | "14 2018-12-14 6986.370117 7027.169922 6898.990234 6910.660156 \n", 1247 | "\n", 1248 | " Adj Close Volume \n", 1249 | "14 6910.660156 2200510000 " 1250 | ] 1251 | }, 1252 | "execution_count": 38, 1253 | "metadata": {}, 1254 | "output_type": "execute_result" 1255 | } 1256 | ], 1257 | "source": [ 1258 | "index_df = df.copy()\n", 1259 | "\n", 1260 | "index_df[index_df['Date'] == '2018-12-14']" 1261 | ] 1262 | }, 1263 | { 1264 | "cell_type": "markdown", 1265 | "metadata": {}, 1266 | "source": [ 1267 | "---\n", 1268 | "\n", 1269 | "### 1.4.5 Examples of alignment" 1270 | ] 1271 | }, 1272 | { 1273 | "cell_type": "code", 1274 | "execution_count": 32, 1275 | "metadata": { 1276 | "ExecuteTime": { 1277 | "end_time": "2018-12-28T11:48:52.767186Z", 1278 | "start_time": "2018-12-28T11:48:52.741402Z" 1279 | } 1280 | }, 1281 | "outputs": [ 1282 | { 1283 | "name": "stdout", 1284 | "output_type": "stream", 1285 | "text": [ 1286 | " Date Open High Low Close \\\n", 1287 | "0 2018-11-23 6919.520020 6987.890137 6919.160156 6938.979980 \n", 1288 | "1 2018-11-26 7026.500000 7083.930176 7003.120117 7081.850098 \n", 1289 | "2 2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 \n", 1290 | "3 2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 \n", 1291 | "4 2018-11-29 7267.370117 7319.959961 7217.689941 7273.080078 \n", 1292 | "\n", 1293 | " Adj Close Volume Max_diff \n", 1294 | "0 6938.979980 958950000 68.729981 \n", 1295 | "1 7081.850098 2011180000 80.810059 \n", 1296 | "2 7082.700195 2067360000 90.780274 \n", 1297 | "3 7291.589844 2390260000 201.729981 \n", 1298 | "4 7273.080078 1983460000 102.270020 \n" 1299 | ] 1300 | } 1301 | ], 1302 | "source": [ 1303 | "index_df['Max_diff'] = index_df['High'] - index_df['Low']\n", 1304 | "index_df.head()" 1305 | ] 1306 | }, 1307 | { 1308 | "cell_type": "markdown", 1309 | "metadata": {}, 1310 | "source": [ 1311 | "---\n", 1312 | "\n", 1313 | "### 1.4.6 
Examples of selection" 1314 | ] 1315 | }, 1316 | { 1317 | "cell_type": "code", 1318 | "execution_count": 33, 1319 | "metadata": { 1320 | "ExecuteTime": { 1321 | "end_time": "2018-12-28T11:48:54.255104Z", 1322 | "start_time": "2018-12-28T11:48:54.237334Z" 1323 | } 1324 | }, 1325 | "outputs": [ 1326 | { 1327 | "name": "stdout", 1328 | "output_type": "stream", 1329 | "text": [ 1330 | "Date 2018-12-14\n", 1331 | "Close 6910.66\n", 1332 | "Name: 14, dtype: object\n" 1333 | ] 1334 | } 1335 | ], 1336 | "source": [ 1337 | "index_df.loc[14, ['Date', 'Close']]" 1338 | ] 1339 | }, 1340 | { 1341 | "cell_type": "code", 1342 | "execution_count": 34, 1343 | "metadata": { 1344 | "ExecuteTime": { 1345 | "end_time": "2018-12-28T11:48:55.189637Z", 1346 | "start_time": "2018-12-28T11:48:55.168154Z" 1347 | } 1348 | }, 1349 | "outputs": [ 1350 | { 1351 | "name": "stdout", 1352 | "output_type": "stream", 1353 | "text": [ 1354 | "Index(['2018-11-23', '2018-11-26', '2018-11-27', '2018-11-28', '2018-11-29',\n", 1355 | " '2018-11-30', '2018-12-03', '2018-12-04', '2018-12-06', '2018-12-07',\n", 1356 | " '2018-12-10', '2018-12-11', '2018-12-12', '2018-12-13', '2018-12-14',\n", 1357 | " '2018-12-17', '2018-12-18', '2018-12-19', '2018-12-20', '2018-12-21'],\n", 1358 | " dtype='object', name='Date')\n", 1359 | "\n", 1360 | "Date NaN\n", 1361 | "Close 6910.660156\n", 1362 | "Name: 2018-12-14, dtype: float64\n" 1363 | ] 1364 | } 1365 | ], 1366 | "source": [ 1367 | "index_df['Date'] = index_df['Date'].astype('str')\n", 1368 | "index_df.set_index('Date', inplace=True)\n", 1369 | "print(index_df.index)\n", 1370 | "print()\n", 1371 | "print(index_df.loc['2018-12-14', ['Date', 'Close']])" 1372 | ] 1373 | }, 1374 | { 1375 | "cell_type": "code", 1376 | "execution_count": 22, 1377 | "metadata": { 1378 | "ExecuteTime": { 1379 | "end_time": "2018-12-28T11:47:53.123177Z", 1380 | "start_time": "2018-12-28T11:47:53.110037Z" 1381 | } 1382 | }, 1383 | "outputs": [ 1384 | { 1385 | "name": "stdout", 1386 | 
"output_type": "stream", 1387 | "text": [ 1388 | " Open High Low Close Adj Close \\\n", 1389 | "2018-11-23 6919.520020 6987.890137 6919.160156 6938.979980 6938.979980 \n", 1390 | "2018-11-26 7026.500000 7083.930176 7003.120117 7081.850098 7081.850098 \n", 1391 | "2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 7082.700195 \n", 1392 | "2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 7291.589844 \n", 1393 | "2018-11-29 7267.370117 7319.959961 7217.689941 7273.080078 7273.080078 \n", 1394 | "\n", 1395 | " Volume Max_diff \n", 1396 | "2018-11-23 958950000 68.729981 \n", 1397 | "2018-11-26 2011180000 80.810059 \n", 1398 | "2018-11-27 2067360000 90.780274 \n", 1399 | "2018-11-28 2390260000 201.729981 \n", 1400 | "2018-11-29 1983460000 102.270020 \n" 1401 | ] 1402 | } 1403 | ], 1404 | "source": [ 1405 | "index_df.index.name = None\n", 1406 | "print(index_df.head())" 1407 | ] 1408 | }, 1409 | { 1410 | "cell_type": "code", 1411 | "execution_count": 23, 1412 | "metadata": { 1413 | "ExecuteTime": { 1414 | "end_time": "2018-12-28T11:47:53.139863Z", 1415 | "start_time": "2018-12-28T11:47:53.126153Z" 1416 | }, 1417 | "scrolled": true 1418 | }, 1419 | "outputs": [ 1420 | { 1421 | "name": "stdout", 1422 | "output_type": "stream", 1423 | "text": [ 1424 | " Date Open High Low Close \\\n", 1425 | "0 2018-11-23 6919.520020 6987.890137 6919.160156 6938.979980 \n", 1426 | "1 2018-11-26 7026.500000 7083.930176 7003.120117 7081.850098 \n", 1427 | "2 2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 \n", 1428 | "3 2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 \n", 1429 | "4 2018-11-29 7267.370117 7319.959961 7217.689941 7273.080078 \n", 1430 | "\n", 1431 | " Adj Close Volume Max_diff \n", 1432 | "0 6938.979980 958950000 68.729981 \n", 1433 | "1 7081.850098 2011180000 80.810059 \n", 1434 | "2 7082.700195 2067360000 90.780274 \n", 1435 | "3 7291.589844 2390260000 201.729981 \n", 1436 | "4 7273.080078 1983460000 102.270020 \n" 1437 | ] 1438 | } 1439 
| ], 1440 | "source": [ 1441 | "index_df.index.name = \"Date\"\n", 1442 | "index_df.reset_index(inplace=True)\n", 1443 | "print(index_df.head())" 1444 | ] 1445 | }, 1446 | { 1447 | "cell_type": "markdown", 1448 | "metadata": {}, 1449 | "source": [ 1450 | "---\n", 1451 | "\n", 1452 | "### 1.4.7 Five ways of index selection\n", 1453 | "\n", 1454 | "1. `[]` operator: using index / column names to access data.\n", 1455 | "2. `df.loc`: Access a group of rows and columns by label(s)\n", 1456 | "3. `df.iloc`: Access a group of rows and columns by integer position(s)\n", 1457 | "4. `df.at`: Access a single value for a row/column label pair.\n", 1458 | "5. `df.iat`: Access a single value for a row/column pair by integer position." 1459 | ] 1460 | }, 1461 | { 1462 | "cell_type": "code", 1463 | "execution_count": 24, 1464 | "metadata": { 1465 | "ExecuteTime": { 1466 | "end_time": "2018-12-28T11:47:53.169526Z", 1467 | "start_time": "2018-12-28T11:47:53.144080Z" 1468 | } 1469 | }, 1470 | "outputs": [ 1471 | { 1472 | "name": "stdout", 1473 | "output_type": "stream", 1474 | "text": [ 1475 | " Open High Low Close Adj Close \\\n", 1476 | "Date \n", 1477 | "2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 7082.700195 \n", 1478 | "2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 7291.589844 \n", 1479 | "\n", 1480 | " Volume Max_diff \n", 1481 | "Date \n", 1482 | "2018-11-27 2067360000 90.780274 \n", 1483 | "2018-11-28 2390260000 201.729981 \n", 1484 | "\n", 1485 | " Open High Low Close Adj Close \\\n", 1486 | "Date \n", 1487 | "2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 7082.700195 \n", 1488 | "2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 7291.589844 \n", 1489 | "\n", 1490 | " Volume Max_diff \n", 1491 | "Date \n", 1492 | "2018-11-27 2067360000 90.780274 \n", 1493 | "2018-11-28 2390260000 201.729981 \n", 1494 | "\n", 1495 | " Open High Low Close Adj Close \\\n", 1496 | "Date \n", 1497 | "2018-11-27 7041.22998 7105.140137 7014.359863 
7082.700195 7082.700195 \n", 1498 | "\n", 1499 | " Volume Max_diff \n", 1500 | "Date \n", 1501 | "2018-11-27 2067360000 90.780274 \n", 1502 | "\n", 1503 | "7041.22998\n", 1504 | "\n", 1505 | "7041.22998\n" 1506 | ] 1507 | } 1508 | ], 1509 | "source": [ 1510 | "index_df.set_index('Date', inplace=True)\n", 1511 | "\n", 1512 | "print(index_df['2018-11-27':'2018-11-28'])\n", 1513 | "print()\n", 1514 | "print(index_df.loc['2018-11-27':'2018-11-28'])\n", 1515 | "print()\n", 1516 | "print(index_df.iloc[2:3])\n", 1517 | "print()\n", 1518 | "print(index_df.at['2018-11-27','Open'])\n", 1519 | "print()\n", 1520 | "print(index_df.iat[2, 0])" 1521 | ] 1522 | }, 1523 | { 1524 | "cell_type": "code", 1525 | "execution_count": 25, 1526 | "metadata": { 1527 | "ExecuteTime": { 1528 | "end_time": "2018-12-28T11:48:03.115744Z", 1529 | "start_time": "2018-12-28T11:47:53.175722Z" 1530 | } 1531 | }, 1532 | "outputs": [ 1533 | { 1534 | "name": "stdout", 1535 | "output_type": "stream", 1536 | "text": [ 1537 | "124 µs ± 42.8 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" 1538 | ] 1539 | } 1540 | ], 1541 | "source": [ 1542 | "%timeit index_df['2018-11-27':'2018-11-28']" 1543 | ] 1544 | }, 1545 | { 1546 | "cell_type": "code", 1547 | "execution_count": 26, 1548 | "metadata": { 1549 | "ExecuteTime": { 1550 | "end_time": "2018-12-28T11:48:12.157624Z", 1551 | "start_time": "2018-12-28T11:48:03.122063Z" 1552 | } 1553 | }, 1554 | "outputs": [ 1555 | { 1556 | "name": "stdout", 1557 | "output_type": "stream", 1558 | "text": [ 1559 | "112 µs ± 6.67 µs per loop (mean ± std. dev. 
of 7 runs, 10000 loops each)\n" 1560 | ] 1561 | } 1562 | ], 1563 | "source": [ 1564 | "%timeit index_df.loc['2018-11-27':'2018-11-28']" 1565 | ] 1566 | }, 1567 | { 1568 | "cell_type": "code", 1569 | "execution_count": 27, 1570 | "metadata": { 1571 | "ExecuteTime": { 1572 | "end_time": "2018-12-28T11:48:25.491913Z", 1573 | "start_time": "2018-12-28T11:48:12.161462Z" 1574 | } 1575 | }, 1576 | "outputs": [ 1577 | { 1578 | "name": "stdout", 1579 | "output_type": "stream", 1580 | "text": [ 1581 | "175 µs ± 90.1 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" 1582 | ] 1583 | } 1584 | ], 1585 | "source": [ 1586 | "%timeit index_df.iloc[2:3]" 1587 | ] 1588 | }, 1589 | { 1590 | "cell_type": "code", 1591 | "execution_count": 28, 1592 | "metadata": { 1593 | "ExecuteTime": { 1594 | "end_time": "2018-12-28T11:48:32.855730Z", 1595 | "start_time": "2018-12-28T11:48:25.499466Z" 1596 | } 1597 | }, 1598 | "outputs": [ 1599 | { 1600 | "name": "stdout", 1601 | "output_type": "stream", 1602 | "text": [ 1603 | "8.53 µs ± 3.88 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" 1604 | ] 1605 | } 1606 | ], 1607 | "source": [ 1608 | "%timeit index_df.at['2018-11-27','Open']" 1609 | ] 1610 | }, 1611 | { 1612 | "cell_type": "code", 1613 | "execution_count": 29, 1614 | "metadata": { 1615 | "ExecuteTime": { 1616 | "end_time": "2018-12-28T11:48:38.653643Z", 1617 | "start_time": "2018-12-28T11:48:32.858760Z" 1618 | } 1619 | }, 1620 | "outputs": [ 1621 | { 1622 | "name": "stdout", 1623 | "output_type": "stream", 1624 | "text": [ 1625 | "7.2 µs ± 1.86 µs per loop (mean ± std. dev. 
of 7 runs, 100000 loops each)\n" 1626 | ] 1627 | } 1628 | ], 1629 | "source": [ 1630 | "%timeit index_df.iat[2, 0]" 1631 | ] 1632 | }, 1633 | { 1634 | "cell_type": "markdown", 1635 | "metadata": {}, 1636 | "source": [ 1637 | "__We recommend using `pd.DataFrame.iloc() / pd.DataFrame.loc()` in this case for the best performance and readability.__\n", 1638 | "\n", 1639 | "---\n", 1640 | "\n", 1641 | "## 1.5 Exercises\n", 1642 | "\n", 1643 | "### 1.5.1 Reviewing\n", 1644 | "\n", 1645 | "Please review the code above.\n", 1646 | "\n", 1647 | "### 1.5.2 Refactoring\n", 1648 | "\n", 1649 | "If you have written pandas scripts before, try to refactor them into different levels of functions.\n", 1650 | "\n", 1651 | "### 1.5.3 Checking parameters\n", 1652 | "\n", 1653 | "Check the default and optional parameters of the following methods:\n", 1654 | "\n", 1655 | "1. `DataFrame.describe()`: provide descriptive stats of the dataset\n", 1656 | "2. `DataFrame.values`: access values of the dataset\n", 1657 | "3. `DataFrame.head()`: access the head of the dataset\n", 1658 | "4. `DataFrame.tail()`: access the tail of the dataset\n", 1659 | "5. `DataFrame.shape`: provide the length and width of the dataset\n", 1660 | "6. `DataFrame.size`: provide the product of the length and width of the dataset\n", 1661 | "7. `DataFrame.columns`: provide the column names of the dataset\n", 1662 | "8. `DataFrame.index`: provide the row index of the dataset\n", 1663 | "9. `DataFrame.axes`: provide the column names and row index of the dataset" 1664 | ] 1665 | }, 1666 | { 1667 | "cell_type": "markdown", 1668 | "metadata": {}, 1669 | "source": [ 1670 | "---\n", 1671 | "\n", 1672 | "For the rest of the sessions (outlines and video records), please scan the QR code below to pay.\n", 1673 | "\n", 1674 | "1. The price is 799 RMB.\n", 1675 | "2. 
Please leave your email address in the __payment comment__, so I will send you the links of the rest sessions.\n", 1676 | "\n", 1677 | "\n", 1678 | "" 1679 | ] 1680 | }, 1681 | { 1682 | "cell_type": "markdown", 1683 | "metadata": {}, 1684 | "source": [ 1685 | "---" 1686 | ] 1687 | } 1688 | ], 1689 | "metadata": { 1690 | "kernelspec": { 1691 | "display_name": "Python 3", 1692 | "language": "python", 1693 | "name": "python3" 1694 | }, 1695 | "language_info": { 1696 | "codemirror_mode": { 1697 | "name": "ipython", 1698 | "version": 3 1699 | }, 1700 | "file_extension": ".py", 1701 | "mimetype": "text/x-python", 1702 | "name": "python", 1703 | "nbconvert_exporter": "python", 1704 | "pygments_lexer": "ipython3", 1705 | "version": "3.7.2" 1706 | }, 1707 | "toc": { 1708 | "base_numbering": 1, 1709 | "nav_menu": {}, 1710 | "number_sections": false, 1711 | "sideBar": true, 1712 | "skip_h1_title": false, 1713 | "title_cell": "Table of Contents", 1714 | "title_sidebar": "Contents", 1715 | "toc_cell": false, 1716 | "toc_position": { 1717 | "height": "calc(100% - 180px)", 1718 | "left": "10px", 1719 | "top": "150px", 1720 | "width": "335.8541564941406px" 1721 | }, 1722 | "toc_section_display": true, 1723 | "toc_window_display": false 1724 | } 1725 | }, 1726 | "nbformat": 4, 1727 | "nbformat_minor": 2 1728 | } 1729 | --------------------------------------------------------------------------------