├── pandas 09 - Pandas and Python Data Structure.ipynb ├── pandas 10 - Best Practices in Data Analysis.ipynb ├── Pandas 00 - Intro.ipynb ├── pandas 08 - Speed up with pandas.ipynb ├── pandas 04 - Data IO.ipynb ├── Pandas 02 - Series.ipynb └── Pandas 01 - Basics of Pandas.ipynb /pandas 09 - Pandas and Python Data Structure.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# pandas 09 - Pandas and Python Data Structure\n", 8 | "\n", 9 | "by Nova@Douban\n", 10 | "\n", 11 | "The video record of this session is here: https://zoom.us/recording/share/-AyhhqiRKrw42R8xjEWHfXKDs-w2-IGS_NLh01a9q5SwIumekTziMw\n", 12 | "\n", 13 | "---\n", 14 | "\n", 15 | "## 9.1 Series and dict" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": { 22 | "ExecuteTime": { 23 | "end_time": "2019-01-25T12:05:46.484050Z", 24 | "start_time": "2019-01-25T12:05:45.138406Z" 25 | } 26 | }, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "a 1\n", 32 | "b 2\n", 33 | "c 9\n", 34 | "dtype: int64" 35 | ] 36 | }, 37 | "metadata": {}, 38 | "output_type": "display_data" 39 | }, 40 | { 41 | "data": { 42 | "text/plain": [ 43 | "{'a': 1, 'b': 2, 'c': 9}" 44 | ] 45 | }, 46 | "metadata": {}, 47 | "output_type": "display_data" 48 | } 49 | ], 50 | "source": [ 51 | "# Conversion between a pandas Series and a dict\n", 52 | "import pandas as pd\n", 53 | "import numpy as np\n", 54 | "\n", 55 | "dd = {'a': 1, 'b': 2, 'c': 9}\n", 56 | "series_dd = pd.Series(dd)\n", 57 | "display(series_dd, series_dd.to_dict())" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## 9.2 Series and array" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 2, 70 | "metadata": { 71 | "ExecuteTime": { 72 | "end_time": "2019-01-22T09:21:37.535074Z", 73 | "start_time": 
"2019-01-22T09:21:37.287231Z" 74 | } 75 | }, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "0 -2.845180\n", 81 | "1 -2.469059\n", 82 | "2 -0.156874\n", 83 | "3 -0.290911\n", 84 | "4 0.876318\n", 85 | "5 -0.104034\n", 86 | "dtype: float64" 87 | ] 88 | }, 89 | "metadata": {}, 90 | "output_type": "display_data" 91 | }, 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "a -2.845180\n", 96 | "a -2.469059\n", 97 | "a -0.156874\n", 98 | "a -0.290911\n", 99 | "a 0.876318\n", 100 | "a -0.104034\n", 101 | "dtype: float64" 102 | ] 103 | }, 104 | "metadata": {}, 105 | "output_type": "display_data" 106 | }, 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "\n", 111 | "array([-2.84518 , -2.469059, -0.156874, -0.290911, 0.876318, -0.104034])\n", 112 | "Coordinates:\n", 113 | " * index (index) object 'a' 'a' 'a' 'a' 'a' 'a'" 114 | ] 115 | }, 116 | "metadata": {}, 117 | "output_type": "display_data" 118 | } 119 | ], 120 | "source": [ 121 | "# The default index is int\n", 122 | "aray = np.random.randn(6)\n", 123 | "srs = pd.Series(aray)\n", 124 | "display(srs)\n", 125 | "\n", 126 | "# We can set repeated non-int labels to index\n", 127 | "ind = ['a'] * 6\n", 128 | "srs.index = ind\n", 129 | "display(srs, srs.to_xarray())" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## 9.3 Series and list\n", 137 | "\n", 138 | "It is `pd.Series.tolist()`, not `pd.Series.to_list()`." 
139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 3, 144 | "metadata": { 145 | "ExecuteTime": { 146 | "end_time": "2019-01-22T09:21:37.577756Z", 147 | "start_time": "2019-01-22T09:21:37.549862Z" 148 | } 149 | }, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "0 0.725138\n", 155 | "1 0.543878\n", 156 | "2 0.226283\n", 157 | "3 1.267045\n", 158 | "4 -0.495132\n", 159 | "5 -0.192349\n", 160 | "dtype: float64" 161 | ] 162 | }, 163 | "metadata": {}, 164 | "output_type": "display_data" 165 | }, 166 | { 167 | "data": { 168 | "text/plain": [ 169 | "[0.725138, 0.543878, 0.226283, 1.267045, -0.495132, -0.192349]" 170 | ] 171 | }, 172 | "metadata": {}, 173 | "output_type": "display_data" 174 | } 175 | ], 176 | "source": [ 177 | "lst = [0.725138, 0.543878, 0.226283, 1.267045, -0.495132, -0.192349]\n", 178 | "srs = pd.Series(lst)\n", 179 | "lst = srs.tolist()\n", 180 | "\n", 181 | "display(srs, lst)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## 9.4 DataFrame and dict" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 3, 194 | "metadata": { 195 | "ExecuteTime": { 196 | "end_time": "2019-01-25T12:05:51.701915Z", 197 | "start_time": "2019-01-25T12:05:51.628849Z" 198 | } 199 | }, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/html": [ 204 | "
\n", 205 | "\n", 218 | "\n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | "
abc
01829
1896
2023
\n", 248 | "
" 249 | ], 250 | "text/plain": [ 251 | " a b c\n", 252 | "0 1 82 9\n", 253 | "1 8 9 6\n", 254 | "2 0 2 3" 255 | ] 256 | }, 257 | "metadata": {}, 258 | "output_type": "display_data" 259 | }, 260 | { 261 | "data": { 262 | "text/plain": [ 263 | "{'a': {0: 1, 1: 8, 2: 0}, 'b': {0: 82, 1: 9, 2: 2}, 'c': {0: 9, 1: 6, 2: 3}}" 264 | ] 265 | }, 266 | "metadata": {}, 267 | "output_type": "display_data" 268 | } 269 | ], 270 | "source": [ 271 | "dd = {'a': [1, 8, 0], 'b': [82, 9, 2], 'c': [9, 6, 3]}\n", 272 | "df_dd = pd.DataFrame.from_dict(dd)\n", 273 | "display(df_dd, df_dd.to_dict())" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [] 282 | } 283 | ], 284 | "metadata": { 285 | "kernelspec": { 286 | "display_name": "Python 3", 287 | "language": "python", 288 | "name": "python3" 289 | }, 290 | "language_info": { 291 | "codemirror_mode": { 292 | "name": "ipython", 293 | "version": 3 294 | }, 295 | "file_extension": ".py", 296 | "mimetype": "text/x-python", 297 | "name": "python", 298 | "nbconvert_exporter": "python", 299 | "pygments_lexer": "ipython3", 300 | "version": "3.7.2" 301 | }, 302 | "toc": { 303 | "base_numbering": 1, 304 | "nav_menu": {}, 305 | "number_sections": false, 306 | "sideBar": true, 307 | "skip_h1_title": false, 308 | "title_cell": "Table of Contents", 309 | "title_sidebar": "Contents", 310 | "toc_cell": false, 311 | "toc_position": {}, 312 | "toc_section_display": true, 313 | "toc_window_display": false 314 | } 315 | }, 316 | "nbformat": 4, 317 | "nbformat_minor": 2 318 | } 319 | -------------------------------------------------------------------------------- /pandas 10 - Best Practices in Data Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# pandas 10 - Best Practices in Data Analysis\n", 8 | "\n", 9 | "by 
Nova@Douban\n", 10 | "\n", 11 | "The video record of this session is here: https://zoom.us/recording/share/-AyhhqiRKrw42R8xjEWHfXKDs-w2-IGS_NLh01a9q5SwIumekTziMw\n", 12 | "\n", 13 | "\n", 14 | "---\n", 15 | "\n", 16 | "This sharing focuses on some best practices I gained during the past years working as an NLP engineer and data scientist.\n", 17 | "\n", 18 | "## 10.1 Version control\n", 19 | "\n", 20 | "Version control is an important practice in software development, and we shall follow this best practice to reduce errors, such as removing code by mistake.\n", 21 | "\n", 22 | "### 10.1.1 Git\n", 23 | "\n", 24 | "Git is one of the most popular version control tools, and we can find main version control platforms supporting this protocol. We can choose from GitHub, GitLab or Bitbucket.\n", 25 | "\n", 26 | "Git has many workflows to follow; we can start with gitflow, which is simple to follow. \n", 27 | "\n", 28 | "\n", 29 | "\n", 30 | "### 10.1.2 Git LFS\n", 31 | "\n", 32 | "Git is suitable to store code, but not for big files. Therefore, another version control protocol for big files emerged -- Git LFS. We can use it to store our datasets.\n", 33 | "\n", 34 | "### 10.1.3 Version control your code, not your data\n", 35 | "\n", 36 | "Some of us may be used to storing temporary / intermediate datasets on hard disk. However, we do not recommend this way; instead, we recommend version controlling your code, and committing as frequently as possible.\n", 37 | "\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "---\n", 45 | "\n", 46 | "## 10.2 Folder structure\n", 47 | "\n", 48 | "1. If our code folder is managed by git, it can be an individual folder only for code;\n", 49 | "2. At the same level, we can have another two folders for raw data and results. Therefore, we can use git LFS to manage these two folders.\n", 50 | "3. 
We can use virtualenv to control the package environment.\n", 51 | "\n", 52 | "\n", 53 | "" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "---\n", 61 | "\n", 62 | "## 10.3 Jupyter Notebook and IPython\n", 63 | "\n", 64 | "### 10.3.1 Jupyter Notebook\n", 65 | "\n", 66 | "1. Jupyter Notebook is a convenient tool for interactive data analysis.\n", 67 | "2. We can put code, documents, and visualisations in a single Jupyter notebook.\n", 68 | "3. We can experiment / draft / benchmark in Jupyter notebook.\n", 69 | "4. We can export to markdown / python code / PDF, etc.\n", 70 | "5. We can install extensions to make Jupyter Notebook easier to use. See [this](https://towardsdatascience.com/jupyter-notebook-extensions-517fa69d2231?gi=e865fc4d7033).\n", 71 | "6. All outlines of this course are created with Jupyter Notebook.\n", 72 | "\n", 73 | "### 10.3.2 IPython\n", 74 | "\n", 75 | "1. IPython is the backend of Jupyter Notebook.\n", 76 | "2. Jupyter Notebook needs browser support. For example, if we need to fix something on a server, and we can only access the server via SSH, then IPython is the only choice.\n", 77 | "3. If you are familiar with Jupyter Notebook, then IPython is fairly easy to use." 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 10, 83 | "metadata": { 84 | "ExecuteTime": { 85 | "end_time": "2019-01-25T12:25:27.954978Z", 86 | "start_time": "2019-01-25T12:25:27.739130Z" 87 | } 88 | }, 89 | "outputs": [ 90 | { 91 | "name": "stdout", 92 | "output_type": "stream", 93 | "text": [ 94 | "The autoreload extension is already loaded. To reload it, use:\n", 95 | " %reload_ext autoreload\n" 96 | ] 97 | }, 98 | { 99 | "data": { 100 | "text/html": [ 101 | "
\n", 102 | "\n", 115 | "\n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | "
OpenClose
02498.7700202485.739990
12498.9399412506.850098
22476.9599612510.030029
32491.9199222447.889893
42474.3300782531.939941
\n", 151 | "
" 152 | ], 153 | "text/plain": [ 154 | " Open Close\n", 155 | "0 2498.770020 2485.739990\n", 156 | "1 2498.939941 2506.850098\n", 157 | "2 2476.959961 2510.030029\n", 158 | "3 2491.919922 2447.889893\n", 159 | "4 2474.330078 2531.939941" 160 | ] 161 | }, 162 | "metadata": {}, 163 | "output_type": "display_data" 164 | } 165 | ], 166 | "source": [ 167 | "%load_ext autoreload\n", 168 | "%autoreload 2\n", 169 | "\n", 170 | "from test_script import fast_read\n", 171 | "\n", 172 | "data = fast_read('../data/gspc.csv', ['Open', 'Close'])\n", 173 | "display(data)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "---\n", 181 | "\n", 182 | "## 10.4 Data file format\n", 183 | "\n", 184 | "Please refer to pandas 04 - pandas IO.\n", 185 | "\n", 186 | "1. For data serializaion, we can choose from JSON / Parquet / Arrow / HDF5.\n", 187 | "2. When we use JSON, try using line-based JSON (JSONL) in case for possible stream processing.\n", 188 | "3. NEVER use Python pickle!" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "---\n", 196 | "\n", 197 | "## 10.5 Script structure\n", 198 | "\n", 199 | "1. Collect all frequently-used snippets to a single script, and we can import these snippets to other scripts.\n", 200 | "2. Build core functions with those snippets, and put them into another script.\n", 201 | "3. Build a script of wrappers to call core functions." 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "---\n", 209 | "\n", 210 | "## 10.6 How to write functions?\n", 211 | "\n", 212 | "1. Make your functions readable and pure;\n", 213 | "2. Make the function name simple to remember;\n", 214 | "3. Always add docstring and comments;\n", 215 | "4. Always abstract your functions to be used repeatedly;\n", 216 | "5. Always refactor your code;\n", 217 | "6. If a problem is to difficult to solve, try writing pseudo code first. 
" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "## 10.7 Exercises\n", 225 | "\n", 226 | "1. Read [Cookiecutter Data Science — Organize your Projects — Atom and Jupyter](https://medium.com/@rrfd/cookiecutter-data-science-organize-your-projects-atom-and-jupyter-2be7862f487e)\n", 227 | "2. Read [Gitflow introduction](https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow) by Bitbucket" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [] 234 | } 235 | ], 236 | "metadata": { 237 | "kernelspec": { 238 | "display_name": "Python 3", 239 | "language": "python", 240 | "name": "python3" 241 | }, 242 | "language_info": { 243 | "codemirror_mode": { 244 | "name": "ipython", 245 | "version": 3 246 | }, 247 | "file_extension": ".py", 248 | "mimetype": "text/x-python", 249 | "name": "python", 250 | "nbconvert_exporter": "python", 251 | "pygments_lexer": "ipython3", 252 | "version": "3.7.2" 253 | }, 254 | "toc": { 255 | "base_numbering": 1, 256 | "nav_menu": {}, 257 | "number_sections": false, 258 | "sideBar": true, 259 | "skip_h1_title": false, 260 | "title_cell": "Table of Contents", 261 | "title_sidebar": "Contents", 262 | "toc_cell": false, 263 | "toc_position": {}, 264 | "toc_section_display": true, 265 | "toc_window_display": false 266 | } 267 | }, 268 | "nbformat": 4, 269 | "nbformat_minor": 2 270 | } 271 | -------------------------------------------------------------------------------- /Pandas 00 - Intro.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas 00 - Intro\n", 8 | "\n", 9 | "by Nova@Douban\n", 10 | "\n", 11 | "The video record of this session is here: https://zoom.us/recording/share/rDS-o_BWuPyBYIbswQ6bKJ5QGeFzY50BVFnBnw4t7pOwIumekTziMw?startTime=1545565951000\n", 12 | "\n", 13 | "---\n", 14 | "\n", 15 | 
"## 0.1 Course overview\n", 16 | "\n", 17 | "\n", 18 | "\n", 19 | "----" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## 0.2 How to learn pandas?\n", 27 | "\n", 28 | "1. __Code, Code, and Code!__\n", 29 | "2. Read [pandas documentation](http://pandas.pydata.org/pandas-docs/stable/)\n", 30 | "3. Check [StackOverflow](http://stackoverflow.com)\n", 31 | "4. Check reference books\n", 32 | " 1. _Python for Data Analysis_\n", 33 | " 2. _Learning Pandas - Python Data Discovery and Analysis Made Easy_\n", 34 | "5. Check blogs\n", 35 | " 1. [pandas's Author Wes McKinney](http://wesmckinney.com/archives.html)\n", 36 | " 2. [Dataquest](https://www.dataquest.io/blog/)\n", 37 | " 3. [Introduction to Pandas by Ritchie Ng](https://www.ritchieng.com/tag_pandas/)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "---" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## 0.3 A Brief Overview of pandas\n", 52 | " \n", 53 | "### 0.3.1 When to use pandas?\n", 54 | "\n", 55 | "1. If the dataset can fit in your local machine / single server, use pandas;\n", 56 | "2. If you want to speed up Python computing, use pandas;\n", 57 | "3. If the computing logic is too complicated to simple SQL queries, use pandas;\n", 58 | "4. If you want to convert data file format, use pandas;\n", 59 | "\n", 60 | "### 0.3.2 How to use pandas?\n", 61 | "\n", 62 | "1. If you use pandas, use it in pandas way;\n", 63 | "2. If you use pandas, use it as a framework.\n", 64 | "3. If you use pandas, track the code, not the data.\n", 65 | "\n", 66 | "\n", 67 | "### 0.3.3 Some basic principles of pandas\n", 68 | "\n", 69 | "1. There are multiple ways to finish a task in pandas.\n", 70 | "\n", 71 | "2. If there are multiple ways to write in pandas, we shall choose the most suitable way.\n", 72 | "\n", 73 | "3. 
If we use pandas, take it as a framework, not just a tool.\n", 74 | "\n", 75 | "---" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "## 0.4 Examples\n", 83 | "\n", 84 | "### 0.4.1 Multiple ways to drop a column" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 1, 90 | "metadata": { 91 | "ExecuteTime": { 92 | "end_time": "2019-01-26T08:32:14.654893Z", 93 | "start_time": "2019-01-26T08:32:13.755566Z" 94 | } 95 | }, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/html": [ 100 | "
\n", 101 | "\n", 114 | "\n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | "
DateOpenHighLowCloseAdj CloseVolume
02018-11-236919.5200206987.8901376919.1601566938.9799806938.979980958950000
12018-11-267026.5000007083.9301767003.1201177081.8500987081.8500982011180000
22018-11-277041.2299807105.1401377014.3598637082.7001957082.7001952067360000
32018-11-287135.0800787292.7099617090.9799807291.5898447291.5898442390260000
42018-11-297267.3701177319.9599617217.6899417273.0800787273.0800781983460000
\n", 180 | "
" 181 | ], 182 | "text/plain": [ 183 | " Date Open High Low Close \\\n", 184 | "0 2018-11-23 6919.520020 6987.890137 6919.160156 6938.979980 \n", 185 | "1 2018-11-26 7026.500000 7083.930176 7003.120117 7081.850098 \n", 186 | "2 2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 \n", 187 | "3 2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 \n", 188 | "4 2018-11-29 7267.370117 7319.959961 7217.689941 7273.080078 \n", 189 | "\n", 190 | " Adj Close Volume \n", 191 | "0 6938.979980 958950000 \n", 192 | "1 7081.850098 2011180000 \n", 193 | "2 7082.700195 2067360000 \n", 194 | "3 7291.589844 2390260000 \n", 195 | "4 7273.080078 1983460000 " 196 | ] 197 | }, 198 | "execution_count": 1, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "# Download Nasdaq dataset: https://finance.yahoo.com/quote/%5EIXIC/history?p=%5EIXIC\n", 205 | "\n", 206 | "import pandas as pd\n", 207 | "\n", 208 | "in_file = '../data/nasdaq.csv'\n", 209 | "df = pd.read_csv(in_file, engine='c')\n", 210 | "df.head()\n", 211 | "# df.describe()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 2, 217 | "metadata": { 218 | "ExecuteTime": { 219 | "end_time": "2019-01-26T08:32:14.667953Z", 220 | "start_time": "2019-01-26T08:32:14.658978Z" 221 | } 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "def drop_col_1(df, col):\n", 226 | " '''\n", 227 | " using drop\n", 228 | " '''\n", 229 | " df1 = df.copy()\n", 230 | " df1.drop(col, axis=1, inplace=True)\n", 231 | " return df1\n", 232 | "\n", 233 | "def drop_col_2(df, col):\n", 234 | " '''\n", 235 | " using del\n", 236 | " '''\n", 237 | " df2 = df.copy()\n", 238 | " del(df2[col])\n", 239 | " return df2\n", 240 | "\n", 241 | "def drop_col_3(df, col):\n", 242 | " '''\n", 243 | " using boolean selection\n", 244 | " '''\n", 245 | " df3 = df.copy()\n", 246 | " \n", 247 | " cols = list(df3.columns)\n", 248 | " cols.remove(col)\n", 249 | " \n", 250 | " df3 = df3[cols]\n", 
251 | " return df3" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "## 0.4.2 Choose the most suitable way from multiple ways\n", 259 | "\n", 260 | "We can use different profiling tools to benchmark the performence of different ways:\n", 261 | "\n", 262 | "1. %timeit for speed\n", 263 | "2. %memit for memory consumption" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": 3, 269 | "metadata": { 270 | "ExecuteTime": { 271 | "end_time": "2019-01-26T08:32:39.807971Z", 272 | "start_time": "2019-01-26T08:32:14.672425Z" 273 | } 274 | }, 275 | "outputs": [ 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "1.08 ms ± 97.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n", 281 | "510 µs ± 22.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n", 282 | "1 ms ± 124 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "%timeit r1 = drop_col_1(df, ['Open'])\n", 288 | "%timeit r2 = drop_col_2(df, 'Open')\n", 289 | "%timeit r3 = drop_col_3(df, 'Open')" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 4, 295 | "metadata": { 296 | "ExecuteTime": { 297 | "end_time": "2019-01-26T08:32:40.455867Z", 298 | "start_time": "2019-01-26T08:32:39.812272Z" 299 | } 300 | }, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "peak memory: 76.87 MiB, increment: 0.45 MiB\n", 307 | "peak memory: 76.88 MiB, increment: 0.00 MiB\n", 308 | "peak memory: 76.88 MiB, increment: 0.00 MiB\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "%load_ext memory_profiler\n", 314 | "%memit r1 = drop_col_1(df, ['Open'])\n", 315 | "%memit r2 = drop_col_2(df, 'Open')\n", 316 | "%memit r2 = drop_col_3(df, 'Open')" 317 | ] 318 | }, 319 | { 320 | "cell_type": "markdown", 321 | "metadata": {}, 322 | "source": [ 323 | "__Conclusion__\n", 324 | "\n", 325 
| "1. `drop_col_2` used the least time among the three methods;\n", 326 | "2. Three methods consumed same memory;\n", 327 | "3. `drop_col_1` is the most pandas way, and `drop_col_3` is the least readable\n", 328 | "\n", 329 | "Therefore, we choose `drop_col_2` to finish this task" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "## 0.4.3 Use pandas a framework\n", 337 | "\n", 338 | "Suppose we have a task as following:\n", 339 | "\n", 340 | "1. read data from a CSV file;\n", 341 | "2. calculate some results according to requirements;\n", 342 | "3. output results to a Json and an Excel file.\n", 343 | "\n", 344 | "These jobs can be easily handled by pandas." 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 5, 350 | "metadata": { 351 | "ExecuteTime": { 352 | "end_time": "2019-01-26T08:32:40.475927Z", 353 | "start_time": "2019-01-26T08:32:40.462048Z" 354 | } 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "def process_nasdaq(in_csv, out_json, out_excel):\n", 359 | " # read from CSV\n", 360 | " df = pd.read_csv(in_csv, engine='c')\n", 361 | "\n", 362 | " # Clean data\n", 363 | " df.rename(columns={'Adj Close': 'Adj_close'}, inplace=True)\n", 364 | "\n", 365 | " # Calcualtion\n", 366 | " df['Max_diff'] = df['High'] - df['Low']\n", 367 | " df['Open_close_diff'] = df['Close'] - df['Open']\n", 368 | "\n", 369 | " # Output to Json\n", 370 | " df.to_json(out_json, lines=True, orient='records')\n", 371 | " \n", 372 | " # Output to Excel\n", 373 | " writer = pd.ExcelWriter(out_excel)\n", 374 | " df.to_excel(writer,'Sheet1')\n", 375 | " writer.save()" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 6, 381 | "metadata": { 382 | "ExecuteTime": { 383 | "end_time": "2019-01-26T08:32:40.815973Z", 384 | "start_time": "2019-01-26T08:32:40.479655Z" 385 | } 386 | }, 387 | "outputs": [], 388 | "source": [ 389 | "in_csv = '../data/nasdaq.csv'\n", 390 | "out_json = 
'../data/nasdaq.json'\n", 391 | "out_excel = '../data/nasdaq.xlsx'\n", 392 | "process_nasdaq(in_csv, out_json, out_excel)" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "---\n", 400 | "\n", 401 | "To the rest sessions (outlines and video records), please scan the QR code below to pay.\n", 402 | "\n", 403 | "1. The price is 799 RMB.\n", 404 | "2. Please leave your email address in the __payment comment__, so I will send you the links of the rest sessions.\n", 405 | "\n", 406 | "\n", 407 | "" 408 | ] 409 | } 410 | ], 411 | "metadata": { 412 | "kernelspec": { 413 | "display_name": "Python 3", 414 | "language": "python", 415 | "name": "python3" 416 | }, 417 | "language_info": { 418 | "codemirror_mode": { 419 | "name": "ipython", 420 | "version": 3 421 | }, 422 | "file_extension": ".py", 423 | "mimetype": "text/x-python", 424 | "name": "python", 425 | "nbconvert_exporter": "python", 426 | "pygments_lexer": "ipython3", 427 | "version": "3.7.2" 428 | }, 429 | "toc": { 430 | "base_numbering": 1, 431 | "nav_menu": {}, 432 | "number_sections": false, 433 | "sideBar": true, 434 | "skip_h1_title": false, 435 | "title_cell": "Table of Contents", 436 | "title_sidebar": "Contents", 437 | "toc_cell": false, 438 | "toc_position": {}, 439 | "toc_section_display": true, 440 | "toc_window_display": false 441 | } 442 | }, 443 | "nbformat": 4, 444 | "nbformat_minor": 2 445 | } 446 | -------------------------------------------------------------------------------- /pandas 08 - Speed up with pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# pandas 08 - Speed up with pandas\n", 8 | "\n", 9 | "by Nova@Douban\n", 10 | "\n", 11 | "The video record of this session is here: https://zoom.us/recording/share/1ZCrNvlQG3rchlnczSpwCdq89ZZR12SZ75a2QvcI19WwIumekTziMw\n", 12 | "\n", 13 | "---\n", 14 | "\n", 
15 | "In this tutotial, we will share how to process big datasets with pandas. Here, the big datasets mean those datasets are too big for a single machine.\n", 16 | "\n", 17 | "## 8.1 Stream processing\n", 18 | "\n", 19 | "pandas is an efficient tool to process data, but when the dataset cannot be fit in memory, using pandas could be a little bit tricky. If the dataset is big enough to take all of the memorys, the pandas task will get stuck there.\n", 20 | "\n", 21 | "One way to deal this problem is to apply stream processing to pandas with `chunksize` parameter in `pd.read_csv()`, `pd.read_table()`, `pd.read_json(lines=True)` \n", 22 | "\n", 23 | "---\n", 24 | "\n", 25 | "### 8.1.1 An example of stream processing" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": { 32 | "ExecuteTime": { 33 | "end_time": "2019-01-20T12:51:33.558058Z", 34 | "start_time": "2019-01-20T12:51:33.546775Z" 35 | } 36 | }, 37 | "outputs": [], 38 | "source": [ 39 | "def preprocess_patent(in_f, out_f):\n", 40 | " '''\n", 41 | " normal read and write\n", 42 | " '''\n", 43 | " df = pd.read_table(in_f, sep='##')\n", 44 | " df.columns = ['id0', 'id1', 'ref']\n", 45 | " result = df[(df['ref'].str.contains('^[a-zA-Z]+')) & (df.ref['ref'].len() > 80)]\n", 46 | " result.to_csv(out_f, index=False, header=False, mode='w')\n", 47 | "\n", 48 | "def preprocess_patent(in_f, out_f, size):\n", 49 | " '''\n", 50 | " read a chunk,\n", 51 | " process a chunk,\n", 52 | " write a chunk,\n", 53 | " then repeat\n", 54 | " '''\n", 55 | " reader = pd.read_table(in_f, sep='##', chunksize=size)\n", 56 | " for chunk in reader:\n", 57 | " chunk.columns = ['id0', 'id1', 'ref']\n", 58 | " result = chunk[(chunk['ref'].str.contains('^[a-zA-Z]+')) & (chunk['ref'].str.len() > 80)]\n", 59 | " result.to_csv(out_f, index=False, header=False, mode='a')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "Some aspects are worth paying attetion 
to:\n", 67 | "\n", 68 | "1. The `chunksize` should not be too small. If it is too small, the IO cost will be high to overcome the benefit. For example, if we have a file with one million lines, we did a little experiment:\n", 69 | "\n", 70 | "| Chunksize | Memory (MiB) | Time (s) |\n", 71 | "|-----------|--------------|----------|\n", 72 | "| 100 | 142.13 | 36.9 |\n", 73 | "| 1,000 | 141.38 | 13.8 |\n", 74 | "| 10,000 | 141.38 | 12.1 |\n", 75 | "| 100,000 | 209.88 | 12.7 |\n", 76 | "| 200,000 | 312.15 | 12.5 |\n", 77 | "\n", 78 | "In our main task, we set `chunksize` as 200,000, and it used 211.22MiB memory to process the 10G+ dataset with 9min 54s.\n", 79 | "\n", 80 | "2. the `pandas.DataFrame.to_csv()` mode should be set as 'a' to append chunk results to a single file; otherwise, only the last chunk will be saved.\n", 81 | "\n", 82 | "### 8.1.2 Be Careful with the Index\n", 83 | "\n", 84 | "Once, I had a strange with above stream processing logic:" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 2, 90 | "metadata": { 91 | "ExecuteTime": { 92 | "end_time": "2019-01-20T12:51:35.512216Z", 93 | "start_time": "2019-01-20T12:51:33.562070Z" 94 | } 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stderr", 99 | "output_type": "stream", 100 | "text": [ 101 | "WARNING: Not updating worker name since `setproctitle` is not installed. 
Install this with `pip install setproctitle` (or ray[debug]) to enable monitoring of worker processes.\n", 102 | "Process STDOUT and STDERR is being redirected to /tmp/ray/session_2019-01-20_20-51-34_41781/logs.\n", 103 | "Waiting for redis server at 127.0.0.1:59136 to respond...\n", 104 | "Waiting for redis server at 127.0.0.1:53672 to respond...\n", 105 | "Starting the Plasma object store with 6.871947672999999 GB memory using /tmp.\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "import pandas as pd\n", 111 | "import modin.pandas as mp\n", 112 | "\n", 113 | "def stream_process(IN_FILE, OUT_FILE):\n", 114 | " reader = pd.read_csv(IN_FILE, chunksize = 1000, engine='c')\n", 115 | " for chunk in reader:\n", 116 | " result = []\n", 117 | " for line in chunk.itertuples():\n", 118 | " temp = complicated_process(chunk) # this involves a very complicated processing, so here is just a simplified version\n", 119 | " result.append(temp)\n", 120 | " chunk['new_series'] = pd.Series(result)\n", 121 | " chunk.to_csv(OUT_FILE, index=False, mode='a')" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "---\n", 129 | "\n", 130 | "I can confirm that in each loop, `result` is not empty. But only in the first iteration of the loop does the line `chunk['new_series'] = pd.Series(result)` produce values, and the rest are empty. Therefore, only the first chunk of the output contains new_series, and the rest are empty.\n", 131 | "\n", 132 | "When we tracked the index of each chunk, we found that they are not independent. We assumed that each chunk would start the index from 0, but in reality, it is NOT. The index of each chunk is a subset of the whole CSV in this situation, so their index derives from the CSV. This is what caused the problem. 
In our initial logic, the `pandas.to_csv` writes only the result of the first chunk, instead of the last chunk.\n", 133 | "\n", 134 | "Therefore, a better solution would be rebuild index for each chunk, and concatenating it with result." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 3, 140 | "metadata": { 141 | "ExecuteTime": { 142 | "end_time": "2019-01-20T12:51:35.528613Z", 143 | "start_time": "2019-01-20T12:51:35.518159Z" 144 | } 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | "def stream_process(IN_FILE, OUT_FILE):\n", 149 | " reader = pd.read_csv(IN_FILE, chunksize = 1000, engine='c')\n", 150 | " for chunk in reader:\n", 151 | " result = []\n", 152 | " for line in chunk.tolist():\n", 153 | " temp = complicated_process(chunk) # this involves a very complicated processing, so here is just a simplified version\n", 154 | " result.append(temp)\n", 155 | " new_chunk = chunk.reindex() # key solver\n", 156 | " new_chunk = new_chunk.assign(new_series=result)\n", 157 | " new_chunk.to_csv(OUT_TILE, index=False, mode='a')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "## 8.2 Reduce memory usage with pandas\n", 165 | "\n", 166 | "### 8.2.1 Reduce with categorical type\n", 167 | "\n", 168 | "Often, some columns of data are catagorical, but they are saved as non-catagorical types. 
Therefore, we can convert them to categorical type with `pd.Series.astype('category')`" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 4, 174 | "metadata": { 175 | "ExecuteTime": { 176 | "end_time": "2019-01-20T12:51:36.014380Z", 177 | "start_time": "2019-01-20T12:51:35.532740Z" 178 | } 179 | }, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "\n", 186 | "RangeIndex: 321792 entries, 0 to 321791\n", 187 | "Data columns (total 3 columns):\n", 188 | "MSISDN_SEG 321792 non-null int64\n", 189 | "AREA_CODE 321792 non-null int64\n", 190 | "ASP 321792 non-null int64\n", 191 | "dtypes: int64(3)\n", 192 | "memory usage: 7.4 MB\n" 193 | ] 194 | }, 195 | { 196 | "data": { 197 | "text/html": [ 198 | "
\n", 199 | "\n", 212 | "\n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | "
MSISDN_SEGAREA_CODEASP
01451091102
11451092102
21451093102
31451094102
41451095102
\n", 254 | "
" 255 | ], 256 | "text/plain": [ 257 | " MSISDN_SEG AREA_CODE ASP\n", 258 | "0 1451091 10 2\n", 259 | "1 1451092 10 2\n", 260 | "2 1451093 10 2\n", 261 | "3 1451094 10 2\n", 262 | "4 1451095 10 2" 263 | ] 264 | }, 265 | "metadata": {}, 266 | "output_type": "display_data" 267 | }, 268 | { 269 | "data": { 270 | "text/plain": [ 271 | "None" 272 | ] 273 | }, 274 | "metadata": {}, 275 | "output_type": "display_data" 276 | }, 277 | { 278 | "data": { 279 | "text/plain": [ 280 | "'2.46 MB'" 281 | ] 282 | }, 283 | "metadata": {}, 284 | "output_type": "display_data" 285 | }, 286 | { 287 | "data": { 288 | "text/plain": [ 289 | "'0.63 MB'" 290 | ] 291 | }, 292 | "metadata": {}, 293 | "output_type": "display_data" 294 | } 295 | ], 296 | "source": [ 297 | "def mem_usage(pandas_obj):\n", 298 | " '''\n", 299 | " We will use this to check memory usage\n", 300 | " '''\n", 301 | " if isinstance(pandas_obj,pd.DataFrame):\n", 302 | " usage_b = pandas_obj.memory_usage(deep=True).sum()\n", 303 | " else: # we assume if not a df it's a series\n", 304 | " usage_b = pandas_obj.memory_usage(deep=True)\n", 305 | " usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes\n", 306 | " return \"{:03.2f} MB\".format(usage_mb)\n", 307 | "\n", 308 | "df = pd.read_csv('../data/mobile_phone.csv')\n", 309 | "display(df.head(), df.info())\n", 310 | "display(mem_usage(df['AREA_CODE']), mem_usage(df['AREA_CODE'].astype('category')))" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "### 8.2.2 Choose the right subtypes\n", 318 | "\n", 319 | "pandas often chooses a safer dtype to store data; however, this may additional memory usage. 
For example, we can downcast numerical to `unsigned int` to save space for positive-only integers.\n", 320 | "\n", 321 | "\n" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 5, 327 | "metadata": { 328 | "ExecuteTime": { 329 | "end_time": "2019-01-20T12:51:36.383063Z", 330 | "start_time": "2019-01-20T12:51:36.018784Z" 331 | } 332 | }, 333 | "outputs": [ 334 | { 335 | "name": "stdout", 336 | "output_type": "stream", 337 | "text": [ 338 | "\n", 339 | "RangeIndex: 321792 entries, 0 to 321791\n", 340 | "Data columns (total 3 columns):\n", 341 | "MSISDN_SEG 321792 non-null uint32\n", 342 | "AREA_CODE 321792 non-null uint16\n", 343 | "ASP 321792 non-null uint8\n", 344 | "dtypes: uint16(1), uint32(1), uint8(1)\n", 345 | "memory usage: 2.1 MB\n" 346 | ] 347 | }, 348 | { 349 | "data": { 350 | "text/html": [ 351 | "
\n", 352 | "\n", 365 | "\n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | "
MSISDN_SEGAREA_CODEASP
01451091102
11451092102
21451093102
31451094102
41451095102
\n", 407 | "
" 408 | ], 409 | "text/plain": [ 410 | " MSISDN_SEG AREA_CODE ASP\n", 411 | "0 1451091 10 2\n", 412 | "1 1451092 10 2\n", 413 | "2 1451093 10 2\n", 414 | "3 1451094 10 2\n", 415 | "4 1451095 10 2" 416 | ] 417 | }, 418 | "metadata": {}, 419 | "output_type": "display_data" 420 | }, 421 | { 422 | "data": { 423 | "text/plain": [ 424 | "None" 425 | ] 426 | }, 427 | "metadata": {}, 428 | "output_type": "display_data" 429 | }, 430 | { 431 | "data": { 432 | "text/plain": [ 433 | "'7.37 MB'" 434 | ] 435 | }, 436 | "metadata": {}, 437 | "output_type": "display_data" 438 | }, 439 | { 440 | "data": { 441 | "text/plain": [ 442 | "'2.15 MB'" 443 | ] 444 | }, 445 | "metadata": {}, 446 | "output_type": "display_data" 447 | } 448 | ], 449 | "source": [ 450 | "df_int = df.select_dtypes(include=['int'])\n", 451 | "converted_int = df_int.apply(pd.to_numeric, downcast='unsigned')\n", 452 | "\n", 453 | "display(df_int.head(), converted_int.info())\n", 454 | "display(mem_usage(df_int), mem_usage(converted_int))" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "### 8.2.3 Read data with clarifying dtype of each column\n", 462 | "\n", 463 | "When we use pandas to read date, we can set `dtype` for each column, so pandas doesn't have to guess, and also saves space." 
464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": 6, 469 | "metadata": { 470 | "ExecuteTime": { 471 | "end_time": "2019-01-20T12:51:36.890337Z", 472 | "start_time": "2019-01-20T12:51:36.390905Z" 473 | } 474 | }, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "CPU times: user 123 ms, sys: 51.9 ms, total: 175 ms\n", 481 | "Wall time: 299 ms\n", 482 | "CPU times: user 111 ms, sys: 21.2 ms, total: 132 ms\n", 483 | "Wall time: 133 ms\n", 484 | "\n", 485 | "RangeIndex: 321792 entries, 0 to 321791\n", 486 | "Data columns (total 3 columns):\n", 487 | "MSISDN_SEG 321792 non-null int64\n", 488 | "AREA_CODE 321792 non-null int64\n", 489 | "ASP 321792 non-null int64\n", 490 | "dtypes: int64(3)\n", 491 | "memory usage: 7.4 MB\n", 492 | "\n", 493 | "RangeIndex: 321792 entries, 0 to 321791\n", 494 | "Data columns (total 3 columns):\n", 495 | "MSISDN_SEG 321792 non-null uint32\n", 496 | "AREA_CODE 321792 non-null category\n", 497 | "ASP 321792 non-null category\n", 498 | "dtypes: category(2), uint32(1)\n", 499 | "memory usage: 2.2 MB\n" 500 | ] 501 | }, 502 | { 503 | "data": { 504 | "text/plain": [ 505 | "None" 506 | ] 507 | }, 508 | "metadata": {}, 509 | "output_type": "display_data" 510 | }, 511 | { 512 | "data": { 513 | "text/plain": [ 514 | "None" 515 | ] 516 | }, 517 | "metadata": {}, 518 | "output_type": "display_data" 519 | } 520 | ], 521 | "source": [ 522 | "%time df1 = pd.read_csv('../data/mobile_phone.csv', engine='c')\n", 523 | "\n", 524 | "column_type = {'MSISDN_SEG': 'uint32', 'AREA_CODE': 'category', 'ASP': 'category'}\n", 525 | "%time df2 = pd.read_csv('../data/mobile_phone.csv', dtype=column_type, engine='c')\n", 526 | "\n", 527 | "display(df1.info(), df2.info())" 528 | ] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "metadata": {}, 533 | "source": [ 534 | "## 8.3 Cython\n", 535 | "\n", 536 | "Cython is a C implementation of Python, and pandas can work nicely with Cython. 
If you don't know C, you can just import Cython to gain speed; moreover, if you know C, you can modify your code to Cython syntax to gain extra speed." 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 7, 542 | "metadata": { 543 | "ExecuteTime": { 544 | "end_time": "2019-01-20T12:51:36.914987Z", 545 | "start_time": "2019-01-20T12:51:36.893911Z" 546 | } 547 | }, 548 | "outputs": [ 549 | { 550 | "name": "stdout", 551 | "output_type": "stream", 552 | "text": [ 553 | "CPU times: user 5.89 ms, sys: 2.62 ms, total: 8.51 ms\n", 554 | "Wall time: 8.62 ms\n" 555 | ] 556 | } 557 | ], 558 | "source": [ 559 | "def demo_calc(series):\n", 560 | " return series * 67 - 89 / 45\n", 561 | "\n", 562 | "%time series1 = demo_calc(df1['MSISDN_SEG'])" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 8, 568 | "metadata": { 569 | "ExecuteTime": { 570 | "end_time": "2019-01-20T12:51:37.879341Z", 571 | "start_time": "2019-01-20T12:51:36.923659Z" 572 | } 573 | }, 574 | "outputs": [ 575 | { 576 | "name": "stdout", 577 | "output_type": "stream", 578 | "text": [ 579 | "CPU times: user 2.09 ms, sys: 853 µs, total: 2.94 ms\n", 580 | "Wall time: 2.95 ms\n" 581 | ] 582 | } 583 | ], 584 | "source": [ 585 | "%load_ext cython\n", 586 | "\n", 587 | "%time series2 = demo_calc(df1['MSISDN_SEG'])" 588 | ] 589 | }, 590 | { 591 | "cell_type": "markdown", 592 | "metadata": {}, 593 | "source": [ 594 | "## 8.4 Modin\n", 595 | "\n", 596 | "Modin is a DataFrame library that allows you to speed up your pandas workflows by changing one line of code. 
" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": 9, 602 | "metadata": { 603 | "ExecuteTime": { 604 | "end_time": "2019-01-20T12:51:38.090576Z", 605 | "start_time": "2019-01-20T12:51:37.882419Z" 606 | } 607 | }, 608 | "outputs": [ 609 | { 610 | "name": "stdout", 611 | "output_type": "stream", 612 | "text": [ 613 | "CPU times: user 89.7 ms, sys: 29.9 ms, total: 120 ms\n", 614 | "Wall time: 118 ms\n", 615 | "CPU times: user 8.75 ms, sys: 2.44 ms, total: 11.2 ms\n", 616 | "Wall time: 81.7 ms\n" 617 | ] 618 | } 619 | ], 620 | "source": [ 621 | "%time df1 = pd.read_csv('../data/mobile_phone.csv')\n", 622 | "%time df2 = mp.read_csv('../data/mobile_phone.csv')" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 10, 628 | "metadata": { 629 | "ExecuteTime": { 630 | "end_time": "2019-01-20T12:51:38.249991Z", 631 | "start_time": "2019-01-20T12:51:38.094025Z" 632 | } 633 | }, 634 | "outputs": [ 635 | { 636 | "name": "stdout", 637 | "output_type": "stream", 638 | "text": [ 639 | "CPU times: user 7.7 ms, sys: 2.33 ms, total: 10 ms\n", 640 | "Wall time: 6.94 ms\n", 641 | "CPU times: user 113 ms, sys: 11.6 ms, total: 124 ms\n", 642 | "Wall time: 136 ms\n" 643 | ] 644 | } 645 | ], 646 | "source": [ 647 | "def demo_calc(series):\n", 648 | " return series * 67 - 89 / 45\n", 649 | "\n", 650 | "def demo_calc2(series):\n", 651 | " return mp.Series(series) * 67 - 89 / 45\n", 652 | "\n", 653 | "%time series1 = demo_calc(df1['MSISDN_SEG'])\n", 654 | "%time series2 = demo_calc2(df2['MSISDN_SEG'])" 655 | ] 656 | }, 657 | { 658 | "cell_type": "markdown", 659 | "metadata": {}, 660 | "source": [ 661 | "---\n", 662 | "\n", 663 | "## 8.5 Exercises\n", 664 | "\n", 665 | "1. Read [Enhancing Performance](https://pandas.pydata.org/pandas-docs/stable/enhancingperf.html) by pandas\n", 666 | "2. Read this post [Tutorial: Using pandas with Large Data Sets](https://www.dataquest.io/blog/pandas-big-data/)\n", 667 | "3. 
We showed using `chunksize` with `pd.read_csv`; can you try this with `pd.read_json`?
supports reading and writing these commonly-used file formats:
If we want to keep header and index, we can set `header` and `index` as `True`, and vice versa." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 1, 79 | "metadata": { 80 | "ExecuteTime": { 81 | "end_time": "2019-01-06T11:08:31.180777Z", 82 | "start_time": "2019-01-06T11:08:30.639715Z" 83 | } 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "import pandas as pd\n", 88 | "%load_ext memory_profiler\n", 89 | "in_csv = '../data/first_count.csv'" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 2, 95 | "metadata": { 96 | "ExecuteTime": { 97 | "end_time": "2019-01-06T11:08:31.598577Z", 98 | "start_time": "2019-01-06T11:08:31.184404Z" 99 | } 100 | }, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "peak memory: 108.51 MiB, increment: 34.71 MiB\n", 107 | "CPU times: user 210 ms, sys: 75 ms, total: 285 ms\n", 108 | "Wall time: 407 ms\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "%time %memit first_name = pd.read_csv(in_csv, engine='c')" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 3, 119 | "metadata": { 120 | "ExecuteTime": { 121 | "end_time": "2019-01-06T11:08:33.201264Z", 122 | "start_time": "2019-01-06T11:08:31.604767Z" 123 | } 124 | }, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "peak memory: 173.65 MiB, increment: 67.99 MiB\n", 131 | "CPU times: user 1.35 s, sys: 105 ms, total: 1.45 s\n", 132 | "Wall time: 1.59 s\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "%time %memit first_name = pd.read_csv(in_csv, engine='python')" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 4, 143 | "metadata": { 144 | "ExecuteTime": { 145 | "end_time": "2019-01-06T11:08:33.236176Z", 146 | "start_time": "2019-01-06T11:08:33.204439Z" 147 | } 148 | }, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "\n", 
155 | "RangeIndex: 321792 entries, 0 to 321791\n", 156 | "Data columns (total 3 columns):\n", 157 | "MSISDN_SEG 321792 non-null int64\n", 158 | "AREA_CODE 321792 non-null int64\n", 159 | "ASP 321792 non-null int64\n", 160 | "dtypes: int64(3)\n", 161 | "memory usage: 7.4 MB\n" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "first_name.info()" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 5, 172 | "metadata": { 173 | "ExecuteTime": { 174 | "end_time": "2019-01-06T11:08:34.941681Z", 175 | "start_time": "2019-01-06T11:08:33.239012Z" 176 | } 177 | }, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "peak memory: 125.99 MiB, increment: 14.78 MiB\n", 184 | "CPU times: user 1.48 s, sys: 59.4 ms, total: 1.54 s\n", 185 | "Wall time: 1.7 s\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "out_csv = '../data/first_count.csv'\n", 191 | "%time %memit first_name.to_csv(out_csv, header=True, index=False)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "---\n", 199 | "\n", 200 | "## 4.2 JSON\n", 201 | "\n", 202 | "JSON has gain more popularity recently. It has more controls on data, but it is not very human-friendly. JSON has different orients: `split`, `records`, `index`, `columns` or `values`. \n", 203 | "\n", 204 | "_Screenshot of JSON columns file_\n", 205 | "\n", 206 | "\n", 207 | "\n", 208 | "_Screenshot of JSON index file_\n", 209 | "\n", 210 | "\n", 211 | "\n", 212 | "_Screenshot of JSON split file_\n", 213 | "\n", 214 | "\n", 215 | "\n", 216 | "_Screenshot of JSON values file\n", 217 | "\n", 218 | "\n", 219 | "\n", 220 | "_Screenshot of JSON lines file\n", 221 | "\n", 222 | "" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "### 4.2.1 Read JSON to DataFrame\n", 230 | "\n", 231 | "Because it has a number of orients, it is quite easy to get confused. 
Therefore, when we use Pandas to read a JSON file, we have to specify the orient. \n", 232 | "\n", 233 | "__Moreover, it the file is line-based, we can set `lines` as `True`.__\n", 234 | "\n", 235 | "`pd.read_json(in_file, orient='records', lines=False)`\n", 236 | "\n", 237 | "### 4.2.2 Write DataFrame to JSON\n", 238 | "\n", 239 | "Always save Json as `lines`\n", 240 | "\n", 241 | "`pd.DataFrame.to_json(out_file, orient='records', lines=False)`" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 6, 247 | "metadata": { 248 | "ExecuteTime": { 249 | "end_time": "2019-01-06T11:08:35.173713Z", 250 | "start_time": "2019-01-06T11:08:34.945172Z" 251 | } 252 | }, 253 | "outputs": [ 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "peak memory: 123.61 MiB, increment: 0.23 MiB\n", 259 | "CPU times: user 68.2 ms, sys: 37.2 ms, total: 105 ms\n", 260 | "Wall time: 219 ms\n" 261 | ] 262 | } 263 | ], 264 | "source": [ 265 | "in_json = '../data/nasdaq.json'\n", 266 | "%time %memit nasdaq = pd.read_json(in_json, orient='records', lines=True)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 7, 272 | "metadata": { 273 | "ExecuteTime": { 274 | "end_time": "2019-01-06T11:08:35.547768Z", 275 | "start_time": "2019-01-06T11:08:35.177696Z" 276 | } 277 | }, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "peak memory: 180.29 MiB, increment: 56.68 MiB\n", 284 | "CPU times: user 180 ms, sys: 62.8 ms, total: 243 ms\n", 285 | "Wall time: 361 ms\n" 286 | ] 287 | } 288 | ], 289 | "source": [ 290 | "out_json = '../data/first_count.json'\n", 291 | "%time %memit first_name.to_json(out_json, orient='records', lines=True)" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "### 4.2.3 Swiss knife for JSON\n", 299 | "\n", 300 | "Sometimes, a JSON file can be very nasty, and we just couldn't figure out how to 
read it. Luckily, `pandas` has a Swiss knife for this task -- `pd.io.json.json_normalize`.\n", 301 | "\n", 302 | "The example in ths screenshot is an Unserialized JSON file generated by `request` lib. This file cannot be read by `pd.read_json`, so we used `pd.io.json.json_normalize` instead. \n", 303 | "\n", 304 | "" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 8, 310 | "metadata": { 311 | "ExecuteTime": { 312 | "end_time": "2019-01-06T11:08:35.801634Z", 313 | "start_time": "2019-01-06T11:08:35.551407Z" 314 | } 315 | }, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "peak memory: 175.99 MiB, increment: 0.04 MiB\n", 322 | "CPU times: user 55.8 ms, sys: 30.7 ms, total: 86.5 ms\n", 323 | "Wall time: 199 ms\n" 324 | ] 325 | }, 326 | { 327 | "data": { 328 | "text/html": [ 329 | "
\n", 330 | "\n", 343 | "\n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | "
_geoloc.lat_geoloc.lng_highlightResult.region.hascadvertiser.avatar_urladvertiser.categoryadvertiser.idadvertiser.nameadvertiser.phoneadvertiser.phone_full.phoneadvertiser.phone_full.status...rental_yieldsale_priceslugstationtitle.entitle.jatitle.rutitle.thtokentransaction
013.723916100.566902[{'value': 'TH', 'matchLevel': 'none', 'matche...https://files.hipcdn.com/avatars/53faa8bd93164...Agent559636cd70726f2451000094Findbangkokroom.com099-095-5...099-095-5535ok...NoneNonebangkok-condo[509ea305d2af11286e000ace, 509ea305d2af11286e0...For Rent 5 Beds Condo in Khlong Toei, Bangkok,...For Rent 5 Beds コンド in Khlong Toei, Bangkok, T...В аренду: Кондо с 5 спальнями в районе Khlong ...ให้เช่า คอนโด 5 ห้องนอน คลองเตย กรุงเทพฯAAFBALPC[rent]
\n", 397 | "

1 rows × 75 columns

\n", 398 | "
" 399 | ], 400 | "text/plain": [ 401 | " _geoloc.lat _geoloc.lng \\\n", 402 | "0 13.723916 100.566902 \n", 403 | "\n", 404 | " _highlightResult.region.hasc \\\n", 405 | "0 [{'value': 'TH', 'matchLevel': 'none', 'matche... \n", 406 | "\n", 407 | " advertiser.avatar_url advertiser.category \\\n", 408 | "0 https://files.hipcdn.com/avatars/53faa8bd93164... Agent \n", 409 | "\n", 410 | " advertiser.id advertiser.name advertiser.phone \\\n", 411 | "0 559636cd70726f2451000094 Findbangkokroom.com 099-095-5... \n", 412 | "\n", 413 | " advertiser.phone_full.phone advertiser.phone_full.status ... \\\n", 414 | "0 099-095-5535 ok ... \n", 415 | "\n", 416 | " rental_yield sale_price slug \\\n", 417 | "0 None None bangkok-condo \n", 418 | "\n", 419 | " station \\\n", 420 | "0 [509ea305d2af11286e000ace, 509ea305d2af11286e0... \n", 421 | "\n", 422 | " title.en \\\n", 423 | "0 For Rent 5 Beds Condo in Khlong Toei, Bangkok,... \n", 424 | "\n", 425 | " title.ja \\\n", 426 | "0 For Rent 5 Beds コンド in Khlong Toei, Bangkok, T... \n", 427 | "\n", 428 | " title.ru \\\n", 429 | "0 В аренду: Кондо с 5 спальнями в районе Khlong ... 
\n", 430 | "\n", 431 | " title.th token transaction \n", 432 | "0 ให้เช่า คอนโด 5 ห้องนอน คลองเตย กรุงเทพฯ AAFBALPC [rent] \n", 433 | "\n", 434 | "[1 rows x 75 columns]" 435 | ] 436 | }, 437 | "execution_count": 8, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "from pandas.io.json import json_normalize\n", 444 | "\n", 445 | "in_json = '../data/AAFBALPC.json'\n", 446 | "\n", 447 | "def convert_json(in_file):\n", 448 | " with open(in_file) as json_data:\n", 449 | " data = json.load(json_data)\n", 450 | " del data['formatted']\n", 451 | " df = json_normalize(data)\n", 452 | " return df\n", 453 | "\n", 454 | "%time %memit df = convert_json(in_json)\n", 455 | "df.head()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "---\n", 463 | "\n", 464 | "## 4.3 HDF5\n", 465 | "\n", 466 | "HDF5 is a unique file format. We can include multiple other-format files into a single HDF5 file, and used a key to index them. Therefore, we can save space and reading speed of multiple files." 
467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 9, 472 | "metadata": { 473 | "ExecuteTime": { 474 | "end_time": "2019-01-06T11:08:35.812367Z", 475 | "start_time": "2019-01-06T11:08:35.805229Z" 476 | } 477 | }, 478 | "outputs": [], 479 | "source": [ 480 | "def hdf2df(in_hdf, hdf_keys):\n", 481 | " \"\"\"\n", 482 | " Read a hdf5 file and return all dfs\n", 483 | " :param in_hdf: a hdf5 file\n", 484 | " :param hdf_keys:\n", 485 | " :return a dict of df\n", 486 | " \"\"\"\n", 487 | " return {i: pd.read_hdf(in_hdf, i) for i in hdf_keys}\n", 488 | "\n", 489 | "\n", 490 | "def df2hdf(out_hdf, dfs, hdf_keys, mode='a'):\n", 491 | " \"\"\"\n", 492 | " Store single or multiple dfs to one hdf5 file\n", 493 | " :param dfs: single of multiple dfs\n", 494 | " :param out_hdf: the output file\n", 495 | " :param hdf_keys: [key for hdf]\n", 496 | " \"\"\"\n", 497 | " for j, k in zip(dfs, hdf_keys):\n", 498 | " j.to_hdf(out_hdf, k, table=True, mode=mode)" 499 | ] 500 | }, 501 | { 502 | "cell_type": "markdown", 503 | "metadata": {}, 504 | "source": [ 505 | "---\n", 506 | "\n", 507 | "## 4.4 MySQL\n", 508 | "\n", 509 | "MySQL is one of the most popular databases, and `pandas` can easily read the data from it with the help of another Python library `sqlalchemy`.\n", 510 | "\n", 511 | "### 4.4.1 Read MySQL table to DataFrame\n", 512 | "\n", 513 | "1. use `sqlalchemy` to make a MySQL connection.\n", 514 | "\n", 515 | "2. give a SQL query to pandas, and query from the created connection.\n", 516 | "\n", 517 | "\n", 518 | "### 4.4.2 Write DataFrame to MySQL\n", 519 | "\n", 520 | "1. make MySQL connection,\n", 521 | "2. 
write DataFrame to MySQL" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 10, 527 | "metadata": { 528 | "ExecuteTime": { 529 | "end_time": "2019-01-06T11:08:35.940990Z", 530 | "start_time": "2019-01-06T11:08:35.817447Z" 531 | } 532 | }, 533 | "outputs": [], 534 | "source": [ 535 | "from sqlalchemy import create_engine\n", 536 | "def connect_db(host):\n", 537 | " return create_engine(host)\n", 538 | "\n", 539 | "def mysql2df(sql, con):\n", 540 | " \"\"\"\n", 541 | " pull data from SQl to dataframe\n", 542 | " :param sql: sql query\n", 543 | " :param con: sql connection\n", 544 | " :return: df\n", 545 | " \"\"\"\n", 546 | " return pd.read_sql_query(sql=sql, con=con)\n", 547 | " \n", 548 | " \n", 549 | "def df2mysql(df, table_name, con, if_exist):\n", 550 | " \"\"\"\n", 551 | " save df to sql\n", 552 | " :param df:\n", 553 | " :param table_name: sql table name\n", 554 | " :param con: sql connection\n", 555 | " :param if_exist: append if existed\n", 556 | " :return:\n", 557 | " \"\"\"\n", 558 | " df.to_sql(table_name, con, if_exists=if_exist, index=False) " 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "---\n", 566 | "\n", 567 | "## 4.5 Excel\n", 568 | "\n", 569 | "Excel is one of the most common data file formats, and pandas can handle it as well." 
570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 11, 575 | "metadata": { 576 | "ExecuteTime": { 577 | "end_time": "2019-01-06T11:08:36.413028Z", 578 | "start_time": "2019-01-06T11:08:35.943521Z" 579 | } 580 | }, 581 | "outputs": [ 582 | { 583 | "name": "stdout", 584 | "output_type": "stream", 585 | "text": [ 586 | "peak memory: 189.64 MiB, increment: 9.37 MiB\n", 587 | "CPU times: user 245 ms, sys: 66.8 ms, total: 312 ms\n", 588 | "Wall time: 458 ms\n" 589 | ] 590 | } 591 | ], 592 | "source": [ 593 | "def df2excel(df, out_excel):\n", 594 | " writer = pd.ExcelWriter(out_excel)\n", 595 | " df.to_excel(writer,'Sheet1')\n", 596 | " writer.save()\n", 597 | "\n", 598 | "out_excel = '../data/test.xlsx'\n", 599 | "%time %memit df2excel(nasdaq, out_excel)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "markdown", 604 | "metadata": {}, 605 | "source": [ 606 | "---\n", 607 | "\n", 608 | "## 4.6 Benchmark of reading / writing large files with pandas\n", 609 | "\n", 610 | "This benchmark was run on a Google full name count file: 2 columns * 25,891,901 rows.\n", 611 | "\n", 612 | "\n", 613 | "\n", 614 | "---\n", 615 | "\n", 616 | "The conclusion is that __Parquet__ uses the least time to read and write, requires least time to read, and the output size is the smallest, although it requires the most time to write.\n", 617 | "\n", 618 | "### 4.6.1 Converting between DataFrame and Arrow table\n", 619 | "\n", 620 | "> Apache Arrow is a cross-language development platform for in-memory data. \n", 621 | "\n", 622 | "> It specifies a standardized language-independent columnar memory format for flat and hierarchical data, organized for efficient analytic operations on modern hardware. \n", 623 | "\n", 624 | "> It also provides computational libraries and zero-copy streaming messaging and interprocess communication. 
" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 12, 630 | "metadata": { 631 | "ExecuteTime": { 632 | "end_time": "2019-01-06T11:08:36.937389Z", 633 | "start_time": "2019-01-06T11:08:36.418748Z" 634 | } 635 | }, 636 | "outputs": [ 637 | { 638 | "name": "stdout", 639 | "output_type": "stream", 640 | "text": [ 641 | "peak memory: 192.28 MiB, increment: 0.39 MiB\n", 642 | "CPU times: user 58.8 ms, sys: 46.3 ms, total: 105 ms\n", 643 | "Wall time: 224 ms\n", 644 | "peak memory: 184.56 MiB, increment: -7.65 MiB\n", 645 | "CPU times: user 67.5 ms, sys: 87 ms, total: 154 ms\n", 646 | "Wall time: 246 ms\n" 647 | ] 648 | } 649 | ], 650 | "source": [ 651 | "import pyarrow as pa\n", 652 | "\n", 653 | "%time %memit table = pa.Table.from_pandas(first_name)\n", 654 | "%time %memit df_new = table.to_pandas()" 655 | ] 656 | }, 657 | { 658 | "cell_type": "markdown", 659 | "metadata": {}, 660 | "source": [ 661 | "---\n", 662 | "\n", 663 | "### 4.6.2 Fastest way to write a DataFrame to disk\n", 664 | "\n", 665 | "Write a DataFrame to Parquet without compression with the `pyarrow` lib:\n", 666 | "\n", 667 | "1. convert DataFrame to Arrow table\n", 668 | "2. 
write table to Parquet on disk" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 13, 674 | "metadata": { 675 | "ExecuteTime": { 676 | "end_time": "2019-01-06T11:08:38.676088Z", 677 | "start_time": "2019-01-06T11:08:36.942137Z" 678 | } 679 | }, 680 | "outputs": [ 681 | { 682 | "name": "stdout", 683 | "output_type": "stream", 684 | "text": [ 685 | "peak memory: 170.10 MiB, increment: 2.57 MiB\n", 686 | "CPU times: user 1.47 s, sys: 71.8 ms, total: 1.54 s\n", 687 | "Wall time: 1.73 s\n" 688 | ] 689 | } 690 | ], 691 | "source": [ 692 | "%time %memit first_name.to_csv(out_csv, header=True, index=False)" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": 14, 698 | "metadata": { 699 | "ExecuteTime": { 700 | "end_time": "2019-01-06T11:08:39.051013Z", 701 | "start_time": "2019-01-06T11:08:38.679447Z" 702 | } 703 | }, 704 | "outputs": [ 705 | { 706 | "name": "stdout", 707 | "output_type": "stream", 708 | "text": [ 709 | "peak memory: 150.27 MiB, increment: 6.93 MiB\n", 710 | "CPU times: user 161 ms, sys: 69.6 ms, total: 231 ms\n", 711 | "Wall time: 351 ms\n" 712 | ] 713 | } 714 | ], 715 | "source": [ 716 | "import pyarrow.parquet as pq\n", 717 | "out_pq = '../data/test.pq'\n", 718 | "\n", 719 | "def df_parquet(df, out_pq):\n", 720 | " table = pa.Table.from_pandas(df)\n", 721 | " pq.write_table(table, out_pq, compression='none')\n", 722 | "\n", 723 | "\n", 724 | "%time %memit df_parquet(first_name, out_pq)" 725 | ] 726 | }, 727 | { 728 | "cell_type": "markdown", 729 | "metadata": {}, 730 | "source": [ 731 | "---\n", 732 | "\n", 733 | "### 4.6.3 Fastest way to read a file to a DataFrame\n", 734 | "\n", 735 | "Read an uncompressed Parquet file to a DataFrame with the `pyarrow` lib:\n", 736 | "\n", 737 | "1. read Parquet file to Arrow table\n", 738 | "2. 
convert table to pandas DataFrame" 739 | ] 740 | }, 741 | { 742 | "cell_type": "code", 743 | "execution_count": 15, 744 | "metadata": { 745 | "ExecuteTime": { 746 | "end_time": "2019-01-06T11:08:39.510881Z", 747 | "start_time": "2019-01-06T11:08:39.055755Z" 748 | } 749 | }, 750 | "outputs": [ 751 | { 752 | "name": "stdout", 753 | "output_type": "stream", 754 | "text": [ 755 | "peak memory: 160.75 MiB, increment: 14.10 MiB\n", 756 | "CPU times: user 230 ms, sys: 87.7 ms, total: 318 ms\n", 757 | "Wall time: 446 ms\n" 758 | ] 759 | } 760 | ], 761 | "source": [ 762 | "%time %memit first_name = pd.read_csv(in_csv, engine='c')" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": 16, 768 | "metadata": { 769 | "ExecuteTime": { 770 | "end_time": "2019-01-06T11:08:39.771942Z", 771 | "start_time": "2019-01-06T11:08:39.519861Z" 772 | } 773 | }, 774 | "outputs": [ 775 | { 776 | "name": "stdout", 777 | "output_type": "stream", 778 | "text": [ 779 | "peak memory: 162.25 MiB, increment: 13.16 MiB\n", 780 | "CPU times: user 97.6 ms, sys: 90.9 ms, total: 188 ms\n", 781 | "Wall time: 239 ms\n" 782 | ] 783 | } 784 | ], 785 | "source": [ 786 | "def parquet_df(in_pq):\n", 787 | " table = pq.read_table(in_pq)\n", 788 | " return table.to_pandas()\n", 789 | "\n", 790 | "in_pq = '../data/test.pq'\n", 791 | "\n", 792 | "%time %memit parquet_df(in_pq)" 793 | ] 794 | }, 795 | { 796 | "cell_type": "markdown", 797 | "metadata": {}, 798 | "source": [ 799 | "## 4.7 Exercise\n", 800 | "\n", 801 | "1. Read the comprehensive introduction of Pandas IO tools [here](http://pandas.pydata.org/pandas-docs/stable/io.html).\n", 802 | "\n", 803 | "2. Find out how to read an Excel file to pandas DataFrame.\n", 804 | "\n", 805 | "3. Test all solutions in the benchmark table." 
806 | ] 807 | } 808 | ], 809 | "metadata": { 810 | "kernelspec": { 811 | "display_name": "Python 3", 812 | "language": "python", 813 | "name": "python3" 814 | }, 815 | "language_info": { 816 | "codemirror_mode": { 817 | "name": "ipython", 818 | "version": 3 819 | }, 820 | "file_extension": ".py", 821 | "mimetype": "text/x-python", 822 | "name": "python", 823 | "nbconvert_exporter": "python", 824 | "pygments_lexer": "ipython3", 825 | "version": "3.7.2" 826 | }, 827 | "toc": { 828 | "base_numbering": 1, 829 | "nav_menu": {}, 830 | "number_sections": false, 831 | "sideBar": true, 832 | "skip_h1_title": false, 833 | "title_cell": "Table of Contents", 834 | "title_sidebar": "Contents", 835 | "toc_cell": false, 836 | "toc_position": {}, 837 | "toc_section_display": true, 838 | "toc_window_display": false 839 | } 840 | }, 841 | "nbformat": 4, 842 | "nbformat_minor": 2 843 | } 844 | -------------------------------------------------------------------------------- /Pandas 02 - Series.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas 02 - Series\n", 8 | "\n", 9 | "by Nova@Douban\n", 10 | "\n", 11 | "The video record of this session is here: https://zoom.us/recording/share/hMxWmW7CRemL7wT8495JdIDXFCiyU6TAkMO4fL7J9GOwIumekTziMw?startTime=1546170816000\n", 12 | "\n", 13 | "---\n", 14 | "\n", 15 | "## 2.1 The Series object\n", 16 | "\n", 17 | "### 2.1.1 Concept\n", 18 | "\n", 19 | "pandas Series:\n", 20 | "\n", 21 | "1. represents a one-dimensional labeled indexed array;\n", 22 | "\n", 23 | "2. 
deviates from NumPy arrays by adding an index.\n", 24 | "\n", 25 | "### 2.1.2 Examples of pandas Series" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 1, 31 | "metadata": { 32 | "ExecuteTime": { 33 | "end_time": "2018-12-31T01:54:56.702203Z", 34 | "start_time": "2018-12-31T01:54:56.063833Z" 35 | } 36 | }, 37 | "outputs": [ 38 | { 39 | "data": { 40 | "text/plain": [ 41 | "array([ 0.46676643, -0.15545763, 0.44000794, 1.88418346, 1.35743695,\n", 42 | " -0.92247118])" 43 | ] 44 | }, 45 | "execution_count": 1, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "import numpy as np\n", 52 | "import pandas as pd\n", 53 | "\n", 54 | "aray = np.random.randn(6)\n", 55 | "aray" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 2, 61 | "metadata": { 62 | "ExecuteTime": { 63 | "end_time": "2018-12-31T01:54:56.759244Z", 64 | "start_time": "2018-12-31T01:54:56.707515Z" 65 | } 66 | }, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | "m 0.466766\n", 72 | "a -0.155458\n", 73 | "f 0.440008\n", 74 | "9 1.884183\n", 75 | "h 1.357437\n", 76 | "l -0.922471\n", 77 | "dtype: float64" 78 | ] 79 | }, 80 | "execution_count": 2, 81 | "metadata": {}, 82 | "output_type": "execute_result" 83 | } 84 | ], 85 | "source": [ 86 | "srs = pd.Series(aray, index = ['m', 'a', 'f', '9', 'h', 'l'])\n", 87 | "srs" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 3, 93 | "metadata": { 94 | "ExecuteTime": { 95 | "end_time": "2018-12-31T01:54:56.784106Z", 96 | "start_time": "2018-12-31T01:54:56.765979Z" 97 | } 98 | }, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "[ 0.46676643 -0.15545763 0.44000794 1.88418346 1.35743695 -0.92247118]\n", 105 | "Index(['m', 'a', 'f', '9', 'h', 'l'], dtype='object')\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "print(srs.values)\n", 111 | "print(srs.index)" 112 | ] 113 | }, 114 | { 115 | 
"cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "---\n", 119 | "\n", 120 | "\n", 121 | "## 2.2 Creating Series\n", 122 | "\n", 123 | "### 2.2.1 Creating from other data structures\n", 124 | "\n", 125 | "A Series can be created and initialized by passing \n", 126 | "\n", 127 | "1. a scalar value, \n", 128 | "2. a NumPy ndarray,\n", 129 | "3. a Python list, \n", 130 | "4. a Python Dict,\n", 131 | "\n", 132 | "### 2.2.2 Examples of creating Series" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 4, 138 | "metadata": { 139 | "ExecuteTime": { 140 | "end_time": "2018-12-31T01:54:56.797762Z", 141 | "start_time": "2018-12-31T01:54:56.786810Z" 142 | } 143 | }, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | " 1.278977 a\n", 149 | "-1.642189 a\n", 150 | " 0.300328 a\n", 151 | " 1.407208 a\n", 152 | " 1.018008 a\n", 153 | " 0.101313 a\n", 154 | "dtype: object" 155 | ] 156 | }, 157 | "execution_count": 4, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "# from a scaler value\n", 164 | "ind = np.random.randn(6)\n", 165 | "pd.Series('a', index=ind)" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 5, 171 | "metadata": { 172 | "ExecuteTime": { 173 | "end_time": "2018-12-31T01:54:56.828669Z", 174 | "start_time": "2018-12-31T01:54:56.807395Z" 175 | } 176 | }, 177 | "outputs": [ 178 | { 179 | "data": { 180 | "text/plain": [ 181 | "0 -1.180448\n", 182 | "1 0.274229\n", 183 | "2 -1.972792\n", 184 | "3 -0.108837\n", 185 | "4 -0.255784\n", 186 | "5 1.419131\n", 187 | "dtype: float64" 188 | ] 189 | }, 190 | "execution_count": 5, 191 | "metadata": {}, 192 | "output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "# from a numpy ndarray\n", 197 | "aray = np.random.randn(6)\n", 198 | "pd.Series(aray)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 6, 204 | "metadata": { 205 | "ExecuteTime": 
{ 206 | "end_time": "2018-12-31T01:54:56.844589Z", 207 | "start_time": "2018-12-31T01:54:56.835249Z" 208 | } 209 | }, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "0 0\n", 215 | "1 1\n", 216 | "2 3\n", 217 | "3 89\n", 218 | "dtype: int64" 219 | ] 220 | }, 221 | "execution_count": 6, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | } 225 | ], 226 | "source": [ 227 | "# from a list\n", 228 | "lst = [0, 1, 3, 89]\n", 229 | "pd.Series(lst)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 7, 235 | "metadata": { 236 | "ExecuteTime": { 237 | "end_time": "2018-12-31T01:54:56.859083Z", 238 | "start_time": "2018-12-31T01:54:56.848838Z" 239 | } 240 | }, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "a 9.1\n", 246 | "i 0.0\n", 247 | "dtype: float64" 248 | ] 249 | }, 250 | "execution_count": 7, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "# from a dict\n", 257 | "dic = {'a': 9.1, 'i': 0}\n", 258 | "pd.Series(dic)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "---\n", 266 | "\n", 267 | "### 2.2.3 Index and values of Series\n", 268 | "\n", 269 | "1. By default, the Series object will construct an index automatically using integer values.\n", 270 | "\n", 271 | "\n", 272 | "2. To specify the index, use the index parameter of the constructor.\n", 273 | "\n", 274 | "\n", 275 | "3. 
A Series created with scaler value allows you to apply an operation and a single value across all elements of a Series.\n", 276 | "\n", 277 | "### 2.2.4 Examples of index and values of Series" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 8, 283 | "metadata": { 284 | "ExecuteTime": { 285 | "end_time": "2018-12-31T01:54:56.889714Z", 286 | "start_time": "2018-12-31T01:54:56.862232Z" 287 | } 288 | }, 289 | "outputs": [ 290 | { 291 | "data": { 292 | "text/plain": [ 293 | " 0.284722 0\n", 294 | "-0.761072 1\n", 295 | "-1.104793 3\n", 296 | " 0.730485 89\n", 297 | "dtype: int64" 298 | ] 299 | }, 300 | "execution_count": 8, 301 | "metadata": {}, 302 | "output_type": "execute_result" 303 | } 304 | ], 305 | "source": [ 306 | "# Set the index when creating the Series\n", 307 | "srs = pd.Series(lst, index = np.random.randn(4))\n", 308 | "srs" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 9, 314 | "metadata": { 315 | "ExecuteTime": { 316 | "end_time": "2018-12-31T01:54:56.916903Z", 317 | "start_time": "2018-12-31T01:54:56.897158Z" 318 | } 319 | }, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "array([ 0, 1, 3, 89])" 325 | ] 326 | }, 327 | "execution_count": 9, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "# get the values of srs\n", 334 | "srs.values" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 10, 340 | "metadata": { 341 | "ExecuteTime": { 342 | "end_time": "2018-12-31T01:54:56.946243Z", 343 | "start_time": "2018-12-31T01:54:56.921324Z" 344 | } 345 | }, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | " 0.284722 5\n", 352 | "-0.761072 5\n", 353 | "-1.104793 5\n", 354 | " 0.730485 5\n", 355 | "dtype: int64\n", 356 | "\n", 357 | " 0.284722 0\n", 358 | "-0.761072 5\n", 359 | "-1.104793 15\n", 360 | " 0.730485 445\n", 361 | "dtype: int64\n", 362 | "\n", 
363 | " 0.284722 0\n", 364 | "-0.761072 5\n", 365 | "-1.104793 15\n", 366 | " 0.730485 445\n", 367 | "dtype: int64\n" 368 | ] 369 | } 370 | ], 371 | "source": [ 372 | "# A Series created from a scaler value is useful\n", 373 | "scaler = pd.Series(5, index=srs.index)\n", 374 | "print(scaler)\n", 375 | "print()\n", 376 | "print(srs * scaler)\n", 377 | "print()\n", 378 | "print(srs * 5)" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "---\n", 386 | "\n", 387 | "## 2.3 Accessing Series\n", 388 | "\n", 389 | "1. `pd.Series.size()`: return the number of elements in the underlying data;\n", 390 | "2. `pd.Series.shape`: return a tuple of the shape of the underlying data;\n", 391 | "3. `pd.Series.unique()`: return unique values of Series object;\n", 392 | "4. `pd.Series.count()`: return number of non-NA/null observations in the Series;\n", 393 | "5. `pd.Series.head()`: return the first `n` rows;\n", 394 | "6. `pd.Series.tail()`: return the last `n` rows;\n", 395 | "7. 
`pd.Series.take()`: return the elements in the given *positional* indices along an axis;" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 11, 401 | "metadata": { 402 | "ExecuteTime": { 403 | "end_time": "2018-12-31T01:54:56.962751Z", 404 | "start_time": "2018-12-31T01:54:56.951383Z" 405 | } 406 | }, 407 | "outputs": [ 408 | { 409 | "data": { 410 | "text/plain": [ 411 | "4" 412 | ] 413 | }, 414 | "execution_count": 11, 415 | "metadata": {}, 416 | "output_type": "execute_result" 417 | } 418 | ], 419 | "source": [ 420 | "srs.size" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 12, 426 | "metadata": { 427 | "ExecuteTime": { 428 | "end_time": "2018-12-31T01:54:56.988077Z", 429 | "start_time": "2018-12-31T01:54:56.967448Z" 430 | } 431 | }, 432 | "outputs": [ 433 | { 434 | "data": { 435 | "text/plain": [ 436 | "(4,)" 437 | ] 438 | }, 439 | "execution_count": 12, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "srs.shape" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 13, 451 | "metadata": { 452 | "ExecuteTime": { 453 | "end_time": "2018-12-31T01:54:57.025706Z", 454 | "start_time": "2018-12-31T01:54:56.995758Z" 455 | } 456 | }, 457 | "outputs": [ 458 | { 459 | "data": { 460 | "text/plain": [ 461 | "array([ 0, 1, 3, 89])" 462 | ] 463 | }, 464 | "execution_count": 13, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "srs.unique()" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 14, 476 | "metadata": { 477 | "ExecuteTime": { 478 | "end_time": "2018-12-31T01:54:57.053575Z", 479 | "start_time": "2018-12-31T01:54:57.030985Z" 480 | } 481 | }, 482 | "outputs": [ 483 | { 484 | "data": { 485 | "text/plain": [ 486 | "4" 487 | ] 488 | }, 489 | "execution_count": 14, 490 | "metadata": {}, 491 | "output_type": "execute_result" 492 | } 493 | ], 494 | "source": [ 495 | 
"srs.count()" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 15, 501 | "metadata": { 502 | "ExecuteTime": { 503 | "end_time": "2018-12-31T01:54:57.074672Z", 504 | "start_time": "2018-12-31T01:54:57.061985Z" 505 | } 506 | }, 507 | "outputs": [ 508 | { 509 | "data": { 510 | "text/plain": [ 511 | " 0.284722 0\n", 512 | "-0.761072 1\n", 513 | "dtype: int64" 514 | ] 515 | }, 516 | "execution_count": 15, 517 | "metadata": {}, 518 | "output_type": "execute_result" 519 | } 520 | ], 521 | "source": [ 522 | "srs.head(2)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 16, 528 | "metadata": { 529 | "ExecuteTime": { 530 | "end_time": "2018-12-31T01:54:57.107392Z", 531 | "start_time": "2018-12-31T01:54:57.085274Z" 532 | } 533 | }, 534 | "outputs": [ 535 | { 536 | "data": { 537 | "text/plain": [ 538 | "-1.104793 3\n", 539 | " 0.730485 89\n", 540 | "dtype: int64" 541 | ] 542 | }, 543 | "execution_count": 16, 544 | "metadata": {}, 545 | "output_type": "execute_result" 546 | } 547 | ], 548 | "source": [ 549 | "srs.tail(2)" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 17, 555 | "metadata": { 556 | "ExecuteTime": { 557 | "end_time": "2018-12-31T01:54:57.129753Z", 558 | "start_time": "2018-12-31T01:54:57.117178Z" 559 | } 560 | }, 561 | "outputs": [ 562 | { 563 | "data": { 564 | "text/plain": [ 565 | " 0.284722 0\n", 566 | "-0.761072 1\n", 567 | "dtype: int64" 568 | ] 569 | }, 570 | "execution_count": 17, 571 | "metadata": {}, 572 | "output_type": "execute_result" 573 | } 574 | ], 575 | "source": [ 576 | "srs.take([0, 1], axis=0)" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": {}, 582 | "source": [ 583 | "---\n", 584 | "\n", 585 | "## 2.4 More about alignment\n", 586 | "\n", 587 | "### 2.4.1 Always start with alignment\n", 588 | "\n", 589 | "The computing between multiple Series always start with alignment." 
590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 18, 595 | "metadata": { 596 | "ExecuteTime": { 597 | "end_time": "2018-12-31T01:55:13.395623Z", 598 | "start_time": "2018-12-31T01:54:57.135720Z" 599 | } 600 | }, 601 | "outputs": [ 602 | { 603 | "name": "stdout", 604 | "output_type": "stream", 605 | "text": [ 606 | "97.1 µs ± 16.2 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n", 607 | "102 µs ± 27.4 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" 608 | ] 609 | } 610 | ], 611 | "source": [ 612 | "# A Series * scaler values VS. vectorization\n", 613 | "scaler = pd.Series(5, index=srs.index)\n", 614 | "%timeit srs * scaler\n", 615 | "%timeit srs * 5" 616 | ] 617 | }, 618 | { 619 | "cell_type": "markdown", 620 | "metadata": {}, 621 | "source": [ 622 | "---\n", 623 | "\n", 624 | "### 2.4.2 Repeated labels in index\n", 625 | "\n", 626 | "If there are repeated labels in the index, the result will be surprising.\n", 627 | "\n", 628 | " Cartesian product: an index having duplicate labels will result in a number of index labels equivalent to the products of the number of the labels in each Series." 
629 | ] 630 | }, 631 | { 632 | "cell_type": "code", 633 | "execution_count": 19, 634 | "metadata": { 635 | "ExecuteTime": { 636 | "end_time": "2018-12-31T01:55:13.423295Z", 637 | "start_time": "2018-12-31T01:55:13.401712Z" 638 | } 639 | }, 640 | "outputs": [ 641 | { 642 | "name": "stdout", 643 | "output_type": "stream", 644 | "text": [ 645 | "1 1.026273\n", 646 | "2 1.339302\n", 647 | "2 1.551478\n", 648 | "2 0.327764\n", 649 | "2 0.539940\n", 650 | "3 3.797827\n", 651 | "dtype: float64\n", 652 | "\n", 653 | "1 0.007499\n", 654 | "2 0.788075\n", 655 | "2 -0.223463\n", 656 | "3 2.548161\n", 657 | "dtype: float64\n", 658 | "\n", 659 | "{1: 0.007498765264798723, 2: -0.22346329852722358, 3: 2.548161210676347}\n" 660 | ] 661 | } 662 | ], 663 | "source": [ 664 | "ind = [1, 2, 2, 3]\n", 665 | "s1 = pd.Series(np.random.randn(4), index=ind)\n", 666 | "s2 = pd.Series(np.random.randn(4), index=reversed(ind))\n", 667 | "print(s1+s2)\n", 668 | "print()\n", 669 | "print(s1)\n", 670 | "print()\n", 671 | "print(s1.to_dict())" 672 | ] 673 | }, 674 | { 675 | "cell_type": "markdown", 676 | "metadata": {}, 677 | "source": [ 678 | "---\n", 679 | "\n", 680 | "## 2.5 Boolean selection\n", 681 | "\n", 682 | "\n", 683 | "1. Boolean selection produces a new Series with a copy of index and value for the selected rows.\n", 684 | "\n", 685 | "\n", 686 | "2. With the `[]` operator, Boolean selection can get the values of the original Series.\n", 687 | "\n", 688 | "\n", 689 | "3. Chain selection does not work with Series, instead, put parentheses around logical conditions and use '|' and '&'." 
690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": 20, 695 | "metadata": { 696 | "ExecuteTime": { 697 | "end_time": "2018-12-31T01:55:13.448878Z", 698 | "start_time": "2018-12-31T01:55:13.432424Z" 699 | } 700 | }, 701 | "outputs": [ 702 | { 703 | "name": "stdout", 704 | "output_type": "stream", 705 | "text": [ 706 | "1 True\n", 707 | "2 True\n", 708 | "2 False\n", 709 | "3 True\n", 710 | "dtype: bool\n", 711 | "\n" 712 | ] 713 | } 714 | ], 715 | "source": [ 716 | "ss = (s1 > 0)\n", 717 | "print(ss)\n", 718 | "print(type(ss))" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 21, 724 | "metadata": { 725 | "ExecuteTime": { 726 | "end_time": "2018-12-31T01:55:13.466712Z", 727 | "start_time": "2018-12-31T01:55:13.456102Z" 728 | } 729 | }, 730 | "outputs": [ 731 | { 732 | "data": { 733 | "text/plain": [ 734 | "1 0.007499\n", 735 | "2 0.788075\n", 736 | "3 2.548161\n", 737 | "dtype: float64" 738 | ] 739 | }, 740 | "execution_count": 21, 741 | "metadata": {}, 742 | "output_type": "execute_result" 743 | } 744 | ], 745 | "source": [ 746 | "s1[(s1 > 0)]" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": 22, 752 | "metadata": { 753 | "ExecuteTime": { 754 | "end_time": "2018-12-31T01:55:13.484357Z", 755 | "start_time": "2018-12-31T01:55:13.470986Z" 756 | } 757 | }, 758 | "outputs": [ 759 | { 760 | "data": { 761 | "text/plain": [ 762 | "False" 763 | ] 764 | }, 765 | "execution_count": 22, 766 | "metadata": {}, 767 | "output_type": "execute_result" 768 | } 769 | ], 770 | "source": [ 771 | "s1[(s1 > 0)]._is_view" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": 23, 777 | "metadata": { 778 | "ExecuteTime": { 779 | "end_time": "2018-12-31T01:55:13.511880Z", 780 | "start_time": "2018-12-31T01:55:13.488409Z" 781 | } 782 | }, 783 | "outputs": [ 784 | { 785 | "data": { 786 | "text/plain": [ 787 | "1 0.007499\n", 788 | "dtype: float64" 789 | ] 790 | }, 791 | "execution_count": 23, 
792 | "metadata": {}, 793 | "output_type": "execute_result" 794 | } 795 | ], 796 | "source": [ 797 | "# s1[(0.5 > s1 > 0)]\n", 798 | "\n", 799 | "s1[(0.5 > s1)&(s1 > 0)]" 800 | ] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": 24, 805 | "metadata": { 806 | "ExecuteTime": { 807 | "end_time": "2018-12-31T01:55:13.532020Z", 808 | "start_time": "2018-12-31T01:55:13.517926Z" 809 | } 810 | }, 811 | "outputs": [ 812 | { 813 | "data": { 814 | "text/plain": [ 815 | "1 0.007499\n", 816 | "2 0.788075\n", 817 | "3 2.548161\n", 818 | "dtype: float64" 819 | ] 820 | }, 821 | "execution_count": 24, 822 | "metadata": {}, 823 | "output_type": "execute_result" 824 | } 825 | ], 826 | "source": [ 827 | "s1[(0.5 < s1)|(s1 > 0)]" 828 | ] 829 | }, 830 | { 831 | "cell_type": "markdown", 832 | "metadata": {}, 833 | "source": [ 834 | "---\n", 835 | "\n", 836 | "## 2.6 Slicing a Series\n", 837 | "\n", 838 | "Slicing a Series is similar to slicing a list, and the result is a view, instead of a copy.\n", 839 | "\n", 840 | "If the series has n elements, then negative values for the start and end of the slice represent elements n + start through and not including n + end." 
841 | ] 842 | }, 843 | { 844 | "cell_type": "code", 845 | "execution_count": 25, 846 | "metadata": { 847 | "ExecuteTime": { 848 | "end_time": "2018-12-31T01:55:13.546357Z", 849 | "start_time": "2018-12-31T01:55:13.535770Z" 850 | } 851 | }, 852 | "outputs": [ 853 | { 854 | "name": "stdout", 855 | "output_type": "stream", 856 | "text": [ 857 | "1 0.007499\n", 858 | "2 0.788075\n", 859 | "2 -0.223463\n", 860 | "3 2.548161\n", 861 | "dtype: float64\n", 862 | "\n", 863 | "1 0.007499\n", 864 | "2 0.788075\n", 865 | "2 -0.223463\n", 866 | "3 2.548161\n", 867 | "dtype: float64\n" 868 | ] 869 | } 870 | ], 871 | "source": [ 872 | "print(s1)\n", 873 | "print()\n", 874 | "s3 = s1[0:]\n", 875 | "print(s3)" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": 26, 881 | "metadata": { 882 | "ExecuteTime": { 883 | "end_time": "2018-12-31T01:55:13.575589Z", 884 | "start_time": "2018-12-31T01:55:13.554323Z" 885 | } 886 | }, 887 | "outputs": [ 888 | { 889 | "name": "stdout", 890 | "output_type": "stream", 891 | "text": [ 892 | "True\n", 893 | "\n", 894 | "False\n" 895 | ] 896 | } 897 | ], 898 | "source": [ 899 | "print(s3._is_view)\n", 900 | "print()\n", 901 | "print(s3.copy()._is_view)" 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": 27, 907 | "metadata": { 908 | "ExecuteTime": { 909 | "end_time": "2018-12-31T01:55:13.595081Z", 910 | "start_time": "2018-12-31T01:55:13.580618Z" 911 | } 912 | }, 913 | "outputs": [ 914 | { 915 | "data": { 916 | "text/plain": [ 917 | "3 2.548161\n", 918 | "dtype: float64" 919 | ] 920 | }, 921 | "execution_count": 27, 922 | "metadata": {}, 923 | "output_type": "execute_result" 924 | } 925 | ], 926 | "source": [ 927 | "s3 = s1[-1:]\n", 928 | "s3" 929 | ] 930 | }, 931 | { 932 | "cell_type": "code", 933 | "execution_count": 28, 934 | "metadata": { 935 | "ExecuteTime": { 936 | "end_time": "2018-12-31T01:55:13.614137Z", 937 | "start_time": "2018-12-31T01:55:13.604408Z" 938 | } 939 | }, 940 | "outputs": [ 941 
| { 942 | "name": "stdout", 943 | "output_type": "stream", 944 | "text": [ 945 | "1 0.007499\n", 946 | "2 0.788075\n", 947 | "2 -0.223463\n", 948 | "3 2.548161\n", 949 | "dtype: float64\n", 950 | "\n", 951 | "3 2.548161\n", 952 | "2 -0.223463\n", 953 | "2 0.788075\n", 954 | "1 0.007499\n", 955 | "dtype: float64\n" 956 | ] 957 | } 958 | ], 959 | "source": [ 960 | "print(s1)\n", 961 | "print()\n", 962 | "print(s1[::-1])" 963 | ] 964 | }, 965 | { 966 | "cell_type": "markdown", 967 | "metadata": {}, 968 | "source": [ 969 | "---\n", 970 | "\n", 971 | "## 2.7 Sorting and ranking\n", 972 | "\n", 973 | "Sorting a Series can be based on indices or values, and pandas provides both solutions:\n", 974 | "\n", 975 | "`pd.Series.sort_index()`: sort a Series by row indices, and returns a new, sorted object\n", 976 | "\n", 977 | "`pd.Series.sort_values()`: sort a Series by its values\n", 978 | "\n", 979 | "\n", 980 | "Please note: any missing values are sorted to the end of the Series by default" 981 | ] 982 | }, 983 | { 984 | "cell_type": "code", 985 | "execution_count": 29, 986 | "metadata": { 987 | "ExecuteTime": { 988 | "end_time": "2018-12-31T01:55:13.647511Z", 989 | "start_time": "2018-12-31T01:55:13.619834Z" 990 | } 991 | }, 992 | "outputs": [ 993 | { 994 | "data": { 995 | "text/plain": [ 996 | "1 -0.329720\n", 997 | "2 0.083856\n", 998 | "2 1.009812\n", 999 | "9 0.325275\n", 1000 | "dtype: float64" 1001 | ] 1002 | }, 1003 | "execution_count": 29, 1004 | "metadata": {}, 1005 | "output_type": "execute_result" 1006 | } 1007 | ], 1008 | "source": [ 1009 | "ind = [9, 2, 2, 1]\n", 1010 | "s1 = pd.Series(np.random.randn(4), index=ind)\n", 1011 | "s1.sort_index()" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "code", 1016 | "execution_count": 30, 1017 | "metadata": { 1018 | "ExecuteTime": { 1019 | "end_time": "2018-12-31T01:55:13.673645Z", 1020 | "start_time": "2018-12-31T01:55:13.649936Z" 1021 | } 1022 | }, 1023 | "outputs": [ 1024 | { 1025 | "data": { 1026 | "text/plain": [ 1027 
| "1 -0.329720\n", 1028 | "2 0.083856\n", 1029 | "9 0.325275\n", 1030 | "2 1.009812\n", 1031 | "dtype: float64" 1032 | ] 1033 | }, 1034 | "execution_count": 30, 1035 | "metadata": {}, 1036 | "output_type": "execute_result" 1037 | } 1038 | ], 1039 | "source": [ 1040 | "s1.sort_values()" 1041 | ] 1042 | }, 1043 | { 1044 | "cell_type": "code", 1045 | "execution_count": 31, 1046 | "metadata": { 1047 | "ExecuteTime": { 1048 | "end_time": "2018-12-31T01:55:13.704324Z", 1049 | "start_time": "2018-12-31T01:55:13.683345Z" 1050 | } 1051 | }, 1052 | "outputs": [ 1053 | { 1054 | "data": { 1055 | "text/plain": [ 1056 | "5 1.0\n", 1057 | "2 2.0\n", 1058 | "4 2.0\n", 1059 | "0 9.0\n", 1060 | "1 NaN\n", 1061 | "3 NaN\n", 1062 | "dtype: float64" 1063 | ] 1064 | }, 1065 | "execution_count": 31, 1066 | "metadata": {}, 1067 | "output_type": "execute_result" 1068 | } 1069 | ], 1070 | "source": [ 1071 | "s3 = pd.Series([9, None, 2, None, 2, 1])\n", 1072 | "s3.sort_values()" 1073 | ] 1074 | }, 1075 | { 1076 | "cell_type": "code", 1077 | "execution_count": 32, 1078 | "metadata": { 1079 | "ExecuteTime": { 1080 | "end_time": "2018-12-31T01:55:13.720857Z", 1081 | "start_time": "2018-12-31T01:55:13.711894Z" 1082 | } 1083 | }, 1084 | "outputs": [ 1085 | { 1086 | "name": "stdout", 1087 | "output_type": "stream", 1088 | "text": [ 1089 | "0 9.0\n", 1090 | "1 NaN\n", 1091 | "2 2.0\n", 1092 | "3 NaN\n", 1093 | "4 2.0\n", 1094 | "5 1.0\n", 1095 | "dtype: float64\n", 1096 | "\n", 1097 | "0 4.0\n", 1098 | "1 NaN\n", 1099 | "2 2.5\n", 1100 | "3 NaN\n", 1101 | "4 2.5\n", 1102 | "5 1.0\n", 1103 | "dtype: float64\n" 1104 | ] 1105 | } 1106 | ], 1107 | "source": [ 1108 | "print(s3)\n", 1109 | "print()\n", 1110 | "print(s3.rank())" 1111 | ] 1112 | }, 1113 | { 1114 | "cell_type": "markdown", 1115 | "metadata": {}, 1116 | "source": [ 1117 | "---\n", 1118 | "\n", 1119 | "## 2.8 Copy VS view\n", 1120 | "\n", 1121 | "This warning often occurs when we write pandas functions:\n", 1122 | "\n", 1123 | "> 
/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/ipykernel/__main__.py:1: SettingWithCopyWarning: \n", 1124 | "A value is trying to be set on a copy of a slice from a DataFrame.\n", 1125 | "Try using .loc[row_indexer,col_indexer] = value instead\n", 1126 | "\n", 1127 | "\n", 1128 | "The fundamental cause of this warning is that we used chain indexing in pandas, which is a taboo. To solve this problem, we need to clarify copy and view in pandas first.\n", 1129 | "\n", 1130 | "View: \n", 1131 | "\n", 1132 | " 1. can be regarded as a reference to the original DataFrame / Series;\n", 1133 | " 2. the modification on the view affects the original DataFrame / Series.\n", 1134 | "\n", 1135 | "\n", 1136 | "Copy: \n", 1137 | "\n", 1138 | " 1. a new DataFrame / Series based on the original DataFrame / Series;\n", 1139 | " 2. the modification doesn't affect the original DataFrame / Series.\n", 1140 | "\n", 1141 | "\n", 1142 | "\n", 1143 | "---\n", 1144 | "\n", 1145 | "\n", 1146 | "\n", 1147 | "\n", 1148 | "---\n", 1149 | "\n", 1150 | "The chain indexing may introduce views and copies at the same time, so the original DataFrame might be affected without noticing. 
This is super dangerous!\n", 1151 | "\n", 1152 | "We can avoid chain indexing by using `pd.DataFrame.loc()` / `pd.DataFrame.iloc()`" 1153 | ] 1154 | }, 1155 | { 1156 | "cell_type": "code", 1157 | "execution_count": 33, 1158 | "metadata": { 1159 | "ExecuteTime": { 1160 | "end_time": "2018-12-31T01:55:13.738121Z", 1161 | "start_time": "2018-12-31T01:55:13.723858Z" 1162 | } 1163 | }, 1164 | "outputs": [ 1165 | { 1166 | "data": { 1167 | "text/plain": [ 1168 | "False" 1169 | ] 1170 | }, 1171 | "execution_count": 33, 1172 | "metadata": {}, 1173 | "output_type": "execute_result" 1174 | } 1175 | ], 1176 | "source": [ 1177 | "s3._is_view" 1178 | ] 1179 | }, 1180 | { 1181 | "cell_type": "code", 1182 | "execution_count": 34, 1183 | "metadata": { 1184 | "ExecuteTime": { 1185 | "end_time": "2018-12-31T01:55:13.751490Z", 1186 | "start_time": "2018-12-31T01:55:13.743247Z" 1187 | } 1188 | }, 1189 | "outputs": [ 1190 | { 1191 | "data": { 1192 | "text/plain": [ 1193 | "False" 1194 | ] 1195 | }, 1196 | "execution_count": 34, 1197 | "metadata": {}, 1198 | "output_type": "execute_result" 1199 | } 1200 | ], 1201 | "source": [ 1202 | "# Converting a view to a copy\n", 1203 | "\n", 1204 | "s4 = s3.copy()\n", 1205 | "s4._is_view" 1206 | ] 1207 | }, 1208 | { 1209 | "cell_type": "code", 1210 | "execution_count": 35, 1211 | "metadata": { 1212 | "ExecuteTime": { 1213 | "end_time": "2018-12-31T01:55:13.769675Z", 1214 | "start_time": "2018-12-31T01:55:13.753811Z" 1215 | } 1216 | }, 1217 | "outputs": [ 1218 | { 1219 | "data": { 1220 | "text/plain": [ 1221 | "True" 1222 | ] 1223 | }, 1224 | "execution_count": 35, 1225 | "metadata": {}, 1226 | "output_type": "execute_result" 1227 | } 1228 | ], 1229 | "source": [ 1230 | "s5 = s4.view()\n", 1231 | "s5._is_view" 1232 | ] 1233 | }, 1234 | { 1235 | "cell_type": "markdown", 1236 | "metadata": {}, 1237 | "source": [ 1238 | "## 2.9 Exercises\n", 1239 | "\n", 1240 | "1. 
For a detailed answer of chain indexing warning, please read [this great post](https://www.dataquest.io/blog/settingwithcopywarning/)\n", 1241 | "\n", 1242 | "2. Find the parameter settings of following pandas functions:\n", 1243 | "\n", 1244 | "`pd.Series.reindex()`\n", 1245 | "\n", 1246 | "`pd.Series.sort_values()`\n", 1247 | "\n", 1248 | "`pd.Series.sort_index()`\n", 1249 | "\n", 1250 | "`pd.Series.loc()`\n", 1251 | "\n", 1252 | "`pd.Series.iloc()`\n", 1253 | "\n", 1254 | "3. Check the result of the following functions to see if they return a copy or a view?\n", 1255 | "\n", 1256 | "`pd.Series.reindex()`\n", 1257 | "\n", 1258 | "`pd.Series.sort_values()`\n", 1259 | "\n", 1260 | "`pd.Series.sort_index()`\n", 1261 | "\n", 1262 | "`pd.Series.loc()`\n", 1263 | "\n", 1264 | "`pd.Series.iloc()`" 1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "markdown", 1269 | "metadata": {}, 1270 | "source": [ 1271 | "---\n", 1272 | "\n", 1273 | "To the rest sessions (outlines and video records), please scan the QR code below to pay.\n", 1274 | "\n", 1275 | "1. The price is 799 RMB.\n", 1276 | "2. 
Please leave your email address in the __payment comment__, so I will send you the links of the rest sessions.\n", 1277 | "\n", 1278 | "\n", 1279 | "" 1280 | ] 1281 | } 1282 | ], 1283 | "metadata": { 1284 | "kernelspec": { 1285 | "display_name": "Python 3", 1286 | "language": "python", 1287 | "name": "python3" 1288 | }, 1289 | "language_info": { 1290 | "codemirror_mode": { 1291 | "name": "ipython", 1292 | "version": 3 1293 | }, 1294 | "file_extension": ".py", 1295 | "mimetype": "text/x-python", 1296 | "name": "python", 1297 | "nbconvert_exporter": "python", 1298 | "pygments_lexer": "ipython3", 1299 | "version": "3.7.2" 1300 | }, 1301 | "toc": { 1302 | "base_numbering": "2", 1303 | "nav_menu": {}, 1304 | "number_sections": false, 1305 | "sideBar": true, 1306 | "skip_h1_title": false, 1307 | "title_cell": "Table of Contents", 1308 | "title_sidebar": "Contents", 1309 | "toc_cell": false, 1310 | "toc_position": { 1311 | "height": "calc(100% - 180px)", 1312 | "left": "10px", 1313 | "top": "150px", 1314 | "width": "295.3333435058594px" 1315 | }, 1316 | "toc_section_display": true, 1317 | "toc_window_display": false 1318 | } 1319 | }, 1320 | "nbformat": 4, 1321 | "nbformat_minor": 2 1322 | } 1323 | -------------------------------------------------------------------------------- /Pandas 01 - Basics of Pandas.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pandas 01 - Basics of Pandas\n", 8 | "\n", 9 | "by Nova@Douban\n", 10 | "\n", 11 | "The video record of this session is here: https://zoom.us/recording/share/L9Jwofdbg3CX2L4wLoPAVrHYyi0F0ok2_58ozScsXsmwIumekTziMw\n", 12 | "\n", 13 | "\n", 14 | "---\n", 15 | "\n", 16 | "## 1.1 Data Structure of pandas\n", 17 | "\n", 18 | "`pandas` significantly simplies data structures. 
If you used `R` or a relational database, you will find `pandas` very similar.\n", 19 | "\n", 20 | "### 1.1.1 Three primary data structures in pandas\n", 21 | "\n", 22 | "1. `Series` (a column):\n", 23 | "\n", 24 | "    1. A one-dimensional array-like object containing an array of data.\n", 25 | "    \n", 26 | "    2. A fixed-length, __ordered dict__.\n", 27 | "    \n", 28 | "    3. Automatically aligns differently-indexed data in operations\n", 29 | "    \n", 30 | "    4. The column returned when indexing a DataFrame is a view, not a copy.\n", 31 | "\n", 32 | "\n", 33 | "2. `DataFrame` (a collection of columns): \n", 34 | "\n", 35 | "    1. A tabular, spreadsheet-like data structure containing an ordered collection of columns;\n", 36 | "    \n", 37 | "    2. __A collection of Series__.\n", 38 | "    \n", 39 | "    \n", 40 | "3. `index`:\n", 41 | "    \n", 42 | "    1. an Index also functions as __a fixed-size set__\n", 43 | "    \n", 44 | "    2. Index objects are __immutable__ and thus can’t be modified by the user\n", 45 | "    \n", 46 | "    3. It is a class in pandas, more complicated than the one in RDS.\n", 47 | "    \n", 48 | "     a. Identification: Indices are used to locate Series / rows / items in a DataFrame. \n", 49 | "     \n", 50 | "     b. Alignment: pandas will always align with index automatically first.\n", 51 | "    \n", 52 | "     c. 
Selection: using index to select relevant columns/rows.\n", 53 | " \n", 54 | "\n", 55 | "### 1.1.2 Example of DataFrame, Series and index" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 1, 61 | "metadata": { 62 | "ExecuteTime": { 63 | "end_time": "2018-12-28T11:47:52.640421Z", 64 | "start_time": "2018-12-28T11:47:52.039021Z" 65 | } 66 | }, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | " Date Open High Low Close \\\n", 73 | "0 2018-11-23 6919.520020 6987.890137 6919.160156 6938.979980 \n", 74 | "1 2018-11-26 7026.500000 7083.930176 7003.120117 7081.850098 \n", 75 | "2 2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 \n", 76 | "3 2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 \n", 77 | "4 2018-11-29 7267.370117 7319.959961 7217.689941 7273.080078 \n", 78 | "\n", 79 | " Adj Close Volume \n", 80 | "0 6938.979980 958950000 \n", 81 | "1 7081.850098 2011180000 \n", 82 | "2 7082.700195 2067360000 \n", 83 | "3 7291.589844 2390260000 \n", 84 | "4 7273.080078 1983460000 \n", 85 | "\n", 86 | "\n" 87 | ] 88 | } 89 | ], 90 | "source": [ 91 | "# Download Nasdaq dataset: https://finance.yahoo.com/quote/%5EIXIC/history?p=%5EIXIC\n", 92 | "\n", 93 | "import pandas as pd\n", 94 | "\n", 95 | "in_file = '../data/nasdaq.csv'\n", 96 | "df = pd.read_csv(in_file, engine='c')\n", 97 | "\n", 98 | "print(df.head())\n", 99 | "print()\n", 100 | "print(type(df))" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 2, 106 | "metadata": { 107 | "ExecuteTime": { 108 | "end_time": "2018-12-28T11:47:52.656310Z", 109 | "start_time": "2018-12-28T11:47:52.644446Z" 110 | } 111 | }, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "0 2018-11-23\n", 118 | "1 2018-11-26\n", 119 | "2 2018-11-27\n", 120 | "3 2018-11-28\n", 121 | "4 2018-11-29\n", 122 | "Name: Date, dtype: object\n", 123 | "\n", 124 | "\n", 125 | "\n" 126 | ] 127 | } 
128 | ], 129 | "source": [ 130 | "print(df['Date'].head())\n", 131 | "print()\n", 132 | "print(type(df['Date']))\n", 133 | "print()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 3, 139 | "metadata": { 140 | "ExecuteTime": { 141 | "end_time": "2018-12-28T11:47:52.678080Z", 142 | "start_time": "2018-12-28T11:47:52.661673Z" 143 | } 144 | }, 145 | "outputs": [ 146 | { 147 | "data": { 148 | "text/plain": [ 149 | "RangeIndex(start=0, stop=20, step=1)" 150 | ] 151 | }, 152 | "execution_count": 3, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "df.index" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 4, 164 | "metadata": { 165 | "ExecuteTime": { 166 | "end_time": "2018-12-28T11:47:52.703980Z", 167 | "start_time": "2018-12-28T11:47:52.681052Z" 168 | } 169 | }, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,\n", 175 | " 17, 18, 19])" 176 | ] 177 | }, 178 | "execution_count": 4, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "df.index.values" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "---\n", 192 | "\n", 193 | "### 1.1.3 Two other data structure in pandas\n", 194 | "\n", 195 | "1. items:\n", 196 | "\n", 197 | " 1. The smallest unit in pandas.\n", 198 | " \n", 199 | " \n", 200 | "2. rows:\n", 201 | "\n", 202 | " 1. 
Row is not a primary data structure in pandas\n", 203 | " \n", 204 | "### 1.1.4 Examples of items and rows" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 5, 210 | "metadata": { 211 | "ExecuteTime": { 212 | "end_time": "2018-12-28T11:47:52.724236Z", 213 | "start_time": "2018-12-28T11:47:52.710937Z" 214 | } 215 | }, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "'2018-11-23'" 221 | ] 222 | }, 223 | "execution_count": 5, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "df['Date'][0]" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 6, 235 | "metadata": { 236 | "ExecuteTime": { 237 | "end_time": "2018-12-28T11:47:52.761309Z", 238 | "start_time": "2018-12-28T11:47:52.731362Z" 239 | } 240 | }, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/html": [ 245 | "
\n", 246 | "\n", 259 | "\n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | "
DateOpenHighLowCloseAdj CloseVolume
02018-11-236919.520026987.8901376919.1601566938.979986938.97998958950000
\n", 285 | "
" 286 | ], 287 | "text/plain": [ 288 | " Date Open High Low Close Adj Close \\\n", 289 | "0 2018-11-23 6919.52002 6987.890137 6919.160156 6938.97998 6938.97998 \n", 290 | "\n", 291 | " Volume \n", 292 | "0 958950000 " 293 | ] 294 | }, 295 | "execution_count": 6, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "df.loc[0:0]" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": {}, 307 | "source": [ 308 | "---\n", 309 | "\n", 310 | "## 1.2 Functions based on pandas\n", 311 | "\n", 312 | "\n", 313 | "\n", 314 | "\n", 315 | "\n", 316 | "### 1.2.1 Three levels of functions\n", 317 | "\n", 318 | "Each level of function only handles ite related levels of problems.\n", 319 | "\n", 320 | "1. DataFrame-level functions\n", 321 | "\n", 322 | "2. Series-level functions\n", 323 | "\n", 324 | "3. Item-level functions\n", 325 | "\n", 326 | "### 1.2.1 Example of different levels of pandas functions\n", 327 | "\n", 328 | "The following is a sample script to analyse logs:\n", 329 | "\n", 330 | "1. `prepare_overall_chat` is the overall wrapper;\n", 331 | "2. `clean_chat_log` is a DataFrame-level funtion;\n", 332 | "3. `get_all_mentions` and `count_active_user` are Series-level functions." 
333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 7, 338 | "metadata": { 339 | "ExecuteTime": { 340 | "end_time": "2018-12-28T11:47:52.786553Z", 341 | "start_time": "2018-12-28T11:47:52.765005Z" 342 | } 343 | }, 344 | "outputs": [], 345 | "source": [ 346 | "def prepare_overall_chat(chat_base, time, day):\n", 347 | " '''\n", 348 | " an overall wrapper\n", 349 | " '''\n", 350 | " # 'reading starts'\n", 351 | " overall = csv2pd(chat_base, time, day, HEAD_ANALYSIS, sep=',', engine='c')\n", 352 | " \n", 353 | " # 'overall'\n", 354 | " clean_overall = clean_chat_log(overall)\n", 355 | " \n", 356 | " # 'cleaned kom records'\n", 357 | " all_mentions = get_all_mentions(clean_overall, switch=False)\n", 358 | " \n", 359 | " # 'all_mentions'\n", 360 | " active_user_count = count_active_user(overall, overall['RoomName'], colname='Name', header=HEAD_ACTIVE)\n", 361 | " return clean_overall, all_mentions, active_user_count\n", 362 | "\n", 363 | " \n", 364 | "def clean_chat_log(df):\n", 365 | " '''\n", 366 | " at Dataframe level\n", 367 | " '''\n", 368 | " # 'remove null or duplicates'\n", 369 | " df = df[df.TextMsg.notnull()]\n", 370 | " df = df[df.Name.notnull()]\n", 371 | " df = df.drop_duplicates()\n", 372 | "\n", 373 | " # replace ?? 
or ** in data\n", 374 | " df.Name = df.loc[:, 'Name'].str.replace('\\?\\?|\\*\\*', '?#')\n", 375 | " df.TextMsg = df.loc[:, 'TextMsg'].str.replace('\\?\\?|\\*\\*', '?#')\n", 376 | "\n", 377 | " # 'to_uni' and 'strip new lines'\n", 378 | " df = batch_to_uni(df, col_list=['Name', 'TextMsg', 'RoomName'])\n", 379 | " df = batch_strip(df, col_list=['Name', 'RoomName'], strip_str='\\n\\r ')\n", 380 | " return df\n", 381 | "\n", 382 | " \n", 383 | "def get_all_mentions(df, switch=True):\n", 384 | " '''\n", 385 | " at Series level\n", 386 | " '''\n", 387 | " all_mentions = df[df['TextMsg'].str.contains('@')]\n", 388 | " all_mentions.MsgTime = pd.to_datetime(pd.Series(all_mentions.MsgTime)) # todo fix .loc\n", 389 | " if switch:\n", 390 | " cleaned_mentions = pd.DataFrame.copy(all_mentions)\n", 391 | " cleaned_mentions = batch_replace(cleaned_mentions, 'TextMsg', CN_PUNCS, '')\n", 392 | " return all_mentions, cleaned_mentions\n", 393 | " else:\n", 394 | " return all_mentions\n", 395 | " \n", 396 | "def count_active_user(df, col1, colname, header):\n", 397 | " '''\n", 398 | " at Series level\n", 399 | " '''\n", 400 | " active_user_count = df.groupby([col1])[colname].unique().apply(len)\n", 401 | " active_user_count = active_user_count.subtract(1) # Exclude 班长\n", 402 | " active_user_count = active_user_count.reset_index()\n", 403 | " active_user_count.columns = header\n", 404 | " active_user_count = batch_to_uni(active_user_count, ['RoomName'])\n", 405 | " return active_user_count" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "---" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "## 1.3 Summarizing and computing descriptive statistics\n", 420 | "\n", 421 | "\n", 422 | "### 1.3.1 Take a glance at the dataset\n", 423 | "\n", 424 | "__Be careful if the function should include parentheses:)__\n", 425 | "\n", 426 | "1. 
`DataFrame.describe()`: provide descriptive stats of the dataset\n", 427 | "2. `DataFrame.values`: access values of the dataset\n", 428 | "3. `DataFrame.head()`: access the head of the dataset\n", 429 | "4. `DataFrame.tail()`: access the tail of the dataset\n", 430 | "5. `DataFrame.shape`: provide the length and width of the dataset\n", 431 | "6. `DataFrame.size`: provide the product of the length and width of the dataset\n", 432 | "7. `DataFrame.columns`: provide the column names of the dataset\n", 433 | "8. `DataFrame.index`: provide the row index of the dataset\n", 434 | "9. `DataFrame.axes`: provide the column names and row index of the dataset\n", 435 | "\n", 436 | "### 1.3.2 Examples of a glance at the dataset" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 8, 442 | "metadata": { 443 | "ExecuteTime": { 444 | "end_time": "2018-12-28T11:47:52.848115Z", 445 | "start_time": "2018-12-28T11:47:52.798310Z" 446 | } 447 | }, 448 | "outputs": [ 449 | { 450 | "data": { 451 | "text/html": [ 452 | "<div>
\n", 453 | "\n", 466 | "\n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | "
OpenHighLowCloseAdj CloseVolume
count20.00000020.00000020.00000020.00000020.0000002.000000e+01
mean7036.4339847094.1150396932.4030036996.1860356996.1860352.492132e+09
std236.773607234.200045268.133301278.415894278.4158946.665222e+08
min6573.4902346586.6801766304.6298836333.0000006333.0000009.589500e+08
25%6911.2550056973.8701176842.6701666878.9726566878.9726562.186262e+09
50%7033.8649907117.4851086983.6748057051.0800787051.0800782.443730e+09
75%7142.3323977227.2050787092.3750007165.8875747165.8875742.643168e+09
max7486.1298837486.5097667392.2202157441.5097667441.5097664.534120e+09
\n", 553 | "
" 554 | ], 555 | "text/plain": [ 556 | " Open High Low Close Adj Close \\\n", 557 | "count 20.000000 20.000000 20.000000 20.000000 20.000000 \n", 558 | "mean 7036.433984 7094.115039 6932.403003 6996.186035 6996.186035 \n", 559 | "std 236.773607 234.200045 268.133301 278.415894 278.415894 \n", 560 | "min 6573.490234 6586.680176 6304.629883 6333.000000 6333.000000 \n", 561 | "25% 6911.255005 6973.870117 6842.670166 6878.972656 6878.972656 \n", 562 | "50% 7033.864990 7117.485108 6983.674805 7051.080078 7051.080078 \n", 563 | "75% 7142.332397 7227.205078 7092.375000 7165.887574 7165.887574 \n", 564 | "max 7486.129883 7486.509766 7392.220215 7441.509766 7441.509766 \n", 565 | "\n", 566 | " Volume \n", 567 | "count 2.000000e+01 \n", 568 | "mean 2.492132e+09 \n", 569 | "std 6.665222e+08 \n", 570 | "min 9.589500e+08 \n", 571 | "25% 2.186262e+09 \n", 572 | "50% 2.443730e+09 \n", 573 | "75% 2.643168e+09 \n", 574 | "max 4.534120e+09 " 575 | ] 576 | }, 577 | "execution_count": 8, 578 | "metadata": {}, 579 | "output_type": "execute_result" 580 | } 581 | ], 582 | "source": [ 583 | "df.describe()" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 9, 589 | "metadata": { 590 | "ExecuteTime": { 591 | "end_time": "2018-12-28T11:47:52.858589Z", 592 | "start_time": "2018-12-28T11:47:52.850909Z" 593 | } 594 | }, 595 | "outputs": [ 596 | { 597 | "data": { 598 | "text/plain": [ 599 | "array([['2018-11-23', 6919.52002, 6987.890137, 6919.160156, 6938.97998,\n", 600 | " 6938.97998, 958950000],\n", 601 | " ['2018-11-26', 7026.5, 7083.930176000001, 7003.120117,\n", 602 | " 7081.850098000001, 7081.850098000001, 2011180000],\n", 603 | " ['2018-11-27', 7041.22998, 7105.140137, 7014.359863, 7082.700195,\n", 604 | " 7082.700195, 2067360000],\n", 605 | " ['2018-11-28', 7135.080078, 7292.709961, 7090.97998, 7291.589844,\n", 606 | " 7291.589844, 2390260000],\n", 607 | " ['2018-11-29', 7267.370117, 7319.959961, 7217.689941, 7273.080078,\n", 608 | " 7273.080078, 
1983460000],\n", 609 | " ['2018-11-30', 7279.299805, 7332.790039, 7255.680176000001,\n", 610 | " 7330.540039, 7330.540039, 2542820000],\n", 611 | " ['2018-12-03', 7486.129883, 7486.509765999999, 7392.220215,\n", 612 | " 7441.509765999999, 7441.509765999999, 2621020000],\n", 613 | " ['2018-12-04', 7407.950195, 7421.109863, 7150.109863,\n", 614 | " 7158.430176000001, 7158.430176000001, 2635810000],\n", 615 | " ['2018-12-06', 7017.049805, 7189.52002, 6984.339844,\n", 616 | " 7188.259765999999, 7188.259765999999, 2833870000],\n", 617 | " ['2018-12-07', 7163.490234000001, 7205.370117, 6945.27002,\n", 618 | " 6969.25, 6969.25, 2475160000],\n", 619 | " ['2018-12-10', 6959.629883, 7047.620117, 6878.990234000001,\n", 620 | " 7020.52002, 7020.52002, 2367560000],\n", 621 | " ['2018-12-11', 7121.660156, 7129.830078, 6983.009765999999,\n", 622 | " 7031.830078, 7031.830078, 2246060000],\n", 623 | " ['2018-12-12', 7127.0, 7197.290039, 7096.560059, 7098.310059,\n", 624 | " 7098.310059, 2412300000],\n", 625 | " ['2018-12-13', 7135.279785, 7154.640137, 7034.819823999999,\n", 626 | " 7070.330078, 7070.330078, 2143520000],\n", 627 | " ['2018-12-14', 6986.370117, 7027.169922, 6898.990234000001,\n", 628 | " 6910.660156, 6910.660156, 2200510000],\n", 629 | " ['2018-12-17', 6886.459961, 6931.810059, 6710.009765999999,\n", 630 | " 6753.72998, 6753.72998, 2665240000],\n", 631 | " ['2018-12-18', 6809.819823999999, 6847.27002, 6733.709961,\n", 632 | " 6783.910156, 6783.910156, 2595400000],\n", 633 | " ['2018-12-19', 6777.589844, 6868.859863, 6586.5, 6636.830078,\n", 634 | " 6636.830078, 2899950000],\n", 635 | " ['2018-12-20', 6607.759765999999, 6666.200195, 6447.910156,\n", 636 | " 6528.410156, 6528.410156, 3258090000],\n", 637 | " ['2018-12-21', 6573.490234000001, 6586.680176, 6304.629883,\n", 638 | " 6333.0, 6333.0, 4534120000]], dtype=object)" 639 | ] 640 | }, 641 | "execution_count": 9, 642 | "metadata": {}, 643 | "output_type": "execute_result" 644 | } 645 | ], 646 | "source": [ 647 | 
"df.values" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": 10, 653 | "metadata": { 654 | "ExecuteTime": { 655 | "end_time": "2018-12-28T11:47:52.878993Z", 656 | "start_time": "2018-12-28T11:47:52.861182Z" 657 | } 658 | }, 659 | "outputs": [ 660 | { 661 | "data": { 662 | "text/html": [ 663 | "
\n", 664 | "\n", 677 | "\n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | "
DateOpenHighLowCloseAdj CloseVolume
02018-11-236919.5200206987.8901376919.1601566938.9799806938.979980958950000
12018-11-267026.5000007083.9301767003.1201177081.8500987081.8500982011180000
22018-11-277041.2299807105.1401377014.3598637082.7001957082.7001952067360000
32018-11-287135.0800787292.7099617090.9799807291.5898447291.5898442390260000
42018-11-297267.3701177319.9599617217.6899417273.0800787273.0800781983460000
\n", 743 | "
" 744 | ], 745 | "text/plain": [ 746 | " Date Open High Low Close \\\n", 747 | "0 2018-11-23 6919.520020 6987.890137 6919.160156 6938.979980 \n", 748 | "1 2018-11-26 7026.500000 7083.930176 7003.120117 7081.850098 \n", 749 | "2 2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 \n", 750 | "3 2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 \n", 751 | "4 2018-11-29 7267.370117 7319.959961 7217.689941 7273.080078 \n", 752 | "\n", 753 | " Adj Close Volume \n", 754 | "0 6938.979980 958950000 \n", 755 | "1 7081.850098 2011180000 \n", 756 | "2 7082.700195 2067360000 \n", 757 | "3 7291.589844 2390260000 \n", 758 | "4 7273.080078 1983460000 " 759 | ] 760 | }, 761 | "execution_count": 10, 762 | "metadata": {}, 763 | "output_type": "execute_result" 764 | } 765 | ], 766 | "source": [ 767 | "df.head()" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": 11, 773 | "metadata": { 774 | "ExecuteTime": { 775 | "end_time": "2018-12-28T11:47:52.903141Z", 776 | "start_time": "2018-12-28T11:47:52.882368Z" 777 | } 778 | }, 779 | "outputs": [ 780 | { 781 | "data": { 782 | "text/html": [ 783 | "
\n", 784 | "\n", 797 | "\n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | "
DateOpenHighLowCloseAdj CloseVolume
152018-12-176886.4599616931.8100596710.0097666753.7299806753.7299802665240000
162018-12-186809.8198246847.2700206733.7099616783.9101566783.9101562595400000
172018-12-196777.5898446868.8598636586.5000006636.8300786636.8300782899950000
182018-12-206607.7597666666.2001956447.9101566528.4101566528.4101563258090000
192018-12-216573.4902346586.6801766304.6298836333.0000006333.0000004534120000
\n", 863 | "
" 864 | ], 865 | "text/plain": [ 866 | " Date Open High Low Close \\\n", 867 | "15 2018-12-17 6886.459961 6931.810059 6710.009766 6753.729980 \n", 868 | "16 2018-12-18 6809.819824 6847.270020 6733.709961 6783.910156 \n", 869 | "17 2018-12-19 6777.589844 6868.859863 6586.500000 6636.830078 \n", 870 | "18 2018-12-20 6607.759766 6666.200195 6447.910156 6528.410156 \n", 871 | "19 2018-12-21 6573.490234 6586.680176 6304.629883 6333.000000 \n", 872 | "\n", 873 | " Adj Close Volume \n", 874 | "15 6753.729980 2665240000 \n", 875 | "16 6783.910156 2595400000 \n", 876 | "17 6636.830078 2899950000 \n", 877 | "18 6528.410156 3258090000 \n", 878 | "19 6333.000000 4534120000 " 879 | ] 880 | }, 881 | "execution_count": 11, 882 | "metadata": {}, 883 | "output_type": "execute_result" 884 | } 885 | ], 886 | "source": [ 887 | "df.tail()" 888 | ] 889 | }, 890 | { 891 | "cell_type": "code", 892 | "execution_count": 12, 893 | "metadata": { 894 | "ExecuteTime": { 895 | "end_time": "2018-12-28T11:47:52.936981Z", 896 | "start_time": "2018-12-28T11:47:52.915987Z" 897 | } 898 | }, 899 | "outputs": [ 900 | { 901 | "data": { 902 | "text/plain": [ 903 | "(20, 7)" 904 | ] 905 | }, 906 | "execution_count": 12, 907 | "metadata": {}, 908 | "output_type": "execute_result" 909 | } 910 | ], 911 | "source": [ 912 | "df.shape" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": 13, 918 | "metadata": { 919 | "ExecuteTime": { 920 | "end_time": "2018-12-28T11:47:52.949585Z", 921 | "start_time": "2018-12-28T11:47:52.941610Z" 922 | } 923 | }, 924 | "outputs": [ 925 | { 926 | "data": { 927 | "text/plain": [ 928 | "140" 929 | ] 930 | }, 931 | "execution_count": 13, 932 | "metadata": {}, 933 | "output_type": "execute_result" 934 | } 935 | ], 936 | "source": [ 937 | "df.size" 938 | ] 939 | }, 940 | { 941 | "cell_type": "code", 942 | "execution_count": 14, 943 | "metadata": { 944 | "ExecuteTime": { 945 | "end_time": "2018-12-28T11:47:52.960181Z", 946 | "start_time": 
"2018-12-28T11:47:52.952861Z" 947 | } 948 | }, 949 | "outputs": [ 950 | { 951 | "data": { 952 | "text/plain": [ 953 | "Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')" 954 | ] 955 | }, 956 | "execution_count": 14, 957 | "metadata": {}, 958 | "output_type": "execute_result" 959 | } 960 | ], 961 | "source": [ 962 | "df.columns" 963 | ] 964 | }, 965 | { 966 | "cell_type": "code", 967 | "execution_count": 15, 968 | "metadata": { 969 | "ExecuteTime": { 970 | "end_time": "2018-12-28T11:47:52.971291Z", 971 | "start_time": "2018-12-28T11:47:52.964021Z" 972 | } 973 | }, 974 | "outputs": [ 975 | { 976 | "data": { 977 | "text/plain": [ 978 | "RangeIndex(start=0, stop=20, step=1)" 979 | ] 980 | }, 981 | "execution_count": 15, 982 | "metadata": {}, 983 | "output_type": "execute_result" 984 | } 985 | ], 986 | "source": [ 987 | "df.index" 988 | ] 989 | }, 990 | { 991 | "cell_type": "code", 992 | "execution_count": 16, 993 | "metadata": { 994 | "ExecuteTime": { 995 | "end_time": "2018-12-28T11:47:52.984913Z", 996 | "start_time": "2018-12-28T11:47:52.973669Z" 997 | } 998 | }, 999 | "outputs": [ 1000 | { 1001 | "data": { 1002 | "text/plain": [ 1003 | "[RangeIndex(start=0, stop=20, step=1),\n", 1004 | " Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'], dtype='object')]" 1005 | ] 1006 | }, 1007 | "execution_count": 16, 1008 | "metadata": {}, 1009 | "output_type": "execute_result" 1010 | } 1011 | ], 1012 | "source": [ 1013 | "df.axes" 1014 | ] 1015 | }, 1016 | { 1017 | "cell_type": "code", 1018 | "execution_count": 35, 1019 | "metadata": { 1020 | "ExecuteTime": { 1021 | "end_time": "2018-12-28T12:34:54.088767Z", 1022 | "start_time": "2018-12-28T12:34:54.045968Z" 1023 | } 1024 | }, 1025 | "outputs": [ 1026 | { 1027 | "name": "stdout", 1028 | "output_type": "stream", 1029 | "text": [ 1030 | "\n", 1031 | "RangeIndex: 20 entries, 0 to 19\n", 1032 | "Data columns (total 7 columns):\n", 1033 | "Date 20 non-null object\n", 1034 | 
"Open 20 non-null float64\n", 1035 | "High 20 non-null float64\n", 1036 | "Low 20 non-null float64\n", 1037 | "Close 20 non-null float64\n", 1038 | "Adj Close 20 non-null float64\n", 1039 | "Volume 20 non-null int64\n", 1040 | "dtypes: float64(5), int64(1), object(1)\n", 1041 | "memory usage: 1.2+ KB\n" 1042 | ] 1043 | } 1044 | ], 1045 | "source": [ 1046 | "df.info()" 1047 | ] 1048 | }, 1049 | { 1050 | "cell_type": "markdown", 1051 | "metadata": {}, 1052 | "source": [ 1053 | "---\n", 1054 | "\n", 1055 | "## 1.4 Dive into index\n", 1056 | "\n", 1057 | "### 1.4.1 Index labels\n", 1058 | "\n", 1059 | "Index labels:\n", 1060 | " \n", 1061 | "1. do not need to be integers;\n", 1062 | "\n", 1063 | "2. can have repeated labels (__Be careful, this is different from dict__);\n", 1064 | "\n", 1065 | "3. can have hierarchical sets of labels.\n", 1066 | "\n", 1067 | "### 1.4.2 Examples of index labels" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "code", 1072 | "execution_count": 17, 1073 | "metadata": { 1074 | "ExecuteTime": { 1075 | "end_time": "2018-12-28T11:47:53.010408Z", 1076 | "start_time": "2018-12-28T11:47:52.990897Z" 1077 | } 1078 | }, 1079 | "outputs": [ 1080 | { 1081 | "name": "stdout", 1082 | "output_type": "stream", 1083 | "text": [ 1084 | "0 -0.281210\n", 1085 | "1 0.770726\n", 1086 | "2 -0.176266\n", 1087 | "3 -1.612378\n", 1088 | "4 -1.868139\n", 1089 | "5 -0.496955\n", 1090 | "dtype: float64\n", 1091 | "\n", 1092 | "a -0.281210\n", 1093 | "a 0.770726\n", 1094 | "a -0.176266\n", 1095 | "a -1.612378\n", 1096 | "a -1.868139\n", 1097 | "a -0.496955\n", 1098 | "dtype: float64\n", 1099 | "\n", 1100 | "MultiIndex(levels=[['a', 'b'], [-1.1125657480588875, -0.17357278036727714, -0.14227622735236414, 0.04140226853409916, 0.923176857116319, 1.2940772084264573]],\n", 1101 | " labels=[[0, 0, 0, 1, 1, 1], [4, 1, 3, 0, 5, 2]],\n", 1102 | " names=['letter', 'float'])\n", 1103 | "\n", 1104 | "letter float \n", 1105 | "a 0.923177 -0.281210\n", 1106 | " -0.173573 
0.770726\n", 1107 | " 0.041402 -0.176266\n", 1108 | "b -1.112566 -1.612378\n", 1109 | " 1.294077 -1.868139\n", 1110 | " -0.142276 -0.496955\n", 1111 | "dtype: float64\n" 1112 | ] 1113 | } 1114 | ], 1115 | "source": [ 1116 | "import numpy as np\n", 1117 | "import pandas as pd\n", 1118 | "\n", 1119 | "# The default index is int\n", 1120 | "aray = np.random.randn(6)\n", 1121 | "srs = pd.Series(aray)\n", 1122 | "print(srs)\n", 1123 | "print()\n", 1124 | "\n", 1125 | "# We can set repeated non-int labels to index\n", 1126 | "ind = ['a'] * 6\n", 1127 | "srs.index = ind\n", 1128 | "print(srs)\n", 1129 | "print()\n", 1130 | "\n", 1131 | "# We can set multi-level labels to index\n", 1132 | "ind = zip(['a'] * 3 + ['b'] * 3, np.random.randn(6))\n", 1133 | "ind = pd.MultiIndex.from_tuples(ind, names=['letter', 'float'])\n", 1134 | "srs.index = ind\n", 1135 | "print(ind)\n", 1136 | "print()\n", 1137 | "print(srs)" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "code", 1142 | "execution_count": 37, 1143 | "metadata": { 1144 | "ExecuteTime": { 1145 | "end_time": "2018-12-28T12:40:36.060328Z", 1146 | "start_time": "2018-12-28T12:40:36.027835Z" 1147 | } 1148 | }, 1149 | "outputs": [ 1150 | { 1151 | "data": { 1152 | "text/plain": [ 1153 | "a -0.281210\n", 1154 | "a 0.770726\n", 1155 | "a -0.176266\n", 1156 | "a -1.612378\n", 1157 | "a -1.868139\n", 1158 | "a -0.496955\n", 1159 | "dtype: float64" 1160 | ] 1161 | }, 1162 | "execution_count": 37, 1163 | "metadata": {}, 1164 | "output_type": "execute_result" 1165 | } 1166 | ], 1167 | "source": [ 1168 | "sr1 = pd.Series(aray, index=['a'] * 6)\n", 1169 | "sr1" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "markdown", 1174 | "metadata": {}, 1175 | "source": [ 1176 | "---\n", 1177 | "\n", 1178 | "### 1.4.3 Three major usages\n", 1179 | "\n", 1180 | "a. Identification: Indices are used to locate Series / rows / items in a DataFrame. \n", 1181 | "\n", 1182 | "b. 
Alignment: pandas will always align with index automatically first.\n", 1183 | "\n", 1184 | "c. Selection: using index to select relevant columns/rows.\n", 1185 | "\n", 1186 | "### 1.4.4 Examples of identification" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "code", 1191 | "execution_count": 38, 1192 | "metadata": { 1193 | "ExecuteTime": { 1194 | "end_time": "2018-12-28T12:44:22.736405Z", 1195 | "start_time": "2018-12-28T12:44:22.689162Z" 1196 | } 1197 | }, 1198 | "outputs": [ 1199 | { 1200 | "data": { 1201 | "text/html": [ 1202 | "
\n", 1203 | "\n", 1216 | "\n", 1217 | " \n", 1218 | " \n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | "
DateOpenHighLowCloseAdj CloseVolume
142018-12-146986.3701177027.1699226898.9902346910.6601566910.6601562200510000
\n", 1242 | "
" 1243 | ], 1244 | "text/plain": [ 1245 | " Date Open High Low Close \\\n", 1246 | "14 2018-12-14 6986.370117 7027.169922 6898.990234 6910.660156 \n", 1247 | "\n", 1248 | " Adj Close Volume \n", 1249 | "14 6910.660156 2200510000 " 1250 | ] 1251 | }, 1252 | "execution_count": 38, 1253 | "metadata": {}, 1254 | "output_type": "execute_result" 1255 | } 1256 | ], 1257 | "source": [ 1258 | "index_df = df.copy()\n", 1259 | "\n", 1260 | "index_df[index_df['Date'] == '2018-12-14']" 1261 | ] 1262 | }, 1263 | { 1264 | "cell_type": "markdown", 1265 | "metadata": {}, 1266 | "source": [ 1267 | "---\n", 1268 | "\n", 1269 | "### 1.4.5 Examples of alignment" 1270 | ] 1271 | }, 1272 | { 1273 | "cell_type": "code", 1274 | "execution_count": 32, 1275 | "metadata": { 1276 | "ExecuteTime": { 1277 | "end_time": "2018-12-28T11:48:52.767186Z", 1278 | "start_time": "2018-12-28T11:48:52.741402Z" 1279 | } 1280 | }, 1281 | "outputs": [ 1282 | { 1283 | "name": "stdout", 1284 | "output_type": "stream", 1285 | "text": [ 1286 | " Date Open High Low Close \\\n", 1287 | "0 2018-11-23 6919.520020 6987.890137 6919.160156 6938.979980 \n", 1288 | "1 2018-11-26 7026.500000 7083.930176 7003.120117 7081.850098 \n", 1289 | "2 2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 \n", 1290 | "3 2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 \n", 1291 | "4 2018-11-29 7267.370117 7319.959961 7217.689941 7273.080078 \n", 1292 | "\n", 1293 | " Adj Close Volume Max_diff \n", 1294 | "0 6938.979980 958950000 68.729981 \n", 1295 | "1 7081.850098 2011180000 80.810059 \n", 1296 | "2 7082.700195 2067360000 90.780274 \n", 1297 | "3 7291.589844 2390260000 201.729981 \n", 1298 | "4 7273.080078 1983460000 102.270020 \n" 1299 | ] 1300 | } 1301 | ], 1302 | "source": [ 1303 | "index_df['Max_diff'] = index_df['High'] - index_df['Low']\n", 1304 | "index_df.head()" 1305 | ] 1306 | }, 1307 | { 1308 | "cell_type": "markdown", 1309 | "metadata": {}, 1310 | "source": [ 1311 | "---\n", 1312 | "\n", 1313 | "### 1.4.6 
Examples of selection" 1314 | ] 1315 | }, 1316 | { 1317 | "cell_type": "code", 1318 | "execution_count": 33, 1319 | "metadata": { 1320 | "ExecuteTime": { 1321 | "end_time": "2018-12-28T11:48:54.255104Z", 1322 | "start_time": "2018-12-28T11:48:54.237334Z" 1323 | } 1324 | }, 1325 | "outputs": [ 1326 | { 1327 | "name": "stdout", 1328 | "output_type": "stream", 1329 | "text": [ 1330 | "Date 2018-12-14\n", 1331 | "Close 6910.66\n", 1332 | "Name: 14, dtype: object\n" 1333 | ] 1334 | } 1335 | ], 1336 | "source": [ 1337 | "index_df.loc[14, ['Date', 'Close']]" 1338 | ] 1339 | }, 1340 | { 1341 | "cell_type": "code", 1342 | "execution_count": 34, 1343 | "metadata": { 1344 | "ExecuteTime": { 1345 | "end_time": "2018-12-28T11:48:55.189637Z", 1346 | "start_time": "2018-12-28T11:48:55.168154Z" 1347 | } 1348 | }, 1349 | "outputs": [ 1350 | { 1351 | "name": "stdout", 1352 | "output_type": "stream", 1353 | "text": [ 1354 | "Index(['2018-11-23', '2018-11-26', '2018-11-27', '2018-11-28', '2018-11-29',\n", 1355 | " '2018-11-30', '2018-12-03', '2018-12-04', '2018-12-06', '2018-12-07',\n", 1356 | " '2018-12-10', '2018-12-11', '2018-12-12', '2018-12-13', '2018-12-14',\n", 1357 | " '2018-12-17', '2018-12-18', '2018-12-19', '2018-12-20', '2018-12-21'],\n", 1358 | " dtype='object', name='Date')\n", 1359 | "\n", 1360 | "Date NaN\n", 1361 | "Close 6910.660156\n", 1362 | "Name: 2018-12-14, dtype: float64\n" 1363 | ] 1364 | } 1365 | ], 1366 | "source": [ 1367 | "index_df['Date'] = index_df['Date'].astype('str')\n", 1368 | "index_df.set_index('Date', inplace=True)\n", 1369 | "print(index_df.index)\n", 1370 | "print()\n", 1371 | "print(index_df.loc['2018-12-14', ['Date', 'Close']])" 1372 | ] 1373 | }, 1374 | { 1375 | "cell_type": "code", 1376 | "execution_count": 22, 1377 | "metadata": { 1378 | "ExecuteTime": { 1379 | "end_time": "2018-12-28T11:47:53.123177Z", 1380 | "start_time": "2018-12-28T11:47:53.110037Z" 1381 | } 1382 | }, 1383 | "outputs": [ 1384 | { 1385 | "name": "stdout", 1386 | 
"output_type": "stream", 1387 | "text": [ 1388 | " Open High Low Close Adj Close \\\n", 1389 | "2018-11-23 6919.520020 6987.890137 6919.160156 6938.979980 6938.979980 \n", 1390 | "2018-11-26 7026.500000 7083.930176 7003.120117 7081.850098 7081.850098 \n", 1391 | "2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 7082.700195 \n", 1392 | "2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 7291.589844 \n", 1393 | "2018-11-29 7267.370117 7319.959961 7217.689941 7273.080078 7273.080078 \n", 1394 | "\n", 1395 | " Volume Max_diff \n", 1396 | "2018-11-23 958950000 68.729981 \n", 1397 | "2018-11-26 2011180000 80.810059 \n", 1398 | "2018-11-27 2067360000 90.780274 \n", 1399 | "2018-11-28 2390260000 201.729981 \n", 1400 | "2018-11-29 1983460000 102.270020 \n" 1401 | ] 1402 | } 1403 | ], 1404 | "source": [ 1405 | "index_df.index.name = None\n", 1406 | "print(index_df.head())" 1407 | ] 1408 | }, 1409 | { 1410 | "cell_type": "code", 1411 | "execution_count": 23, 1412 | "metadata": { 1413 | "ExecuteTime": { 1414 | "end_time": "2018-12-28T11:47:53.139863Z", 1415 | "start_time": "2018-12-28T11:47:53.126153Z" 1416 | }, 1417 | "scrolled": true 1418 | }, 1419 | "outputs": [ 1420 | { 1421 | "name": "stdout", 1422 | "output_type": "stream", 1423 | "text": [ 1424 | " Date Open High Low Close \\\n", 1425 | "0 2018-11-23 6919.520020 6987.890137 6919.160156 6938.979980 \n", 1426 | "1 2018-11-26 7026.500000 7083.930176 7003.120117 7081.850098 \n", 1427 | "2 2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 \n", 1428 | "3 2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 \n", 1429 | "4 2018-11-29 7267.370117 7319.959961 7217.689941 7273.080078 \n", 1430 | "\n", 1431 | " Adj Close Volume Max_diff \n", 1432 | "0 6938.979980 958950000 68.729981 \n", 1433 | "1 7081.850098 2011180000 80.810059 \n", 1434 | "2 7082.700195 2067360000 90.780274 \n", 1435 | "3 7291.589844 2390260000 201.729981 \n", 1436 | "4 7273.080078 1983460000 102.270020 \n" 1437 | ] 1438 | } 1439 
| ], 1440 | "source": [ 1441 | "index_df.index.name = \"Date\"\n", 1442 | "index_df.reset_index(inplace=True)\n", 1443 | "print(index_df.head())" 1444 | ] 1445 | }, 1446 | { 1447 | "cell_type": "markdown", 1448 | "metadata": {}, 1449 | "source": [ 1450 | "---\n", 1451 | "\n", 1452 | "### 1.4.7 Five ways of index selection\n", 1453 | "\n", 1454 | "1. `[]` operator: using index / column names to access data.\n", 1455 | "2. `df.loc`: Access a group of rows and columns by label(s)\n", 1456 | "3. `df.iloc`: Access a group of rows and columns by integer position(s)\n", 1457 | "4. `df.at`: Access a single value for a row/column label pair.\n", 1458 | "5. `df.iat`: Access a single value for a row/column pair by integer position." 1459 | ] 1460 | }, 1461 | { 1462 | "cell_type": "code", 1463 | "execution_count": 24, 1464 | "metadata": { 1465 | "ExecuteTime": { 1466 | "end_time": "2018-12-28T11:47:53.169526Z", 1467 | "start_time": "2018-12-28T11:47:53.144080Z" 1468 | } 1469 | }, 1470 | "outputs": [ 1471 | { 1472 | "name": "stdout", 1473 | "output_type": "stream", 1474 | "text": [ 1475 | " Open High Low Close Adj Close \\\n", 1476 | "Date \n", 1477 | "2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 7082.700195 \n", 1478 | "2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 7291.589844 \n", 1479 | "\n", 1480 | " Volume Max_diff \n", 1481 | "Date \n", 1482 | "2018-11-27 2067360000 90.780274 \n", 1483 | "2018-11-28 2390260000 201.729981 \n", 1484 | "\n", 1485 | " Open High Low Close Adj Close \\\n", 1486 | "Date \n", 1487 | "2018-11-27 7041.229980 7105.140137 7014.359863 7082.700195 7082.700195 \n", 1488 | "2018-11-28 7135.080078 7292.709961 7090.979980 7291.589844 7291.589844 \n", 1489 | "\n", 1490 | " Volume Max_diff \n", 1491 | "Date \n", 1492 | "2018-11-27 2067360000 90.780274 \n", 1493 | "2018-11-28 2390260000 201.729981 \n", 1494 | "\n", 1495 | " Open High Low Close Adj Close \\\n", 1496 | "Date \n", 1497 | "2018-11-27 7041.22998 7105.140137 7014.359863 
7082.700195 7082.700195 \n", 1498 | "\n", 1499 | " Volume Max_diff \n", 1500 | "Date \n", 1501 | "2018-11-27 2067360000 90.780274 \n", 1502 | "\n", 1503 | "7041.22998\n", 1504 | "\n", 1505 | "7041.22998\n" 1506 | ] 1507 | } 1508 | ], 1509 | "source": [ 1510 | "index_df.set_index('Date', inplace=True)\n", 1511 | "\n", 1512 | "print(index_df['2018-11-27':'2018-11-28'])\n", 1513 | "print()\n", 1514 | "print(index_df.loc['2018-11-27':'2018-11-28'])\n", 1515 | "print()\n", 1516 | "print(index_df.iloc[2:3])\n", 1517 | "print()\n", 1518 | "print(index_df.at['2018-11-27','Open'])\n", 1519 | "print()\n", 1520 | "print(index_df.iat[2, 0])" 1521 | ] 1522 | }, 1523 | { 1524 | "cell_type": "code", 1525 | "execution_count": 25, 1526 | "metadata": { 1527 | "ExecuteTime": { 1528 | "end_time": "2018-12-28T11:48:03.115744Z", 1529 | "start_time": "2018-12-28T11:47:53.175722Z" 1530 | } 1531 | }, 1532 | "outputs": [ 1533 | { 1534 | "name": "stdout", 1535 | "output_type": "stream", 1536 | "text": [ 1537 | "124 µs ± 42.8 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" 1538 | ] 1539 | } 1540 | ], 1541 | "source": [ 1542 | "%timeit index_df['2018-11-27':'2018-11-28']" 1543 | ] 1544 | }, 1545 | { 1546 | "cell_type": "code", 1547 | "execution_count": 26, 1548 | "metadata": { 1549 | "ExecuteTime": { 1550 | "end_time": "2018-12-28T11:48:12.157624Z", 1551 | "start_time": "2018-12-28T11:48:03.122063Z" 1552 | } 1553 | }, 1554 | "outputs": [ 1555 | { 1556 | "name": "stdout", 1557 | "output_type": "stream", 1558 | "text": [ 1559 | "112 µs ± 6.67 µs per loop (mean ± std. dev. 
of 7 runs, 10000 loops each)\n" 1560 | ] 1561 | } 1562 | ], 1563 | "source": [ 1564 | "%timeit index_df.loc['2018-11-27':'2018-11-28']" 1565 | ] 1566 | }, 1567 | { 1568 | "cell_type": "code", 1569 | "execution_count": 27, 1570 | "metadata": { 1571 | "ExecuteTime": { 1572 | "end_time": "2018-12-28T11:48:25.491913Z", 1573 | "start_time": "2018-12-28T11:48:12.161462Z" 1574 | } 1575 | }, 1576 | "outputs": [ 1577 | { 1578 | "name": "stdout", 1579 | "output_type": "stream", 1580 | "text": [ 1581 | "175 µs ± 90.1 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" 1582 | ] 1583 | } 1584 | ], 1585 | "source": [ 1586 | "%timeit index_df.iloc[2:3]" 1587 | ] 1588 | }, 1589 | { 1590 | "cell_type": "code", 1591 | "execution_count": 28, 1592 | "metadata": { 1593 | "ExecuteTime": { 1594 | "end_time": "2018-12-28T11:48:32.855730Z", 1595 | "start_time": "2018-12-28T11:48:25.499466Z" 1596 | } 1597 | }, 1598 | "outputs": [ 1599 | { 1600 | "name": "stdout", 1601 | "output_type": "stream", 1602 | "text": [ 1603 | "8.53 µs ± 3.88 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)\n" 1604 | ] 1605 | } 1606 | ], 1607 | "source": [ 1608 | "%timeit index_df.at['2018-11-27','Open']" 1609 | ] 1610 | }, 1611 | { 1612 | "cell_type": "code", 1613 | "execution_count": 29, 1614 | "metadata": { 1615 | "ExecuteTime": { 1616 | "end_time": "2018-12-28T11:48:38.653643Z", 1617 | "start_time": "2018-12-28T11:48:32.858760Z" 1618 | } 1619 | }, 1620 | "outputs": [ 1621 | { 1622 | "name": "stdout", 1623 | "output_type": "stream", 1624 | "text": [ 1625 | "7.2 µs ± 1.86 µs per loop (mean ± std. dev. 
of 7 runs, 100000 loops each)\n" 1626 | ] 1627 | } 1628 | ], 1629 | "source": [ 1630 | "%timeit index_df.iat[2, 0]" 1631 | ] 1632 | }, 1633 | { 1634 | "cell_type": "markdown", 1635 | "metadata": {}, 1636 | "source": [ 1637 | "__We recommend using `pd.DataFrame.iloc() / pd.DataFrame.loc()` in this case for the best performance and readability.__\n", 1638 | "\n", 1639 | "---\n", 1640 | "\n", 1641 | "## 1.5 Exercises\n", 1642 | "\n", 1643 | "### 1.5.1 Reviewing\n", 1644 | "\n", 1645 | "Please review the code above.\n", 1646 | "\n", 1647 | "### 1.5.2 Refactoring\n", 1648 | "\n", 1649 | "If you have written pandas scripts before, try to refactor them into different levels of functions.\n", 1650 | "\n", 1651 | "### 1.5.3 Checking parameters\n", 1652 | "\n", 1653 | "Check the default and optional parameters of the following methods:\n", 1654 | "\n", 1655 | "1. `DataFrame.describe()`: provide descriptive stats of the dataset\n", 1656 | "2. `DataFrame.values`: access values of the dataset\n", 1657 | "3. `DataFrame.head()`: access the head of the dataset\n", 1658 | "4. `DataFrame.tail()`: access the tail of the dataset\n", 1659 | "5. `DataFrame.shape`: provide the length and width of the dataset\n", 1660 | "6. `DataFrame.size`: provide the product of the length and width of the dataset\n", 1661 | "7. `DataFrame.columns`: provide the column names of the dataset\n", 1662 | "8. `DataFrame.index`: provide the row index of the dataset\n", 1663 | "9. `DataFrame.axes`: provide the column names and row index of the dataset" 1664 | ] 1665 | }, 1666 | { 1667 | "cell_type": "markdown", 1668 | "metadata": {}, 1669 | "source": [ 1670 | "---\n", 1671 | "\n", 1672 | "For the rest of the sessions (outlines and video records), please scan the QR code below to pay.\n", 1673 | "\n", 1674 | "1. The price is 799 RMB.\n", 1675 | "2. 
Please leave your email address in the __payment comment__, so I will send you the links of the rest sessions.\n", 1676 | "\n", 1677 | "\n", 1678 | "" 1679 | ] 1680 | }, 1681 | { 1682 | "cell_type": "markdown", 1683 | "metadata": {}, 1684 | "source": [ 1685 | "---" 1686 | ] 1687 | } 1688 | ], 1689 | "metadata": { 1690 | "kernelspec": { 1691 | "display_name": "Python 3", 1692 | "language": "python", 1693 | "name": "python3" 1694 | }, 1695 | "language_info": { 1696 | "codemirror_mode": { 1697 | "name": "ipython", 1698 | "version": 3 1699 | }, 1700 | "file_extension": ".py", 1701 | "mimetype": "text/x-python", 1702 | "name": "python", 1703 | "nbconvert_exporter": "python", 1704 | "pygments_lexer": "ipython3", 1705 | "version": "3.7.2" 1706 | }, 1707 | "toc": { 1708 | "base_numbering": 1, 1709 | "nav_menu": {}, 1710 | "number_sections": false, 1711 | "sideBar": true, 1712 | "skip_h1_title": false, 1713 | "title_cell": "Table of Contents", 1714 | "title_sidebar": "Contents", 1715 | "toc_cell": false, 1716 | "toc_position": { 1717 | "height": "calc(100% - 180px)", 1718 | "left": "10px", 1719 | "top": "150px", 1720 | "width": "335.8541564941406px" 1721 | }, 1722 | "toc_section_display": true, 1723 | "toc_window_display": false 1724 | } 1725 | }, 1726 | "nbformat": 4, 1727 | "nbformat_minor": 2 1728 | } 1729 | --------------------------------------------------------------------------------