├── .gitignore ├── .vscode └── launch.json ├── LICENSE ├── README.md ├── data_exploratory ├── abnormal_check.py ├── correlation_analysis.py ├── data │ ├── catering_dish_profit.xls │ ├── catering_fish_congee.xls │ ├── catering_sale.csv │ ├── catering_sale.xls │ └── catering_sale_all.xls ├── dish_pareto.py └── statistic_analysis.py ├── data_modeling ├── cm_plot.py ├── data │ ├── arima_data.xls │ ├── bankloan.xls │ ├── consumption_data.xls │ ├── menu_orders.xls │ ├── neural_network.png │ └── sales_data.xls ├── decision_tree.py ├── kmeans.py ├── logistic_regression.py ├── neural_network.py └── tmp │ ├── data_type.xls │ ├── kmeans_pd_0.png │ ├── kmeans_pd_1.png │ ├── kmeans_pd_2.png │ ├── pd_0.png │ ├── pd_1.png │ ├── pd_2.png │ └── tree.dot ├── data_preprocess ├── attr_construct.py ├── data │ ├── catering_sale.xls │ ├── discretization_data.xls │ ├── electricity_data.xls │ ├── leleccum.mat │ ├── normalization_data.xls │ └── principal_component.xls ├── data_discretization.py ├── data_lagrange_interplate.py ├── data_normalization.py ├── principal_component_analyze.py ├── tmp │ ├── dimention_reducted.xls │ ├── electricity_data.xls │ └── sales.xls └── wave_analysis.py └── tools ├── hello.py ├── matplotlib_test.py ├── numpy_test.py ├── pandas_notes.ipynb └── pandas_test.py /.gitignore: -------------------------------------------------------------------------------- 1 | /.vscode 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // 使用 IntelliSense 了解相关属性。 3 | // 悬停以查看现有属性的描述。 4 | // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python", 9 | "type": "python", 10 | "request": "launch", 11 | "stopOnEntry": true, 12 | "pythonPath": "${config:python.pythonPath}", 13 | "program": "${file}", 14 | "cwd": "${workspaceFolder}", 15 | "env": {}, 16 | "envFile": "${workspaceFolder}/.env", 17 | "debugOptions": [ 18 | "RedirectOutput" 19 | ] 20 | }, 21 | { 22 | "name": "Python: Attach", 23 | "type": "python", 24 | "request": "attach", 25 | "localRoot": "${workspaceFolder}", 26 | "remoteRoot": "${workspaceFolder}", 27 | "port": 3000, 28 | "secret": "my_secret", 29 | "host": 
"localhost" 30 | }, 31 | { 32 | "name": "Python: Terminal (integrated)", 33 | "type": "python", 34 | "request": "launch", 35 | "stopOnEntry": true, 36 | "pythonPath": "${config:python.pythonPath}", 37 | "program": "${file}", 38 | "cwd": "", 39 | "console": "integratedTerminal", 40 | "env": {}, 41 | "envFile": "${workspaceFolder}/.env", 42 | "debugOptions": [] 43 | }, 44 | { 45 | "name": "Python: Terminal (external)", 46 | "type": "python", 47 | "request": "launch", 48 | "stopOnEntry": true, 49 | "pythonPath": "${config:python.pythonPath}", 50 | "program": "${file}", 51 | "cwd": "", 52 | "console": "externalTerminal", 53 | "env": {}, 54 | "envFile": "${workspaceFolder}/.env", 55 | "debugOptions": [] 56 | }, 57 | { 58 | "name": "Python: Django", 59 | "type": "python", 60 | "request": "launch", 61 | "stopOnEntry": true, 62 | "pythonPath": "${config:python.pythonPath}", 63 | "program": "${workspaceFolder}/manage.py", 64 | "cwd": "${workspaceFolder}", 65 | "args": [ 66 | "runserver", 67 | "--noreload", 68 | "--nothreading" 69 | ], 70 | "env": {}, 71 | "envFile": "${workspaceFolder}/.env", 72 | "debugOptions": [ 73 | "RedirectOutput", 74 | "DjangoDebugging" 75 | ] 76 | }, 77 | { 78 | "name": "Python: Flask (0.11.x or later)", 79 | "type": "python", 80 | "request": "launch", 81 | "stopOnEntry": false, 82 | "pythonPath": "${config:python.pythonPath}", 83 | "program": "fully qualified path of 'flask' executable. 
Generally located along with python interpreter", 84 | "cwd": "${workspaceFolder}", 85 | "env": { 86 | "FLASK_APP": "${workspaceFolder}/quickstart/app.py" 87 | }, 88 | "args": [ 89 | "run", 90 | "--no-debugger", 91 | "--no-reload" 92 | ], 93 | "envFile": "${workspaceFolder}/.env", 94 | "debugOptions": [ 95 | "RedirectOutput" 96 | ] 97 | }, 98 | { 99 | "name": "Python: Flask (0.10.x or earlier)", 100 | "type": "python", 101 | "request": "launch", 102 | "stopOnEntry": false, 103 | "pythonPath": "${config:python.pythonPath}", 104 | "program": "${workspaceFolder}/run.py", 105 | "cwd": "${workspaceFolder}", 106 | "args": [], 107 | "env": {}, 108 | "envFile": "${workspaceFolder}/.env", 109 | "debugOptions": [ 110 | "RedirectOutput" 111 | ] 112 | }, 113 | { 114 | "name": "Python: PySpark", 115 | "type": "python", 116 | "request": "launch", 117 | "stopOnEntry": true, 118 | "osx": { 119 | "pythonPath": "${env:SPARK_HOME}/bin/spark-submit" 120 | }, 121 | "windows": { 122 | "pythonPath": "${env:SPARK_HOME}/bin/spark-submit.cmd" 123 | }, 124 | "linux": { 125 | "pythonPath": "${env:SPARK_HOME}/bin/spark-submit" 126 | }, 127 | "program": "${file}", 128 | "cwd": "${workspaceFolder}", 129 | "env": {}, 130 | "envFile": "${workspaceFolder}/.env", 131 | "debugOptions": [ 132 | "RedirectOutput" 133 | ] 134 | }, 135 | { 136 | "name": "Python: Module", 137 | "type": "python", 138 | "request": "launch", 139 | "stopOnEntry": true, 140 | "pythonPath": "${config:python.pythonPath}", 141 | "module": "module.name", 142 | "cwd": "${workspaceFolder}", 143 | "env": {}, 144 | "envFile": "${workspaceFolder}/.env", 145 | "debugOptions": [ 146 | "RedirectOutput" 147 | ] 148 | }, 149 | { 150 | "name": "Python: Pyramid", 151 | "type": "python", 152 | "request": "launch", 153 | "stopOnEntry": true, 154 | "pythonPath": "${config:python.pythonPath}", 155 | "cwd": "${workspaceFolder}", 156 | "env": {}, 157 | "envFile": "${workspaceFolder}/.env", 158 | "args": [ 159 | "${workspaceFolder}/development.ini" 
160 | ], 161 | "debugOptions": [ 162 | "RedirectOutput", 163 | "Pyramid" 164 | ] 165 | }, 166 | { 167 | "name": "Python: Watson", 168 | "type": "python", 169 | "request": "launch", 170 | "stopOnEntry": true, 171 | "pythonPath": "${config:python.pythonPath}", 172 | "program": "${workspaceFolder}/console.py", 173 | "cwd": "${workspaceFolder}", 174 | "args": [ 175 | "dev", 176 | "runserver", 177 | "--noreload=True" 178 | ], 179 | "env": {}, 180 | "envFile": "${workspaceFolder}/.env", 181 | "debugOptions": [ 182 | "RedirectOutput" 183 | ] 184 | } 185 | ] 186 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU General Public License is a free, copyleft license for 11 | software and other kinds of works. 12 | 13 | The licenses for most software and other practical works are designed 14 | to take away your freedom to share and change the works. By contrast, 15 | the GNU General Public License is intended to guarantee your freedom to 16 | share and change all versions of a program--to make sure it remains free 17 | software for all its users. We, the Free Software Foundation, use the 18 | GNU General Public License for most of our software; it applies also to 19 | any other work released this way by its authors. You can apply it to 20 | your programs, too. 21 | 22 | When we speak of free software, we are referring to freedom, not 23 | price. 
Our General Public Licenses are designed to make sure that you 24 | have the freedom to distribute copies of free software (and charge for 25 | them if you wish), that you receive source code or can get it if you 26 | want it, that you can change the software or use pieces of it in new 27 | free programs, and that you know you can do these things. 28 | 29 | To protect your rights, we need to prevent others from denying you 30 | these rights or asking you to surrender the rights. Therefore, you have 31 | certain responsibilities if you distribute copies of the software, or if 32 | you modify it: responsibilities to respect the freedom of others. 33 | 34 | For example, if you distribute copies of such a program, whether 35 | gratis or for a fee, you must pass on to the recipients the same 36 | freedoms that you received. You must make sure that they, too, receive 37 | or can get the source code. And you must show them these terms so they 38 | know their rights. 39 | 40 | Developers that use the GNU GPL protect your rights with two steps: 41 | (1) assert copyright on the software, and (2) offer you this License 42 | giving you legal permission to copy, distribute and/or modify it. 43 | 44 | For the developers' and authors' protection, the GPL clearly explains 45 | that there is no warranty for this free software. For both users' and 46 | authors' sake, the GPL requires that modified versions be marked as 47 | changed, so that their problems will not be attributed erroneously to 48 | authors of previous versions. 49 | 50 | Some devices are designed to deny users access to install or run 51 | modified versions of the software inside them, although the manufacturer 52 | can do so. This is fundamentally incompatible with the aim of 53 | protecting users' freedom to change the software. The systematic 54 | pattern of such abuse occurs in the area of products for individuals to 55 | use, which is precisely where it is most unacceptable. 
Therefore, we 56 | have designed this version of the GPL to prohibit the practice for those 57 | products. If such problems arise substantially in other domains, we 58 | stand ready to extend this provision to those domains in future versions 59 | of the GPL, as needed to protect the freedom of users. 60 | 61 | Finally, every program is threatened constantly by software patents. 62 | States should not allow patents to restrict development and use of 63 | software on general-purpose computers, but in those that do, we wish to 64 | avoid the special danger that patents applied to a free program could 65 | make it effectively proprietary. To prevent this, the GPL assures that 66 | patents cannot be used to render the program non-free. 67 | 68 | The precise terms and conditions for copying, distribution and 69 | modification follow. 70 | 71 | TERMS AND CONDITIONS 72 | 73 | 0. Definitions. 74 | 75 | "This License" refers to version 3 of the GNU General Public License. 76 | 77 | "Copyright" also means copyright-like laws that apply to other kinds of 78 | works, such as semiconductor masks. 79 | 80 | "The Program" refers to any copyrightable work licensed under this 81 | License. Each licensee is addressed as "you". "Licensees" and 82 | "recipients" may be individuals or organizations. 83 | 84 | To "modify" a work means to copy from or adapt all or part of the work 85 | in a fashion requiring copyright permission, other than the making of an 86 | exact copy. The resulting work is called a "modified version" of the 87 | earlier work or a work "based on" the earlier work. 88 | 89 | A "covered work" means either the unmodified Program or a work based 90 | on the Program. 91 | 92 | To "propagate" a work means to do anything with it that, without 93 | permission, would make you directly or secondarily liable for 94 | infringement under applicable copyright law, except executing it on a 95 | computer or modifying a private copy. 
Propagation includes copying, 96 | distribution (with or without modification), making available to the 97 | public, and in some countries other activities as well. 98 | 99 | To "convey" a work means any kind of propagation that enables other 100 | parties to make or receive copies. Mere interaction with a user through 101 | a computer network, with no transfer of a copy, is not conveying. 102 | 103 | An interactive user interface displays "Appropriate Legal Notices" 104 | to the extent that it includes a convenient and prominently visible 105 | feature that (1) displays an appropriate copyright notice, and (2) 106 | tells the user that there is no warranty for the work (except to the 107 | extent that warranties are provided), that licensees may convey the 108 | work under this License, and how to view a copy of this License. If 109 | the interface presents a list of user commands or options, such as a 110 | menu, a prominent item in the list meets this criterion. 111 | 112 | 1. Source Code. 113 | 114 | The "source code" for a work means the preferred form of the work 115 | for making modifications to it. "Object code" means any non-source 116 | form of a work. 117 | 118 | A "Standard Interface" means an interface that either is an official 119 | standard defined by a recognized standards body, or, in the case of 120 | interfaces specified for a particular programming language, one that 121 | is widely used among developers working in that language. 122 | 123 | The "System Libraries" of an executable work include anything, other 124 | than the work as a whole, that (a) is included in the normal form of 125 | packaging a Major Component, but which is not part of that Major 126 | Component, and (b) serves only to enable use of the work with that 127 | Major Component, or to implement a Standard Interface for which an 128 | implementation is available to the public in source code form. 
A 129 | "Major Component", in this context, means a major essential component 130 | (kernel, window system, and so on) of the specific operating system 131 | (if any) on which the executable work runs, or a compiler used to 132 | produce the work, or an object code interpreter used to run it. 133 | 134 | The "Corresponding Source" for a work in object code form means all 135 | the source code needed to generate, install, and (for an executable 136 | work) run the object code and to modify the work, including scripts to 137 | control those activities. However, it does not include the work's 138 | System Libraries, or general-purpose tools or generally available free 139 | programs which are used unmodified in performing those activities but 140 | which are not part of the work. For example, Corresponding Source 141 | includes interface definition files associated with source files for 142 | the work, and the source code for shared libraries and dynamically 143 | linked subprograms that the work is specifically designed to require, 144 | such as by intimate data communication or control flow between those 145 | subprograms and other parts of the work. 146 | 147 | The Corresponding Source need not include anything that users 148 | can regenerate automatically from other parts of the Corresponding 149 | Source. 150 | 151 | The Corresponding Source for a work in source code form is that 152 | same work. 153 | 154 | 2. Basic Permissions. 155 | 156 | All rights granted under this License are granted for the term of 157 | copyright on the Program, and are irrevocable provided the stated 158 | conditions are met. This License explicitly affirms your unlimited 159 | permission to run the unmodified Program. The output from running a 160 | covered work is covered by this License only if the output, given its 161 | content, constitutes a covered work. This License acknowledges your 162 | rights of fair use or other equivalent, as provided by copyright law. 
163 | 164 | You may make, run and propagate covered works that you do not 165 | convey, without conditions so long as your license otherwise remains 166 | in force. You may convey covered works to others for the sole purpose 167 | of having them make modifications exclusively for you, or provide you 168 | with facilities for running those works, provided that you comply with 169 | the terms of this License in conveying all material for which you do 170 | not control copyright. Those thus making or running the covered works 171 | for you must do so exclusively on your behalf, under your direction 172 | and control, on terms that prohibit them from making any copies of 173 | your copyrighted material outside their relationship with you. 174 | 175 | Conveying under any other circumstances is permitted solely under 176 | the conditions stated below. Sublicensing is not allowed; section 10 177 | makes it unnecessary. 178 | 179 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 180 | 181 | No covered work shall be deemed part of an effective technological 182 | measure under any applicable law fulfilling obligations under article 183 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 184 | similar laws prohibiting or restricting circumvention of such 185 | measures. 186 | 187 | When you convey a covered work, you waive any legal power to forbid 188 | circumvention of technological measures to the extent such circumvention 189 | is effected by exercising rights under this License with respect to 190 | the covered work, and you disclaim any intention to limit operation or 191 | modification of the work as a means of enforcing, against the work's 192 | users, your or third parties' legal rights to forbid circumvention of 193 | technological measures. 194 | 195 | 4. Conveying Verbatim Copies. 
196 | 197 | You may convey verbatim copies of the Program's source code as you 198 | receive it, in any medium, provided that you conspicuously and 199 | appropriately publish on each copy an appropriate copyright notice; 200 | keep intact all notices stating that this License and any 201 | non-permissive terms added in accord with section 7 apply to the code; 202 | keep intact all notices of the absence of any warranty; and give all 203 | recipients a copy of this License along with the Program. 204 | 205 | You may charge any price or no price for each copy that you convey, 206 | and you may offer support or warranty protection for a fee. 207 | 208 | 5. Conveying Modified Source Versions. 209 | 210 | You may convey a work based on the Program, or the modifications to 211 | produce it from the Program, in the form of source code under the 212 | terms of section 4, provided that you also meet all of these conditions: 213 | 214 | a) The work must carry prominent notices stating that you modified 215 | it, and giving a relevant date. 216 | 217 | b) The work must carry prominent notices stating that it is 218 | released under this License and any conditions added under section 219 | 7. This requirement modifies the requirement in section 4 to 220 | "keep intact all notices". 221 | 222 | c) You must license the entire work, as a whole, under this 223 | License to anyone who comes into possession of a copy. This 224 | License will therefore apply, along with any applicable section 7 225 | additional terms, to the whole of the work, and all its parts, 226 | regardless of how they are packaged. This License gives no 227 | permission to license the work in any other way, but it does not 228 | invalidate such permission if you have separately received it. 
229 | 230 | d) If the work has interactive user interfaces, each must display 231 | Appropriate Legal Notices; however, if the Program has interactive 232 | interfaces that do not display Appropriate Legal Notices, your 233 | work need not make them do so. 234 | 235 | A compilation of a covered work with other separate and independent 236 | works, which are not by their nature extensions of the covered work, 237 | and which are not combined with it such as to form a larger program, 238 | in or on a volume of a storage or distribution medium, is called an 239 | "aggregate" if the compilation and its resulting copyright are not 240 | used to limit the access or legal rights of the compilation's users 241 | beyond what the individual works permit. Inclusion of a covered work 242 | in an aggregate does not cause this License to apply to the other 243 | parts of the aggregate. 244 | 245 | 6. Conveying Non-Source Forms. 246 | 247 | You may convey a covered work in object code form under the terms 248 | of sections 4 and 5, provided that you also convey the 249 | machine-readable Corresponding Source under the terms of this License, 250 | in one of these ways: 251 | 252 | a) Convey the object code in, or embodied in, a physical product 253 | (including a physical distribution medium), accompanied by the 254 | Corresponding Source fixed on a durable physical medium 255 | customarily used for software interchange. 
256 | 257 | b) Convey the object code in, or embodied in, a physical product 258 | (including a physical distribution medium), accompanied by a 259 | written offer, valid for at least three years and valid for as 260 | long as you offer spare parts or customer support for that product 261 | model, to give anyone who possesses the object code either (1) a 262 | copy of the Corresponding Source for all the software in the 263 | product that is covered by this License, on a durable physical 264 | medium customarily used for software interchange, for a price no 265 | more than your reasonable cost of physically performing this 266 | conveying of source, or (2) access to copy the 267 | Corresponding Source from a network server at no charge. 268 | 269 | c) Convey individual copies of the object code with a copy of the 270 | written offer to provide the Corresponding Source. This 271 | alternative is allowed only occasionally and noncommercially, and 272 | only if you received the object code with such an offer, in accord 273 | with subsection 6b. 274 | 275 | d) Convey the object code by offering access from a designated 276 | place (gratis or for a charge), and offer equivalent access to the 277 | Corresponding Source in the same way through the same place at no 278 | further charge. You need not require recipients to copy the 279 | Corresponding Source along with the object code. If the place to 280 | copy the object code is a network server, the Corresponding Source 281 | may be on a different server (operated by you or a third party) 282 | that supports equivalent copying facilities, provided you maintain 283 | clear directions next to the object code saying where to find the 284 | Corresponding Source. Regardless of what server hosts the 285 | Corresponding Source, you remain obligated to ensure that it is 286 | available for as long as needed to satisfy these requirements. 
287 | 288 | e) Convey the object code using peer-to-peer transmission, provided 289 | you inform other peers where the object code and Corresponding 290 | Source of the work are being offered to the general public at no 291 | charge under subsection 6d. 292 | 293 | A separable portion of the object code, whose source code is excluded 294 | from the Corresponding Source as a System Library, need not be 295 | included in conveying the object code work. 296 | 297 | A "User Product" is either (1) a "consumer product", which means any 298 | tangible personal property which is normally used for personal, family, 299 | or household purposes, or (2) anything designed or sold for incorporation 300 | into a dwelling. In determining whether a product is a consumer product, 301 | doubtful cases shall be resolved in favor of coverage. For a particular 302 | product received by a particular user, "normally used" refers to a 303 | typical or common use of that class of product, regardless of the status 304 | of the particular user or of the way in which the particular user 305 | actually uses, or expects or is expected to use, the product. A product 306 | is a consumer product regardless of whether the product has substantial 307 | commercial, industrial or non-consumer uses, unless such uses represent 308 | the only significant mode of use of the product. 309 | 310 | "Installation Information" for a User Product means any methods, 311 | procedures, authorization keys, or other information required to install 312 | and execute modified versions of a covered work in that User Product from 313 | a modified version of its Corresponding Source. The information must 314 | suffice to ensure that the continued functioning of the modified object 315 | code is in no case prevented or interfered with solely because 316 | modification has been made. 
317 | 318 | If you convey an object code work under this section in, or with, or 319 | specifically for use in, a User Product, and the conveying occurs as 320 | part of a transaction in which the right of possession and use of the 321 | User Product is transferred to the recipient in perpetuity or for a 322 | fixed term (regardless of how the transaction is characterized), the 323 | Corresponding Source conveyed under this section must be accompanied 324 | by the Installation Information. But this requirement does not apply 325 | if neither you nor any third party retains the ability to install 326 | modified object code on the User Product (for example, the work has 327 | been installed in ROM). 328 | 329 | The requirement to provide Installation Information does not include a 330 | requirement to continue to provide support service, warranty, or updates 331 | for a work that has been modified or installed by the recipient, or for 332 | the User Product in which it has been modified or installed. Access to a 333 | network may be denied when the modification itself materially and 334 | adversely affects the operation of the network or violates the rules and 335 | protocols for communication across the network. 336 | 337 | Corresponding Source conveyed, and Installation Information provided, 338 | in accord with this section must be in a format that is publicly 339 | documented (and with an implementation available to the public in 340 | source code form), and must require no special password or key for 341 | unpacking, reading or copying. 342 | 343 | 7. Additional Terms. 344 | 345 | "Additional permissions" are terms that supplement the terms of this 346 | License by making exceptions from one or more of its conditions. 347 | Additional permissions that are applicable to the entire Program shall 348 | be treated as though they were included in this License, to the extent 349 | that they are valid under applicable law. 
If additional permissions 350 | apply only to part of the Program, that part may be used separately 351 | under those permissions, but the entire Program remains governed by 352 | this License without regard to the additional permissions. 353 | 354 | When you convey a copy of a covered work, you may at your option 355 | remove any additional permissions from that copy, or from any part of 356 | it. (Additional permissions may be written to require their own 357 | removal in certain cases when you modify the work.) You may place 358 | additional permissions on material, added by you to a covered work, 359 | for which you have or can give appropriate copyright permission. 360 | 361 | Notwithstanding any other provision of this License, for material you 362 | add to a covered work, you may (if authorized by the copyright holders of 363 | that material) supplement the terms of this License with terms: 364 | 365 | a) Disclaiming warranty or limiting liability differently from the 366 | terms of sections 15 and 16 of this License; or 367 | 368 | b) Requiring preservation of specified reasonable legal notices or 369 | author attributions in that material or in the Appropriate Legal 370 | Notices displayed by works containing it; or 371 | 372 | c) Prohibiting misrepresentation of the origin of that material, or 373 | requiring that modified versions of such material be marked in 374 | reasonable ways as different from the original version; or 375 | 376 | d) Limiting the use for publicity purposes of names of licensors or 377 | authors of the material; or 378 | 379 | e) Declining to grant rights under trademark law for use of some 380 | trade names, trademarks, or service marks; or 381 | 382 | f) Requiring indemnification of licensors and authors of that 383 | material by anyone who conveys the material (or modified versions of 384 | it) with contractual assumptions of liability to the recipient, for 385 | any liability that these contractual assumptions directly impose on 
386 | those licensors and authors. 387 | 388 | All other non-permissive additional terms are considered "further 389 | restrictions" within the meaning of section 10. If the Program as you 390 | received it, or any part of it, contains a notice stating that it is 391 | governed by this License along with a term that is a further 392 | restriction, you may remove that term. If a license document contains 393 | a further restriction but permits relicensing or conveying under this 394 | License, you may add to a covered work material governed by the terms 395 | of that license document, provided that the further restriction does 396 | not survive such relicensing or conveying. 397 | 398 | If you add terms to a covered work in accord with this section, you 399 | must place, in the relevant source files, a statement of the 400 | additional terms that apply to those files, or a notice indicating 401 | where to find the applicable terms. 402 | 403 | Additional terms, permissive or non-permissive, may be stated in the 404 | form of a separately written license, or stated as exceptions; 405 | the above requirements apply either way. 406 | 407 | 8. Termination. 408 | 409 | You may not propagate or modify a covered work except as expressly 410 | provided under this License. Any attempt otherwise to propagate or 411 | modify it is void, and will automatically terminate your rights under 412 | this License (including any patent licenses granted under the third 413 | paragraph of section 11). 414 | 415 | However, if you cease all violation of this License, then your 416 | license from a particular copyright holder is reinstated (a) 417 | provisionally, unless and until the copyright holder explicitly and 418 | finally terminates your license, and (b) permanently, if the copyright 419 | holder fails to notify you of the violation by some reasonable means 420 | prior to 60 days after the cessation. 
421 | 422 | Moreover, your license from a particular copyright holder is 423 | reinstated permanently if the copyright holder notifies you of the 424 | violation by some reasonable means, this is the first time you have 425 | received notice of violation of this License (for any work) from that 426 | copyright holder, and you cure the violation prior to 30 days after 427 | your receipt of the notice. 428 | 429 | Termination of your rights under this section does not terminate the 430 | licenses of parties who have received copies or rights from you under 431 | this License. If your rights have been terminated and not permanently 432 | reinstated, you do not qualify to receive new licenses for the same 433 | material under section 10. 434 | 435 | 9. Acceptance Not Required for Having Copies. 436 | 437 | You are not required to accept this License in order to receive or 438 | run a copy of the Program. Ancillary propagation of a covered work 439 | occurring solely as a consequence of using peer-to-peer transmission 440 | to receive a copy likewise does not require acceptance. However, 441 | nothing other than this License grants you permission to propagate or 442 | modify any covered work. These actions infringe copyright if you do 443 | not accept this License. Therefore, by modifying or propagating a 444 | covered work, you indicate your acceptance of this License to do so. 445 | 446 | 10. Automatic Licensing of Downstream Recipients. 447 | 448 | Each time you convey a covered work, the recipient automatically 449 | receives a license from the original licensors, to run, modify and 450 | propagate that work, subject to this License. You are not responsible 451 | for enforcing compliance by third parties with this License. 452 | 453 | An "entity transaction" is a transaction transferring control of an 454 | organization, or substantially all assets of one, or subdividing an 455 | organization, or merging organizations. 
If propagation of a covered 456 | work results from an entity transaction, each party to that 457 | transaction who receives a copy of the work also receives whatever 458 | licenses to the work the party's predecessor in interest had or could 459 | give under the previous paragraph, plus a right to possession of the 460 | Corresponding Source of the work from the predecessor in interest, if 461 | the predecessor has it or can get it with reasonable efforts. 462 | 463 | You may not impose any further restrictions on the exercise of the 464 | rights granted or affirmed under this License. For example, you may 465 | not impose a license fee, royalty, or other charge for exercise of 466 | rights granted under this License, and you may not initiate litigation 467 | (including a cross-claim or counterclaim in a lawsuit) alleging that 468 | any patent claim is infringed by making, using, selling, offering for 469 | sale, or importing the Program or any portion of it. 470 | 471 | 11. Patents. 472 | 473 | A "contributor" is a copyright holder who authorizes use under this 474 | License of the Program or a work on which the Program is based. The 475 | work thus licensed is called the contributor's "contributor version". 476 | 477 | A contributor's "essential patent claims" are all patent claims 478 | owned or controlled by the contributor, whether already acquired or 479 | hereafter acquired, that would be infringed by some manner, permitted 480 | by this License, of making, using, or selling its contributor version, 481 | but do not include claims that would be infringed only as a 482 | consequence of further modification of the contributor version. For 483 | purposes of this definition, "control" includes the right to grant 484 | patent sublicenses in a manner consistent with the requirements of 485 | this License. 
486 | 487 | Each contributor grants you a non-exclusive, worldwide, royalty-free 488 | patent license under the contributor's essential patent claims, to 489 | make, use, sell, offer for sale, import and otherwise run, modify and 490 | propagate the contents of its contributor version. 491 | 492 | In the following three paragraphs, a "patent license" is any express 493 | agreement or commitment, however denominated, not to enforce a patent 494 | (such as an express permission to practice a patent or covenant not to 495 | sue for patent infringement). To "grant" such a patent license to a 496 | party means to make such an agreement or commitment not to enforce a 497 | patent against the party. 498 | 499 | If you convey a covered work, knowingly relying on a patent license, 500 | and the Corresponding Source of the work is not available for anyone 501 | to copy, free of charge and under the terms of this License, through a 502 | publicly available network server or other readily accessible means, 503 | then you must either (1) cause the Corresponding Source to be so 504 | available, or (2) arrange to deprive yourself of the benefit of the 505 | patent license for this particular work, or (3) arrange, in a manner 506 | consistent with the requirements of this License, to extend the patent 507 | license to downstream recipients. "Knowingly relying" means you have 508 | actual knowledge that, but for the patent license, your conveying the 509 | covered work in a country, or your recipient's use of the covered work 510 | in a country, would infringe one or more identifiable patents in that 511 | country that you have reason to believe are valid. 
512 | 513 | If, pursuant to or in connection with a single transaction or 514 | arrangement, you convey, or propagate by procuring conveyance of, a 515 | covered work, and grant a patent license to some of the parties 516 | receiving the covered work authorizing them to use, propagate, modify 517 | or convey a specific copy of the covered work, then the patent license 518 | you grant is automatically extended to all recipients of the covered 519 | work and works based on it. 520 | 521 | A patent license is "discriminatory" if it does not include within 522 | the scope of its coverage, prohibits the exercise of, or is 523 | conditioned on the non-exercise of one or more of the rights that are 524 | specifically granted under this License. You may not convey a covered 525 | work if you are a party to an arrangement with a third party that is 526 | in the business of distributing software, under which you make payment 527 | to the third party based on the extent of your activity of conveying 528 | the work, and under which the third party grants, to any of the 529 | parties who would receive the covered work from you, a discriminatory 530 | patent license (a) in connection with copies of the covered work 531 | conveyed by you (or copies made from those copies), or (b) primarily 532 | for and in connection with specific products or compilations that 533 | contain the covered work, unless you entered into that arrangement, 534 | or that patent license was granted, prior to 28 March 2007. 535 | 536 | Nothing in this License shall be construed as excluding or limiting 537 | any implied license or other defenses to infringement that may 538 | otherwise be available to you under applicable patent law. 539 | 540 | 12. No Surrender of Others' Freedom. 541 | 542 | If conditions are imposed on you (whether by court order, agreement or 543 | otherwise) that contradict the conditions of this License, they do not 544 | excuse you from the conditions of this License. 
If you cannot convey a 545 | covered work so as to satisfy simultaneously your obligations under this 546 | License and any other pertinent obligations, then as a consequence you may 547 | not convey it at all. For example, if you agree to terms that obligate you 548 | to collect a royalty for further conveying from those to whom you convey 549 | the Program, the only way you could satisfy both those terms and this 550 | License would be to refrain entirely from conveying the Program. 551 | 552 | 13. Use with the GNU Affero General Public License. 553 | 554 | Notwithstanding any other provision of this License, you have 555 | permission to link or combine any covered work with a work licensed 556 | under version 3 of the GNU Affero General Public License into a single 557 | combined work, and to convey the resulting work. The terms of this 558 | License will continue to apply to the part which is the covered work, 559 | but the special requirements of the GNU Affero General Public License, 560 | section 13, concerning interaction through a network will apply to the 561 | combination as such. 562 | 563 | 14. Revised Versions of this License. 564 | 565 | The Free Software Foundation may publish revised and/or new versions of 566 | the GNU General Public License from time to time. Such new versions will 567 | be similar in spirit to the present version, but may differ in detail to 568 | address new problems or concerns. 569 | 570 | Each version is given a distinguishing version number. If the 571 | Program specifies that a certain numbered version of the GNU General 572 | Public License "or any later version" applies to it, you have the 573 | option of following the terms and conditions either of that numbered 574 | version or of any later version published by the Free Software 575 | Foundation. If the Program does not specify a version number of the 576 | GNU General Public License, you may choose any version ever published 577 | by the Free Software Foundation. 
578 | 579 | If the Program specifies that a proxy can decide which future 580 | versions of the GNU General Public License can be used, that proxy's 581 | public statement of acceptance of a version permanently authorizes you 582 | to choose that version for the Program. 583 | 584 | Later license versions may give you additional or different 585 | permissions. However, no additional obligations are imposed on any 586 | author or copyright holder as a result of your choosing to follow a 587 | later version. 588 | 589 | 15. Disclaimer of Warranty. 590 | 591 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 592 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 593 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 594 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 595 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 596 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 597 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 598 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 599 | 600 | 16. Limitation of Liability. 601 | 602 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 603 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 604 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 605 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 606 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 607 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 608 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 609 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 610 | SUCH DAMAGES. 611 | 612 | 17. Interpretation of Sections 15 and 16. 
613 | 
614 | If the disclaimer of warranty and limitation of liability provided
615 | above cannot be given local legal effect according to their terms,
616 | reviewing courts shall apply local law that most closely approximates
617 | an absolute waiver of all civil liability in connection with the
618 | Program, unless a warranty or assumption of liability accompanies a
619 | copy of the Program in return for a fee.
620 | 
621 | END OF TERMS AND CONDITIONS
622 | 
623 | How to Apply These Terms to Your New Programs
624 | 
625 | If you develop a new program, and you want it to be of the greatest
626 | possible use to the public, the best way to achieve this is to make it
627 | free software which everyone can redistribute and change under these terms.
628 | 
629 | To do so, attach the following notices to the program. It is safest
630 | to attach them to the start of each source file to most effectively
631 | state the exclusion of warranty; and each file should have at least
632 | the "copyright" line and a pointer to where the full notice is found.
633 | 
634 | <one line to give the program's name and a brief idea of what it does.>
635 | Copyright (C) <year>  <name of author>
636 | 
637 | This program is free software: you can redistribute it and/or modify
638 | it under the terms of the GNU General Public License as published by
639 | the Free Software Foundation, either version 3 of the License, or
640 | (at your option) any later version.
641 | 
642 | This program is distributed in the hope that it will be useful,
643 | but WITHOUT ANY WARRANTY; without even the implied warranty of
644 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
645 | GNU General Public License for more details.
646 | 
647 | You should have received a copy of the GNU General Public License
648 | along with this program. If not, see <https://www.gnu.org/licenses/>.
649 | 
650 | Also add information on how to contact you by electronic and paper mail. 
651 | 
652 | If the program does terminal interaction, make it output a short
653 | notice like this when it starts in an interactive mode:
654 | 
655 | <program>  Copyright (C) <year>  <name of author>
656 | This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
657 | This is free software, and you are welcome to redistribute it
658 | under certain conditions; type `show c' for details.
659 | 
660 | The hypothetical commands `show w' and `show c' should show the appropriate
661 | parts of the General Public License. Of course, your program's commands
662 | might be different; for a GUI interface, you would use an "about box".
663 | 
664 | You should also get your employer (if you work as a programmer) or school,
665 | if any, to sign a "copyright disclaimer" for the program, if necessary.
666 | For more information on this, and how to apply and follow the GNU GPL, see
667 | <https://www.gnu.org/licenses/>.
668 | 
669 | The GNU General Public License does not permit incorporating your program
670 | into proprietary programs. If your program is a subroutine library, you
671 | may consider it more useful to permit linking proprietary applications with
672 | the library. If this is what you want to do, use the GNU Lesser General
673 | Public License instead of this License. But first, please read
674 | <https://www.gnu.org/licenses/why-not-lgpl.html>. 
675 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data-Analysis 2 | Python Practice of Data Analysis and Mining 3 | -------------------------------------------------------------------------------- /data_exploratory/abnormal_check.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*-D:\GitWork\Data\chapter3\demo\code\3-1_abnormal_check.py 2 | # 3-1 3 | ''' 4 | @ author: Amos 5 | ''' 6 | 7 | import pandas as pd 8 | import matplotlib.pyplot as plt 9 | 10 | catering_sale = './data/catering_sale.xls' 11 | data = pd.read_excel(catering_sale, index_col = u'日期') 12 | 13 | plt.rcParams['font.sans-serif'] = ['SimHei'] #用来正常显示中文标签 14 | plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号 15 | 16 | #建立图像 17 | plt.figure() 18 | p = data.boxplot(return_type = 'dict') 19 | x = p['fliers'][0].get_xdata() #‘flies’即为异常值 20 | y = p['fliers'][0].get_ydata() 21 | 22 | y.sort() 23 | 24 | #用annotate添加注释 25 | for i in range(len(x)): 26 | if i>0: 27 | plt.annotate(y[i], xy = (x[i], y[i]), xytext=(x[i]+0.05-0.8/(y[i]-y[i-1]),y[i])) 28 | else: 29 | plt.annotate(y[i], xy = (x[i],y[i]), xytext=(x[i]+0.08,y[i])) 30 | 31 | plt.show() 32 | -------------------------------------------------------------------------------- /data_exploratory/correlation_analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pandas as pd 4 | 5 | catering_sale = './data/catering_sale_all.xls' #餐饮数据,含有其他属性 6 | data = pd.read_excel(catering_sale, index_col = u'日期') #读取数据,指定“日期”列为索引列 7 | 8 | #print(data.corr()) 9 | print(data.corr()[u'百合酱蒸凤爪']) #只显示“百合酱蒸凤爪”与其他菜式的相关系数 10 | print('\n') 11 | print(data[u'百合酱蒸凤爪'].corr(data[u'翡翠蒸香茜饺'])) #计算“百合酱蒸凤爪”与“翡翠蒸香茜饺”的相关系数 12 | -------------------------------------------------------------------------------- 
/data_exploratory/data/catering_dish_profit.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_exploratory/data/catering_dish_profit.xls -------------------------------------------------------------------------------- /data_exploratory/data/catering_fish_congee.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_exploratory/data/catering_fish_congee.xls -------------------------------------------------------------------------------- /data_exploratory/data/catering_sale.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_exploratory/data/catering_sale.csv -------------------------------------------------------------------------------- /data_exploratory/data/catering_sale.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_exploratory/data/catering_sale.xls -------------------------------------------------------------------------------- /data_exploratory/data/catering_sale_all.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_exploratory/data/catering_sale_all.xls -------------------------------------------------------------------------------- /data_exploratory/dish_pareto.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | #帕累托分析 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | plt.rcParams['font.sans-serif'] 
= ['SimHei'] #用来正常显示中文标签 7 | plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号 8 | 9 | dish_profit = './data/catering_dish_profit.xls' 10 | data = pd.read_excel(dish_profit, index_col = u'菜品名') 11 | 12 | data = data[u'盈利'].copy() 13 | data.sort_index(ascending = False) 14 | 15 | plt.figure() 16 | data.plot(kind='bar') #柱状图 17 | plt.ylabel(u'盈利(元)') 18 | 19 | p = 1.0*data.cumsum()/data.sum() 20 | p.plot(color = 'r', secondary_y = True, style = '-o',linewidth = 2) #线 21 | #添加注释,即85%处的标记。这里包括了指定箭头样式。 22 | plt.annotate(format(p[6], '.4%'), \ 23 | xy = (6, p[6]), \ 24 | xytext=(6*0.9, p[6]*0.9), \ 25 | arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=.2")) 26 | plt.ylabel(u'盈利(比例)') 27 | 28 | plt.show() 29 | -------------------------------------------------------------------------------- /data_exploratory/statistic_analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import pandas as pd 4 | 5 | catering_sale = './data/catering_sale.xls' 6 | data = pd.read_excel(catering_sale, index_col = u'日期') 7 | 8 | print(data.describe(),'\n') 9 | print('total: ',len(data)) 10 | 11 | data = data[(data[u'销量']>400) & (data[u'销量']<5000)] 12 | statistics = data.describe() 13 | 14 | s = statistics 15 | s.loc['range'] = s.loc['max'] - s.loc['min'] 16 | s.loc['var'] = s.loc['std'] / s.loc['mean'] 17 | s.loc['dis'] = s.loc['75%'] - s.loc['25%'] 18 | 19 | print(statistics) 20 | -------------------------------------------------------------------------------- /data_modeling/cm_plot.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # cm_plot.py 文件,包括了混淆矩阵可视化函数, 4 | # 放置在python的site-packages 目录,供调用 5 | # 例如:~/anaconda2/lib/python2.7/site-packages 6 | 7 | def cm_plot(y, yp): 8 | from sklearn.metrics import confusion_matrix#导入混淆矩阵函数 9 | cm = confusion_matrix(y, yp)#混淆矩阵 10 | import matplotlib.pyplot as plt #导入作图库 11 | 
#画混淆矩阵图,配色风格使用cm.Greens,更多风格请参考官网。 12 | plt.matshow(cm, cmap=plt.cm.Greens) 13 | plt.colorbar() 14 | for x in range(len(cm)): #数据标签 15 | for y in range(len(cm)): 16 | plt.annotate(cm[x,y], xy=(x, y), horizontalalignment='center', verticalalignment='center') 17 | plt.ylabel('True label') #坐标轴标签 18 | plt.xlabel('Predicted label') #坐标轴标签 19 | return plt 20 | -------------------------------------------------------------------------------- /data_modeling/data/arima_data.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/data/arima_data.xls -------------------------------------------------------------------------------- /data_modeling/data/bankloan.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/data/bankloan.xls -------------------------------------------------------------------------------- /data_modeling/data/consumption_data.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/data/consumption_data.xls -------------------------------------------------------------------------------- /data_modeling/data/menu_orders.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/data/menu_orders.xls -------------------------------------------------------------------------------- /data_modeling/data/neural_network.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/data/neural_network.png -------------------------------------------------------------------------------- /data_modeling/data/sales_data.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/data/sales_data.xls -------------------------------------------------------------------------------- /data_modeling/decision_tree.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 29 22:34:51 2018 4 | 5 | @author: Amos 6 | """ 7 | import pandas as pd 8 | from sklearn.tree import DecisionTreeClassifier as DTC 9 | from sklearn.tree import export_graphviz as to_graphviz 10 | from sklearn.externals.six import StringIO 11 | 12 | filename = "./data/sales_data.xls" 13 | data = pd.read_excel(filename, index_col=u'序号') 14 | 15 | #数据变换为类别标签 16 | data[data == u'高'] = 1 17 | data[data == u'是'] = 1 18 | data[data == u'好'] = 1 19 | data[data != 1] = -1 20 | x = data.iloc[:,:3].astype(int) 21 | y = data.iloc[:,3].astype(int) 22 | 23 | #建立并训练决策树模型,基于信息熵 24 | dtc = DTC(criterion='entropy') 25 | dtc.fit(x, y) 26 | 27 | with open("./tmp/tree.dot", 'w') as f: 28 | f = to_graphviz(dtc, feature_names = x.columns, out_file= f) 29 | -------------------------------------------------------------------------------- /data_modeling/kmeans.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # 使用K-Means算法聚类消费行为特征数据 4 | import pandas as pd 5 | from sklearn.cluster import KMeans 6 | 7 | k = 3 #聚类的类别 8 | iteration = 5 #最大循环次数 9 | 10 | inputfile = './data/consumption_data.xls' 11 | outputfile = './tmp/out_consumption_data.xls' 12 | #读取并标准化数据 13 | data = pd.read_excel(inputfile) 14 | data_zs = 1.0*(data 
- data.mean())/data.std() 15 | 16 | #分为K类,并发数 17 | model = KMeans( 18 | n_clusters=k, n_jobs=1, max_iter = iteration) 19 | #开始聚类 20 | model.fit(data_zs) 21 | 22 | #聚类结果 23 | r1 = pd.Series(model.labels_).value_counts() #统计各个类别的数目 24 | r2 = pd.DataFrame(model.cluster_centers_) #找出聚类中心 25 | #横向连接(0是纵向),得到聚类中心对应的类别下的数目 26 | r = pd.concat([r2, r1], axis=1) 27 | r.columns = list(data.columns) + [u'类别数目'] 28 | #print(r) 29 | 30 | #详细输出原始数据及其类别 31 | r_detail = pd.concat( 32 | [data, pd.Series(model.labels_, index=data.index)], axis=1) 33 | r_detail.columns = list(data.columns) + [u'聚类类别'] 34 | #print(r_detail) 35 | 36 | ''' 37 | #自定义作图函数 38 | def density_plot(data): 39 | import matplotlib.pyplot as plt 40 | #用来正常显示中文标签和负号 41 | plt.rcParams['font.sans-serif'] = ['SimHei'] 42 | plt.rcParams['axes.unicode_minus'] = False 43 | p = data.plot( 44 | kind='kde', linewidth = 2, 45 | subplots = True, sharex = False) 46 | [p[i].set_ylabel(u'密度') for i in range(k)] 47 | plt.legend() 48 | return plt 49 | 50 | #作概率密度图 51 | fig_output = './tmp/kmeans_pd_' 52 | for i in range(k): 53 | data_r = data[r_detail[u'聚类类别'] == i].iloc[:, 1:] 54 | density_plot(data_r).savefig(u'%s%s'%(fig_output, i)) 55 | ''' 56 | 57 | #对kmeans结果可视化展示 58 | from sklearn.manifold import TSNE 59 | 60 | tsne = TSNE() 61 | #数据降维度 62 | tsne.fit_transform(data_zs) 63 | tsne = pd.DataFrame(tsne.embedding_, index = data_zs.index) 64 | 65 | import matplotlib.pyplot as plt 66 | plt.rcParams['font.sans-serif'] = ['SimHei'] #用来正常显示中文标签 67 | plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号 68 | 69 | #不同类别用不同颜色和样式绘图 70 | d = tsne[r_detail[u'聚类类别'] == 0] 71 | plt.plot(d[0], d[1], 'r.') 72 | d = tsne[r_detail[u'聚类类别'] == 1] 73 | plt.plot(d[0], d[1], 'go') 74 | d = tsne[r_detail[u'聚类类别'] == 2] 75 | plt.plot(d[0], d[1], 'b*') 76 | plt.show() 77 | -------------------------------------------------------------------------------- /data_modeling/logistic_regression.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jan 22 19:36:10 2018 4 | 5 | @author: Amos 6 | """ 7 | #某银行在降低贷款拖欠率的数据进行回归建模 8 | #逻辑回归 自动建模 9 | import pandas as pd 10 | import numpy 11 | 12 | #参数初始化 13 | filename = './data/bankloan.xls' 14 | data = pd.read_excel(filename) 15 | 16 | x = data.iloc[:, :8].as_matrix() 17 | y = data.iloc[:, 8].as_matrix() 18 | 19 | from sklearn.linear_model import LogisticRegression as LR 20 | from sklearn.linear_model import RandomizedLogisticRegression as RLR 21 | 22 | 23 | lr = LR() #建立逻辑货柜模型 24 | lr.fit(x, y) #用筛选后的特征数据来训练模型 25 | print(u'逻辑回归模型训练结束。') 26 | print(u'未经过筛选特性模型的平均正确率为:%s' % lr.score(x, y)) 27 | 28 | #建立随机逻辑回归模型 29 | rlr = RLR() #帅选变量 30 | rlr.fit(x, y) 31 | #rlr.get_support() #获取特征筛选结果,也可以通过.scores_方法获取各个特征的分数 32 | selected_col = numpy.append(rlr.get_support(),[False]) 33 | print(u"通过随机逻辑回归模型筛选特征结束") 34 | print(u"有效特征为:%s" % ",".join(data.columns[selected_col])) 35 | x = data[data.columns[selected_col]].as_matrix() # 筛选好特征 36 | 37 | lr = LR() #建立逻辑货柜模型 38 | lr.fit(x, y) #用筛选后的特征数据来训练模型 39 | print(u'逻辑回归模型训练结束。') 40 | print(u'模型的平均正确率为:%s' % lr.score(x, y)) #给出模型的平均正确率,本例为81.4% 41 | -------------------------------------------------------------------------------- /data_modeling/neural_network.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #使用神经网络算法预测销量高低 3 | 4 | import pandas as pd 5 | 6 | inputfile = './data/sales_data.xls' 7 | data = pd.read_excel(inputfile, index_col = u'序号') 8 | 9 | data[data == u'高'] = 1 10 | data[data == u'是'] = 1 11 | data[data == u'好'] = 1 12 | data[data != 1] = -1 13 | x = data.iloc[:,:3].astype(int) 14 | y = data.iloc[:,3].astype(int) 15 | 16 | from keras.models import Sequential 17 | from keras.layers.core import Dense, Activation 18 | 19 | #建立模型 20 | model = Sequential(); 21 | model.add(Dense(input_dim = 3, output_dim = 10)) 22 | 
model.add(Activation('relu')) #用relu函数作为激活函数,能够大幅提供准确度 23 | model.add(Dense(input_dim = 10, output_dim = 1)) 24 | model.add(Activation('sigmoid')) #由于是0-1输出,用sigmoid函数作为激活函数 25 | 26 | #编译模型: 27 | #由于我们做的是二元分类,所以我们指定损失函数为binary_crossentropy,以及模式为binary 28 | #另外常见的损失函数还有mean_squared_error、categorical_crossentropy等,请阅读帮助文件。 29 | #求解方法我们指定用adam,还有sgd、rmsprop等可选 30 | model.compile(loss = 'binary_crossentropy', 31 | optimizer = 'adam') 32 | 33 | model.fit(x, y, epochs = 100, batch_size = 10) #训练模型,学习一千次 34 | yp = model.predict_classes(x).reshape(len(y)) #分类预测 35 | 36 | from cm_plot import * 37 | cm_plot(y, yp).savefig('./data/neiral_network.png') 38 | -------------------------------------------------------------------------------- /data_modeling/tmp/data_type.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/data_type.xls -------------------------------------------------------------------------------- /data_modeling/tmp/kmeans_pd_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/kmeans_pd_0.png -------------------------------------------------------------------------------- /data_modeling/tmp/kmeans_pd_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/kmeans_pd_1.png -------------------------------------------------------------------------------- /data_modeling/tmp/kmeans_pd_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/kmeans_pd_2.png 
-------------------------------------------------------------------------------- /data_modeling/tmp/pd_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/pd_0.png -------------------------------------------------------------------------------- /data_modeling/tmp/pd_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/pd_1.png -------------------------------------------------------------------------------- /data_modeling/tmp/pd_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/pd_2.png -------------------------------------------------------------------------------- /data_modeling/tmp/tree.dot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_modeling/tmp/tree.dot -------------------------------------------------------------------------------- /data_preprocess/attr_construct.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jan 21 18:54:48 2018 4 | 5 | @author: Amos 6 | """ 7 | import pandas as pd 8 | 9 | #构造线损率这个属性 10 | 11 | inputfile= './data/electricity_data.xls' #供入供出电量数据 12 | outputfile = './tmp/electricity_data.xls' #属性构造后数据文件 13 | 14 | data = pd.read_excel(inputfile) 15 | data[u'线损率'] = (data[u'供入电量'] - data[u'供出电量'])/data[u'供入电量'] 16 | data.to_excel(outputfile) 17 | -------------------------------------------------------------------------------- /data_preprocess/data/catering_sale.xls: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/data/catering_sale.xls -------------------------------------------------------------------------------- /data_preprocess/data/discretization_data.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/data/discretization_data.xls -------------------------------------------------------------------------------- /data_preprocess/data/electricity_data.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/data/electricity_data.xls -------------------------------------------------------------------------------- /data_preprocess/data/leleccum.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/data/leleccum.mat -------------------------------------------------------------------------------- /data_preprocess/data/normalization_data.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/data/normalization_data.xls -------------------------------------------------------------------------------- /data_preprocess/data/principal_component.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/data/principal_component.xls 
-------------------------------------------------------------------------------- /data_preprocess/data_discretization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Jan 21 15:35:38 2018 4 | @author: Amos 5 | """ 6 | #数据离散化 7 | import pandas as pd 8 | 9 | DATA_FILE = './data/discretization_data.xls' #参数初始化 10 | DATA = pd.read_excel(DATA_FILE) #读取数据 11 | DATA = DATA.loc[:, u'肝气郁结证型系数'] 12 | k = 4 13 | 14 | #等宽离散化 15 | d1 = pd.cut(DATA, k, labels=range(k)) 16 | 17 | #等频率离散化 18 | w = [1.0*i/k for i in range(k+1)] 19 | #m = DATA.describe() 20 | #n = DATA.describe(percentiles=w) 21 | w = DATA.describe(percentiles=w)[4:(4+k+1)] 22 | w[0] = w[0]*(1-1e-10) 23 | d2 = pd.cut(DATA, w, labels=range(k)) 24 | 25 | from sklearn.cluster import KMeans #引入KMeans 26 | #一维聚类离散化 27 | kmodel = KMeans(n_clusters=k, n_jobs=2) #建立模型 28 | #kmodel.fit(DATA.reshape((len(DATA), 1))) #训练模型 29 | #c = pd.DataFrame(kmodel.cluster_centers_).sort(0) #输出聚类中心,并且排序 30 | #w = pd.rolling_mean(c, 2).iloc[1:] #相邻两项求中点,作为边界点 31 | #w = [0] + list(w[0]) + [DATA.max()] #把首末边界点加上 32 | #d3 = pd.cut(DATA, w, labels = range(k)) 33 | 34 | def cluster_plot(d, k): #自定义作图函数来显示聚类结果 35 | import matplotlib.pyplot as plt 36 | plt.rcParams['font.sans-serif'] = ['SimHei'] #用来正常显示中文标签 37 | plt.rcParams['axes.unicode_minus'] = False #用来正常显示负号 38 | 39 | plt.figure(figsize=(8, 3)) 40 | for j in range(0, k): 41 | plt.plot(DATA[d==j], [j for i in d[d==j]], 'o') 42 | plt.ylim(-0.5, k-0.5) 43 | return plt 44 | 45 | cluster_plot(d1, k).show() 46 | cluster_plot(d2, k).show() 47 | #cluster_plot(d3, k).show() 48 | -------------------------------------------------------------------------------- /data_preprocess/data_lagrange_interplate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Jan 18 11:25:38 2018 4 | @author: Amos 5 | """ 6 | 7 | import pandas as pd 
8 | from scipy.interpolate import lagrange 9 | 10 | inputfile = './data/catering_sale.xls' #销量数据路径 11 | outputfile = './tmp/sales.xls' #输出数据路径 12 | 13 | data = pd.read_excel(inputfile) 14 | #data[u'日期'].to_excel('./tmp/sales0.xls') 15 | #异常值过滤,变为空值 16 | #null_raw = list((data['销量']<400) | (data['销量']>5000)) 17 | #data.loc[:, '销量'][(data['销量']<400) | (data['销量']>5000)] = None 18 | data.loc[(data['销量']<400) | (data['销量']>5000), '销量'] = None 19 | #data.to_excel('./tmp/sales1.xls') 20 | 21 | #自定义列向量插值函数 22 | def polyinterp_column(s, n, k=5): 23 | y = s [list(range(n-k, n)) + list(range(n+1, n+1+k))] 24 | y = y[y.notnull()] #剔除空值 25 | return lagrange(y.index, list(y))(n) #插值并返回结果 26 | 27 | for i in data.columns: 28 | for j in range(len(data)): 29 | if (data[i].isnull())[j]: 30 | #data[i][j] = polyinterp_column(data[i], j) 31 | data.loc[j, [i]] = polyinterp_column(data[i], j) 32 | 33 | data.to_excel(outputfile) 34 | -------------------------------------------------------------------------------- /data_preprocess/data_normalization.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Jan 18 16:14:38 2018 4 | 5 | @author: Amos 6 | 7 | 规范化:归一化 8 | """ 9 | 10 | import pandas as pd 11 | import numpy as np 12 | 13 | datafile = './data/normalization_data.xls' 14 | data = pd.read_excel(datafile, header = None) 15 | 16 | data_n1 = (data - data.min())/(data.max() - data.min()) #最小-最大规范化 17 | data_n2 = (data - data.mean())/data.std() #零-均值规范化 18 | data_n3 = data/10**np.ceil(np.log10(data.abs().max())) #小数定标规范化 19 | 20 | print(data_n1) 21 | print(data_n2) 22 | print(data_n3) 23 | -------------------------------------------------------------------------------- /data_preprocess/principal_component_analyze.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jan 21 19:19:29 2018 4 | 5 | @author: Amos 6 | """ 7 | 8 
| #主成分分析 降维 9 | import pandas as pd 10 | 11 | #参数初始化 12 | inputfile = './data/principal_component.xls' 13 | outputfile = './tmp/dimention_reducted.xls' #降维后的数据 14 | 15 | data = pd.read_excel(inputfile, header = None) #读入数据 16 | 17 | from sklearn.decomposition import PCA 18 | 19 | pca = PCA(3) 20 | pca.fit(data) 21 | #pca.components_ #返回模型的各个特征向量 22 | #pca.explained_variance_ratio_ #返回各个成分各自的方差百分比 23 | 24 | low_d = pca.transform(data) #降低维度 25 | pd.DataFrame(low_d).to_excel(outputfile) 26 | pca.inverse_transform(low_d) #复原数据 27 | -------------------------------------------------------------------------------- /data_preprocess/tmp/dimention_reducted.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/tmp/dimention_reducted.xls -------------------------------------------------------------------------------- /data_preprocess/tmp/electricity_data.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/tmp/electricity_data.xls -------------------------------------------------------------------------------- /data_preprocess/tmp/sales.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huiyuexu/data-analysis/25ed230f338a29ea5e91bb1a06b14000b6cc790c/data_preprocess/tmp/sales.xls -------------------------------------------------------------------------------- /data_preprocess/wave_analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Jan 18 17:36:54 2018 4 | @author: Amos 5 | """ 6 | 7 | #利用小波变换进行数据分析 8 | 9 | from scipy.io import loadmat 10 | 11 | inputfile= './data/leleccum.mat' #提取自Matlab的信号文件 12 | mat = loadmat(inputfile) 13 
| signal = mat['leleccum'][0] 14 | 15 | #导入PyWavelets 16 | import pywt 17 | coeffs = pywt.wavedec(signal, 'bior3.7', level=5) 18 | #返回结果为level+1个数字,第一个数组为逼近系数数组,后面的依次是细节系数数组i 19 | -------------------------------------------------------------------------------- /tools/hello.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | print("hello world") 4 | 5 | import matplotlib as mpl 6 | print(mpl.get_cachedir()) -------------------------------------------------------------------------------- /tools/matplotlib_test.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | x = np.linspace(0, 10 , 1000) 6 | y = np.sin(x) + 1 7 | z = np.cos(x**2) + 1 8 | 9 | #plt.figure(figsize == (8, 4)) 10 | plt.plot(x, y, label = '$\sin x+1$', color = 'red', linewidth = 2) 11 | plt.plot(x, z, 'b--', label = '$\cos x^2+1$') 12 | 13 | plt.xlabel('Time(s)') 14 | plt.ylabel('Volt') 15 | plt.title('A Simple Eg.') 16 | 17 | plt.ylim(0, 2.2) #y轴范围 18 | 19 | plt.legend() #显示图例 20 | 21 | plt.show() 22 | -------------------------------------------------------------------------------- /tools/numpy_test.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import numpy as np 4 | 5 | a = np.array([2,0,1,5]) 6 | print(a) 7 | print(a[:2]) 8 | print(a.min()) 9 | a.sort() 10 | print(a) 11 | -------------------------------------------------------------------------------- /tools/pandas_notes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# pandas主要功能" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "在了解pandas数据结构的基础上,了解其常用功能。" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 
19 | "metadata": {}, 20 | "source": [ 21 | "## 1.重新索引(Reindexing)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": { 28 | "collapsed": true 29 | }, 30 | "outputs": [], 31 | "source": [ 32 | "import pandas as pd" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "data": { 42 | "text/plain": [ 43 | "d 4.5\n", 44 | "b 7.2\n", 45 | "a -5.3\n", 46 | "c 3.6\n", 47 | "dtype: float64" 48 | ] 49 | }, 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])\n", 57 | "obj" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "更改index需要调用reindex,如果没有对应index会引入缺失值" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "a -5.3\n", 76 | "b 7.2\n", 77 | "c 3.6\n", 78 | "d 4.5\n", 79 | "e NaN\n", 80 | "dtype: float64" 81 | ] 82 | }, 83 | "execution_count": 3, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])\n", 90 | "obj2" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "对于DataFrame,reindex能更改row index,或column index。\n", 98 | "reindex the rows:" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 4, 104 | "metadata": { 105 | "collapsed": true 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "import numpy as np" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 5, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "frame = pd.DataFrame(np.arange(9).reshape(3, 3),\n", 119 | " index=['a', 'c', 'd'],\n", 120 | " columns=['Ohio', 'Texas', 'California'])" 121 | ] 122 | }, 123 | { 124 
| "cell_type": "code", 125 | "execution_count": 6, 126 | "metadata": { 127 | "scrolled": true 128 | }, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/html": [ 133 | "
\n", 134 | "\n", 147 | "\n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | "
OhioTexasCalifornia
a012
c345
d678
\n", 177 | "
" 178 | ], 179 | "text/plain": [ 180 | " Ohio Texas California\n", 181 | "a 0 1 2\n", 182 | "c 3 4 5\n", 183 | "d 6 7 8" 184 | ] 185 | }, 186 | "execution_count": 6, 187 | "metadata": {}, 188 | "output_type": "execute_result" 189 | } 190 | ], 191 | "source": [ 192 | "frame" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 7, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "data": { 202 | "text/html": [ 203 | "
\n", 204 | "\n", 217 | "\n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | "
OhioTexasCalifornia
a0.01.02.0
bNaNNaNNaN
c3.04.05.0
d6.07.08.0
\n", 253 | "
" 254 | ], 255 | "text/plain": [ 256 | " Ohio Texas California\n", 257 | "a 0.0 1.0 2.0\n", 258 | "b NaN NaN NaN\n", 259 | "c 3.0 4.0 5.0\n", 260 | "d 6.0 7.0 8.0" 261 | ] 262 | }, 263 | "execution_count": 7, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "frame2 = frame.reindex(['a','b','c','d'])\n", 270 | "frame2" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "reindex the columns:" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 8, 283 | "metadata": { 284 | "collapsed": true 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "states = ['Texes', 'Utah', 'California']" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 9, 294 | "metadata": { 295 | "scrolled": true 296 | }, 297 | "outputs": [ 298 | { 299 | "data": { 300 | "text/html": [ 301 | "
\n", 302 | "\n", 315 | "\n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | "
TexesUtahCalifornia
aNaNNaN2
cNaNNaN5
dNaNNaN8
\n", 345 | "
" 346 | ], 347 | "text/plain": [ 348 | " Texes Utah California\n", 349 | "a NaN NaN 2\n", 350 | "c NaN NaN 5\n", 351 | "d NaN NaN 8" 352 | ] 353 | }, 354 | "execution_count": 9, 355 | "metadata": {}, 356 | "output_type": "execute_result" 357 | } 358 | ], 359 | "source": [ 360 | "frame.reindex(columns=states)" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "reinsex参数: " 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "![image](http://oydgk2hgw.bkt.clouddn.com/pydata-book/x0pq4.png)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 10, 380 | "metadata": {}, 381 | "outputs": [ 382 | { 383 | "data": { 384 | "text/html": [ 385 | "
\n", 386 | "\n", 399 | "\n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | "
TexesUtahCalifornia
aNaNNaN2.0
bNaNNaNNaN
cNaNNaN5.0
dNaNNaN8.0
\n", 435 | "
" 436 | ], 437 | "text/plain": [ 438 | " Texes Utah California\n", 439 | "a NaN NaN 2.0\n", 440 | "b NaN NaN NaN\n", 441 | "c NaN NaN 5.0\n", 442 | "d NaN NaN 8.0" 443 | ] 444 | }, 445 | "execution_count": 10, 446 | "metadata": {}, 447 | "output_type": "execute_result" 448 | } 449 | ], 450 | "source": [ 451 | "frame.loc[['a','b','c','d'], states]" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "## 2.按轴删除记录(Dropping Entries from an Axis)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "对于DataFrame,index能按行或列的axis来删除:" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 11, 471 | "metadata": {}, 472 | "outputs": [ 473 | { 474 | "data": { 475 | "text/html": [ 476 | "
\n", 477 | "\n", 490 | "\n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | "
onetwothreefour
Ohio0123
Colorado4567
Utah891011
New York12131415
\n", 531 | "
" 532 | ], 533 | "text/plain": [ 534 | " one two three four\n", 535 | "Ohio 0 1 2 3\n", 536 | "Colorado 4 5 6 7\n", 537 | "Utah 8 9 10 11\n", 538 | "New York 12 13 14 15" 539 | ] 540 | }, 541 | "execution_count": 11, 542 | "metadata": {}, 543 | "output_type": "execute_result" 544 | } 545 | ], 546 | "source": [ 547 | "data = pd.DataFrame(np.arange(16).reshape(4, 4),\n", 548 | " index=['Ohio', 'Colorado', 'Utah', 'New York'],\n", 549 | " columns=['one', 'two', 'three', 'four'])\n", 550 | "data" 551 | ] 552 | }, 553 | { 554 | "cell_type": "markdown", 555 | "metadata": {}, 556 | "source": [ 557 | "行处理:(axis 0)" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 12, 563 | "metadata": {}, 564 | "outputs": [ 565 | { 566 | "data": { 567 | "text/html": [ 568 | "
\n", 569 | "\n", 582 | "\n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | "
onetwothreefour
Colorado4567
Utah891011
New York12131415
\n", 616 | "
" 617 | ], 618 | "text/plain": [ 619 | " one two three four\n", 620 | "Colorado 4 5 6 7\n", 621 | "Utah 8 9 10 11\n", 622 | "New York 12 13 14 15" 623 | ] 624 | }, 625 | "execution_count": 12, 626 | "metadata": {}, 627 | "output_type": "execute_result" 628 | } 629 | ], 630 | "source": [ 631 | "data.drop(['Ohio'])" 632 | ] 633 | }, 634 | { 635 | "cell_type": "markdown", 636 | "metadata": {}, 637 | "source": [ 638 | "列处理:(axis 1)" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": 13, 644 | "metadata": {}, 645 | "outputs": [ 646 | { 647 | "data": { 648 | "text/html": [ 649 | "
\n", 650 | "\n", 663 | "\n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | "
onethreefour
Ohio023
Colorado467
Utah81011
New York121415
\n", 699 | "
" 700 | ], 701 | "text/plain": [ 702 | " one three four\n", 703 | "Ohio 0 2 3\n", 704 | "Colorado 4 6 7\n", 705 | "Utah 8 10 11\n", 706 | "New York 12 14 15" 707 | ] 708 | }, 709 | "execution_count": 13, 710 | "metadata": {}, 711 | "output_type": "execute_result" 712 | } 713 | ], 714 | "source": [ 715 | "data.drop('two', axis=1)" 716 | ] 717 | }, 718 | { 719 | "cell_type": "markdown", 720 | "metadata": {}, 721 | "source": [ 722 | "## 2.索引,选择,过滤(indexing, selection, filtering)" 723 | ] 724 | }, 725 | { 726 | "cell_type": "markdown", 727 | "metadata": {}, 728 | "source": [ 729 | "Series索引\n", 730 | "\n", 731 | "相当于numpy的Array索引,而且还可以使用label索引。注意使用label切片会包括尾节点。" 732 | ] 733 | }, 734 | { 735 | "cell_type": "markdown", 736 | "metadata": {}, 737 | "source": [ 738 | "DataFrame 索引\n", 739 | "\n", 740 | "#### 值或序列索引:" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": 14, 746 | "metadata": {}, 747 | "outputs": [ 748 | { 749 | "data": { 750 | "text/plain": [ 751 | "Ohio 0\n", 752 | "Colorado 4\n", 753 | "Utah 8\n", 754 | "New York 12\n", 755 | "Name: one, dtype: int32" 756 | ] 757 | }, 758 | "execution_count": 14, 759 | "metadata": {}, 760 | "output_type": "execute_result" 761 | } 762 | ], 763 | "source": [ 764 | "data['one']" 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": 15, 770 | "metadata": {}, 771 | "outputs": [ 772 | { 773 | "data": { 774 | "text/html": [ 775 | "
\n", 776 | "\n", 789 | "\n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | "
onetwo
Ohio01
Colorado45
Utah89
New York1213
\n", 820 | "
" 821 | ], 822 | "text/plain": [ 823 | " one two\n", 824 | "Ohio 0 1\n", 825 | "Colorado 4 5\n", 826 | "Utah 8 9\n", 827 | "New York 12 13" 828 | ] 829 | }, 830 | "execution_count": 15, 831 | "metadata": {}, 832 | "output_type": "execute_result" 833 | } 834 | ], 835 | "source": [ 836 | "data[['one', 'two']]" 837 | ] 838 | }, 839 | { 840 | "cell_type": "markdown", 841 | "metadata": {}, 842 | "source": [ 843 | "#### 布尔数组索引:" 844 | ] 845 | }, 846 | { 847 | "cell_type": "code", 848 | "execution_count": 16, 849 | "metadata": {}, 850 | "outputs": [ 851 | { 852 | "data": { 853 | "text/html": [ 854 | "
\n", 855 | "\n", 868 | "\n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | "
onetwothreefour
Ohio0123
Colorado4567
\n", 895 | "
" 896 | ], 897 | "text/plain": [ 898 | " one two three four\n", 899 | "Ohio 0 1 2 3\n", 900 | "Colorado 4 5 6 7" 901 | ] 902 | }, 903 | "execution_count": 16, 904 | "metadata": {}, 905 | "output_type": "execute_result" 906 | } 907 | ], 908 | "source": [ 909 | "data[:2]" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": 17, 915 | "metadata": {}, 916 | "outputs": [ 917 | { 918 | "data": { 919 | "text/html": [ 920 | "
\n", 921 | "\n", 934 | "\n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | "
onetwothreefour
Colorado4567
Utah891011
New York12131415
\n", 968 | "
" 969 | ], 970 | "text/plain": [ 971 | " one two three four\n", 972 | "Colorado 4 5 6 7\n", 973 | "Utah 8 9 10 11\n", 974 | "New York 12 13 14 15" 975 | ] 976 | }, 977 | "execution_count": 17, 978 | "metadata": {}, 979 | "output_type": "execute_result" 980 | } 981 | ], 982 | "source": [ 983 | "data[data['three']>5]" 984 | ] 985 | }, 986 | { 987 | "cell_type": "code", 988 | "execution_count": 18, 989 | "metadata": {}, 990 | "outputs": [ 991 | { 992 | "data": { 993 | "text/html": [ 994 | "
\n", 995 | "\n", 1008 | "\n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | "
onetwothreefour
Ohio0123
Colorado4567
Utah891011
New York1213140
\n", 1049 | "
" 1050 | ], 1051 | "text/plain": [ 1052 | " one two three four\n", 1053 | "Ohio 0 1 2 3\n", 1054 | "Colorado 4 5 6 7\n", 1055 | "Utah 8 9 10 11\n", 1056 | "New York 12 13 14 0" 1057 | ] 1058 | }, 1059 | "execution_count": 18, 1060 | "metadata": {}, 1061 | "output_type": "execute_result" 1062 | } 1063 | ], 1064 | "source": [ 1065 | "data[data>14] = 0\n", 1066 | "data" 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "markdown", 1071 | "metadata": {}, 1072 | "source": [ 1073 | "#### 标签和位置索引:\n", 1074 | "\n", 1075 | "对于label-indexing on rows:loc(for labels标签索引)、iloc(for integers位置索引)" 1076 | ] 1077 | }, 1078 | { 1079 | "cell_type": "code", 1080 | "execution_count": 19, 1081 | "metadata": {}, 1082 | "outputs": [ 1083 | { 1084 | "data": { 1085 | "text/html": [ 1086 | "
\n", 1087 | "\n", 1100 | "\n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | "
onetwothreefour
Ohio0123
Colorado4567
Utah891011
New York1213140
\n", 1141 | "
" 1142 | ], 1143 | "text/plain": [ 1144 | " one two three four\n", 1145 | "Ohio 0 1 2 3\n", 1146 | "Colorado 4 5 6 7\n", 1147 | "Utah 8 9 10 11\n", 1148 | "New York 12 13 14 0" 1149 | ] 1150 | }, 1151 | "execution_count": 19, 1152 | "metadata": {}, 1153 | "output_type": "execute_result" 1154 | } 1155 | ], 1156 | "source": [ 1157 | "data" 1158 | ] 1159 | }, 1160 | { 1161 | "cell_type": "code", 1162 | "execution_count": 20, 1163 | "metadata": {}, 1164 | "outputs": [ 1165 | { 1166 | "data": { 1167 | "text/plain": [ 1168 | "one 0\n", 1169 | "two 1\n", 1170 | "Name: Ohio, dtype: int32" 1171 | ] 1172 | }, 1173 | "execution_count": 20, 1174 | "metadata": {}, 1175 | "output_type": "execute_result" 1176 | } 1177 | ], 1178 | "source": [ 1179 | "data.loc['Ohio', ['one', 'two']]" 1180 | ] 1181 | }, 1182 | { 1183 | "cell_type": "code", 1184 | "execution_count": 21, 1185 | "metadata": {}, 1186 | "outputs": [ 1187 | { 1188 | "data": { 1189 | "text/plain": [ 1190 | "one 0\n", 1191 | "two 1\n", 1192 | "Name: Ohio, dtype: int32" 1193 | ] 1194 | }, 1195 | "execution_count": 21, 1196 | "metadata": {}, 1197 | "output_type": "execute_result" 1198 | } 1199 | ], 1200 | "source": [ 1201 | "data.iloc[0, [0, 1]]" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "code", 1206 | "execution_count": 22, 1207 | "metadata": {}, 1208 | "outputs": [ 1209 | { 1210 | "data": { 1211 | "text/plain": [ 1212 | "Ohio 1\n", 1213 | "Colorado 5\n", 1214 | "Utah 9\n", 1215 | "Name: two, dtype: int32" 1216 | ] 1217 | }, 1218 | "execution_count": 22, 1219 | "metadata": {}, 1220 | "output_type": "execute_result" 1221 | } 1222 | ], 1223 | "source": [ 1224 | "data.loc[:'Utah', 'two']" 1225 | ] 1226 | }, 1227 | { 1228 | "cell_type": "code", 1229 | "execution_count": 23, 1230 | "metadata": {}, 1231 | "outputs": [ 1232 | { 1233 | "data": { 1234 | "text/html": [ 1235 | "
\n", 1236 | "\n", 1249 | "\n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | "
onetwothree
Colorado456
Utah8910
New York121314
\n", 1279 | "
" 1280 | ], 1281 | "text/plain": [ 1282 | " one two three\n", 1283 | "Colorado 4 5 6\n", 1284 | "Utah 8 9 10\n", 1285 | "New York 12 13 14" 1286 | ] 1287 | }, 1288 | "execution_count": 23, 1289 | "metadata": {}, 1290 | "output_type": "execute_result" 1291 | } 1292 | ], 1293 | "source": [ 1294 | "data.iloc[:, :3][data.three>5]" 1295 | ] 1296 | }, 1297 | { 1298 | "cell_type": "markdown", 1299 | "metadata": {}, 1300 | "source": [ 1301 | "选择数据方法:\n", 1302 | "\n", 1303 | "![image](http://oydgk2hgw.bkt.clouddn.com/pydata-book/bwadf.png)\n", 1304 | "\n", 1305 | "![image](http://oydgk2hgw.bkt.clouddn.com/pydata-book/lc2uc.png)" 1306 | ] 1307 | }, 1308 | { 1309 | "cell_type": "markdown", 1310 | "metadata": {}, 1311 | "source": [ 1312 | "## 3.算数和数据对齐(Arithmetic and Data Alignment)" 1313 | ] 1314 | }, 1315 | { 1316 | "cell_type": "code", 1317 | "execution_count": 24, 1318 | "metadata": {}, 1319 | "outputs": [ 1320 | { 1321 | "data": { 1322 | "text/html": [ 1323 | "
\n", 1324 | "\n", 1337 | "\n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | "
bcd
Colorado0.01.02.0
Texas3.04.05.0
Ohio6.07.08.0
\n", 1367 | "
" 1368 | ], 1369 | "text/plain": [ 1370 | " b c d\n", 1371 | "Colorado 0.0 1.0 2.0\n", 1372 | "Texas 3.0 4.0 5.0\n", 1373 | "Ohio 6.0 7.0 8.0" 1374 | ] 1375 | }, 1376 | "execution_count": 24, 1377 | "metadata": {}, 1378 | "output_type": "execute_result" 1379 | } 1380 | ], 1381 | "source": [ 1382 | "df1 = pd.DataFrame(np.arange(9.).reshape((3,3)), columns=list('bcd'),\n", 1383 | " index=['Ohio', 'Texas', 'Colorado'])\n", 1384 | "df1" 1385 | ] 1386 | }, 1387 | { 1388 | "cell_type": "code", 1389 | "execution_count": 25, 1390 | "metadata": {}, 1391 | "outputs": [ 1392 | { 1393 | "data": { 1394 | "text/html": [ 1395 | "
\n", 1396 | "\n", 1409 | "\n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | " \n", 1432 | " \n", 1433 | " \n", 1434 | " \n", 1435 | " \n", 1436 | " \n", 1437 | " \n", 1438 | " \n", 1439 | " \n", 1440 | " \n", 1441 | " \n", 1442 | " \n", 1443 | " \n", 1444 | "
bde
Utah0.01.02.0
Ohio3.04.05.0
Texas6.07.08.0
Oregon9.010.011.0
\n", 1445 | "
" 1446 | ], 1447 | "text/plain": [ 1448 | " b d e\n", 1449 | "Utah 0.0 1.0 2.0\n", 1450 | "Ohio 3.0 4.0 5.0\n", 1451 | "Texas 6.0 7.0 8.0\n", 1452 | "Oregon 9.0 10.0 11.0" 1453 | ] 1454 | }, 1455 | "execution_count": 25, 1456 | "metadata": {}, 1457 | "output_type": "execute_result" 1458 | } 1459 | ], 1460 | "source": [ 1461 | "df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),\n", 1462 | " index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n", 1463 | "df2" 1464 | ] 1465 | }, 1466 | { 1467 | "cell_type": "code", 1468 | "execution_count": 26, 1469 | "metadata": {}, 1470 | "outputs": [ 1471 | { 1472 | "data": { 1473 | "text/html": [ 1474 | "
\n", 1475 | "\n", 1488 | "\n", 1489 | " \n", 1490 | " \n", 1491 | " \n", 1492 | " \n", 1493 | " \n", 1494 | " \n", 1495 | " \n", 1496 | " \n", 1497 | " \n", 1498 | " \n", 1499 | " \n", 1500 | " \n", 1501 | " \n", 1502 | " \n", 1503 | " \n", 1504 | " \n", 1505 | " \n", 1506 | " \n", 1507 | " \n", 1508 | " \n", 1509 | " \n", 1510 | " \n", 1511 | " \n", 1512 | " \n", 1513 | " \n", 1514 | " \n", 1515 | " \n", 1516 | " \n", 1517 | " \n", 1518 | " \n", 1519 | " \n", 1520 | " \n", 1521 | " \n", 1522 | " \n", 1523 | " \n", 1524 | " \n", 1525 | " \n", 1526 | " \n", 1527 | " \n", 1528 | " \n", 1529 | " \n", 1530 | " \n", 1531 | " \n", 1532 | " \n", 1533 | " \n", 1534 | " \n", 1535 | "
bcde
ColoradoNaNNaNNaNNaN
Ohio9.0NaN12.0NaN
OregonNaNNaNNaNNaN
Texas9.0NaN12.0NaN
UtahNaNNaNNaNNaN
\n", 1536 | "
" 1537 | ], 1538 | "text/plain": [ 1539 | " b c d e\n", 1540 | "Colorado NaN NaN NaN NaN\n", 1541 | "Ohio 9.0 NaN 12.0 NaN\n", 1542 | "Oregon NaN NaN NaN NaN\n", 1543 | "Texas 9.0 NaN 12.0 NaN\n", 1544 | "Utah NaN NaN NaN NaN" 1545 | ] 1546 | }, 1547 | "execution_count": 26, 1548 | "metadata": {}, 1549 | "output_type": "execute_result" 1550 | } 1551 | ], 1552 | "source": [ 1553 | "df1 + df2" 1554 | ] 1555 | }, 1556 | { 1557 | "cell_type": "markdown", 1558 | "metadata": {}, 1559 | "source": [ 1560 | "因为'c'和'e'列都不在两个DataFrame里,所以全是缺失值。对于行,即使有相同的,但列不一样的话也会是缺失值。" 1561 | ] 1562 | }, 1563 | { 1564 | "cell_type": "markdown", 1565 | "metadata": {}, 1566 | "source": [ 1567 | "使用带填充值的方法:" 1568 | ] 1569 | }, 1570 | { 1571 | "cell_type": "code", 1572 | "execution_count": 27, 1573 | "metadata": {}, 1574 | "outputs": [ 1575 | { 1576 | "data": { 1577 | "text/html": [ 1578 | "
\n", 1579 | "\n", 1592 | "\n", 1593 | " \n", 1594 | " \n", 1595 | " \n", 1596 | " \n", 1597 | " \n", 1598 | " \n", 1599 | " \n", 1600 | " \n", 1601 | " \n", 1602 | " \n", 1603 | " \n", 1604 | " \n", 1605 | " \n", 1606 | " \n", 1607 | " \n", 1608 | " \n", 1609 | " \n", 1610 | " \n", 1611 | " \n", 1612 | " \n", 1613 | " \n", 1614 | " \n", 1615 | " \n", 1616 | " \n", 1617 | " \n", 1618 | " \n", 1619 | " \n", 1620 | " \n", 1621 | " \n", 1622 | " \n", 1623 | " \n", 1624 | " \n", 1625 | " \n", 1626 | " \n", 1627 | " \n", 1628 | " \n", 1629 | " \n", 1630 | " \n", 1631 | " \n", 1632 | " \n", 1633 | " \n", 1634 | " \n", 1635 | " \n", 1636 | " \n", 1637 | "
abcde
00.02.04.06.04.0
19.05.013.015.09.0
218.020.022.024.014.0
315.016.017.018.019.0
\n", 1638 | "
" 1639 | ], 1640 | "text/plain": [ 1641 | " a b c d e\n", 1642 | "0 0.0 2.0 4.0 6.0 4.0\n", 1643 | "1 9.0 5.0 13.0 15.0 9.0\n", 1644 | "2 18.0 20.0 22.0 24.0 14.0\n", 1645 | "3 15.0 16.0 17.0 18.0 19.0" 1646 | ] 1647 | }, 1648 | "execution_count": 27, 1649 | "metadata": {}, 1650 | "output_type": "execute_result" 1651 | } 1652 | ], 1653 | "source": [ 1654 | "df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), \n", 1655 | " columns=list('abcd'))\n", 1656 | "\n", 1657 | "df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), \n", 1658 | " columns=list('abcde'))\n", 1659 | "df2.loc[1, 'b'] = np.nan\n", 1660 | "df1.add(df2, fill_value=0)" 1661 | ] 1662 | }, 1663 | { 1664 | "cell_type": "markdown", 1665 | "metadata": {}, 1666 | "source": [ 1667 | "下表是这样的灵活算数方法:" 1668 | ] 1669 | }, 1670 | { 1671 | "cell_type": "markdown", 1672 | "metadata": {}, 1673 | "source": [ 1674 | "![image](http://oydgk2hgw.bkt.clouddn.com/pydata-book/y0rr4.png)" 1675 | ] 1676 | }, 1677 | { 1678 | "cell_type": "markdown", 1679 | "metadata": {}, 1680 | "source": [ 1681 | "每一个都有一个配对的,以r开头,意思是反转。" 1682 | ] 1683 | }, 1684 | { 1685 | "cell_type": "code", 1686 | "execution_count": 28, 1687 | "metadata": {}, 1688 | "outputs": [ 1689 | { 1690 | "data": { 1691 | "text/html": [ 1692 | "
\n", 1693 | "\n", 1706 | "\n", 1707 | " \n", 1708 | " \n", 1709 | " \n", 1710 | " \n", 1711 | " \n", 1712 | " \n", 1713 | " \n", 1714 | " \n", 1715 | " \n", 1716 | " \n", 1717 | " \n", 1718 | " \n", 1719 | " \n", 1720 | " \n", 1721 | " \n", 1722 | " \n", 1723 | " \n", 1724 | " \n", 1725 | " \n", 1726 | " \n", 1727 | " \n", 1728 | " \n", 1729 | " \n", 1730 | " \n", 1731 | " \n", 1732 | " \n", 1733 | " \n", 1734 | " \n", 1735 | " \n", 1736 | " \n", 1737 | " \n", 1738 | " \n", 1739 | "
abcd
0inf1.0000000.5000000.333333
10.2500000.2000000.1666670.142857
20.1250000.1111110.1000000.090909
\n", 1740 | "
" 1741 | ], 1742 | "text/plain": [ 1743 | " a b c d\n", 1744 | "0 inf 1.000000 0.500000 0.333333\n", 1745 | "1 0.250000 0.200000 0.166667 0.142857\n", 1746 | "2 0.125000 0.111111 0.100000 0.090909" 1747 | ] 1748 | }, 1749 | "execution_count": 28, 1750 | "metadata": {}, 1751 | "output_type": "execute_result" 1752 | } 1753 | ], 1754 | "source": [ 1755 | "1/df1" 1756 | ] 1757 | }, 1758 | { 1759 | "cell_type": "code", 1760 | "execution_count": 29, 1761 | "metadata": {}, 1762 | "outputs": [ 1763 | { 1764 | "data": { 1765 | "text/html": [ 1766 | "
\n", 1767 | "\n", 1780 | "\n", 1781 | " \n", 1782 | " \n", 1783 | " \n", 1784 | " \n", 1785 | " \n", 1786 | " \n", 1787 | " \n", 1788 | " \n", 1789 | " \n", 1790 | " \n", 1791 | " \n", 1792 | " \n", 1793 | " \n", 1794 | " \n", 1795 | " \n", 1796 | " \n", 1797 | " \n", 1798 | " \n", 1799 | " \n", 1800 | " \n", 1801 | " \n", 1802 | " \n", 1803 | " \n", 1804 | " \n", 1805 | " \n", 1806 | " \n", 1807 | " \n", 1808 | " \n", 1809 | " \n", 1810 | " \n", 1811 | " \n", 1812 | " \n", 1813 | "
abcd
0inf1.0000000.5000000.333333
10.2500000.2000000.1666670.142857
20.1250000.1111110.1000000.090909
\n", 1814 | "
" 1815 | ], 1816 | "text/plain": [ 1817 | " a b c d\n", 1818 | "0 inf 1.000000 0.500000 0.333333\n", 1819 | "1 0.250000 0.200000 0.166667 0.142857\n", 1820 | "2 0.125000 0.111111 0.100000 0.090909" 1821 | ] 1822 | }, 1823 | "execution_count": 29, 1824 | "metadata": {}, 1825 | "output_type": "execute_result" 1826 | } 1827 | ], 1828 | "source": [ 1829 | "df1.rdiv(1)" 1830 | ] 1831 | }, 1832 | { 1833 | "cell_type": "markdown", 1834 | "metadata": {}, 1835 | "source": [ 1836 | "在reindexing(重建索引)时,也可以使用fill_value" 1837 | ] 1838 | }, 1839 | { 1840 | "cell_type": "code", 1841 | "execution_count": 30, 1842 | "metadata": {}, 1843 | "outputs": [ 1844 | { 1845 | "data": { 1846 | "text/html": [ 1847 | "
\n", 1848 | "\n", 1861 | "\n", 1862 | " \n", 1863 | " \n", 1864 | " \n", 1865 | " \n", 1866 | " \n", 1867 | " \n", 1868 | " \n", 1869 | " \n", 1870 | " \n", 1871 | " \n", 1872 | " \n", 1873 | " \n", 1874 | " \n", 1875 | " \n", 1876 | " \n", 1877 | " \n", 1878 | " \n", 1879 | " \n", 1880 | " \n", 1881 | " \n", 1882 | " \n", 1883 | " \n", 1884 | " \n", 1885 | " \n", 1886 | " \n", 1887 | " \n", 1888 | " \n", 1889 | " \n", 1890 | " \n", 1891 | " \n", 1892 | " \n", 1893 | " \n", 1894 | " \n", 1895 | " \n", 1896 | " \n", 1897 | " \n", 1898 | "
abcde
00.01.02.03.00
14.05.06.07.00
28.09.010.011.00
\n", 1899 | "
" 1900 | ], 1901 | "text/plain": [ 1902 | " a b c d e\n", 1903 | "0 0.0 1.0 2.0 3.0 0\n", 1904 | "1 4.0 5.0 6.0 7.0 0\n", 1905 | "2 8.0 9.0 10.0 11.0 0" 1906 | ] 1907 | }, 1908 | "execution_count": 30, 1909 | "metadata": {}, 1910 | "output_type": "execute_result" 1911 | } 1912 | ], 1913 | "source": [ 1914 | "df1.reindex(columns=df2.columns, fill_value=0)" 1915 | ] 1916 | }, 1917 | { 1918 | "cell_type": "markdown", 1919 | "metadata": {}, 1920 | "source": [ 1921 | "#### DataFrame和Series之间的操作:" 1922 | ] 1923 | }, 1924 | { 1925 | "cell_type": "markdown", 1926 | "metadata": {}, 1927 | "source": [ 1928 | "举一个numpy的例子:" 1929 | ] 1930 | }, 1931 | { 1932 | "cell_type": "code", 1933 | "execution_count": 31, 1934 | "metadata": { 1935 | "collapsed": true 1936 | }, 1937 | "outputs": [], 1938 | "source": [ 1939 | "arr = np.arange(12.).reshape((3, 4))" 1940 | ] 1941 | }, 1942 | { 1943 | "cell_type": "code", 1944 | "execution_count": 32, 1945 | "metadata": {}, 1946 | "outputs": [ 1947 | { 1948 | "data": { 1949 | "text/plain": [ 1950 | "array([[ 0., 0., 0., 0.],\n", 1951 | " [ 4., 4., 4., 4.],\n", 1952 | " [ 8., 8., 8., 8.]])" 1953 | ] 1954 | }, 1955 | "execution_count": 32, 1956 | "metadata": {}, 1957 | "output_type": "execute_result" 1958 | } 1959 | ], 1960 | "source": [ 1961 | "arr - arr[0]" 1962 | ] 1963 | }, 1964 | { 1965 | "cell_type": "markdown", 1966 | "metadata": {}, 1967 | "source": [ 1968 | "减法用在了每一行上,这种操作叫做broadcating(广播)。" 1969 | ] 1970 | }, 1971 | { 1972 | "cell_type": "code", 1973 | "execution_count": 33, 1974 | "metadata": { 1975 | "collapsed": true 1976 | }, 1977 | "outputs": [], 1978 | "source": [ 1979 | "frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),\n", 1980 | " columns=list('bde'),\n", 1981 | " index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n", 1982 | "series = frame.iloc[0]" 1983 | ] 1984 | }, 1985 | { 1986 | "cell_type": "code", 1987 | "execution_count": 34, 1988 | "metadata": {}, 1989 | "outputs": [ 1990 | { 1991 | "data": { 1992 | "text/html": [ 1993 | "
\n", 1994 | "\n", 2007 | "\n", 2008 | " \n", 2009 | " \n", 2010 | " \n", 2011 | " \n", 2012 | " \n", 2013 | " \n", 2014 | " \n", 2015 | " \n", 2016 | " \n", 2017 | " \n", 2018 | " \n", 2019 | " \n", 2020 | " \n", 2021 | " \n", 2022 | " \n", 2023 | " \n", 2024 | " \n", 2025 | " \n", 2026 | " \n", 2027 | " \n", 2028 | " \n", 2029 | " \n", 2030 | " \n", 2031 | " \n", 2032 | " \n", 2033 | " \n", 2034 | " \n", 2035 | " \n", 2036 | " \n", 2037 | " \n", 2038 | " \n", 2039 | " \n", 2040 | " \n", 2041 | " \n", 2042 | "
bde
Utah0.01.02.0
Ohio3.04.05.0
Texas6.07.08.0
Oregon9.010.011.0
\n", 2043 | "
" 2044 | ], 2045 | "text/plain": [ 2046 | " b d e\n", 2047 | "Utah 0.0 1.0 2.0\n", 2048 | "Ohio 3.0 4.0 5.0\n", 2049 | "Texas 6.0 7.0 8.0\n", 2050 | "Oregon 9.0 10.0 11.0" 2051 | ] 2052 | }, 2053 | "execution_count": 34, 2054 | "metadata": {}, 2055 | "output_type": "execute_result" 2056 | } 2057 | ], 2058 | "source": [ 2059 | "frame" 2060 | ] 2061 | }, 2062 | { 2063 | "cell_type": "code", 2064 | "execution_count": 35, 2065 | "metadata": {}, 2066 | "outputs": [ 2067 | { 2068 | "data": { 2069 | "text/plain": [ 2070 | "b 0.0\n", 2071 | "d 1.0\n", 2072 | "e 2.0\n", 2073 | "Name: Utah, dtype: float64" 2074 | ] 2075 | }, 2076 | "execution_count": 35, 2077 | "metadata": {}, 2078 | "output_type": "execute_result" 2079 | } 2080 | ], 2081 | "source": [ 2082 | "series" 2083 | ] 2084 | }, 2085 | { 2086 | "cell_type": "markdown", 2087 | "metadata": {}, 2088 | "source": [ 2089 | "可以理解为Series和DataFrame的列匹配。\n", 2090 | "\n", 2091 | "Broadcasting down the rows(向下按行广播)" 2092 | ] 2093 | }, 2094 | { 2095 | "cell_type": "code", 2096 | "execution_count": 36, 2097 | "metadata": {}, 2098 | "outputs": [ 2099 | { 2100 | "data": { 2101 | "text/html": [ 2102 | "
\n", 2103 | "\n", 2116 | "\n", 2117 | " \n", 2118 | " \n", 2119 | " \n", 2120 | " \n", 2121 | " \n", 2122 | " \n", 2123 | " \n", 2124 | " \n", 2125 | " \n", 2126 | " \n", 2127 | " \n", 2128 | " \n", 2129 | " \n", 2130 | " \n", 2131 | " \n", 2132 | " \n", 2133 | " \n", 2134 | " \n", 2135 | " \n", 2136 | " \n", 2137 | " \n", 2138 | " \n", 2139 | " \n", 2140 | " \n", 2141 | " \n", 2142 | " \n", 2143 | " \n", 2144 | " \n", 2145 | " \n", 2146 | " \n", 2147 | " \n", 2148 | " \n", 2149 | " \n", 2150 | " \n", 2151 | "
bde
Utah0.00.00.0
Ohio3.03.03.0
Texas6.06.06.0
Oregon9.09.09.0
\n", 2152 | "
" 2153 | ], 2154 | "text/plain": [ 2155 | " b d e\n", 2156 | "Utah 0.0 0.0 0.0\n", 2157 | "Ohio 3.0 3.0 3.0\n", 2158 | "Texas 6.0 6.0 6.0\n", 2159 | "Oregon 9.0 9.0 9.0" 2160 | ] 2161 | }, 2162 | "execution_count": 36, 2163 | "metadata": {}, 2164 | "output_type": "execute_result" 2165 | } 2166 | ], 2167 | "source": [ 2168 | "frame - series" 2169 | ] 2170 | }, 2171 | { 2172 | "cell_type": "markdown", 2173 | "metadata": {}, 2174 | "source": [ 2175 | "如果Series和DataFrame有不同的index,那么相加结果也是合集:" 2176 | ] 2177 | }, 2178 | { 2179 | "cell_type": "code", 2180 | "execution_count": 37, 2181 | "metadata": {}, 2182 | "outputs": [ 2183 | { 2184 | "data": { 2185 | "text/html": [ 2186 | "
\n", 2187 | "\n", 2200 | "\n", 2201 | " \n", 2202 | " \n", 2203 | " \n", 2204 | " \n", 2205 | " \n", 2206 | " \n", 2207 | " \n", 2208 | " \n", 2209 | " \n", 2210 | " \n", 2211 | " \n", 2212 | " \n", 2213 | " \n", 2214 | " \n", 2215 | " \n", 2216 | " \n", 2217 | " \n", 2218 | " \n", 2219 | " \n", 2220 | " \n", 2221 | " \n", 2222 | " \n", 2223 | " \n", 2224 | " \n", 2225 | " \n", 2226 | " \n", 2227 | " \n", 2228 | " \n", 2229 | " \n", 2230 | " \n", 2231 | " \n", 2232 | " \n", 2233 | " \n", 2234 | " \n", 2235 | " \n", 2236 | " \n", 2237 | " \n", 2238 | " \n", 2239 | " \n", 2240 | "
bdef
Utah0.0NaN3.0NaN
Ohio3.0NaN6.0NaN
Texas6.0NaN9.0NaN
Oregon9.0NaN12.0NaN
\n", 2241 | "
" 2242 | ], 2243 | "text/plain": [ 2244 | " b d e f\n", 2245 | "Utah 0.0 NaN 3.0 NaN\n", 2246 | "Ohio 3.0 NaN 6.0 NaN\n", 2247 | "Texas 6.0 NaN 9.0 NaN\n", 2248 | "Oregon 9.0 NaN 12.0 NaN" 2249 | ] 2250 | }, 2251 | "execution_count": 37, 2252 | "metadata": {}, 2253 | "output_type": "execute_result" 2254 | } 2255 | ], 2256 | "source": [ 2257 | "series2 = pd.Series(range(3), index=['b', 'e', 'f'])\n", 2258 | "frame + series2" 2259 | ] 2260 | }, 2261 | { 2262 | "cell_type": "markdown", 2263 | "metadata": {}, 2264 | "source": [ 2265 | "如果想要广播列,去匹配行,必须要用到算数方法:" 2266 | ] 2267 | }, 2268 | { 2269 | "cell_type": "code", 2270 | "execution_count": 38, 2271 | "metadata": { 2272 | "collapsed": true 2273 | }, 2274 | "outputs": [], 2275 | "source": [ 2276 | "series = frame['d']" 2277 | ] 2278 | }, 2279 | { 2280 | "cell_type": "code", 2281 | "execution_count": 39, 2282 | "metadata": {}, 2283 | "outputs": [ 2284 | { 2285 | "data": { 2286 | "text/html": [ 2287 | "
\n", 2288 | "\n", 2301 | "\n", 2302 | " \n", 2303 | " \n", 2304 | " \n", 2305 | " \n", 2306 | " \n", 2307 | " \n", 2308 | " \n", 2309 | " \n", 2310 | " \n", 2311 | " \n", 2312 | " \n", 2313 | " \n", 2314 | " \n", 2315 | " \n", 2316 | " \n", 2317 | " \n", 2318 | " \n", 2319 | " \n", 2320 | " \n", 2321 | " \n", 2322 | " \n", 2323 | " \n", 2324 | " \n", 2325 | " \n", 2326 | " \n", 2327 | " \n", 2328 | " \n", 2329 | " \n", 2330 | " \n", 2331 | " \n", 2332 | " \n", 2333 | " \n", 2334 | " \n", 2335 | " \n", 2336 | "
bde
Utah-1.00.01.0
Ohio-1.00.01.0
Texas-1.00.01.0
Oregon-1.00.01.0
\n", 2337 | "
" 2338 | ], 2339 | "text/plain": [ 2340 | " b d e\n", 2341 | "Utah -1.0 0.0 1.0\n", 2342 | "Ohio -1.0 0.0 1.0\n", 2343 | "Texas -1.0 0.0 1.0\n", 2344 | "Oregon -1.0 0.0 1.0" 2345 | ] 2346 | }, 2347 | "execution_count": 39, 2348 | "metadata": {}, 2349 | "output_type": "execute_result" 2350 | } 2351 | ], 2352 | "source": [ 2353 | "frame.sub(series, axis='index')" 2354 | ] 2355 | }, 2356 | { 2357 | "cell_type": "markdown", 2358 | "metadata": {}, 2359 | "source": [ 2360 | "axis参数就是用来匹配轴的。在这个例子里是匹配dataframe的row index(axis='index' or axis=0),然后再广播。" 2361 | ] 2362 | }, 2363 | { 2364 | "cell_type": "markdown", 2365 | "metadata": {}, 2366 | "source": [ 2367 | "## 4.函数应用和映射(Function Application and Mapping)" 2368 | ] 2369 | }, 2370 | { 2371 | "cell_type": "markdown", 2372 | "metadata": {}, 2373 | "source": [ 2374 | "numpy的ufuncs(element-wise数组方法)也能用在pandas的object上:" 2375 | ] 2376 | }, 2377 | { 2378 | "cell_type": "code", 2379 | "execution_count": 40, 2380 | "metadata": {}, 2381 | "outputs": [ 2382 | { 2383 | "data": { 2384 | "text/html": [ 2385 | "
\n", 2386 | "\n", 2399 | "\n", 2400 | " \n", 2401 | " \n", 2402 | " \n", 2403 | " \n", 2404 | " \n", 2405 | " \n", 2406 | " \n", 2407 | " \n", 2408 | " \n", 2409 | " \n", 2410 | " \n", 2411 | " \n", 2412 | " \n", 2413 | " \n", 2414 | " \n", 2415 | " \n", 2416 | " \n", 2417 | " \n", 2418 | " \n", 2419 | " \n", 2420 | " \n", 2421 | " \n", 2422 | " \n", 2423 | " \n", 2424 | " \n", 2425 | " \n", 2426 | " \n", 2427 | " \n", 2428 | " \n", 2429 | " \n", 2430 | " \n", 2431 | " \n", 2432 | " \n", 2433 | " \n", 2434 | "
bde
Utah-1.326382-0.6909200.121802
Ohio1.2551000.4968091.017018
Texas0.752331-0.148764-1.549744
Oregon1.0638630.208184-1.328060
\n", 2435 | "
" 2436 | ], 2437 | "text/plain": [ 2438 | " b d e\n", 2439 | "Utah -1.326382 -0.690920 0.121802\n", 2440 | "Ohio 1.255100 0.496809 1.017018\n", 2441 | "Texas 0.752331 -0.148764 -1.549744\n", 2442 | "Oregon 1.063863 0.208184 -1.328060" 2443 | ] 2444 | }, 2445 | "execution_count": 40, 2446 | "metadata": {}, 2447 | "output_type": "execute_result" 2448 | } 2449 | ], 2450 | "source": [ 2451 | "frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), \n", 2452 | " index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n", 2453 | "frame" 2454 | ] 2455 | }, 2456 | { 2457 | "cell_type": "code", 2458 | "execution_count": 41, 2459 | "metadata": {}, 2460 | "outputs": [ 2461 | { 2462 | "data": { 2463 | "text/html": [ 2464 | "
\n", 2465 | "\n", 2478 | "\n", 2479 | " \n", 2480 | " \n", 2481 | " \n", 2482 | " \n", 2483 | " \n", 2484 | " \n", 2485 | " \n", 2486 | " \n", 2487 | " \n", 2488 | " \n", 2489 | " \n", 2490 | " \n", 2491 | " \n", 2492 | " \n", 2493 | " \n", 2494 | " \n", 2495 | " \n", 2496 | " \n", 2497 | " \n", 2498 | " \n", 2499 | " \n", 2500 | " \n", 2501 | " \n", 2502 | " \n", 2503 | " \n", 2504 | " \n", 2505 | " \n", 2506 | " \n", 2507 | " \n", 2508 | " \n", 2509 | " \n", 2510 | " \n", 2511 | " \n", 2512 | " \n", 2513 | "
bde
Utah1.3263820.6909200.121802
Ohio1.2551000.4968091.017018
Texas0.7523310.1487641.549744
Oregon1.0638630.2081841.328060
\n", 2514 | "
" 2515 | ], 2516 | "text/plain": [ 2517 | " b d e\n", 2518 | "Utah 1.326382 0.690920 0.121802\n", 2519 | "Ohio 1.255100 0.496809 1.017018\n", 2520 | "Texas 0.752331 0.148764 1.549744\n", 2521 | "Oregon 1.063863 0.208184 1.328060" 2522 | ] 2523 | }, 2524 | "execution_count": 41, 2525 | "metadata": {}, 2526 | "output_type": "execute_result" 2527 | } 2528 | ], 2529 | "source": [ 2530 | "np.abs(frame)" 2531 | ] 2532 | }, 2533 | { 2534 | "cell_type": "markdown", 2535 | "metadata": {}, 2536 | "source": [ 2537 | "此外,可以把一个用在一维数组上的函数应用在一行或者一列上。\n", 2538 | "\n", 2539 | "用到DataFrame的apply函数:" 2540 | ] 2541 | }, 2542 | { 2543 | "cell_type": "code", 2544 | "execution_count": 42, 2545 | "metadata": {}, 2546 | "outputs": [ 2547 | { 2548 | "data": { 2549 | "text/plain": [ 2550 | "b 2.581482\n", 2551 | "d 1.187729\n", 2552 | "e 2.566762\n", 2553 | "dtype: float64" 2554 | ] 2555 | }, 2556 | "execution_count": 42, 2557 | "metadata": {}, 2558 | "output_type": "execute_result" 2559 | } 2560 | ], 2561 | "source": [ 2562 | "f = lambda x: x.max()-x.min()\n", 2563 | "frame.apply(f)" 2564 | ] 2565 | }, 2566 | { 2567 | "cell_type": "markdown", 2568 | "metadata": {}, 2569 | "source": [ 2570 | "这里函数f,计算的是一个series中最大值和最小值的差,在frame中的每一列,这个函数被调用一次。作为结果的Series,它的index就是frame的column。\n", 2571 | "\n", 2572 | "如果你传入axis='columns'用于apply,那么函数会被用在每一行。\n", 2573 | "\n", 2574 | "传给apply的函数不一定只能返回标量,也可以返回一个含有多个值的Series:" 2575 | ] 2576 | }, 2577 | { 2578 | "cell_type": "code", 2579 | "execution_count": 43, 2580 | "metadata": { 2581 | "collapsed": true 2582 | }, 2583 | "outputs": [], 2584 | "source": [ 2585 | "def f(x):\n", 2586 | " return pd.Series([x.min(), x.max()], index=['min','max'])" 2587 | ] 2588 | }, 2589 | { 2590 | "cell_type": "code", 2591 | "execution_count": 44, 2592 | "metadata": {}, 2593 | "outputs": [ 2594 | { 2595 | "data": { 2596 | "text/html": [ 2597 | "
\n", 2598 | "\n", 2611 | "\n", 2612 | " \n", 2613 | " \n", 2614 | " \n", 2615 | " \n", 2616 | " \n", 2617 | " \n", 2618 | " \n", 2619 | " \n", 2620 | " \n", 2621 | " \n", 2622 | " \n", 2623 | " \n", 2624 | " \n", 2625 | " \n", 2626 | " \n", 2627 | " \n", 2628 | " \n", 2629 | " \n", 2630 | " \n", 2631 | " \n", 2632 | " \n", 2633 | " \n", 2634 | "
bde
min<bound method Series.min of Utah -1.326382...<bound method Series.min of Utah -0.690920...<bound method Series.min of Utah 0.121802...
max<bound method Series.max of Utah -1.326382...<bound method Series.max of Utah -0.690920...<bound method Series.max of Utah 0.121802...
\n", 2635 | "
" 2636 | ], 2637 | "text/plain": [ 2638 | " b \\\n", 2639 | "min \n", 2676 | "\n", 2689 | "\n", 2690 | " \n", 2691 | " \n", 2692 | " \n", 2693 | " \n", 2694 | " \n", 2695 | " \n", 2696 | " \n", 2697 | " \n", 2698 | " \n", 2699 | " \n", 2700 | " \n", 2701 | " \n", 2702 | " \n", 2703 | " \n", 2704 | " \n", 2705 | " \n", 2706 | " \n", 2707 | " \n", 2708 | " \n", 2709 | " \n", 2710 | " \n", 2711 | " \n", 2712 | " \n", 2713 | " \n", 2714 | " \n", 2715 | " \n", 2716 | " \n", 2717 | " \n", 2718 | " \n", 2719 | " \n", 2720 | " \n", 2721 | " \n", 2722 | " \n", 2723 | " \n", 2724 | "
bde
Utah-1.326382-0.6909200.121802
Ohio1.2551000.4968091.017018
Texas0.752331-0.148764-1.549744
Oregon1.0638630.208184-1.328060
\n", 2725 | "" 2726 | ], 2727 | "text/plain": [ 2728 | " b d e\n", 2729 | "Utah -1.326382 -0.690920 0.121802\n", 2730 | "Ohio 1.255100 0.496809 1.017018\n", 2731 | "Texas 0.752331 -0.148764 -1.549744\n", 2732 | "Oregon 1.063863 0.208184 -1.328060" 2733 | ] 2734 | }, 2735 | "execution_count": 45, 2736 | "metadata": {}, 2737 | "output_type": "execute_result" 2738 | } 2739 | ], 2740 | "source": [ 2741 | "format = lambda x:'%2f'%x\n", 2742 | "frame.applymap(format)" 2743 | ] 2744 | }, 2745 | { 2746 | "cell_type": "markdown", 2747 | "metadata": {}, 2748 | "source": [ 2749 | "applymap的做法是,Series有一个map函数,用来实现element-wise函数:" 2750 | ] 2751 | }, 2752 | { 2753 | "cell_type": "code", 2754 | "execution_count": 46, 2755 | "metadata": {}, 2756 | "outputs": [ 2757 | { 2758 | "data": { 2759 | "text/plain": [ 2760 | "Utah 0.121802\n", 2761 | "Ohio 1.017018\n", 2762 | "Texas -1.549744\n", 2763 | "Oregon -1.328060\n", 2764 | "Name: e, dtype: object" 2765 | ] 2766 | }, 2767 | "execution_count": 46, 2768 | "metadata": {}, 2769 | "output_type": "execute_result" 2770 | } 2771 | ], 2772 | "source": [ 2773 | "frame['e'].map(format)" 2774 | ] 2775 | }, 2776 | { 2777 | "cell_type": "markdown", 2778 | "metadata": {}, 2779 | "source": [ 2780 | "## 5.排序(Sorting and Ranking)" 2781 | ] 2782 | }, 2783 | { 2784 | "cell_type": "markdown", 2785 | "metadata": {}, 2786 | "source": [ 2787 | "按row或column index来排序的话,可以用sort_index方法,按照某个axis来排序,并且会返回一个新的object:" 2788 | ] 2789 | }, 2790 | { 2791 | "cell_type": "code", 2792 | "execution_count": 47, 2793 | "metadata": {}, 2794 | "outputs": [ 2795 | { 2796 | "data": { 2797 | "text/html": [ 2798 | "
\n", 2799 | "\n", 2812 | "\n", 2813 | " \n", 2814 | " \n", 2815 | " \n", 2816 | " \n", 2817 | " \n", 2818 | " \n", 2819 | " \n", 2820 | " \n", 2821 | " \n", 2822 | " \n", 2823 | " \n", 2824 | " \n", 2825 | " \n", 2826 | " \n", 2827 | " \n", 2828 | " \n", 2829 | " \n", 2830 | " \n", 2831 | " \n", 2832 | " \n", 2833 | " \n", 2834 | " \n", 2835 | " \n", 2836 | " \n", 2837 | " \n", 2838 | "
dabc
three0123
one4567
\n", 2839 | "
" 2840 | ], 2841 | "text/plain": [ 2842 | " d a b c\n", 2843 | "three 0 1 2 3\n", 2844 | "one 4 5 6 7" 2845 | ] 2846 | }, 2847 | "execution_count": 47, 2848 | "metadata": {}, 2849 | "output_type": "execute_result" 2850 | } 2851 | ], 2852 | "source": [ 2853 | "frame = pd.DataFrame(np.arange(8).reshape((2, 4)),\n", 2854 | " index=['three', 'one'],\n", 2855 | " columns=['d', 'a', 'b', 'c'])\n", 2856 | "frame" 2857 | ] 2858 | }, 2859 | { 2860 | "cell_type": "code", 2861 | "execution_count": 48, 2862 | "metadata": {}, 2863 | "outputs": [ 2864 | { 2865 | "data": { 2866 | "text/html": [ 2867 | "
\n", 2868 | "\n", 2881 | "\n", 2882 | " \n", 2883 | " \n", 2884 | " \n", 2885 | " \n", 2886 | " \n", 2887 | " \n", 2888 | " \n", 2889 | " \n", 2890 | " \n", 2891 | " \n", 2892 | " \n", 2893 | " \n", 2894 | " \n", 2895 | " \n", 2896 | " \n", 2897 | " \n", 2898 | " \n", 2899 | " \n", 2900 | " \n", 2901 | " \n", 2902 | " \n", 2903 | " \n", 2904 | " \n", 2905 | " \n", 2906 | " \n", 2907 | "
dabc
one4567
three0123
\n", 2908 | "
" 2909 | ], 2910 | "text/plain": [ 2911 | " d a b c\n", 2912 | "one 4 5 6 7\n", 2913 | "three 0 1 2 3" 2914 | ] 2915 | }, 2916 | "execution_count": 48, 2917 | "metadata": {}, 2918 | "output_type": "execute_result" 2919 | } 2920 | ], 2921 | "source": [ 2922 | "frame.sort_index()" 2923 | ] 2924 | }, 2925 | { 2926 | "cell_type": "code", 2927 | "execution_count": 49, 2928 | "metadata": {}, 2929 | "outputs": [ 2930 | { 2931 | "data": { 2932 | "text/html": [ 2933 | "
\n", 2934 | "\n", 2947 | "\n", 2948 | " \n", 2949 | " \n", 2950 | " \n", 2951 | " \n", 2952 | " \n", 2953 | " \n", 2954 | " \n", 2955 | " \n", 2956 | " \n", 2957 | " \n", 2958 | " \n", 2959 | " \n", 2960 | " \n", 2961 | " \n", 2962 | " \n", 2963 | " \n", 2964 | " \n", 2965 | " \n", 2966 | " \n", 2967 | " \n", 2968 | " \n", 2969 | " \n", 2970 | " \n", 2971 | " \n", 2972 | " \n", 2973 | "
abcd
three1230
one5674
\n", 2974 | "
" 2975 | ], 2976 | "text/plain": [ 2977 | " a b c d\n", 2978 | "three 1 2 3 0\n", 2979 | "one 5 6 7 4" 2980 | ] 2981 | }, 2982 | "execution_count": 49, 2983 | "metadata": {}, 2984 | "output_type": "execute_result" 2985 | } 2986 | ], 2987 | "source": [ 2988 | "frame.sort_index(axis=1)" 2989 | ] 2990 | }, 2991 | { 2992 | "cell_type": "code", 2993 | "execution_count": 50, 2994 | "metadata": {}, 2995 | "outputs": [ 2996 | { 2997 | "data": { 2998 | "text/html": [ 2999 | "
\n", 3000 | "\n", 3013 | "\n", 3014 | " \n", 3015 | " \n", 3016 | " \n", 3017 | " \n", 3018 | " \n", 3019 | " \n", 3020 | " \n", 3021 | " \n", 3022 | " \n", 3023 | " \n", 3024 | " \n", 3025 | " \n", 3026 | " \n", 3027 | " \n", 3028 | " \n", 3029 | " \n", 3030 | " \n", 3031 | " \n", 3032 | " \n", 3033 | " \n", 3034 | " \n", 3035 | " \n", 3036 | " \n", 3037 | " \n", 3038 | " \n", 3039 | "
dabc
three0123
one4567
\n", 3040 | "
" 3041 | ], 3042 | "text/plain": [ 3043 | " d a b c\n", 3044 | "three 0 1 2 3\n", 3045 | "one 4 5 6 7" 3046 | ] 3047 | }, 3048 | "execution_count": 50, 3049 | "metadata": {}, 3050 | "output_type": "execute_result" 3051 | } 3052 | ], 3053 | "source": [ 3054 | "frame.sort_index(axis=0, ascending=False)" 3055 | ] 3056 | }, 3057 | { 3058 | "cell_type": "markdown", 3059 | "metadata": {}, 3060 | "source": [ 3061 | "通过值来排序,使用sort_values方法:(缺失值会被排在最后)" 3062 | ] 3063 | }, 3064 | { 3065 | "cell_type": "code", 3066 | "execution_count": 51, 3067 | "metadata": {}, 3068 | "outputs": [ 3069 | { 3070 | "data": { 3071 | "text/plain": [ 3072 | "2 -3.0\n", 3073 | "3 2.0\n", 3074 | "0 4.0\n", 3075 | "1 NaN\n", 3076 | "dtype: float64" 3077 | ] 3078 | }, 3079 | "execution_count": 51, 3080 | "metadata": {}, 3081 | "output_type": "execute_result" 3082 | } 3083 | ], 3084 | "source": [ 3085 | "obj = pd.Series([4, np.nan, -3, 2])\n", 3086 | "obj.sort_values()" 3087 | ] 3088 | }, 3089 | { 3090 | "cell_type": "code", 3091 | "execution_count": 52, 3092 | "metadata": {}, 3093 | "outputs": [ 3094 | { 3095 | "data": { 3096 | "text/html": [ 3097 | "
\n", 3098 | "\n", 3111 | "\n", 3112 | " \n", 3113 | " \n", 3114 | " \n", 3115 | " \n", 3116 | " \n", 3117 | " \n", 3118 | " \n", 3119 | " \n", 3120 | " \n", 3121 | " \n", 3122 | " \n", 3123 | " \n", 3124 | " \n", 3125 | " \n", 3126 | " \n", 3127 | " \n", 3128 | " \n", 3129 | " \n", 3130 | " \n", 3131 | " \n", 3132 | " \n", 3133 | " \n", 3134 | " \n", 3135 | " \n", 3136 | " \n", 3137 | "
dabc
three0123
one4567
\n", 3138 | "
" 3139 | ], 3140 | "text/plain": [ 3141 | " d a b c\n", 3142 | "three 0 1 2 3\n", 3143 | "one 4 5 6 7" 3144 | ] 3145 | }, 3146 | "execution_count": 52, 3147 | "metadata": {}, 3148 | "output_type": "execute_result" 3149 | } 3150 | ], 3151 | "source": [ 3152 | "frame.sort_values(by=['a', 'b'])" 3153 | ] 3154 | }, 3155 | { 3156 | "cell_type": "markdown", 3157 | "metadata": {}, 3158 | "source": [ 3159 | "rank(略)" 3160 | ] 3161 | }, 3162 | { 3163 | "cell_type": "markdown", 3164 | "metadata": {}, 3165 | "source": [ 3166 | "## 6.有重复label的轴索引(Axis Indexes with Duplicate Labels)" 3167 | ] 3168 | }, 3169 | { 3170 | "cell_type": "markdown", 3171 | "metadata": {}, 3172 | "source": [ 3173 | "有一些有重复索引:" 3174 | ] 3175 | }, 3176 | { 3177 | "cell_type": "code", 3178 | "execution_count": 53, 3179 | "metadata": {}, 3180 | "outputs": [ 3181 | { 3182 | "data": { 3183 | "text/plain": [ 3184 | "a 0\n", 3185 | "a 1\n", 3186 | "b 2\n", 3187 | "b 3\n", 3188 | "c 4\n", 3189 | "dtype: int32" 3190 | ] 3191 | }, 3192 | "execution_count": 53, 3193 | "metadata": {}, 3194 | "output_type": "execute_result" 3195 | } 3196 | ], 3197 | "source": [ 3198 | "obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])\n", 3199 | "obj" 3200 | ] 3201 | }, 3202 | { 3203 | "cell_type": "code", 3204 | "execution_count": 54, 3205 | "metadata": {}, 3206 | "outputs": [ 3207 | { 3208 | "data": { 3209 | "text/plain": [ 3210 | "False" 3211 | ] 3212 | }, 3213 | "execution_count": 54, 3214 | "metadata": {}, 3215 | "output_type": "execute_result" 3216 | } 3217 | ], 3218 | "source": [ 3219 | "obj.index.is_unique" 3220 | ] 3221 | }, 3222 | { 3223 | "cell_type": "markdown", 3224 | "metadata": {}, 3225 | "source": [ 3226 | "数据选择时,对于Series,如果一个label有多个值,返回一个Series,反之返回一个标量。\n", 3227 | " 对于DataFrame,如果一个label有多行/列,返回一个DataFrame。" 3228 | ] 3229 | }, 3230 | { 3231 | "cell_type": "code", 3232 | "execution_count": 55, 3233 | "metadata": {}, 3234 | "outputs": [ 3235 | { 3236 | "data": { 3237 | "text/plain": [ 3238 | "a 0\n", 3239 | 
"a 1\n", 3240 | "dtype: int32" 3241 | ] 3242 | }, 3243 | "execution_count": 55, 3244 | "metadata": {}, 3245 | "output_type": "execute_result" 3246 | } 3247 | ], 3248 | "source": [ 3249 | "obj['a']" 3250 | ] 3251 | } 3252 | ], 3253 | "metadata": { 3254 | "kernelspec": { 3255 | "display_name": "Python 3", 3256 | "language": "python", 3257 | "name": "python3" 3258 | }, 3259 | "language_info": { 3260 | "codemirror_mode": { 3261 | "name": "ipython", 3262 | "version": 3 3263 | }, 3264 | "file_extension": ".py", 3265 | "mimetype": "text/x-python", 3266 | "name": "python", 3267 | "nbconvert_exporter": "python", 3268 | "pygments_lexer": "ipython3", 3269 | "version": "3.6.3" 3270 | }, 3271 | "widgets": { 3272 | "application/vnd.jupyter.widget-state+json": { 3273 | "state": {}, 3274 | "version_major": 2, 3275 | "version_minor": 0 3276 | } 3277 | } 3278 | }, 3279 | "nbformat": 4, 3280 | "nbformat_minor": 2 3281 | } 3282 | -------------------------------------------------------------------------------- /tools/pandas_test.py: -------------------------------------------------------------------------------- 1 | #-*- coding: utf-8 -*- 2 | 3 | import pandas as pd 4 | 5 | s = pd.Series([1,2,3], index=['a', 'b', 'c']) 6 | d = pd.DataFrame([[1,2,3], [3,4,5]],columns = ['a','b','c']) 7 | ds = pd.DataFrame(s) 8 | 9 | print(d.head()) 10 | print(d.describe()) 11 | 12 | print(s) 13 | --------------------------------------------------------------------------------