├── .gitignore ├── LICENSE ├── README.md ├── appendixC.ipynb ├── ch02.ipynb ├── ch03.ipynb ├── ch04.ipynb ├── ch05.ipynb ├── ch06.ipynb ├── ch07.ipynb ├── ch08.ipynb ├── ch09.ipynb ├── ch11.ipynb ├── ch12.ipynb ├── conda ├── xl310.yml └── xl38.yml ├── csv ├── AAPL.csv ├── AMZN.csv ├── GOOGL.csv └── MSFT.csv ├── debugging.py ├── environment.yml ├── excel.py ├── images ├── cover.png ├── python.bmp └── python.png ├── packagetracker ├── database.py ├── packagetracker.db ├── packagetracker.py └── packagetracker.xlsm ├── parallel_openpyxl.py ├── parallel_pandas.py ├── parallel_xlrd.py ├── pep8_sample.py ├── requirements.txt ├── sales_data ├── existing │ ├── April.xls │ ├── August.xls │ ├── December.xls │ ├── February.xls │ ├── January.xls │ ├── July.xls │ ├── June.xls │ ├── March.xls │ ├── May.xls │ ├── November.xls │ ├── October.xls │ └── September.xls └── new │ ├── April.xlsx │ ├── August.xlsx │ ├── December.xlsx │ ├── February.xlsx │ ├── January.xlsx │ ├── July.xlsx │ ├── June.xlsx │ ├── March.xlsx │ ├── May.xlsx │ ├── November.xlsx │ ├── October.xlsx │ └── September.xlsx ├── sales_report_openpyxl.py ├── sales_report_pandas.py ├── sales_report_xlsxwriter.py ├── sales_report_xlwings.py ├── temperature.py ├── udfs ├── describe │ ├── describe.py │ └── describe.xlsm ├── first_udf │ ├── first_udf.py │ └── first_udf.xlsm ├── google_trends │ ├── google_trends.py │ └── google_trends.xlsm ├── google_trends_cache │ ├── google_trends_cache.py │ └── google_trends_cache.xlsm ├── importsub │ ├── importsub.py │ └── importsub.xlsm ├── raw_values │ ├── raw_values.py │ └── raw_values.xlsm └── revenues │ ├── revenues.py │ └── revenues.xlsm └── xl ├── array_calculations.xlsx ├── big.xlsx ├── course_participants.xlsx ├── currency_converter.xlsx ├── macro.xlsm ├── sales_report_template.xlsx ├── stores.xls ├── stores.xlsb ├── stores.xlsx ├── vba.xlsm └── vbaProject.bin /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | 
.DS_Store 3 | ~$*.xls* 4 | *.pyc -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Zoomer Analytics GmbH 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python for Excel (O'Reilly, 2021) 2 | 3 | 4 | 5 | This is the companion repository for the O'Reilly book [Python for Excel](https://learning.oreilly.com/library/view/python-for-excel/9781492080992/). 
6 | 7 | All notebooks can be run in the cloud except `ch09.ipynb` (requires a local installation of Excel): 8 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/fzumstein/python-for-excel/1st-edition?urlpath=tree) 9 | -------------------------------------------------------------------------------- /appendixC.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Appendix C\n", 8 | "## Classes and Objects" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": { 15 | "pycharm": { 16 | "name": "#%%\n" 17 | } 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "class Car:\n", 22 | " def __init__(self, color, speed=0):\n", 23 | " self.color = color\n", 24 | " self.speed = speed\n", 25 | "\n", 26 | " def accelerate(self, mph):\n", 27 | " self.speed += mph" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "pycharm": { 35 | "name": "#%%\n" 36 | } 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "# Let's instantiate two car objects\n", 41 | "car1 = Car(\"red\")\n", 42 | "car2 = Car(color=\"blue\")" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "pycharm": { 50 | "name": "#%%\n" 51 | } 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "# By default, an object prints its memory location\n", 56 | "car1" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "pycharm": { 64 | "name": "#%%\n" 65 | } 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "# Attributes give you access to the data of an object\n", 70 | "car1.color" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "pycharm": { 78 | "name": "#%%\n" 79 | } 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "car1.speed" 84 | ] 
85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": { 90 | "pycharm": { 91 | "name": "#%%\n" 92 | } 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "# Calling the accelerate method on car1\n", 97 | "car1.accelerate(20)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": { 104 | "pycharm": { 105 | "name": "#%%\n" 106 | } 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "# The speed attribute of car1 changed\n", 111 | "car1.speed" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "pycharm": { 119 | "name": "#%%\n" 120 | } 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "# The speed attribute of car2 remained the same\n", 125 | "car2.speed" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "pycharm": { 133 | "name": "#%%\n" 134 | } 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "car1.color = \"green\"" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": { 145 | "pycharm": { 146 | "name": "#%%\n" 147 | } 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "car1.color" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": { 158 | "pycharm": { 159 | "name": "#%%\n" 160 | } 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "car2.color # unchanged" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Working with time-zone-aware datetime objects" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "pycharm": { 179 | "name": "#%%\n" 180 | } 181 | }, 182 | "outputs": [], 183 | "source": [ 184 | "import datetime as dt\n", 185 | "from dateutil import tz" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | 
"metadata": { 192 | "pycharm": { 193 | "name": "#%%\n" 194 | } 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "# Time-zone-naive datetime object\n", 199 | "timestamp = dt.datetime(2020, 1, 31, 14, 30)\n", 200 | "timestamp.isoformat()" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "pycharm": { 208 | "name": "#%%\n" 209 | } 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "# Time-zone-aware datetime object\n", 214 | "timestamp_eastern = dt.datetime(2020, 1, 31, 14, 30,\n", 215 | " tzinfo=tz.gettz(\"US/Eastern\"))\n", 216 | "# Printing in isoformat makes it easy to\n", 217 | "# see the offset from UTC\n", 218 | "timestamp_eastern.isoformat()" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "pycharm": { 226 | "name": "#%%\n" 227 | } 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "# Assign a time zone to a naive datetime object\n", 232 | "timestamp_eastern = timestamp.replace(tzinfo=tz.gettz(\"US/Eastern\"))\n", 233 | "timestamp_eastern.isoformat()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "pycharm": { 241 | "name": "#%%\n" 242 | } 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "# Convert from one time zone to another.\n", 247 | "# Since the UTC time zone is so common,\n", 248 | "# there is a shortcut: tz.UTC\n", 249 | "timestamp_utc = timestamp_eastern.astimezone(tz.UTC)\n", 250 | "timestamp_utc.isoformat()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "metadata": { 257 | "pycharm": { 258 | "name": "#%%\n" 259 | } 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "# From time-zone-aware to naive\n", 264 | "timestamp_eastern.replace(tzinfo=None)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": { 271 | "pycharm": { 272 | "name": "#%%\n" 273 | } 274 | }, 275 
| "outputs": [], 276 | "source": [ 277 | "# Current time without time zone\n", 278 | "dt.datetime.now()" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "pycharm": { 286 | "name": "#%%\n" 287 | } 288 | }, 289 | "outputs": [], 290 | "source": [ 291 | "# Current time in UTC time zone\n", 292 | "dt.datetime.now(tz.UTC)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": {}, 298 | "source": [ 299 | "## Mutable vs. Immutable Objects" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": { 306 | "pycharm": { 307 | "name": "#%%\n" 308 | } 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "a = [1, 2, 3]\n", 313 | "b = a\n", 314 | "a[1] = 22\n", 315 | "print(a)\n", 316 | "print(b)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "pycharm": { 324 | "name": "#%%\n" 325 | } 326 | }, 327 | "outputs": [], 328 | "source": [ 329 | "a = [1, 2, 3]\n", 330 | "b = a.copy()" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": { 337 | "pycharm": { 338 | "name": "#%%\n" 339 | } 340 | }, 341 | "outputs": [], 342 | "source": [ 343 | "a" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "pycharm": { 351 | "name": "#%%\n" 352 | } 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "b" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "metadata": { 363 | "pycharm": { 364 | "name": "#%%\n" 365 | } 366 | }, 367 | "outputs": [], 368 | "source": [ 369 | "a[1] = 22 # Changing \"a\"..." 
370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "pycharm": { 377 | "name": "#%%\n" 378 | } 379 | }, 380 | "outputs": [], 381 | "source": [ 382 | "a" 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "metadata": { 389 | "pycharm": { 390 | "name": "#%%\n" 391 | } 392 | }, 393 | "outputs": [], 394 | "source": [ 395 | "b # ...doesn't affect \"b\"" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": { 402 | "pycharm": { 403 | "name": "#%%\n" 404 | } 405 | }, 406 | "outputs": [], 407 | "source": [ 408 | "import copy\n", 409 | "b = copy.deepcopy(a)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": { 416 | "pycharm": { 417 | "name": "#%%\n" 418 | } 419 | }, 420 | "outputs": [], 421 | "source": [ 422 | "def increment(x):\n", 423 | " x = x + 1\n", 424 | " return x" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": { 431 | "pycharm": { 432 | "name": "#%%\n" 433 | } 434 | }, 435 | "outputs": [], 436 | "source": [ 437 | "a = 1\n", 438 | "print(increment(a))\n", 439 | "print(a)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "metadata": { 446 | "pycharm": { 447 | "name": "#%%\n" 448 | } 449 | }, 450 | "outputs": [], 451 | "source": [ 452 | "def increment(x):\n", 453 | " x[0] = x[0] + 1\n", 454 | " return x" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": { 461 | "pycharm": { 462 | "name": "#%%\n" 463 | } 464 | }, 465 | "outputs": [], 466 | "source": [ 467 | "a = [1]\n", 468 | "print(increment(a))\n", 469 | "print(a)" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "metadata": { 476 | "pycharm": { 477 | "name": "#%%\n" 478 | } 479 | }, 480 | "outputs": [], 481 | "source": [ 482 | "a = [1]\n", 483 | 
"print(increment(a.copy()))\n", 484 | "print(a)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "metadata": { 491 | "pycharm": { 492 | "name": "#%%\n" 493 | } 494 | }, 495 | "outputs": [], 496 | "source": [ 497 | "# Don't do this:\n", 498 | "def add_one(x=[]):\n", 499 | " x.append(1)\n", 500 | " return x" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "pycharm": { 508 | "name": "#%%\n" 509 | } 510 | }, 511 | "outputs": [], 512 | "source": [ 513 | "add_one()" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "metadata": { 520 | "pycharm": { 521 | "name": "#%%\n" 522 | } 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "add_one()" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": { 533 | "pycharm": { 534 | "name": "#%%\n" 535 | } 536 | }, 537 | "outputs": [], 538 | "source": [ 539 | "def add_one(x=None):\n", 540 | " if x is None:\n", 541 | " x = []\n", 542 | " x.append(1)\n", 543 | " return x" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": { 550 | "pycharm": { 551 | "name": "#%%\n" 552 | } 553 | }, 554 | "outputs": [], 555 | "source": [ 556 | "add_one()" 557 | ] 558 | }, 559 | { 560 | "cell_type": "code", 561 | "execution_count": null, 562 | "metadata": { 563 | "pycharm": { 564 | "name": "#%%\n" 565 | } 566 | }, 567 | "outputs": [], 568 | "source": [ 569 | "add_one()" 570 | ] 571 | } 572 | ], 573 | "metadata": { 574 | "kernelspec": { 575 | "display_name": "Python 3", 576 | "language": "python", 577 | "name": "python3" 578 | }, 579 | "language_info": { 580 | "codemirror_mode": { 581 | "name": "ipython", 582 | "version": 3 583 | }, 584 | "file_extension": ".py", 585 | "mimetype": "text/x-python", 586 | "name": "python", 587 | "nbconvert_exporter": "python", 588 | "pygments_lexer": "ipython3", 589 | "version": "3.7.4" 590 
| } 591 | }, 592 | "nbformat": 4, 593 | "nbformat_minor": 4 594 | } 595 | -------------------------------------------------------------------------------- /ch02.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "3 + 4" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# This is a first-level heading\n", 17 | "\n", 18 | "## This is a second-level heading\n", 19 | "\n", 20 | "You can make your text *italic* or **bold** or `monospaced`.\n", 21 | "\n", 22 | "* This is a bullet point\n", 23 | "* This is another bullet point" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Run Order Matters" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "a = 1" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "a" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "a = 2" 58 | ] 59 | } 60 | ], 61 | "metadata": { 62 | "kernelspec": { 63 | "display_name": "Python 3", 64 | "language": "python", 65 | "name": "python3" 66 | }, 67 | "language_info": { 68 | "codemirror_mode": { 69 | "name": "ipython", 70 | "version": 3 71 | }, 72 | "file_extension": ".py", 73 | "mimetype": "text/x-python", 74 | "name": "python", 75 | "nbconvert_exporter": "python", 76 | "pygments_lexer": "ipython3", 77 | "version": "3.7.4" 78 | } 79 | }, 80 | "nbformat": 4, 81 | "nbformat_minor": 4 82 | } 83 | -------------------------------------------------------------------------------- /ch04.ipynb: -------------------------------------------------------------------------------- 1 | { 
2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Foundations: NumPy\n", 8 | "## NumPy Array" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "matrix = [[1, 2, 3],\n", 18 | " [4, 5, 6],\n", 19 | " [7, 8, 9]]" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "[[i + 1 for i in row] for row in matrix]" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# First, let's import NumPy\n", 38 | "import numpy as np" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# Constructing an array with a simple list results in a 1d array\n", 48 | "array1 = np.array([10, 100, 1000.])" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "# Constructing an array with a nested list results in a 2d array\n", 58 | "array2 = np.array([[1., 2., 3.],\n", 59 | " [4., 5., 6.]])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "array1.dtype" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "float(array1[0])" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "## Vectorization and Broadcasting" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "array2 + 1" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 
102 | "array2 * array2" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "array2 * array1" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "array2 @ array2.T # array2.T is a shortcut for array2.transpose()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "## Universal Functions (ufunc)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "import math" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "math.sqrt(array2) # This will raise en Error" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "np.array([[math.sqrt(i) for i in row] for row in array2])" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "np.sqrt(array2)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "array2.sum(axis=0) # Returns a 1d array" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "array2.sum()" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## Getting and Setting Array Elements" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "array1[2] # Returns a scalar" 198 | ] 199 | }, 200 | { 201 | 
"cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "array2[0, 0] # Returns a scalar" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "array2[:, 1:] # Returns a 2d array" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "array2[:, 1] # Returns a 1d array" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "array2[1, :2] # Returns a 1d array" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "## Useful Array Constructors" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "np.arange(2 * 5).reshape(2, 5) # 2 rows, 5 columns" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "np.random.randn(2, 3) # 2 rows, 3 columns" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "## View vs. 
Copy" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "array2" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "subset = array2[:, :2]\n", 284 | "subset" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "metadata": {}, 291 | "outputs": [], 292 | "source": [ 293 | "subset[0, 0] = 1000" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "subset" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "array2" 312 | ] 313 | } 314 | ], 315 | "metadata": { 316 | "kernelspec": { 317 | "display_name": "Python 3", 318 | "language": "python", 319 | "name": "python3" 320 | }, 321 | "language_info": { 322 | "codemirror_mode": { 323 | "name": "ipython", 324 | "version": 3 325 | }, 326 | "file_extension": ".py", 327 | "mimetype": "text/x-python", 328 | "name": "python", 329 | "nbconvert_exporter": "python", 330 | "pygments_lexer": "ipython3", 331 | "version": "3.7.4" 332 | } 333 | }, 334 | "nbformat": 4, 335 | "nbformat_minor": 4 336 | } 337 | -------------------------------------------------------------------------------- /ch05.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# DataFrame and Series" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | 
"pd.read_excel(\"xl/course_participants.xlsx\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "data = [[\"Mark\", 55, \"Italy\", 4.5, \"Europe\"],\n", 35 | " [\"John\", 33, \"USA\", 6.7, \"America\"],\n", 36 | " [\"Tim\", 41, \"USA\", 3.9, \"America\"],\n", 37 | " [\"Jenny\", 12, \"Germany\", 9.0, \"Europe\"]]\n", 38 | "df = pd.DataFrame(data=data,\n", 39 | " columns=[\"name\", \"age\", \"country\",\n", 40 | " \"score\", \"continent\"],\n", 41 | " index=[1001, 1000, 1002, 1003])\n", 42 | "df" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "df.info()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Index" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "df.index" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "df.index.name = \"user_id\"\n", 77 | "df" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "# \"reset_index\" turns the index into a column, replacing the\n", 87 | "# index with the default index. 
This corresponds to the DataFrame\n", 88 | "# from the beginning that we loaded from Excel.\n", 89 | "df.reset_index()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# \"reset_index\" turns \"user_id\" into a regular column and\n", 99 | "# \"set_index\" turns the column \"name\" into the index\n", 100 | "df.reset_index().set_index(\"name\")" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "df.reindex([999, 1000, 1001, 1004])" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "df.sort_index()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "df.sort_values([\"continent\", \"age\"])" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Columns" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "df.columns" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "df.columns.name = \"properties\"\n", 153 | "df" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "df.rename(columns={\"name\": \"First Name\", \"age\": \"Age\"})" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "df.drop(columns=[\"name\", \"country\"],\n", 172 | " index=[1000, 1003])" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | 
"metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "df.T # Shortcut for df.transpose()" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "df.loc[:, [\"continent\", \"country\", \"name\", \"age\", \"score\"]]" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "# Data Manipulation\n", 198 | "## Selecting Data" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# Using scalars for both row and column selection returns a scalar\n", 208 | "df.loc[1001, \"name\"]" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "# Using a scalar on either the row or column selection returns a Series\n", 218 | "df.loc[[1001, 1002], \"age\"]" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "# Selecting multiple rows and columns returns a DataFrame\n", 228 | "df.loc[:1002, [\"name\", \"country\"]]" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "df.iloc[0, 0] # Returns a Scalar" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "df.iloc[[0, 2], 1] # Returns a Series" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "df.iloc[:3, [0, 2]] # Returns a DataFrame" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "tf = (df[\"age\"] > 40) & 
(df[\"country\"] == \"USA\")\n", 265 | "tf # This is a Series with only True/False" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "df.loc[tf, :]" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "df.loc[df.index > 1001, :]" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "df.loc[df[\"country\"].isin([\"Italy\", \"Germany\"]), :]" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "# This could be the yearly rainfall in millimeters\n", 302 | "rainfall = pd.DataFrame(data={\"City 1\": [300.1, 100.2],\n", 303 | " \"City 2\": [400.3, 300.4],\n", 304 | " \"City 3\": [1000.5, 1100.6]})\n", 305 | "rainfall" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "rainfall < 400" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "rainfall[rainfall < 400]" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "# A MultiIndex needs to be sorted\n", 333 | "df_multi = df.reset_index().set_index([\"continent\", \"country\"])\n", 334 | "df_multi = df_multi.sort_index()\n", 335 | "df_multi" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "df_multi.loc[\"Europe\", :]" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": {}, 351 | "outputs": [], 352 | 
"source": [ 353 | "df_multi.loc[(\"Europe\", \"Italy\"), :]" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "df_multi.reset_index(level=0)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "## Setting Data" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "# Copy the DataFrame first to leave the original untouched\n", 379 | "df2 = df.copy()" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "df2.loc[1000, \"name\"] = \"JOHN\"\n", 389 | "df2" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "df2.loc[[1000, 1001], \"score\"] = [3, 4]\n", 399 | "df2" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "tf = (df2[\"age\"] < 20) | (df2[\"country\"] == \"USA\")\n", 409 | "df2.loc[tf, \"name\"] = \"xxx\"\n", 410 | "df2" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": null, 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "# Copy the DataFrame first to leave the original untouched\n", 420 | "rainfall2 = rainfall.copy()\n", 421 | "rainfall2" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": null, 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "# Set the values to 0 wherever they are below 400\n", 431 | "rainfall2[rainfall2 < 400] = 0\n", 432 | "rainfall2" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": {}, 439 | "outputs": [], 440 | "source": [ 441 | "df2.replace(\"USA\", \"U.S.\")" 
442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "df2.loc[:, \"discount\"] = 0\n", 451 | "df2.loc[:, \"price\"] = [49.9, 49.9, 99.9, 99.9]\n", 452 | "df2" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "df2 = df.copy() # Let's start with a fresh copy\n", 462 | "df2.loc[:, \"birth year\"] = 2021 - df2[\"age\"]\n", 463 | "df2" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "## Missing Data" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": null, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "df2 = df.copy() # Let's start with a fresh copy\n", 480 | "df2.loc[1000, \"score\"] = None\n", 481 | "df2.loc[1003, :] = None\n", 482 | "df2" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "df2.dropna()" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "df2.dropna(how=\"all\")" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "df2.isna()" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "df2.fillna({\"score\": df2[\"score\"].mean()})" 519 | ] 520 | }, 521 | { 522 | "cell_type": "markdown", 523 | "metadata": {}, 524 | "source": [ 525 | "## Duplicate Data" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "metadata": {}, 532 | "outputs": [], 533 | "source": [ 534 | "df.drop_duplicates([\"country\", \"continent\"])" 535 | ] 536 | }, 537 | { 538 
| "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "df[\"country\"].is_unique" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "df[\"country\"].unique()" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "# By default, it marks only duplicates as True, i.e.\n", 562 | "# without the first occurrence\n", 563 | "df[\"country\"].duplicated()" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "metadata": {}, 570 | "outputs": [], 571 | "source": [ 572 | "# To get all rows where \"country\" is duplicated, use\n", 573 | "# keep=False\n", 574 | "df.loc[df[\"country\"].duplicated(keep=False), :]" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": {}, 580 | "source": [ 581 | "## Arithmetic Operations" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": null, 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "rainfall" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": null, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "rainfall + 100" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "metadata": {}, 606 | "outputs": [], 607 | "source": [ 608 | "more_rainfall = pd.DataFrame(data=[[100, 200], [300, 400]],\n", 609 | " index=[1, 2],\n", 610 | " columns=[\"City 1\", \"City 4\"])\n", 611 | "more_rainfall" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": null, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "rainfall + more_rainfall" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | 
"rainfall.add(more_rainfall, fill_value=0)" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": null, 635 | "metadata": {}, 636 | "outputs": [], 637 | "source": [ 638 | "# A Series taken from a row\n", 639 | "rainfall.loc[1, :]" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "rainfall + rainfall.loc[1, :]" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": null, 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [ 657 | "# A Series taken from a column\n", 658 | "rainfall.loc[:, \"City 2\"]" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": null, 664 | "metadata": {}, 665 | "outputs": [], 666 | "source": [ 667 | "rainfall.add(rainfall.loc[:, \"City 2\"], axis=0)" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": null, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "# Let's create a new DataFrame\n", 677 | "users = pd.DataFrame(data=[\" mArk \", \"JOHN \", \"Tim\", \" jenny\"],\n", 678 | " columns=[\"name\"])\n", 679 | "users" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": null, 685 | "metadata": {}, 686 | "outputs": [], 687 | "source": [ 688 | "users_cleaned = users.loc[:, \"name\"].str.strip().str.capitalize()\n", 689 | "users_cleaned" 690 | ] 691 | }, 692 | { 693 | "cell_type": "code", 694 | "execution_count": null, 695 | "metadata": {}, 696 | "outputs": [], 697 | "source": [ 698 | "users_cleaned.str.startswith(\"J\")" 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "metadata": {}, 704 | "source": [ 705 | "## Applying a Function" 706 | ] 707 | }, 708 | { 709 | "cell_type": "code", 710 | "execution_count": null, 711 | "metadata": {}, 712 | "outputs": [], 713 | "source": [ 714 | "rainfall" 715 | ] 716 | }, 717 | { 718 | "cell_type": "code", 719 | "execution_count": null, 720 | "metadata": 
{}, 721 | "outputs": [], 722 | "source": [ 723 | "def format_string(x):\n", 724 | " return f\"{x:,.2f}\"" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": null, 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [ 733 | "# Note that we pass in the function without calling it,\n", 734 | "# i.e., format_string and not format_string()!\n", 735 | "rainfall.applymap(format_string)" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": {}, 742 | "outputs": [], 743 | "source": [ 744 | "rainfall.applymap(lambda x: f\"{x:,.2f}\")" 745 | ] 746 | }, 747 | { 748 | "cell_type": "markdown", 749 | "metadata": {}, 750 | "source": [ 751 | "# Combining DataFrames\n", 752 | "## Concatenating" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": null, 758 | "metadata": {}, 759 | "outputs": [], 760 | "source": [ 761 | "data = [[15, \"France\", 4.1, \"Becky\"],\n", 762 | " [44, \"Canada\", 6.1, \"Leanne\"]]\n", 763 | "more_users = pd.DataFrame(data=data,\n", 764 | " columns=[\"age\", \"country\", \"score\", \"name\"],\n", 765 | " index=[1000, 1011])\n", 766 | "more_users" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": null, 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [ 775 | "pd.concat([df, more_users], axis=0)" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [ 784 | "data = [[3, 4],\n", 785 | " [5, 6]]\n", 786 | "more_categories = pd.DataFrame(data=data,\n", 787 | " columns=[\"quizzes\", \"logins\"],\n", 788 | " index=[1000, 2000])\n", 789 | "more_categories" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": null, 795 | "metadata": {}, 796 | "outputs": [], 797 | "source": [ 798 | "pd.concat([df, more_categories], axis=1)" 799 | ] 800 | }, 801 | { 802 | "cell_type": "markdown", 803 | "metadata": {}, 804 | "source": [ 
805 | "## Joining and Merging" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": null, 811 | "metadata": {}, 812 | "outputs": [], 813 | "source": [ 814 | "df1 = pd.DataFrame(data=[[1, 2], [3, 4], [5, 6]],\n", 815 | " columns=[\"A\", \"B\"])\n", 816 | "df1" 817 | ] 818 | }, 819 | { 820 | "cell_type": "code", 821 | "execution_count": null, 822 | "metadata": {}, 823 | "outputs": [], 824 | "source": [ 825 | "df2 = pd.DataFrame(data=[[10, 20], [30, 40]],\n", 826 | " columns=[\"C\", \"D\"], index=[1, 3])\n", 827 | "df2" 828 | ] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": null, 833 | "metadata": {}, 834 | "outputs": [], 835 | "source": [ 836 | "df1.join(df2, how=\"inner\")" 837 | ] 838 | }, 839 | { 840 | "cell_type": "code", 841 | "execution_count": null, 842 | "metadata": {}, 843 | "outputs": [], 844 | "source": [ 845 | "df1.join(df2, how=\"left\")" 846 | ] 847 | }, 848 | { 849 | "cell_type": "code", 850 | "execution_count": null, 851 | "metadata": {}, 852 | "outputs": [], 853 | "source": [ 854 | "df1.join(df2, how=\"right\")" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "metadata": {}, 861 | "outputs": [], 862 | "source": [ 863 | "df1.join(df2, how=\"outer\")" 864 | ] 865 | }, 866 | { 867 | "cell_type": "code", 868 | "execution_count": null, 869 | "metadata": {}, 870 | "outputs": [], 871 | "source": [ 872 | "# Add a column called \"category\" to both DataFrames\n", 873 | "df1[\"category\"] = [\"a\", \"b\", \"c\"]\n", 874 | "df2[\"category\"] = [\"c\", \"b\"]" 875 | ] 876 | }, 877 | { 878 | "cell_type": "code", 879 | "execution_count": null, 880 | "metadata": {}, 881 | "outputs": [], 882 | "source": [ 883 | "df1" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": null, 889 | "metadata": {}, 890 | "outputs": [], 891 | "source": [ 892 | "df2" 893 | ] 894 | }, 895 | { 896 | "cell_type": "code", 897 | "execution_count": null, 898 | "metadata": {}, 899 | 
"outputs": [], 900 | "source": [ 901 | "df1.merge(df2, how=\"inner\", on=[\"category\"])" 902 | ] 903 | }, 904 | { 905 | "cell_type": "code", 906 | "execution_count": null, 907 | "metadata": {}, 908 | "outputs": [], 909 | "source": [ 910 | "df1.merge(df2, how=\"left\", on=[\"category\"])" 911 | ] 912 | }, 913 | { 914 | "cell_type": "markdown", 915 | "metadata": {}, 916 | "source": [ 917 | "# Data Aggregation and Descriptive Statistics\n", 918 | "## Descriptive Statistics" 919 | ] 920 | }, 921 | { 922 | "cell_type": "code", 923 | "execution_count": null, 924 | "metadata": {}, 925 | "outputs": [], 926 | "source": [ 927 | "rainfall" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": null, 933 | "metadata": {}, 934 | "outputs": [], 935 | "source": [ 936 | "rainfall.mean()" 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": null, 942 | "metadata": {}, 943 | "outputs": [], 944 | "source": [ 945 | "rainfall.mean(axis=1)" 946 | ] 947 | }, 948 | { 949 | "cell_type": "markdown", 950 | "metadata": {}, 951 | "source": [ 952 | "## Grouping" 953 | ] 954 | }, 955 | { 956 | "cell_type": "code", 957 | "execution_count": null, 958 | "metadata": {}, 959 | "outputs": [], 960 | "source": [ 961 | "df.groupby([\"continent\"]).mean()" 962 | ] 963 | }, 964 | { 965 | "cell_type": "code", 966 | "execution_count": null, 967 | "metadata": {}, 968 | "outputs": [], 969 | "source": [ 970 | "df.groupby([\"continent\", \"country\"]).mean()" 971 | ] 972 | }, 973 | { 974 | "cell_type": "code", 975 | "execution_count": null, 976 | "metadata": {}, 977 | "outputs": [], 978 | "source": [ 979 | "selection = df.loc[:, [\"age\", \"score\", \"continent\"]]\n", 980 | "selection.groupby([\"continent\"]).agg(lambda x: x.max() - x.min())" 981 | ] 982 | }, 983 | { 984 | "cell_type": "markdown", 985 | "metadata": {}, 986 | "source": [ 987 | "## Pivoting and Melting" 988 | ] 989 | }, 990 | { 991 | "cell_type": "code", 992 | "execution_count": null, 993 | 
"metadata": {}, 994 | "outputs": [], 995 | "source": [ 996 | "data = [[\"Oranges\", \"North\", 12.30],\n", 997 | " [\"Apples\", \"South\", 10.55],\n", 998 | " [\"Oranges\", \"South\", 22.00],\n", 999 | " [\"Bananas\", \"South\", 5.90],\n", 1000 | " [\"Bananas\", \"North\", 31.30],\n", 1001 | " [\"Oranges\", \"North\", 13.10]]\n", 1002 | "\n", 1003 | "sales = pd.DataFrame(data=data,\n", 1004 | " columns=[\"Fruit\", \"Region\", \"Revenue\"])\n", 1005 | "sales" 1006 | ] 1007 | }, 1008 | { 1009 | "cell_type": "code", 1010 | "execution_count": null, 1011 | "metadata": {}, 1012 | "outputs": [], 1013 | "source": [ 1014 | "pivot = pd.pivot_table(sales,\n", 1015 | " index=\"Fruit\", columns=\"Region\",\n", 1016 | " values=\"Revenue\", aggfunc=\"sum\",\n", 1017 | " margins=True, margins_name=\"Total\")\n", 1018 | "pivot" 1019 | ] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "execution_count": null, 1024 | "metadata": {}, 1025 | "outputs": [], 1026 | "source": [ 1027 | "pd.melt(pivot.iloc[:-1,:-1].reset_index(),\n", 1028 | " id_vars=\"Fruit\",\n", 1029 | " value_vars=[\"North\", \"South\"], value_name=\"Revenue\")" 1030 | ] 1031 | }, 1032 | { 1033 | "cell_type": "markdown", 1034 | "metadata": {}, 1035 | "source": [ 1036 | "# Plotting\n", 1037 | "## Matplotlib" 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "code", 1042 | "execution_count": null, 1043 | "metadata": {}, 1044 | "outputs": [], 1045 | "source": [ 1046 | "import numpy as np\n", 1047 | "%matplotlib inline\n", 1048 | "# Or %matplotlib notebook" 1049 | ] 1050 | }, 1051 | { 1052 | "cell_type": "code", 1053 | "execution_count": null, 1054 | "metadata": {}, 1055 | "outputs": [], 1056 | "source": [ 1057 | "data = pd.DataFrame(data=np.random.rand(4, 4) * 100000,\n", 1058 | " index=[\"Q1\", \"Q2\", \"Q3\", \"Q4\"],\n", 1059 | " columns=[\"East\", \"West\", \"North\", \"South\"])\n", 1060 | "data.index.name = \"Quarters\"\n", 1061 | "data.columns.name = \"Region\"\n", 1062 | "data" 1063 | ] 1064 | }, 1065 | { 1066 
| "cell_type": "code", 1067 | "execution_count": null, 1068 | "metadata": {}, 1069 | "outputs": [], 1070 | "source": [ 1071 | "data.plot() # Shortcut for data.plot.line()" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "markdown", 1076 | "metadata": {}, 1077 | "source": [ 1078 | "## Plotly" 1079 | ] 1080 | }, 1081 | { 1082 | "cell_type": "code", 1083 | "execution_count": null, 1084 | "metadata": {}, 1085 | "outputs": [], 1086 | "source": [ 1087 | "# Set the plotting backend to Plotly\n", 1088 | "pd.options.plotting.backend = \"plotly\"" 1089 | ] 1090 | }, 1091 | { 1092 | "cell_type": "code", 1093 | "execution_count": null, 1094 | "metadata": {}, 1095 | "outputs": [], 1096 | "source": [ 1097 | "data.plot()" 1098 | ] 1099 | }, 1100 | { 1101 | "cell_type": "code", 1102 | "execution_count": null, 1103 | "metadata": {}, 1104 | "outputs": [], 1105 | "source": [ 1106 | "# Display the same data as bar plot\n", 1107 | "data.plot.bar(barmode=\"group\")" 1108 | ] 1109 | }, 1110 | { 1111 | "cell_type": "markdown", 1112 | "metadata": {}, 1113 | "source": [ 1114 | "# Data Import and Export\n", 1115 | "## Exporting to a CSV file" 1116 | ] 1117 | }, 1118 | { 1119 | "cell_type": "code", 1120 | "execution_count": null, 1121 | "metadata": {}, 1122 | "outputs": [], 1123 | "source": [ 1124 | "df.to_csv(\"course_participants.csv\")" 1125 | ] 1126 | }, 1127 | { 1128 | "cell_type": "markdown", 1129 | "metadata": {}, 1130 | "source": [ 1131 | "## Importing a CSV file" 1132 | ] 1133 | }, 1134 | { 1135 | "cell_type": "code", 1136 | "execution_count": null, 1137 | "metadata": {}, 1138 | "outputs": [], 1139 | "source": [ 1140 | "msft = pd.read_csv(\"csv/MSFT.csv\")" 1141 | ] 1142 | }, 1143 | { 1144 | "cell_type": "code", 1145 | "execution_count": null, 1146 | "metadata": {}, 1147 | "outputs": [], 1148 | "source": [ 1149 | "msft.info()" 1150 | ] 1151 | }, 1152 | { 1153 | "cell_type": "code", 1154 | "execution_count": null, 1155 | "metadata": {}, 1156 | "outputs": [], 1157 | "source": [ 1158 | 
"# I am selecting a few columns because of space issues\n", 1159 | "# You can also just run: msft.head()\n", 1160 | "msft.loc[:, [\"Date\", \"Adj Close\", \"Volume\"]].head()" 1161 | ] 1162 | }, 1163 | { 1164 | "cell_type": "code", 1165 | "execution_count": null, 1166 | "metadata": {}, 1167 | "outputs": [], 1168 | "source": [ 1169 | "msft.loc[:, [\"Date\", \"Adj Close\", \"Volume\"]].tail(2)" 1170 | ] 1171 | }, 1172 | { 1173 | "cell_type": "code", 1174 | "execution_count": null, 1175 | "metadata": {}, 1176 | "outputs": [], 1177 | "source": [ 1178 | "msft.loc[:, [\"Adj Close\", \"Volume\"]].describe()" 1179 | ] 1180 | }, 1181 | { 1182 | "cell_type": "code", 1183 | "execution_count": null, 1184 | "metadata": {}, 1185 | "outputs": [], 1186 | "source": [ 1187 | "# The line break in the URL is only to make it fit on the page\n", 1188 | "url = (\"https://raw.githubusercontent.com/fzumstein/\"\n", 1189 | " \"python-for-excel/1st-edition/csv/MSFT.csv\")\n", 1190 | "msft = pd.read_csv(url)" 1191 | ] 1192 | }, 1193 | { 1194 | "cell_type": "code", 1195 | "execution_count": null, 1196 | "metadata": {}, 1197 | "outputs": [], 1198 | "source": [ 1199 | "msft.loc[:, [\"Date\", \"Adj Close\", \"Volume\"]].head(2)" 1200 | ] 1201 | } 1202 | ], 1203 | "metadata": { 1204 | "kernelspec": { 1205 | "display_name": "Python 3", 1206 | "language": "python", 1207 | "name": "python3" 1208 | }, 1209 | "language_info": { 1210 | "codemirror_mode": { 1211 | "name": "ipython", 1212 | "version": 3 1213 | }, 1214 | "file_extension": ".py", 1215 | "mimetype": "text/x-python", 1216 | "name": "python", 1217 | "nbconvert_exporter": "python", 1218 | "pygments_lexer": "ipython3", 1219 | "version": "3.7.4" 1220 | } 1221 | }, 1222 | "nbformat": 4, 1223 | "nbformat_minor": 4 1224 | } 1225 | -------------------------------------------------------------------------------- /ch06.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Time Series\n", 8 | "## DatetimeIndex" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "# Let's start by importing the packages we use in this chapter\n", 18 | "# and by setting the plotting backend to Plotly\n", 19 | "import pandas as pd\n", 20 | "import numpy as np\n", 21 | "pd.options.plotting.backend = \"plotly\"" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# This creates a DatetimeIndex based on a start timestamp,\n", 31 | "# number of periods and frequency (\"D\" = daily).\n", 32 | "daily_index = pd.date_range(\"2020-02-28\", periods=4, freq=\"D\")\n", 33 | "daily_index" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# This creates a DatetimeIndex based on start/end timestamp.\n", 43 | "# The frequency is set to \"weekly on Sundays\" (\"W-SUN\").\n", 44 | "weekly_index = pd.date_range(\"2020-01-01\", \"2020-01-31\", freq=\"W-SUN\")\n", 45 | "weekly_index" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "# Construct a DataFrame based on the weekly_index. 
This could be\n", 55 | "# the visitor count of a museum that only opens on Sundays.\n", 56 | "pd.DataFrame(data=[21, 15, 33, 34],\n", 57 | " columns=[\"visitors\"], index=weekly_index)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "msft = pd.read_csv(\"csv/MSFT.csv\")" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "msft.info()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "msft.loc[:, \"Date\"] = pd.to_datetime(msft[\"Date\"])" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "msft.dtypes" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "msft = pd.read_csv(\"csv/MSFT.csv\",\n", 103 | " index_col=\"Date\", parse_dates=[\"Date\"])" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "msft.info()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "msft.loc[:, \"Volume\"] = msft[\"Volume\"].astype(\"float\")\n", 122 | "msft[\"Volume\"].dtype" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "msft = msft.sort_index()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "msft.index.date" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 
| "source": [ 149 | "msft.loc[\"2019\", \"Adj Close\"]" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "msft.loc[\"2019-06\":\"2020-05\", \"Adj Close\"].plot()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## Working with Time Zones" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "# Add the time information to the date\n", 175 | "msft_close = msft.loc[:, [\"Adj Close\"]].copy()\n", 176 | "msft_close.index = msft_close.index + pd.DateOffset(hours=16)\n", 177 | "msft_close.head(2)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "# Make the timestamps time-zone-aware\n", 187 | "msft_close = msft_close.tz_localize(\"America/New_York\")\n", 188 | "msft_close.head(2)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "msft_close = msft_close.tz_convert(\"UTC\")\n", 198 | "msft_close.loc[\"2020-01-02\", \"Adj Close\"] # 21:00 without DST" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "msft_close.loc[\"2020-05-01\", \"Adj Close\"] # 20:00 with DST" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## Shifting and Percentage Changes" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "msft_close.head()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | 
"msft_close.shift(1).head()" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "returns = np.log(msft_close / msft_close.shift(1))\n", 242 | "returns = returns.rename(columns={\"Adj Close\": \"returns\"})\n", 243 | "returns.head()" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "# Plot a histogram with the daily log returns\n", 253 | "returns.plot.hist()" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "simple_rets = msft_close.pct_change()\n", 263 | "simple_rets = simple_rets.rename(columns={\"Adj Close\": \"simple rets\"})\n", 264 | "simple_rets.head()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## Rebasing and Correlation" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": null, 277 | "metadata": {}, 278 | "outputs": [], 279 | "source": [ 280 | "parts = [] # List to collect individual DataFrames\n", 281 | "for ticker in [\"AAPL\", \"AMZN\", \"GOOGL\", \"MSFT\"]:\n", 282 | " # \"usecols\" allows us to only read in the Date and Adj Close\n", 283 | " # For a refresher about f-strings, see Chapter 3\n", 284 | " adj_close = pd.read_csv(f\"csv/{ticker}.csv\",\n", 285 | " index_col=\"Date\", parse_dates=[\"Date\"],\n", 286 | " usecols=[\"Date\", \"Adj Close\"])\n", 287 | " # Rename the column into the ticker symbol\n", 288 | " # (If you type this example by hand, make sure to keep the\n", 289 | " # following lines correctly indented!)\n", 290 | " adj_close = adj_close.rename(columns={\"Adj Close\": ticker})\n", 291 | " # Append the stock's DataFrame to the parts list\n", 292 | " parts.append(adj_close)" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 
298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "# Combine the 4 DataFrames into a single DataFrame\n", 302 | "adj_close = pd.concat(parts, axis=1)\n", 303 | "adj_close" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "adj_close = adj_close.dropna()\n", 313 | "adj_close.info()" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "# Use a sample from June 2019 - May 2020\n", 323 | "adj_close_sample = adj_close.loc[\"2019-06\":\"2020-05\", :]\n", 324 | "rebased_prices = adj_close_sample / adj_close_sample.iloc[0, :] * 100\n", 325 | "rebased_prices.head(2)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "rebased_prices.plot()" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "# Correlation of daily log returns\n", 344 | "returns = np.log(adj_close / adj_close.shift(1))\n", 345 | "returns.corr()" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "import plotly.express as px" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "fig = px.imshow(returns.corr(),\n", 364 | " x=adj_close.columns,\n", 365 | " y=adj_close.columns,\n", 366 | " color_continuous_scale=list(\n", 367 | " reversed(px.colors.sequential.RdBu)),\n", 368 | " zmin=-1, zmax=1)\n", 369 | "fig.show()" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": {}, 375 | "source": [ 376 | "## Resampling" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 
382 | "metadata": {}, 383 | "outputs": [], 384 | "source": [ 385 | "end_of_month = adj_close.resample(\"M\").last()\n", 386 | "end_of_month.head()" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "end_of_month.resample(\"D\").asfreq().head() # No transformation" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "end_of_month.resample(\"W-FRI\").ffill().head() # Forward fill" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "## Rolling Windows" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "# Plot the moving average for MSFT with data from 2019\n", 421 | "msft19 = msft.loc[\"2019\", [\"Adj Close\"]].copy()\n", 422 | "\n", 423 | "# Add the 25 day moving average as a new column to the DataFrame\n", 424 | "msft19.loc[:, \"25day average\"] = msft19[\"Adj Close\"].rolling(25).mean()\n", 425 | "msft19.plot()" 426 | ] 427 | } 428 | ], 429 | "metadata": { 430 | "kernelspec": { 431 | "display_name": "Python 3", 432 | "language": "python", 433 | "name": "python3" 434 | }, 435 | "language_info": { 436 | "codemirror_mode": { 437 | "name": "ipython", 438 | "version": 3 439 | }, 440 | "file_extension": ".py", 441 | "mimetype": "text/x-python", 442 | "name": "python", 443 | "nbconvert_exporter": "python", 444 | "pygments_lexer": "ipython3", 445 | "version": "3.7.4" 446 | } 447 | }, 448 | "nbformat": 4, 449 | "nbformat_minor": 4 450 | } -------------------------------------------------------------------------------- /ch07.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Excel File Manipulation 
with pandas" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Using pandas with Excel Files" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Case Study: Excel Reporting" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "import pandas as pd" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "df = pd.read_excel(\"sales_data/new/January.xlsx\")\n", 40 | "df.info()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## Reading Excel Files with pandas" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "df = pd.read_excel(\"xl/stores.xlsx\",\n", 57 | " sheet_name=\"2019\", skiprows=1, usecols=\"B:F\")\n", 58 | "df" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "df.info()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "def fix_missing(x):\n", 77 | " return False if x in [\"\", \"MISSING\"] else x" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "df = pd.read_excel(\"xl/stores.xlsx\",\n", 87 | " sheet_name=\"2019\", skiprows=1, usecols=\"B:F\",\n", 88 | " converters={\"Flagship\": fix_missing})\n", 89 | "df" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# The Flagship column now has Dtype \"bool\"\n", 99 | "df.info()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | 
"execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "sheets = pd.read_excel(\"xl/stores.xlsx\", sheet_name=[\"2019\", \"2020\"],\n", 109 | " skiprows=1, usecols=[\"Store\", \"Employees\"])\n", 110 | "sheets[\"2019\"].head(2)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "df = pd.read_excel(\"xl/stores.xlsx\", sheet_name=0,\n", 120 | " skiprows=2, skipfooter=3,\n", 121 | " usecols=\"B:C,F\", header=None,\n", 122 | " names=[\"Branch\", \"Employee_Count\", \"Is_Flagship\"])\n", 123 | "df" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "df = pd.read_excel(\"xl/stores.xlsx\", sheet_name=\"2019\",\n", 133 | " skiprows=1, usecols=\"B,C,F\", skipfooter=2,\n", 134 | " na_values=\"MISSING\", keep_default_na=False)\n", 135 | "df" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "f = open(\"output.txt\", \"w\")\n", 145 | "f.write(\"Some text\")\n", 146 | "f.close()" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "### Context Managers and the with Statement" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "with open(\"output.txt\", \"w\") as f:\n", 163 | " f.write(\"Some text\")" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "with pd.ExcelFile(\"xl/stores.xls\") as f:\n", 173 | " df1 = pd.read_excel(f, \"2019\", skiprows=1, usecols=\"B:F\", nrows=2)\n", 174 | " df2 = pd.read_excel(f, \"2020\", skiprows=1, usecols=\"B:F\", nrows=2)\n", 175 | "\n", 176 | "df1" 177 | ] 178 | }, 179 
| { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "stores = pd.ExcelFile(\"xl/stores.xlsx\")\n", 186 | "stores.sheet_names" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "url = (\"https://raw.githubusercontent.com/fzumstein/\"\n", 196 | " \"python-for-excel/1st-edition/xl/stores.xlsx\")\n", 197 | "pd.read_excel(url, skiprows=1, usecols=\"B:E\", nrows=2)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## Writing Excel Files with pandas" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "import numpy as np\n", 214 | "import datetime as dt" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "data = [[dt.datetime(2020,1,1, 10, 13), 2.222, 1, True],\n", 224 | " [dt.datetime(2020,1,2), np.nan, 2, False],\n", 225 | " [dt.datetime(2020,1,2), np.inf, 3, True]]\n", 226 | "df = pd.DataFrame(data=data,\n", 227 | " columns=[\"Dates\", \"Floats\", \"Integers\", \"Booleans\"])\n", 228 | "df.index.name=\"index\"\n", 229 | "df" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "df.to_excel(\"written_with_pandas.xlsx\", sheet_name=\"Output\",\n", 239 | " startrow=1, startcol=1, index=True, header=True,\n", 240 | " na_rep=\"\", inf_rep=\"\")" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "with pd.ExcelWriter(\"written_with_pandas2.xlsx\") as writer:\n", 250 | " df.to_excel(writer, sheet_name=\"Sheet1\", startrow=1, startcol=1)\n", 251 | " 
df.to_excel(writer, sheet_name=\"Sheet1\", startrow=10, startcol=1)\n", 252 | " df.to_excel(writer, sheet_name=\"Sheet2\")" 253 | ] 254 | } 255 | ], 256 | "metadata": { 257 | "kernelspec": { 258 | "display_name": "Python 3", 259 | "language": "python", 260 | "name": "python3" 261 | }, 262 | "language_info": { 263 | "codemirror_mode": { 264 | "name": "ipython", 265 | "version": 3 266 | }, 267 | "file_extension": ".py", 268 | "mimetype": "text/x-python", 269 | "name": "python", 270 | "nbconvert_exporter": "python", 271 | "pygments_lexer": "ipython3", 272 | "version": "3.7.4" 273 | } 274 | }, 275 | "nbformat": 4, 276 | "nbformat_minor": 4 277 | } 278 | -------------------------------------------------------------------------------- /ch08.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Reader and Writer Packages" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## OpenPyXL\n", 15 | "### Reading with OpenPyXL" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import pandas as pd\n", 25 | "import openpyxl\n", 26 | "import excel\n", 27 | "import datetime as dt" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# Open the workbook to read cell values.\n", 37 | "# The file is automatically closed again after loading the data.\n", 38 | "book = openpyxl.load_workbook(\"xl/stores.xlsx\", data_only=True)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "# Get a worksheet object by name or index (0-based)\n", 48 | "sheet = book[\"2019\"]\n", 49 | "sheet = book.worksheets[0]" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | 
"execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# Get a list with all sheet names\n", 59 | "book.sheetnames" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# Loop through the sheet objects.\n", 69 | "# Instead of \"name\", openpyxl uses \"title\".\n", 70 | "for i in book.worksheets:\n", 71 | " print(i.title)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# Getting the dimensions,\n", 81 | "# i.e., the used range of the sheet\n", 82 | "sheet.max_row, sheet.max_column" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# Read the value of a single cell\n", 92 | "# using \"A1\" notation and using cell indices (1-based)\n", 93 | "sheet[\"B6\"].value\n", 94 | "sheet.cell(row=6, column=2).value" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "# Read in a range of cell values by using our excel module\n", 104 | "data = excel.read(book[\"2019\"], (2, 2), (8, 6))\n", 105 | "data[:2] # Print the first two rows" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "### Writing with OpenPyXL" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "import openpyxl\n", 122 | "from openpyxl.drawing.image import Image\n", 123 | "from openpyxl.chart import BarChart, Reference\n", 124 | "from openpyxl.styles import Font, colors\n", 125 | "from openpyxl.styles.borders import Border, Side\n", 126 | "from openpyxl.styles.alignment import Alignment\n", 127 | "from openpyxl.styles.fills import PatternFill\n", 128 | "import excel" 129 
| ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "# Instantiate a workbook\n", 138 | "book = openpyxl.Workbook()\n", 139 | "\n", 140 | "# Get the first sheet and give it a name\n", 141 | "sheet = book.active\n", 142 | "sheet.title = \"Sheet1\"\n", 143 | "\n", 144 | "# Writing individual cells using A1 notation\n", 145 | "# and cell indices (1-based)\n", 146 | "sheet[\"A1\"].value = \"Hello 1\"\n", 147 | "sheet.cell(row=2, column=1, value=\"Hello 2\")\n", 148 | "\n", 149 | "# Formatting: fill color, alignment, border and font\n", 150 | "font_format = Font(color=\"FF0000\", bold=True)\n", 151 | "thin = Side(border_style=\"thin\", color=\"FF0000\")\n", 152 | "sheet[\"A3\"].value = \"Hello 3\"\n", 153 | "sheet[\"A3\"].font = font_format\n", 154 | "sheet[\"A3\"].border = Border(top=thin, left=thin,\n", 155 | " right=thin, bottom=thin)\n", 156 | "sheet[\"A3\"].alignment = Alignment(horizontal=\"center\")\n", 157 | "sheet[\"A3\"].fill = PatternFill(fgColor=\"FFFF00\", fill_type=\"solid\")\n", 158 | "\n", 159 | "# Number formatting (using Excel's formatting strings)\n", 160 | "sheet[\"A4\"].value = 3.3333\n", 161 | "sheet[\"A4\"].number_format = \"0.00\"\n", 162 | "\n", 163 | "# Date formatting (using Excel's formatting strings)\n", 164 | "sheet[\"A5\"].value = dt.date(2016, 10, 13)\n", 165 | "sheet[\"A5\"].number_format = \"mm/dd/yy\"\n", 166 | "\n", 167 | "# Formula: you must use the English name of the formula\n", 168 | "# with commas as delimiters\n", 169 | "sheet[\"A6\"].value = \"=SUM(A4, 2)\"\n", 170 | "\n", 171 | "# Image\n", 172 | "sheet.add_image(Image(\"images/python.png\"), \"C1\")\n", 173 | "\n", 174 | "# Two-dimensional list (we're using our excel module)\n", 175 | "data = [[None, \"North\", \"South\"],\n", 176 | " [\"Last Year\", 2, 5],\n", 177 | " [\"This Year\", 3, 6]]\n", 178 | "excel.write(sheet, data, \"A10\")\n", 179 | "\n", 180 | "# Chart\n", 181 | "chart 
= BarChart()\n", 182 | "chart.type = \"col\"\n", 183 | "chart.title = \"Sales Per Region\"\n", 184 | "chart.x_axis.title = \"Regions\"\n", 185 | "chart.y_axis.title = \"Sales\"\n", 186 | "chart_data = Reference(sheet, min_row=11, min_col=1,\n", 187 | " max_row=12, max_col=3)\n", 188 | "chart_categories = Reference(sheet, min_row=10, min_col=2,\n", 189 | " max_row=10, max_col=3)\n", 190 | "# from_rows interprets the data in the same way\n", 191 | "# as if you would add a chart manually in Excel\n", 192 | "chart.add_data(chart_data, titles_from_data=True, from_rows=True)\n", 193 | "chart.set_categories(chart_categories)\n", 194 | "sheet.add_chart(chart, \"A15\")\n", 195 | "\n", 196 | "# Saving the workbook creates the file on disk\n", 197 | "book.save(\"openpyxl.xlsx\")" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "book = openpyxl.Workbook()\n", 207 | "sheet = book.active\n", 208 | "sheet[\"A1\"].value = \"This is a template\"\n", 209 | "book.template = True\n", 210 | "book.save(\"template.xltx\")" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "### Editing with OpenPyXL" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "# Read the stores.xlsx file, change a cell\n", 227 | "# and store it under a new location/name.\n", 228 | "book = openpyxl.load_workbook(\"xl/stores.xlsx\")\n", 229 | "book[\"2019\"][\"A1\"].value = \"modified\"\n", 230 | "book.save(\"stores_edited.xlsx\")" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "book = openpyxl.load_workbook(\"xl/macro.xlsm\", keep_vba=True)\n", 240 | "book[\"Sheet1\"][\"A1\"].value = \"Click the button!\"\n", 241 | "book.save(\"macro_openpyxl.xlsm\")" 242 | ] 243 | 
}, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "## XlsxWriter" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "import datetime as dt\n", 258 | "import xlsxwriter\n", 259 | "import excel" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "# Instantiate a workbook\n", 269 | "book = xlsxwriter.Workbook(\"xlsxwriter.xlsx\")\n", 270 | "\n", 271 | "# Add a sheet and give it a name\n", 272 | "sheet = book.add_worksheet(\"Sheet1\")\n", 273 | "\n", 274 | "# Writing individual cells using A1 notation\n", 275 | "# and cell indices (0-based)\n", 276 | "sheet.write(\"A1\", \"Hello 1\")\n", 277 | "sheet.write(1, 0, \"Hello 2\")\n", 278 | "\n", 279 | "# Formatting: fill color, alignment, border and font\n", 280 | "formatting = book.add_format({\"font_color\": \"#FF0000\",\n", 281 | " \"bg_color\": \"#FFFF00\",\n", 282 | " \"bold\": True, \"align\": \"center\",\n", 283 | " \"border\": 1, \"border_color\": \"#FF0000\"})\n", 284 | "sheet.write(\"A3\", \"Hello 3\", formatting)\n", 285 | "\n", 286 | "# Number formatting (using Excel's formatting strings)\n", 287 | "number_format = book.add_format({\"num_format\": \"0.00\"})\n", 288 | "sheet.write(\"A4\", 3.3333, number_format)\n", 289 | "\n", 290 | "# Date formatting (using Excel's formatting strings)\n", 291 | "date_format = book.add_format({\"num_format\": \"mm/dd/yy\"})\n", 292 | "sheet.write(\"A5\", dt.date(2016, 10, 13), date_format)\n", 293 | "\n", 294 | "# Formula: you must use the English name of the formula\n", 295 | "# with commas as delimiters\n", 296 | "sheet.write(\"A6\", \"=SUM(A4, 2)\")\n", 297 | "\n", 298 | "# Image\n", 299 | "sheet.insert_image(0, 2, \"images/python.png\")\n", 300 | "\n", 301 | "# Two-dimensional list (we're using our excel module)\n", 302 | "data = [[None, 
\"North\", \"South\"],\n", 303 | " [\"Last Year\", 2, 5],\n", 304 | " [\"This Year\", 3, 6]]\n", 305 | "excel.write(sheet, data, \"A10\")\n", 306 | "\n", 307 | "# Chart: see the file \"sales_report_xlsxwriter.py\" in the\n", 308 | "# companion repo to see how you can work with indices\n", 309 | "# instead of cell addresses\n", 310 | "chart = book.add_chart({\"type\": \"column\"})\n", 311 | "chart.set_title({\"name\": \"Sales per Region\"})\n", 312 | "chart.add_series({\"name\": \"=Sheet1!A11\",\n", 313 | " \"categories\": \"=Sheet1!B10:C10\",\n", 314 | " \"values\": \"=Sheet1!B11:C11\"})\n", 315 | "chart.add_series({\"name\": \"=Sheet1!A12\",\n", 316 | " \"categories\": \"=Sheet1!B10:C10\",\n", 317 | " \"values\": \"=Sheet1!B12:C12\"})\n", 318 | "chart.set_x_axis({\"name\": \"Regions\"})\n", 319 | "chart.set_y_axis({\"name\": \"Sales\"})\n", 320 | "sheet.insert_chart(\"A15\", chart)\n", 321 | "\n", 322 | "# Closing the workbook creates the file on disk\n", 323 | "book.close()" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "book = xlsxwriter.Workbook(\"macro_xlsxwriter.xlsm\")\n", 333 | "sheet = book.add_worksheet(\"Sheet1\")\n", 334 | "sheet.write(\"A1\", \"Click the button!\")\n", 335 | "book.add_vba_project(\"xl/vbaProject.bin\")\n", 336 | "sheet.insert_button(\"A3\", {\"macro\": \"Hello\", \"caption\": \"Button 1\",\n", 337 | " \"width\": 130, \"height\": 35})\n", 338 | "book.close()" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "## pyxlsb" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "import pyxlsb\n", 355 | "import excel" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "# Loop through sheets. 
With pyxlsb, the workbook\n", 365 | "# and sheet objects can be used as context managers.\n", 366 | "# book.sheets returns a list of sheet names, not objects!\n", 367 | "# To get a sheet object, use get_sheet() instead.\n", 368 | "with pyxlsb.open_workbook(\"xl/stores.xlsb\") as book:\n", 369 | " for sheet_name in book.sheets:\n", 370 | " with book.get_sheet(sheet_name) as sheet:\n", 371 | " dim = sheet.dimension\n", 372 | " print(f\"Sheet '{sheet_name}' has \"\n", 373 | " f\"{dim.h} rows and {dim.w} cols\")" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": {}, 380 | "outputs": [], 381 | "source": [ 382 | "# Read in the values of a range of cells by using our excel module.\n", 383 | "# Instead of \"2019\", you could also use its index (1-based).\n", 384 | "with pyxlsb.open_workbook(\"xl/stores.xlsb\") as book:\n", 385 | " with book.get_sheet(\"2019\") as sheet:\n", 386 | " data = excel.read(sheet, \"B2\")\n", 387 | "data[:2] # Print the first two rows" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "metadata": {}, 394 | "outputs": [], 395 | "source": [ 396 | "from pyxlsb import convert_date\n", 397 | "convert_date(data[1][3])" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": null, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "df = pd.read_excel(\"xl/stores.xlsb\", engine=\"pyxlsb\")" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": {}, 412 | "source": [ 413 | "## xlrd, xlwt and xlutils" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "### Reading with xlrd" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "import xlrd\n", 430 | "import xlwt\n", 431 | "from xlwt.Utils import cell_to_rowcol2\n", 432 | "import xlutils\n", 433 | "import excel" 434 | ] 
435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "# Open the workbook to read cell values. The file is\n", 443 | "# automatically closed again after loading the data.\n", 444 | "book = xlrd.open_workbook(\"xl/stores.xls\")" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": {}, 451 | "outputs": [], 452 | "source": [ 453 | "# Get a list with all sheet names\n", 454 | "book.sheet_names()" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "# Loop through the sheet objects\n", 464 | "for sheet in book.sheets():\n", 465 | " print(sheet.name)" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": null, 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "# Get a sheet object by name or index (0-based)\n", 475 | "sheet = book.sheet_by_index(0)\n", 476 | "sheet = book.sheet_by_name(\"2019\")" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "# Dimensions\n", 486 | "sheet.nrows, sheet.ncols" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": null, 492 | "metadata": {}, 493 | "outputs": [], 494 | "source": [ 495 | "# Read the value of a single cell\n", 496 | "# using \"A1\" notation and using cell indices (0-based).\n", 497 | "# The \"*\" unpacks the tuple that cell_to_rowcol2 returns\n", 498 | "# into individual arguments.\n", 499 | "sheet.cell(*cell_to_rowcol2(\"B3\")).value\n", 500 | "sheet.cell(2, 1).value" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "# Read in a range of cell values by using our excel module\n", 510 | "data = excel.read(sheet, \"B2\")\n", 511 | 
"data[:2] # Print the first two rows" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "### Writing with xlwt" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "import xlwt\n", 528 | "from xlwt.Utils import cell_to_rowcol2\n", 529 | "import datetime as dt\n", 530 | "import excel" 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "# Instantiate a workbook\n", 540 | "book = xlwt.Workbook()\n", 541 | "\n", 542 | "# Add a sheet and give it a name\n", 543 | "sheet = book.add_sheet(\"Sheet1\")\n", 544 | "\n", 545 | "# Writing individual cells using A1 notation\n", 546 | "# and cell indices (0-based)\n", 547 | "sheet.write(*cell_to_rowcol2(\"A1\"), \"Hello 1\")\n", 548 | "sheet.write(r=1, c=0, label=\"Hello 2\")\n", 549 | "\n", 550 | "# Formatting: fill color, alignment, border and font\n", 551 | "formatting = xlwt.easyxf(\"font: bold on, color red;\"\n", 552 | " \"align: horiz center;\"\n", 553 | " \"borders: top_color red, bottom_color red,\"\n", 554 | " \"right_color red, left_color red,\"\n", 555 | " \"left thin, right thin,\"\n", 556 | " \"top thin, bottom thin;\"\n", 557 | " \"pattern: pattern solid, fore_color yellow;\")\n", 558 | "sheet.write(r=2, c=0, label=\"Hello 3\", style=formatting)\n", 559 | "\n", 560 | "# Number formatting (using Excel's formatting strings)\n", 561 | "number_format = xlwt.easyxf(num_format_str=\"0.00\")\n", 562 | "sheet.write(3, 0, 3.3333, number_format)\n", 563 | "\n", 564 | "# Date formatting (using Excel's formatting strings)\n", 565 | "date_format = xlwt.easyxf(num_format_str=\"mm/dd/yyyy\")\n", 566 | "sheet.write(4, 0, dt.datetime(2012, 2, 3), date_format)\n", 567 | "\n", 568 | "# Formula: you must use the English name of the formula\n", 569 | "# with commas as delimiters\n", 570 | 
"sheet.write(5, 0, xlwt.Formula(\"SUM(A4, 2)\"))\n", 571 | "\n", 572 | "# Two-dimensional list (we're using our excel module)\n", 573 | "data = [[None, \"North\", \"South\"],\n", 574 | " [\"Last Year\", 2, 5],\n", 575 | " [\"This Year\", 3, 6]]\n", 576 | "excel.write(sheet, data, \"A10\")\n", 577 | "\n", 578 | "# Picture (only allows to add bmp format)\n", 579 | "sheet.insert_bitmap(\"images/python.bmp\", 0, 2)\n", 580 | "\n", 581 | "# This writes the file to disk\n", 582 | "book.save(\"xlwt.xls\")" 583 | ] 584 | }, 585 | { 586 | "cell_type": "markdown", 587 | "metadata": {}, 588 | "source": [ 589 | "### Editing with xlutils" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "import xlutils.copy" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": {}, 605 | "outputs": [], 606 | "source": [ 607 | "book = xlrd.open_workbook(\"xl/stores.xls\", formatting_info=True)\n", 608 | "book = xlutils.copy.copy(book)\n", 609 | "book.get_sheet(0).write(0, 0, \"changed!\")\n", 610 | "book.save(\"stores_edited.xls\")" 611 | ] 612 | }, 613 | { 614 | "cell_type": "markdown", 615 | "metadata": {}, 616 | "source": [ 617 | "# Advanced Topics\n", 618 | "## Working with Big Files" 619 | ] 620 | }, 621 | { 622 | "cell_type": "markdown", 623 | "metadata": {}, 624 | "source": [ 625 | "### Writing with OpenPyXL" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [ 634 | "book = openpyxl.Workbook(write_only=True)\n", 635 | "# With write_only=True, book.active doesn't work\n", 636 | "sheet = book.create_sheet()\n", 637 | "# This will produce a sheet with 1000 x 200 cells\n", 638 | "for row in range(1000):\n", 639 | " sheet.append(list(range(200)))\n", 640 | "book.save(\"openpyxl_optimized.xlsx\")" 641 | ] 642 | }, 643 | { 644 | "cell_type": "markdown", 645 | 
"metadata": {}, 646 | "source": [ 647 | "### Writing with XlsxWriter" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "book = xlsxwriter.Workbook(\"xlsxwriter_optimized.xlsx\",\n", 657 | " options={\"constant_memory\": True})\n", 658 | "sheet = book.add_worksheet()\n", 659 | "# This will produce a sheet with 1000 x 200 cells\n", 660 | "for row in range(1000):\n", 661 | " sheet.write_row(row , 0, list(range(200)))\n", 662 | "book.close()" 663 | ] 664 | }, 665 | { 666 | "cell_type": "markdown", 667 | "metadata": {}, 668 | "source": [ 669 | "### Reading with xlrd" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "with xlrd.open_workbook(\"xl/stores.xls\", on_demand=True) as book:\n", 679 | " sheet = book.sheet_by_index(0) # Only loads the first sheet" 680 | ] 681 | }, 682 | { 683 | "cell_type": "code", 684 | "execution_count": null, 685 | "metadata": {}, 686 | "outputs": [], 687 | "source": [ 688 | "with xlrd.open_workbook(\"xl/stores.xls\", on_demand=True) as book:\n", 689 | " with pd.ExcelFile(book, engine=\"xlrd\") as f:\n", 690 | " df = pd.read_excel(f, sheet_name=0)" 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": {}, 696 | "source": [ 697 | "### Reading with OpenPyXL" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": {}, 704 | "outputs": [], 705 | "source": [ 706 | "book = openpyxl.load_workbook(\"xl/big.xlsx\",\n", 707 | " data_only=True, read_only=True,\n", 708 | " keep_links=False)\n", 709 | "# Perform the desired read operations here\n", 710 | "book.close() # Required with read_only=True" 711 | ] 712 | }, 713 | { 714 | "cell_type": "markdown", 715 | "metadata": {}, 716 | "source": [ 717 | "### Reading in Parallel" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | 
"execution_count": null, 723 | "metadata": {}, 724 | "outputs": [], 725 | "source": [ 726 | "%%time\n", 727 | "data = pd.read_excel(\"xl/big.xlsx\",\n", 728 | " sheet_name=None, engine=\"openpyxl\")" 729 | ] 730 | }, 731 | { 732 | "cell_type": "code", 733 | "execution_count": null, 734 | "metadata": {}, 735 | "outputs": [], 736 | "source": [ 737 | "%%time\n", 738 | "import parallel_pandas\n", 739 | "data = parallel_pandas.read_excel(\"xl/big.xlsx\", sheet_name=None)" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "## Formatting DataFrames in Excel" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": null, 752 | "metadata": {}, 753 | "outputs": [], 754 | "source": [ 755 | "with pd.ExcelFile(\"xl/stores.xlsx\", engine=\"openpyxl\") as xlfile:\n", 756 | " # Read a DataFrame\n", 757 | " df = pd.read_excel(xlfile, sheet_name=\"2020\")\n", 758 | "\n", 759 | " # Get the OpenPyXL workbook object\n", 760 | " book = xlfile.book\n", 761 | "\n", 762 | " # From here on, it's OpenPyXL code\n", 763 | " sheet = book[\"2019\"]\n", 764 | " value = sheet[\"B3\"].value # Read a single value" 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "execution_count": null, 770 | "metadata": {}, 771 | "outputs": [], 772 | "source": [ 773 | "with pd.ExcelWriter(\"pandas_and_openpyxl.xlsx\",\n", 774 | " engine=\"openpyxl\") as writer:\n", 775 | " df = pd.DataFrame({\"col1\": [1, 2, 3, 4], \"col2\": [5, 6, 7, 8]})\n", 776 | " # Write a DataFrame\n", 777 | " df.to_excel(writer, \"Sheet1\", startrow=4, startcol=2)\n", 778 | "\n", 779 | " # Get the OpenPyXL workbook and sheet objects\n", 780 | " book = writer.book\n", 781 | " sheet = writer.sheets[\"Sheet1\"]\n", 782 | "\n", 783 | " # From here on, it's OpenPyXL code\n", 784 | " sheet[\"A1\"].value = \"This is a Title\" # Write a single cell value" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": null, 790 | "metadata": {}, 791 | 
"outputs": [], 792 | "source": [ 793 | "df = pd.DataFrame({\"col1\": [1, -2], \"col2\": [-3, 4]},\n", 794 | " index=[\"row1\", \"row2\"])\n", 795 | "df.index.name = \"ix\"\n", 796 | "df" 797 | ] 798 | }, 799 | { 800 | "cell_type": "code", 801 | "execution_count": null, 802 | "metadata": {}, 803 | "outputs": [], 804 | "source": [ 805 | "from openpyxl.styles import PatternFill" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": null, 811 | "metadata": {}, 812 | "outputs": [], 813 | "source": [ 814 | "with pd.ExcelWriter(\"formatting_openpyxl.xlsx\",\n", 815 | " engine=\"openpyxl\") as writer:\n", 816 | " # Write out the df with the default formatting to A1\n", 817 | " df.to_excel(writer, startrow=0, startcol=0)\n", 818 | "\n", 819 | " # Write out the df with custom index/header formatting to A6\n", 820 | " startrow, startcol = 0, 5\n", 821 | " # 1. Write out the data part of the DataFrame\n", 822 | " df.to_excel(writer, header=False, index=False,\n", 823 | " startrow=startrow + 1, startcol=startcol + 1)\n", 824 | " # Get the sheet object and create a style object\n", 825 | " sheet = writer.sheets[\"Sheet1\"]\n", 826 | " style = PatternFill(fgColor=\"D9D9D9\", fill_type=\"solid\")\n", 827 | "\n", 828 | " # 2. Write out the styled column headers\n", 829 | " for i, col in enumerate(df.columns):\n", 830 | " sheet.cell(row=startrow + 1, column=i + startcol + 2,\n", 831 | " value=col).fill = style\n", 832 | "\n", 833 | " # 3. 
Write out the styled index\n", 834 | " index = [df.index.name if df.index.name else None] + list(df.index)\n", 835 | " for i, row in enumerate(index):\n", 836 | " sheet.cell(row=i + startrow + 1, column=startcol + 1,\n", 837 | " value=row).fill = style" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": {}, 844 | "outputs": [], 845 | "source": [ 846 | "# Formatting index/headers with XlsxWriter\n", 847 | "with pd.ExcelWriter(\"formatting_xlsxwriter.xlsx\",\n", 848 | " engine=\"xlsxwriter\") as writer:\n", 849 | " # Write out the df with the default formatting to A1\n", 850 | " df.to_excel(writer, startrow=0, startcol=0)\n", 851 | "\n", 852 | " # Write out the df with custom index/header formatting to A6\n", 853 | " startrow, startcol = 0, 5\n", 854 | " # 1. Write out the data part of the DataFrame\n", 855 | " df.to_excel(writer, header=False, index=False,\n", 856 | " startrow=startrow + 1, startcol=startcol + 1)\n", 857 | " # Get the book and sheet object and create a style object\n", 858 | " book = writer.book\n", 859 | " sheet = writer.sheets[\"Sheet1\"]\n", 860 | " style = book.add_format({\"bg_color\": \"#D9D9D9\"})\n", 861 | "\n", 862 | " # 2. Write out the styled column headers\n", 863 | " for i, col in enumerate(df.columns):\n", 864 | " sheet.write(startrow, startcol + i + 1, col, style)\n", 865 | "\n", 866 | " # 3. 
Write out the styled index\n", 867 | " index = [df.index.name if df.index.name else None] + list(df.index)\n", 868 | " for i, row in enumerate(index):\n", 869 | " sheet.write(startrow + i, startcol, row, style)" 870 | ] 871 | }, 872 | { 873 | "cell_type": "code", 874 | "execution_count": null, 875 | "metadata": {}, 876 | "outputs": [], 877 | "source": [ 878 | "from openpyxl.styles import Alignment" 879 | ] 880 | }, 881 | { 882 | "cell_type": "code", 883 | "execution_count": null, 884 | "metadata": {}, 885 | "outputs": [], 886 | "source": [ 887 | "with pd.ExcelWriter(\"data_format_openpyxl.xlsx\",\n", 888 | " engine=\"openpyxl\") as writer:\n", 889 | " # Write out the DataFrame\n", 890 | " df.to_excel(writer)\n", 891 | " \n", 892 | " # Get the book and sheet objects\n", 893 | " book = writer.book\n", 894 | " sheet = writer.sheets[\"Sheet1\"]\n", 895 | " \n", 896 | " # Formatting individual cells\n", 897 | " nrows, ncols = df.shape\n", 898 | " for row in range(nrows):\n", 899 | " for col in range(ncols):\n", 900 | " # +1 to account for the header/index\n", 901 | " # +1 since OpenPyXL is 1-based\n", 902 | " cell = sheet.cell(row=row + 2,\n", 903 | " column=col + 2)\n", 904 | " cell.number_format = \"0.000\"\n", 905 | " cell.alignment = Alignment(horizontal=\"center\")" 906 | ] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": null, 911 | "metadata": {}, 912 | "outputs": [], 913 | "source": [ 914 | "with pd.ExcelWriter(\"data_format_xlsxwriter.xlsx\",\n", 915 | " engine=\"xlsxwriter\") as writer:\n", 916 | " # Write out the DataFrame\n", 917 | " df.to_excel(writer)\n", 918 | "\n", 919 | " # Get the book and sheet objects\n", 920 | " book = writer.book\n", 921 | " sheet = writer.sheets[\"Sheet1\"]\n", 922 | " \n", 923 | " # Formatting the columns (individual cells can't be formatted)\n", 924 | " number_format = book.add_format({\"num_format\": \"0.000\",\n", 925 | " \"align\": \"center\"})\n", 926 | " sheet.set_column(first_col=1, last_col=2,\n", 927 
| " cell_format=number_format)" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": null, 933 | "metadata": {}, 934 | "outputs": [], 935 | "source": [ 936 | "df.style.applymap(lambda x: \"number-format: 0.000;\"\n", 937 | " \"text-align: center\")\\\n", 938 | " .to_excel(\"styled.xlsx\")" 939 | ] 940 | }, 941 | { 942 | "cell_type": "code", 943 | "execution_count": null, 944 | "metadata": {}, 945 | "outputs": [], 946 | "source": [ 947 | "df = pd.DataFrame({\"Date\": [dt.date(2020, 1, 1)],\n", 948 | " \"Datetime\": [dt.datetime(2020, 1, 1, 10)]})\n", 949 | "with pd.ExcelWriter(\"date.xlsx\",\n", 950 | " date_format=\"yyyy-mm-dd\",\n", 951 | " datetime_format=\"yyyy-mm-dd hh:mm:ss\") as writer:\n", 952 | " df.to_excel(writer)" 953 | ] 954 | } 955 | ], 956 | "metadata": { 957 | "kernelspec": { 958 | "display_name": "Python 3", 959 | "language": "python", 960 | "name": "python3" 961 | }, 962 | "language_info": { 963 | "codemirror_mode": { 964 | "name": "ipython", 965 | "version": 3 966 | }, 967 | "file_extension": ".py", 968 | "mimetype": "text/x-python", 969 | "name": "python", 970 | "nbconvert_exporter": "python", 971 | "pygments_lexer": "ipython3", 972 | "version": "3.7.4" 973 | } 974 | }, 975 | "nbformat": 4, 976 | "nbformat_minor": 4 977 | } 978 | -------------------------------------------------------------------------------- /ch09.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Excel Automation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Getting Started with xlwings" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Using Excel as Data Viewer" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# First, let's 
import the packages that we'll use in this chapter\n", 31 | "import datetime as dt\n", 32 | "import xlwings as xw\n", 33 | "import pandas as pd\n", 34 | "import numpy as np" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "# Let's create a DataFrame based on pseudorandom numbers and\n", 44 | "# with enough rows that only the head and tail are shown\n", 45 | "df = pd.DataFrame(data=np.random.randn(100, 5),\n", 46 | " columns=[f\"Trial {i}\" for i in range(1, 6)])\n", 47 | "df" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "# View the DataFrame in Excel\n", 57 | "xw.view(df)" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## The Excel Object Model" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# Create a new empty workbook and print its name. This is the\n", 74 | "# book we will use to run most of the code samples in this chapter.\n", 75 | "book = xw.Book()\n", 76 | "book.name" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# Accessing the sheets collection\n", 86 | "book.sheets" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "# Get a sheet object by index or name. 
You will need to adjust\n", 96 | "# \"Sheet1\" if your sheet is called differently.\n", 97 | "sheet1 = book.sheets[0]\n", 98 | "sheet1 = book.sheets[\"Sheet1\"]" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "sheet1.range(\"A1\")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "# Most common tasks: write values...\n", 117 | "sheet1.range(\"A1\").value = [[1, 2],\n", 118 | " [3, 4]]\n", 119 | "sheet1.range(\"A4\").value = \"Hello!\"" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# ...and read values\n", 129 | "sheet1.range(\"A1:B2\").value" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "sheet1.range(\"A4\").value" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# Indexing\n", 148 | "sheet1.range(\"A1:B2\")[0, 0]" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "# Slicing\n", 158 | "sheet1.range(\"A1:B2\")[:, 1]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "# Single cell: A1 notation\n", 168 | "sheet1[\"A1\"]" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "# Multiple cells: A1 notation\n", 178 | "sheet1[\"A1:B2\"]" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | 
"source": [ 187 | "# Single cell: indexing\n", 188 | "sheet1[0, 0]" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# Multiple cells: slicing\n", 198 | "sheet1[:2, :2]" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# D10 via sheet indexing\n", 208 | "sheet1[9, 3]" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "# D10 via range object\n", 218 | "sheet1.range((10, 4))" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "# D10:F11 via sheet slicing\n", 228 | "sheet1[9:11, 3:6]" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "# D10:F11 via range object\n", 238 | "sheet1.range((10, 4), (11, 6))" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "sheet1[\"A1\"].sheet.book.app" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": {}, 254 | "outputs": [], 255 | "source": [ 256 | "# Get one app object from the open workbook\n", 257 | "# and create an additional invisible app instance\n", 258 | "visible_app = sheet1.book.app\n", 259 | "invisible_app = xw.App(visible=False)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "# List the book names that are open in each instance\n", 269 | "# by using a list comprehension\n", 270 | "[book.name for book in visible_app.books]" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | 
"execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "[book.name for book in invisible_app.books]" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "# An app key represents the process ID (PID)\n", 289 | "xw.apps.keys()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "# It can also be accessed via the pid attribute\n", 299 | "xw.apps.active.pid" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "# Work with the book in the invisible Excel instance\n", 309 | "invisible_book = invisible_app.books[0]\n", 310 | "invisible_book.sheets[0][\"A1\"].value = \"Created by an invisible app.\"" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "metadata": {}, 317 | "outputs": [], 318 | "source": [ 319 | "# Save the Excel workbook in the xl directory\n", 320 | "invisible_book.save(\"xl/invisible.xlsx\")" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "# Quit the invisible Excel instance\n", 330 | "invisible_app.quit()" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "## Running VBA Code" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "vba_book = xw.Book(\"xl/vba.xlsm\")" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": null, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "# Instantiate a macro object with the VBA function\n", 356 | "mysum = vba_book.macro(\"Module1.MySum\")\n", 357 | "# Call a VBA 
function\n", 358 | "mysum(5, 4)" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": null, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "# It works the same with a VBA Sub procedure\n", 368 | "show_msgbox = vba_book.macro(\"Module1.ShowMsgBox\")\n", 369 | "show_msgbox(\"Hello xlwings!\")" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "# Close the book again (make sure to close the MessageBox first)\n", 379 | "vba_book.close()" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": {}, 385 | "source": [ 386 | "# Converters, Options and Collections" 387 | ] 388 | }, 389 | { 390 | "cell_type": "markdown", 391 | "metadata": {}, 392 | "source": [ 393 | "## Working with DataFrames" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "data = [[\"Mark\", 55, \"Italy\", 4.5, \"Europe\"],\n", 403 | " [\"John\", 33, \"USA\", 6.7, \"America\"]]\n", 404 | "df = pd.DataFrame(data=data,\n", 405 | " columns=[\"name\", \"age\", \"country\",\n", 406 | " \"score\", \"continent\"],\n", 407 | " index=[1001, 1000])\n", 408 | "df.index.name = \"user_id\"\n", 409 | "df" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "sheet1[\"A6\"].value = df" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "sheet1[\"B10\"].options(header=False, index=False).value = df" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "df2 = sheet1[\"A6\"].expand().options(pd.DataFrame).value\n", 437 | "df2" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 
442 | "execution_count": null, 443 | "metadata": {}, 444 | "outputs": [], 445 | "source": [ 446 | "# If you want the index to be an integer index,\n", 447 | "# you can change its data type\n", 448 | "df2.index = df2.index.astype(int)\n", 449 | "df2" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": null, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "# By setting index=False, it will put all the values from Excel into\n", 459 | "# the data part of the DataFrame and will use the default index\n", 460 | "sheet1[\"A6\"].expand().options(pd.DataFrame, index=False).value" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "## Converters and Options" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "# Horizontal range (one-dimensional)\n", 477 | "sheet1[\"A1:B1\"].value" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": {}, 484 | "outputs": [], 485 | "source": [ 486 | "# Vertical range (one-dimensional)\n", 487 | "sheet1[\"A1:A2\"].value" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "# Horizontal range (two-dimensional)\n", 497 | "sheet1[\"A1:B1\"].options(ndim=2).value" 498 | ] 499 | }, 500 | { 501 | "cell_type": "code", 502 | "execution_count": null, 503 | "metadata": {}, 504 | "outputs": [], 505 | "source": [ 506 | "# Vertical range (two-dimensional)\n", 507 | "sheet1[\"A1:A2\"].options(ndim=2).value" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "# Using the NumPy array converter behaves the same:\n", 517 | "# vertical range leads to a one-dimensional array\n", 518 | "sheet1[\"A1:A2\"].options(np.array).value" 519 | 
] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "# Preserving the column orientation\n", 528 | "sheet1[\"A1:A2\"].options(np.array, ndim=2).value" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": {}, 535 | "outputs": [], 536 | "source": [ 537 | "# If you need to write out a list vertically,\n", 538 | "# the \"transpose\" option comes in handy\n", 539 | "sheet1[\"D1\"].options(transpose=True).value = [100, 200]" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "metadata": {}, 546 | "outputs": [], 547 | "source": [ 548 | "# Write out some sample data\n", 549 | "sheet1[\"A13\"].value = [dt.datetime(2020, 1, 1), None, 1.0]" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": null, 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "# Read it back using the default options\n", 559 | "sheet1[\"A13:C13\"].value" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "metadata": {}, 566 | "outputs": [], 567 | "source": [ 568 | "# Read it back using non-default options\n", 569 | "sheet1[\"A13:C13\"].options(empty=\"NA\",\n", 570 | " dates=dt.date,\n", 571 | " numbers=int).value" 572 | ] 573 | }, 574 | { 575 | "cell_type": "markdown", 576 | "metadata": {}, 577 | "source": [ 578 | "## Charts, Pictures and Defined Names" 579 | ] 580 | }, 581 | { 582 | "cell_type": "code", 583 | "execution_count": null, 584 | "metadata": {}, 585 | "outputs": [], 586 | "source": [ 587 | "sheet1[\"A15\"].value = [[None, \"North\", \"South\"],\n", 588 | " [\"Last Year\", 2, 5],\n", 589 | " [\"This Year\", 3, 6]]" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "metadata": {}, 596 | "outputs": [], 597 | "source": [ 598 | "chart = sheet1.charts.add(top=sheet1[\"A19\"].top,\n", 599 | " 
left=sheet1[\"A19\"].left)\n", 600 | "chart.chart_type = \"column_clustered\"\n", 601 | "chart.set_source_data(sheet1[\"A15\"].expand())" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "# Read in the chart data as DataFrame\n", 611 | "df = sheet1[\"A15\"].expand().options(pd.DataFrame).value\n", 612 | "df" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": {}, 619 | "outputs": [], 620 | "source": [ 621 | "# Enable Matplotlib by using the notebook magic command\n", 622 | "# and switch to the \"seaborn\" style\n", 623 | "%matplotlib inline\n", 624 | "import matplotlib.pyplot as plt\n", 625 | "plt.style.use(\"seaborn\")" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "metadata": {}, 632 | "outputs": [], 633 | "source": [ 634 | "# The pandas plot method returns an \"axis\" object from\n", 635 | "# where you can get the figure. 
\"T\" transposes the\n", 636 | "# DataFrame to bring the plot into the desired orientation\n", 637 | "ax = df.T.plot.bar()\n", 638 | "fig = ax.get_figure()" 639 | ] 640 | }, 641 | { 642 | "cell_type": "code", 643 | "execution_count": null, 644 | "metadata": {}, 645 | "outputs": [], 646 | "source": [ 647 | "# Send the plot to Excel\n", 648 | "plot = sheet1.pictures.add(fig, name=\"SalesPlot\",\n", 649 | " top=sheet1[\"H19\"].top,\n", 650 | " left=sheet1[\"H19\"].left)\n", 651 | "# Let's scale the plot to 70%\n", 652 | "plot.width, plot.height = plot.width * 0.7, plot.height * 0.7" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "metadata": {}, 659 | "outputs": [], 660 | "source": [ 661 | "ax = (df + 1).T.plot.bar()\n", 662 | "plot = plot.update(ax.get_figure())" 663 | ] 664 | }, 665 | { 666 | "cell_type": "code", 667 | "execution_count": null, 668 | "metadata": {}, 669 | "outputs": [], 670 | "source": [ 671 | "# The book scope is the default scope\n", 672 | "sheet1[\"A1:B2\"].name = \"matrix1\"" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": null, 678 | "metadata": {}, 679 | "outputs": [], 680 | "source": [ 681 | "# For the sheet scope, prepend the sheet name with\n", 682 | "# an exclamation point\n", 683 | "sheet1[\"B10:E11\"].name = \"Sheet1!matrix2\"" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "metadata": {}, 690 | "outputs": [], 691 | "source": [ 692 | "# Now you can access the range by name\n", 693 | "sheet1[\"matrix1\"]" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": null, 699 | "metadata": {}, 700 | "outputs": [], 701 | "source": [ 702 | "# If you access the names collection via the \"sheet1\" object,\n", 703 | "# it contains only names with that sheet's scope\n", 704 | "sheet1.names" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": null, 710 | "metadata": {}, 711 | "outputs": [], 712 | 
"source": [ 713 | "# If you access the names collection via the \"book\" object,\n", 714 | "# it contains all names, including book and sheet scope\n", 715 | "book.names" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": null, 721 | "metadata": {}, 722 | "outputs": [], 723 | "source": [ 724 | "# Names have various methods and attributes.\n", 725 | "# You can, for example, get the respective range object.\n", 726 | "book.names[\"matrix1\"].refers_to_range" 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": null, 732 | "metadata": {}, 733 | "outputs": [], 734 | "source": [ 735 | "# If you want to assign a name to a constant\n", 736 | "# or a formula, use the \"add\" method.\n", 737 | "# You may need to replace the decimal point with a comma\n", 738 | "# if your are using an international version of Excel.\n", 739 | "book.names.add(\"EURUSD\", \"=1.1151\")" 740 | ] 741 | }, 742 | { 743 | "cell_type": "markdown", 744 | "metadata": {}, 745 | "source": [ 746 | "# Advanced Topics" 747 | ] 748 | }, 749 | { 750 | "cell_type": "markdown", 751 | "metadata": {}, 752 | "source": [ 753 | "## Performance" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": null, 759 | "metadata": {}, 760 | "outputs": [], 761 | "source": [ 762 | "# Add a new sheet and write 150 values\n", 763 | "# to it to have something to work with\n", 764 | "sheet2 = book.sheets.add()\n", 765 | "sheet2[\"A1\"].value = np.arange(150).reshape(30, 5)" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": null, 771 | "metadata": {}, 772 | "outputs": [], 773 | "source": [ 774 | "%%time\n", 775 | "# This makes 150 cross-application calls\n", 776 | "for cell in sheet2[\"A1:E30\"]:\n", 777 | " cell.value += 1" 778 | ] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": null, 783 | "metadata": {}, 784 | "outputs": [], 785 | "source": [ 786 | "%%time\n", 787 | "# This makes just two cross-application 
calls\n", 788 | "values = sheet2[\"A1:E30\"].options(np.array).value\n", 789 | "sheet2[\"A1:E30\"].value = values + 1" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": null, 795 | "metadata": {}, 796 | "outputs": [], 797 | "source": [ 798 | "# With raw values, you must provide the full\n", 799 | "# target range, sheet[\"A35\"] doesn't work anymore\n", 800 | "sheet1[\"A35:B36\"].options(\"raw\").value = [[1, 2], [3, 4]]" 801 | ] 802 | } 803 | ], 804 | "metadata": { 805 | "kernelspec": { 806 | "display_name": "Python 3", 807 | "language": "python", 808 | "name": "python3" 809 | }, 810 | "language_info": { 811 | "codemirror_mode": { 812 | "name": "ipython", 813 | "version": 3 814 | }, 815 | "file_extension": ".py", 816 | "mimetype": "text/x-python", 817 | "name": "python", 818 | "nbconvert_exporter": "python", 819 | "pygments_lexer": "ipython3", 820 | "version": "3.7.4" 821 | } 822 | }, 823 | "nbformat": 4, 824 | "nbformat_minor": 4 825 | } 826 | -------------------------------------------------------------------------------- /ch11.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Case Study Preliminaries" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Web APIs" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import json" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "pycharm": { 31 | "name": "#%%\n" 32 | } 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "# A Python dictionary...\n", 37 | "user_dict = {\"name\": \"Jane Doe\",\n", 38 | " \"age\": 23,\n", 39 | " \"married\": False,\n", 40 | " \"children\": None,\n", 41 | " \"hobbies\": [\"hiking\", \"reading\"]}" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 
46 | "execution_count": null, 47 | "metadata": { 48 | "pycharm": { 49 | "name": "#%%\n" 50 | } 51 | }, 52 | "outputs": [], 53 | "source": [ 54 | "# ...converted to a JSON string\n", 55 | "# by json.dumps (\"dump string\"). The \"indent\" parameter is\n", 56 | "# optional and prettifies the printing.\n", 57 | "user_json = json.dumps(user_dict, indent=4)\n", 58 | "print(user_json)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "pycharm": { 66 | "name": "#%%\n" 67 | } 68 | }, 69 | "outputs": [], 70 | "source": [ 71 | "# Convert the JSON string back to a native Python data structure\n", 72 | "json.loads(user_json)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "import requests" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "pycharm": { 89 | "name": "#%%\n" 90 | } 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "response = requests.get(\"https://pypi.org/pypi/pandas/json\")\n", 95 | "response.status_code" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "pycharm": { 103 | "name": "#%%\n" 104 | } 105 | }, 106 | "outputs": [], 107 | "source": [ 108 | "# response.json()" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": { 115 | "pycharm": { 116 | "name": "#%%\n" 117 | } 118 | }, 119 | "outputs": [], 120 | "source": [ 121 | "releases = []\n", 122 | "for version, files in response.json()['releases'].items():\n", 123 | " releases.append(f\"{version}: {files[0]['upload_time']}\")\n", 124 | "releases[:3] # show the first 3 elements of the list" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## Databases" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | 
"outputs": [], 139 | "source": [ 140 | "import urllib.parse" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": { 147 | "pycharm": { 148 | "name": "#%%\n" 149 | } 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "urllib.parse.quote_plus(\"pa$$word\")" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "# Let's start with the imports\n", 163 | "import sqlite3\n", 164 | "from sqlalchemy import create_engine\n", 165 | "import pandas as pd" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": { 172 | "pycharm": { 173 | "name": "#%%\n" 174 | } 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "# Our SQL query: \"select all columns from the packages table\"\n", 179 | "sql = \"SELECT * FROM packages\"" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "pycharm": { 187 | "name": "#%%\n" 188 | } 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "# Option 1: Database driver (sqlite3 is part of the standard library)\n", 193 | "# Using the connection as context manager automatically commits\n", 194 | "# the transaction or rolls it back in case of an error.\n", 195 | "with sqlite3.connect(\"packagetracker/packagetracker.db\") as con:\n", 196 | " cursor = con.cursor() # We need a cursor to run SQL queries\n", 197 | " result = cursor.execute(sql).fetchall() # Return all records\n", 198 | "result" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "pycharm": { 206 | "name": "#%%\n" 207 | } 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "# Option 2: SQLAlchemy\n", 212 | "# \"create_engine\" expects the connection string of your database.\n", 213 | "# Here, we can execute a query as a method of the connection object.\n", 214 | "engine = 
create_engine(\"sqlite:///packagetracker/packagetracker.db\")\n", 215 | "with engine.connect() as con:\n", 216 | " result = con.execute(sql).fetchall()\n", 217 | "result" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "pycharm": { 225 | "name": "#%%\n" 226 | } 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "# Option 3: pandas\n", 231 | "# Providing a table name to \"read_sql\" reads the full table.\n", 232 | "# Pandas requires an SQLAlchemy engine that we reuse from\n", 233 | "# the previous example.\n", 234 | "df = pd.read_sql(\"packages\", engine, index_col=\"package_id\")\n", 235 | "df" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": { 242 | "pycharm": { 243 | "name": "#%%\n" 244 | } 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "# \"read_sql\" also accepts an SQL query\n", 249 | "pd.read_sql(sql, engine, index_col=\"package_id\")" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "pycharm": { 257 | "name": "#%%\n" 258 | } 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "# The DataFrame method \"to_sql\" writes DataFrames to tables\n", 263 | "# \"if_exists\" has to be either \"fail\", \"append\" or \"replace\"\n", 264 | "# and defines what happens if the table already exists\n", 265 | "df.to_sql(\"packages2\", con=engine, if_exists=\"append\")" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": { 272 | "pycharm": { 273 | "name": "#%%\n" 274 | } 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "# The previous command created a new table \"packages2\" and\n", 279 | "# inserted the records from the DataFrame df as we can\n", 280 | "# verify by reading it back\n", 281 | "pd.read_sql(\"packages2\", engine, index_col=\"package_id\")" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | 
"metadata": { 288 | "pycharm": { 289 | "name": "#%%\n" 290 | } 291 | }, 292 | "outputs": [], 293 | "source": [ 294 | "# Let's get rid of the table again by running the\n", 295 | "# \"drop table\" command via SQLAlchemy\n", 296 | "with engine.connect() as con:\n", 297 | " con.execute(\"DROP TABLE packages2\")" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "# Let's start by importing SQLAlchemy's text function\n", 307 | "from sqlalchemy.sql import text" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "pycharm": { 315 | "name": "#%%\n" 316 | } 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "# \":package_id\" is the placeholder\n", 321 | "sql = \"\"\"\n", 322 | "SELECT v.uploaded_at, v.version_string\n", 323 | "FROM packages p\n", 324 | "INNER JOIN package_versions v ON p.package_id = v.package_id\n", 325 | "WHERE p.package_id = :package_id\n", 326 | "ORDER BY v.uploaded_at\n", 327 | "\"\"\"" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": { 334 | "pycharm": { 335 | "name": "#%%\n" 336 | } 337 | }, 338 | "outputs": [], 339 | "source": [ 340 | "# Via SQLAlchemy\n", 341 | "with engine.connect() as con:\n", 342 | " result = con.execute(text(sql), package_id=1).fetchall()\n", 343 | "result[:3] # Print the first 3 records" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "pycharm": { 351 | "name": "#%%\n" 352 | } 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "# Via pandas\n", 357 | "pd.read_sql(text(sql), engine, parse_dates=[\"uploaded_at\"],\n", 358 | " params={\"package_id\": 1},\n", 359 | " index_col=[\"uploaded_at\"]).head(3)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "## Exceptions" 367 | ] 368 | }, 369 | { 370 | 
"cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "pycharm": { 374 | "name": "#%%\n" 375 | } 376 | }, 377 | "outputs": [], 378 | "source": [ 379 | "def print_reciprocal(number):\n", 380 | " result = 1 / number\n", 381 | " print(f\"The reciprocal is: {result}\")" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": { 388 | "pycharm": { 389 | "name": "#%%\n" 390 | } 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "print_reciprocal(0) # This will raise an error" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": { 401 | "pycharm": { 402 | "name": "#%%\n" 403 | } 404 | }, 405 | "outputs": [], 406 | "source": [ 407 | "def print_reciprocal(number):\n", 408 | " try:\n", 409 | " result = 1 / number\n", 410 | " except Exception as e:\n", 411 | " # \"as e\" makes the Exception object available as variable \"e\"\n", 412 | " # \"repr\" stands for \"printable representation\" of an object\n", 413 | " # and gives you back a string with the error message\n", 414 | " print(f\"There was an error: {repr(e)}\")\n", 415 | " result = \"N/A\"\n", 416 | " else:\n", 417 | " print(\"There was no error!\")\n", 418 | " finally:\n", 419 | " print(f\"The reciprocal is: {result}\")" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "metadata": { 426 | "pycharm": { 427 | "name": "#%%\n" 428 | } 429 | }, 430 | "outputs": [], 431 | "source": [ 432 | "print_reciprocal(10)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "metadata": { 439 | "pycharm": { 440 | "name": "#%%\n" 441 | } 442 | }, 443 | "outputs": [], 444 | "source": [ 445 | "print_reciprocal(\"a\")" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": { 452 | "pycharm": { 453 | "name": "#%%\n" 454 | } 455 | }, 456 | "outputs": [], 457 | "source": [ 458 | "print_reciprocal(0)" 459 | 
] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": { 465 | "pycharm": { 466 | "name": "#%%\n" 467 | } 468 | }, 469 | "outputs": [], 470 | "source": [ 471 | "def print_reciprocal(number):\n", 472 | " try:\n", 473 | " result = 1 / number\n", 474 | " print(f\"The reciprocal is: {result}\")\n", 475 | " except (TypeError, ZeroDivisionError):\n", 476 | " print(\"Please type in any number except 0.\")" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": { 483 | "pycharm": { 484 | "name": "#%%\n" 485 | } 486 | }, 487 | "outputs": [], 488 | "source": [ 489 | "print_reciprocal(\"a\")" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "metadata": { 496 | "pycharm": { 497 | "name": "#%%\n" 498 | } 499 | }, 500 | "outputs": [], 501 | "source": [ 502 | "def print_reciprocal(number):\n", 503 | " try:\n", 504 | " result = 1 / number\n", 505 | " print(f\"The reciprocal is: {result}\")\n", 506 | " except TypeError:\n", 507 | " print(\"Please type in a number.\")\n", 508 | " except ZeroDivisionError:\n", 509 | " print(\"The reciprocal of 0 is not defined.\")" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "print_reciprocal(\"a\")" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "print_reciprocal(0)" 528 | ] 529 | } 530 | ], 531 | "metadata": { 532 | "kernelspec": { 533 | "display_name": "Python 3", 534 | "language": "python", 535 | "name": "python3" 536 | }, 537 | "language_info": { 538 | "codemirror_mode": { 539 | "name": "ipython", 540 | "version": 3 541 | }, 542 | "file_extension": ".py", 543 | "mimetype": "text/x-python", 544 | "name": "python", 545 | "nbconvert_exporter": "python", 546 | "pygments_lexer": "ipython3", 547 | "version": "3.7.4" 548 | 
} 549 | }, 550 | "nbformat": 4, 551 | "nbformat_minor": 4 552 | } 553 | -------------------------------------------------------------------------------- /ch12.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# User-Defined Functions (UDFs)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Function Decorators" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# This is the definition of the function decorator\n", 24 | "def verbose(func):\n", 25 | " def wrapper():\n", 26 | " print(\"Before calling the function.\")\n", 27 | " func()\n", 28 | " print(\"After calling the function.\")\n", 29 | " return wrapper" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "# Using a function decorator\n", 39 | "@verbose\n", 40 | "def print_hello():\n", 41 | " print(\"hello!\")" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "# Effect of calling the decorated function\n", 51 | "print_hello()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": {}, 57 | "source": [ 58 | "## Fetching Data from Google Trends" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "from pytrends.request import TrendReq" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "# First, let's instantiate a TrendRequest object\n", 77 | "trend = TrendReq()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": 
[], 85 | "source": [ 86 | "# Now we can print the suggestions as they would appear\n", 87 | "# online in the dropdown of Google Trends after typing in \"Python\"\n", 88 | "trend.suggestions(\"Python\")" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": {}, 94 | "source": [ 95 | "## Caching" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "import time" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "cache = {}\n", 114 | "\n", 115 | "def slow_sum(a, b):\n", 116 | " key = (a, b)\n", 117 | " if key in cache:\n", 118 | " return cache[key]\n", 119 | " else:\n", 120 | " time.sleep(2) # sleep for 2 seconds\n", 121 | " result = a + b\n", 122 | " cache[key] = result\n", 123 | " return result" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "%%time\n", 133 | "slow_sum(1, 2)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "%%time\n", 143 | "slow_sum(1, 2)" 144 | ] 145 | } 146 | ], 147 | "metadata": { 148 | "kernelspec": { 149 | "display_name": "Python 3", 150 | "language": "python", 151 | "name": "python3" 152 | }, 153 | "language_info": { 154 | "codemirror_mode": { 155 | "name": "ipython", 156 | "version": 3 157 | }, 158 | "file_extension": ".py", 159 | "mimetype": "text/x-python", 160 | "name": "python", 161 | "nbconvert_exporter": "python", 162 | "pygments_lexer": "ipython3", 163 | "version": "3.7.4" 164 | } 165 | }, 166 | "nbformat": 4, 167 | "nbformat_minor": 4 168 | } 169 | -------------------------------------------------------------------------------- /conda/xl310.yml: -------------------------------------------------------------------------------- 1 | 
name: xl310 2 | channels: 3 | - defaults 4 | dependencies: 5 | - python=3.10 6 | - pip=21.2.4 7 | - pip: 8 | - flake8==4.0.1 9 | - lxml==4.7.1 10 | - matplotlib==3.5.1 11 | - notebook==6.4.6 12 | - openpyxl==3.0.9 13 | - pandas==1.3.5 14 | - numpy==1.21.0 15 | - pillow==8.4.0 16 | - plotly==5.4.0 17 | - python-dateutil==2.8.2 18 | - requests==2.26.0 19 | - sqlalchemy==1.4.28 20 | - xlrd==2.0.1 21 | - xlsxwriter==3.0.2 22 | - xlutils==2.0.0 23 | - xlwings==0.25.3 24 | - xlwt==1.3.0 25 | - pytrends==4.7.3 26 | - pyxlsb==1.0.9 27 | -------------------------------------------------------------------------------- /conda/xl38.yml: -------------------------------------------------------------------------------- 1 | name: xl38 2 | channels: 3 | - defaults 4 | dependencies: 5 | - flake8=3.8.4 6 | - lxml=4.6.1 7 | - matplotlib=3.3.2 8 | - notebook=6.1.4 9 | - openpyxl=3.0.5 10 | - pandas=1.1.3 11 | - numpy=1.19.2 12 | - pillow=8.0.1 13 | - pip=20.2.4 14 | - plotly=4.14.1 15 | - python=3.8.5 16 | - python-dateutil=2.8.1 17 | - requests=2.24.0 18 | - sqlalchemy=1.3.20 19 | - xlrd=1.2.0 20 | - xlsxwriter=1.3.7 21 | - xlutils=2.0.0 22 | - xlwings=0.20.8 23 | - xlwt=1.3.0 24 | - pip: 25 | - pytrends==4.7.3 26 | - pyxlsb==1.0.7 -------------------------------------------------------------------------------- /debugging.py: -------------------------------------------------------------------------------- 1 | a = 3 2 | b = 4 3 | 4 | c = a + b 5 | 6 | print(c) 7 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | # This is a copy of conda/xl38.yml (but without xlwings) for Binder 2 | name: xl38 3 | channels: 4 | - defaults 5 | dependencies: 6 | - flake8=3.8.4 7 | - lxml=4.6.1 8 | - matplotlib=3.3.2 9 | - notebook=6.1.4 10 | - openpyxl=3.0.5 11 | - pandas=1.1.3 12 | - numpy=1.19.2 13 | - pillow=8.0.1 14 | - pip=20.2.4 15 | - plotly=4.14.1 16 | - 
"""This module offers a read and write function to get
2-dimensional lists in and out of Excel files.

Readers: xlrd, OpenPyXL, pyxlsb. Writers: OpenPyXL, XlsxWriter, xlwt.
All backends are optional dependencies.
"""
import re
import itertools
import datetime as dt

# Optional dependencies: each backend is imported if available and set
# to None otherwise, so the isinstance dispatch below can first check
# the module object for truthiness.
try:
    import openpyxl
except ImportError:
    openpyxl = None
try:
    import pyxlsb
except ImportError:
    pyxlsb = None
try:
    import xlrd
    from xlrd.biffh import error_text_from_code
except ImportError:
    xlrd = None
try:
    import xlwt
except ImportError:
    xlwt = None
try:
    import xlsxwriter
except ImportError:
    xlsxwriter = None

# Compiled once at import time instead of on every call to
# xl_cell_to_rowcol (which runs for every string cell reference)
_A1_PATTERN = re.compile(r"(\$?)([A-Z]{1,3})(\$?)(\d+)")


def read(sheet, first_cell="A1", last_cell=None):
    """Read a 2-dimensional list from an Excel range.

    Parameters
    ----------
    sheet : object
        An xlrd, openpyxl or pyxlsb sheet object
    first_cell : str or tuple, optional
        Top-left corner of the Excel range you want to read.
        Can be a string like "A1" or a row/col tuple like (1, 1),
        default is "A1".
    last_cell : str or tuple, optional
        Bottom-right corner of the Excel range you want to read.
        Can be a string like "A1" or a row/col tuple like (1, 1),
        default is the bottom-right cell of the used range.

    Returns
    -------
    list
        A 2-dimensional list with the values of the Excel range

    Raises
    ------
    TypeError
        If sheet is not a supported sheet type (or the matching
        backend package is not installed).
    """
    # xlrd
    if xlrd and isinstance(sheet, xlrd.sheet.Sheet):
        # isinstance returns True if sheet is of type xlrd.sheet.Sheet
        if last_cell is None:
            # actual range with data, not used range
            last_cell = (sheet.nrows, sheet.ncols)
        # Transform "A1" notation into tuples of 1-based indices
        if not isinstance(first_cell, tuple):
            first_cell = xl_cell_to_rowcol(first_cell)
            first_cell = (first_cell[0] + 1, first_cell[1] + 1)
        if not isinstance(last_cell, tuple):
            last_cell = xl_cell_to_rowcol(last_cell)
            last_cell = (last_cell[0] + 1, last_cell[1] + 1)
        values = []
        for r in range(first_cell[0] - 1, last_cell[0]):
            row = []
            for c in range(first_cell[1] - 1, last_cell[1]):
                # Handle the different cell types
                if sheet.cell(r, c).ctype == xlrd.XL_CELL_DATE:
                    value = xlrd.xldate.xldate_as_datetime(
                        sheet.cell(r, c).value, sheet.book.datemode)
                elif sheet.cell(r, c).ctype in [xlrd.XL_CELL_EMPTY,
                                                xlrd.XL_CELL_BLANK]:
                    value = None
                elif sheet.cell(r, c).ctype == xlrd.XL_CELL_ERROR:
                    value = error_text_from_code[sheet.cell(r, c).value]
                elif sheet.cell(r, c).ctype == xlrd.XL_CELL_BOOLEAN:
                    value = bool(sheet.cell(r, c).value)
                else:
                    value = sheet.cell(r, c).value
                row.append(value)
            values.append(row)
        return values

    # OpenPyXL
    elif openpyxl and isinstance(
            sheet,
            (openpyxl.worksheet.worksheet.Worksheet,
             openpyxl.worksheet._read_only.ReadOnlyWorksheet)):
        if last_cell is None:
            # used range
            last_cell = (sheet.max_row, sheet.max_column)
        if not isinstance(first_cell, tuple):
            first_cell = openpyxl.utils.cell.coordinate_to_tuple(first_cell)
        if not isinstance(last_cell, tuple):
            last_cell = openpyxl.utils.cell.coordinate_to_tuple(last_cell)
        data = []
        for row in sheet.iter_rows(min_row=first_cell[0], min_col=first_cell[1],
                                   max_row=last_cell[0], max_col=last_cell[1],
                                   values_only=True):
            data.append(list(row))
        return data

    # pyxlsb
    elif pyxlsb and isinstance(sheet, pyxlsb.worksheet.Worksheet):
        # pyxlsb delivers error cells as raw codes; map them to the
        # strings Excel displays
        errors = {"0x0": "#NULL!", "0x7": "#DIV/0!", "0xf": "#VALUE!",
                  "0x17": "#REF!", "0x1d": "#NAME?", "0x24": "#NUM!",
                  "0x2a": "#N/A"}
        if not isinstance(first_cell, tuple):
            first_cell = xl_cell_to_rowcol(first_cell)
            first_cell = (first_cell[0] + 1, first_cell[1] + 1)
        if last_cell and not isinstance(last_cell, tuple):
            last_cell = xl_cell_to_rowcol(last_cell)
            last_cell = (last_cell[0] + 1, last_cell[1] + 1)
        data = []
        # sheet.rows() is a generator that requires islice to slice it
        for row in itertools.islice(sheet.rows(),
                                    first_cell[0] - 1,
                                    last_cell[0] if last_cell else None):
            data.append([errors.get(cell.v, cell.v) for cell in row]
                        [first_cell[1] - 1 : last_cell[1] if last_cell else None])
        return data
    else:
        raise TypeError(f"Couldn't handle sheet of type {type(sheet)}")


def write(sheet, values, first_cell="A1", date_format=None):
    """Write a 2-dimensional list to an Excel range.

    Parameters
    ----------
    sheet : object
        An openpyxl, xlsxwriter or xlwt sheet object. openpyxl's
        write_only=True mode is not supported.
    values : list
        A 2-dimensional list of values
    first_cell : str or tuple, optional
        Top-left corner of the Excel range where you want to write out
        the DataFrame. Can be a string like "A1" or a row/col tuple
        like (1, 1), default is "A1".
    date_format : str, optional
        Only accepted if sheet is an openpyxl or xlwt sheet. By default,
        formats dates in the following format: "mm/dd/yy". For xlsxwriter,
        set the format when you instantiate a Workbook by providing:
        options={"default_date_format": "mm/dd/yy"}

    Raises
    ------
    ValueError
        If date_format is passed together with an xlsxwriter sheet.
    TypeError
        If sheet is not a supported sheet type (or the matching
        backend package is not installed).
    """
    # OpenPyXL
    if openpyxl and isinstance(
            sheet, openpyxl.worksheet.worksheet.Worksheet):
        if date_format is None:
            date_format = "mm/dd/yy"
        if not isinstance(first_cell, tuple):
            # Consistent with read(): use the fully qualified helper
            first_cell = openpyxl.utils.cell.coordinate_to_tuple(first_cell)
        for i, row in enumerate(values):
            for j, value in enumerate(row):
                cell = sheet.cell(row=first_cell[0] + i,
                                  column=first_cell[1] + j)
                cell.value = value
                if date_format and isinstance(value, (dt.datetime, dt.date)):
                    cell.number_format = date_format

    # XlsxWriter
    elif xlsxwriter and isinstance(sheet, xlsxwriter.worksheet.Worksheet):
        if date_format is not None:
            raise ValueError("date_format must be set as Workbook option")
        if isinstance(first_cell, tuple):
            # Convert 1-based tuple to XlsxWriter's 0-based indices
            first_cell = first_cell[0] - 1, first_cell[1] - 1
        else:
            first_cell = xl_cell_to_rowcol(first_cell)
        for r, row_data in enumerate(values):
            sheet.write_row(first_cell[0] + r, first_cell[1], row_data)

    # xlwt
    elif xlwt and isinstance(sheet, xlwt.Worksheet):
        if date_format is None:
            date_format = "mm/dd/yy"
        date_format = xlwt.easyxf(num_format_str=date_format)
        if isinstance(first_cell, tuple):
            first_cell = (first_cell[0] - 1, first_cell[1] - 1)
        else:
            first_cell = xl_cell_to_rowcol(first_cell)
        for i, row in enumerate(values):
            for j, cell in enumerate(row):
                if isinstance(cell, (dt.datetime, dt.date)):
                    sheet.write(i + first_cell[0], j + first_cell[1],
                                cell, date_format)
                else:
                    sheet.write(i + first_cell[0], j + first_cell[1],
                                cell)
    else:
        raise TypeError(f"Couldn't handle sheet of type {type(sheet)}")


def xl_cell_to_rowcol(cell_str):
    """
    Convert a cell reference in A1 notation to a zero indexed row and column.

    Args:
        cell_str: A1 style string.

    Returns:
        row, col: Zero indexed cell row and column indices.

    Raises:
        ValueError: If cell_str is not a valid A1-style cell reference.

    This function is adapted from XlsxWriter
    Copyright (c) 2013-2020, John McNamara
    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:

    1. Redistributions of source code must retain the above copyright notice, this
    list of conditions and the following disclaimer.
    2. Redistributions in binary form must reproduce the above copyright notice,
    this list of conditions and the following disclaimer in the documentation
    and/or other materials provided with the distribution.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
    ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
    WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
    DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
    ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
    ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

    The views and conclusions contained in the software and documentation are those
    of the authors and should not be interpreted as representing official policies,
    either expressed or implied, of the FreeBSD Project.
    """
    if not cell_str:
        return 0, 0

    match = _A1_PATTERN.match(cell_str)
    if match is None:
        # Previously this crashed with an opaque AttributeError on
        # match.group(); fail with a clear message instead.
        raise ValueError(f"Invalid A1-style cell reference: {cell_str!r}")
    col_str = match.group(2)
    row_str = match.group(4)

    # Convert base26 column string to number.
    expn = 0
    col = 0
    for char in reversed(col_str):
        col += (ord(char) - ord("A") + 1) * (26 ** expn)
        expn += 1

    # Convert 1-index to zero-index
    row = int(row_str) - 1
    col -= 1

    return row, col
# Have SQLAlchemy enforce foreign keys with SQLite, see:
# https://docs.sqlalchemy.org/en/latest/dialects/sqlite.html#foreign-key-support
@event.listens_for(Engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    """Turn on foreign-key enforcement for every new SQLite connection.

    SQLite disables foreign-key checks per connection by default, so this
    event hook issues the PRAGMA each time SQLAlchemy opens a raw
    connection. The isinstance check keeps the hook a no-op for any
    non-SQLite engine in the same process.
    """
    if isinstance(dbapi_connection, SQLite3Connection):
        cursor = dbapi_connection.cursor()
        cursor.execute("PRAGMA foreign_keys=ON")
        cursor.close()


# We want the database file to sit next to this file.
# Here, we are turning the path into an absolute path.
this_dir = Path(__file__).resolve().parent
db_path = this_dir / "packagetracker.db"

# Database engine (module-level singleton shared by all functions below)
engine = sqlalchemy.create_engine(f"sqlite:///{db_path}")


def get_packages():
    """Get all packages as DataFrame, indexed by package_id."""

    return pd.read_sql_table("packages", con=engine, index_col="package_id")


def store_package(package_name):
    """Insert a new package_name into the packages table.

    Returns None on success, or an error string the caller can show to
    the user (errors are returned, not raised, because the Excel front
    end displays them in a feedback cell).
    """

    try:
        with engine.connect() as con:
            # NOTE(review): kwarg bind params on execute() are legacy
            # SQLAlchemy 1.x style — needs a parameter dict under 2.0
            con.execute(text("INSERT INTO packages (package_name) VALUES (:package_name)"),
                        package_name=package_name)
        return None
    except sqlalchemy.exc.IntegrityError:
        # UNIQUE(package_name) constraint violation
        return f"{package_name} already exists"
    except Exception as e:
        return repr(e)


def get_versions(package_name):
    """Get all versions for the package with the name package_name.

    Returns a DataFrame indexed by the parsed uploaded_at timestamps.
    """

    sql = """
    SELECT v.uploaded_at, v.version_string
    FROM packages p
    INNER JOIN package_versions v ON p.package_id = v.package_id
    WHERE p.package_name = :package_name
    """
    return pd.read_sql_query(text(sql), engine, parse_dates=["uploaded_at"],
                             params={"package_name": package_name},
                             index_col=["uploaded_at"])


def store_versions(df):
    """Insert the records of the provided DataFrame df into the package_versions table"""

    df.to_sql("package_versions", con=engine, if_exists="append", index=False)


def delete_versions():
    """Delete all records from the version table"""

    with engine.connect() as con:
        con.execute("DELETE FROM package_versions")


def create_db():
    """Run this function to create the database tables.
    In case of sqlite, this is also creating the database file.
    """

    sql_table_packages = """
    CREATE TABLE packages (
        package_id INTEGER PRIMARY KEY,
        package_name TEXT NOT NULL,
        UNIQUE(package_name)
    )
    """

    sql_table_versions = """
    CREATE TABLE package_versions (
        package_id INTEGER,
        version_string TEXT,
        uploaded_at TIMESTAMP NOT NULL,
        PRIMARY KEY (package_id, version_string),
        FOREIGN KEY (package_id) REFERENCES packages (package_id)
    )
    """

    sql_statements = [sql_table_packages, sql_table_versions]
    with engine.connect() as con:
        for sql in sql_statements:
            con.execute(sql)


if __name__ == "__main__":
    # Run this as a script to create the packagetracker.db database
    create_db()
import datetime as dt

from dateutil import tz
import requests
import pandas as pd
import matplotlib.pyplot as plt
import xlwings as xw

import database


# This is the part of the URL that is the same for every request
BASE_URL = "https://pypi.org/pypi"


def add_package():
    """ Adds a new package including the version history to the database.
    Triggers an update of the dropdown on the Tracker tab.

    Called from Excel via xlwings; reads the package name from the
    "new_package" named range and writes feedback to the cell next to it.
    """
    # Excel objects
    db_sheet = xw.Book.caller().sheets["Database"]
    package_name = db_sheet["new_package"].value
    feedback_cell = db_sheet["new_package"].offset(column_offset=1)

    # Clear feedback cell
    feedback_cell.clear_contents()

    # Check if the package exists on PyPI
    if not package_name:
        feedback_cell.value = "Error: Please provide a name!"
        return
    # A quick HEAD-style existence check against the PyPI JSON API;
    # anything other than 200 is treated as "package not found"
    if requests.get(f"{BASE_URL}/{package_name}/json",
                    timeout=6).status_code != 200:
        feedback_cell.value = "Error: Package not found!"
        return

    # Insert the package name into the packages table
    error = database.store_package(package_name)
    db_sheet["new_package"].clear_contents()

    # Show any errors, otherwise kick off a database update and
    # refresh the dropdown so you can select the new package
    if error:
        feedback_cell.value = f"Error: {error}"
    else:
        feedback_cell.value = f"Added {package_name} successfully."
        update_database()
        refresh_dropdown()


def update_database():
    """ Deletes all records from the versions table, fetches all
    data again from PyPI and stores the versions again in the table.

    Writes a per-package log and a "last updated" timestamp to the
    Database sheet.
    """
    # Excel objects
    sheet_db = xw.Book.caller().sheets["Database"]

    # Clear logs
    sheet_db["log"].expand().clear_contents()

    # Keeping things super simple: Delete all versions for all packages
    # and repopulate the package_versions table from scratch
    database.delete_versions()
    df_packages = database.get_packages()
    logs = []

    # Query the PyPI REST API
    for package_id, row in df_packages.iterrows():
        ret = requests.get(f"{BASE_URL}/{row['package_name']}/json",
                           timeout=6)
        if ret.status_code == 200:
            ret = ret.json()  # parse the JSON string into a dictionary
            logs.append(f"INFO: {row['package_name']} downloaded successfully")
        else:
            logs.append(f"ERROR: Could not download data for {row['package_name']}")
            continue

        # Instantiate a DataFrame by extracting data from the REST API response
        releases = []
        for version, files in ret["releases"].items():
            if ret["releases"][version]:  # ignore releases without info
                releases.append((files[0]["upload_time"], version, package_id))
        df_releases = pd.DataFrame(columns=["uploaded_at", "version_string", "package_id"],
                                   data=releases)
        df_releases["uploaded_at"] = pd.to_datetime(df_releases["uploaded_at"])
        df_releases = df_releases.sort_values("uploaded_at")
        database.store_versions(df_releases)
        logs.append(f"INFO: {row['package_name']} stored to database successfully")

    # Write out the last updated timestamp and logs
    sheet_db["updated_at"].value = (f"Last updated: "
                                    f"{dt.datetime.now(tz.UTC).isoformat()}")
    # transpose=True writes the log list down a column instead of across a row
    sheet_db["log"].options(transpose=True).value = logs


def show_history():
    """ Shows the latest release and plots the release history
    (number of releases per year)
    """
    # Excel objects
    book = xw.Book.caller()
    tracker_sheet = book.sheets["Tracker"]
    package_name = tracker_sheet["package_selection"].value
    feedback_cell = tracker_sheet["package_selection"].offset(column_offset=1)
    picture_cell = tracker_sheet["latest_release"].offset(row_offset=2)

    # Use the "seaborn" style for the Matplotlib plots produced by pandas
    # NOTE(review): the built-in "seaborn" style name was removed in newer
    # Matplotlib releases — confirm against the pinned matplotlib version
    plt.style.use("seaborn")

    # Check input
    if not package_name:
        feedback_cell.value = ("Error: Please select a package first! "
                               "You may first have to add one to the database.")
        return

    # Clear output cells and picture
    feedback_cell.clear_contents()
    tracker_sheet["latest_release"].clear_contents()
    if "releases_per_year" in tracker_sheet.pictures:
        tracker_sheet.pictures["releases_per_year"].delete()

    # Get all versions of the package from the database
    try:
        df_releases = database.get_versions(package_name)
    except Exception as e:
        feedback_cell.value = repr(e)
        return
    if df_releases.empty:
        feedback_cell.value = f"Error: Didn't find any releases for {package_name}"
        return

    # Calculate the number of releases per year and plot it
    df_releases_yearly = df_releases.resample("Y").count()
    df_releases_yearly.index = df_releases_yearly.index.year
    df_releases_yearly.index.name = "Years"
    df_releases_yearly = df_releases_yearly.rename(
        columns={"version_string": "Number of Releases"})
    ax = df_releases_yearly.plot.bar(
        title=f"Number of Releases per Year "
              f"({tracker_sheet['package_selection'].value})")

    # Write the results and plot to Excel
    # The DataFrame index holds the upload timestamps, so its max is the
    # most recent release
    version = df_releases.loc[df_releases.index.max(), "version_string"]
    tracker_sheet["latest_release"].value = (
        f"{version} ({df_releases.index.max():%B %d, %Y})")
    tracker_sheet.pictures.add(ax.get_figure(), name="releases_per_year",
                               top=picture_cell.top,
                               left=picture_cell.left)


def refresh_dropdown():
    """ Refreshes the dropdown on the Tracker tab with the content of
    the packages table.
    """
    # Excel objects
    book = xw.Book.caller()
    dropdown_sheet = book.sheets["Dropdown"]
    tracker_sheet = book.sheets["Tracker"]

    # Clear the current value in the dropdown
    tracker_sheet["package_selection"].clear_contents()

    # If the Excel table has non-empty rows, delete them before repopulating
    # it again with the values from the packages database table
    if dropdown_sheet["dropdown_content"].value:
        dropdown_sheet["dropdown_content"].delete()
    dropdown_sheet["dropdown_content"].options(
        header=False, index=False).value = database.get_packages()


if __name__ == "__main__":
    # Running this module directly simulates Excel calling add_package()
    # via the mock caller — useful for debugging without the workbook macro
    xw.Book("packagetracker.xlsm").set_mock_caller()
    add_package()
import multiprocessing
from itertools import repeat

import pandas as pd
import openpyxl


def _read_sheet(filename, sheet_name):
    """Read a single sheet of an Excel file into a DataFrame.

    The leading underscore in the function name is used by convention
    to mark it as "private", i.e., it shouldn't be used directly outside
    of this module.

    Returns the (sheet_name, DataFrame) pair so the parallel caller can
    rebuild a {sheet_name: DataFrame} mapping from the worker results.
    """
    df = pd.read_excel(filename, sheet_name=sheet_name, engine='openpyxl')
    return sheet_name, df
import multiprocessing
from itertools import repeat

import xlrd
import excel


def _read_sheet(filename, sheetname):
    """Open the workbook and read a single sheet; returns (name, data).

    The leading underscore in the function name is used by convention
    to mark it as "private", i.e., it shouldn't be used directly outside
    of this module. Each worker process opens its own workbook handle
    (on_demand=True loads only the requested sheet).
    """
    with xlrd.open_workbook(filename, on_demand=True) as book:
        sheet = book.sheet_by_name(sheetname)
        data = excel.read(sheet)
    return sheet.name, data


def open_workbook(filename, sheetnames=None):
    """Read the given sheets (default: all) of an xls file in parallel.

    Returns a dict mapping each sheet name to a 2-dimensional list of
    cell values.
    """
    if sheetnames is None:
        with xlrd.open_workbook(filename, on_demand=True) as book:
            sheetnames = book.sheet_names()
    with multiprocessing.Pool() as pool:
        # By default, Pool spawns as many processes as there are CPU cores.
        # starmap maps a tuple of arguments to a function. The zip expression
        # produces a list with tuples of the following form:
        # [('filename.xlsx', 'Sheet1'), ('filename.xlsx', 'Sheet2')]
        data = pool.starmap(_read_sheet, zip(repeat(filename), sheetnames))
    return {i[0]: i[1] for i in data}
2 | """ 3 | 4 | import datetime as dt 5 | 6 | 7 | TEMPERATURE_SCALES = ("fahrenheit", "kelvin", 8 | "celsius") 9 | 10 | 11 | class TemperatureConverter: 12 | pass # Doesn't do anything at the moment 13 | 14 | 15 | def convert_to_celsius(degrees, source="fahrenheit"): 16 | """This function converts degrees Fahrenheit or Kelvin 17 | into degrees Celsius. 18 | """ 19 | if source.lower() == "fahrenheit": 20 | return (degrees-32) * (5/9) 21 | elif source.lower() == "kelvin": 22 | return degrees - 273.15 23 | else: 24 | return f"Don't know how to convert from {source}" 25 | 26 | 27 | celsius = convert_to_celsius(44, source="fahrenheit") 28 | non_celsius_scales = TEMPERATURE_SCALES[:-1] 29 | 30 | print("Current time: " + dt.datetime.now().isoformat()) 31 | print(f"The temperature in Celsius is: {celsius}") 32 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # If you don't use Anaconda, you can use this file to install all dependencies. 
2 | # Run it like so from a Command Prompt or Terminal: 3 | # 4 | # pip install -r requirements.txt 5 | 6 | flake8==3.8.4 7 | lxml==4.6.2 8 | matplotlib==3.3.2 9 | notebook==6.1.5 10 | openpyxl==3.0.5 11 | pandas==1.1.3 12 | numpy==1.19.2 13 | pillow==8.0.1 14 | plotly==4.12.0 15 | python-dateutil==2.8.1 16 | requests==2.25.0 17 | sqlalchemy==1.3.20 18 | xlrd==1.2.0 19 | xlsxwriter==1.3.7 20 | xlutils==2.0.0 21 | xlwings==0.20.8 22 | xlwt==1.3.0 23 | pytrends==4.7.3 24 | pyxlsb==1.0.6 25 | -------------------------------------------------------------------------------- /sales_data/existing/April.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/existing/April.xls -------------------------------------------------------------------------------- /sales_data/existing/August.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/existing/August.xls -------------------------------------------------------------------------------- /sales_data/existing/December.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/existing/December.xls -------------------------------------------------------------------------------- /sales_data/existing/February.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/existing/February.xls -------------------------------------------------------------------------------- /sales_data/existing/January.xls: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/existing/January.xls -------------------------------------------------------------------------------- /sales_data/existing/July.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/existing/July.xls -------------------------------------------------------------------------------- /sales_data/existing/June.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/existing/June.xls -------------------------------------------------------------------------------- /sales_data/existing/March.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/existing/March.xls -------------------------------------------------------------------------------- /sales_data/existing/May.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/existing/May.xls -------------------------------------------------------------------------------- /sales_data/existing/November.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/existing/November.xls -------------------------------------------------------------------------------- /sales_data/existing/October.xls: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/existing/October.xls -------------------------------------------------------------------------------- /sales_data/existing/September.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/existing/September.xls -------------------------------------------------------------------------------- /sales_data/new/April.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/new/April.xlsx -------------------------------------------------------------------------------- /sales_data/new/August.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/new/August.xlsx -------------------------------------------------------------------------------- /sales_data/new/December.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/new/December.xlsx -------------------------------------------------------------------------------- /sales_data/new/February.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/new/February.xlsx -------------------------------------------------------------------------------- /sales_data/new/January.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/new/January.xlsx -------------------------------------------------------------------------------- /sales_data/new/July.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/new/July.xlsx -------------------------------------------------------------------------------- /sales_data/new/June.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/new/June.xlsx -------------------------------------------------------------------------------- /sales_data/new/March.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/new/March.xlsx -------------------------------------------------------------------------------- /sales_data/new/May.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/new/May.xlsx -------------------------------------------------------------------------------- /sales_data/new/November.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/new/November.xlsx -------------------------------------------------------------------------------- /sales_data/new/October.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/new/October.xlsx -------------------------------------------------------------------------------- /sales_data/new/September.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/sales_data/new/September.xlsx -------------------------------------------------------------------------------- /sales_report_openpyxl.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pandas as pd 4 | from openpyxl.styles import Font, Alignment 5 | from openpyxl.formatting.rule import CellIsRule 6 | from openpyxl.chart import BarChart, Reference 7 | from openpyxl.chart.shapes import GraphicalProperties 8 | from openpyxl.drawing.line import LineProperties 9 | 10 | 11 | # Directory of this file 12 | this_dir = Path(__file__).resolve().parent 13 | 14 | # Read in all files 15 | parts = [] 16 | for path in (this_dir / "sales_data").rglob("*.xls*"): 17 | print(f'Reading {path.name}') 18 | part = pd.read_excel(path) 19 | parts.append(part) 20 | 21 | # Combine the DataFrames from each file into a single DataFrame 22 | df = pd.concat(parts) 23 | 24 | # Pivot each store into a column and sum up all transactions per date 25 | pivot = pd.pivot_table(df, 26 | index="transaction_date", columns="store", 27 | values="amount", aggfunc="sum") 28 | 29 | # Resample to end of month and assign an index name 30 | summary = pivot.resample("M").sum() 31 | summary.index.name = "Month" 32 | 33 | # Sort columns by total revenue 34 | summary = summary.loc[:, summary.sum().sort_values().index] 35 | 36 | # Add row and column totals: Using "append" together with "rename" 37 | # is a convenient way to add a row to the bottom of a DataFrame 38 | summary.loc[:, "Total"] = summary.sum(axis=1) 39 | summary 
= summary.append(summary.sum(axis=0).rename("Total")) 40 | 41 | #### Write summary report to Excel file #### 42 | 43 | # DataFrame position and number of rows/columns 44 | # openpxyl uses 1-based indices 45 | startrow, startcol = 3, 2 46 | nrows, ncols = summary.shape 47 | 48 | # Starting with pandas 1.3.0, the following line will raise a FutureWarning. 49 | # To fix this, replace write_only=True with engine_kwargs={"write_only": True} 50 | with pd.ExcelWriter(this_dir / "sales_report_openpyxl.xlsx", 51 | engine="openpyxl", write_only=True) as writer: 52 | # pandas uses 0-based indices 53 | summary.to_excel(writer, sheet_name="Sheet1", 54 | startrow=startrow - 1, startcol=startcol - 1) 55 | 56 | # Get openpyxl book and sheet object 57 | book = writer.book 58 | sheet = writer.sheets["Sheet1"] 59 | 60 | # Set title 61 | sheet.cell(row=1, column=startcol, value="Sales Report") 62 | sheet.cell(row=1, column=startcol).font = Font(size=24, bold=True) 63 | 64 | # Sheet formatting 65 | sheet.sheet_view.showGridLines = False 66 | 67 | # Format the DataFrame with 68 | # - number format 69 | # - column width 70 | # - conditional formatting 71 | for row in range(startrow + 1, startrow + nrows + 1): 72 | for col in range(startcol + 1, startcol + ncols + 1): 73 | cell = sheet.cell(row=row, column=col) 74 | cell.number_format = "#,##0" 75 | cell.alignment = Alignment(horizontal="center") 76 | 77 | for cell in sheet["B"]: 78 | cell.number_format = "mmm yy" 79 | 80 | for col in range(startcol, startcol + ncols + 1): 81 | cell = sheet.cell(row=startrow, column=col) 82 | sheet.column_dimensions[cell.column_letter].width = 14 83 | 84 | first_cell = sheet.cell(row=startrow + 1, column=startcol + 1) 85 | last_cell = sheet.cell(row=startrow + nrows, column=startcol + ncols) 86 | range_address = f"{first_cell.coordinate}:{last_cell.coordinate}" 87 | sheet.conditional_formatting.add(range_address, 88 | CellIsRule(operator="lessThan", 89 | formula=["20000"], 90 | stopIfTrue=True, 91 | 
font=Font(color="E93423"))) 92 | 93 | # Chart 94 | chart = BarChart() 95 | chart.type = "col" 96 | chart.title = "Sales per Month and Store" 97 | chart.height = 11.5 98 | chart.width = 20.5 99 | 100 | # Add each column as a series, ignoring total row and col 101 | data = Reference(sheet, min_col=startcol + 1, min_row=startrow, 102 | max_row=startrow + nrows - 1, 103 | max_col=startcol + ncols - 1) 104 | categories = Reference(sheet, min_col=startcol, min_row=startrow + 1, 105 | max_row=startrow + nrows - 1) 106 | chart.add_data(data, titles_from_data=True) 107 | chart.set_categories(categories) 108 | cell = sheet.cell(row=startrow + nrows + 2, column=startcol) 109 | sheet.add_chart(chart=chart, anchor=cell.coordinate) 110 | 111 | # Chart formatting 112 | chart.y_axis.title = "Sales" 113 | chart.x_axis.title = summary.index.name 114 | # Hide y-axis line: spPR stands for ShapeProperties 115 | chart.y_axis.spPr = GraphicalProperties(ln=LineProperties(noFill=True)) 116 | -------------------------------------------------------------------------------- /sales_report_pandas.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pandas as pd 4 | 5 | 6 | # Directory of this file 7 | this_dir = Path(__file__).resolve().parent 8 | 9 | # Read in all Excel files from all subfolders of sales_data 10 | parts = [] 11 | for path in (this_dir / "sales_data").rglob("*.xls*"): 12 | print(f'Reading {path.name}') 13 | part = pd.read_excel(path, index_col="transaction_id") 14 | parts.append(part) 15 | 16 | # Combine the DataFrames from each file into a single DataFrame 17 | # pandas takes care of properly aligning the columns 18 | df = pd.concat(parts) 19 | 20 | # Pivot each store into a column and sum up all transactions per date 21 | pivot = pd.pivot_table(df, 22 | index="transaction_date", columns="store", 23 | values="amount", aggfunc="sum") 24 | 25 | # Resample to end of month and assign an index name 26 | 
summary = pivot.resample("M").sum() 27 | summary.index.name = "Month" 28 | 29 | # Write summary report to Excel file 30 | summary.to_excel(this_dir / "sales_report_pandas.xlsx") 31 | -------------------------------------------------------------------------------- /sales_report_xlsxwriter.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pandas as pd 4 | 5 | 6 | # Directory of this file 7 | this_dir = Path(__file__).resolve().parent 8 | 9 | # Read in all files 10 | parts = [] 11 | for path in (this_dir / "sales_data").rglob("*.xls*"): 12 | print(f'Reading {path.name}') 13 | part = pd.read_excel(path) 14 | parts.append(part) 15 | 16 | # Combine the DataFrames from each file into a single DataFrame 17 | df = pd.concat(parts) 18 | 19 | # Pivot each store into a column and sum up all transactions per date 20 | pivot = pd.pivot_table(df, 21 | index="transaction_date", columns="store", 22 | values="amount", aggfunc="sum") 23 | 24 | # Resample to end of month and assign an index name 25 | summary = pivot.resample("M").sum() 26 | summary.index.name = "Month" 27 | 28 | # Sort columns by total revenue 29 | summary = summary.loc[:, summary.sum().sort_values().index] 30 | 31 | # Add row and column totals: Using "append" together with "rename" 32 | # is a convenient way to add a row to the bottom of a DataFrame 33 | summary.loc[:, "Total"] = summary.sum(axis=1) 34 | summary = summary.append(summary.sum(axis=0).rename("Total")) 35 | 36 | #### Write summary report to Excel file #### 37 | 38 | # DataFrame position and number of rows/columns 39 | # xlsxwriter uses 0-based indices 40 | startrow, startcol = 2, 1 41 | nrows, ncols = summary.shape 42 | 43 | with pd.ExcelWriter(this_dir / "sales_report_xlsxwriter.xlsx", 44 | engine="xlsxwriter", datetime_format="mmm yy") as writer: 45 | summary.to_excel(writer, sheet_name="Sheet1", 46 | startrow=startrow, startcol=startcol) 47 | 48 | # Get xlsxwriter book and 
sheet object 49 | book = writer.book 50 | sheet = writer.sheets["Sheet1"] 51 | 52 | # Set title 53 | title_format = book.add_format({"bold": True, "size": 24}) 54 | sheet.write(0, startcol, "Sales Report", title_format) 55 | 56 | # Sheet formatting 57 | # 2 = hide on screen and when printing 58 | sheet.hide_gridlines(2) 59 | 60 | # Format the DataFrame with 61 | # - number format 62 | # - column width 63 | # - conditional formatting 64 | number_format = book.add_format({"num_format": "#,##0", 65 | "align": "center"}) 66 | below_target_format = book.add_format({"font_color": "#E93423"}) 67 | sheet.set_column(first_col=startcol, last_col=startcol + ncols, 68 | width=14, cell_format=number_format) 69 | sheet.conditional_format(first_row=startrow + 1, 70 | first_col=startcol + 1, 71 | last_row=startrow + nrows, 72 | last_col=startcol + ncols, 73 | options={"type": "cell", "criteria": "<=", 74 | "value": 20000, 75 | "format": below_target_format}) 76 | 77 | # Chart 78 | chart = book.add_chart({"type": "column"}) 79 | chart.set_title({"name": "Sales per Month and Store"}) 80 | chart.set_size({"width": 830, "height": 450}) 81 | 82 | # Add each column as a series, ignoring total row and col 83 | for col in range(1, ncols): 84 | chart.add_series({ 85 | # [sheetname, first_row, first_col, last_row, last_col] 86 | "name": ["Sheet1", startrow, startcol + col], 87 | "categories": ["Sheet1", startrow + 1, startcol, 88 | startrow + nrows - 1, startcol], 89 | "values": ["Sheet1", startrow + 1, startcol + col, 90 | startrow + nrows - 1, startcol + col], 91 | }) 92 | 93 | # Chart formatting 94 | chart.set_x_axis({"name": summary.index.name, 95 | "major_tick_mark": "none"}) 96 | chart.set_y_axis({"name": "Sales", 97 | "line": {"none": True}, 98 | "major_gridlines": {"visible": True}, 99 | "major_tick_mark": "none"}) 100 | 101 | # Add the chart to the sheet 102 | sheet.insert_chart(startrow + nrows + 2, startcol, chart) 103 | 
-------------------------------------------------------------------------------- /sales_report_xlwings.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pandas as pd 4 | import xlwings as xw 5 | 6 | 7 | # Directory of this file 8 | this_dir = Path(__file__).resolve().parent 9 | 10 | # Read in all files 11 | parts = [] 12 | for path in (this_dir / "sales_data").rglob("*.xls*"): 13 | print(f'Reading {path.name}') 14 | part = pd.read_excel(path) 15 | parts.append(part) 16 | 17 | # Combine the DataFrames from each file into a single DataFrame 18 | df = pd.concat(parts) 19 | 20 | # Pivot each store into a column and sum up all transactions per date 21 | pivot = pd.pivot_table(df, 22 | index="transaction_date", columns="store", 23 | values="amount", aggfunc="sum") 24 | 25 | # Resample to end of month and assign an index name 26 | summary = pivot.resample("M").sum() 27 | summary.index.name = "Month" 28 | 29 | # Sort columns by total revenue 30 | summary = summary.loc[:, summary.sum().sort_values().index] 31 | 32 | # Add row and column totals: Using "append" together with "rename" 33 | # is a convenient way to add a row to the bottom of a DataFrame 34 | summary.loc[:, "Total"] = summary.sum(axis=1) 35 | summary = summary.append(summary.sum(axis=0).rename("Total")) 36 | 37 | #### Write summary report to Excel file #### 38 | 39 | # Open the template, paste the data, autofit the columns 40 | # and adjust the chart source. Then save it under a different name. 
41 | template = xw.Book(this_dir / "xl" / "sales_report_template.xlsx") 42 | sheet = template.sheets["Sheet1"] 43 | sheet["B3"].value = summary 44 | sheet["B3"].expand().columns.autofit() 45 | sheet.charts["Chart 1"].set_source_data(sheet["B3"].expand()[:-1, :-1]) 46 | template.save(this_dir / "sales_report_xlwings.xlsx") 47 | -------------------------------------------------------------------------------- /temperature.py: -------------------------------------------------------------------------------- 1 | TEMPERATURE_SCALES = ("fahrenheit", "kelvin", "celsius") 2 | 3 | 4 | def convert_to_celsius(degrees, source="fahrenheit"): 5 | if source.lower() == "fahrenheit": 6 | return (degrees-32) * (5/9) 7 | elif source.lower() == "kelvin": 8 | return degrees - 273.15 9 | else: 10 | return f"Don't know how to convert from {source}" 11 | 12 | 13 | print("This is the temperature module.") 14 | -------------------------------------------------------------------------------- /udfs/describe/describe.py: -------------------------------------------------------------------------------- 1 | import xlwings as xw 2 | import pandas as pd 3 | 4 | 5 | @xw.func 6 | @xw.arg("df", pd.DataFrame, index=True, header=True) 7 | def describe(df): 8 | return df.describe() 9 | -------------------------------------------------------------------------------- /udfs/describe/describe.xlsm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/udfs/describe/describe.xlsm -------------------------------------------------------------------------------- /udfs/first_udf/first_udf.py: -------------------------------------------------------------------------------- 1 | import xlwings as xw 2 | 3 | 4 | def main(): 5 | wb = xw.Book.caller() 6 | sheet = wb.sheets[0] 7 | if sheet["A1"].value == "Hello xlwings!": 8 | sheet["A1"].value = "Bye xlwings!" 
9 | else: 10 | sheet["A1"].value = "Hello xlwings!" 11 | 12 | 13 | @xw.func 14 | def hello(name): 15 | return f"Hello {name}!" 16 | 17 | 18 | if __name__ == "__main__": 19 | xw.Book("first_udf.xlsm").set_mock_caller() 20 | main() 21 | -------------------------------------------------------------------------------- /udfs/first_udf/first_udf.xlsm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/udfs/first_udf/first_udf.xlsm -------------------------------------------------------------------------------- /udfs/google_trends/google_trends.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from pytrends.request import TrendReq 3 | import matplotlib.pyplot as plt 4 | import xlwings as xw 5 | 6 | 7 | @xw.func(call_in_wizard=False) 8 | @xw.arg("mids", doc="Machine IDs: A range of max 5 cells") 9 | @xw.arg("start_date", doc="A date-formatted cell") 10 | @xw.arg("end_date", doc="A date-formatted cell") 11 | def get_interest_over_time(mids, start_date, end_date): 12 | """Query Google Trends - replaces the Machine ID (mid) of 13 | common programming languages with their human-readable 14 | equivalent in the return value, e.g., instead of "/m/05z1_" 15 | it returns "Python". 
16 | """ 17 | # Check and transform parameters 18 | assert len(mids) <= 5, "Too many mids (max: 5)" 19 | start_date = start_date.date().isoformat() 20 | end_date = end_date.date().isoformat() 21 | 22 | # Make the Google Trends request and return the DataFrame 23 | trend = TrendReq(timeout=10) 24 | trend.build_payload(kw_list=mids, 25 | timeframe=f"{start_date} {end_date}") 26 | df = trend.interest_over_time() 27 | 28 | # Replace Google's "mid" with a human-readable word 29 | mids = {"/m/05z1_": "Python", "/m/02p97": "JavaScript", 30 | "/m/0jgqg": "C++", "/m/07sbkfb": "Java", "/m/060kv": "PHP"} 31 | df = df.rename(columns=mids) 32 | 33 | # Drop the isPartial column 34 | return df.drop(columns="isPartial") 35 | 36 | 37 | @xw.func 38 | @xw.arg("df", pd.DataFrame) 39 | def plot(df, name, caller): 40 | plt.style.use("seaborn") 41 | if not df.empty: 42 | caller.sheet.pictures.add(df.plot().get_figure(), 43 | top=caller.offset(row_offset=1).top, 44 | left=caller.left, 45 | name=name, update=True) 46 | return f"" 47 | 48 | 49 | if __name__ == "__main__": 50 | xw.serve() 51 | -------------------------------------------------------------------------------- /udfs/google_trends/google_trends.xlsm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/udfs/google_trends/google_trends.xlsm -------------------------------------------------------------------------------- /udfs/google_trends_cache/google_trends_cache.py: -------------------------------------------------------------------------------- 1 | from functools import lru_cache 2 | 3 | import pandas as pd 4 | from pytrends.request import TrendReq 5 | import matplotlib.pyplot as plt 6 | import xlwings as xw 7 | 8 | 9 | @lru_cache() 10 | @xw.func(call_in_wizard=False) 11 | @xw.arg("mids", xw.Range, doc="Machine IDs: A range of max 5 cells") 12 | @xw.arg("start_date", doc="A date-formatted cell") 13 
| @xw.arg("end_date", doc="A date-formatted cell") 14 | def get_interest_over_time(mids, start_date, end_date): 15 | """Query Google Trends - replaces the Machine ID (mid) of 16 | common programming languages with their human-readable 17 | equivalent in the return value, e.g., instead of "/m/05z1_" 18 | it returns "Python". 19 | """ 20 | mids = mids.value 21 | 22 | # Check and transform parameters 23 | assert len(mids) <= 5, "Too many mids (max: 5)" 24 | start_date = start_date.date().isoformat() 25 | end_date = end_date.date().isoformat() 26 | 27 | # Make the Google Trends request and return the DataFrame 28 | trend = TrendReq(timeout=10) 29 | trend.build_payload(kw_list=mids, 30 | timeframe=f"{start_date} {end_date}") 31 | df = trend.interest_over_time() 32 | 33 | # Replace Google's "mid" with a human-readable word 34 | mids = {"/m/05z1_": "Python", "/m/02p97": "JavaScript", 35 | "/m/0jgqg": "C++", "/m/07sbkfb": "Java", "/m/060kv": "PHP"} 36 | df = df.rename(columns=mids) 37 | 38 | # Drop the isPartial column 39 | return df.drop(columns="isPartial") 40 | 41 | 42 | @xw.func 43 | @xw.arg("df", pd.DataFrame) 44 | def plot(df, name, caller): 45 | plt.style.use("seaborn") 46 | if not df.empty: 47 | caller.sheet.pictures.add(df.plot().get_figure(), 48 | top=caller.offset(row_offset=1).top, 49 | left=caller.left, 50 | name=name, update=True) 51 | return f"" 52 | 53 | 54 | if __name__ == "__main__": 55 | xw.serve() 56 | -------------------------------------------------------------------------------- /udfs/google_trends_cache/google_trends_cache.xlsm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/udfs/google_trends_cache/google_trends_cache.xlsm -------------------------------------------------------------------------------- /udfs/importsub/importsub.py: -------------------------------------------------------------------------------- 1 | 
import xlwings as xw 2 | 3 | 4 | @xw.sub 5 | def main(): 6 | wb = xw.Book.caller() 7 | sheet = wb.sheets[0] 8 | if sheet["A1"].value == "Hello xlwings!": 9 | sheet["A1"].value = "Bye xlwings!" 10 | else: 11 | sheet["A1"].value = "Hello xlwings!" 12 | 13 | 14 | @xw.func 15 | def hello(name): 16 | return f"Hello {name}!" 17 | 18 | 19 | if __name__ == "__main__": 20 | xw.Book("importsub.xlsm").set_mock_caller() 21 | main() 22 | -------------------------------------------------------------------------------- /udfs/importsub/importsub.xlsm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/udfs/importsub/importsub.xlsm -------------------------------------------------------------------------------- /udfs/raw_values/raw_values.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import xlwings as xw 3 | 4 | 5 | @xw.func 6 | @xw.ret("raw") 7 | def randn(i=1000, j=1000): 8 | """Returns an array with dimensions (i, j) with normally distributed 9 | pseudorandom numbers provided by NumPy's random.randn 10 | """ 11 | return np.random.randn(i, j) 12 | -------------------------------------------------------------------------------- /udfs/raw_values/raw_values.xlsm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/udfs/raw_values/raw_values.xlsm -------------------------------------------------------------------------------- /udfs/revenues/revenues.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import xlwings as xw 3 | 4 | 5 | @xw.func 6 | def revenue(base_fee, users, price): 7 | return base_fee + users * price 8 | 9 | 10 | @xw.func 11 | @xw.arg("users", np.array, ndim=2) 12 | @xw.arg("price", 
np.array) 13 | def revenue2(base_fee, users, price): 14 | return base_fee + users * price 15 | -------------------------------------------------------------------------------- /udfs/revenues/revenues.xlsm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/udfs/revenues/revenues.xlsm -------------------------------------------------------------------------------- /xl/array_calculations.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/xl/array_calculations.xlsx -------------------------------------------------------------------------------- /xl/big.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/xl/big.xlsx -------------------------------------------------------------------------------- /xl/course_participants.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/xl/course_participants.xlsx -------------------------------------------------------------------------------- /xl/currency_converter.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/xl/currency_converter.xlsx -------------------------------------------------------------------------------- /xl/macro.xlsm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/xl/macro.xlsm 
-------------------------------------------------------------------------------- /xl/sales_report_template.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/xl/sales_report_template.xlsx -------------------------------------------------------------------------------- /xl/stores.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/xl/stores.xls -------------------------------------------------------------------------------- /xl/stores.xlsb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/xl/stores.xlsb -------------------------------------------------------------------------------- /xl/stores.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/xl/stores.xlsx -------------------------------------------------------------------------------- /xl/vba.xlsm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/xl/vba.xlsm -------------------------------------------------------------------------------- /xl/vbaProject.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fzumstein/python-for-excel/ffbb631e1a9e0dee9bc9b3098f1448db58736aec/xl/vbaProject.bin --------------------------------------------------------------------------------