├── .github └── workflows │ └── test.yaml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── data ├── airlines.csv ├── airports.csv ├── avg_year_by_man.csv ├── companies.csv ├── dict_example_export.json ├── flights.csv ├── json_example.json ├── pickle_example.pickle ├── pickle_example_export.pickle ├── planes.csv ├── prices.csv └── weather.csv ├── environment.yml ├── notebooks ├── 00-Introduction.ipynb ├── 01-Python-and-Jupyter.ipynb ├── 02-Fundamentals.ipynb ├── 03-Packages-Modules-Function.ipynb ├── 04-Importing-Data.ipynb ├── 05-Selecting-and-Filtering.ipynb ├── 06-Manipulating-Columns.ipynb ├── 07-Review-Day-1.ipynb ├── 08-Summarizing-Data.ipynb ├── 09-Summarizing-Grouped-Data.ipynb ├── 10-Joining-Data.ipynb ├── 11-Exporting-Data.ipynb ├── 12-Visualizing-Data.ipynb ├── 99-Conclusion.ipynb ├── Case-Study-Solutions.ipynb ├── Case-Study.ipynb ├── images │ ├── aggregate-series.png │ ├── applied-data-science.gif │ ├── binder-launching.png │ ├── brad.jpg │ ├── combine-horizontally-key.png │ ├── combine-horizontally-unordered.png │ ├── combine-horizontally.png │ ├── combine-vertically.png │ ├── concept_map.jpg │ ├── data-science-and-tech.png │ ├── data-science.png │ ├── dataframe-groups-unordered.png │ ├── dataframe-groups.png │ ├── dataframe-series.png │ ├── ethan.jpg │ ├── export-framework.png │ ├── full-outer-join.png │ ├── gus.jpg │ ├── import-framework.png │ ├── inner-join.png │ ├── insert-new-cell.png │ ├── jay.jpg │ ├── jupyter-file-structure.png │ ├── left-outer-join.png │ ├── markdown-cell-rendered.png │ ├── markdown-cell-selection.png │ ├── markdown-cell-unrendered.png │ ├── model-for-grouped-aggs.png │ ├── navigator-jupyter.png │ ├── new-jupyter-notebook.png │ ├── open-jupyter-notebook.png │ ├── python-code-cell.png │ ├── python-logo.png │ ├── python_jupyter.png │ ├── right-outer-join.png │ ├── selecting_columns.png │ ├── selecting_rows.png │ ├── selecting_rows_columns.png │ ├── series-plus-series.png │ ├── so_dev_survey.png │ ├── subsetting_result.png │ └── summarizing-by-groups.png └── rise.css └── scripts ├── generate_slides.sh └── prep_nb_for_ci.py /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: [push, workflow_dispatch] 4 | 5 | jobs: 6 | validate-notebooks: 7 | name: Validate Notebooks 8 | runs-on: ubuntu-latest 9 | defaults: 10 | run: 11 | # Required for "run" commands to execute in the conda env. 12 | shell: bash -l {0} 13 | steps: 14 | - name: Checkout Code 15 | uses: actions/checkout@v3 16 | - name: Set Up Conda Environment 17 | uses: conda-incubator/setup-miniconda@v2 18 | with: 19 | activate-environment: uc-python 20 | environment-file: environment.yml 21 | - name: Set Up Jupyter Kernel 22 | run: | 23 | python -m ipykernel install --user --name uc-python 24 | - name: Install Papermill 25 | run: | 26 | conda install papermill 27 | - name: Prep notebooks 28 | run: | 29 | # Remove nb cells that should be skipped in CI. 
30 | for nb in notebooks/*.ipynb; do 31 | python scripts/prep_nb_for_ci.py "$nb" 32 | done 33 | - name: Run notebooks 34 | run: | 35 | for nb in notebooks/*.ipynb; do 36 | echo "running $nb" 37 | output=$(papermill --cwd notebooks/ "$nb" -) 38 | done 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # VS code 107 | .vscode 108 | 109 | # Custom 110 | slides/ 111 | 112 | .DS_Store 113 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Python for Data Science @ University of Cincinnati 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | IMAGESIN=$(wildcard notebooks/images/*) 2 | IMAGESOUT=$(patsubst notebooks/%,slides/%,$(IMAGESIN)) 3 | NBFILES=$(wildcard notebooks/*-*.ipynb) 4 | HTMLFILES=$(patsubst notebooks/%.ipynb,slides/%.slides.html,$(NBFILES)) 5 | 6 | slides: html 7 | 8 | images: slides/images $(IMAGESOUT) 9 | 10 | slides/images: 11 | mkdir -p slides/images 12 | 13 | $(IMAGESOUT): slides/images/%: notebooks/images/% 14 | cp -a $< $@ 15 | 16 | html: images $(HTMLFILES) 17 | 18 | $(HTMLFILES): slides/%.slides.html: notebooks/%.ipynb 19 | bash scripts/generate_slides.sh $< 20 | 21 | clean: 22 | rm -rf slides/ 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction to Python for Data Science 2 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/uc-python/intro-python-datasci/main?urlpath=lab) 3 | 4 | ### Course Description 5 | 6 | Welcome to Introduction to Python for Data Science! This short course provides a gentle, hands-on introduction to the Python programming language for data science applications. You will learn the fundamentals of Python as a language and how to work with data using the `pandas` library. 7 | 8 | ### Objectives 9 | 10 | The following are the primary learning objectives of students: 11 | 12 | 1. Develop comprehensive skills in the importing/exporting, wrangling, aggregating and joining of data using Python. 13 | 14 | 2. Establish a mental model of the Python programming language to enable future self-learning. 15 | 16 | 3. Build awareness and basic skills in the core data science area of data visualization. 17 | 18 | ### Agenda 19 | 20 | ***This workshop offering will be 100% virtual and span 4 half-days.*** 21 | 22 | | Day | Topic | Time | 23 | | :--------:| :----------------------------------------------------------------------------- | :-----------: | 24 | | __Day 1__ | Introductions | 12:30 - 12:45 | 25 | | | Python and Jupyter Overview | 12:45 - 1:15 | 26 | | | Fundamentals | 1:15 - 2:00 | 27 | | | Break | 2:00 - 2:15 | 28 | | | Packages, Modules, Methods, Functions | 2:15 - 3:00 | 29 | | | Importing Data | 3:00 - 3:45 | 30 | | | Q\&A | 3:45 - 4:15 | 31 | | __Day 2__ | Q\&A | 12:45 - 1:00 | 32 | | | Selecting and Filtering Data | 1:00 - 1:45 | 33 | | | Working with Columns | 1:45 - 2:30 | 34 | | | Break | 2:30 - 2:45 | 35 | | | Case Study, pt. 1 | 2:45 - 3:45 | 36 | | | Q\&A | 3:45 - 4:15 | 37 | | __Day 3__ | Q\&A | 12:45 - 1:00 | 38 | | | Review | 1:00 - 1:15 | 39 | | | Summarizing Data | 1:15 - 2:00 | 40 | | | Break | 2:00 - 2:15 | 41 | | | Summarizing Grouped Data | 2:15 - 3:00 | 42 | | | Joining Data | 3:00 - 3:45 | 43 | | | Q\&A | 3:45 - 4:15 | 44 | | __Day 4__ | Q\&A | 12:45 - 1:00 | 45 | | | Exporting Data | 1:00 - 1:30 | 46 | | | Visualizing Data | 1:30 - 2:30 | 47 | | | Break | 2:30 - 2:45 | 48 | | | Case Study, pt. 
    2 | 2:45 - 3:45 | 49 | | | Q\&A | 3:45 - 4:15 | 50 | 51 | ### Course Preparation 52 | 53 | In an effort to simplify the setup for this class, we are using Binder for all materials (slides, worksheets, etc.). As a result, there is no prerequisite installation required for the in-class material. 54 | 55 | With that being said, it's smart to install the appropriate technologies and download the materials anyway. This will provide you with a backup in case there are network issues, and *it will also be required to apply what you learn outside of class*. 56 | 57 | Follow these steps to download the technologies and materials: 58 | 59 | #### 1. Python, Jupyter and package installation. 60 | 61 | The easiest way to install Python, Jupyter, and the necessary packages is through Anaconda. To download and install Anaconda: 62 | 63 | 1. Visit the [Anaconda Individual Edition page](https://www.anaconda.com/products/individual) 64 | 2. Scroll down to the "Anaconda Installers" section. 65 | 3. Choose to download the **Graphical Installer** for your operating system. (If you are comfortable with the command line, you may choose that option instead.) 66 | 4. Open the installer when the download completes, and then follow the prompts. If you are prompted about installing PyCharm, elect **not** to do so. 67 | 5. Once installed, open the Anaconda Navigator and launch a Jupyter Notebook to ensure it works. 68 | 6. Follow [the package installation instructions](https://docs.anaconda.com/free/navigator/tutorials/manage-packages/#installing-a-package) to ensure the `pandas` and `seaborn` packages are installed. 69 | 70 | #### 2. Download class materials 71 | 72 | There are two ways to download the class materials: 73 | 74 | 1. Clone it - If you're familiar with how to do so, you can clone this repository. 75 | 2. Download the files as a zip - use [this link](https://github.com/uc-python/intro-python-datasci/archive/refs/heads/main.zip) 76 | 77 | ### Questions 78 | 79 | If you have any specific questions prior to the class, you can reach out to us directly via GitHub or email: 80 | 81 | * Ethan Swan: [GitHub](https://www.github.com/eswan18) & [Email](mailto:ethanpswan@gmail.com) 82 | * Bradley Boehmke: [GitHub](https://www.github.com/bradleyboehmke) & [Email](mailto:bradleyboehmke@gmail.com) 83 | * Gus Powers: [GitHub](https://www.github.com/augustopher) & [Email](mailto:guspowers0@gmail.com) 84 | * Jay Cunningham: [GitHub](https://github.com/cunningjames) & [Email](mailto:james@notbadafterall.com) 85 | -------------------------------------------------------------------------------- /data/airlines.csv: -------------------------------------------------------------------------------- 1 | carrier,name 2 | 9E,Endeavor Air Inc. 3 | AA,American Airlines Inc. 4 | AS,Alaska Airlines Inc. 5 | B6,JetBlue Airways 6 | DL,Delta Air Lines Inc. 7 | EV,ExpressJet Airlines Inc. 8 | F9,Frontier Airlines Inc. 9 | FL,AirTran Airways Corporation 10 | HA,Hawaiian Airlines Inc. 11 | MQ,Envoy Air 12 | OO,SkyWest Airlines Inc. 13 | UA,United Air Lines Inc. 14 | US,US Airways Inc. 15 | VX,Virgin America 16 | WN,Southwest Airlines Co. 17 | YV,Mesa Airlines Inc. 
    
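    For the README's "Download class materials" step above, a minimal command-line sketch. It assumes `git` and `conda` are already installed and that JupyterLab is available (it ships with the full Anaconda distribution); the repository URL comes from the zip link above, and the `uc-python` environment name comes from this repo's `environment.yml`:
    
    ```bash
    # Clone the course repository (same content as the zip link above).
    git clone https://github.com/uc-python/intro-python-datasci.git
    cd intro-python-datasci
    
    # Recreate the course's conda environment from environment.yml
    # and activate it (the environment is named uc-python).
    conda env create -f environment.yml
    conda activate uc-python
    
    # Launch JupyterLab from the repository root.
    jupyter lab
    ```
    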
18 | -------------------------------------------------------------------------------- /data/avg_year_by_man.csv: -------------------------------------------------------------------------------- 1 | manufacturer,year 2 | AGUSTA SPA,2001.0 3 | AIRBUS,2007.2012195121952 4 | AIRBUS INDUSTRIE,1998.2333333333333 5 | AMERICAN AIRCRAFT INC, 6 | AVIAT AIRCRAFT INC,2007.0 7 | AVIONS MARCEL DASSAULT,1986.0 8 | BARKER JACK L, 9 | BEECH,1969.5 10 | BELL,1984.5 11 | BOEING,2000.1441048034935 12 | BOMBARDIER INC,2004.486187845304 13 | CANADAIR,1997.3333333333333 14 | CANADAIR LTD,1974.0 15 | CESSNA,1972.4444444444443 16 | CIRRUS DESIGN CORP,2007.0 17 | DEHAVILLAND,1959.0 18 | DOUGLAS,1956.0 19 | EMBRAER,2003.5972696245733 20 | FRIEDEMANN JON,2007.0 21 | GULFSTREAM AEROSPACE,1984.0 22 | HURLEY JAMES LARRY, 23 | JOHN G HESS, 24 | KILDALL GARY,1985.0 25 | LAMBERT RICHARD, 26 | LEARJET INC, 27 | LEBLANC GLENN T,1985.0 28 | MARZ BARRY,1993.0 29 | MCDONNELL DOUGLAS,1989.948275862069 30 | MCDONNELL DOUGLAS AIRCRAFT CO,1989.7378640776699 31 | MCDONNELL DOUGLAS CORPORATION,1991.9285714285713 32 | PAIR MIKE E, 33 | PIPER,1976.4 34 | ROBINSON HELICOPTER CO,2012.0 35 | SIKORSKY,1985.0 36 | STEWART MACO,1985.0 37 | -------------------------------------------------------------------------------- /data/companies.csv: -------------------------------------------------------------------------------- 1 | Symbol,Name,Sector 2 | MMM,3M Company,Industrials 3 | AOS,A.O. Smith Corp,Industrials 4 | ABT,Abbott Laboratories,Health Care 5 | ABBV,AbbVie Inc.,Health Care 6 | ACN,Accenture plc,Information Technology 7 | ATVI,Activision Blizzard,Information Technology 8 | AYI,Acuity Brands Inc,Industrials 9 | ADBE,Adobe Systems Inc,Information Technology 10 | AAP,Advance Auto Parts,Consumer Discretionary 11 | AMD,Advanced Micro Devices Inc,Information Technology 12 | AES,AES Corp,Utilities 13 | AET,Aetna Inc,Health Care 14 | AMG,Affiliated Managers Group Inc,Financials 15 | AFL,AFLAC Inc,Financials 16 | A,Agilent Technologies Inc,Health Care 17 | APD,Air Products & Chemicals Inc,Materials 18 | AKAM,Akamai Technologies Inc,Information Technology 19 | ALK,Alaska Air Group Inc,Industrials 20 | ALB,Albemarle Corp,Materials 21 | ARE,Alexandria Real Estate Equities Inc,Real Estate 22 | ALXN,Alexion Pharmaceuticals,Health Care 23 | ALGN,Align Technology,Health Care 24 | ALLE,Allegion,Industrials 25 | AGN,"Allergan, Plc",Health Care 26 | ADS,Alliance Data Systems,Information Technology 27 | LNT,Alliant Energy Corp,Utilities 28 | ALL,Allstate Corp,Financials 29 | GOOGL,Alphabet Inc Class A,Information Technology 30 | GOOG,Alphabet Inc Class C,Information Technology 31 | MO,Altria Group Inc,Consumer Staples 32 | AMZN,Amazon.com Inc.,Consumer Discretionary 33 | AEE,Ameren Corp,Utilities 34 | AAL,American Airlines Group,Industrials 35 | AEP,American Electric Power,Utilities 36 | AXP,American Express Co,Financials 37 | AIG,"American International Group, Inc.",Financials 38 | AMT,American Tower Corp A,Real Estate 39 | AWK,American Water Works Company Inc,Utilities 40 | AMP,Ameriprise Financial,Financials 41 | ABC,AmerisourceBergen Corp,Health Care 42 | AME,AMETEK Inc.,Industrials 43 | AMGN,Amgen Inc.,Health Care 44 | APH,Amphenol Corp,Information Technology 45 | APC,Anadarko Petroleum Corp,Energy 46 | ADI,"Analog Devices, Inc.",Information Technology 47 | ANDV,Andeavor,Energy 48 | ANSS,ANSYS,Information Technology 49 | ANTM,Anthem Inc.,Health Care 50 | AON,Aon plc,Financials 51 | APA,Apache Corporation,Energy 52 | AIV,Apartment Investment & 
Management,Real Estate 53 | AAPL,Apple Inc.,Information Technology 54 | AMAT,Applied Materials Inc.,Information Technology 55 | APTV,Aptiv Plc,Consumer Discretionary 56 | ADM,Archer-Daniels-Midland Co,Consumer Staples 57 | ARNC,Arconic Inc.,Industrials 58 | AJG,Arthur J. Gallagher & Co.,Financials 59 | AIZ,Assurant Inc.,Financials 60 | T,AT&T Inc.,Telecommunication Services 61 | ADSK,Autodesk Inc.,Information Technology 62 | ADP,Automatic Data Processing,Information Technology 63 | AZO,AutoZone Inc,Consumer Discretionary 64 | AVB,"AvalonBay Communities, Inc.",Real Estate 65 | AVY,Avery Dennison Corp,Materials 66 | BHGE,"Baker Hughes, a GE Company",Energy 67 | BLL,Ball Corp,Materials 68 | BAC,Bank of America Corp,Financials 69 | BAX,Baxter International Inc.,Health Care 70 | BBT,BB&T Corporation,Financials 71 | BDX,Becton Dickinson,Health Care 72 | BRK.B,Berkshire Hathaway,Financials 73 | BBY,Best Buy Co. Inc.,Consumer Discretionary 74 | BIIB,Biogen Inc.,Health Care 75 | BLK,BlackRock,Financials 76 | HRB,Block H&R,Financials 77 | BA,Boeing Company,Industrials 78 | BKNG,Booking Holdings Inc,Consumer Discretionary 79 | BWA,BorgWarner,Consumer Discretionary 80 | BXP,Boston Properties,Real Estate 81 | BSX,Boston Scientific,Health Care 82 | BHF,Brighthouse Financial Inc,Financials 83 | BMY,Bristol-Myers Squibb,Health Care 84 | AVGO,Broadcom,Information Technology 85 | BF.B,Brown-Forman Corp.,Consumer Staples 86 | CHRW,C. H. Robinson Worldwide,Industrials 87 | CA,"CA, Inc.",Information Technology 88 | COG,Cabot Oil & Gas,Energy 89 | CDNS,Cadence Design Systems,Information Technology 90 | CPB,Campbell Soup,Consumer Staples 91 | COF,Capital One Financial,Financials 92 | CAH,Cardinal Health Inc.,Health Care 93 | KMX,Carmax Inc,Consumer Discretionary 94 | CCL,Carnival Corp.,Consumer Discretionary 95 | CAT,Caterpillar Inc.,Industrials 96 | CBOE,Cboe Global Markets,Financials 97 | CBRE,CBRE Group,Real Estate 98 | CBS,CBS Corp.,Consumer Discretionary 99 | CELG,Celgene Corp.,Health Care 100 | CNC,Centene Corporation,Health Care 101 | CNP,CenterPoint Energy,Utilities 102 | CTL,CenturyLink Inc,Telecommunication Services 103 | CERN,Cerner,Health Care 104 | CF,CF Industries Holdings Inc,Materials 105 | SCHW,Charles Schwab Corporation,Financials 106 | CHTR,Charter Communications,Consumer Discretionary 107 | CVX,Chevron Corp.,Energy 108 | CMG,Chipotle Mexican Grill,Consumer Discretionary 109 | CB,Chubb Limited,Financials 110 | CHD,Church & Dwight,Consumer Staples 111 | CI,CIGNA Corp.,Health Care 112 | XEC,Cimarex Energy,Energy 113 | CINF,Cincinnati Financial,Financials 114 | CTAS,Cintas Corporation,Industrials 115 | CSCO,Cisco Systems,Information Technology 116 | C,Citigroup Inc.,Financials 117 | CFG,Citizens Financial Group,Financials 118 | CTXS,Citrix Systems,Information Technology 119 | CME,CME Group Inc.,Financials 120 | CMS,CMS Energy,Utilities 121 | KO,Coca-Cola Company (The),Consumer Staples 122 | CTSH,Cognizant Technology Solutions,Information Technology 123 | CL,Colgate-Palmolive,Consumer Staples 124 | CMCSA,Comcast Corp.,Consumer Discretionary 125 | CMA,Comerica Inc.,Financials 126 | CAG,Conagra Brands,Consumer Staples 127 | CXO,Concho Resources,Energy 128 | COP,ConocoPhillips,Energy 129 | ED,Consolidated Edison,Utilities 130 | STZ,Constellation Brands,Consumer Staples 131 | GLW,Corning Inc.,Information Technology 132 | COST,Costco Wholesale Corp.,Consumer Staples 133 | COTY,"Coty, Inc",Consumer Staples 134 | CCI,Crown Castle International Corp.,Real Estate 135 | CSRA,CSRA Inc.,Information Technology 
136 | CSX,CSX Corp.,Industrials 137 | CMI,Cummins Inc.,Industrials 138 | CVS,CVS Health,Consumer Staples 139 | DHI,D. R. Horton,Consumer Discretionary 140 | DHR,Danaher Corp.,Health Care 141 | DRI,Darden Restaurants,Consumer Discretionary 142 | DVA,DaVita Inc.,Health Care 143 | DE,Deere & Co.,Industrials 144 | DAL,Delta Air Lines Inc.,Industrials 145 | XRAY,Dentsply Sirona,Health Care 146 | DVN,Devon Energy Corp.,Energy 147 | DLR,Digital Realty Trust Inc,Real Estate 148 | DFS,Discover Financial Services,Financials 149 | DISCA,Discovery Inc. Class A,Consumer Discretionary 150 | DISCK,Discovery Inc. Class C,Consumer Discretionary 151 | DISH,Dish Network,Consumer Discretionary 152 | DG,Dollar General,Consumer Discretionary 153 | DLTR,Dollar Tree,Consumer Discretionary 154 | D,Dominion Energy,Utilities 155 | DOV,Dover Corp.,Industrials 156 | DWDP,DowDuPont,Materials 157 | DPS,Dr Pepper Snapple Group,Consumer Staples 158 | DTE,DTE Energy Co.,Utilities 159 | DUK,Duke Energy,Utilities 160 | DRE,Duke Realty Corp,Real Estate 161 | DXC,DXC Technology,Information Technology 162 | ETFC,E*Trade,Financials 163 | EMN,Eastman Chemical,Materials 164 | ETN,Eaton Corporation,Industrials 165 | EBAY,eBay Inc.,Information Technology 166 | ECL,Ecolab Inc.,Materials 167 | EIX,Edison Int'l,Utilities 168 | EW,Edwards Lifesciences,Health Care 169 | EA,Electronic Arts,Information Technology 170 | EMR,Emerson Electric Company,Industrials 171 | ETR,Entergy Corp.,Utilities 172 | EVHC,Envision Healthcare,Health Care 173 | EOG,EOG Resources,Energy 174 | EQT,EQT Corporation,Energy 175 | EFX,Equifax Inc.,Industrials 176 | EQIX,Equinix,Real Estate 177 | EQR,Equity Residential,Real Estate 178 | ESS,"Essex Property Trust, Inc.",Real Estate 179 | EL,Estee Lauder Cos.,Consumer Staples 180 | RE,Everest Re Group Ltd.,Financials 181 | ES,Eversource Energy,Utilities 182 | EXC,Exelon Corp.,Utilities 183 | EXPE,Expedia Inc.,Consumer Discretionary 184 | EXPD,Expeditors International,Industrials 185 | ESRX,Express Scripts,Health Care 186 | EXR,Extra Space Storage,Real Estate 187 | XOM,Exxon Mobil Corp.,Energy 188 | FFIV,F5 Networks,Information Technology 189 | FB,"Facebook, Inc.",Information Technology 190 | FAST,Fastenal Co,Industrials 191 | FRT,Federal Realty Investment Trust,Real Estate 192 | FDX,FedEx Corporation,Industrials 193 | FIS,Fidelity National Information Services,Information Technology 194 | FITB,Fifth Third Bancorp,Financials 195 | FE,FirstEnergy Corp,Utilities 196 | FISV,Fiserv Inc,Information Technology 197 | FLIR,FLIR Systems,Information Technology 198 | FLS,Flowserve Corporation,Industrials 199 | FLR,Fluor Corp.,Industrials 200 | FMC,FMC Corporation,Materials 201 | FL,Foot Locker Inc,Consumer Discretionary 202 | F,Ford Motor,Consumer Discretionary 203 | FTV,Fortive Corp,Industrials 204 | FBHS,Fortune Brands Home & Security,Industrials 205 | BEN,Franklin Resources,Financials 206 | FCX,Freeport-McMoRan Inc.,Materials 207 | GPS,Gap Inc.,Consumer Discretionary 208 | GRMN,Garmin Ltd.,Consumer Discretionary 209 | IT,Gartner Inc,Information Technology 210 | GD,General Dynamics,Industrials 211 | GE,General Electric,Industrials 212 | GGP,General Growth Properties Inc.,Real Estate 213 | GIS,General Mills,Consumer Staples 214 | GM,General Motors,Consumer Discretionary 215 | GPC,Genuine Parts,Consumer Discretionary 216 | GILD,Gilead Sciences,Health Care 217 | GPN,Global Payments Inc.,Information Technology 218 | GS,Goldman Sachs Group,Financials 219 | GT,Goodyear Tire & Rubber,Consumer Discretionary 220 | GWW,Grainger (W.W.) 
Inc.,Industrials 221 | HAL,Halliburton Co.,Energy 222 | HBI,Hanesbrands Inc,Consumer Discretionary 223 | HOG,Harley-Davidson,Consumer Discretionary 224 | HRS,Harris Corporation,Information Technology 225 | HIG,Hartford Financial Svc.Gp.,Financials 226 | HAS,Hasbro Inc.,Consumer Discretionary 227 | HCA,HCA Holdings,Health Care 228 | HCP,HCP Inc.,Real Estate 229 | HP,Helmerich & Payne,Energy 230 | HSIC,Henry Schein,Health Care 231 | HES,Hess Corporation,Energy 232 | HPE,Hewlett Packard Enterprise,Information Technology 233 | HLT,Hilton Worldwide Holdings Inc,Consumer Discretionary 234 | HOLX,Hologic,Health Care 235 | HD,Home Depot,Consumer Discretionary 236 | HON,Honeywell Int'l Inc.,Industrials 237 | HRL,Hormel Foods Corp.,Consumer Staples 238 | HST,Host Hotels & Resorts,Real Estate 239 | HPQ,HP Inc.,Information Technology 240 | HUM,Humana Inc.,Health Care 241 | HBAN,Huntington Bancshares,Financials 242 | HII,Huntington Ingalls Industries,Industrials 243 | IDXX,IDEXX Laboratories,Health Care 244 | INFO,IHS Markit Ltd.,Industrials 245 | ITW,Illinois Tool Works,Industrials 246 | ILMN,Illumina Inc,Health Care 247 | INCY,Incyte,Health Care 248 | IR,Ingersoll-Rand PLC,Industrials 249 | INTC,Intel Corp.,Information Technology 250 | ICE,Intercontinental Exchange,Financials 251 | IBM,International Business Machines,Information Technology 252 | IP,International Paper,Materials 253 | IPG,Interpublic Group,Consumer Discretionary 254 | IFF,Intl Flavors & Fragrances,Materials 255 | INTU,Intuit Inc.,Information Technology 256 | ISRG,Intuitive Surgical Inc.,Health Care 257 | IVZ,Invesco Ltd.,Financials 258 | IPGP,IPG Photonics Corp.,Information Technology 259 | IQV,IQVIA Holdings Inc.,Health Care 260 | IRM,Iron Mountain Incorporated,Real Estate 261 | JBHT,J. B. Hunt Transport Services,Industrials 262 | JEC,Jacobs Engineering Group,Industrials 263 | SJM,JM Smucker,Consumer Staples 264 | JNJ,Johnson & Johnson,Health Care 265 | JCI,Johnson Controls International,Industrials 266 | JPM,JPMorgan Chase & Co.,Financials 267 | JNPR,Juniper Networks,Information Technology 268 | KSU,Kansas City Southern,Industrials 269 | K,Kellogg Co.,Consumer Staples 270 | KEY,KeyCorp,Financials 271 | KMB,Kimberly-Clark,Consumer Staples 272 | KIM,Kimco Realty,Real Estate 273 | KMI,Kinder Morgan,Energy 274 | KLAC,KLA-Tencor Corp.,Information Technology 275 | KSS,Kohl's Corp.,Consumer Discretionary 276 | KHC,Kraft Heinz Co,Consumer Staples 277 | KR,Kroger Co.,Consumer Staples 278 | LB,L Brands Inc.,Consumer Discretionary 279 | LLL,L-3 Communications Holdings,Industrials 280 | LH,Laboratory Corp. 
of America Holding,Health Care 281 | LRCX,Lam Research,Information Technology 282 | LEG,Leggett & Platt,Consumer Discretionary 283 | LEN,Lennar Corp.,Consumer Discretionary 284 | LUK,Leucadia National Corp.,Financials 285 | LLY,Lilly (Eli) & Co.,Health Care 286 | LNC,Lincoln National,Financials 287 | LKQ,LKQ Corporation,Consumer Discretionary 288 | LMT,Lockheed Martin Corp.,Industrials 289 | L,Loews Corp.,Financials 290 | LOW,Lowe's Cos.,Consumer Discretionary 291 | LYB,LyondellBasell,Materials 292 | MTB,M&T Bank Corp.,Financials 293 | MAC,Macerich,Real Estate 294 | M,Macy's Inc.,Consumer Discretionary 295 | MRO,Marathon Oil Corp.,Energy 296 | MPC,Marathon Petroleum,Energy 297 | MAR,Marriott Int'l.,Consumer Discretionary 298 | MMC,Marsh & McLennan,Financials 299 | MLM,Martin Marietta Materials,Materials 300 | MAS,Masco Corp.,Industrials 301 | MA,Mastercard Inc.,Information Technology 302 | MAT,Mattel Inc.,Consumer Discretionary 303 | MKC,McCormick & Co.,Consumer Staples 304 | MCD,McDonald's Corp.,Consumer Discretionary 305 | MCK,McKesson Corp.,Health Care 306 | MDT,Medtronic plc,Health Care 307 | MRK,Merck & Co.,Health Care 308 | MET,MetLife Inc.,Financials 309 | MTD,Mettler Toledo,Health Care 310 | MGM,MGM Resorts International,Consumer Discretionary 311 | KORS,Michael Kors Holdings,Consumer Discretionary 312 | MCHP,Microchip Technology,Information Technology 313 | MU,Micron Technology,Information Technology 314 | MSFT,Microsoft Corp.,Information Technology 315 | MAA,Mid-America Apartments,Real Estate 316 | MHK,Mohawk Industries,Consumer Discretionary 317 | TAP,Molson Coors Brewing Company,Consumer Staples 318 | MDLZ,Mondelez International,Consumer Staples 319 | MON,Monsanto Co.,Materials 320 | MNST,Monster Beverage,Consumer Staples 321 | MCO,Moody's Corp,Financials 322 | MS,Morgan Stanley,Financials 323 | MSI,Motorola Solutions Inc.,Information Technology 324 | MYL,Mylan N.V.,Health Care 325 | NDAQ,"Nasdaq, Inc.",Financials 326 | NOV,National Oilwell Varco Inc.,Energy 327 | NAVI,Navient,Financials 328 | NKTR,Nektar Therapeutics,Health Care 329 | NTAP,NetApp,Information Technology 330 | NFLX,Netflix Inc.,Information Technology 331 | NWL,Newell Brands,Consumer Discretionary 332 | NFX,Newfield Exploration Co,Energy 333 | NEM,Newmont Mining Corporation,Materials 334 | NWSA,News Corp. Class A,Consumer Discretionary 335 | NWS,News Corp. 
Class B,Consumer Discretionary 336 | NEE,NextEra Energy,Utilities 337 | NLSN,Nielsen Holdings,Industrials 338 | NKE,Nike,Consumer Discretionary 339 | NI,NiSource Inc.,Utilities 340 | NBL,Noble Energy Inc,Energy 341 | JWN,Nordstrom,Consumer Discretionary 342 | NSC,Norfolk Southern Corp.,Industrials 343 | NTRS,Northern Trust Corp.,Financials 344 | NOC,Northrop Grumman Corp.,Industrials 345 | NCLH,Norwegian Cruise Line,Consumer Discretionary 346 | NRG,NRG Energy,Utilities 347 | NUE,Nucor Corp.,Materials 348 | NVDA,Nvidia Corporation,Information Technology 349 | ORLY,O'Reilly Automotive,Consumer Discretionary 350 | OXY,Occidental Petroleum,Energy 351 | OMC,Omnicom Group,Consumer Discretionary 352 | OKE,ONEOK,Energy 353 | ORCL,Oracle Corp.,Information Technology 354 | PCAR,PACCAR Inc.,Industrials 355 | PKG,Packaging Corporation of America,Materials 356 | PH,Parker-Hannifin,Industrials 357 | PAYX,Paychex Inc.,Information Technology 358 | PYPL,PayPal,Information Technology 359 | PNR,Pentair Ltd.,Industrials 360 | PBCT,People's United Financial,Financials 361 | PEP,PepsiCo Inc.,Consumer Staples 362 | PKI,PerkinElmer,Health Care 363 | PRGO,Perrigo,Health Care 364 | PFE,Pfizer Inc.,Health Care 365 | PCG,PG&E Corp.,Utilities 366 | PM,Philip Morris International,Consumer Staples 367 | PSX,Phillips 66,Energy 368 | PNW,Pinnacle West Capital,Utilities 369 | PXD,Pioneer Natural Resources,Energy 370 | PNC,PNC Financial Services,Financials 371 | RL,Polo Ralph Lauren Corp.,Consumer Discretionary 372 | PPG,PPG Industries,Materials 373 | PPL,PPL Corp.,Utilities 374 | PX,Praxair Inc.,Materials 375 | PFG,Principal Financial Group,Financials 376 | PG,Procter & Gamble,Consumer Staples 377 | PGR,Progressive Corp.,Financials 378 | PLD,Prologis,Real Estate 379 | PRU,Prudential Financial,Financials 380 | PEG,Public Serv. 
Enterprise Inc.,Utilities 381 | PSA,Public Storage,Real Estate 382 | PHM,Pulte Homes Inc.,Consumer Discretionary 383 | PVH,PVH Corp.,Consumer Discretionary 384 | QRVO,Qorvo,Information Technology 385 | QCOM,QUALCOMM Inc.,Information Technology 386 | PWR,Quanta Services Inc.,Industrials 387 | DGX,Quest Diagnostics,Health Care 388 | RRC,Range Resources Corp.,Energy 389 | RJF,Raymond James Financial Inc.,Financials 390 | RTN,Raytheon Co.,Industrials 391 | O,Realty Income Corporation,Real Estate 392 | RHT,Red Hat Inc.,Information Technology 393 | REG,Regency Centers Corporation,Real Estate 394 | REGN,Regeneron,Health Care 395 | RF,Regions Financial Corp.,Financials 396 | RSG,Republic Services Inc,Industrials 397 | RMD,ResMed,Health Care 398 | RHI,Robert Half International,Industrials 399 | ROK,Rockwell Automation Inc.,Industrials 400 | COL,Rockwell Collins,Industrials 401 | ROP,Roper Technologies,Industrials 402 | ROST,Ross Stores,Consumer Discretionary 403 | RCL,Royal Caribbean Cruises Ltd,Consumer Discretionary 404 | SPGI,"S&P Global, Inc.",Financials 405 | CRM,Salesforce.com,Information Technology 406 | SBAC,SBA Communications,Real Estate 407 | SCG,SCANA Corp,Utilities 408 | SLB,Schlumberger Ltd.,Energy 409 | STX,Seagate Technology,Information Technology 410 | SEE,Sealed Air,Materials 411 | SRE,Sempra Energy,Utilities 412 | SHW,Sherwin-Williams,Materials 413 | SPG,Simon Property Group Inc,Real Estate 414 | SWKS,Skyworks Solutions,Information Technology 415 | SLG,SL Green Realty,Real Estate 416 | SNA,Snap-On Inc.,Consumer Discretionary 417 | SO,Southern Co.,Utilities 418 | LUV,Southwest Airlines,Industrials 419 | SWK,Stanley Black & Decker,Consumer Discretionary 420 | SBUX,Starbucks Corp.,Consumer Discretionary 421 | STT,State Street Corp.,Financials 422 | SRCL,Stericycle Inc,Industrials 423 | SYK,Stryker Corp.,Health Care 424 | STI,SunTrust Banks,Financials 425 | SIVB,SVB Financial,Financials 426 | SYMC,Symantec Corp.,Information Technology 427 | SYF,Synchrony Financial,Financials 428 | SNPS,Synopsys Inc.,Information Technology 429 | SYY,Sysco Corp.,Consumer Staples 430 | TROW,T. Rowe Price Group,Financials 431 | TTWO,Take-Two Interactive,Information Technology 432 | TPR,"Tapestry, Inc.",Consumer Discretionary 433 | TGT,Target Corp.,Consumer Discretionary 434 | TEL,TE Connectivity Ltd.,Information Technology 435 | FTI,TechnipFMC,Energy 436 | TXN,Texas Instruments,Information Technology 437 | TXT,Textron Inc.,Industrials 438 | BK,The Bank of New York Mellon Corp.,Financials 439 | CLX,The Clorox Company,Consumer Staples 440 | COO,The Cooper Companies,Health Care 441 | HSY,The Hershey Company,Consumer Staples 442 | MOS,The Mosaic Company,Materials 443 | TRV,The Travelers Companies Inc.,Financials 444 | DIS,The Walt Disney Company,Consumer Discretionary 445 | TMO,Thermo Fisher Scientific,Health Care 446 | TIF,Tiffany & Co.,Consumer Discretionary 447 | TWX,Time Warner Inc.,Consumer Discretionary 448 | TJX,TJX Companies Inc.,Consumer Discretionary 449 | TMK,Torchmark Corp.,Financials 450 | TSS,Total System Services,Information Technology 451 | TSCO,Tractor Supply Company,Consumer Discretionary 452 | TDG,TransDigm Group,Industrials 453 | TRIP,TripAdvisor,Consumer Discretionary 454 | FOXA,Twenty-First Century Fox Class A,Consumer Discretionary 455 | FOX,Twenty-First Century Fox Class B,Consumer Discretionary 456 | TSN,Tyson Foods,Consumer Staples 457 | USB,U.S. 
Bancorp,Financials 458 | UDR,UDR Inc,Real Estate 459 | ULTA,Ulta Beauty,Consumer Discretionary 460 | UAA,Under Armour Class A,Consumer Discretionary 461 | UA,Under Armour Class C,Consumer Discretionary 462 | UNP,Union Pacific,Industrials 463 | UAL,United Continental Holdings,Industrials 464 | UNH,United Health Group Inc.,Health Care 465 | UPS,United Parcel Service,Industrials 466 | URI,"United Rentals, Inc.",Industrials 467 | UTX,United Technologies,Industrials 468 | UHS,"Universal Health Services, Inc.",Health Care 469 | UNM,Unum Group,Financials 470 | VFC,V.F. Corp.,Consumer Discretionary 471 | VLO,Valero Energy,Energy 472 | VAR,Varian Medical Systems,Health Care 473 | VTR,Ventas Inc,Real Estate 474 | VRSN,Verisign Inc.,Information Technology 475 | VRSK,Verisk Analytics,Industrials 476 | VZ,Verizon Communications,Telecommunication Services 477 | VRTX,Vertex Pharmaceuticals Inc,Health Care 478 | VIAB,Viacom Inc.,Consumer Discretionary 479 | V,Visa Inc.,Information Technology 480 | VNO,Vornado Realty Trust,Real Estate 481 | VMC,Vulcan Materials,Materials 482 | WMT,Wal-Mart Stores,Consumer Staples 483 | WBA,Walgreens Boots Alliance,Consumer Staples 484 | WM,Waste Management Inc.,Industrials 485 | WAT,Waters Corporation,Health Care 486 | WEC,Wec Energy Group Inc,Utilities 487 | WFC,Wells Fargo,Financials 488 | WELL,Welltower Inc.,Real Estate 489 | WDC,Western Digital,Information Technology 490 | WU,Western Union Co,Information Technology 491 | WRK,WestRock Company,Materials 492 | WY,Weyerhaeuser Corp.,Real Estate 493 | WHR,Whirlpool Corp.,Consumer Discretionary 494 | WMB,Williams Cos.,Energy 495 | WLTW,Willis Towers Watson,Financials 496 | WYN,Wyndham Worldwide,Consumer Discretionary 497 | WYNN,Wynn Resorts Ltd,Consumer Discretionary 498 | XEL,Xcel Energy Inc,Utilities 499 | XRX,Xerox Corp.,Information Technology 500 | XLNX,Xilinx Inc,Information Technology 501 | XL,XL Capital,Financials 502 | XYL,Xylem Inc.,Industrials 503 | YUM,Yum! 
Brands Inc,Consumer Discretionary 504 | ZBH,Zimmer Biomet Holdings,Health Care 505 | ZION,Zions Bancorp,Financials 506 | ZTS,Zoetis,Health Care 507 | -------------------------------------------------------------------------------- /data/dict_example_export.json: -------------------------------------------------------------------------------- 1 | {"first": "Guido", "last": "van Rossum"} -------------------------------------------------------------------------------- /data/json_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "planeId": "1xc2345g", 3 | "manufacturerDetails": { 4 | "manufacturer": "Airbus", 5 | "model": "A330", 6 | "year": 1999 7 | }, 8 | "airlineDetails": { 9 | "currentAirline": "Southwest", 10 | "previousAirlines": { 11 | "1st": "Delta" 12 | }, 13 | "lastPurchased": 2013 14 | }, 15 | "numberOfFlights": 4654 16 | } 17 | -------------------------------------------------------------------------------- /data/pickle_example.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/data/pickle_example.pickle -------------------------------------------------------------------------------- /data/pickle_example_export.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/data/pickle_example_export.pickle -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: uc-python 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python=3.12 7 | - pip>=22.1 8 | - nbconvert>=6.1 9 | - numpy=1.26 10 | - pandas=2 11 | - seaborn=0.12 12 | - ipykernel>=6.28 13 | - jinja2>=2.11 14 | - jedi>=0.17 15 | -------------------------------------------------------------------------------- /notebooks/00-Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Introduction to Python for Data Science\n", 12 | "\n", 13 | "Gus Powers & Jay Cunningham" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "## Introductions" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "slide" 32 | } 33 | }, 34 | "source": [ 35 | "## Gus Powers\n", 36 | "\n", 37 | " \n", 38 | " \n", 57 | "
\n", 39 | "

Lead Data Scientist at 84.51°

\n", 40 | "
    \n", 41 | "
  • Creating and maintaining data science tools for internal use
  • \n", 42 | "
  • Python, Bash (shell), & R
  • \n", 43 | "
\n", 44 | "

Academic

\n", 45 | "
    \n", 46 | "
  • BS, Chemistry, Thomas More College
  • \n", 47 | "
  • MS, Chemistry, University of Cincinnati
  • \n", 48 | "
  • MS, Business Analytics, University of Cincinnati
  • \n", 49 | "
\n", 50 | "

Contact

\n", 51 | " \n", 56 | "
" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "slideshow": { 64 | "slide_type": "slide" 65 | } 66 | }, 67 | "source": [ 68 | "## Jay Cunningham\n", 69 | "\n", 70 | " \n", 71 | " \n", 88 | "
\n", 72 | "

Lead Data Scientist at 84.51°

\n", 73 | "
    \n", 74 | "
  • Researching and developing forecasting models
  • \n", 75 | "
  • Machine learning, Python
  • \n", 76 | "
\n", 77 | "

Academic

\n", 78 | "
    \n", 79 | "
  • BA, Mathematics, University of Kentucky
  • \n", 80 | "
  • MA, Economics, University of North Carolina (Greensboro)
  • \n", 81 | "
\n", 82 | "

Contact

\n", 83 | " \n", 87 | "
" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": { 94 | "slideshow": { 95 | "slide_type": "skip" 96 | } 97 | }, 98 | "source": [ 99 | "## Brad Boehmke\n", 100 | "\n", 101 | " \n", 102 | " \n", 124 | "
\n", 103 | "

Director, Data Science at 84.51°

\n", 104 | "
    \n", 105 | "
  • Productionizing models and science solutions
  • \n", 106 | "
      • R&D and prototyping new solutions
    
  • \n", 107 | "
  • Python, R, & MLOps toolchain
  • \n", 108 | "
\n", 109 | "

Academic

\n", 110 | "
    \n", 111 | "
  • BS, Kinesiology, North Dakota State University
  • \n", 112 | "
  • MS, Cost Analytics, Air Force Institute of Technology
  • \n", 113 | "
  • PhD, Logistics, Air Force Institute of Technology
  • \n", 114 | "
\n", 115 | "

Contact

\n", 116 | " \n", 123 | "
" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": { 130 | "slideshow": { 131 | "slide_type": "skip" 132 | } 133 | }, 134 | "source": [ 135 | "## Ethan Swan\n", 136 | "\n", 137 | " \n", 138 | " \n", 159 | "
\n", 139 | "

Senior Backend Engineer at ReviewTrackers

\n", 140 | "
    \n", 141 | "
      • REST API development
    
  • \n", 142 | "
  • Putting ML models in production
  • \n", 143 | "
  • Python, Go, Ruby, & ReactJS (JavaScript)
  • \n", 144 | "
\n", 145 | "

Academic

\n", 146 | "
    \n", 147 | "
  • BS, Computer Science, University of Notre Dame
  • \n", 148 | "
  • MBA, Business Analytics, University of Notre Dame
  • \n", 149 | "
\n", 150 | "

Contact

\n", 151 | " \n", 158 | "
" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": { 165 | "slideshow": { 166 | "slide_type": "slide" 167 | } 168 | }, 169 | "source": [ 170 | "### Around The Room\n", 171 | "\n", 172 | "We'll go around the room. Please share:\n", 173 | "\n", 174 | "1. Your name\n", 175 | "2. Your job or field\n", 176 | "3. How you use Python now or would like to in the future" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": { 182 | "slideshow": { 183 | "slide_type": "slide" 184 | } 185 | }, 186 | "source": [ 187 | "## Course" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": { 193 | "slideshow": { 194 | "slide_type": "slide" 195 | } 196 | }, 197 | "source": [ 198 | "### Defining Data Science\n", 199 | "\n", 200 | "
\n", 201 | "\"data-science.png\"\n", 202 | "
" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": { 208 | "slideshow": { 209 | "slide_type": "slide" 210 | } 211 | }, 212 | "source": [ 213 | "### Data Science and Technology\n", 214 | "\n", 215 | "
\n", 216 | "\"data-science-and-tech.png\"\n", 217 | "
" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": { 223 | "slideshow": { 224 | "slide_type": "slide" 225 | } 226 | }, 227 | "source": [ 228 | "### Applied Data Science\n", 229 | "\n", 230 | "
\n", 231 | "\"applied-data-science.gif\"\n", 232 | "
" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "slideshow": { 239 | "slide_type": "slide" 240 | } 241 | }, 242 | "source": [ 243 | "## Course Objectives\n", 244 | "\n", 245 | "The following are the primary learning objectives of this course:" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "slideshow": { 252 | "slide_type": "fragment" 253 | } 254 | }, 255 | "source": [ 256 | "1. Develop comprehensive skills in the importing/exporting, wrangling, aggregating and joining of data using Python.\n", 257 | "2. Establish a mental model of the Python programming language to enable future self-learning.\n", 258 | "3. Build awareness and basic skills in the core data science area of data visualization." 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": { 264 | "slideshow": { 265 | "slide_type": "slide" 266 | } 267 | }, 268 | "source": [ 269 | "## Course Agenda" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": { 275 | "slideshow": { 276 | "slide_type": "slide" 277 | } 278 | }, 279 | "source": [ 280 | "| Day | Topic | Time |\n", 281 | "| :--------:| :----------------------------------------------------------------------------- | :-----------: |\n", 282 | "| __Day 1__ | Introductions | 12:30 - 12:45 |\n", 283 | "| | Python and Jupyter Overview | 12:45 - 1:15 |\n", 284 | "| | Fundamentals | 1:15 - 2:00 |\n", 285 | "| | Break | 2:00 - 2:15 |\n", 286 | "| | Packages, Modules, Methods, Functions | 2:15 - 3:00 |\n", 287 | "| | Importing Data | 3:00 - 3:45 |\n", 288 | "| | Q\\&A | 3:45 - 4:15 |\n", 289 | "| __Day 2__ | Q\\&A | 12:45 - 1:00 |\n", 290 | "| | Selecting and Filtering Data | 1:00 - 1:45 |\n", 291 | "| | Working with Columns | 1:45 - 2:30 |\n", 292 | "| | Break | 2:30 - 2:45 |\n", 293 | "| | Case Study, pt. 1 | 2:45 - 3:45 |\n", 294 | "| | Q\\&A | 3:45 - 4:15 |" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": { 300 | "slideshow": { 301 | "slide_type": "slide" 302 | } 303 | }, 304 | "source": [ 305 | "| Day | Topic | Time |\n", 306 | "| :--------:| :----------------------------------------------------------------------------- | :-----------: |\n", 307 | "| __Day 3__ | Q\\&A | 12:45 - 1:00 |\n", 308 | "| | Review | 1:00 - 1:15 |\n", 309 | "| | Summarizing Data | 1:15 - 2:00 |\n", 310 | "| | Break | 2:00 - 2:15 |\n", 311 | "| | Summarizing Grouped Data | 2:15 - 3:00 |\n", 312 | "| | Joining Data | 3:00 - 3:45 |\n", 313 | "| | Q\\&A | 3:45 - 4:15 |\n", 314 | "| __Day 4__ | Q\\&A | 12:45 - 1:00 |\n", 315 | "| | Exporting Data | 1:00 - 1:30 |\n", 316 | "| | Visualizing Data | 1:30 - 2:30 |\n", 317 | "| | Break | 2:30 - 2:45 |\n", 318 | "| | Case Study, pt. 
2 | 2:45 - 3:45 |\n", 319 | "| | Q\\&A | 3:45 - 4:15 |" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": { 325 | "slideshow": { 326 | "slide_type": "slide" 327 | } 328 | }, 329 | "source": [ 330 | "## Technologies" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": { 336 | "cell_style": "center", 337 | "slideshow": { 338 | "slide_type": "slide" 339 | } 340 | }, 341 | "source": [ 342 | "### Binder\n", 343 | "\n", 344 | "* We've developed this class using a product named [Binder](https://mybinder.org/).\n", 345 | "* As a result, this course requires *zero* setup on your part.\n", 346 | "* There are two core techologies within the Binder repository: Python and Jupyter.\n", 347 | "\n", 348 | "*We will cover more on this shortly.*" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": { 354 | "cell_style": "split", 355 | "slideshow": { 356 | "slide_type": "slide" 357 | } 358 | }, 359 | "source": [ 360 | "### Python\n", 361 | "\n", 362 | "* Python is the programming language we'll be learning in this class.\n", 363 | "* We are using Python 3.12, the newest version of Python, for the entirety of this class.\n", 364 | "* The core libaries we will be using are `pandas` and `seaborn`." 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": { 370 | "cell_style": "split", 371 | "slideshow": { 372 | "slide_type": "fragment" 373 | } 374 | }, 375 | "source": [ 376 | "### Jupyter\n", 377 | "\n", 378 | "* Jupyter is the integrated development environment (IDE) we will be using.\n", 379 | "* This is where we will write and run our Python code.\n" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": { 385 | "slideshow": { 386 | "slide_type": "slide" 387 | } 388 | }, 389 | "source": [ 390 | "## Course Material" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": { 396 | "slideshow": { 397 | "slide_type": "fragment" 398 | } 399 | }, 400 | "source": [ 401 | "* All of the material for this course can be reached from our [GitHub](https://github.com/uc-python/intro-python-datasci) repository." 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": { 407 | "slideshow": { 408 | "slide_type": "fragment" 409 | } 410 | }, 411 | "source": [ 412 | "* You can either access this material through [Binder](https://mybinder.org/v2/gh/uc-python/intro-python-datasci/main) or by [downloading the material](https://github.com/uc-python/intro-python-datasci/archive/refs/heads/main.zip)\n", 413 | " and opening it via Anaconda Navigator and JupyterLab." 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": { 419 | "slideshow": { 420 | "slide_type": "slide" 421 | } 422 | }, 423 | "source": [ 424 | "### Slides *are* notebooks" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": { 430 | "slideshow": { 431 | "slide_type": "fragment" 432 | } 433 | }, 434 | "source": [ 435 | "* We will be teaching using slides." 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": { 441 | "slideshow": { 442 | "slide_type": "fragment" 443 | } 444 | }, 445 | "source": [ 446 | "* These slides are created from the notebooks in the course repository -- so you can follow along and run the code in your notebook." 
447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": { 452 | "slideshow": { 453 | "slide_type": "slide" 454 | } 455 | }, 456 | "source": [ 457 | "### Source Code" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": { 463 | "slideshow": { 464 | "slide_type": "fragment" 465 | } 466 | }, 467 | "source": [ 468 | "* Source code for the training can be found on [GitHub](https://github.com/uc-python/intro-python-datasci)." 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": { 474 | "slideshow": { 475 | "slide_type": "fragment" 476 | } 477 | }, 478 | "source": [ 479 | "* This repository is public so you can clone (download) and/or refer to the materials at any point in the future." 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": { 485 | "slideshow": { 486 | "slide_type": "slide" 487 | } 488 | }, 489 | "source": [ 490 | "## Questions\n", 491 | "\n", 492 | "Are there any questions before moving on?" 493 | ] 494 | } 495 | ], 496 | "metadata": { 497 | "celltoolbar": "Slideshow", 498 | "kernelspec": { 499 | "display_name": "uc-python", 500 | "language": "python", 501 | "name": "python3" 502 | }, 503 | "language_info": { 504 | "codemirror_mode": { 505 | "name": "ipython", 506 | "version": 3 507 | }, 508 | "file_extension": ".py", 509 | "mimetype": "text/x-python", 510 | "name": "python", 511 | "nbconvert_exporter": "python", 512 | "pygments_lexer": "ipython3", 513 | "version": "3.11.4" 514 | }, 515 | "rise": { 516 | "autolaunch": true, 517 | "transition": "none" 518 | } 519 | }, 520 | "nbformat": 4, 521 | "nbformat_minor": 4 522 | } 523 | -------------------------------------------------------------------------------- /notebooks/01-Python-and-Jupyter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Python and Jupyter Overview\n", 12 | "\n", 13 | "![](images/python_jupyter.png)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "## Python" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "slide" 32 | } 33 | }, 34 | "source": [ 35 | "### Python is..." 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "fragment" 43 | } 44 | }, 45 | "source": [ 46 | "* a *high-level*, *structured*, *open-source*, *interpreted* programming language" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "slideshow": { 53 | "slide_type": "fragment" 54 | } 55 | }, 56 | "source": [ 57 | "* a really good choice for almost any programming task" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "slideshow": { 64 | "slide_type": "fragment" 65 | } 66 | }, 67 | "source": [ 68 | "* a very popular and effective choice for data science tasks" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "slideshow": { 75 | "slide_type": "slide" 76 | } 77 | }, 78 | "source": [ 79 | "According to StackOverflow Trends, more than 11 percent of Stack Overflow questions were tagged with \"python\" in late 2018. 
All other languages fell well short of this number:\n", 80 | "\n", 81 | "| Language | Percent |\n", 82 | "|----------|---------|\n", 83 | "| Python | 11.2% |\n", 84 | "| Java | 7.7% |\n", 85 | "| C++ | 2.75% |\n", 86 | "| R | 2.7% |\n", 87 | "| Matlab | < 1% |\n", 88 | "| Scala | < 1% |\n", 89 | "| SAS | < 1% |\n", 90 | "| Julia | < 1% |" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "slideshow": { 97 | "slide_type": "slide" 98 | }, 99 | "tags": [] 100 | }, 101 | "source": [ 102 | "More recently, the [2021 Stack Overflow Developer Survey](https://insights.stackoverflow.com/survey/2021) shows that Python is used by over 48% of all \"developers\"\n", 103 | "\n", 104 | "
\n", 105 | "\"Stack\n", 106 | "
" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "slideshow": { 113 | "slide_type": "slide" 114 | } 115 | }, 116 | "source": [ 117 | "### Python in the real world\n", 118 | "\n", 119 | "Python is one of the *the most popular programming languages in the world*. It's commonly used for:\n", 120 | "\n", 121 | "* Application development\n", 122 | "* Scripting\n", 123 | "* Automation\n", 124 | "* Testing\n", 125 | "* Data science" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": { 131 | "slideshow": { 132 | "slide_type": "slide" 133 | } 134 | }, 135 | "source": [ 136 | "### Python in data science\n", 137 | "\n", 138 | "As previously mentioned, Python is also a popular choice in data science. For reference:" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "slideshow": { 145 | "slide_type": "fragment" 146 | } 147 | }, 148 | "source": [ 149 | " * According to KDNuggets, 65.6 percent of data scientists used Python regularly in 2018. This was an *increase* from 54 percent in 2017." 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": { 155 | "slideshow": { 156 | "slide_type": "fragment" 157 | } 158 | }, 159 | "source": [ 160 | " * In contrast, R was used by 48.5 percent of data scientists in 2018. This was a *decrease* from 63 percent in 2017." 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "slideshow": { 167 | "slide_type": "fragment" 168 | } 169 | }, 170 | "source": [ 171 | "
\n", 172 | "

Note

\n", 173 | "

Disclaimer: R is terrific and an excellent tool for data science.

\n", 174 | "
" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": { 180 | "slideshow": { 181 | "slide_type": "slide" 182 | } 183 | }, 184 | "source": [ 185 | "### Why are data scientists choosing Python?" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "slideshow": { 192 | "slide_type": "fragment" 193 | } 194 | }, 195 | "source": [ 196 | " * It can do anything...so everybody uses it\n", 197 | " " 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": { 203 | "slideshow": { 204 | "slide_type": "fragment" 205 | }, 206 | "tags": [] 207 | }, 208 | "source": [ 209 | "* Consistency across engineering and data science teams" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": { 215 | "slideshow": { 216 | "slide_type": "fragment" 217 | } 218 | }, 219 | "source": [ 220 | " * Open-source and community support" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "slideshow": { 227 | "slide_type": "fragment" 228 | } 229 | }, 230 | "source": [ 231 | " * Concise syntax, readability and ease-of-use" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": { 237 | "slideshow": { 238 | "slide_type": "fragment" 239 | } 240 | }, 241 | "source": [ 242 | " * Strength in numeric computations and cutting edge data science libraries" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": { 248 | "slideshow": { 249 | "slide_type": "slide" 250 | } 251 | }, 252 | "source": [ 253 | "## Jupyter" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "slideshow": { 260 | "slide_type": "slide" 261 | } 262 | }, 263 | "source": [ 264 | "### JupyterLab is..." 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": { 270 | "slideshow": { 271 | "slide_type": "fragment" 272 | } 273 | }, 274 | "source": [ 275 | "* a *language-agnostic integrated development environment (IDE)* specializing in **notebooks**" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": { 281 | "slideshow": { 282 | "slide_type": "fragment" 283 | } 284 | }, 285 | "source": [ 286 | "* a popular choice among data scientists using Python" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": { 292 | "slideshow": { 293 | "slide_type": "fragment" 294 | } 295 | }, 296 | "source": [ 297 | "
\n", 298 | "

Note

\n", 299 | "

We've chosen to use Jupyter over other popular IDEs for this course but that does not mean it is always the best IDE for writing Python.

\n", 300 | "
" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": { 306 | "slideshow": { 307 | "slide_type": "slide" 308 | } 309 | }, 310 | "source": [ 311 | "### Why are data scientists choosing Jupyter?" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": { 317 | "slideshow": { 318 | "slide_type": "fragment" 319 | } 320 | }, 321 | "source": [ 322 | "* Ad-hoc analyses and science development" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": { 328 | "slideshow": { 329 | "slide_type": "fragment" 330 | } 331 | }, 332 | "source": [ 333 | "* Synchronous data visualizations" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": { 339 | "slideshow": { 340 | "slide_type": "fragment" 341 | } 342 | }, 343 | "source": [ 344 | "* Documentation of code with accompanying comments" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": { 350 | "slideshow": { 351 | "slide_type": "fragment" 352 | } 353 | }, 354 | "source": [ 355 | "* Flexibility through extensions" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": { 361 | "slideshow": { 362 | "slide_type": "slide" 363 | } 364 | }, 365 | "source": [ 366 | "### Reasons you may want to use another IDE..." 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "slideshow": { 373 | "slide_type": "fragment" 374 | } 375 | }, 376 | "source": [ 377 | "* Jupyter is not good for developing *scripts*, or Python code that isn't run interactively" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": { 383 | "slideshow": { 384 | "slide_type": "fragment" 385 | } 386 | }, 387 | "source": [ 388 | "* It's not trivial to install and launch Jupyter without a tech background" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": { 394 | "slideshow": { 395 | "slide_type": "fragment" 396 | } 397 | }, 398 | "source": [ 399 | "
\n", 400 | "

Note

\n", 401 | "

We've minimized these challenges for this workshop by using Binder. If you are interested in installing and using Jupyter and Python on your own machine, we recommend using Anaconda to do so.

\n", 402 | "
" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "slideshow": { 409 | "slide_type": "slide" 410 | } 411 | }, 412 | "source": [ 413 | "## Jupyter Basics" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": { 419 | "slideshow": { 420 | "slide_type": "slide" 421 | } 422 | }, 423 | "source": [ 424 | "### Launching Jupyter \n", 425 | "\n", 426 | "Let's begin by launching Jupyter by opening the [Binder repository](https://mybinder.org/v2/gh/uc-python/intro-python-datasci/master?urlpath=lab). This is how we will access Jupyter throughout the course." 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "slideshow": { 433 | "slide_type": "fragment" 434 | } 435 | }, 436 | "source": [ 437 | "You should see a screen like this:\n", 438 | "\n", 439 | "![binder-launching.png](images/binder-launching.png)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": { 445 | "slideshow": { 446 | "slide_type": "slide" 447 | } 448 | }, 449 | "source": [ 450 | "Jupyter can also be launched via Anaconda Navigator:\n", 451 | "\n", 452 | "
\n", 453 | "\"navigator-jupyter.png\"\n", 454 | "
" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": { 460 | "slideshow": { 461 | "slide_type": "fragment" 462 | } 463 | }, 464 | "source": [ 465 | "Note that we want to launch the JupyterLab option for this class." 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": { 471 | "slideshow": { 472 | "slide_type": "slide" 473 | } 474 | }, 475 | "source": [ 476 | "### Jupyter File Structure" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": { 482 | "slideshow": { 483 | "slide_type": "fragment" 484 | } 485 | }, 486 | "source": [ 487 | "As you can see, Jupyter displays a file browser when it launches:\n", 488 | "\n", 489 | "![jupyter-file-structure.png](images/jupyter-file-structure.png)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": { 495 | "slideshow": { 496 | "slide_type": "slide" 497 | } 498 | }, 499 | "source": [ 500 | "There is just one directory for you to worry about:\n", 501 | "\n", 502 | "**notebooks** - interactive slideshows and code for you to follow along with" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": { 508 | "slideshow": { 509 | "slide_type": "slide" 510 | } 511 | }, 512 | "source": [ 513 | "### Jupyter Notebooks" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": { 519 | "slideshow": { 520 | "slide_type": "fragment" 521 | } 522 | }, 523 | "source": [ 524 | "The **notebook** is the core file used to interact with Python from Jupyter. A few details:" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": { 530 | "slideshow": { 531 | "slide_type": "fragment" 532 | } 533 | }, 534 | "source": [ 535 | "* Notebooks allow the writing AND running of Python code" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": { 541 | "slideshow": { 542 | "slide_type": "fragment" 543 | } 544 | }, 545 | "source": [ 546 | "* Notebooks are organized by **cells** - code and commentary text goes in the cells" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": { 552 | "slideshow": { 553 | "slide_type": "fragment" 554 | } 555 | }, 556 | "source": [ 557 | "* All notebook files have the extension `.ipynb` (**i**nteractive **py**thon **n**ote**b**ook)" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": { 563 | "slideshow": { 564 | "slide_type": "slide" 565 | } 566 | }, 567 | "source": [ 568 | "A new Jupyter notebook can be opened from the \"launcher\" page, which opens automatically when you start JupyterLab.\n", 569 | "\n", 570 | "There may be multiple options listed in the Notebooks section, as seen here. These are different Python installations available on your computer. 
You should choose **Python 3** in this case.\n", 571 | "\n", 572 | "![open-jupyter-notebook](images/open-jupyter-notebook.png)" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": { 578 | "slideshow": { 579 | "slide_type": "slide" 580 | } 581 | }, 582 | "source": [ 583 | "This will open a new notebook with a Python 3 kernel:\n", 584 | "\n", 585 | "![new-jupyter-notebook](images/new-jupyter-notebook.png)" 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "metadata": { 591 | "slideshow": { 592 | "slide_type": "slide" 593 | } 594 | }, 595 | "source": [ 596 | "### Notebook Cells" 597 | ] 598 | }, 599 | { 600 | "cell_type": "markdown", 601 | "metadata": { 602 | "slideshow": { 603 | "slide_type": "fragment" 604 | } 605 | }, 606 | "source": [ 607 | "As previously mentioned, Jupyter notebooks are organized by **cells**. These cells are at the core of a notebook:" 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": { 613 | "slideshow": { 614 | "slide_type": "fragment" 615 | } 616 | }, 617 | "source": [ 618 | "* When using Jupyter, all Python code is typed into and run from a cell" 619 | ] 620 | }, 621 | { 622 | "cell_type": "markdown", 623 | "metadata": { 624 | "slideshow": { 625 | "slide_type": "fragment" 626 | } 627 | }, 628 | "source": [ 629 | "* Comments, markdown, HTML, LaTeX can also be rendered within a cell" 630 | ] 631 | }, 632 | { 633 | "cell_type": "markdown", 634 | "metadata": { 635 | "slideshow": { 636 | "slide_type": "slide" 637 | } 638 | }, 639 | "source": [ 640 | "### Code Cells\n", 641 | "\n", 642 | "By default, all cells are code cells. This means Python code can be run by simply:\n", 643 | "\n", 644 | "1. Clicking on a cell's input area\n", 645 | "2. Typing Python code into the cell\n", 646 | "3. Pressing CTRL + RETURN (or SHIFT + RETURN)" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": { 652 | "slideshow": { 653 | "slide_type": "fragment" 654 | } 655 | }, 656 | "source": [ 657 | "The results of the code will be printed to the output area:\n", 658 | "\n", 659 | "![python-code-cell.png](images/python-code-cell.png)" 660 | ] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": { 665 | "slideshow": { 666 | "slide_type": "slide" 667 | } 668 | }, 669 | "source": [ 670 | "### Comment/Markdown/HTML/LaTeX Cells\n", 671 | "\n", 672 | "Cells can be converted to text-oriented cells by:\n", 673 | "\n", 674 | "1. Selecting a cell by clicking on it\n", 675 | "2. Clicking the \"Code\" dropdown on the edit panel\n", 676 | "3. 
Clicking the \"Markdown\" option\n", 677 | "\n", 678 | "![markdown-cell-selection.png](images/markdown-cell-selection.png)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "markdown", 683 | "metadata": { 684 | "slideshow": { 685 | "slide_type": "slide" 686 | } 687 | }, 688 | "source": [ 689 | "Text can then be typed into these cells (regular text, markdown, HTML, LaTeX):\n", 690 | "\n", 691 | "![markdown-cell-unrendered.png](images/markdown-cell-unrendered.png)" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": { 697 | "slideshow": { 698 | "slide_type": "slide" 699 | } 700 | }, 701 | "source": [ 702 | "And it can be rendered by pressing CTRL + RETURN:\n", 703 | "\n", 704 | "![markdown-cell-rendered.png](images/markdown-cell-rendered.png)" 705 | ] 706 | }, 707 | { 708 | "cell_type": "markdown", 709 | "metadata": { 710 | "slideshow": { 711 | "slide_type": "slide" 712 | } 713 | }, 714 | "source": [ 715 | "### Inserting New Cells\n", 716 | "\n", 717 | "New cells can be inserted by selecting a cell by clicking on it and\n", 718 | "\n", 719 | "* Clicking the \"+\" menu button add a new cell below the selected one\n", 720 | "* Or using the keyboard shortcut `\"a\"` to insert cell above or `\"b\"` to insert cell below\n", 721 | "\n", 722 | "![insert-new-cell.png](images/insert-new-cell.png)" 723 | ] 724 | }, 725 | { 726 | "cell_type": "markdown", 727 | "metadata": { 728 | "slideshow": { 729 | "slide_type": "slide" 730 | } 731 | }, 732 | "source": [ 733 | "### Your Turn\n", 734 | "\n", 735 | "1. Create or open a notebook in Jupyter.\n", 736 | "2. Create a new *markdown* cell. Write your name in it.\n", 737 | "3. Create a new *code* cell. Write `x = 5` and run it." 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": { 743 | "slideshow": { 744 | "slide_type": "slide" 745 | } 746 | }, 747 | "source": [ 748 | "### Additional Tips" 749 | ] 750 | }, 751 | { 752 | "cell_type": "markdown", 753 | "metadata": { 754 | "slideshow": { 755 | "slide_type": "fragment" 756 | } 757 | }, 758 | "source": [ 759 | "* Notebooks can be saved by clicking \"File\" -> \"Save Notebook As...\"" 760 | ] 761 | }, 762 | { 763 | "cell_type": "markdown", 764 | "metadata": { 765 | "slideshow": { 766 | "slide_type": "fragment" 767 | } 768 | }, 769 | "source": [ 770 | "* An easy way to find a feature or its related keyboard shortcut is \"View\" -> \"Activate Command Palette\"" 771 | ] 772 | }, 773 | { 774 | "cell_type": "markdown", 775 | "metadata": { 776 | "slideshow": { 777 | "slide_type": "fragment" 778 | } 779 | }, 780 | "source": [ 781 | "* Notebooks can be downloaded from Binder in numerous formats by clicking \"File\" -> \"Download\"\n", 782 | " * This is a great way to save your work when using Binder. These notebooks can then be reloaded at any time.\n", 783 | " \n", 784 | "
\n", 785 | "

Caution!

\n", 786 | "

New content created in Binder does not persist across sessions. Also, Binder tends to shutdown after a few minutes of inactivity. Save and download your work accordingly!

\n", 787 | "
" 788 | ] 789 | }, 790 | { 791 | "cell_type": "markdown", 792 | "metadata": { 793 | "slideshow": { 794 | "slide_type": "slide" 795 | } 796 | }, 797 | "source": [ 798 | "## Questions\n", 799 | "\n", 800 | "Are there any questions before moving on?" 801 | ] 802 | } 803 | ], 804 | "metadata": { 805 | "celltoolbar": "Slideshow", 806 | "kernelspec": { 807 | "display_name": "Python 3 (ipykernel)", 808 | "language": "python", 809 | "name": "python3" 810 | }, 811 | "language_info": { 812 | "codemirror_mode": { 813 | "name": "ipython", 814 | "version": 3 815 | }, 816 | "file_extension": ".py", 817 | "mimetype": "text/x-python", 818 | "name": "python", 819 | "nbconvert_exporter": "python", 820 | "pygments_lexer": "ipython3", 821 | "version": "3.8.12" 822 | }, 823 | "rise": { 824 | "autolaunch": true, 825 | "transition": "none" 826 | } 827 | }, 828 | "nbformat": 4, 829 | "nbformat_minor": 4 830 | } 831 | -------------------------------------------------------------------------------- /notebooks/07-Review-Day-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Review of Week 1" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "slide" 19 | } 20 | }, 21 | "source": [ 22 | "## Fundamentals" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "slideshow": { 29 | "slide_type": "slide" 30 | } 31 | }, 32 | "source": [ 33 | "### Data Types" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "tags": [] 40 | }, 41 | "source": [ 42 | "Everything in Python is an object, and every object has a type." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": { 48 | "tags": [] 49 | }, 50 | "source": [ 51 | "Let's review the most important ones." 
52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "slideshow": { 58 | "slide_type": "slide" 59 | } 60 | }, 61 | "source": [ 62 | "**Integers** – Whole Numbers" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 1, 68 | "metadata": { 69 | "tags": [] 70 | }, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "3" 76 | ] 77 | }, 78 | "execution_count": 1, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "i = 3\n", 85 | "i" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": { 91 | "slideshow": { 92 | "slide_type": "fragment" 93 | }, 94 | "tags": [] 95 | }, 96 | "source": [ 97 | "**Floats** – Decimal Numbers" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 2, 103 | "metadata": { 104 | "tags": [] 105 | }, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "3.4" 111 | ] 112 | }, 113 | "execution_count": 2, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "f = 3.4\n", 120 | "f" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "slideshow": { 127 | "slide_type": "fragment" 128 | }, 129 | "tags": [] 130 | }, 131 | "source": [ 132 | "**Strings** – Bits of Text" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 3, 138 | "metadata": { 139 | "tags": [] 140 | }, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "'python'" 146 | ] 147 | }, 148 | "execution_count": 3, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "s = 'python'\n", 155 | "s" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "slideshow": { 162 | "slide_type": "slide" 163 | } 164 | }, 165 | "source": [ 166 | "**Lists** – Ordered collections of other Python objects" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 4, 172 | "metadata": { 173 | "tags": [] 174 | }, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "['a', 'b', 'c']" 180 | ] 181 | }, 182 | "execution_count": 4, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "l = ['a', 'b', 'c']\n", 189 | "l" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "slideshow": { 196 | "slide_type": "fragment" 197 | }, 198 | "tags": [] 199 | }, 200 | "source": [ 201 | "**Dictionaries** – A collection of key-value pairs, which let you easily look up the value for a given key" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 5, 207 | "metadata": { 208 | "tags": [] 209 | }, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "{'a': 1, 'b': 2, 'z': 26}" 215 | ] 216 | }, 217 | "execution_count": 5, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "d = {'a': 1,\n", 224 | " 'b': 2,\n", 225 | " 'z': 26}\n", 226 | "d" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": { 232 | "slideshow": { 233 | "slide_type": "slide" 234 | } 235 | }, 236 | "source": [ 237 | "**DataFrames** - Tabular datasets. Part of the Pandas library." 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 6, 243 | "metadata": { 244 | "tags": [] 245 | }, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/html": [ 250 | "
\n", 251 | "\n", 264 | "\n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | "
xy
012
134
\n", 285 | "
" 286 | ], 287 | "text/plain": [ 288 | " x y\n", 289 | "0 1 2\n", 290 | "1 3 4" 291 | ] 292 | }, 293 | "execution_count": 6, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "import pandas as pd\n", 300 | "df = pd.DataFrame([(1, 2), (3, 4)], columns=['x', 'y'])\n", 301 | "df" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": { 307 | "slideshow": { 308 | "slide_type": "slide" 309 | } 310 | }, 311 | "source": [ 312 | "### The `type` Function" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": { 318 | "slideshow": { 319 | "slide_type": "fragment" 320 | } 321 | }, 322 | "source": [ 323 | "You can use the `type` function to determine the type of an object." 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 7, 329 | "metadata": { 330 | "slideshow": { 331 | "slide_type": "slide" 332 | } 333 | }, 334 | "outputs": [ 335 | { 336 | "data": { 337 | "text/plain": [ 338 | "list" 339 | ] 340 | }, 341 | "execution_count": 7, 342 | "metadata": {}, 343 | "output_type": "execute_result" 344 | } 345 | ], 346 | "source": [ 347 | "x = [1, 2, 3]\n", 348 | "type(x)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 8, 354 | "metadata": { 355 | "tags": [] 356 | }, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "str" 362 | ] 363 | }, 364 | "execution_count": 8, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "x = 'hello'\n", 371 | "type(x)" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": { 377 | "slideshow": { 378 | "slide_type": "slide" 379 | } 380 | }, 381 | "source": [ 382 | "## Packages, Modules, and Functions" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": { 388 | "slideshow": { 389 | "slide_type": "slide" 390 | } 391 | }, 392 | "source": [ 393 | "### Packages" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": { 399 | "slideshow": { 400 | "slide_type": "fragment" 401 | } 402 | }, 403 | "source": [ 404 | "*Packages* (generally synonymous with *modules* or *libraries*) are extensions for Python featuring useful code." 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": { 410 | "slideshow": { 411 | "slide_type": "fragment" 412 | } 413 | }, 414 | "source": [ 415 | "Some are included in every Python install (*\"standard library\"*), while others (like Pandas, matplotlib, and more) need to be installed separately (*\"third party packages\"*)." 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": { 421 | "slideshow": { 422 | "slide_type": "fragment" 423 | } 424 | }, 425 | "source": [ 426 | "The DataFrame type, a staple of data science, comes in the Pandas package." 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "slideshow": { 433 | "slide_type": "slide" 434 | } 435 | }, 436 | "source": [ 437 | "### Functions" 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": { 443 | "slideshow": { 444 | "slide_type": "fragment" 445 | } 446 | }, 447 | "source": [ 448 | "*Functions* are executable Python code stored in a name, just like a regular variable." 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": { 454 | "slideshow": { 455 | "slide_type": "fragment" 456 | } 457 | }, 458 | "source": [ 459 | "You can call a function by putting parentheses after its name, and optionally including *arguments* to it (e.g. 
`myfunction(argument_1, argument_2)`)." 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": { 465 | "slideshow": { 466 | "slide_type": "fragment" 467 | } 468 | }, 469 | "source": [ 470 | "Well-named functions can help to simplify your code and make it much more readable." 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": { 476 | "slideshow": { 477 | "slide_type": "slide" 478 | } 479 | }, 480 | "source": [ 481 | "### Attributes and Methods" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": { 487 | "slideshow": { 488 | "slide_type": "fragment" 489 | } 490 | }, 491 | "source": [ 492 | "Python objects (that's everything in Python, remember?) come with *attributes*, or internal information accessible through dot syntax:\n", 493 | "```python\n", 494 | "myobject.attribute\n", 495 | "```" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": { 501 | "slideshow": { 502 | "slide_type": "slide" 503 | } 504 | }, 505 | "source": [ 506 | "Attributes can be handy when you want to learn more about an object." 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 9, 512 | "metadata": { 513 | "slideshow": { 514 | "slide_type": "fragment" 515 | } 516 | }, 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/plain": [ 521 | "(2, 2)" 522 | ] 523 | }, 524 | "execution_count": 9, 525 | "metadata": {}, 526 | "output_type": "execute_result" 527 | } 528 | ], 529 | "source": [ 530 | "df.shape" 531 | ] 532 | }, 533 | { 534 | "cell_type": "markdown", 535 | "metadata": { 536 | "slideshow": { 537 | "slide_type": "slide" 538 | } 539 | }, 540 | "source": [ 541 | "Some attributes actually hold functions, in which case we call them *methods*." 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 10, 547 | "metadata": { 548 | "slideshow": { 549 | "slide_type": "fragment" 550 | } 551 | }, 552 | "outputs": [ 553 | { 554 | "data": { 555 | "text/html": [ 556 | "
\n", 557 | "\n", 570 | "\n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | "
xy
count2.0000002.000000
mean2.0000003.000000
std1.4142141.414214
min1.0000002.000000
25%1.5000002.500000
50%2.0000003.000000
75%2.5000003.500000
max3.0000004.000000
\n", 621 | "
" 622 | ], 623 | "text/plain": [ 624 | " x y\n", 625 | "count 2.000000 2.000000\n", 626 | "mean 2.000000 3.000000\n", 627 | "std 1.414214 1.414214\n", 628 | "min 1.000000 2.000000\n", 629 | "25% 1.500000 2.500000\n", 630 | "50% 2.000000 3.000000\n", 631 | "75% 2.500000 3.500000\n", 632 | "max 3.000000 4.000000" 633 | ] 634 | }, 635 | "execution_count": 10, 636 | "metadata": {}, 637 | "output_type": "execute_result" 638 | } 639 | ], 640 | "source": [ 641 | "df.describe()" 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": { 647 | "slideshow": { 648 | "slide_type": "slide" 649 | } 650 | }, 651 | "source": [ 652 | "### DataFrames and Series" 653 | ] 654 | }, 655 | { 656 | "cell_type": "markdown", 657 | "metadata": { 658 | "slideshow": { 659 | "slide_type": "fragment" 660 | } 661 | }, 662 | "source": [ 663 | "When you extract individual rows or columns of DataFrames, you get a 1-dimensional dataset called a *Series*." 664 | ] 665 | }, 666 | { 667 | "cell_type": "markdown", 668 | "metadata": { 669 | "slideshow": { 670 | "slide_type": "fragment" 671 | } 672 | }, 673 | "source": [ 674 | "Series look like lists but their data must be all of the same type, and they provide similar (though subtly different) functionality to DataFrames." 675 | ] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "metadata": { 680 | "slideshow": { 681 | "slide_type": "slide" 682 | } 683 | }, 684 | "source": [ 685 | "## Importing Data" 686 | ] 687 | }, 688 | { 689 | "cell_type": "markdown", 690 | "metadata": { 691 | "slideshow": { 692 | "slide_type": "fragment" 693 | } 694 | }, 695 | "source": [ 696 | "Importing data is the process of taking data *on disk* and moving it into *memory*, where Python can do its work." 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": { 702 | "slideshow": { 703 | "slide_type": "slide" 704 | } 705 | }, 706 | "source": [ 707 | "Reading CSVs will likely be one of the most common ways you import data." 708 | ] 709 | }, 710 | { 711 | "cell_type": "markdown", 712 | "metadata": { 713 | "slideshow": { 714 | "slide_type": "fragment" 715 | } 716 | }, 717 | "source": [ 718 | "To do so, use Pandas' `read_csv` function, passing the name of your file as an argument." 719 | ] 720 | }, 721 | { 722 | "cell_type": "markdown", 723 | "metadata": { 724 | "tags": [] 725 | }, 726 | "source": [ 727 | "```python\n", 728 | "import pandas as pd\n", 729 | "data = pd.read_csv('myfile.csv')\n", 730 | "```" 731 | ] 732 | }, 733 | { 734 | "cell_type": "markdown", 735 | "metadata": { 736 | "slideshow": { 737 | "slide_type": "slide" 738 | } 739 | }, 740 | "source": [ 741 | "Though they are less common in data science, JSON and pickle files may come up in your work as well." 742 | ] 743 | }, 744 | { 745 | "cell_type": "markdown", 746 | "metadata": { 747 | "slideshow": { 748 | "slide_type": "fragment" 749 | } 750 | }, 751 | "source": [ 752 | "These are slightly more complicated to import, but it's still very doable." 
753 | ] 754 | }, 755 | { 756 | "cell_type": "markdown", 757 | "metadata": { 758 | "slideshow": { 759 | "slide_type": "slide" 760 | } 761 | }, 762 | "source": [ 763 | "JSON:" 764 | ] 765 | }, 766 | { 767 | "cell_type": "markdown", 768 | "metadata": { 769 | "tags": [] 770 | }, 771 | "source": [ 772 | "```python\n", 773 | "import json\n", 774 | "with open('myfile.json', 'r') as f:\n", 775 | " data = json.load(f)\n", 776 | "```" 777 | ] 778 | }, 779 | { 780 | "cell_type": "markdown", 781 | "metadata": { 782 | "tags": [] 783 | }, 784 | "source": [ 785 | "Pickle:" 786 | ] 787 | }, 788 | { 789 | "cell_type": "markdown", 790 | "metadata": { 791 | "tags": [] 792 | }, 793 | "source": [ 794 | "```python\n", 795 | "import pickle\n", 796 | "with open('myfile.pickle', 'rb') as f:\n", 797 | " data = pickle.load(f)\n", 798 | "```" 799 | ] 800 | }, 801 | { 802 | "cell_type": "markdown", 803 | "metadata": { 804 | "slideshow": { 805 | "slide_type": "slide" 806 | } 807 | }, 808 | "source": [ 809 | "## Subsetting and Filtering" 810 | ] 811 | }, 812 | { 813 | "cell_type": "markdown", 814 | "metadata": { 815 | "slideshow": { 816 | "slide_type": "fragment" 817 | } 818 | }, 819 | "source": [ 820 | "There are three primary ways of subsetting data:" 821 | ] 822 | }, 823 | { 824 | "cell_type": "markdown", 825 | "metadata": { 826 | "slideshow": { 827 | "slide_type": "fragment" 828 | } 829 | }, 830 | "source": [ 831 | "- **Selecting** - Including certain *columns* of the data while excluding others" 832 | ] 833 | }, 834 | { 835 | "cell_type": "markdown", 836 | "metadata": { 837 | "slideshow": { 838 | "slide_type": "fragment" 839 | } 840 | }, 841 | "source": [ 842 | "- **Slicing** - Including only certain *rows* based on their position (index) in the DataFrame " 843 | ] 844 | }, 845 | { 846 | "cell_type": "markdown", 847 | "metadata": { 848 | "slideshow": { 849 | "slide_type": "fragment" 850 | } 851 | }, 852 | "source": [ 853 | "- **Filtering** - Including only certain *rows* with data that meets some criterion" 854 | ] 855 | }, 856 | { 857 | "cell_type": "markdown", 858 | "metadata": { 859 | "slideshow": { 860 | "slide_type": "slide" 861 | } 862 | }, 863 | "source": [ 864 | "### Selecting" 865 | ] 866 | }, 867 | { 868 | "cell_type": "markdown", 869 | "metadata": { 870 | "tags": [] 871 | }, 872 | "source": [ 873 | "Selection is done with brackets.\n", 874 | "Pass a single column name (as a string) or a list of column names." 875 | ] 876 | }, 877 | { 878 | "cell_type": "markdown", 879 | "metadata": { 880 | "tags": [] 881 | }, 882 | "source": [ 883 | "```python\n", 884 | "# The column \"mycolumn\", as a Series\n", 885 | "df['mycolumn']\n", 886 | "\n", 887 | "# The columns \"column1\" and \"column2\" as a DataFrame \n", 888 | "df[['column_1', 'column_2']]\n", 889 | "```" 890 | ] 891 | }, 892 | { 893 | "cell_type": "markdown", 894 | "metadata": { 895 | "slideshow": { 896 | "slide_type": "fragment" 897 | } 898 | }, 899 | "source": [ 900 | "
\n", 901 | "

Note

\n", 902 | "

If you pass a list, the returned value will be a DataFrame.\n", 903 | "If you pass a single column name, it will be a Series.

\n", 904 | "
" 905 | ] 906 | }, 907 | { 908 | "cell_type": "markdown", 909 | "metadata": { 910 | "slideshow": { 911 | "slide_type": "slide" 912 | } 913 | }, 914 | "source": [ 915 | "### Slicing" 916 | ] 917 | }, 918 | { 919 | "cell_type": "markdown", 920 | "metadata": { 921 | "tags": [] 922 | }, 923 | "source": [ 924 | "Slicing is typically done with the `.loc` accessor and brackets.\n", 925 | "Pass in a row index or a range of row indices." 926 | ] 927 | }, 928 | { 929 | "cell_type": "markdown", 930 | "metadata": { 931 | "tags": [] 932 | }, 933 | "source": [ 934 | "```python\n", 935 | "# The fifth (zero-indexing!) row, as a Series\n", 936 | "df.loc[4]\n", 937 | "\n", 938 | "# The second, third, and fourth rows, as a DataFrame\n", 939 | "df.loc[1:3]\n", 940 | "```" 941 | ] 942 | }, 943 | { 944 | "cell_type": "markdown", 945 | "metadata": { 946 | "slideshow": { 947 | "slide_type": "fragment" 948 | } 949 | }, 950 | "source": [ 951 | "
\n", 952 | "

Note

\n", 953 | "

If you pass a range of indices, the returned value will be a DataFrame. Otherwise it will be a Series.

\n", 954 | "
" 955 | ] 956 | }, 957 | { 958 | "cell_type": "markdown", 959 | "metadata": { 960 | "slideshow": { 961 | "slide_type": "slide" 962 | } 963 | }, 964 | "source": [ 965 | "### Filtering" 966 | ] 967 | }, 968 | { 969 | "cell_type": "markdown", 970 | "metadata": { 971 | "tags": [] 972 | }, 973 | "source": [ 974 | "DataFrames can be filtered by passing a *condition* in brackets." 975 | ] 976 | }, 977 | { 978 | "cell_type": "markdown", 979 | "metadata": { 980 | "tags": [] 981 | }, 982 | "source": [ 983 | "```python\n", 984 | "# Keep rows where `condition` is true\n", 985 | "df[condition]\n", 986 | "```" 987 | ] 988 | }, 989 | { 990 | "cell_type": "markdown", 991 | "metadata": { 992 | "slideshow": { 993 | "slide_type": "slide" 994 | } 995 | }, 996 | "source": [ 997 | "Conditions are things like tests of equality, assertions that one value is greater than another, etc." 998 | ] 999 | }, 1000 | { 1001 | "cell_type": "markdown", 1002 | "metadata": { 1003 | "slideshow": { 1004 | "slide_type": "fragment" 1005 | } 1006 | }, 1007 | "source": [ 1008 | "```python\n", 1009 | "# Keep rows where the value in \"mycolumn\" is equal to 5\n", 1010 | "df[df['mycolumn'] == 5]\n", 1011 | "```" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "markdown", 1016 | "metadata": { 1017 | "slideshow": { 1018 | "slide_type": "fragment" 1019 | } 1020 | }, 1021 | "source": [ 1022 | "```python\n", 1023 | "# Keep rows where mycolumn is less than 3 OR greater than 10\n", 1024 | "df[ (df['mycolumn'] < 3) | (df['mycolumn'] > 10) ]\n", 1025 | "```" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "markdown", 1030 | "metadata": { 1031 | "slideshow": { 1032 | "slide_type": "slide" 1033 | } 1034 | }, 1035 | "source": [ 1036 | "### Selecting and Filtering Together" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "markdown", 1041 | "metadata": { 1042 | "tags": [] 1043 | }, 1044 | "source": [ 1045 | "Using `.loc`, it's possible to do selecting and filtering all in one step." 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "markdown", 1050 | "metadata": { 1051 | "tags": [] 1052 | }, 1053 | "source": [ 1054 | "```python\n", 1055 | "# Filter down to rows where column_a is equal to 5,\n", 1056 | "# and select column_b and column_c from those rows\n", 1057 | "df.loc[df['column_a'] == 5, ['column_b', 'column_c']]\n", 1058 | "```" 1059 | ] 1060 | }, 1061 | { 1062 | "cell_type": "markdown", 1063 | "metadata": { 1064 | "slideshow": { 1065 | "slide_type": "slide" 1066 | } 1067 | }, 1068 | "source": [ 1069 | "## Manipulating Columns" 1070 | ] 1071 | }, 1072 | { 1073 | "cell_type": "markdown", 1074 | "metadata": { 1075 | "slideshow": { 1076 | "slide_type": "slide" 1077 | } 1078 | }, 1079 | "source": [ 1080 | "### Numeric Calculations" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "markdown", 1085 | "metadata": { 1086 | "tags": [] 1087 | }, 1088 | "source": [ 1089 | "It's possible to perform calculations using columns." 
1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "markdown", 1094 | "metadata": { 1095 | "slideshow": { 1096 | "slide_type": "fragment" 1097 | } 1098 | }, 1099 | "source": [ 1100 | "```python\n", 1101 | "df['mycolumn'] + 7\n", 1102 | "```" 1103 | ] 1104 | }, 1105 | { 1106 | "cell_type": "markdown", 1107 | "metadata": { 1108 | "slideshow": { 1109 | "slide_type": "fragment" 1110 | } 1111 | }, 1112 | "source": [ 1113 | "```python\n", 1114 | "df['mycolumn'] * 4 - 3\n", 1115 | "```" 1116 | ] 1117 | }, 1118 | { 1119 | "cell_type": "markdown", 1120 | "metadata": { 1121 | "slideshow": { 1122 | "slide_type": "slide" 1123 | } 1124 | }, 1125 | "source": [ 1126 | "It's also possible to perform calculations based on values in multiple columns." 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "markdown", 1131 | "metadata": { 1132 | "slideshow": { 1133 | "slide_type": "fragment" 1134 | } 1135 | }, 1136 | "source": [ 1137 | "```python\n", 1138 | "df['column_a'] / df['column_b']\n", 1139 | "```" 1140 | ] 1141 | }, 1142 | { 1143 | "cell_type": "markdown", 1144 | "metadata": { 1145 | "slideshow": { 1146 | "slide_type": "slide" 1147 | } 1148 | }, 1149 | "source": [ 1150 | "Generally you'll want to save the calculated values in a new column, which you can do with sensible assignment syntax." 1151 | ] 1152 | }, 1153 | { 1154 | "cell_type": "markdown", 1155 | "metadata": { 1156 | "slideshow": { 1157 | "slide_type": "fragment" 1158 | } 1159 | }, 1160 | "source": [ 1161 | "```python\n", 1162 | "df['e'] = df['m'] * (df['c'] ** 2)\n", 1163 | "```" 1164 | ] 1165 | }, 1166 | { 1167 | "cell_type": "markdown", 1168 | "metadata": { 1169 | "slideshow": { 1170 | "slide_type": "slide" 1171 | } 1172 | }, 1173 | "source": [ 1174 | "### String Manipulations" 1175 | ] 1176 | }, 1177 | { 1178 | "cell_type": "markdown", 1179 | "metadata": { 1180 | "tags": [] 1181 | }, 1182 | "source": [ 1183 | "Lots of string functionality can be found within the `.str` accessor." 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "markdown", 1188 | "metadata": { 1189 | "tags": [] 1190 | }, 1191 | "source": [ 1192 | "```python\n", 1193 | "# Convert the strings in mycolumn to all caps\n", 1194 | "df['mycolumn'].str.upper()\n", 1195 | "```" 1196 | ] 1197 | }, 1198 | { 1199 | "cell_type": "markdown", 1200 | "metadata": { 1201 | "slideshow": { 1202 | "slide_type": "slide" 1203 | } 1204 | }, 1205 | "source": [ 1206 | "### Mapping Values" 1207 | ] 1208 | }, 1209 | { 1210 | "cell_type": "markdown", 1211 | "metadata": { 1212 | "tags": [] 1213 | }, 1214 | "source": [ 1215 | "In some cases you may need to convert some values to other values." 1216 | ] 1217 | }, 1218 | { 1219 | "cell_type": "markdown", 1220 | "metadata": { 1221 | "slideshow": { 1222 | "slide_type": "fragment" 1223 | } 1224 | }, 1225 | "source": [ 1226 | "This is a good case for the `.map` method of Series." 1227 | ] 1228 | }, 1229 | { 1230 | "cell_type": "markdown", 1231 | "metadata": { 1232 | "slideshow": { 1233 | "slide_type": "fragment" 1234 | } 1235 | }, 1236 | "source": [ 1237 | "Pass in a dictionary whose keys are the elements to be converted and whose values are the desired new values." 1238 | ] 1239 | }, 1240 | { 1241 | "cell_type": "code", 1242 | "execution_count": 11, 1243 | "metadata": { 1244 | "slideshow": { 1245 | "slide_type": "slide" 1246 | } 1247 | }, 1248 | "outputs": [ 1249 | { 1250 | "data": { 1251 | "text/html": [ 1252 | "
\n", 1253 | "\n", 1266 | "\n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | "
xy
012
134
\n", 1287 | "
" 1288 | ], 1289 | "text/plain": [ 1290 | " x y\n", 1291 | "0 1 2\n", 1292 | "1 3 4" 1293 | ] 1294 | }, 1295 | "execution_count": 11, 1296 | "metadata": {}, 1297 | "output_type": "execute_result" 1298 | } 1299 | ], 1300 | "source": [ 1301 | "df" 1302 | ] 1303 | }, 1304 | { 1305 | "cell_type": "code", 1306 | "execution_count": 12, 1307 | "metadata": { 1308 | "tags": [] 1309 | }, 1310 | "outputs": [ 1311 | { 1312 | "data": { 1313 | "text/html": [ 1314 | "
\n", 1315 | "\n", 1328 | "\n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | "
xy
0112
1334
\n", 1349 | "
" 1350 | ], 1351 | "text/plain": [ 1352 | " x y\n", 1353 | "0 11 2\n", 1354 | "1 33 4" 1355 | ] 1356 | }, 1357 | "execution_count": 12, 1358 | "metadata": {}, 1359 | "output_type": "execute_result" 1360 | } 1361 | ], 1362 | "source": [ 1363 | "df['x'] = df['x'].map({1: 11, 3: 33})\n", 1364 | "df" 1365 | ] 1366 | }, 1367 | { 1368 | "cell_type": "markdown", 1369 | "metadata": { 1370 | "slideshow": { 1371 | "slide_type": "slide" 1372 | } 1373 | }, 1374 | "source": [ 1375 | "## Practice\n", 1376 | "\n", 1377 | "1. Load the weather data (`weather.csv`) from the data folder of our repository. Store it in a variable called `weather`.\n", 1378 | "2. Keep only the rows that have precipitation (i.e. `precip > 0`).\n", 1379 | "3. Create a new column, \"air_hazard_rating\", that is `wind_speed / 2 + visib`.\n", 1380 | "4. Keep only the \"origin\" and \"time\" columns." 1381 | ] 1382 | }, 1383 | { 1384 | "cell_type": "markdown", 1385 | "metadata": { 1386 | "slideshow": { 1387 | "slide_type": "slide" 1388 | } 1389 | }, 1390 | "source": [ 1391 | "# Questions\n", 1392 | "\n", 1393 | "Are there any questions before we move on?" 1394 | ] 1395 | } 1396 | ], 1397 | "metadata": { 1398 | "kernelspec": { 1399 | "display_name": "Python 3 (ipykernel)", 1400 | "language": "python", 1401 | "name": "python3" 1402 | }, 1403 | "language_info": { 1404 | "codemirror_mode": { 1405 | "name": "ipython", 1406 | "version": 3 1407 | }, 1408 | "file_extension": ".py", 1409 | "mimetype": "text/x-python", 1410 | "name": "python", 1411 | "nbconvert_exporter": "python", 1412 | "pygments_lexer": "ipython3", 1413 | "version": "3.11.4" 1414 | }, 1415 | "rise": { 1416 | "autolaunch": true, 1417 | "transition": "none" 1418 | } 1419 | }, 1420 | "nbformat": 4, 1421 | "nbformat_minor": 4 1422 | } 1423 | -------------------------------------------------------------------------------- /notebooks/09-Summarizing-Grouped-Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Summarizing Grouped Data" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "slide" 19 | } 20 | }, 21 | "source": [ 22 | "## Applied Review" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "slideshow": { 29 | "slide_type": "slide" 30 | } 31 | }, 32 | "source": [ 33 | "### DataFrame Structure" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "tags": [] 40 | }, 41 | "source": [ 42 | "* We will start by importing the `planes` data set as a DataFrame:" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 1, 48 | "metadata": { 49 | "slideshow": { 50 | "slide_type": "-" 51 | } 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "import pandas as pd\n", 56 | "planes_df = pd.read_csv('../data/planes.csv')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": { 62 | "tags": [] 63 | }, 64 | "source": [ 65 | "* Each DataFrame variable is a **Series** and can be accessed with bracket subsetting notation: \n", 66 | "\n", 67 | "```python \n", 68 | "DataFrame['SeriesName']\n", 69 | "```" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "tags": [] 76 | }, 77 | "source": [ 78 | "* The DataFrame has an **Index** that is visible the far left side" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | 
"slideshow": { 85 | "slide_type": "slide" 86 | } 87 | }, 88 | "source": [ 89 | "### Summary Operations" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "cell_style": "split", 96 | "tags": [] 97 | }, 98 | "source": [ 99 | "* Summary operations occur when we collapse a Series or DataFrame down to a single row" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 | "cell_style": "split", 106 | "tags": [] 107 | }, 108 | "source": [ 109 | "* This is an aggregation of a variable across its rows" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "cell_style": "center", 116 | "tags": [] 117 | }, 118 | "source": [ 119 | "
\n", 120 | "\"aggregate-series.png\"\n", 121 | "
" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "slideshow": { 128 | "slide_type": "slide" 129 | } 130 | }, 131 | "source": [ 132 | "### Summarizing Data Frames" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": { 138 | "tags": [] 139 | }, 140 | "source": [ 141 | "* We can perform summary operations on DataFrames in a number of ways:\n", 142 | " * Summary methods for a specific summary operation: \n", 143 | " ```python \n", 144 | " DataFrame.sum()\n", 145 | " ```\n", 146 | " * Describe method for a collection of summary operations: \n", 147 | " ```python\n", 148 | " DataFrame.describe()\n", 149 | " ```\n", 150 | " * Agg method for flexibility in summary operations: \n", 151 | " ```python\n", 152 | " DataFrame.agg({'VariableName': ['sum', 'mean']})\n", 153 | " ```" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "slideshow": { 160 | "slide_type": "slide" 161 | } 162 | }, 163 | "source": [ 164 | "* An example of the agg method:" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 2, 170 | "metadata": { 171 | "scrolled": true, 172 | "slideshow": { 173 | "slide_type": "-" 174 | } 175 | }, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/html": [ 180 | "
\n", 181 | "\n", 194 | "\n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | "
yearseats
mean2000.48401154.316376
median2001.00000NaN
maxNaN450.000000
\n", 220 | "
" 221 | ], 222 | "text/plain": [ 223 | " year seats\n", 224 | "mean 2000.48401 154.316376\n", 225 | "median 2001.00000 NaN\n", 226 | "max NaN 450.000000" 227 | ] 228 | }, 229 | "execution_count": 2, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "planes_df.agg({\n", 236 | " 'year': ['mean', 'median'],\n", 237 | " 'seats': ['mean', 'max']\n", 238 | "})" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": { 244 | "slideshow": { 245 | "slide_type": "fragment" 246 | } 247 | }, 248 | "source": [ 249 | "
\n", 250 | "

Note

\n", 251 | "

We will primarily use the .agg() method moving forward.

\n", 252 | "
" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": { 258 | "slideshow": { 259 | "slide_type": "slide" 260 | } 261 | }, 262 | "source": [ 263 | "## General Model" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": { 269 | "slideshow": { 270 | "slide_type": "slide" 271 | } 272 | }, 273 | "source": [ 274 | "### Variable Groups" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": { 280 | "cell_style": "split", 281 | "tags": [] 282 | }, 283 | "source": [ 284 | "* We can group DataFrame rows together by the value in a Series/variable\n", 285 | "* If we \"group by A\", then rows with the same value in variable A are in the same group" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": { 291 | "cell_style": "split", 292 | "tags": [] 293 | }, 294 | "source": [ 295 | "" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": { 301 | "cell_style": "split", 302 | "slideshow": { 303 | "slide_type": "slide" 304 | } 305 | }, 306 | "source": [ 307 | "* Note that groups do not need to be ordered by their values:" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": { 313 | "cell_style": "split", 314 | "slideshow": { 315 | "slide_type": "-" 316 | } 317 | }, 318 | "source": [ 319 | "" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": { 325 | "slideshow": { 326 | "slide_type": "slide" 327 | } 328 | }, 329 | "source": [ 330 | "
\n", 331 | "

Question

\n", 332 | "

Why might we be interested in grouping by a variable?

\n", 333 | "
" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": { 339 | "slideshow": { 340 | "slide_type": "slide" 341 | }, 342 | "tags": [] 343 | }, 344 | "source": [ 345 | "### Summarizing by Groups" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": { 351 | "tags": [] 352 | }, 353 | "source": [ 354 | "* When we've talked about **summary** operations, we've talked about collapsing a DataFrame to a single row" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": { 360 | "tags": [] 361 | }, 362 | "source": [ 363 | "* This is not always the case -- we sometimes collapse to a *single row per group*" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": { 369 | "tags": [] 370 | }, 371 | "source": [ 372 | "* This is known as a grouped aggregation:" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": { 378 | "slideshow": { 379 | "slide_type": "slide" 380 | } 381 | }, 382 | "source": [ 383 | "![summarizing-by-groups.png](images/summarizing-by-groups.png)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": { 389 | "slideshow": { 390 | "slide_type": "slide" 391 | } 392 | }, 393 | "source": [ 394 | "* This can be useful when we want to aggregate by cateogory:\n", 395 | " * Maximum temperature *by month*\n", 396 | " * Total home runs *by team*\n", 397 | " * Total sales *by geography*\n", 398 | " * Average number of seats by plane manufacturer" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": { 404 | "slideshow": { 405 | "slide_type": "fragment" 406 | }, 407 | "tags": [] 408 | }, 409 | "source": [ 410 | "
\n", 411 | "

Question

\n", 412 | "

What are common grouped aggregation metrics used in your industry/organization?

\n", 413 | "
" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": { 419 | "slideshow": { 420 | "slide_type": "slide" 421 | } 422 | }, 423 | "source": [ 424 | "## Summarizing Grouped Data" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": { 430 | "tags": [] 431 | }, 432 | "source": [ 433 | "* When we summarize by groups, we can use the same aggregation methods we previously did\n", 434 | " * Summary methods for a specific summary operation: \n", 435 | " ```python\n", 436 | " DataFrame.sum()\n", 437 | " ```\n", 438 | " * Describe method for a collection of summary operations: \n", 439 | " ```python\n", 440 | " DataFrame.describe()\n", 441 | " ```\n", 442 | " * Agg method for flexibility in summary operations: \n", 443 | " ```python\n", 444 | " DataFrame.agg({'VariableName': ['sum', 'mean']})\n", 445 | " ```" 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": { 451 | "slideshow": { 452 | "slide_type": "slide" 453 | } 454 | }, 455 | "source": [ 456 | "* The only difference is the need to **set the DataFrame group prior to aggregating**" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": { 462 | "slideshow": { 463 | "slide_type": "slide" 464 | } 465 | }, 466 | "source": [ 467 | "### Setting the DataFrame Group" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": { 473 | "tags": [] 474 | }, 475 | "source": [ 476 | "* We can set the DataFrame group by calling the `DataFrame.groupby()` method and passing a variable name:" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 3, 482 | "metadata": { 483 | "slideshow": { 484 | "slide_type": "-" 485 | } 486 | }, 487 | "outputs": [ 488 | { 489 | "data": { 490 | "text/plain": [ 491 | "" 492 | ] 493 | }, 494 | "execution_count": 3, 495 | "metadata": {}, 496 | "output_type": "execute_result" 497 | } 498 | ], 499 | "source": [ 500 | "planes_df.groupby('model')" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": { 506 | "slideshow": { 507 | "slide_type": "fragment" 508 | } 509 | }, 510 | "source": [ 511 | "* Notice that a DataFrame doesn't print when it's grouped" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": { 517 | "slideshow": { 518 | "slide_type": "fragment" 519 | } 520 | }, 521 | "source": [ 522 | "* The `groupby()` method is just setting the group - you can see the changed DataFrame class:" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 4, 528 | "metadata": { 529 | "slideshow": { 530 | "slide_type": "-" 531 | } 532 | }, 533 | "outputs": [ 534 | { 535 | "data": { 536 | "text/plain": [ 537 | "pandas.core.groupby.generic.DataFrameGroupBy" 538 | ] 539 | }, 540 | "execution_count": 4, 541 | "metadata": {}, 542 | "output_type": "execute_result" 543 | } 544 | ], 545 | "source": [ 546 | "type(planes_df.groupby('manufacturer'))" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": { 552 | "slideshow": { 553 | "slide_type": "slide" 554 | } 555 | }, 556 | "source": [ 557 | "* If we then call an aggregation method, we will see the DataFrame returned with the aggregated results:" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 5, 563 | "metadata": { 564 | "slideshow": { 565 | "slide_type": "-" 566 | } 567 | }, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/html": [ 572 | "
\n", 573 | "\n", 590 | "\n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | "
seats
meanmax
manufacturer
AGUSTA SPA8.0000008
AIRBUS221.202381379
AIRBUS INDUSTRIE187.402500379
AMERICAN AIRCRAFT INC2.0000002
AVIAT AIRCRAFT INC2.0000002
\n", 635 | "
" 636 | ], 637 | "text/plain": [ 638 | " seats \n", 639 | " mean max\n", 640 | "manufacturer \n", 641 | "AGUSTA SPA 8.000000 8\n", 642 | "AIRBUS 221.202381 379\n", 643 | "AIRBUS INDUSTRIE 187.402500 379\n", 644 | "AMERICAN AIRCRAFT INC 2.000000 2\n", 645 | "AVIAT AIRCRAFT INC 2.000000 2" 646 | ] 647 | }, 648 | "execution_count": 5, 649 | "metadata": {}, 650 | "output_type": "execute_result" 651 | } 652 | ], 653 | "source": [ 654 | "(\n", 655 | " planes_df.groupby('manufacturer')\n", 656 | " .agg({'seats': ['mean', 'max']}).head()\n", 657 | ")" 658 | ] 659 | }, 660 | { 661 | "cell_type": "markdown", 662 | "metadata": { 663 | "slideshow": { 664 | "slide_type": "slide" 665 | } 666 | }, 667 | "source": [ 668 | "* This process always follows this model:\n", 669 | "\n", 670 | "![model-for-grouped-aggs.png](images/model-for-grouped-aggs.png)" 671 | ] 672 | }, 673 | { 674 | "cell_type": "markdown", 675 | "metadata": { 676 | "slideshow": { 677 | "slide_type": "slide" 678 | } 679 | }, 680 | "source": [ 681 | "* **Notice that the grouped variable becomes the Index in our example!**" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 6, 687 | "metadata": { 688 | "slideshow": { 689 | "slide_type": "-" 690 | } 691 | }, 692 | "outputs": [ 693 | { 694 | "data": { 695 | "text/html": [ 696 | "
\n", 697 | "\n", 714 | "\n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | "
seats
meanmax
manufacturer
AGUSTA SPA8.0000008
AIRBUS221.202381379
AIRBUS INDUSTRIE187.402500379
AMERICAN AIRCRAFT INC2.0000002
AVIAT AIRCRAFT INC2.0000002
\n", 759 | "
" 760 | ], 761 | "text/plain": [ 762 | " seats \n", 763 | " mean max\n", 764 | "manufacturer \n", 765 | "AGUSTA SPA 8.000000 8\n", 766 | "AIRBUS 221.202381 379\n", 767 | "AIRBUS INDUSTRIE 187.402500 379\n", 768 | "AMERICAN AIRCRAFT INC 2.000000 2\n", 769 | "AVIAT AIRCRAFT INC 2.000000 2" 770 | ] 771 | }, 772 | "execution_count": 6, 773 | "metadata": {}, 774 | "output_type": "execute_result" 775 | } 776 | ], 777 | "source": [ 778 | "(\n", 779 | " planes_df.groupby('manufacturer')\n", 780 | " .agg({'seats': ['mean', 'max']}).head()\n", 781 | ")" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": 7, 787 | "metadata": { 788 | "slideshow": { 789 | "slide_type": "slide" 790 | } 791 | }, 792 | "outputs": [ 793 | { 794 | "data": { 795 | "text/plain": [ 796 | "Index(['AGUSTA SPA', 'AIRBUS', 'AIRBUS INDUSTRIE', 'AMERICAN AIRCRAFT INC',\n", 797 | " 'AVIAT AIRCRAFT INC', 'AVIONS MARCEL DASSAULT', 'BARKER JACK L',\n", 798 | " 'BEECH', 'BELL', 'BOEING', 'BOMBARDIER INC', 'CANADAIR', 'CANADAIR LTD',\n", 799 | " 'CESSNA', 'CIRRUS DESIGN CORP', 'DEHAVILLAND', 'DOUGLAS', 'EMBRAER',\n", 800 | " 'FRIEDEMANN JON', 'GULFSTREAM AEROSPACE', 'HURLEY JAMES LARRY',\n", 801 | " 'JOHN G HESS', 'KILDALL GARY', 'LAMBERT RICHARD', 'LEARJET INC',\n", 802 | " 'LEBLANC GLENN T', 'MARZ BARRY', 'MCDONNELL DOUGLAS',\n", 803 | " 'MCDONNELL DOUGLAS AIRCRAFT CO', 'MCDONNELL DOUGLAS CORPORATION',\n", 804 | " 'PAIR MIKE E', 'PIPER', 'ROBINSON HELICOPTER CO', 'SIKORSKY',\n", 805 | " 'STEWART MACO'],\n", 806 | " dtype='object', name='manufacturer')" 807 | ] 808 | }, 809 | "execution_count": 7, 810 | "metadata": {}, 811 | "output_type": "execute_result" 812 | } 813 | ], 814 | "source": [ 815 | "(\n", 816 | " planes_df.groupby('manufacturer')\n", 817 | " .agg({'seats': ['mean', 'max']}).index\n", 818 | ")" 819 | ] 820 | }, 821 | { 822 | "cell_type": "markdown", 823 | "metadata": { 824 | "slideshow": { 825 | "slide_type": "slide" 826 | } 827 | }, 828 | "source": [ 829 | "### Groups as Indexes" 830 | ] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": { 835 | "tags": [] 836 | }, 837 | "source": [ 838 | "* This is the default behavior of `pandas`, and probably how `pandas` wants to be used" 839 | ] 840 | }, 841 | { 842 | "cell_type": "markdown", 843 | "metadata": { 844 | "tags": [] 845 | }, 846 | "source": [ 847 | "* This is the fastest way to do it, but it's a matter of less than a millisecond" 848 | ] 849 | }, 850 | { 851 | "cell_type": "markdown", 852 | "metadata": {}, 853 | "source": [ 854 | "* You aren't always going to see people group by the Index..." 855 | ] 856 | }, 857 | { 858 | "cell_type": "markdown", 859 | "metadata": { 860 | "slideshow": { 861 | "slide_type": "slide" 862 | } 863 | }, 864 | "source": [ 865 | "### Groups as Variables" 866 | ] 867 | }, 868 | { 869 | "cell_type": "markdown", 870 | "metadata": { 871 | "tags": [] 872 | }, 873 | "source": [ 874 | "* Instead of setting the group as the Index, we can set the group as a variable" 875 | ] 876 | }, 877 | { 878 | "cell_type": "markdown", 879 | "metadata": { 880 | "slideshow": { 881 | "slide_type": "fragment" 882 | } 883 | }, 884 | "source": [ 885 | "* The grouped variable can remain a Series/variable by adding the `as_index = False` parameter/argument to `groupby()`:" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": 8, 891 | "metadata": { 892 | "slideshow": { 893 | "slide_type": "-" 894 | } 895 | }, 896 | "outputs": [ 897 | { 898 | "data": { 899 | "text/html": [ 900 | "
\n", 901 | "\n", 914 | "\n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | "
manufacturerseats
meanmax
0AGUSTA SPA8.0000008
1AIRBUS221.202381379
2AIRBUS INDUSTRIE187.402500379
3AMERICAN AIRCRAFT INC2.0000002
4AVIAT AIRCRAFT INC2.0000002
\n", 961 | "
" 962 | ], 963 | "text/plain": [ 964 | " manufacturer seats \n", 965 | " mean max\n", 966 | "0 AGUSTA SPA 8.000000 8\n", 967 | "1 AIRBUS 221.202381 379\n", 968 | "2 AIRBUS INDUSTRIE 187.402500 379\n", 969 | "3 AMERICAN AIRCRAFT INC 2.000000 2\n", 970 | "4 AVIAT AIRCRAFT INC 2.000000 2" 971 | ] 972 | }, 973 | "execution_count": 8, 974 | "metadata": {}, 975 | "output_type": "execute_result" 976 | } 977 | ], 978 | "source": [ 979 | "(\n", 980 | " planes_df.groupby('manufacturer', as_index = False)\n", 981 | " .agg({'seats': ['mean', 'max']}).head()\n", 982 | ")" 983 | ] 984 | }, 985 | { 986 | "cell_type": "markdown", 987 | "metadata": { 988 | "slideshow": { 989 | "slide_type": "slide" 990 | } 991 | }, 992 | "source": [ 993 | "### Grouping by Multiple Variables" 994 | ] 995 | }, 996 | { 997 | "cell_type": "markdown", 998 | "metadata": { 999 | "tags": [] 1000 | }, 1001 | "source": [ 1002 | "* Sometimes we have multiple categories by which we'd like to group" 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "markdown", 1007 | "metadata": { 1008 | "slideshow": { 1009 | "slide_type": "fragment" 1010 | } 1011 | }, 1012 | "source": [ 1013 | "* To extend our example, assume we want to find the average number of seats by plane manufacturer AND plane year" 1014 | ] 1015 | }, 1016 | { 1017 | "cell_type": "markdown", 1018 | "metadata": { 1019 | "slideshow": { 1020 | "slide_type": "fragment" 1021 | } 1022 | }, 1023 | "source": [ 1024 | "* We can pass a list of variable names to the `groupby()` method:" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": 9, 1030 | "metadata": { 1031 | "slideshow": { 1032 | "slide_type": "-" 1033 | } 1034 | }, 1035 | "outputs": [ 1036 | { 1037 | "data": { 1038 | "text/html": [ 1039 | "
\n", 1040 | "\n", 1053 | "\n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | "
manufactureryearseats
meanmax
0AGUSTA SPA2001.08.0000008
1AIRBUS2002.0173.800000200
2AIRBUS2003.0174.966667200
3AIRBUS2004.0217.000000379
4AIRBUS2005.0197.000000379
\n", 1107 | "
" 1108 | ], 1109 | "text/plain": [ 1110 | " manufacturer year seats \n", 1111 | " mean max\n", 1112 | "0 AGUSTA SPA 2001.0 8.000000 8\n", 1113 | "1 AIRBUS 2002.0 173.800000 200\n", 1114 | "2 AIRBUS 2003.0 174.966667 200\n", 1115 | "3 AIRBUS 2004.0 217.000000 379\n", 1116 | "4 AIRBUS 2005.0 197.000000 379" 1117 | ] 1118 | }, 1119 | "execution_count": 9, 1120 | "metadata": {}, 1121 | "output_type": "execute_result" 1122 | } 1123 | ], 1124 | "source": [ 1125 | "(\n", 1126 | " planes_df.groupby(['manufacturer', 'year'], as_index = False)\n", 1127 | " .agg({'seats': ['mean', 'max']}).head()\n", 1128 | ")" 1129 | ] 1130 | }, 1131 | { 1132 | "cell_type": "markdown", 1133 | "metadata": { 1134 | "slideshow": { 1135 | "slide_type": "slide" 1136 | } 1137 | }, 1138 | "source": [ 1139 | "### Your Turn\n", 1140 | "\n", 1141 | "1\\. What is meant by \"find the minimum number of seats on a plane by year\"?\n", 1142 | "\n", 1143 | "2\\. Fix the below code to find the minimum number of seats on a plane by year:\n", 1144 | "\n", 1145 | " ```python\n", 1146 | " planes_df.groupby('_____').agg({'_____': ['min']})\n", 1147 | " ```\n", 1148 | " \n", 1149 | "3\\. What is the Index of the result?" 1150 | ] 1151 | }, 1152 | { 1153 | "cell_type": "markdown", 1154 | "metadata": { 1155 | "slideshow": { 1156 | "slide_type": "slide" 1157 | } 1158 | }, 1159 | "source": [ 1160 | "## Questions\n", 1161 | "\n", 1162 | "Are there any questions before we move on?" 1163 | ] 1164 | } 1165 | ], 1166 | "metadata": { 1167 | "celltoolbar": "Slideshow", 1168 | "kernelspec": { 1169 | "display_name": "Python 3 (ipykernel)", 1170 | "language": "python", 1171 | "name": "python3" 1172 | }, 1173 | "language_info": { 1174 | "codemirror_mode": { 1175 | "name": "ipython", 1176 | "version": 3 1177 | }, 1178 | "file_extension": ".py", 1179 | "mimetype": "text/x-python", 1180 | "name": "python", 1181 | "nbconvert_exporter": "python", 1182 | "pygments_lexer": "ipython3", 1183 | "version": "3.11.4" 1184 | }, 1185 | "rise": { 1186 | "autolaunch": true, 1187 | "transition": "none" 1188 | } 1189 | }, 1190 | "nbformat": 4, 1191 | "nbformat_minor": 4 1192 | } 1193 | -------------------------------------------------------------------------------- /notebooks/11-Exporting-Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Exporting Data" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "-" 19 | } 20 | }, 21 | "source": [ 22 | "> Data science is not effective without saving results.\n", 23 | ">\n", 24 | "> \\- Another wise person" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "slide" 32 | } 33 | }, 34 | "source": [ 35 | "## Applied Review" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "slide" 43 | } 44 | }, 45 | "source": [ 46 | "### Data in Python" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "tags": [] 53 | }, 54 | "source": [ 55 | "* Data is frequently represented inside a **DataFrame** - a class from the pandas library" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "tags": [] 62 | }, 63 | "source": [ 64 | "* Other structures exist, too - dicts, models, etc." 
65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": { 70 | "tags": [] 71 | }, 72 | "source": [ 73 | "* Data is stored in memory - this makes it relatively quickly accessible" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "tags": [] 80 | }, 81 | "source": [ 82 | "* Data is session-specific, so quitting Python (i.e shutting down JupyterLab) removes the data from memory" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": { 88 | "slideshow": { 89 | "slide_type": "slide" 90 | } 91 | }, 92 | "source": [ 93 | "### Importing Data" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": { 99 | "tags": [] 100 | }, 101 | "source": [ 102 | "* Tabular data can be imported into DataFrames using the `pd.read_csv()` function - there are parameters for different options and other `pd.read_xxx()` functions." 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "slideshow": { 109 | "slide_type": "fragment" 110 | } 111 | }, 112 | "source": [ 113 | "* Other data formats like JSON (key-value pairs) and Pickle (native Python) can be imported using the `with` statement and respective functions:\n", 114 | " * JSON files use the `load()` function from the `json` library\n", 115 | " * Pickle files use the `load()` function from the `pickle` library" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "slideshow": { 122 | "slide_type": "slide" 123 | } 124 | }, 125 | "source": [ 126 | "## General Model" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": { 132 | "slideshow": { 133 | "slide_type": "slide" 134 | } 135 | }, 136 | "source": [ 137 | "### General Framework" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": { 143 | "tags": [] 144 | }, 145 | "source": [ 146 | "A general way to conceptualize data export from Python to Disk:" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "tags": [] 153 | }, 154 | "source": [ 155 | "1. Data sits in memory in the Python session" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "tags": [] 162 | }, 163 | "source": [ 164 | "2. Python code can be used to copy the data from Python's memory to an appropriate format on disk" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "slideshow": { 171 | "slide_type": "slide" 172 | } 173 | }, 174 | "source": [ 175 | "This framework can be visualized below:\n", 176 | "\n", 177 | "
\n", 178 | "\"export-framework.png\"\n", 179 | "
" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": { 185 | "slideshow": { 186 | "slide_type": "slide" 187 | } 188 | }, 189 | "source": [ 190 | "## Exporting DataFrames" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "tags": [] 197 | }, 198 | "source": [ 199 | "Remember that DataFrames are representations of tabular data -- therefore, knowing how to export DataFrames to tabular data files is important." 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": { 205 | "slideshow": { 206 | "slide_type": "slide" 207 | } 208 | }, 209 | "source": [ 210 | "### Exporting Setup" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": { 216 | "tags": [] 217 | }, 218 | "source": [ 219 | "We need data to export." 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": { 225 | "tags": [] 226 | }, 227 | "source": [ 228 | "Let's begin by revisiting the importing of tabular data into a DataFrame:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 1, 234 | "metadata": { 235 | "slideshow": { 236 | "slide_type": "-" 237 | } 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "import pandas as pd\n", 242 | "planes_df = pd.read_csv('../data/planes.csv')" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": { 248 | "slideshow": { 249 | "slide_type": "fragment" 250 | } 251 | }, 252 | "source": [ 253 | "Next, let's do some manipulations on `planes_df`." 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "slideshow": { 260 | "slide_type": "slide" 261 | } 262 | }, 263 | "source": [ 264 | "
\n", 265 | "

Question

\n", 266 | "

How do we select the year and manufacturer variables while returning a DataFrame?

\n", 267 | "
" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 2, 273 | "metadata": { 274 | "slideshow": { 275 | "slide_type": "fragment" 276 | } 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "planes_df = planes_df[['year', 'manufacturer']]" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": { 286 | "slideshow": { 287 | "slide_type": "slide" 288 | } 289 | }, 290 | "source": [ 291 | "
\n", 292 | "

Question

\n", 293 | "

How do we compute the average year by manufacturer?

\n", 294 | "
" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 3, 300 | "metadata": { 301 | "slideshow": { 302 | "slide_type": "fragment" 303 | } 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "avg_year_by_man_df = (\n", 308 | " planes_df.groupby('manufacturer', as_index = False)\n", 309 | " .mean()\n", 310 | ")" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": { 316 | "slideshow": { 317 | "slide_type": "slide" 318 | } 319 | }, 320 | "source": [ 321 | "Let's view our result to find the manufacturers with the oldest planes:" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 4, 327 | "metadata": { 328 | "slideshow": { 329 | "slide_type": "-" 330 | } 331 | }, 332 | "outputs": [ 333 | { 334 | "data": { 335 | "text/html": [ 336 | "
\n", 337 | "\n", 350 | "\n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | "
manufactureryear
16DOUGLAS1956.000000
15DEHAVILLAND1959.000000
7BEECH1969.500000
13CESSNA1972.444444
12CANADAIR LTD1974.000000
\n", 386 | "
" 387 | ], 388 | "text/plain": [ 389 | " manufacturer year\n", 390 | "16 DOUGLAS 1956.000000\n", 391 | "15 DEHAVILLAND 1959.000000\n", 392 | "7 BEECH 1969.500000\n", 393 | "13 CESSNA 1972.444444\n", 394 | "12 CANADAIR LTD 1974.000000" 395 | ] 396 | }, 397 | "execution_count": 4, 398 | "metadata": {}, 399 | "output_type": "execute_result" 400 | } 401 | ], 402 | "source": [ 403 | "avg_year_by_man_df.sort_values('year').head()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": { 409 | "slideshow": { 410 | "slide_type": "slide" 411 | } 412 | }, 413 | "source": [ 414 | "### Exporting DataFrames with Pandas" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": { 420 | "slideshow": { 421 | "slide_type": "fragment" 422 | } 423 | }, 424 | "source": [ 425 | "DataFrames can be exported using a method built-in to the DataFrame object itself: `DataFrame.to_csv()`." 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 5, 431 | "metadata": { 432 | "slideshow": { 433 | "slide_type": "-" 434 | } 435 | }, 436 | "outputs": [], 437 | "source": [ 438 | "avg_year_by_man_df.to_csv('../data/avg_year_by_man.csv')" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": { 444 | "slideshow": { 445 | "slide_type": "slide" 446 | } 447 | }, 448 | "source": [ 449 | "Let's reimport to see the tabular data we just exported:" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 6, 455 | "metadata": { 456 | "slideshow": { 457 | "slide_type": "fragment" 458 | } 459 | }, 460 | "outputs": [ 461 | { 462 | "data": { 463 | "text/html": [ 464 | "
\n", 465 | "\n", 478 | "\n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | "
Unnamed: 0manufactureryear
00AGUSTA SPA2001.000000
11AIRBUS2007.201220
22AIRBUS INDUSTRIE1998.233333
33AMERICAN AIRCRAFT INCNaN
44AVIAT AIRCRAFT INC2007.000000
\n", 520 | "
" 521 | ], 522 | "text/plain": [ 523 | " Unnamed: 0 manufacturer year\n", 524 | "0 0 AGUSTA SPA 2001.000000\n", 525 | "1 1 AIRBUS 2007.201220\n", 526 | "2 2 AIRBUS INDUSTRIE 1998.233333\n", 527 | "3 3 AMERICAN AIRCRAFT INC NaN\n", 528 | "4 4 AVIAT AIRCRAFT INC 2007.000000" 529 | ] 530 | }, 531 | "execution_count": 6, 532 | "metadata": {}, 533 | "output_type": "execute_result" 534 | } 535 | ], 536 | "source": [ 537 | "pd.read_csv('../data/avg_year_by_man.csv').head()" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": { 543 | "slideshow": { 544 | "slide_type": "fragment" 545 | } 546 | }, 547 | "source": [ 548 | "
\n", 549 | "

Question?

\n", 550 | "

Notice the extra column named Unnamed: 0 . Where did this extra column come from?

\n", 551 | "
" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": { 557 | "slideshow": { 558 | "slide_type": "slide" 559 | }, 560 | "tags": [] 561 | }, 562 | "source": [ 563 | "This `Unnamed: 0` column is the index from the DataFrame. Despite it not being part of the original data, it's saved with the DataFrame by default." 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": { 569 | "slideshow": { 570 | "slide_type": "fragment" 571 | }, 572 | "tags": [] 573 | }, 574 | "source": [ 575 | "We can elect not to save the index with the DataFrame by passing `False` to the `index` parameter of `to_csv()`:" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 7, 581 | "metadata": { 582 | "slideshow": { 583 | "slide_type": "-" 584 | } 585 | }, 586 | "outputs": [], 587 | "source": [ 588 | "avg_year_by_man_df.to_csv('../data/avg_year_by_man.csv', index=False)" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": { 594 | "slideshow": { 595 | "slide_type": "fragment" 596 | } 597 | }, 598 | "source": [ 599 | "And then check our result again:" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 8, 605 | "metadata": { 606 | "slideshow": { 607 | "slide_type": "-" 608 | } 609 | }, 610 | "outputs": [ 611 | { 612 | "data": { 613 | "text/html": [ 614 | "
\n", 615 | "\n", 628 | "\n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | "
manufactureryear
0AGUSTA SPA2001.000000
1AIRBUS2007.201220
2AIRBUS INDUSTRIE1998.233333
3AMERICAN AIRCRAFT INCNaN
4AVIAT AIRCRAFT INC2007.000000
\n", 664 | "
" 665 | ], 666 | "text/plain": [ 667 | " manufacturer year\n", 668 | "0 AGUSTA SPA 2001.000000\n", 669 | "1 AIRBUS 2007.201220\n", 670 | "2 AIRBUS INDUSTRIE 1998.233333\n", 671 | "3 AMERICAN AIRCRAFT INC NaN\n", 672 | "4 AVIAT AIRCRAFT INC 2007.000000" 673 | ] 674 | }, 675 | "execution_count": 8, 676 | "metadata": {}, 677 | "output_type": "execute_result" 678 | } 679 | ], 680 | "source": [ 681 | "pd.read_csv('../data/avg_year_by_man.csv').head()" 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "metadata": { 687 | "slideshow": { 688 | "slide_type": "slide" 689 | } 690 | }, 691 | "source": [ 692 | "The `to_csv()` method has similar parameters to `read_csv()`. A few examples:\n", 693 | "\n", 694 | "* `sep` - the data's delimter\n", 695 | "* `header` - whether or not to write out the column names" 696 | ] 697 | }, 698 | { 699 | "cell_type": "markdown", 700 | "metadata": { 701 | "slideshow": { 702 | "slide_type": "fragment" 703 | } 704 | }, 705 | "source": [ 706 | "Full documentation can be pulled up by running the method name followed by a question mark:" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": 9, 712 | "metadata": { 713 | "slideshow": { 714 | "slide_type": "-" 715 | } 716 | }, 717 | "outputs": [ 718 | { 719 | "name": "stdout", 720 | "output_type": "stream", 721 | "text": [ 722 | "\u001b[0;31mSignature:\u001b[0m\n", 723 | "\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", 724 | "\u001b[0;34m\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 725 | "\u001b[0;34m\u001b[0m \u001b[0mpath_or_buf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 726 | "\u001b[0;34m\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m','\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 727 | "\u001b[0;34m\u001b[0m \u001b[0mna_rep\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 728 | "\u001b[0;34m\u001b[0m \u001b[0mfloat_format\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str | Callable | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 729 | "\u001b[0;34m\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Sequence[Hashable] | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 730 | "\u001b[0;34m\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool_t | list[str]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 731 | "\u001b[0;34m\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool_t'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 732 | "\u001b[0;34m\u001b[0m \u001b[0mindex_label\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'IndexLabel | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 733 | "\u001b[0;34m\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m'w'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 734 | "\u001b[0;34m\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 735 | "\u001b[0;34m\u001b[0m \u001b[0mcompression\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'CompressionOptions'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'infer'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 736 | "\u001b[0;34m\u001b[0m \u001b[0mquoting\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 737 | "\u001b[0;34m\u001b[0m \u001b[0mquotechar\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'\"'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 738 | "\u001b[0;34m\u001b[0m \u001b[0mlineterminator\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 739 | "\u001b[0;34m\u001b[0m \u001b[0mchunksize\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 740 | "\u001b[0;34m\u001b[0m \u001b[0mdate_format\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 741 | "\u001b[0;34m\u001b[0m \u001b[0mdoublequote\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool_t'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 742 | "\u001b[0;34m\u001b[0m \u001b[0mescapechar\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 743 | "\u001b[0;34m\u001b[0m \u001b[0mdecimal\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'.'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 744 | "\u001b[0;34m\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'strict'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 745 | "\u001b[0;34m\u001b[0m \u001b[0mstorage_options\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'StorageOptions'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 746 | "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m'str | None'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 747 | "\u001b[0;31mDocstring:\u001b[0m\n", 748 | "Write object to a comma-separated values (csv) file.\n", 749 | "\n", 750 | "Parameters\n", 751 | "----------\n", 752 | "path_or_buf : str, path object, file-like object, or None, default None\n", 753 | " String, path object (implementing os.PathLike[str]), or file-like\n", 754 | " object implementing a write() function. If None, the result is\n", 755 | " returned as a string. If a non-binary file object is passed, it should\n", 756 | " be opened with `newline=''`, disabling universal newlines. If a binary\n", 757 | " file object is passed, `mode` might need to contain a `'b'`.\n", 758 | "\n", 759 | " .. 
versionchanged:: 1.2.0\n", 760 | "\n", 761 | " Support for binary file objects was introduced.\n", 762 | "\n", 763 | "sep : str, default ','\n", 764 | " String of length 1. Field delimiter for the output file.\n", 765 | "na_rep : str, default ''\n", 766 | " Missing data representation.\n", 767 | "float_format : str, Callable, default None\n", 768 | " Format string for floating point numbers. If a Callable is given, it takes\n", 769 | " precedence over other numeric formatting parameters, like decimal.\n", 770 | "columns : sequence, optional\n", 771 | " Columns to write.\n", 772 | "header : bool or list of str, default True\n", 773 | " Write out the column names. If a list of strings is given it is\n", 774 | " assumed to be aliases for the column names.\n", 775 | "index : bool, default True\n", 776 | " Write row names (index).\n", 777 | "index_label : str or sequence, or False, default None\n", 778 | " Column label for index column(s) if desired. If None is given, and\n", 779 | " `header` and `index` are True, then the index names are used. A\n", 780 | " sequence should be given if the object uses MultiIndex. If\n", 781 | " False do not print fields for index names. Use index_label=False\n", 782 | " for easier importing in R.\n", 783 | "mode : str, default 'w'\n", 784 | " Python write mode. The available write modes are the same as\n", 785 | " :py:func:`open`.\n", 786 | "encoding : str, optional\n", 787 | " A string representing the encoding to use in the output file,\n", 788 | " defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`\n", 789 | " is a non-binary file object.\n", 790 | "compression : str or dict, default 'infer'\n", 791 | " For on-the-fly compression of the output data. If 'infer' and 'path_or_buf' is\n", 792 | " path-like, then detect compression from the following extensions: '.gz',\n", 793 | " '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'\n", 794 | " (otherwise no compression).\n", 795 | " Set to ``None`` for no compression.\n", 796 | " Can also be a dict with key ``'method'`` set\n", 797 | " to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other\n", 798 | " key-value pairs are forwarded to\n", 799 | " ``zipfile.ZipFile``, ``gzip.GzipFile``,\n", 800 | " ``bz2.BZ2File``, ``zstandard.ZstdCompressor`` or\n", 801 | " ``tarfile.TarFile``, respectively.\n", 802 | " As an example, the following could be passed for faster compression and to create\n", 803 | " a reproducible gzip archive:\n", 804 | " ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.\n", 805 | "\n", 806 | " .. versionadded:: 1.5.0\n", 807 | " Added support for `.tar` files.\n", 808 | "\n", 809 | " .. versionchanged:: 1.0.0\n", 810 | "\n", 811 | " May now be a dict with key 'method' as compression mode\n", 812 | " and other entries as additional compression options if\n", 813 | " compression mode is 'zip'.\n", 814 | "\n", 815 | " .. versionchanged:: 1.1.0\n", 816 | "\n", 817 | " Passing compression options as keys in dict is\n", 818 | " supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.\n", 819 | "\n", 820 | " .. versionchanged:: 1.2.0\n", 821 | "\n", 822 | " Compression is supported for binary file objects.\n", 823 | "\n", 824 | " .. 
versionchanged:: 1.2.0\n", 825 | "\n", 826 | " Previous versions forwarded dict entries for 'gzip' to\n", 827 | " `gzip.open` instead of `gzip.GzipFile` which prevented\n", 828 | " setting `mtime`.\n", 829 | "\n", 830 | "quoting : optional constant from csv module\n", 831 | " Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`\n", 832 | " then floats are converted to strings and thus csv.QUOTE_NONNUMERIC\n", 833 | " will treat them as non-numeric.\n", 834 | "quotechar : str, default '\\\"'\n", 835 | " String of length 1. Character used to quote fields.\n", 836 | "lineterminator : str, optional\n", 837 | " The newline character or character sequence to use in the output\n", 838 | " file. Defaults to `os.linesep`, which depends on the OS in which\n", 839 | " this method is called ('\\\\n' for linux, '\\\\r\\\\n' for Windows, i.e.).\n", 840 | "\n", 841 | " .. versionchanged:: 1.5.0\n", 842 | "\n", 843 | " Previously was line_terminator, changed for consistency with\n", 844 | " read_csv and the standard library 'csv' module.\n", 845 | "\n", 846 | "chunksize : int or None\n", 847 | " Rows to write at a time.\n", 848 | "date_format : str, default None\n", 849 | " Format string for datetime objects.\n", 850 | "doublequote : bool, default True\n", 851 | " Control quoting of `quotechar` inside a field.\n", 852 | "escapechar : str, default None\n", 853 | " String of length 1. Character used to escape `sep` and `quotechar`\n", 854 | " when appropriate.\n", 855 | "decimal : str, default '.'\n", 856 | " Character recognized as decimal separator. E.g. use ',' for\n", 857 | " European data.\n", 858 | "errors : str, default 'strict'\n", 859 | " Specifies how encoding and decoding errors are to be handled.\n", 860 | " See the errors argument for :func:`open` for a full list\n", 861 | " of options.\n", 862 | "\n", 863 | " .. versionadded:: 1.1.0\n", 864 | "\n", 865 | "storage_options : dict, optional\n", 866 | " Extra options that make sense for a particular storage connection, e.g.\n", 867 | " host, port, username, password, etc. For HTTP(S) URLs the key-value pairs\n", 868 | " are forwarded to ``urllib.request.Request`` as header options. For other\n", 869 | " URLs (e.g. starting with \"s3://\", and \"gcs://\") the key-value pairs are\n", 870 | " forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more\n", 871 | " details, and for more examples on storage options refer `here\n", 872 | " `_.\n", 874 | "\n", 875 | " .. versionadded:: 1.2.0\n", 876 | "\n", 877 | "Returns\n", 878 | "-------\n", 879 | "None or str\n", 880 | " If path_or_buf is None, returns the resulting csv format as a\n", 881 | " string. Otherwise returns None.\n", 882 | "\n", 883 | "See Also\n", 884 | "--------\n", 885 | "read_csv : Load a CSV file into a DataFrame.\n", 886 | "to_excel : Write DataFrame to an Excel file.\n", 887 | "\n", 888 | "Examples\n", 889 | "--------\n", 890 | ">>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],\n", 891 | "... 'mask': ['red', 'purple'],\n", 892 | "... 'weapon': ['sai', 'bo staff']})\n", 893 | ">>> df.to_csv(index=False)\n", 894 | "'name,mask,weapon\\nRaphael,red,sai\\nDonatello,purple,bo staff\\n'\n", 895 | "\n", 896 | "Create 'out.zip' containing 'out.csv'\n", 897 | "\n", 898 | ">>> compression_opts = dict(method='zip',\n", 899 | "... archive_name='out.csv') # doctest: +SKIP\n", 900 | ">>> df.to_csv('out.zip', index=False,\n", 901 | "... 
compression=compression_opts) # doctest: +SKIP\n", 902 | "\n", 903 | "To write a csv file to a new folder or nested folder you will first\n", 904 | "need to create it using either Pathlib or os:\n", 905 | "\n", 906 | ">>> from pathlib import Path # doctest: +SKIP\n", 907 | ">>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP\n", 908 | ">>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP\n", 909 | ">>> df.to_csv(filepath) # doctest: +SKIP\n", 910 | "\n", 911 | ">>> import os # doctest: +SKIP\n", 912 | ">>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP\n", 913 | ">>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP\n", 914 | "\u001b[0;31mFile:\u001b[0m /usr/local/anaconda3/envs/uc-python/lib/python3.11/site-packages/pandas/core/generic.py\n", 915 | "\u001b[0;31mType:\u001b[0m function" 916 | ] 917 | } 918 | ], 919 | "source": [ 920 | "pd.DataFrame.to_csv?" 921 | ] 922 | }, 923 | { 924 | "cell_type": "markdown", 925 | "metadata": { 926 | "slideshow": { 927 | "slide_type": "slide" 928 | }, 929 | "tags": [] 930 | }, 931 | "source": [ 932 | "
\n", 933 | "

Note

\n", 934 | "

There are several other df.to_xxx() methods that allow you to export DataFrames to other data formats. See more options here.

\n", 935 | "
" 936 | ] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "metadata": { 941 | "slideshow": { 942 | "slide_type": "slide" 943 | } 944 | }, 945 | "source": [ 946 | "### Your Turn\n", 947 | "\n", 948 | "1. Exporting data is copying data from Python's ________ to the ________. \n", 949 | "2. Fill in the blanks to the following code to:\n", 950 | " - import the flights.csv file,\n", 951 | " - filter for flights with a destination to the 'CVG' airport,\n", 952 | " - write this subsetted data out to a new CSV file titled 'flights_to_cvg' (but don't save the index to the CSV). \n", 953 | "

\n", 954 | "\n", 955 | " ```python\n", 956 | " import pandas as pd\n", 957 | " flights_df = pd.________('../data/flights.csv')\n", 958 | " flights_to_cvg_df = flights_df[flights_df[________] == 'CVG']\n", 959 | " flights_to_cvg_df.________('../data/flights_to_cvg.csv', ________ = False)\n", 960 | " ```" 961 | ] 962 | }, 963 | { 964 | "cell_type": "markdown", 965 | "metadata": { 966 | "slideshow": { 967 | "slide_type": "slide" 968 | } 969 | }, 970 | "source": [ 971 | "## Exporting Other Files" 972 | ] 973 | }, 974 | { 975 | "cell_type": "markdown", 976 | "metadata": { 977 | "tags": [] 978 | }, 979 | "source": [ 980 | "Recall being exposed to the importing of JSON and Pickle files -- now we will see how to save them." 981 | ] 982 | }, 983 | { 984 | "cell_type": "markdown", 985 | "metadata": { 986 | "slideshow": { 987 | "slide_type": "slide" 988 | } 989 | }, 990 | "source": [ 991 | "### JSON Files" 992 | ] 993 | }, 994 | { 995 | "cell_type": "markdown", 996 | "metadata": { 997 | "tags": [] 998 | }, 999 | "source": [ 1000 | "Take a look at the below `dict`:" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": 10, 1006 | "metadata": { 1007 | "slideshow": { 1008 | "slide_type": "-" 1009 | } 1010 | }, 1011 | "outputs": [], 1012 | "source": [ 1013 | "dict_example = {\n", 1014 | " \"first\": \"Guido\",\n", 1015 | " \"last\": \"van Rossum\"\n", 1016 | "}" 1017 | ] 1018 | }, 1019 | { 1020 | "cell_type": "markdown", 1021 | "metadata": { 1022 | "slideshow": { 1023 | "slide_type": "fragment" 1024 | } 1025 | }, 1026 | "source": [ 1027 | "And then we can save it as a JSON file using the `with` statement and the `dump` function from the `json` library:" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "code", 1032 | "execution_count": 11, 1033 | "metadata": { 1034 | "slideshow": { 1035 | "slide_type": "-" 1036 | } 1037 | }, 1038 | "outputs": [], 1039 | "source": [ 1040 | "import json\n", 1041 | "with open('../data/dict_example_export.json', 'w') as f:\n", 1042 | " f.write(json.dumps(dict_example))" 1043 | ] 1044 | }, 1045 | { 1046 | "cell_type": "markdown", 1047 | "metadata": { 1048 | "slideshow": { 1049 | "slide_type": "slide" 1050 | } 1051 | }, 1052 | "source": [ 1053 | "We can then reimport this to verify we saved it correctly:" 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "code", 1058 | "execution_count": 12, 1059 | "metadata": { 1060 | "slideshow": { 1061 | "slide_type": "-" 1062 | } 1063 | }, 1064 | "outputs": [], 1065 | "source": [ 1066 | "with open('../data/dict_example_export.json', 'r') as f:\n", 1067 | " imported_json = json.load(f)" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "code", 1072 | "execution_count": 13, 1073 | "metadata": { 1074 | "slideshow": { 1075 | "slide_type": "fragment" 1076 | } 1077 | }, 1078 | "outputs": [ 1079 | { 1080 | "data": { 1081 | "text/plain": [ 1082 | "dict" 1083 | ] 1084 | }, 1085 | "execution_count": 13, 1086 | "metadata": {}, 1087 | "output_type": "execute_result" 1088 | } 1089 | ], 1090 | "source": [ 1091 | "type(imported_json)" 1092 | ] 1093 | }, 1094 | { 1095 | "cell_type": "code", 1096 | "execution_count": 14, 1097 | "metadata": { 1098 | "slideshow": { 1099 | "slide_type": "fragment" 1100 | } 1101 | }, 1102 | "outputs": [ 1103 | { 1104 | "data": { 1105 | "text/plain": [ 1106 | "{'first': 'Guido', 'last': 'van Rossum'}" 1107 | ] 1108 | }, 1109 | "execution_count": 14, 1110 | "metadata": {}, 1111 | "output_type": "execute_result" 1112 | } 1113 | ], 1114 | "source": [ 1115 | "imported_json" 1116 | ] 1117 | }, 1118 | { 1119 | 
"cell_type": "markdown", 1120 | "metadata": { 1121 | "slideshow": { 1122 | "slide_type": "slide" 1123 | } 1124 | }, 1125 | "source": [ 1126 | "### Pickle Files" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "markdown", 1131 | "metadata": { 1132 | "slideshow": { 1133 | "slide_type": "fragment" 1134 | } 1135 | }, 1136 | "source": [ 1137 | "
\n", 1138 | "

Question?

\n", 1139 | "

What are Pickle files?

\n", 1140 | "
" 1141 | ] 1142 | }, 1143 | { 1144 | "cell_type": "markdown", 1145 | "metadata": { 1146 | "slideshow": { 1147 | "slide_type": "fragment" 1148 | } 1149 | }, 1150 | "source": [ 1151 | "Python's native data files are known as **Pickle** files:" 1152 | ] 1153 | }, 1154 | { 1155 | "cell_type": "markdown", 1156 | "metadata": { 1157 | "slideshow": { 1158 | "slide_type": "fragment" 1159 | } 1160 | }, 1161 | "source": [ 1162 | "* All Pickle files have the `.pickle` extension" 1163 | ] 1164 | }, 1165 | { 1166 | "cell_type": "markdown", 1167 | "metadata": { 1168 | "slideshow": { 1169 | "slide_type": "fragment" 1170 | } 1171 | }, 1172 | "source": [ 1173 | "* Pickle files are great for saving native Python data that can't easily be represented by other file types\n", 1174 | " * Pre-processed data\n", 1175 | " * Models\n", 1176 | " * Any other Python object..." 1177 | ] 1178 | }, 1179 | { 1180 | "cell_type": "markdown", 1181 | "metadata": { 1182 | "slideshow": { 1183 | "slide_type": "slide" 1184 | } 1185 | }, 1186 | "source": [ 1187 | "### Exporting Pickle Files" 1188 | ] 1189 | }, 1190 | { 1191 | "cell_type": "markdown", 1192 | "metadata": { 1193 | "slideshow": { 1194 | "slide_type": "fragment" 1195 | } 1196 | }, 1197 | "source": [ 1198 | "Pickle files can be exported using the `pickle` library paired with the `with` statement and the `open()` function:" 1199 | ] 1200 | }, 1201 | { 1202 | "cell_type": "code", 1203 | "execution_count": 15, 1204 | "metadata": { 1205 | "slideshow": { 1206 | "slide_type": "-" 1207 | } 1208 | }, 1209 | "outputs": [], 1210 | "source": [ 1211 | "import pickle\n", 1212 | "with open('../data/pickle_example_export.pickle', 'wb') as f:\n", 1213 | " pickle.dump(dict_example, f)" 1214 | ] 1215 | }, 1216 | { 1217 | "cell_type": "markdown", 1218 | "metadata": { 1219 | "slideshow": { 1220 | "slide_type": "slide" 1221 | } 1222 | }, 1223 | "source": [ 1224 | "We can then reimport this to verify we saved it correctly:" 1225 | ] 1226 | }, 1227 | { 1228 | "cell_type": "code", 1229 | "execution_count": 16, 1230 | "metadata": { 1231 | "slideshow": { 1232 | "slide_type": "-" 1233 | } 1234 | }, 1235 | "outputs": [], 1236 | "source": [ 1237 | "with open('../data/pickle_example_export.pickle', 'rb') as f:\n", 1238 | " imported_pickle = pickle.load(f)" 1239 | ] 1240 | }, 1241 | { 1242 | "cell_type": "code", 1243 | "execution_count": 17, 1244 | "metadata": { 1245 | "slideshow": { 1246 | "slide_type": "fragment" 1247 | } 1248 | }, 1249 | "outputs": [ 1250 | { 1251 | "data": { 1252 | "text/plain": [ 1253 | "dict" 1254 | ] 1255 | }, 1256 | "execution_count": 17, 1257 | "metadata": {}, 1258 | "output_type": "execute_result" 1259 | } 1260 | ], 1261 | "source": [ 1262 | "type(imported_pickle)" 1263 | ] 1264 | }, 1265 | { 1266 | "cell_type": "code", 1267 | "execution_count": 18, 1268 | "metadata": { 1269 | "slideshow": { 1270 | "slide_type": "fragment" 1271 | } 1272 | }, 1273 | "outputs": [ 1274 | { 1275 | "data": { 1276 | "text/plain": [ 1277 | "{'first': 'Guido', 'last': 'van Rossum'}" 1278 | ] 1279 | }, 1280 | "execution_count": 18, 1281 | "metadata": {}, 1282 | "output_type": "execute_result" 1283 | } 1284 | ], 1285 | "source": [ 1286 | "imported_pickle" 1287 | ] 1288 | }, 1289 | { 1290 | "cell_type": "markdown", 1291 | "metadata": { 1292 | "slideshow": { 1293 | "slide_type": "slide" 1294 | } 1295 | }, 1296 | "source": [ 1297 | "# Questions\n", 1298 | "\n", 1299 | "Are there any questions before we move on?" 
1300 | ] 1301 | } 1302 | ], 1303 | "metadata": { 1304 | "celltoolbar": "Slideshow", 1305 | "kernelspec": { 1306 | "display_name": "Python 3 (ipykernel)", 1307 | "language": "python", 1308 | "name": "python3" 1309 | }, 1310 | "language_info": { 1311 | "codemirror_mode": { 1312 | "name": "ipython", 1313 | "version": 3 1314 | }, 1315 | "file_extension": ".py", 1316 | "mimetype": "text/x-python", 1317 | "name": "python", 1318 | "nbconvert_exporter": "python", 1319 | "pygments_lexer": "ipython3", 1320 | "version": "3.11.4" 1321 | }, 1322 | "rise": { 1323 | "autolaunch": true, 1324 | "transition": "none" 1325 | } 1326 | }, 1327 | "nbformat": 4, 1328 | "nbformat_minor": 4 1329 | } 1330 | -------------------------------------------------------------------------------- /notebooks/99-Conclusion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Conclusion" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "fragment" 19 | } 20 | }, 21 | "source": [ 22 | "- We've done a lot in 2 days (4 half days)!\n", 23 | "\n", 24 | "- Much more to learn, but you have the tools to keep going\n", 25 | "\n", 26 | "- The internet (Google, Stackoverflow) is your friend" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "slideshow": { 33 | "slide_type": "slide" 34 | } 35 | }, 36 | "source": [ 37 | "## Resources" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 43 | "slideshow": { 44 | "slide_type": "fragment" 45 | } 46 | }, 47 | "source": [ 48 | "- **[The Python Data Science Handbook](https://www.amazon.com/Python-Data-Science-Handbook-Essential/dp/1491912057/ref=sr_1_3?gclid=CjwKCAiAvOeQBhBkEiwAxutUVMb5JDDzkdkZ7L6NU2ZszLMQ3HiF2jwfJcRWzgR82Jn7xHQoIvxBNhoCPUYQAvD_BwE&hvadid=295647768930&hvdev=c&hvlocphy=9015764&hvnetw=g&hvqmt=e&hvrand=14740440715626152632&hvtargid=kwd-569733732214&hydadcr=16433_10305449&keywords=python+for+data+science+handbook&qid=1645918824&sr=8-3)** by Jake VanderPlas.\n", 49 | " - Approachable, broad, well-written\n", 50 | " - Available free online\n", 51 | "- **[Python for Data Analysis](https://www.amazon.com/Python-Data-Analysis-Wrangling-IPython/dp/1491957662/ref=pd_bxgy_img_1/135-0822721-0203815?pd_rd_w=JxvrQ&pf_rd_p=6b3eefea-7b16-43e9-bc45-2e332cbf99da&pf_rd_r=WD9EEXM2VVP0ER90XSE6&pd_rd_r=f7ca71ce-a552-4acb-a083-e51193d09d8e&pd_rd_wg=8ngen&pd_rd_i=1491957662&psc=1)** by Wes McKinney\n", 52 | " - Dense but extremely thorough\n", 53 | " - Probably the most comprehensive guide to Pandas\n", 54 | "- **[Hands-On Machine Learning](https://www.amazon.com/Hands-Machine-Learning-Scikit-Learn-TensorFlow/dp/1492032646/ref=pd_bxgy_img_2/135-0822721-0203815?pd_rd_w=JxvrQ&pf_rd_p=6b3eefea-7b16-43e9-bc45-2e332cbf99da&pf_rd_r=WD9EEXM2VVP0ER90XSE6&pd_rd_r=f7ca71ce-a552-4acb-a083-e51193d09d8e&pd_rd_wg=8ngen&pd_rd_i=1492032646&psc=1)** by Aurélien Géron\n", 55 | " - Approachable but advances quickly\n", 56 | " - Most popular machine learning book for Python" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": { 62 | "jp-MarkdownHeadingCollapsed": true, 63 | "slideshow": { 64 | "slide_type": "slide" 65 | }, 66 | "tags": [] 67 | }, 68 | "source": [ 69 | "## Additional UC Python courses\n", 70 | "\n", 71 | "* **[Intermediate Python for Data 
Science](https://github.com/uc-python/intermediate-python-datasci)**\n", 72 | " - Learn to use control flow and custom functions to work with data more efficiently.\n", 73 | " - Build awareness and basic skills in working with Python from the shell and its environments.\n", 74 | " - Exposure to Python's data science ecosystem and modeling via scikit-learn.\n", 75 | "* **[Advanced Python for Data Science](https://github.com/uc-python/advanced-python-datasci)**\n", 76 | " - Develop an intuition for the machine learning workflow and Python tooling.\n", 77 | " - Build familiarity with common software engineering tooling and methodologies for implementing a machine learning project.\n", 78 | " - Gain a high-level understanding of the function of data science-adjacent technologies that students will encounter in the workplace, focusing on Git and GitHub." 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "slideshow": { 85 | "slide_type": "slide" 86 | } 87 | }, 88 | "source": [ 89 | "## Thank You" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "slideshow": { 96 | "slide_type": "fragment" 97 | } 98 | }, 99 | "source": [ 100 | "- We love being able to teach these workshops and we hope you enjoyed it as well!\n", 101 | "\n", 102 | "- We appreciate your feedback on the workshop. Constructive feedback allows us to continue to improve this course." 103 | ] 104 | } 105 | ], 106 | "metadata": { 107 | "celltoolbar": "Slideshow", 108 | "kernelspec": { 109 | "display_name": "Python 3 (ipykernel)", 110 | "language": "python", 111 | "name": "python3" 112 | }, 113 | "language_info": { 114 | "codemirror_mode": { 115 | "name": "ipython", 116 | "version": 3 117 | }, 118 | "file_extension": ".py", 119 | "mimetype": "text/x-python", 120 | "name": "python", 121 | "nbconvert_exporter": "python", 122 | "pygments_lexer": "ipython3", 123 | "version": "3.8.12" 124 | }, 125 | "rise": { 126 | "autolaunch": true, 127 | "transition": "none" 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 4 132 | } 133 | -------------------------------------------------------------------------------- /notebooks/Case-Study.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Case Study" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Part 1\n", 15 | "*To be completed at the conclusion of Day 1*" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "For the following exercises, you should use the data stored at `../data/companies.csv`\n", 23 | "You aren't expected to finish all the exercises; just get through as many as time allows and we will review them together.\n", 24 | "\n", 25 | "1. Start by becoming familiar with the data. How many rows and how many columns does it have? What are the data types of the columns?\n", 26 | "2. Set the data's index to be the \"Symbol\" column.\n", 27 | "3. Look up the company with the symbol NCLH. What company is this? What sector is it in?\n", 28 | "4. Filter down to companies that *either* in the \"Consumer Discretionary\" or the \"Consumer Staples\" sectors.\n", 29 | "5. How many companies are left in the data now?\n", 30 | "6. Create a new column, \"Symbol_Length\", that is the length of the symbol of each company. *Hint: you may need to reset an index along the way.*\n", 31 | "7. 
Find the company named \"Kroger Co.\". Change its name to \"The Kroger Company\"." 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "**Bonus**: *For these two exercises, you won't find examples of the solution in our notebooks.\n", 39 | "You'll need to search for help on the internet.*\n", 40 | "\n", 41 | "*Don't worry if you aren't able to solve them.*\n", 42 | "\n", 43 | "1. Filter down to companies whose symbol starts with A. How many companies meet this criterion?\n", 44 | "2. What is the longest company name remaining in the dataset? You could just search the data visually, but try to find a programmatic solution." 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Part 2\n", 52 | "*To be completed at the conclusion of Day 2*" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "This section again uses the data at `../data/companies.csv`.\n", 60 | "\n", 61 | "1. Re-create the \"Symbol_Length\" column (see above).\n", 62 | "2. What is the average symbol length of companies in the data set?\n", 63 | "3. What is the average symbol length by sector? That is, after grouping by sector, what is the average symbol length for each group?\n", 64 | "4. How long is the longest company name? How long is the longest company name by sector?" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Now open the pricing data at `../data/prices.csv`.\n", 72 | "Note that this data is entirely fabricated and does not exhibit the qualities of real stock market data!\n", 73 | "\n", 74 | "1. Become familiar with this data. What is its shape? What are its data types?\n", 75 | "2. Get summary metrics (count, min, max, standard deviation, etc) for both the Price and Quarter columns. *Hint: we saw a method of DataFrames that will do this for you in a single line.*\n", 76 | "3. Perform an inner join between this data set and the companies data, on the Symbol column.\n", 77 | "4. How many rows does our data have now?\n", 78 | "5. What do you think this data represents? Form a hypothesis and look through the data more carefully until you are confident you understand what it is and how it is structured.\n", 79 | "6. Group the data by sector. What is the average first quarter price for a company in the Real Estate sector? What is the minimum fourth quarter price for a company in the Industrials sector?\n", 80 | "7. Filter the data down to just prices for Apple, Google, Microsoft, and Amazon.\n", 81 | "8. Save this data as big_4.csv in the `../data` directory.\n", 82 | "9. Using Seaborn, plot the price of these companies over 4 quarters. Encode the quarter as the x-axis, the price as the y-axis, and the company symbol as the hue." 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "**Bonus**:\n", 90 | "\n", 91 | "This data is in a form that is useful for plotting.\n", 92 | "But in this shape, it would be quite difficult to calculate the difference between each company's fourth quarter price and its first quarter price.\n", 93 | "\n", 94 | "Reshape this data so it is of a form like the below:\n", 95 | "\n", 96 | "| Symbol | Name | Sector | Q1 | Q2 | Q3 | Q4 |\n", 97 | "|--------|------|--------|----|----|----|----|\n", 98 | "| AAPL | Apple Inc. 
| Information Technology | 275.20 | 269.96 | 263.51 | 266.07\n", 99 | "\n", 100 | "From which we could easily calculate Q4 - Q1.\n", 101 | "\n", 102 | "*You will probably want to google something like \"python reshaping data\". This is a very challenging problem!*" 103 | ] 104 | } 105 | ], 106 | "metadata": { 107 | "kernelspec": { 108 | "display_name": "uc-python", 109 | "language": "python", 110 | "name": "uc-python" 111 | }, 112 | "language_info": { 113 | "codemirror_mode": { 114 | "name": "ipython", 115 | "version": 3 116 | }, 117 | "file_extension": ".py", 118 | "mimetype": "text/x-python", 119 | "name": "python", 120 | "nbconvert_exporter": "python", 121 | "pygments_lexer": "ipython3", 122 | "version": "3.7.3" 123 | } 124 | }, 125 | "nbformat": 4, 126 | "nbformat_minor": 4 127 | } 128 | -------------------------------------------------------------------------------- /notebooks/images/aggregate-series.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/aggregate-series.png -------------------------------------------------------------------------------- /notebooks/images/applied-data-science.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/applied-data-science.gif -------------------------------------------------------------------------------- /notebooks/images/binder-launching.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/binder-launching.png -------------------------------------------------------------------------------- /notebooks/images/brad.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/brad.jpg -------------------------------------------------------------------------------- /notebooks/images/combine-horizontally-key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/combine-horizontally-key.png -------------------------------------------------------------------------------- /notebooks/images/combine-horizontally-unordered.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/combine-horizontally-unordered.png -------------------------------------------------------------------------------- /notebooks/images/combine-horizontally.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/combine-horizontally.png -------------------------------------------------------------------------------- /notebooks/images/combine-vertically.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/combine-vertically.png -------------------------------------------------------------------------------- /notebooks/images/concept_map.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/concept_map.jpg -------------------------------------------------------------------------------- /notebooks/images/data-science-and-tech.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/data-science-and-tech.png -------------------------------------------------------------------------------- /notebooks/images/data-science.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/data-science.png -------------------------------------------------------------------------------- /notebooks/images/dataframe-groups-unordered.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/dataframe-groups-unordered.png -------------------------------------------------------------------------------- /notebooks/images/dataframe-groups.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/dataframe-groups.png -------------------------------------------------------------------------------- /notebooks/images/dataframe-series.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/dataframe-series.png -------------------------------------------------------------------------------- /notebooks/images/ethan.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/ethan.jpg -------------------------------------------------------------------------------- /notebooks/images/export-framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/export-framework.png -------------------------------------------------------------------------------- /notebooks/images/full-outer-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/full-outer-join.png -------------------------------------------------------------------------------- /notebooks/images/gus.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/gus.jpg -------------------------------------------------------------------------------- /notebooks/images/import-framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/import-framework.png -------------------------------------------------------------------------------- /notebooks/images/inner-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/inner-join.png -------------------------------------------------------------------------------- /notebooks/images/insert-new-cell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/insert-new-cell.png -------------------------------------------------------------------------------- /notebooks/images/jay.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/jay.jpg -------------------------------------------------------------------------------- /notebooks/images/jupyter-file-structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/jupyter-file-structure.png -------------------------------------------------------------------------------- /notebooks/images/left-outer-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/left-outer-join.png -------------------------------------------------------------------------------- /notebooks/images/markdown-cell-rendered.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/markdown-cell-rendered.png -------------------------------------------------------------------------------- /notebooks/images/markdown-cell-selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/markdown-cell-selection.png -------------------------------------------------------------------------------- /notebooks/images/markdown-cell-unrendered.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/markdown-cell-unrendered.png -------------------------------------------------------------------------------- /notebooks/images/model-for-grouped-aggs.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/model-for-grouped-aggs.png -------------------------------------------------------------------------------- /notebooks/images/navigator-jupyter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/navigator-jupyter.png -------------------------------------------------------------------------------- /notebooks/images/new-jupyter-notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/new-jupyter-notebook.png -------------------------------------------------------------------------------- /notebooks/images/open-jupyter-notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/open-jupyter-notebook.png -------------------------------------------------------------------------------- /notebooks/images/python-code-cell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/python-code-cell.png -------------------------------------------------------------------------------- /notebooks/images/python-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/python-logo.png -------------------------------------------------------------------------------- /notebooks/images/python_jupyter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/python_jupyter.png -------------------------------------------------------------------------------- /notebooks/images/right-outer-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/right-outer-join.png -------------------------------------------------------------------------------- /notebooks/images/selecting_columns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/selecting_columns.png -------------------------------------------------------------------------------- /notebooks/images/selecting_rows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/selecting_rows.png -------------------------------------------------------------------------------- /notebooks/images/selecting_rows_columns.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/selecting_rows_columns.png
--------------------------------------------------------------------------------
/notebooks/images/series-plus-series.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/series-plus-series.png
--------------------------------------------------------------------------------
/notebooks/images/so_dev_survey.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/so_dev_survey.png
--------------------------------------------------------------------------------
/notebooks/images/subsetting_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/subsetting_result.png
--------------------------------------------------------------------------------
/notebooks/images/summarizing-by-groups.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/summarizing-by-groups.png
--------------------------------------------------------------------------------
/notebooks/rise.css:
--------------------------------------------------------------------------------
1 | .rise-enabled {
2 |     background-color: #ffffff !important;
3 |     border-top: 30px #919191 solid;
4 |     border-bottom: 30px #919191 solid;
5 | }
6 |
7 | .question {
8 |     color: #008;
9 | }
10 |
11 | .your_turn {
12 |     color: #e08414;
13 |     font-size: 150%;
14 |     font-weight: bold;
15 | }
16 |
17 | .rendered_html h1 {
18 |     color: #129628;
19 | }
20 |
21 | .rendered_html table, .rendered_html th, .rendered_html tr, .rendered_html td {
22 |     font-size: 100%;
23 | }
24 |
25 | .container.slides .celltoolbar, .container.slides .hide-in-slideshow {
26 |     display: none !important;
27 | }
--------------------------------------------------------------------------------
/scripts/generate_slides.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ ! -d ".git" ]; then
4 |     echo "Error: no .git directory detected"
5 |     echo "This script should be run from the base of the repo"
6 |     echo 'e.g. `bash scripts/generate_slides.sh`'
7 |     exit 1
8 | fi
9 |
10 | # We must be *in* the notebooks folder for relative links (e.g. to images) to
11 | # work correctly.
12 | cd notebooks
13 | # Images are copied over to slides/ by the Makefile.
14 | NB_PATH="$1"
15 | REL_NB=${NB_PATH/#notebooks\//}
16 | jupyter nbconvert --to slides "$REL_NB" --output-dir=../slides
--------------------------------------------------------------------------------
/scripts/prep_nb_for_ci.py:
--------------------------------------------------------------------------------
1 | # Rewrite a notebook in place, removing cells tagged ci-skip.
2 | # Adapted from https://stackoverflow.com/questions/62022603/how-to-delete-a-jupyter-notebook-input-cell-programmatically-using-its-tag
3 |
4 | import sys
5 | import nbformat
6 |
7 | SKIP_TAG = 'ci-skip'
8 |
9 | if len(sys.argv) != 2:
10 |     sys.exit('Usage: prep_nb_for_ci.py [notebook.ipynb]')
11 | nb_file = sys.argv[1]
12 |
13 | nb = nbformat.read(nb_file, as_version=nbformat.NO_CONVERT)
14 |
15 | tagged_cell_indices = []
16 |
17 | # Find the indices of all cells tagged ci-skip.
18 | for idx, cell in enumerate(nb.cells):
19 |     cell_tags = cell.metadata.get('tags', [])
20 |     if SKIP_TAG in cell_tags:
21 |         tagged_cell_indices.append(idx)
22 |
23 | # Remove tagged cells.
24 | # Iterate in reverse because deleting an earlier index would change which
25 | # cell is at each later one.
26 | for idx in reversed(tagged_cell_indices):
27 |     nb.cells.pop(idx)
28 |
29 | # Overwrite the original notebook.
30 | nbformat.write(nb, nb_file)
--------------------------------------------------------------------------------
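Note on tagging: cells are opted out of CI by adding a ci-skip entry to the
"tags" list in their metadata, either through Jupyter's cell tag editor or
programmatically. Below is a minimal sketch of the programmatic route using
nbformat; the notebook path and cell index are illustrative examples, not a
prescription.

import nbformat

# Hypothetical example: tag the first cell of one course notebook so that
# prep_nb_for_ci.py drops it before the notebook runs in CI.
path = 'notebooks/04-Importing-Data.ipynb'
nb = nbformat.read(path, as_version=nbformat.NO_CONVERT)

# Cell metadata behaves like a dict; create the tags list if it is missing.
tags = nb.cells[0].metadata.setdefault('tags', [])
if 'ci-skip' not in tags:
    tags.append('ci-skip')

nbformat.write(nb, path)

Running "python scripts/prep_nb_for_ci.py notebooks/04-Importing-Data.ipynb"
afterwards rewrites the file with the tagged cell removed.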