├── .github └── workflows │ └── test.yaml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── data ├── airlines.csv ├── airports.csv ├── avg_year_by_man.csv ├── companies.csv ├── dict_example_export.json ├── flights.csv ├── json_example.json ├── pickle_example.pickle ├── pickle_example_export.pickle ├── planes.csv ├── prices.csv └── weather.csv ├── environment.yml ├── notebooks ├── 00-Introduction.ipynb ├── 01-Python-and-Jupyter.ipynb ├── 02-Fundamentals.ipynb ├── 03-Packages-Modules-Function.ipynb ├── 04-Importing-Data.ipynb ├── 05-Selecting-and-Filtering.ipynb ├── 06-Manipulating-Columns.ipynb ├── 07-Review-Day-1.ipynb ├── 08-Summarizing-Data.ipynb ├── 09-Summarizing-Grouped-Data.ipynb ├── 10-Joining-Data.ipynb ├── 11-Exporting-Data.ipynb ├── 12-Visualizing-Data.ipynb ├── 99-Conclusion.ipynb ├── Case-Study-Solutions.ipynb ├── Case-Study.ipynb ├── images │ ├── aggregate-series.png │ ├── applied-data-science.gif │ ├── binder-launching.png │ ├── brad.jpg │ ├── combine-horizontally-key.png │ ├── combine-horizontally-unordered.png │ ├── combine-horizontally.png │ ├── combine-vertically.png │ ├── concept_map.jpg │ ├── data-science-and-tech.png │ ├── data-science.png │ ├── dataframe-groups-unordered.png │ ├── dataframe-groups.png │ ├── dataframe-series.png │ ├── ethan.jpg │ ├── export-framework.png │ ├── full-outer-join.png │ ├── gus.jpg │ ├── import-framework.png │ ├── inner-join.png │ ├── insert-new-cell.png │ ├── jay.jpg │ ├── jupyter-file-structure.png │ ├── left-outer-join.png │ ├── markdown-cell-rendered.png │ ├── markdown-cell-selection.png │ ├── markdown-cell-unrendered.png │ ├── model-for-grouped-aggs.png │ ├── navigator-jupyter.png │ ├── new-jupyter-notebook.png │ ├── open-jupyter-notebook.png │ ├── python-code-cell.png │ ├── python-logo.png │ ├── python_jupyter.png │ ├── right-outer-join.png │ ├── selecting_columns.png │ ├── selecting_rows.png │ ├── selecting_rows_columns.png │ ├── series-plus-series.png │ ├── so_dev_survey.png │ ├── subsetting_result.png │ └── summarizing-by-groups.png └── rise.css └── scripts ├── generate_slides.sh └── prep_nb_for_ci.py /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: [push, workflow_dispatch] 4 | 5 | jobs: 6 | validate-notebooks: 7 | name: Validate Notebooks 8 | runs-on: ubuntu-latest 9 | defaults: 10 | run: 11 | # Required for "run" commands to execute in the conda env. 12 | shell: bash -l {0} 13 | steps: 14 | - name: Checkout Code 15 | uses: actions/checkout@v3 16 | - name: Set Up Conda Environment 17 | uses: conda-incubator/setup-miniconda@v2 18 | with: 19 | activate-environment: uc-python 20 | environment-file: environment.yml 21 | - name: Set Up Jupyter Kernel 22 | run: | 23 | python -m ipykernel install --user --name uc-python 24 | - name: Install Papermill 25 | run: | 26 | conda install papermill 27 | - name: Prep notebooks 28 | run: | 29 | # Remove nb cells that should be skipped in CI. 
30 | for nb in notebooks/*.ipynb; do 31 | python scripts/prep_nb_for_ci.py "$nb" 32 | done 33 | - name: Run notebooks 34 | run: | 35 | for nb in notebooks/*.ipynb; do 36 | echo "running $nb" 37 | output=$(papermill --cwd notebooks/ "$nb" -) 38 | done 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # VS code 107 | .vscode 108 | 109 | # Custom 110 | slides/ 111 | 112 | .DS_Store 113 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Python for Data Science @ University of Cincinnati 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | IMAGESIN=$(wildcard notebooks/images/*) 2 | IMAGESOUT=$(patsubst notebooks/%,slides/%,$(IMAGESIN)) 3 | NBFILES=$(wildcard notebooks/*-*.ipynb) 4 | HTMLFILES=$(patsubst notebooks/%.ipynb,slides/%.slides.html,$(NBFILES)) 5 | 6 | slides: html 7 | 8 | images: slides/images $(IMAGESOUT) 9 | 10 | slides/images: 11 | mkdir -p slides/images 12 | 13 | $(IMAGESOUT): slides/images/%: notebooks/images/% 14 | cp -a $< $@ 15 | 16 | html: images $(HTMLFILES) 17 | 18 | $(HTMLFILES): slides/%.slides.html: notebooks/%.ipynb 19 | bash scripts/generate_slides.sh $< 20 | 21 | clean: 22 | rm -rf slides/ 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction to Python for Data Science 2 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/uc-python/intro-python-datasci/main?urlpath=lab) 3 | 4 | ### Course Description 5 | 6 | Welcome to Introduction to Python for Data Science! This short course provides a gentle, hands-on introduction to the Python programming language for data science applications. You will learn the fundamentals of Python as a language and how to work with data using the `pandas` library. 7 | 8 | ### Objectives 9 | 10 | The following are the primary learning objectives of students: 11 | 12 | 1. Develop comprehensive skills in the importing/exporting, wrangling, aggregating and joining of data using Python. 13 | 14 | 2. Establish a mental model of the Python programming language to enable future self-learning. 15 | 16 | 3. Build awareness and basic skills in the core data science area of data visualization. 17 | 18 | ### Agenda 19 | 20 | ***This workshop offering will be 100% virtual and span 4 half-days.*** 21 | 22 | | Day | Topic | Time | 23 | | :--------:| :----------------------------------------------------------------------------- | :-----------: | 24 | | __Day 1__ | Introductions | 12:30 - 12:45 | 25 | | | Python and Jupyter Overview | 12:45 - 1:15 | 26 | | | Fundamentals | 1:15 - 2:00 | 27 | | | Break | 2:00 - 2:15 | 28 | | | Packages, Modules, Methods, Functions | 2:15 - 3:00 | 29 | | | Importing Data | 3:00 - 3:45 | 30 | | | Q\&A | 3:45 - 4:15 | 31 | | __Day 2__ | Q\&A | 12:45 - 1:00 | 32 | | | Selecting and Filtering Data | 1:00 - 1:45 | 33 | | | Working with Columns | 1:45 - 2:30 | 34 | | | Break | 2:30 - 2:45 | 35 | | | Case Study, pt. 1 | 2:45 - 3:45 | 36 | | | Q\&A | 3:45 - 4:15 | 37 | | __Day 3__ | Q\&A | 12:45 - 1:00 | 38 | | | Review | 1:00 - 1:15 | 39 | | | Summarizing Data | 1:15 - 2:00 | 40 | | | Break | 2:00 - 2:15 | 41 | | | Summarizing Grouped Data | 2:15 - 3:00 | 42 | | | Joining Data | 3:00 - 3:45 | 43 | | | Q\&A | 3:45 - 4:15 | 44 | | __Day 4__ | Q\&A | 12:45 - 1:00 | 45 | | | Exporting Data | 1:00 - 1:30 | 46 | | | Visualizing Data | 1:30 - 2:30 | 47 | | | Break | 2:30 - 2:45 | 48 | | | Case Study, pt. 
    2 | 2:45 - 3:45 | 49 | | | Q\&A | 3:45 - 4:15 | 50 | 51 | ### Course Preparation 52 | 53 | In an effort to simplify the setup for this class, we are using Binder for all materials (slides, worksheets, etc.). As a result, there is no prerequisite installation required for the in-class material. 54 | 55 | With that being said, it's smart to install the appropriate technologies and download the materials anyway. This will provide you with a backup in case there are network issues, and *it will also be required to apply what you learn outside of class*. 56 | 57 | Follow these steps to download the technologies and materials: 58 | 59 | #### 1. Python, Jupyter and package installation. 60 | 61 | The easiest way to install Python, Jupyter, and the necessary packages is through Anaconda. To download and install Anaconda: 62 | 63 | 1. Visit the [Anaconda Individual Edition page](https://www.anaconda.com/products/individual) 64 | 2. Scroll down to the "Anaconda Installers" section. 65 | 3. Choose to download the **Graphical Installer** for your operating system. (If you are comfortable with the command line, you may choose that option instead.) 66 | 4. Open the installer when the download completes, and then follow the prompts. If you are prompted about installing PyCharm, elect **not** to do so. 67 | 5. Once installed, open the Anaconda Navigator and launch a Jupyter Notebook to ensure it works. 68 | 6. Follow [the package installation instructions](https://docs.anaconda.com/free/navigator/tutorials/manage-packages/#installing-a-package) to ensure the `pandas` and `seaborn` packages are installed. 69 | 70 | #### 2. Download class materials 71 | 72 | There are two ways to download the class materials: 73 | 74 | 1. Clone it - If you're familiar with how to do so, you can clone this repository. 75 | 2. Download the files as a zip - use [this link](https://github.com/uc-python/intro-python-datasci/archive/refs/heads/main.zip) 76 | 77 | ### Questions 78 | 79 | If you have any specific questions prior to the class, you can reach out to us directly via GitHub or email: 80 | 81 | * Ethan Swan: [GitHub](https://www.github.com/eswan18) & [Email](mailto:ethanpswan@gmail.com) 82 | * Bradley Boehmke: [GitHub](https://www.github.com/bradleyboehmke) & [Email](mailto:bradleyboehmke@gmail.com) 83 | * Gus Powers: [GitHub](https://www.github.com/augustopher) & [Email](mailto:guspowers0@gmail.com) 84 | * Jay Cunningham: [GitHub](https://github.com/cunningjames) & [Email](mailto:james@notbadafterall.com) 85 | -------------------------------------------------------------------------------- /data/airlines.csv: -------------------------------------------------------------------------------- 1 | carrier,name 2 | 9E,Endeavor Air Inc. 3 | AA,American Airlines Inc. 4 | AS,Alaska Airlines Inc. 5 | B6,JetBlue Airways 6 | DL,Delta Air Lines Inc. 7 | EV,ExpressJet Airlines Inc. 8 | F9,Frontier Airlines Inc. 9 | FL,AirTran Airways Corporation 10 | HA,Hawaiian Airlines Inc. 11 | MQ,Envoy Air 12 | OO,SkyWest Airlines Inc. 13 | UA,United Air Lines Inc. 14 | US,US Airways Inc. 15 | VX,Virgin America 16 | WN,Southwest Airlines Co. 17 | YV,Mesa Airlines Inc. 
    
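    For the README's "Download class materials" step above, a minimal command-line sketch. It assumes `git` and `conda` are already installed and that JupyterLab is available (it ships with the full Anaconda distribution); the repository URL comes from the zip link above, and the `uc-python` environment name comes from this repo's `environment.yml`:
    
    ```bash
    # Clone the course repository (same content as the zip link above).
    git clone https://github.com/uc-python/intro-python-datasci.git
    cd intro-python-datasci
    
    # Recreate the course's conda environment from environment.yml
    # and activate it (the environment is named uc-python).
    conda env create -f environment.yml
    conda activate uc-python
    
    # Launch JupyterLab from the repository root.
    jupyter lab
    ```
    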
18 | -------------------------------------------------------------------------------- /data/avg_year_by_man.csv: -------------------------------------------------------------------------------- 1 | manufacturer,year 2 | AGUSTA SPA,2001.0 3 | AIRBUS,2007.2012195121952 4 | AIRBUS INDUSTRIE,1998.2333333333333 5 | AMERICAN AIRCRAFT INC, 6 | AVIAT AIRCRAFT INC,2007.0 7 | AVIONS MARCEL DASSAULT,1986.0 8 | BARKER JACK L, 9 | BEECH,1969.5 10 | BELL,1984.5 11 | BOEING,2000.1441048034935 12 | BOMBARDIER INC,2004.486187845304 13 | CANADAIR,1997.3333333333333 14 | CANADAIR LTD,1974.0 15 | CESSNA,1972.4444444444443 16 | CIRRUS DESIGN CORP,2007.0 17 | DEHAVILLAND,1959.0 18 | DOUGLAS,1956.0 19 | EMBRAER,2003.5972696245733 20 | FRIEDEMANN JON,2007.0 21 | GULFSTREAM AEROSPACE,1984.0 22 | HURLEY JAMES LARRY, 23 | JOHN G HESS, 24 | KILDALL GARY,1985.0 25 | LAMBERT RICHARD, 26 | LEARJET INC, 27 | LEBLANC GLENN T,1985.0 28 | MARZ BARRY,1993.0 29 | MCDONNELL DOUGLAS,1989.948275862069 30 | MCDONNELL DOUGLAS AIRCRAFT CO,1989.7378640776699 31 | MCDONNELL DOUGLAS CORPORATION,1991.9285714285713 32 | PAIR MIKE E, 33 | PIPER,1976.4 34 | ROBINSON HELICOPTER CO,2012.0 35 | SIKORSKY,1985.0 36 | STEWART MACO,1985.0 37 | -------------------------------------------------------------------------------- /data/companies.csv: -------------------------------------------------------------------------------- 1 | Symbol,Name,Sector 2 | MMM,3M Company,Industrials 3 | AOS,A.O. Smith Corp,Industrials 4 | ABT,Abbott Laboratories,Health Care 5 | ABBV,AbbVie Inc.,Health Care 6 | ACN,Accenture plc,Information Technology 7 | ATVI,Activision Blizzard,Information Technology 8 | AYI,Acuity Brands Inc,Industrials 9 | ADBE,Adobe Systems Inc,Information Technology 10 | AAP,Advance Auto Parts,Consumer Discretionary 11 | AMD,Advanced Micro Devices Inc,Information Technology 12 | AES,AES Corp,Utilities 13 | AET,Aetna Inc,Health Care 14 | AMG,Affiliated Managers Group Inc,Financials 15 | AFL,AFLAC Inc,Financials 16 | A,Agilent Technologies Inc,Health Care 17 | APD,Air Products & Chemicals Inc,Materials 18 | AKAM,Akamai Technologies Inc,Information Technology 19 | ALK,Alaska Air Group Inc,Industrials 20 | ALB,Albemarle Corp,Materials 21 | ARE,Alexandria Real Estate Equities Inc,Real Estate 22 | ALXN,Alexion Pharmaceuticals,Health Care 23 | ALGN,Align Technology,Health Care 24 | ALLE,Allegion,Industrials 25 | AGN,"Allergan, Plc",Health Care 26 | ADS,Alliance Data Systems,Information Technology 27 | LNT,Alliant Energy Corp,Utilities 28 | ALL,Allstate Corp,Financials 29 | GOOGL,Alphabet Inc Class A,Information Technology 30 | GOOG,Alphabet Inc Class C,Information Technology 31 | MO,Altria Group Inc,Consumer Staples 32 | AMZN,Amazon.com Inc.,Consumer Discretionary 33 | AEE,Ameren Corp,Utilities 34 | AAL,American Airlines Group,Industrials 35 | AEP,American Electric Power,Utilities 36 | AXP,American Express Co,Financials 37 | AIG,"American International Group, Inc.",Financials 38 | AMT,American Tower Corp A,Real Estate 39 | AWK,American Water Works Company Inc,Utilities 40 | AMP,Ameriprise Financial,Financials 41 | ABC,AmerisourceBergen Corp,Health Care 42 | AME,AMETEK Inc.,Industrials 43 | AMGN,Amgen Inc.,Health Care 44 | APH,Amphenol Corp,Information Technology 45 | APC,Anadarko Petroleum Corp,Energy 46 | ADI,"Analog Devices, Inc.",Information Technology 47 | ANDV,Andeavor,Energy 48 | ANSS,ANSYS,Information Technology 49 | ANTM,Anthem Inc.,Health Care 50 | AON,Aon plc,Financials 51 | APA,Apache Corporation,Energy 52 | AIV,Apartment Investment & 
Management,Real Estate 53 | AAPL,Apple Inc.,Information Technology 54 | AMAT,Applied Materials Inc.,Information Technology 55 | APTV,Aptiv Plc,Consumer Discretionary 56 | ADM,Archer-Daniels-Midland Co,Consumer Staples 57 | ARNC,Arconic Inc.,Industrials 58 | AJG,Arthur J. Gallagher & Co.,Financials 59 | AIZ,Assurant Inc.,Financials 60 | T,AT&T Inc.,Telecommunication Services 61 | ADSK,Autodesk Inc.,Information Technology 62 | ADP,Automatic Data Processing,Information Technology 63 | AZO,AutoZone Inc,Consumer Discretionary 64 | AVB,"AvalonBay Communities, Inc.",Real Estate 65 | AVY,Avery Dennison Corp,Materials 66 | BHGE,"Baker Hughes, a GE Company",Energy 67 | BLL,Ball Corp,Materials 68 | BAC,Bank of America Corp,Financials 69 | BAX,Baxter International Inc.,Health Care 70 | BBT,BB&T Corporation,Financials 71 | BDX,Becton Dickinson,Health Care 72 | BRK.B,Berkshire Hathaway,Financials 73 | BBY,Best Buy Co. Inc.,Consumer Discretionary 74 | BIIB,Biogen Inc.,Health Care 75 | BLK,BlackRock,Financials 76 | HRB,Block H&R,Financials 77 | BA,Boeing Company,Industrials 78 | BKNG,Booking Holdings Inc,Consumer Discretionary 79 | BWA,BorgWarner,Consumer Discretionary 80 | BXP,Boston Properties,Real Estate 81 | BSX,Boston Scientific,Health Care 82 | BHF,Brighthouse Financial Inc,Financials 83 | BMY,Bristol-Myers Squibb,Health Care 84 | AVGO,Broadcom,Information Technology 85 | BF.B,Brown-Forman Corp.,Consumer Staples 86 | CHRW,C. H. Robinson Worldwide,Industrials 87 | CA,"CA, Inc.",Information Technology 88 | COG,Cabot Oil & Gas,Energy 89 | CDNS,Cadence Design Systems,Information Technology 90 | CPB,Campbell Soup,Consumer Staples 91 | COF,Capital One Financial,Financials 92 | CAH,Cardinal Health Inc.,Health Care 93 | KMX,Carmax Inc,Consumer Discretionary 94 | CCL,Carnival Corp.,Consumer Discretionary 95 | CAT,Caterpillar Inc.,Industrials 96 | CBOE,Cboe Global Markets,Financials 97 | CBRE,CBRE Group,Real Estate 98 | CBS,CBS Corp.,Consumer Discretionary 99 | CELG,Celgene Corp.,Health Care 100 | CNC,Centene Corporation,Health Care 101 | CNP,CenterPoint Energy,Utilities 102 | CTL,CenturyLink Inc,Telecommunication Services 103 | CERN,Cerner,Health Care 104 | CF,CF Industries Holdings Inc,Materials 105 | SCHW,Charles Schwab Corporation,Financials 106 | CHTR,Charter Communications,Consumer Discretionary 107 | CVX,Chevron Corp.,Energy 108 | CMG,Chipotle Mexican Grill,Consumer Discretionary 109 | CB,Chubb Limited,Financials 110 | CHD,Church & Dwight,Consumer Staples 111 | CI,CIGNA Corp.,Health Care 112 | XEC,Cimarex Energy,Energy 113 | CINF,Cincinnati Financial,Financials 114 | CTAS,Cintas Corporation,Industrials 115 | CSCO,Cisco Systems,Information Technology 116 | C,Citigroup Inc.,Financials 117 | CFG,Citizens Financial Group,Financials 118 | CTXS,Citrix Systems,Information Technology 119 | CME,CME Group Inc.,Financials 120 | CMS,CMS Energy,Utilities 121 | KO,Coca-Cola Company (The),Consumer Staples 122 | CTSH,Cognizant Technology Solutions,Information Technology 123 | CL,Colgate-Palmolive,Consumer Staples 124 | CMCSA,Comcast Corp.,Consumer Discretionary 125 | CMA,Comerica Inc.,Financials 126 | CAG,Conagra Brands,Consumer Staples 127 | CXO,Concho Resources,Energy 128 | COP,ConocoPhillips,Energy 129 | ED,Consolidated Edison,Utilities 130 | STZ,Constellation Brands,Consumer Staples 131 | GLW,Corning Inc.,Information Technology 132 | COST,Costco Wholesale Corp.,Consumer Staples 133 | COTY,"Coty, Inc",Consumer Staples 134 | CCI,Crown Castle International Corp.,Real Estate 135 | CSRA,CSRA Inc.,Information Technology 
136 | CSX,CSX Corp.,Industrials 137 | CMI,Cummins Inc.,Industrials 138 | CVS,CVS Health,Consumer Staples 139 | DHI,D. R. Horton,Consumer Discretionary 140 | DHR,Danaher Corp.,Health Care 141 | DRI,Darden Restaurants,Consumer Discretionary 142 | DVA,DaVita Inc.,Health Care 143 | DE,Deere & Co.,Industrials 144 | DAL,Delta Air Lines Inc.,Industrials 145 | XRAY,Dentsply Sirona,Health Care 146 | DVN,Devon Energy Corp.,Energy 147 | DLR,Digital Realty Trust Inc,Real Estate 148 | DFS,Discover Financial Services,Financials 149 | DISCA,Discovery Inc. Class A,Consumer Discretionary 150 | DISCK,Discovery Inc. Class C,Consumer Discretionary 151 | DISH,Dish Network,Consumer Discretionary 152 | DG,Dollar General,Consumer Discretionary 153 | DLTR,Dollar Tree,Consumer Discretionary 154 | D,Dominion Energy,Utilities 155 | DOV,Dover Corp.,Industrials 156 | DWDP,DowDuPont,Materials 157 | DPS,Dr Pepper Snapple Group,Consumer Staples 158 | DTE,DTE Energy Co.,Utilities 159 | DUK,Duke Energy,Utilities 160 | DRE,Duke Realty Corp,Real Estate 161 | DXC,DXC Technology,Information Technology 162 | ETFC,E*Trade,Financials 163 | EMN,Eastman Chemical,Materials 164 | ETN,Eaton Corporation,Industrials 165 | EBAY,eBay Inc.,Information Technology 166 | ECL,Ecolab Inc.,Materials 167 | EIX,Edison Int'l,Utilities 168 | EW,Edwards Lifesciences,Health Care 169 | EA,Electronic Arts,Information Technology 170 | EMR,Emerson Electric Company,Industrials 171 | ETR,Entergy Corp.,Utilities 172 | EVHC,Envision Healthcare,Health Care 173 | EOG,EOG Resources,Energy 174 | EQT,EQT Corporation,Energy 175 | EFX,Equifax Inc.,Industrials 176 | EQIX,Equinix,Real Estate 177 | EQR,Equity Residential,Real Estate 178 | ESS,"Essex Property Trust, Inc.",Real Estate 179 | EL,Estee Lauder Cos.,Consumer Staples 180 | RE,Everest Re Group Ltd.,Financials 181 | ES,Eversource Energy,Utilities 182 | EXC,Exelon Corp.,Utilities 183 | EXPE,Expedia Inc.,Consumer Discretionary 184 | EXPD,Expeditors International,Industrials 185 | ESRX,Express Scripts,Health Care 186 | EXR,Extra Space Storage,Real Estate 187 | XOM,Exxon Mobil Corp.,Energy 188 | FFIV,F5 Networks,Information Technology 189 | FB,"Facebook, Inc.",Information Technology 190 | FAST,Fastenal Co,Industrials 191 | FRT,Federal Realty Investment Trust,Real Estate 192 | FDX,FedEx Corporation,Industrials 193 | FIS,Fidelity National Information Services,Information Technology 194 | FITB,Fifth Third Bancorp,Financials 195 | FE,FirstEnergy Corp,Utilities 196 | FISV,Fiserv Inc,Information Technology 197 | FLIR,FLIR Systems,Information Technology 198 | FLS,Flowserve Corporation,Industrials 199 | FLR,Fluor Corp.,Industrials 200 | FMC,FMC Corporation,Materials 201 | FL,Foot Locker Inc,Consumer Discretionary 202 | F,Ford Motor,Consumer Discretionary 203 | FTV,Fortive Corp,Industrials 204 | FBHS,Fortune Brands Home & Security,Industrials 205 | BEN,Franklin Resources,Financials 206 | FCX,Freeport-McMoRan Inc.,Materials 207 | GPS,Gap Inc.,Consumer Discretionary 208 | GRMN,Garmin Ltd.,Consumer Discretionary 209 | IT,Gartner Inc,Information Technology 210 | GD,General Dynamics,Industrials 211 | GE,General Electric,Industrials 212 | GGP,General Growth Properties Inc.,Real Estate 213 | GIS,General Mills,Consumer Staples 214 | GM,General Motors,Consumer Discretionary 215 | GPC,Genuine Parts,Consumer Discretionary 216 | GILD,Gilead Sciences,Health Care 217 | GPN,Global Payments Inc.,Information Technology 218 | GS,Goldman Sachs Group,Financials 219 | GT,Goodyear Tire & Rubber,Consumer Discretionary 220 | GWW,Grainger (W.W.) 
Inc.,Industrials 221 | HAL,Halliburton Co.,Energy 222 | HBI,Hanesbrands Inc,Consumer Discretionary 223 | HOG,Harley-Davidson,Consumer Discretionary 224 | HRS,Harris Corporation,Information Technology 225 | HIG,Hartford Financial Svc.Gp.,Financials 226 | HAS,Hasbro Inc.,Consumer Discretionary 227 | HCA,HCA Holdings,Health Care 228 | HCP,HCP Inc.,Real Estate 229 | HP,Helmerich & Payne,Energy 230 | HSIC,Henry Schein,Health Care 231 | HES,Hess Corporation,Energy 232 | HPE,Hewlett Packard Enterprise,Information Technology 233 | HLT,Hilton Worldwide Holdings Inc,Consumer Discretionary 234 | HOLX,Hologic,Health Care 235 | HD,Home Depot,Consumer Discretionary 236 | HON,Honeywell Int'l Inc.,Industrials 237 | HRL,Hormel Foods Corp.,Consumer Staples 238 | HST,Host Hotels & Resorts,Real Estate 239 | HPQ,HP Inc.,Information Technology 240 | HUM,Humana Inc.,Health Care 241 | HBAN,Huntington Bancshares,Financials 242 | HII,Huntington Ingalls Industries,Industrials 243 | IDXX,IDEXX Laboratories,Health Care 244 | INFO,IHS Markit Ltd.,Industrials 245 | ITW,Illinois Tool Works,Industrials 246 | ILMN,Illumina Inc,Health Care 247 | INCY,Incyte,Health Care 248 | IR,Ingersoll-Rand PLC,Industrials 249 | INTC,Intel Corp.,Information Technology 250 | ICE,Intercontinental Exchange,Financials 251 | IBM,International Business Machines,Information Technology 252 | IP,International Paper,Materials 253 | IPG,Interpublic Group,Consumer Discretionary 254 | IFF,Intl Flavors & Fragrances,Materials 255 | INTU,Intuit Inc.,Information Technology 256 | ISRG,Intuitive Surgical Inc.,Health Care 257 | IVZ,Invesco Ltd.,Financials 258 | IPGP,IPG Photonics Corp.,Information Technology 259 | IQV,IQVIA Holdings Inc.,Health Care 260 | IRM,Iron Mountain Incorporated,Real Estate 261 | JBHT,J. B. Hunt Transport Services,Industrials 262 | JEC,Jacobs Engineering Group,Industrials 263 | SJM,JM Smucker,Consumer Staples 264 | JNJ,Johnson & Johnson,Health Care 265 | JCI,Johnson Controls International,Industrials 266 | JPM,JPMorgan Chase & Co.,Financials 267 | JNPR,Juniper Networks,Information Technology 268 | KSU,Kansas City Southern,Industrials 269 | K,Kellogg Co.,Consumer Staples 270 | KEY,KeyCorp,Financials 271 | KMB,Kimberly-Clark,Consumer Staples 272 | KIM,Kimco Realty,Real Estate 273 | KMI,Kinder Morgan,Energy 274 | KLAC,KLA-Tencor Corp.,Information Technology 275 | KSS,Kohl's Corp.,Consumer Discretionary 276 | KHC,Kraft Heinz Co,Consumer Staples 277 | KR,Kroger Co.,Consumer Staples 278 | LB,L Brands Inc.,Consumer Discretionary 279 | LLL,L-3 Communications Holdings,Industrials 280 | LH,Laboratory Corp. 
of America Holding,Health Care 281 | LRCX,Lam Research,Information Technology 282 | LEG,Leggett & Platt,Consumer Discretionary 283 | LEN,Lennar Corp.,Consumer Discretionary 284 | LUK,Leucadia National Corp.,Financials 285 | LLY,Lilly (Eli) & Co.,Health Care 286 | LNC,Lincoln National,Financials 287 | LKQ,LKQ Corporation,Consumer Discretionary 288 | LMT,Lockheed Martin Corp.,Industrials 289 | L,Loews Corp.,Financials 290 | LOW,Lowe's Cos.,Consumer Discretionary 291 | LYB,LyondellBasell,Materials 292 | MTB,M&T Bank Corp.,Financials 293 | MAC,Macerich,Real Estate 294 | M,Macy's Inc.,Consumer Discretionary 295 | MRO,Marathon Oil Corp.,Energy 296 | MPC,Marathon Petroleum,Energy 297 | MAR,Marriott Int'l.,Consumer Discretionary 298 | MMC,Marsh & McLennan,Financials 299 | MLM,Martin Marietta Materials,Materials 300 | MAS,Masco Corp.,Industrials 301 | MA,Mastercard Inc.,Information Technology 302 | MAT,Mattel Inc.,Consumer Discretionary 303 | MKC,McCormick & Co.,Consumer Staples 304 | MCD,McDonald's Corp.,Consumer Discretionary 305 | MCK,McKesson Corp.,Health Care 306 | MDT,Medtronic plc,Health Care 307 | MRK,Merck & Co.,Health Care 308 | MET,MetLife Inc.,Financials 309 | MTD,Mettler Toledo,Health Care 310 | MGM,MGM Resorts International,Consumer Discretionary 311 | KORS,Michael Kors Holdings,Consumer Discretionary 312 | MCHP,Microchip Technology,Information Technology 313 | MU,Micron Technology,Information Technology 314 | MSFT,Microsoft Corp.,Information Technology 315 | MAA,Mid-America Apartments,Real Estate 316 | MHK,Mohawk Industries,Consumer Discretionary 317 | TAP,Molson Coors Brewing Company,Consumer Staples 318 | MDLZ,Mondelez International,Consumer Staples 319 | MON,Monsanto Co.,Materials 320 | MNST,Monster Beverage,Consumer Staples 321 | MCO,Moody's Corp,Financials 322 | MS,Morgan Stanley,Financials 323 | MSI,Motorola Solutions Inc.,Information Technology 324 | MYL,Mylan N.V.,Health Care 325 | NDAQ,"Nasdaq, Inc.",Financials 326 | NOV,National Oilwell Varco Inc.,Energy 327 | NAVI,Navient,Financials 328 | NKTR,Nektar Therapeutics,Health Care 329 | NTAP,NetApp,Information Technology 330 | NFLX,Netflix Inc.,Information Technology 331 | NWL,Newell Brands,Consumer Discretionary 332 | NFX,Newfield Exploration Co,Energy 333 | NEM,Newmont Mining Corporation,Materials 334 | NWSA,News Corp. Class A,Consumer Discretionary 335 | NWS,News Corp. 
Class B,Consumer Discretionary 336 | NEE,NextEra Energy,Utilities 337 | NLSN,Nielsen Holdings,Industrials 338 | NKE,Nike,Consumer Discretionary 339 | NI,NiSource Inc.,Utilities 340 | NBL,Noble Energy Inc,Energy 341 | JWN,Nordstrom,Consumer Discretionary 342 | NSC,Norfolk Southern Corp.,Industrials 343 | NTRS,Northern Trust Corp.,Financials 344 | NOC,Northrop Grumman Corp.,Industrials 345 | NCLH,Norwegian Cruise Line,Consumer Discretionary 346 | NRG,NRG Energy,Utilities 347 | NUE,Nucor Corp.,Materials 348 | NVDA,Nvidia Corporation,Information Technology 349 | ORLY,O'Reilly Automotive,Consumer Discretionary 350 | OXY,Occidental Petroleum,Energy 351 | OMC,Omnicom Group,Consumer Discretionary 352 | OKE,ONEOK,Energy 353 | ORCL,Oracle Corp.,Information Technology 354 | PCAR,PACCAR Inc.,Industrials 355 | PKG,Packaging Corporation of America,Materials 356 | PH,Parker-Hannifin,Industrials 357 | PAYX,Paychex Inc.,Information Technology 358 | PYPL,PayPal,Information Technology 359 | PNR,Pentair Ltd.,Industrials 360 | PBCT,People's United Financial,Financials 361 | PEP,PepsiCo Inc.,Consumer Staples 362 | PKI,PerkinElmer,Health Care 363 | PRGO,Perrigo,Health Care 364 | PFE,Pfizer Inc.,Health Care 365 | PCG,PG&E Corp.,Utilities 366 | PM,Philip Morris International,Consumer Staples 367 | PSX,Phillips 66,Energy 368 | PNW,Pinnacle West Capital,Utilities 369 | PXD,Pioneer Natural Resources,Energy 370 | PNC,PNC Financial Services,Financials 371 | RL,Polo Ralph Lauren Corp.,Consumer Discretionary 372 | PPG,PPG Industries,Materials 373 | PPL,PPL Corp.,Utilities 374 | PX,Praxair Inc.,Materials 375 | PFG,Principal Financial Group,Financials 376 | PG,Procter & Gamble,Consumer Staples 377 | PGR,Progressive Corp.,Financials 378 | PLD,Prologis,Real Estate 379 | PRU,Prudential Financial,Financials 380 | PEG,Public Serv. 
Enterprise Inc.,Utilities 381 | PSA,Public Storage,Real Estate 382 | PHM,Pulte Homes Inc.,Consumer Discretionary 383 | PVH,PVH Corp.,Consumer Discretionary 384 | QRVO,Qorvo,Information Technology 385 | QCOM,QUALCOMM Inc.,Information Technology 386 | PWR,Quanta Services Inc.,Industrials 387 | DGX,Quest Diagnostics,Health Care 388 | RRC,Range Resources Corp.,Energy 389 | RJF,Raymond James Financial Inc.,Financials 390 | RTN,Raytheon Co.,Industrials 391 | O,Realty Income Corporation,Real Estate 392 | RHT,Red Hat Inc.,Information Technology 393 | REG,Regency Centers Corporation,Real Estate 394 | REGN,Regeneron,Health Care 395 | RF,Regions Financial Corp.,Financials 396 | RSG,Republic Services Inc,Industrials 397 | RMD,ResMed,Health Care 398 | RHI,Robert Half International,Industrials 399 | ROK,Rockwell Automation Inc.,Industrials 400 | COL,Rockwell Collins,Industrials 401 | ROP,Roper Technologies,Industrials 402 | ROST,Ross Stores,Consumer Discretionary 403 | RCL,Royal Caribbean Cruises Ltd,Consumer Discretionary 404 | SPGI,"S&P Global, Inc.",Financials 405 | CRM,Salesforce.com,Information Technology 406 | SBAC,SBA Communications,Real Estate 407 | SCG,SCANA Corp,Utilities 408 | SLB,Schlumberger Ltd.,Energy 409 | STX,Seagate Technology,Information Technology 410 | SEE,Sealed Air,Materials 411 | SRE,Sempra Energy,Utilities 412 | SHW,Sherwin-Williams,Materials 413 | SPG,Simon Property Group Inc,Real Estate 414 | SWKS,Skyworks Solutions,Information Technology 415 | SLG,SL Green Realty,Real Estate 416 | SNA,Snap-On Inc.,Consumer Discretionary 417 | SO,Southern Co.,Utilities 418 | LUV,Southwest Airlines,Industrials 419 | SWK,Stanley Black & Decker,Consumer Discretionary 420 | SBUX,Starbucks Corp.,Consumer Discretionary 421 | STT,State Street Corp.,Financials 422 | SRCL,Stericycle Inc,Industrials 423 | SYK,Stryker Corp.,Health Care 424 | STI,SunTrust Banks,Financials 425 | SIVB,SVB Financial,Financials 426 | SYMC,Symantec Corp.,Information Technology 427 | SYF,Synchrony Financial,Financials 428 | SNPS,Synopsys Inc.,Information Technology 429 | SYY,Sysco Corp.,Consumer Staples 430 | TROW,T. Rowe Price Group,Financials 431 | TTWO,Take-Two Interactive,Information Technology 432 | TPR,"Tapestry, Inc.",Consumer Discretionary 433 | TGT,Target Corp.,Consumer Discretionary 434 | TEL,TE Connectivity Ltd.,Information Technology 435 | FTI,TechnipFMC,Energy 436 | TXN,Texas Instruments,Information Technology 437 | TXT,Textron Inc.,Industrials 438 | BK,The Bank of New York Mellon Corp.,Financials 439 | CLX,The Clorox Company,Consumer Staples 440 | COO,The Cooper Companies,Health Care 441 | HSY,The Hershey Company,Consumer Staples 442 | MOS,The Mosaic Company,Materials 443 | TRV,The Travelers Companies Inc.,Financials 444 | DIS,The Walt Disney Company,Consumer Discretionary 445 | TMO,Thermo Fisher Scientific,Health Care 446 | TIF,Tiffany & Co.,Consumer Discretionary 447 | TWX,Time Warner Inc.,Consumer Discretionary 448 | TJX,TJX Companies Inc.,Consumer Discretionary 449 | TMK,Torchmark Corp.,Financials 450 | TSS,Total System Services,Information Technology 451 | TSCO,Tractor Supply Company,Consumer Discretionary 452 | TDG,TransDigm Group,Industrials 453 | TRIP,TripAdvisor,Consumer Discretionary 454 | FOXA,Twenty-First Century Fox Class A,Consumer Discretionary 455 | FOX,Twenty-First Century Fox Class B,Consumer Discretionary 456 | TSN,Tyson Foods,Consumer Staples 457 | USB,U.S. 
Bancorp,Financials 458 | UDR,UDR Inc,Real Estate 459 | ULTA,Ulta Beauty,Consumer Discretionary 460 | UAA,Under Armour Class A,Consumer Discretionary 461 | UA,Under Armour Class C,Consumer Discretionary 462 | UNP,Union Pacific,Industrials 463 | UAL,United Continental Holdings,Industrials 464 | UNH,United Health Group Inc.,Health Care 465 | UPS,United Parcel Service,Industrials 466 | URI,"United Rentals, Inc.",Industrials 467 | UTX,United Technologies,Industrials 468 | UHS,"Universal Health Services, Inc.",Health Care 469 | UNM,Unum Group,Financials 470 | VFC,V.F. Corp.,Consumer Discretionary 471 | VLO,Valero Energy,Energy 472 | VAR,Varian Medical Systems,Health Care 473 | VTR,Ventas Inc,Real Estate 474 | VRSN,Verisign Inc.,Information Technology 475 | VRSK,Verisk Analytics,Industrials 476 | VZ,Verizon Communications,Telecommunication Services 477 | VRTX,Vertex Pharmaceuticals Inc,Health Care 478 | VIAB,Viacom Inc.,Consumer Discretionary 479 | V,Visa Inc.,Information Technology 480 | VNO,Vornado Realty Trust,Real Estate 481 | VMC,Vulcan Materials,Materials 482 | WMT,Wal-Mart Stores,Consumer Staples 483 | WBA,Walgreens Boots Alliance,Consumer Staples 484 | WM,Waste Management Inc.,Industrials 485 | WAT,Waters Corporation,Health Care 486 | WEC,Wec Energy Group Inc,Utilities 487 | WFC,Wells Fargo,Financials 488 | WELL,Welltower Inc.,Real Estate 489 | WDC,Western Digital,Information Technology 490 | WU,Western Union Co,Information Technology 491 | WRK,WestRock Company,Materials 492 | WY,Weyerhaeuser Corp.,Real Estate 493 | WHR,Whirlpool Corp.,Consumer Discretionary 494 | WMB,Williams Cos.,Energy 495 | WLTW,Willis Towers Watson,Financials 496 | WYN,Wyndham Worldwide,Consumer Discretionary 497 | WYNN,Wynn Resorts Ltd,Consumer Discretionary 498 | XEL,Xcel Energy Inc,Utilities 499 | XRX,Xerox Corp.,Information Technology 500 | XLNX,Xilinx Inc,Information Technology 501 | XL,XL Capital,Financials 502 | XYL,Xylem Inc.,Industrials 503 | YUM,Yum! 
Brands Inc,Consumer Discretionary 504 | ZBH,Zimmer Biomet Holdings,Health Care 505 | ZION,Zions Bancorp,Financials 506 | ZTS,Zoetis,Health Care 507 | -------------------------------------------------------------------------------- /data/dict_example_export.json: -------------------------------------------------------------------------------- 1 | {"first": "Guido", "last": "van Rossum"} -------------------------------------------------------------------------------- /data/json_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "planeId": "1xc2345g", 3 | "manufacturerDetails": { 4 | "manufacturer": "Airbus", 5 | "model": "A330", 6 | "year": 1999 7 | }, 8 | "airlineDetails": { 9 | "currentAirline": "Southwest", 10 | "previousAirlines": { 11 | "1st": "Delta" 12 | }, 13 | "lastPurchased": 2013 14 | }, 15 | "numberOfFlights": 4654 16 | } 17 | -------------------------------------------------------------------------------- /data/pickle_example.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/data/pickle_example.pickle -------------------------------------------------------------------------------- /data/pickle_example_export.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/data/pickle_example_export.pickle -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: uc-python 2 | channels: 3 | - defaults 4 | - conda-forge 5 | dependencies: 6 | - python=3.12 7 | - pip>=22.1 8 | - nbconvert>=6.1 9 | - numpy=1.26 10 | - pandas=2 11 | - seaborn=0.12 12 | - ipykernel>=6.28 13 | - jinja2>=2.11 14 | - jedi>=0.17 15 | -------------------------------------------------------------------------------- /notebooks/00-Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Introduction to Python for Data Science\n", 12 | "\n", 13 | "Gus Powers & Jay Cunningham" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "## Introductions" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "slide" 32 | } 33 | }, 34 | "source": [ 35 | "## Gus Powers\n", 36 | "\n", 37 | " \n", 38 | " \n", 57 | "
\n", 39 | "

Lead Data Scientist at 84.51°

\n", 40 | "
    \n", 41 | "
  • Creating and maintaining data science tools for internal use
  • \n", 42 | "
  • Python, Bash (shell), & R
  • \n", 43 | "
\n", 44 | "

Academic

\n", 45 | "
    \n", 46 | "
  • BS, Chemistry, Thomas More College
  • \n", 47 | "
  • MS, Chemistry, University of Cincinnati
  • \n", 48 | "
  • MS, Business Analytics, University of Cincinnati
  • \n", 49 | "
\n", 50 | "

Contact

\n", 51 | " \n", 56 | "
" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "slideshow": { 64 | "slide_type": "slide" 65 | } 66 | }, 67 | "source": [ 68 | "## Jay Cunningham\n", 69 | "\n", 70 | " \n", 71 | " \n", 88 | "
\n", 72 | "

Lead Data Scientist at 84.51°

\n", 73 | "
    \n", 74 | "
  • Researching and developing forecasting models
  • \n", 75 | "
  • Machine learning, Python
  • \n", 76 | "
\n", 77 | "

Academic

\n", 78 | "
    \n", 79 | "
  • BA, Mathematics, University of Kentucky
  • \n", 80 | "
  • MA, Economics, University of North Carolina (Greensboro)
  • \n", 81 | "
\n", 82 | "

Contact

\n", 83 | " \n", 87 | "
" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": { 94 | "slideshow": { 95 | "slide_type": "skip" 96 | } 97 | }, 98 | "source": [ 99 | "## Brad Boehmke\n", 100 | "\n", 101 | " \n", 102 | " \n", 124 | "
\n", 103 | "

Director, Data Science at 84.51°

\n", 104 | "
    \n", 105 | "
  • Productionizing models and science solutions
  • \n", 106 | "
      • R&D and prototyping new solutions
    
  • \n", 107 | "
  • Python, R, & MLOps toolchain
  • \n", 108 | "
\n", 109 | "

Academic

\n", 110 | "
    \n", 111 | "
  • BS, Kinesiology, North Dakota State University
  • \n", 112 | "
  • MS, Cost Analytics, Air Force Institute of Technology
  • \n", 113 | "
  • PhD, Logistics, Air Force Institute of Technology
  • \n", 114 | "
\n", 115 | "

Contact

\n", 116 | " \n", 123 | "
" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": { 130 | "slideshow": { 131 | "slide_type": "skip" 132 | } 133 | }, 134 | "source": [ 135 | "## Ethan Swan\n", 136 | "\n", 137 | " \n", 138 | " \n", 159 | "
\n", 139 | "

Senior Backend Engineer at ReviewTrackers

\n", 140 | "
    \n", 141 | "
      • REST API development
    
  • \n", 142 | "
  • Putting ML models in production
  • \n", 143 | "
  • Python, Go, Ruby, & ReactJS (JavaScript)
  • \n", 144 | "
\n", 145 | "

Academic

\n", 146 | "
    \n", 147 | "
  • BS, Computer Science, University of Notre Dame
  • \n", 148 | "
  • MBA, Business Analytics, University of Notre Dame
  • \n", 149 | "
\n", 150 | "

Contact

\n", 151 | " \n", 158 | "
" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": { 165 | "slideshow": { 166 | "slide_type": "slide" 167 | } 168 | }, 169 | "source": [ 170 | "### Around The Room\n", 171 | "\n", 172 | "We'll go around the room. Please share:\n", 173 | "\n", 174 | "1. Your name\n", 175 | "2. Your job or field\n", 176 | "3. How you use Python now or would like to in the future" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": { 182 | "slideshow": { 183 | "slide_type": "slide" 184 | } 185 | }, 186 | "source": [ 187 | "## Course" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": { 193 | "slideshow": { 194 | "slide_type": "slide" 195 | } 196 | }, 197 | "source": [ 198 | "### Defining Data Science\n", 199 | "\n", 200 | "
\n", 201 | "\"data-science.png\"\n", 202 | "
" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": { 208 | "slideshow": { 209 | "slide_type": "slide" 210 | } 211 | }, 212 | "source": [ 213 | "### Data Science and Technology\n", 214 | "\n", 215 | "
\n", 216 | "\"data-science-and-tech.png\"\n", 217 | "
" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": { 223 | "slideshow": { 224 | "slide_type": "slide" 225 | } 226 | }, 227 | "source": [ 228 | "### Applied Data Science\n", 229 | "\n", 230 | "
\n", 231 | "\"applied-data-science.gif\"\n", 232 | "
" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "slideshow": { 239 | "slide_type": "slide" 240 | } 241 | }, 242 | "source": [ 243 | "## Course Objectives\n", 244 | "\n", 245 | "The following are the primary learning objectives of this course:" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "slideshow": { 252 | "slide_type": "fragment" 253 | } 254 | }, 255 | "source": [ 256 | "1. Develop comprehensive skills in the importing/exporting, wrangling, aggregating and joining of data using Python.\n", 257 | "2. Establish a mental model of the Python programming language to enable future self-learning.\n", 258 | "3. Build awareness and basic skills in the core data science area of data visualization." 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": { 264 | "slideshow": { 265 | "slide_type": "slide" 266 | } 267 | }, 268 | "source": [ 269 | "## Course Agenda" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": { 275 | "slideshow": { 276 | "slide_type": "slide" 277 | } 278 | }, 279 | "source": [ 280 | "| Day | Topic | Time |\n", 281 | "| :--------:| :----------------------------------------------------------------------------- | :-----------: |\n", 282 | "| __Day 1__ | Introductions | 12:30 - 12:45 |\n", 283 | "| | Python and Jupyter Overview | 12:45 - 1:15 |\n", 284 | "| | Fundamentals | 1:15 - 2:00 |\n", 285 | "| | Break | 2:00 - 2:15 |\n", 286 | "| | Packages, Modules, Methods, Functions | 2:15 - 3:00 |\n", 287 | "| | Importing Data | 3:00 - 3:45 |\n", 288 | "| | Q\\&A | 3:45 - 4:15 |\n", 289 | "| __Day 2__ | Q\\&A | 12:45 - 1:00 |\n", 290 | "| | Selecting and Filtering Data | 1:00 - 1:45 |\n", 291 | "| | Working with Columns | 1:45 - 2:30 |\n", 292 | "| | Break | 2:30 - 2:45 |\n", 293 | "| | Case Study, pt. 1 | 2:45 - 3:45 |\n", 294 | "| | Q\\&A | 3:45 - 4:15 |" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": { 300 | "slideshow": { 301 | "slide_type": "slide" 302 | } 303 | }, 304 | "source": [ 305 | "| Day | Topic | Time |\n", 306 | "| :--------:| :----------------------------------------------------------------------------- | :-----------: |\n", 307 | "| __Day 3__ | Q\\&A | 12:45 - 1:00 |\n", 308 | "| | Review | 1:00 - 1:15 |\n", 309 | "| | Summarizing Data | 1:15 - 2:00 |\n", 310 | "| | Break | 2:00 - 2:15 |\n", 311 | "| | Summarizing Grouped Data | 2:15 - 3:00 |\n", 312 | "| | Joining Data | 3:00 - 3:45 |\n", 313 | "| | Q\\&A | 3:45 - 4:15 |\n", 314 | "| __Day 4__ | Q\\&A | 12:45 - 1:00 |\n", 315 | "| | Exporting Data | 1:00 - 1:30 |\n", 316 | "| | Visualizing Data | 1:30 - 2:30 |\n", 317 | "| | Break | 2:30 - 2:45 |\n", 318 | "| | Case Study, pt. 
2 | 2:45 - 3:45 |\n", 319 | "| | Q\\&A | 3:45 - 4:15 |" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": { 325 | "slideshow": { 326 | "slide_type": "slide" 327 | } 328 | }, 329 | "source": [ 330 | "## Technologies" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": { 336 | "cell_style": "center", 337 | "slideshow": { 338 | "slide_type": "slide" 339 | } 340 | }, 341 | "source": [ 342 | "### Binder\n", 343 | "\n", 344 | "* We've developed this class using a product named [Binder](https://mybinder.org/).\n", 345 | "* As a result, this course requires *zero* setup on your part.\n", 346 | "* There are two core techologies within the Binder repository: Python and Jupyter.\n", 347 | "\n", 348 | "*We will cover more on this shortly.*" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": { 354 | "cell_style": "split", 355 | "slideshow": { 356 | "slide_type": "slide" 357 | } 358 | }, 359 | "source": [ 360 | "### Python\n", 361 | "\n", 362 | "* Python is the programming language we'll be learning in this class.\n", 363 | "* We are using Python 3.12, the newest version of Python, for the entirety of this class.\n", 364 | "* The core libaries we will be using are `pandas` and `seaborn`." 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": { 370 | "cell_style": "split", 371 | "slideshow": { 372 | "slide_type": "fragment" 373 | } 374 | }, 375 | "source": [ 376 | "### Jupyter\n", 377 | "\n", 378 | "* Jupyter is the integrated development environment (IDE) we will be using.\n", 379 | "* This is where we will write and run our Python code.\n" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": { 385 | "slideshow": { 386 | "slide_type": "slide" 387 | } 388 | }, 389 | "source": [ 390 | "## Course Material" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": { 396 | "slideshow": { 397 | "slide_type": "fragment" 398 | } 399 | }, 400 | "source": [ 401 | "* All of the material for this course can be reached from our [GitHub](https://github.com/uc-python/intro-python-datasci) repository." 402 | ] 403 | }, 404 | { 405 | "cell_type": "markdown", 406 | "metadata": { 407 | "slideshow": { 408 | "slide_type": "fragment" 409 | } 410 | }, 411 | "source": [ 412 | "* You can either access this material through [Binder](https://mybinder.org/v2/gh/uc-python/intro-python-datasci/main) or by [downloading the material](https://github.com/uc-python/intro-python-datasci/archive/refs/heads/main.zip)\n", 413 | " and opening it via Anaconda Navigator and JupyterLab." 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": { 419 | "slideshow": { 420 | "slide_type": "slide" 421 | } 422 | }, 423 | "source": [ 424 | "### Slides *are* notebooks" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": { 430 | "slideshow": { 431 | "slide_type": "fragment" 432 | } 433 | }, 434 | "source": [ 435 | "* We will be teaching using slides." 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": { 441 | "slideshow": { 442 | "slide_type": "fragment" 443 | } 444 | }, 445 | "source": [ 446 | "* These slides are created from the notebooks in the course repository -- so you can follow along and run the code in your notebook." 
447 | ] 448 | }, 449 | { 450 | "cell_type": "markdown", 451 | "metadata": { 452 | "slideshow": { 453 | "slide_type": "slide" 454 | } 455 | }, 456 | "source": [ 457 | "### Source Code" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": { 463 | "slideshow": { 464 | "slide_type": "fragment" 465 | } 466 | }, 467 | "source": [ 468 | "* Source code for the training can be found on [GitHub](https://github.com/uc-python/intro-python-datasci)." 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": { 474 | "slideshow": { 475 | "slide_type": "fragment" 476 | } 477 | }, 478 | "source": [ 479 | "* This repository is public so you can clone (download) and/or refer to the materials at any point in the future." 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": { 485 | "slideshow": { 486 | "slide_type": "slide" 487 | } 488 | }, 489 | "source": [ 490 | "## Questions\n", 491 | "\n", 492 | "Are there any questions before moving on?" 493 | ] 494 | } 495 | ], 496 | "metadata": { 497 | "celltoolbar": "Slideshow", 498 | "kernelspec": { 499 | "display_name": "uc-python", 500 | "language": "python", 501 | "name": "python3" 502 | }, 503 | "language_info": { 504 | "codemirror_mode": { 505 | "name": "ipython", 506 | "version": 3 507 | }, 508 | "file_extension": ".py", 509 | "mimetype": "text/x-python", 510 | "name": "python", 511 | "nbconvert_exporter": "python", 512 | "pygments_lexer": "ipython3", 513 | "version": "3.11.4" 514 | }, 515 | "rise": { 516 | "autolaunch": true, 517 | "transition": "none" 518 | } 519 | }, 520 | "nbformat": 4, 521 | "nbformat_minor": 4 522 | } 523 | -------------------------------------------------------------------------------- /notebooks/01-Python-and-Jupyter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Python and Jupyter Overview\n", 12 | "\n", 13 | "![](images/python_jupyter.png)" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "slideshow": { 20 | "slide_type": "slide" 21 | } 22 | }, 23 | "source": [ 24 | "## Python" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "slide" 32 | } 33 | }, 34 | "source": [ 35 | "### Python is..." 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "fragment" 43 | } 44 | }, 45 | "source": [ 46 | "* a *high-level*, *structured*, *open-source*, *interpreted* programming language" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "slideshow": { 53 | "slide_type": "fragment" 54 | } 55 | }, 56 | "source": [ 57 | "* a really good choice for almost any programming task" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "slideshow": { 64 | "slide_type": "fragment" 65 | } 66 | }, 67 | "source": [ 68 | "* a very popular and effective choice for data science tasks" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "slideshow": { 75 | "slide_type": "slide" 76 | } 77 | }, 78 | "source": [ 79 | "According to StackOverflow Trends, more than 11 percent of Stack Overflow questions were tagged with \"python\" in late 2018. 
All other languages fell well short of this number:\n", 80 | "\n", 81 | "| Language | Percent |\n", 82 | "|----------|---------|\n", 83 | "| Python | 11.2% |\n", 84 | "| Java | 7.7% |\n", 85 | "| C++ | 2.75% |\n", 86 | "| R | 2.7% |\n", 87 | "| Matlab | < 1% |\n", 88 | "| Scala | < 1% |\n", 89 | "| SAS | < 1% |\n", 90 | "| Julia | < 1% |" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": { 96 | "slideshow": { 97 | "slide_type": "slide" 98 | }, 99 | "tags": [] 100 | }, 101 | "source": [ 102 | "More recently, the [2021 Stack Overflow Developer Survey](https://insights.stackoverflow.com/survey/2021) shows that Python is used by over 48% of all \"developers\"\n", 103 | "\n", 104 | "
\n", 105 | "\"Stack\n", 106 | "
" 107 | ] 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "metadata": { 112 | "slideshow": { 113 | "slide_type": "slide" 114 | } 115 | }, 116 | "source": [ 117 | "### Python in the real world\n", 118 | "\n", 119 | "Python is one of the *the most popular programming languages in the world*. It's commonly used for:\n", 120 | "\n", 121 | "* Application development\n", 122 | "* Scripting\n", 123 | "* Automation\n", 124 | "* Testing\n", 125 | "* Data science" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": { 131 | "slideshow": { 132 | "slide_type": "slide" 133 | } 134 | }, 135 | "source": [ 136 | "### Python in data science\n", 137 | "\n", 138 | "As previously mentioned, Python is also a popular choice in data science. For reference:" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "slideshow": { 145 | "slide_type": "fragment" 146 | } 147 | }, 148 | "source": [ 149 | " * According to KDNuggets, 65.6 percent of data scientists used Python regularly in 2018. This was an *increase* from 54 percent in 2017." 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": { 155 | "slideshow": { 156 | "slide_type": "fragment" 157 | } 158 | }, 159 | "source": [ 160 | " * In contrast, R was used by 48.5 percent of data scientists in 2018. This was a *decrease* from 63 percent in 2017." 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "slideshow": { 167 | "slide_type": "fragment" 168 | } 169 | }, 170 | "source": [ 171 | "
\n", 172 | "

Note

\n", 173 | "

Disclaimer: R is terrific and an excellent tool for data science.

\n", 174 | "
" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": { 180 | "slideshow": { 181 | "slide_type": "slide" 182 | } 183 | }, 184 | "source": [ 185 | "### Why are data scientists choosing Python?" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": { 191 | "slideshow": { 192 | "slide_type": "fragment" 193 | } 194 | }, 195 | "source": [ 196 | " * It can do anything...so everybody uses it\n", 197 | " " 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": { 203 | "slideshow": { 204 | "slide_type": "fragment" 205 | }, 206 | "tags": [] 207 | }, 208 | "source": [ 209 | "* Consistency across engineering and data science teams" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": { 215 | "slideshow": { 216 | "slide_type": "fragment" 217 | } 218 | }, 219 | "source": [ 220 | " * Open-source and community support" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "slideshow": { 227 | "slide_type": "fragment" 228 | } 229 | }, 230 | "source": [ 231 | " * Concise syntax, readability and ease-of-use" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": { 237 | "slideshow": { 238 | "slide_type": "fragment" 239 | } 240 | }, 241 | "source": [ 242 | " * Strength in numeric computations and cutting edge data science libraries" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": { 248 | "slideshow": { 249 | "slide_type": "slide" 250 | } 251 | }, 252 | "source": [ 253 | "## Jupyter" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "slideshow": { 260 | "slide_type": "slide" 261 | } 262 | }, 263 | "source": [ 264 | "### JupyterLab is..." 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": { 270 | "slideshow": { 271 | "slide_type": "fragment" 272 | } 273 | }, 274 | "source": [ 275 | "* a *language-agnostic integrated development environment (IDE)* specializing in **notebooks**" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": { 281 | "slideshow": { 282 | "slide_type": "fragment" 283 | } 284 | }, 285 | "source": [ 286 | "* a popular choice among data scientists using Python" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": { 292 | "slideshow": { 293 | "slide_type": "fragment" 294 | } 295 | }, 296 | "source": [ 297 | "
\n", 298 | "

Note

\n", 299 | "

We've chosen to use Jupyter over other popular IDEs for this course but that does not mean it is always the best IDE for writing Python.

\n", 300 | "
" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": { 306 | "slideshow": { 307 | "slide_type": "slide" 308 | } 309 | }, 310 | "source": [ 311 | "### Why are data scientists choosing Jupyter?" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": { 317 | "slideshow": { 318 | "slide_type": "fragment" 319 | } 320 | }, 321 | "source": [ 322 | "* Ad-hoc analyses and science development" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": { 328 | "slideshow": { 329 | "slide_type": "fragment" 330 | } 331 | }, 332 | "source": [ 333 | "* Synchronous data visualizations" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": { 339 | "slideshow": { 340 | "slide_type": "fragment" 341 | } 342 | }, 343 | "source": [ 344 | "* Documentation of code with accompanying comments" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "metadata": { 350 | "slideshow": { 351 | "slide_type": "fragment" 352 | } 353 | }, 354 | "source": [ 355 | "* Flexibility through extensions" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": { 361 | "slideshow": { 362 | "slide_type": "slide" 363 | } 364 | }, 365 | "source": [ 366 | "### Reasons you may want to use another IDE..." 367 | ] 368 | }, 369 | { 370 | "cell_type": "markdown", 371 | "metadata": { 372 | "slideshow": { 373 | "slide_type": "fragment" 374 | } 375 | }, 376 | "source": [ 377 | "* Jupyter is not good for developing *scripts*, or Python code that isn't run interactively" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": { 383 | "slideshow": { 384 | "slide_type": "fragment" 385 | } 386 | }, 387 | "source": [ 388 | "* It's not trivial to install and launch Jupyter without a tech background" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": { 394 | "slideshow": { 395 | "slide_type": "fragment" 396 | } 397 | }, 398 | "source": [ 399 | "
\n", 400 | "

Note

\n", 401 | "

We've minimized these challenges for this workshop by using Binder. If you are interested in installing and using Jupyter and Python on your own machine, we recommend using Anaconda to do so.

\n", 402 | "
" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": { 408 | "slideshow": { 409 | "slide_type": "slide" 410 | } 411 | }, 412 | "source": [ 413 | "## Jupyter Basics" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": { 419 | "slideshow": { 420 | "slide_type": "slide" 421 | } 422 | }, 423 | "source": [ 424 | "### Launching Jupyter \n", 425 | "\n", 426 | "Let's begin by launching Jupyter by opening the [Binder repository](https://mybinder.org/v2/gh/uc-python/intro-python-datasci/master?urlpath=lab). This is how we will access Jupyter throughout the course." 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "slideshow": { 433 | "slide_type": "fragment" 434 | } 435 | }, 436 | "source": [ 437 | "You should see a screen like this:\n", 438 | "\n", 439 | "![binder-launching.png](images/binder-launching.png)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": { 445 | "slideshow": { 446 | "slide_type": "slide" 447 | } 448 | }, 449 | "source": [ 450 | "Jupyter can also be launched via Anaconda Navigator:\n", 451 | "\n", 452 | "
\n", 453 | "\"navigator-jupyter.png\"\n", 454 | "
" 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": { 460 | "slideshow": { 461 | "slide_type": "fragment" 462 | } 463 | }, 464 | "source": [ 465 | "Note that we want to launch the JupyterLab option for this class." 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": { 471 | "slideshow": { 472 | "slide_type": "slide" 473 | } 474 | }, 475 | "source": [ 476 | "### Jupyter File Structure" 477 | ] 478 | }, 479 | { 480 | "cell_type": "markdown", 481 | "metadata": { 482 | "slideshow": { 483 | "slide_type": "fragment" 484 | } 485 | }, 486 | "source": [ 487 | "As you can see, Jupyter displays a file browser when it launches:\n", 488 | "\n", 489 | "![jupyter-file-structure.png](images/jupyter-file-structure.png)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": { 495 | "slideshow": { 496 | "slide_type": "slide" 497 | } 498 | }, 499 | "source": [ 500 | "There is just one directory for you to worry about:\n", 501 | "\n", 502 | "**notebooks** - interactive slideshows and code for you to follow along with" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": { 508 | "slideshow": { 509 | "slide_type": "slide" 510 | } 511 | }, 512 | "source": [ 513 | "### Jupyter Notebooks" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": { 519 | "slideshow": { 520 | "slide_type": "fragment" 521 | } 522 | }, 523 | "source": [ 524 | "The **notebook** is the core file used to interact with Python from Jupyter. A few details:" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": { 530 | "slideshow": { 531 | "slide_type": "fragment" 532 | } 533 | }, 534 | "source": [ 535 | "* Notebooks allow the writing AND running of Python code" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "metadata": { 541 | "slideshow": { 542 | "slide_type": "fragment" 543 | } 544 | }, 545 | "source": [ 546 | "* Notebooks are organized by **cells** - code and commentary text goes in the cells" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": { 552 | "slideshow": { 553 | "slide_type": "fragment" 554 | } 555 | }, 556 | "source": [ 557 | "* All notebook files have the extension `.ipynb` (**i**nteractive **py**thon **n**ote**b**ook)" 558 | ] 559 | }, 560 | { 561 | "cell_type": "markdown", 562 | "metadata": { 563 | "slideshow": { 564 | "slide_type": "slide" 565 | } 566 | }, 567 | "source": [ 568 | "A new Jupyter notebook can be opened from the \"launcher\" page, which opens automatically when you start JupyterLab.\n", 569 | "\n", 570 | "There may be multiple options listed in the Notebooks section, as seen here. These are different Python installations available on your computer. 
You should choose **Python 3** in this case.\n", 571 | "\n", 572 | "![open-jupyter-notebook](images/open-jupyter-notebook.png)" 573 | ] 574 | }, 575 | { 576 | "cell_type": "markdown", 577 | "metadata": { 578 | "slideshow": { 579 | "slide_type": "slide" 580 | } 581 | }, 582 | "source": [ 583 | "This will open a new notebook with a Python 3 kernel:\n", 584 | "\n", 585 | "![new-jupyter-notebook](images/new-jupyter-notebook.png)" 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "metadata": { 591 | "slideshow": { 592 | "slide_type": "slide" 593 | } 594 | }, 595 | "source": [ 596 | "### Notebook Cells" 597 | ] 598 | }, 599 | { 600 | "cell_type": "markdown", 601 | "metadata": { 602 | "slideshow": { 603 | "slide_type": "fragment" 604 | } 605 | }, 606 | "source": [ 607 | "As previously mentioned, Jupyter notebooks are organized by **cells**. These cells are at the core of a notebook:" 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": { 613 | "slideshow": { 614 | "slide_type": "fragment" 615 | } 616 | }, 617 | "source": [ 618 | "* When using Jupyter, all Python code is typed into and run from a cell" 619 | ] 620 | }, 621 | { 622 | "cell_type": "markdown", 623 | "metadata": { 624 | "slideshow": { 625 | "slide_type": "fragment" 626 | } 627 | }, 628 | "source": [ 629 | "* Comments, markdown, HTML, LaTeX can also be rendered within a cell" 630 | ] 631 | }, 632 | { 633 | "cell_type": "markdown", 634 | "metadata": { 635 | "slideshow": { 636 | "slide_type": "slide" 637 | } 638 | }, 639 | "source": [ 640 | "### Code Cells\n", 641 | "\n", 642 | "By default, all cells are code cells. This means Python code can be run by simply:\n", 643 | "\n", 644 | "1. Clicking on a cell's input area\n", 645 | "2. Typing Python code into the cell\n", 646 | "3. Pressing CTRL + RETURN (or SHIFT + RETURN)" 647 | ] 648 | }, 649 | { 650 | "cell_type": "markdown", 651 | "metadata": { 652 | "slideshow": { 653 | "slide_type": "fragment" 654 | } 655 | }, 656 | "source": [ 657 | "The results of the code will be printed to the output area:\n", 658 | "\n", 659 | "![python-code-cell.png](images/python-code-cell.png)" 660 | ] 661 | }, 662 | { 663 | "cell_type": "markdown", 664 | "metadata": { 665 | "slideshow": { 666 | "slide_type": "slide" 667 | } 668 | }, 669 | "source": [ 670 | "### Comment/Markdown/HTML/LaTeX Cells\n", 671 | "\n", 672 | "Cells can be converted to text-oriented cells by:\n", 673 | "\n", 674 | "1. Selecting a cell by clicking on it\n", 675 | "2. Clicking the \"Code\" dropdown on the edit panel\n", 676 | "3. 
Clicking the \"Markdown\" option\n", 677 | "\n", 678 | "![markdown-cell-selection.png](images/markdown-cell-selection.png)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "markdown", 683 | "metadata": { 684 | "slideshow": { 685 | "slide_type": "slide" 686 | } 687 | }, 688 | "source": [ 689 | "Text can then be typed into these cells (regular text, markdown, HTML, LaTeX):\n", 690 | "\n", 691 | "![markdown-cell-unrendered.png](images/markdown-cell-unrendered.png)" 692 | ] 693 | }, 694 | { 695 | "cell_type": "markdown", 696 | "metadata": { 697 | "slideshow": { 698 | "slide_type": "slide" 699 | } 700 | }, 701 | "source": [ 702 | "And it can be rendered by pressing CTRL + RETURN:\n", 703 | "\n", 704 | "![markdown-cell-rendered.png](images/markdown-cell-rendered.png)" 705 | ] 706 | }, 707 | { 708 | "cell_type": "markdown", 709 | "metadata": { 710 | "slideshow": { 711 | "slide_type": "slide" 712 | } 713 | }, 714 | "source": [ 715 | "### Inserting New Cells\n", 716 | "\n", 717 | "New cells can be inserted by selecting a cell by clicking on it and\n", 718 | "\n", 719 | "* Clicking the \"+\" menu button add a new cell below the selected one\n", 720 | "* Or using the keyboard shortcut `\"a\"` to insert cell above or `\"b\"` to insert cell below\n", 721 | "\n", 722 | "![insert-new-cell.png](images/insert-new-cell.png)" 723 | ] 724 | }, 725 | { 726 | "cell_type": "markdown", 727 | "metadata": { 728 | "slideshow": { 729 | "slide_type": "slide" 730 | } 731 | }, 732 | "source": [ 733 | "### Your Turn\n", 734 | "\n", 735 | "1. Create or open a notebook in Jupyter.\n", 736 | "2. Create a new *markdown* cell. Write your name in it.\n", 737 | "3. Create a new *code* cell. Write `x = 5` and run it." 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": { 743 | "slideshow": { 744 | "slide_type": "slide" 745 | } 746 | }, 747 | "source": [ 748 | "### Additional Tips" 749 | ] 750 | }, 751 | { 752 | "cell_type": "markdown", 753 | "metadata": { 754 | "slideshow": { 755 | "slide_type": "fragment" 756 | } 757 | }, 758 | "source": [ 759 | "* Notebooks can be saved by clicking \"File\" -> \"Save Notebook As...\"" 760 | ] 761 | }, 762 | { 763 | "cell_type": "markdown", 764 | "metadata": { 765 | "slideshow": { 766 | "slide_type": "fragment" 767 | } 768 | }, 769 | "source": [ 770 | "* An easy way to find a feature or its related keyboard shortcut is \"View\" -> \"Activate Command Palette\"" 771 | ] 772 | }, 773 | { 774 | "cell_type": "markdown", 775 | "metadata": { 776 | "slideshow": { 777 | "slide_type": "fragment" 778 | } 779 | }, 780 | "source": [ 781 | "* Notebooks can be downloaded from Binder in numerous formats by clicking \"File\" -> \"Download\"\n", 782 | " * This is a great way to save your work when using Binder. These notebooks can then be reloaded at any time.\n", 783 | " \n", 784 | "
\n", 785 | "

Caution!

\n", 786 | "

New content created in Binder does not persist across sessions. Also, Binder tends to shutdown after a few minutes of inactivity. Save and download your work accordingly!

\n", 787 | "
" 788 | ] 789 | }, 790 | { 791 | "cell_type": "markdown", 792 | "metadata": { 793 | "slideshow": { 794 | "slide_type": "slide" 795 | } 796 | }, 797 | "source": [ 798 | "## Questions\n", 799 | "\n", 800 | "Are there any questions before moving on?" 801 | ] 802 | } 803 | ], 804 | "metadata": { 805 | "celltoolbar": "Slideshow", 806 | "kernelspec": { 807 | "display_name": "Python 3 (ipykernel)", 808 | "language": "python", 809 | "name": "python3" 810 | }, 811 | "language_info": { 812 | "codemirror_mode": { 813 | "name": "ipython", 814 | "version": 3 815 | }, 816 | "file_extension": ".py", 817 | "mimetype": "text/x-python", 818 | "name": "python", 819 | "nbconvert_exporter": "python", 820 | "pygments_lexer": "ipython3", 821 | "version": "3.8.12" 822 | }, 823 | "rise": { 824 | "autolaunch": true, 825 | "transition": "none" 826 | } 827 | }, 828 | "nbformat": 4, 829 | "nbformat_minor": 4 830 | } 831 | -------------------------------------------------------------------------------- /notebooks/07-Review-Day-1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Review of Week 1" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "slide" 19 | } 20 | }, 21 | "source": [ 22 | "## Fundamentals" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "slideshow": { 29 | "slide_type": "slide" 30 | } 31 | }, 32 | "source": [ 33 | "### Data Types" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "tags": [] 40 | }, 41 | "source": [ 42 | "Everything in Python is an object, and every object has a type." 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": { 48 | "tags": [] 49 | }, 50 | "source": [ 51 | "Let's review the most important ones." 
52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "slideshow": { 58 | "slide_type": "slide" 59 | } 60 | }, 61 | "source": [ 62 | "**Integers** – Whole Numbers" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 1, 68 | "metadata": { 69 | "tags": [] 70 | }, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "3" 76 | ] 77 | }, 78 | "execution_count": 1, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "i = 3\n", 85 | "i" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": { 91 | "slideshow": { 92 | "slide_type": "fragment" 93 | }, 94 | "tags": [] 95 | }, 96 | "source": [ 97 | "**Floats** – Decimal Numbers" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 2, 103 | "metadata": { 104 | "tags": [] 105 | }, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/plain": [ 110 | "3.4" 111 | ] 112 | }, 113 | "execution_count": 2, 114 | "metadata": {}, 115 | "output_type": "execute_result" 116 | } 117 | ], 118 | "source": [ 119 | "f = 3.4\n", 120 | "f" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "slideshow": { 127 | "slide_type": "fragment" 128 | }, 129 | "tags": [] 130 | }, 131 | "source": [ 132 | "**Strings** – Bits of Text" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 3, 138 | "metadata": { 139 | "tags": [] 140 | }, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "'python'" 146 | ] 147 | }, 148 | "execution_count": 3, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "s = 'python'\n", 155 | "s" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "slideshow": { 162 | "slide_type": "slide" 163 | } 164 | }, 165 | "source": [ 166 | "**Lists** – Ordered collections of other Python objects" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 4, 172 | "metadata": { 173 | "tags": [] 174 | }, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "['a', 'b', 'c']" 180 | ] 181 | }, 182 | "execution_count": 4, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "l = ['a', 'b', 'c']\n", 189 | "l" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "slideshow": { 196 | "slide_type": "fragment" 197 | }, 198 | "tags": [] 199 | }, 200 | "source": [ 201 | "**Dictionaries** – A collection of key-value pairs, which let you easily look up the value for a given key" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 5, 207 | "metadata": { 208 | "tags": [] 209 | }, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "{'a': 1, 'b': 2, 'z': 26}" 215 | ] 216 | }, 217 | "execution_count": 5, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "d = {'a': 1,\n", 224 | " 'b': 2,\n", 225 | " 'z': 26}\n", 226 | "d" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": { 232 | "slideshow": { 233 | "slide_type": "slide" 234 | } 235 | }, 236 | "source": [ 237 | "**DataFrames** - Tabular datasets. Part of the Pandas library." 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 6, 243 | "metadata": { 244 | "tags": [] 245 | }, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/html": [ 250 | "
\n", 251 | "\n", 264 | "\n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | "
xy
012
134
\n", 285 | "
" 286 | ], 287 | "text/plain": [ 288 | " x y\n", 289 | "0 1 2\n", 290 | "1 3 4" 291 | ] 292 | }, 293 | "execution_count": 6, 294 | "metadata": {}, 295 | "output_type": "execute_result" 296 | } 297 | ], 298 | "source": [ 299 | "import pandas as pd\n", 300 | "df = pd.DataFrame([(1, 2), (3, 4)], columns=['x', 'y'])\n", 301 | "df" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "metadata": { 307 | "slideshow": { 308 | "slide_type": "slide" 309 | } 310 | }, 311 | "source": [ 312 | "### The `type` Function" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": { 318 | "slideshow": { 319 | "slide_type": "fragment" 320 | } 321 | }, 322 | "source": [ 323 | "You can use the `type` function to determine the type of an object." 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 7, 329 | "metadata": { 330 | "slideshow": { 331 | "slide_type": "slide" 332 | } 333 | }, 334 | "outputs": [ 335 | { 336 | "data": { 337 | "text/plain": [ 338 | "list" 339 | ] 340 | }, 341 | "execution_count": 7, 342 | "metadata": {}, 343 | "output_type": "execute_result" 344 | } 345 | ], 346 | "source": [ 347 | "x = [1, 2, 3]\n", 348 | "type(x)" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 8, 354 | "metadata": { 355 | "tags": [] 356 | }, 357 | "outputs": [ 358 | { 359 | "data": { 360 | "text/plain": [ 361 | "str" 362 | ] 363 | }, 364 | "execution_count": 8, 365 | "metadata": {}, 366 | "output_type": "execute_result" 367 | } 368 | ], 369 | "source": [ 370 | "x = 'hello'\n", 371 | "type(x)" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": { 377 | "slideshow": { 378 | "slide_type": "slide" 379 | } 380 | }, 381 | "source": [ 382 | "## Packages, Modules, and Functions" 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": { 388 | "slideshow": { 389 | "slide_type": "slide" 390 | } 391 | }, 392 | "source": [ 393 | "### Packages" 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": { 399 | "slideshow": { 400 | "slide_type": "fragment" 401 | } 402 | }, 403 | "source": [ 404 | "*Packages* (generally synonymous with *modules* or *libraries*) are extensions for Python featuring useful code." 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": { 410 | "slideshow": { 411 | "slide_type": "fragment" 412 | } 413 | }, 414 | "source": [ 415 | "Some are included in every Python install (*\"standard library\"*), while others (like Pandas, matplotlib, and more) need to be installed separately (*\"third party packages\"*)." 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": { 421 | "slideshow": { 422 | "slide_type": "fragment" 423 | } 424 | }, 425 | "source": [ 426 | "The DataFrame type, a staple of data science, comes in the Pandas package." 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "slideshow": { 433 | "slide_type": "slide" 434 | } 435 | }, 436 | "source": [ 437 | "### Functions" 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": { 443 | "slideshow": { 444 | "slide_type": "fragment" 445 | } 446 | }, 447 | "source": [ 448 | "*Functions* are executable Python code stored in a name, just like a regular variable." 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": { 454 | "slideshow": { 455 | "slide_type": "fragment" 456 | } 457 | }, 458 | "source": [ 459 | "You can call a function by putting parentheses after its name, and optionally including *arguments* to it (e.g. 
`myfunction(argument_1, argument_2)`)." 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": { 465 | "slideshow": { 466 | "slide_type": "fragment" 467 | } 468 | }, 469 | "source": [ 470 | "Well-named functions can help to simplify your code and make it much more readable." 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": { 476 | "slideshow": { 477 | "slide_type": "slide" 478 | } 479 | }, 480 | "source": [ 481 | "### Attributes and Methods" 482 | ] 483 | }, 484 | { 485 | "cell_type": "markdown", 486 | "metadata": { 487 | "slideshow": { 488 | "slide_type": "fragment" 489 | } 490 | }, 491 | "source": [ 492 | "Python objects (that's everything in Python, remember?) come with *attributes*, or internal information accessible through dot syntax:\n", 493 | "```python\n", 494 | "myobject.attribute\n", 495 | "```" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": { 501 | "slideshow": { 502 | "slide_type": "slide" 503 | } 504 | }, 505 | "source": [ 506 | "Attributes can be handy when you want to learn more about an object." 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 9, 512 | "metadata": { 513 | "slideshow": { 514 | "slide_type": "fragment" 515 | } 516 | }, 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/plain": [ 521 | "(2, 2)" 522 | ] 523 | }, 524 | "execution_count": 9, 525 | "metadata": {}, 526 | "output_type": "execute_result" 527 | } 528 | ], 529 | "source": [ 530 | "df.shape" 531 | ] 532 | }, 533 | { 534 | "cell_type": "markdown", 535 | "metadata": { 536 | "slideshow": { 537 | "slide_type": "slide" 538 | } 539 | }, 540 | "source": [ 541 | "Some attributes actually hold functions, in which case we call them *methods*." 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 10, 547 | "metadata": { 548 | "slideshow": { 549 | "slide_type": "fragment" 550 | } 551 | }, 552 | "outputs": [ 553 | { 554 | "data": { 555 | "text/html": [ 556 | "
\n", 557 | "\n", 570 | "\n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | "
xy
count2.0000002.000000
mean2.0000003.000000
std1.4142141.414214
min1.0000002.000000
25%1.5000002.500000
50%2.0000003.000000
75%2.5000003.500000
max3.0000004.000000
\n", 621 | "
" 622 | ], 623 | "text/plain": [ 624 | " x y\n", 625 | "count 2.000000 2.000000\n", 626 | "mean 2.000000 3.000000\n", 627 | "std 1.414214 1.414214\n", 628 | "min 1.000000 2.000000\n", 629 | "25% 1.500000 2.500000\n", 630 | "50% 2.000000 3.000000\n", 631 | "75% 2.500000 3.500000\n", 632 | "max 3.000000 4.000000" 633 | ] 634 | }, 635 | "execution_count": 10, 636 | "metadata": {}, 637 | "output_type": "execute_result" 638 | } 639 | ], 640 | "source": [ 641 | "df.describe()" 642 | ] 643 | }, 644 | { 645 | "cell_type": "markdown", 646 | "metadata": { 647 | "slideshow": { 648 | "slide_type": "slide" 649 | } 650 | }, 651 | "source": [ 652 | "### DataFrames and Series" 653 | ] 654 | }, 655 | { 656 | "cell_type": "markdown", 657 | "metadata": { 658 | "slideshow": { 659 | "slide_type": "fragment" 660 | } 661 | }, 662 | "source": [ 663 | "When you extract individual rows or columns of DataFrames, you get a 1-dimensional dataset called a *Series*." 664 | ] 665 | }, 666 | { 667 | "cell_type": "markdown", 668 | "metadata": { 669 | "slideshow": { 670 | "slide_type": "fragment" 671 | } 672 | }, 673 | "source": [ 674 | "Series look like lists but their data must be all of the same type, and they provide similar (though subtly different) functionality to DataFrames." 675 | ] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "metadata": { 680 | "slideshow": { 681 | "slide_type": "slide" 682 | } 683 | }, 684 | "source": [ 685 | "## Importing Data" 686 | ] 687 | }, 688 | { 689 | "cell_type": "markdown", 690 | "metadata": { 691 | "slideshow": { 692 | "slide_type": "fragment" 693 | } 694 | }, 695 | "source": [ 696 | "Importing data is the process of taking data *on disk* and moving it into *memory*, where Python can do its work." 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": { 702 | "slideshow": { 703 | "slide_type": "slide" 704 | } 705 | }, 706 | "source": [ 707 | "Reading CSVs will likely be one of the most common ways you import data." 708 | ] 709 | }, 710 | { 711 | "cell_type": "markdown", 712 | "metadata": { 713 | "slideshow": { 714 | "slide_type": "fragment" 715 | } 716 | }, 717 | "source": [ 718 | "To do so, use Pandas' `read_csv` function, passing the name of your file as an argument." 719 | ] 720 | }, 721 | { 722 | "cell_type": "markdown", 723 | "metadata": { 724 | "tags": [] 725 | }, 726 | "source": [ 727 | "```python\n", 728 | "import pandas as pd\n", 729 | "data = pd.read_csv('myfile.csv')\n", 730 | "```" 731 | ] 732 | }, 733 | { 734 | "cell_type": "markdown", 735 | "metadata": { 736 | "slideshow": { 737 | "slide_type": "slide" 738 | } 739 | }, 740 | "source": [ 741 | "Though they are less common in data science, JSON and pickle files may come up in your work as well." 742 | ] 743 | }, 744 | { 745 | "cell_type": "markdown", 746 | "metadata": { 747 | "slideshow": { 748 | "slide_type": "fragment" 749 | } 750 | }, 751 | "source": [ 752 | "These are slightly more complicated to import, but it's still very doable." 
753 | ] 754 | }, 755 | { 756 | "cell_type": "markdown", 757 | "metadata": { 758 | "slideshow": { 759 | "slide_type": "slide" 760 | } 761 | }, 762 | "source": [ 763 | "JSON:" 764 | ] 765 | }, 766 | { 767 | "cell_type": "markdown", 768 | "metadata": { 769 | "tags": [] 770 | }, 771 | "source": [ 772 | "```python\n", 773 | "import json\n", 774 | "with open('myfile.json', 'r') as f:\n", 775 | " data = json.load(f)\n", 776 | "```" 777 | ] 778 | }, 779 | { 780 | "cell_type": "markdown", 781 | "metadata": { 782 | "tags": [] 783 | }, 784 | "source": [ 785 | "Pickle:" 786 | ] 787 | }, 788 | { 789 | "cell_type": "markdown", 790 | "metadata": { 791 | "tags": [] 792 | }, 793 | "source": [ 794 | "```python\n", 795 | "import pickle\n", 796 | "with open('myfile.pickle', 'rb') as f:\n", 797 | " data = pickle.load(f)\n", 798 | "```" 799 | ] 800 | }, 801 | { 802 | "cell_type": "markdown", 803 | "metadata": { 804 | "slideshow": { 805 | "slide_type": "slide" 806 | } 807 | }, 808 | "source": [ 809 | "## Subsetting and Filtering" 810 | ] 811 | }, 812 | { 813 | "cell_type": "markdown", 814 | "metadata": { 815 | "slideshow": { 816 | "slide_type": "fragment" 817 | } 818 | }, 819 | "source": [ 820 | "There are three primary ways of subsetting data:" 821 | ] 822 | }, 823 | { 824 | "cell_type": "markdown", 825 | "metadata": { 826 | "slideshow": { 827 | "slide_type": "fragment" 828 | } 829 | }, 830 | "source": [ 831 | "- **Selecting** - Including certain *columns* of the data while excluding others" 832 | ] 833 | }, 834 | { 835 | "cell_type": "markdown", 836 | "metadata": { 837 | "slideshow": { 838 | "slide_type": "fragment" 839 | } 840 | }, 841 | "source": [ 842 | "- **Slicing** - Including only certain *rows* based on their position (index) in the DataFrame " 843 | ] 844 | }, 845 | { 846 | "cell_type": "markdown", 847 | "metadata": { 848 | "slideshow": { 849 | "slide_type": "fragment" 850 | } 851 | }, 852 | "source": [ 853 | "- **Filtering** - Including only certain *rows* with data that meets some criterion" 854 | ] 855 | }, 856 | { 857 | "cell_type": "markdown", 858 | "metadata": { 859 | "slideshow": { 860 | "slide_type": "slide" 861 | } 862 | }, 863 | "source": [ 864 | "### Selecting" 865 | ] 866 | }, 867 | { 868 | "cell_type": "markdown", 869 | "metadata": { 870 | "tags": [] 871 | }, 872 | "source": [ 873 | "Selection is done with brackets.\n", 874 | "Pass a single column name (as a string) or a list of column names." 875 | ] 876 | }, 877 | { 878 | "cell_type": "markdown", 879 | "metadata": { 880 | "tags": [] 881 | }, 882 | "source": [ 883 | "```python\n", 884 | "# The column \"mycolumn\", as a Series\n", 885 | "df['mycolumn']\n", 886 | "\n", 887 | "# The columns \"column1\" and \"column2\" as a DataFrame \n", 888 | "df[['column_1', 'column_2']]\n", 889 | "```" 890 | ] 891 | }, 892 | { 893 | "cell_type": "markdown", 894 | "metadata": { 895 | "slideshow": { 896 | "slide_type": "fragment" 897 | } 898 | }, 899 | "source": [ 900 | "
\n", 901 | "

Note

\n", 902 | "

If you pass a list, the returned value will be a DataFrame.\n", 903 | "If you pass a single column name, it will be a Series.

\n", 904 | "
" 905 | ] 906 | }, 907 | { 908 | "cell_type": "markdown", 909 | "metadata": { 910 | "slideshow": { 911 | "slide_type": "slide" 912 | } 913 | }, 914 | "source": [ 915 | "### Slicing" 916 | ] 917 | }, 918 | { 919 | "cell_type": "markdown", 920 | "metadata": { 921 | "tags": [] 922 | }, 923 | "source": [ 924 | "Slicing is typically done with the `.loc` accessor and brackets.\n", 925 | "Pass in a row index or a range of row indices." 926 | ] 927 | }, 928 | { 929 | "cell_type": "markdown", 930 | "metadata": { 931 | "tags": [] 932 | }, 933 | "source": [ 934 | "```python\n", 935 | "# The fifth (zero-indexing!) row, as a Series\n", 936 | "df.loc[4]\n", 937 | "\n", 938 | "# The second, third, and fourth rows, as a DataFrame\n", 939 | "df.loc[1:3]\n", 940 | "```" 941 | ] 942 | }, 943 | { 944 | "cell_type": "markdown", 945 | "metadata": { 946 | "slideshow": { 947 | "slide_type": "fragment" 948 | } 949 | }, 950 | "source": [ 951 | "
\n", 952 | "

Note

\n", 953 | "

If you pass a range of indices, the returned value will be a DataFrame. Otherwise it will be a Series.

\n", 954 | "
" 955 | ] 956 | }, 957 | { 958 | "cell_type": "markdown", 959 | "metadata": { 960 | "slideshow": { 961 | "slide_type": "slide" 962 | } 963 | }, 964 | "source": [ 965 | "### Filtering" 966 | ] 967 | }, 968 | { 969 | "cell_type": "markdown", 970 | "metadata": { 971 | "tags": [] 972 | }, 973 | "source": [ 974 | "DataFrames can be filtered by passing a *condition* in brackets." 975 | ] 976 | }, 977 | { 978 | "cell_type": "markdown", 979 | "metadata": { 980 | "tags": [] 981 | }, 982 | "source": [ 983 | "```python\n", 984 | "# Keep rows where `condition` is true\n", 985 | "df[condition]\n", 986 | "```" 987 | ] 988 | }, 989 | { 990 | "cell_type": "markdown", 991 | "metadata": { 992 | "slideshow": { 993 | "slide_type": "slide" 994 | } 995 | }, 996 | "source": [ 997 | "Conditions are things like tests of equality, assertions that one value is greater than another, etc." 998 | ] 999 | }, 1000 | { 1001 | "cell_type": "markdown", 1002 | "metadata": { 1003 | "slideshow": { 1004 | "slide_type": "fragment" 1005 | } 1006 | }, 1007 | "source": [ 1008 | "```python\n", 1009 | "# Keep rows where the value in \"mycolumn\" is equal to 5\n", 1010 | "df[df['mycolumn'] == 5]\n", 1011 | "```" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "markdown", 1016 | "metadata": { 1017 | "slideshow": { 1018 | "slide_type": "fragment" 1019 | } 1020 | }, 1021 | "source": [ 1022 | "```python\n", 1023 | "# Keep rows where mycolumn is less than 3 OR greater than 10\n", 1024 | "df[ (df['mycolumn'] < 3) | (df['mycolumn'] > 10) ]\n", 1025 | "```" 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "markdown", 1030 | "metadata": { 1031 | "slideshow": { 1032 | "slide_type": "slide" 1033 | } 1034 | }, 1035 | "source": [ 1036 | "### Selecting and Filtering Together" 1037 | ] 1038 | }, 1039 | { 1040 | "cell_type": "markdown", 1041 | "metadata": { 1042 | "tags": [] 1043 | }, 1044 | "source": [ 1045 | "Using `.loc`, it's possible to do selecting and filtering all in one step." 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "markdown", 1050 | "metadata": { 1051 | "tags": [] 1052 | }, 1053 | "source": [ 1054 | "```python\n", 1055 | "# Filter down to rows where column_a is equal to 5,\n", 1056 | "# and select column_b and column_c from those rows\n", 1057 | "df.loc[df['column_a'] == 5, ['column_b', 'column_c']]\n", 1058 | "```" 1059 | ] 1060 | }, 1061 | { 1062 | "cell_type": "markdown", 1063 | "metadata": { 1064 | "slideshow": { 1065 | "slide_type": "slide" 1066 | } 1067 | }, 1068 | "source": [ 1069 | "## Manipulating Columns" 1070 | ] 1071 | }, 1072 | { 1073 | "cell_type": "markdown", 1074 | "metadata": { 1075 | "slideshow": { 1076 | "slide_type": "slide" 1077 | } 1078 | }, 1079 | "source": [ 1080 | "### Numeric Calculations" 1081 | ] 1082 | }, 1083 | { 1084 | "cell_type": "markdown", 1085 | "metadata": { 1086 | "tags": [] 1087 | }, 1088 | "source": [ 1089 | "It's possible to perform calculations using columns." 
1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "markdown", 1094 | "metadata": { 1095 | "slideshow": { 1096 | "slide_type": "fragment" 1097 | } 1098 | }, 1099 | "source": [ 1100 | "```python\n", 1101 | "df['mycolumn'] + 7\n", 1102 | "```" 1103 | ] 1104 | }, 1105 | { 1106 | "cell_type": "markdown", 1107 | "metadata": { 1108 | "slideshow": { 1109 | "slide_type": "fragment" 1110 | } 1111 | }, 1112 | "source": [ 1113 | "```python\n", 1114 | "df['mycolumn'] * 4 - 3\n", 1115 | "```" 1116 | ] 1117 | }, 1118 | { 1119 | "cell_type": "markdown", 1120 | "metadata": { 1121 | "slideshow": { 1122 | "slide_type": "slide" 1123 | } 1124 | }, 1125 | "source": [ 1126 | "It's also possible to perform calculations based on values in multiple columns." 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "markdown", 1131 | "metadata": { 1132 | "slideshow": { 1133 | "slide_type": "fragment" 1134 | } 1135 | }, 1136 | "source": [ 1137 | "```python\n", 1138 | "df['column_a'] / df['column_b']\n", 1139 | "```" 1140 | ] 1141 | }, 1142 | { 1143 | "cell_type": "markdown", 1144 | "metadata": { 1145 | "slideshow": { 1146 | "slide_type": "slide" 1147 | } 1148 | }, 1149 | "source": [ 1150 | "Generally you'll want to save the calculated values in a new column, which you can do with sensible assignment syntax." 1151 | ] 1152 | }, 1153 | { 1154 | "cell_type": "markdown", 1155 | "metadata": { 1156 | "slideshow": { 1157 | "slide_type": "fragment" 1158 | } 1159 | }, 1160 | "source": [ 1161 | "```python\n", 1162 | "df['e'] = df['m'] * (df['c'] ** 2)\n", 1163 | "```" 1164 | ] 1165 | }, 1166 | { 1167 | "cell_type": "markdown", 1168 | "metadata": { 1169 | "slideshow": { 1170 | "slide_type": "slide" 1171 | } 1172 | }, 1173 | "source": [ 1174 | "### String Manipulations" 1175 | ] 1176 | }, 1177 | { 1178 | "cell_type": "markdown", 1179 | "metadata": { 1180 | "tags": [] 1181 | }, 1182 | "source": [ 1183 | "Lots of string functionality can be found within the `.str` accessor." 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "markdown", 1188 | "metadata": { 1189 | "tags": [] 1190 | }, 1191 | "source": [ 1192 | "```python\n", 1193 | "# Convert the strings in mycolumn to all caps\n", 1194 | "df['mycolumn'].str.upper()\n", 1195 | "```" 1196 | ] 1197 | }, 1198 | { 1199 | "cell_type": "markdown", 1200 | "metadata": { 1201 | "slideshow": { 1202 | "slide_type": "slide" 1203 | } 1204 | }, 1205 | "source": [ 1206 | "### Mapping Values" 1207 | ] 1208 | }, 1209 | { 1210 | "cell_type": "markdown", 1211 | "metadata": { 1212 | "tags": [] 1213 | }, 1214 | "source": [ 1215 | "In some cases you may need to convert some values to other values." 1216 | ] 1217 | }, 1218 | { 1219 | "cell_type": "markdown", 1220 | "metadata": { 1221 | "slideshow": { 1222 | "slide_type": "fragment" 1223 | } 1224 | }, 1225 | "source": [ 1226 | "This is a good case for the `.map` method of Series." 1227 | ] 1228 | }, 1229 | { 1230 | "cell_type": "markdown", 1231 | "metadata": { 1232 | "slideshow": { 1233 | "slide_type": "fragment" 1234 | } 1235 | }, 1236 | "source": [ 1237 | "Pass in a dictionary whose keys are the elements to be converted and whose values are the desired new values." 1238 | ] 1239 | }, 1240 | { 1241 | "cell_type": "code", 1242 | "execution_count": 11, 1243 | "metadata": { 1244 | "slideshow": { 1245 | "slide_type": "slide" 1246 | } 1247 | }, 1248 | "outputs": [ 1249 | { 1250 | "data": { 1251 | "text/html": [ 1252 | "
\n", 1253 | "\n", 1266 | "\n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | "
xy
012
134
\n", 1287 | "
" 1288 | ], 1289 | "text/plain": [ 1290 | " x y\n", 1291 | "0 1 2\n", 1292 | "1 3 4" 1293 | ] 1294 | }, 1295 | "execution_count": 11, 1296 | "metadata": {}, 1297 | "output_type": "execute_result" 1298 | } 1299 | ], 1300 | "source": [ 1301 | "df" 1302 | ] 1303 | }, 1304 | { 1305 | "cell_type": "code", 1306 | "execution_count": 12, 1307 | "metadata": { 1308 | "tags": [] 1309 | }, 1310 | "outputs": [ 1311 | { 1312 | "data": { 1313 | "text/html": [ 1314 | "
\n", 1315 | "\n", 1328 | "\n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | "
xy
0112
1334
\n", 1349 | "
" 1350 | ], 1351 | "text/plain": [ 1352 | " x y\n", 1353 | "0 11 2\n", 1354 | "1 33 4" 1355 | ] 1356 | }, 1357 | "execution_count": 12, 1358 | "metadata": {}, 1359 | "output_type": "execute_result" 1360 | } 1361 | ], 1362 | "source": [ 1363 | "df['x'] = df['x'].map({1: 11, 3: 33})\n", 1364 | "df" 1365 | ] 1366 | }, 1367 | { 1368 | "cell_type": "markdown", 1369 | "metadata": { 1370 | "slideshow": { 1371 | "slide_type": "slide" 1372 | } 1373 | }, 1374 | "source": [ 1375 | "## Practice\n", 1376 | "\n", 1377 | "1. Load the weather data (`weather.csv`) from the data folder of our repository. Store it in a variable called `weather`.\n", 1378 | "2. Keep only the rows that have precipitation (i.e. `precip > 0`).\n", 1379 | "3. Create a new column, \"air_hazard_rating\", that is `wind_speed / 2 + visib`.\n", 1380 | "4. Keep only the \"origin\" and \"time\" columns." 1381 | ] 1382 | }, 1383 | { 1384 | "cell_type": "markdown", 1385 | "metadata": { 1386 | "slideshow": { 1387 | "slide_type": "slide" 1388 | } 1389 | }, 1390 | "source": [ 1391 | "# Questions\n", 1392 | "\n", 1393 | "Are there any questions before we move on?" 1394 | ] 1395 | } 1396 | ], 1397 | "metadata": { 1398 | "kernelspec": { 1399 | "display_name": "Python 3 (ipykernel)", 1400 | "language": "python", 1401 | "name": "python3" 1402 | }, 1403 | "language_info": { 1404 | "codemirror_mode": { 1405 | "name": "ipython", 1406 | "version": 3 1407 | }, 1408 | "file_extension": ".py", 1409 | "mimetype": "text/x-python", 1410 | "name": "python", 1411 | "nbconvert_exporter": "python", 1412 | "pygments_lexer": "ipython3", 1413 | "version": "3.11.4" 1414 | }, 1415 | "rise": { 1416 | "autolaunch": true, 1417 | "transition": "none" 1418 | } 1419 | }, 1420 | "nbformat": 4, 1421 | "nbformat_minor": 4 1422 | } 1423 | -------------------------------------------------------------------------------- /notebooks/09-Summarizing-Grouped-Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Summarizing Grouped Data" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "slide" 19 | } 20 | }, 21 | "source": [ 22 | "## Applied Review" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "slideshow": { 29 | "slide_type": "slide" 30 | } 31 | }, 32 | "source": [ 33 | "### DataFrame Structure" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "tags": [] 40 | }, 41 | "source": [ 42 | "* We will start by importing the `planes` data set as a DataFrame:" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 1, 48 | "metadata": { 49 | "slideshow": { 50 | "slide_type": "-" 51 | } 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "import pandas as pd\n", 56 | "planes_df = pd.read_csv('../data/planes.csv')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": { 62 | "tags": [] 63 | }, 64 | "source": [ 65 | "* Each DataFrame variable is a **Series** and can be accessed with bracket subsetting notation: \n", 66 | "\n", 67 | "```python \n", 68 | "DataFrame['SeriesName']\n", 69 | "```" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "tags": [] 76 | }, 77 | "source": [ 78 | "* The DataFrame has an **Index** that is visible the far left side" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | 
"slideshow": { 85 | "slide_type": "slide" 86 | } 87 | }, 88 | "source": [ 89 | "### Summary Operations" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "cell_style": "split", 96 | "tags": [] 97 | }, 98 | "source": [ 99 | "* Summary operations occur when we collapse a Series or DataFrame down to a single row" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 | "cell_style": "split", 106 | "tags": [] 107 | }, 108 | "source": [ 109 | "* This is an aggregation of a variable across its rows" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "cell_style": "center", 116 | "tags": [] 117 | }, 118 | "source": [ 119 | "
\n", 120 | "\"aggregate-series.png\"\n", 121 | "
" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "slideshow": { 128 | "slide_type": "slide" 129 | } 130 | }, 131 | "source": [ 132 | "### Summarizing Data Frames" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": { 138 | "tags": [] 139 | }, 140 | "source": [ 141 | "* We can perform summary operations on DataFrames in a number of ways:\n", 142 | " * Summary methods for a specific summary operation: \n", 143 | " ```python \n", 144 | " DataFrame.sum()\n", 145 | " ```\n", 146 | " * Describe method for a collection of summary operations: \n", 147 | " ```python\n", 148 | " DataFrame.describe()\n", 149 | " ```\n", 150 | " * Agg method for flexibility in summary operations: \n", 151 | " ```python\n", 152 | " DataFrame.agg({'VariableName': ['sum', 'mean']})\n", 153 | " ```" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "slideshow": { 160 | "slide_type": "slide" 161 | } 162 | }, 163 | "source": [ 164 | "* An example of the agg method:" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 2, 170 | "metadata": { 171 | "scrolled": true, 172 | "slideshow": { 173 | "slide_type": "-" 174 | } 175 | }, 176 | "outputs": [ 177 | { 178 | "data": { 179 | "text/html": [ 180 | "
\n", 181 | "\n", 194 | "\n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | "
yearseats
mean2000.48401154.316376
median2001.00000NaN
maxNaN450.000000
\n", 220 | "
" 221 | ], 222 | "text/plain": [ 223 | " year seats\n", 224 | "mean 2000.48401 154.316376\n", 225 | "median 2001.00000 NaN\n", 226 | "max NaN 450.000000" 227 | ] 228 | }, 229 | "execution_count": 2, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "planes_df.agg({\n", 236 | " 'year': ['mean', 'median'],\n", 237 | " 'seats': ['mean', 'max']\n", 238 | "})" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": { 244 | "slideshow": { 245 | "slide_type": "fragment" 246 | } 247 | }, 248 | "source": [ 249 | "
\n", 250 | "

Note

\n", 251 | "

We will primarily use the .agg() method moving forward.

\n", 252 | "
" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": { 258 | "slideshow": { 259 | "slide_type": "slide" 260 | } 261 | }, 262 | "source": [ 263 | "## General Model" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": { 269 | "slideshow": { 270 | "slide_type": "slide" 271 | } 272 | }, 273 | "source": [ 274 | "### Variable Groups" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": { 280 | "cell_style": "split", 281 | "tags": [] 282 | }, 283 | "source": [ 284 | "* We can group DataFrame rows together by the value in a Series/variable\n", 285 | "* If we \"group by A\", then rows with the same value in variable A are in the same group" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": { 291 | "cell_style": "split", 292 | "tags": [] 293 | }, 294 | "source": [ 295 | "" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": { 301 | "cell_style": "split", 302 | "slideshow": { 303 | "slide_type": "slide" 304 | } 305 | }, 306 | "source": [ 307 | "* Note that groups do not need to be ordered by their values:" 308 | ] 309 | }, 310 | { 311 | "cell_type": "markdown", 312 | "metadata": { 313 | "cell_style": "split", 314 | "slideshow": { 315 | "slide_type": "-" 316 | } 317 | }, 318 | "source": [ 319 | "" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": { 325 | "slideshow": { 326 | "slide_type": "slide" 327 | } 328 | }, 329 | "source": [ 330 | "
\n", 331 | "

Question

\n", 332 | "

Why might we be interested in grouping by a variable?

\n", 333 | "
" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": { 339 | "slideshow": { 340 | "slide_type": "slide" 341 | }, 342 | "tags": [] 343 | }, 344 | "source": [ 345 | "### Summarizing by Groups" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": { 351 | "tags": [] 352 | }, 353 | "source": [ 354 | "* When we've talked about **summary** operations, we've talked about collapsing a DataFrame to a single row" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": { 360 | "tags": [] 361 | }, 362 | "source": [ 363 | "* This is not always the case -- we sometimes collapse to a *single row per group*" 364 | ] 365 | }, 366 | { 367 | "cell_type": "markdown", 368 | "metadata": { 369 | "tags": [] 370 | }, 371 | "source": [ 372 | "* This is known as a grouped aggregation:" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": { 378 | "slideshow": { 379 | "slide_type": "slide" 380 | } 381 | }, 382 | "source": [ 383 | "![summarizing-by-groups.png](images/summarizing-by-groups.png)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": { 389 | "slideshow": { 390 | "slide_type": "slide" 391 | } 392 | }, 393 | "source": [ 394 | "* This can be useful when we want to aggregate by cateogory:\n", 395 | " * Maximum temperature *by month*\n", 396 | " * Total home runs *by team*\n", 397 | " * Total sales *by geography*\n", 398 | " * Average number of seats by plane manufacturer" 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": { 404 | "slideshow": { 405 | "slide_type": "fragment" 406 | }, 407 | "tags": [] 408 | }, 409 | "source": [ 410 | "
\n", 411 | "

Question

\n", 412 | "

What are common grouped aggregation metrics used in your industry/organization?

\n", 413 | "
" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": { 419 | "slideshow": { 420 | "slide_type": "slide" 421 | } 422 | }, 423 | "source": [ 424 | "## Summarizing Grouped Data" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": { 430 | "tags": [] 431 | }, 432 | "source": [ 433 | "* When we summarize by groups, we can use the same aggregation methods we previously did\n", 434 | " * Summary methods for a specific summary operation: \n", 435 | " ```python\n", 436 | " DataFrame.sum()\n", 437 | " ```\n", 438 | " * Describe method for a collection of summary operations: \n", 439 | " ```python\n", 440 | " DataFrame.describe()\n", 441 | " ```\n", 442 | " * Agg method for flexibility in summary operations: \n", 443 | " ```python\n", 444 | " DataFrame.agg({'VariableName': ['sum', 'mean']})\n", 445 | " ```" 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": { 451 | "slideshow": { 452 | "slide_type": "slide" 453 | } 454 | }, 455 | "source": [ 456 | "* The only difference is the need to **set the DataFrame group prior to aggregating**" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": { 462 | "slideshow": { 463 | "slide_type": "slide" 464 | } 465 | }, 466 | "source": [ 467 | "### Setting the DataFrame Group" 468 | ] 469 | }, 470 | { 471 | "cell_type": "markdown", 472 | "metadata": { 473 | "tags": [] 474 | }, 475 | "source": [ 476 | "* We can set the DataFrame group by calling the `DataFrame.groupby()` method and passing a variable name:" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 3, 482 | "metadata": { 483 | "slideshow": { 484 | "slide_type": "-" 485 | } 486 | }, 487 | "outputs": [ 488 | { 489 | "data": { 490 | "text/plain": [ 491 | "" 492 | ] 493 | }, 494 | "execution_count": 3, 495 | "metadata": {}, 496 | "output_type": "execute_result" 497 | } 498 | ], 499 | "source": [ 500 | "planes_df.groupby('model')" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": { 506 | "slideshow": { 507 | "slide_type": "fragment" 508 | } 509 | }, 510 | "source": [ 511 | "* Notice that a DataFrame doesn't print when it's grouped" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": { 517 | "slideshow": { 518 | "slide_type": "fragment" 519 | } 520 | }, 521 | "source": [ 522 | "* The `groupby()` method is just setting the group - you can see the changed DataFrame class:" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 4, 528 | "metadata": { 529 | "slideshow": { 530 | "slide_type": "-" 531 | } 532 | }, 533 | "outputs": [ 534 | { 535 | "data": { 536 | "text/plain": [ 537 | "pandas.core.groupby.generic.DataFrameGroupBy" 538 | ] 539 | }, 540 | "execution_count": 4, 541 | "metadata": {}, 542 | "output_type": "execute_result" 543 | } 544 | ], 545 | "source": [ 546 | "type(planes_df.groupby('manufacturer'))" 547 | ] 548 | }, 549 | { 550 | "cell_type": "markdown", 551 | "metadata": { 552 | "slideshow": { 553 | "slide_type": "slide" 554 | } 555 | }, 556 | "source": [ 557 | "* If we then call an aggregation method, we will see the DataFrame returned with the aggregated results:" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 5, 563 | "metadata": { 564 | "slideshow": { 565 | "slide_type": "-" 566 | } 567 | }, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/html": [ 572 | "
\n", 573 | "\n", 590 | "\n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | "
seats
meanmax
manufacturer
AGUSTA SPA8.0000008
AIRBUS221.202381379
AIRBUS INDUSTRIE187.402500379
AMERICAN AIRCRAFT INC2.0000002
AVIAT AIRCRAFT INC2.0000002
\n", 635 | "
" 636 | ], 637 | "text/plain": [ 638 | " seats \n", 639 | " mean max\n", 640 | "manufacturer \n", 641 | "AGUSTA SPA 8.000000 8\n", 642 | "AIRBUS 221.202381 379\n", 643 | "AIRBUS INDUSTRIE 187.402500 379\n", 644 | "AMERICAN AIRCRAFT INC 2.000000 2\n", 645 | "AVIAT AIRCRAFT INC 2.000000 2" 646 | ] 647 | }, 648 | "execution_count": 5, 649 | "metadata": {}, 650 | "output_type": "execute_result" 651 | } 652 | ], 653 | "source": [ 654 | "(\n", 655 | " planes_df.groupby('manufacturer')\n", 656 | " .agg({'seats': ['mean', 'max']}).head()\n", 657 | ")" 658 | ] 659 | }, 660 | { 661 | "cell_type": "markdown", 662 | "metadata": { 663 | "slideshow": { 664 | "slide_type": "slide" 665 | } 666 | }, 667 | "source": [ 668 | "* This process always follows this model:\n", 669 | "\n", 670 | "![model-for-grouped-aggs.png](images/model-for-grouped-aggs.png)" 671 | ] 672 | }, 673 | { 674 | "cell_type": "markdown", 675 | "metadata": { 676 | "slideshow": { 677 | "slide_type": "slide" 678 | } 679 | }, 680 | "source": [ 681 | "* **Notice that the grouped variable becomes the Index in our example!**" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": 6, 687 | "metadata": { 688 | "slideshow": { 689 | "slide_type": "-" 690 | } 691 | }, 692 | "outputs": [ 693 | { 694 | "data": { 695 | "text/html": [ 696 | "
\n", 697 | "\n", 714 | "\n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | "
seats
meanmax
manufacturer
AGUSTA SPA8.0000008
AIRBUS221.202381379
AIRBUS INDUSTRIE187.402500379
AMERICAN AIRCRAFT INC2.0000002
AVIAT AIRCRAFT INC2.0000002
\n", 759 | "
" 760 | ], 761 | "text/plain": [ 762 | " seats \n", 763 | " mean max\n", 764 | "manufacturer \n", 765 | "AGUSTA SPA 8.000000 8\n", 766 | "AIRBUS 221.202381 379\n", 767 | "AIRBUS INDUSTRIE 187.402500 379\n", 768 | "AMERICAN AIRCRAFT INC 2.000000 2\n", 769 | "AVIAT AIRCRAFT INC 2.000000 2" 770 | ] 771 | }, 772 | "execution_count": 6, 773 | "metadata": {}, 774 | "output_type": "execute_result" 775 | } 776 | ], 777 | "source": [ 778 | "(\n", 779 | " planes_df.groupby('manufacturer')\n", 780 | " .agg({'seats': ['mean', 'max']}).head()\n", 781 | ")" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": 7, 787 | "metadata": { 788 | "slideshow": { 789 | "slide_type": "slide" 790 | } 791 | }, 792 | "outputs": [ 793 | { 794 | "data": { 795 | "text/plain": [ 796 | "Index(['AGUSTA SPA', 'AIRBUS', 'AIRBUS INDUSTRIE', 'AMERICAN AIRCRAFT INC',\n", 797 | " 'AVIAT AIRCRAFT INC', 'AVIONS MARCEL DASSAULT', 'BARKER JACK L',\n", 798 | " 'BEECH', 'BELL', 'BOEING', 'BOMBARDIER INC', 'CANADAIR', 'CANADAIR LTD',\n", 799 | " 'CESSNA', 'CIRRUS DESIGN CORP', 'DEHAVILLAND', 'DOUGLAS', 'EMBRAER',\n", 800 | " 'FRIEDEMANN JON', 'GULFSTREAM AEROSPACE', 'HURLEY JAMES LARRY',\n", 801 | " 'JOHN G HESS', 'KILDALL GARY', 'LAMBERT RICHARD', 'LEARJET INC',\n", 802 | " 'LEBLANC GLENN T', 'MARZ BARRY', 'MCDONNELL DOUGLAS',\n", 803 | " 'MCDONNELL DOUGLAS AIRCRAFT CO', 'MCDONNELL DOUGLAS CORPORATION',\n", 804 | " 'PAIR MIKE E', 'PIPER', 'ROBINSON HELICOPTER CO', 'SIKORSKY',\n", 805 | " 'STEWART MACO'],\n", 806 | " dtype='object', name='manufacturer')" 807 | ] 808 | }, 809 | "execution_count": 7, 810 | "metadata": {}, 811 | "output_type": "execute_result" 812 | } 813 | ], 814 | "source": [ 815 | "(\n", 816 | " planes_df.groupby('manufacturer')\n", 817 | " .agg({'seats': ['mean', 'max']}).index\n", 818 | ")" 819 | ] 820 | }, 821 | { 822 | "cell_type": "markdown", 823 | "metadata": { 824 | "slideshow": { 825 | "slide_type": "slide" 826 | } 827 | }, 828 | "source": [ 829 | "### Groups as Indexes" 830 | ] 831 | }, 832 | { 833 | "cell_type": "markdown", 834 | "metadata": { 835 | "tags": [] 836 | }, 837 | "source": [ 838 | "* This is the default behavior of `pandas`, and probably how `pandas` wants to be used" 839 | ] 840 | }, 841 | { 842 | "cell_type": "markdown", 843 | "metadata": { 844 | "tags": [] 845 | }, 846 | "source": [ 847 | "* This is the fastest way to do it, but it's a matter of less than a millisecond" 848 | ] 849 | }, 850 | { 851 | "cell_type": "markdown", 852 | "metadata": {}, 853 | "source": [ 854 | "* You aren't always going to see people group by the Index..." 855 | ] 856 | }, 857 | { 858 | "cell_type": "markdown", 859 | "metadata": { 860 | "slideshow": { 861 | "slide_type": "slide" 862 | } 863 | }, 864 | "source": [ 865 | "### Groups as Variables" 866 | ] 867 | }, 868 | { 869 | "cell_type": "markdown", 870 | "metadata": { 871 | "tags": [] 872 | }, 873 | "source": [ 874 | "* Instead of setting the group as the Index, we can set the group as a variable" 875 | ] 876 | }, 877 | { 878 | "cell_type": "markdown", 879 | "metadata": { 880 | "slideshow": { 881 | "slide_type": "fragment" 882 | } 883 | }, 884 | "source": [ 885 | "* The grouped variable can remain a Series/variable by adding the `as_index = False` parameter/argument to `groupby()`:" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": 8, 891 | "metadata": { 892 | "slideshow": { 893 | "slide_type": "-" 894 | } 895 | }, 896 | "outputs": [ 897 | { 898 | "data": { 899 | "text/html": [ 900 | "
\n", 901 | "\n", 914 | "\n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | "
manufacturerseats
meanmax
0AGUSTA SPA8.0000008
1AIRBUS221.202381379
2AIRBUS INDUSTRIE187.402500379
3AMERICAN AIRCRAFT INC2.0000002
4AVIAT AIRCRAFT INC2.0000002
\n", 961 | "
" 962 | ], 963 | "text/plain": [ 964 | " manufacturer seats \n", 965 | " mean max\n", 966 | "0 AGUSTA SPA 8.000000 8\n", 967 | "1 AIRBUS 221.202381 379\n", 968 | "2 AIRBUS INDUSTRIE 187.402500 379\n", 969 | "3 AMERICAN AIRCRAFT INC 2.000000 2\n", 970 | "4 AVIAT AIRCRAFT INC 2.000000 2" 971 | ] 972 | }, 973 | "execution_count": 8, 974 | "metadata": {}, 975 | "output_type": "execute_result" 976 | } 977 | ], 978 | "source": [ 979 | "(\n", 980 | " planes_df.groupby('manufacturer', as_index = False)\n", 981 | " .agg({'seats': ['mean', 'max']}).head()\n", 982 | ")" 983 | ] 984 | }, 985 | { 986 | "cell_type": "markdown", 987 | "metadata": { 988 | "slideshow": { 989 | "slide_type": "slide" 990 | } 991 | }, 992 | "source": [ 993 | "### Grouping by Multiple Variables" 994 | ] 995 | }, 996 | { 997 | "cell_type": "markdown", 998 | "metadata": { 999 | "tags": [] 1000 | }, 1001 | "source": [ 1002 | "* Sometimes we have multiple categories by which we'd like to group" 1003 | ] 1004 | }, 1005 | { 1006 | "cell_type": "markdown", 1007 | "metadata": { 1008 | "slideshow": { 1009 | "slide_type": "fragment" 1010 | } 1011 | }, 1012 | "source": [ 1013 | "* To extend our example, assume we want to find the average number of seats by plane manufacturer AND plane year" 1014 | ] 1015 | }, 1016 | { 1017 | "cell_type": "markdown", 1018 | "metadata": { 1019 | "slideshow": { 1020 | "slide_type": "fragment" 1021 | } 1022 | }, 1023 | "source": [ 1024 | "* We can pass a list of variable names to the `groupby()` method:" 1025 | ] 1026 | }, 1027 | { 1028 | "cell_type": "code", 1029 | "execution_count": 9, 1030 | "metadata": { 1031 | "slideshow": { 1032 | "slide_type": "-" 1033 | } 1034 | }, 1035 | "outputs": [ 1036 | { 1037 | "data": { 1038 | "text/html": [ 1039 | "
\n", 1040 | "\n", 1053 | "\n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | "
manufactureryearseats
meanmax
0AGUSTA SPA2001.08.0000008
1AIRBUS2002.0173.800000200
2AIRBUS2003.0174.966667200
3AIRBUS2004.0217.000000379
4AIRBUS2005.0197.000000379
\n", 1107 | "
" 1108 | ], 1109 | "text/plain": [ 1110 | " manufacturer year seats \n", 1111 | " mean max\n", 1112 | "0 AGUSTA SPA 2001.0 8.000000 8\n", 1113 | "1 AIRBUS 2002.0 173.800000 200\n", 1114 | "2 AIRBUS 2003.0 174.966667 200\n", 1115 | "3 AIRBUS 2004.0 217.000000 379\n", 1116 | "4 AIRBUS 2005.0 197.000000 379" 1117 | ] 1118 | }, 1119 | "execution_count": 9, 1120 | "metadata": {}, 1121 | "output_type": "execute_result" 1122 | } 1123 | ], 1124 | "source": [ 1125 | "(\n", 1126 | " planes_df.groupby(['manufacturer', 'year'], as_index = False)\n", 1127 | " .agg({'seats': ['mean', 'max']}).head()\n", 1128 | ")" 1129 | ] 1130 | }, 1131 | { 1132 | "cell_type": "markdown", 1133 | "metadata": { 1134 | "slideshow": { 1135 | "slide_type": "slide" 1136 | } 1137 | }, 1138 | "source": [ 1139 | "### Your Turn\n", 1140 | "\n", 1141 | "1\\. What is meant by \"find the minimum number of seats on a plane by year\"?\n", 1142 | "\n", 1143 | "2\\. Fix the below code to find the minimum number of seats on a plane by year:\n", 1144 | "\n", 1145 | " ```python\n", 1146 | " planes_df.groupby('_____').agg({'_____': ['min']})\n", 1147 | " ```\n", 1148 | " \n", 1149 | "3\\. What is the Index of the result?" 1150 | ] 1151 | }, 1152 | { 1153 | "cell_type": "markdown", 1154 | "metadata": { 1155 | "slideshow": { 1156 | "slide_type": "slide" 1157 | } 1158 | }, 1159 | "source": [ 1160 | "## Questions\n", 1161 | "\n", 1162 | "Are there any questions before we move on?" 1163 | ] 1164 | } 1165 | ], 1166 | "metadata": { 1167 | "celltoolbar": "Slideshow", 1168 | "kernelspec": { 1169 | "display_name": "Python 3 (ipykernel)", 1170 | "language": "python", 1171 | "name": "python3" 1172 | }, 1173 | "language_info": { 1174 | "codemirror_mode": { 1175 | "name": "ipython", 1176 | "version": 3 1177 | }, 1178 | "file_extension": ".py", 1179 | "mimetype": "text/x-python", 1180 | "name": "python", 1181 | "nbconvert_exporter": "python", 1182 | "pygments_lexer": "ipython3", 1183 | "version": "3.11.4" 1184 | }, 1185 | "rise": { 1186 | "autolaunch": true, 1187 | "transition": "none" 1188 | } 1189 | }, 1190 | "nbformat": 4, 1191 | "nbformat_minor": 4 1192 | } 1193 | -------------------------------------------------------------------------------- /notebooks/11-Exporting-Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Exporting Data" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "-" 19 | } 20 | }, 21 | "source": [ 22 | "> Data science is not effective without saving results.\n", 23 | ">\n", 24 | "> \\- Another wise person" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "slideshow": { 31 | "slide_type": "slide" 32 | } 33 | }, 34 | "source": [ 35 | "## Applied Review" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "slideshow": { 42 | "slide_type": "slide" 43 | } 44 | }, 45 | "source": [ 46 | "### Data in Python" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "tags": [] 53 | }, 54 | "source": [ 55 | "* Data is frequently represented inside a **DataFrame** - a class from the pandas library" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "tags": [] 62 | }, 63 | "source": [ 64 | "* Other structures exist, too - dicts, models, etc." 
65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": { 70 | "tags": [] 71 | }, 72 | "source": [ 73 | "* Data is stored in memory - this makes it relatively quickly accessible" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "tags": [] 80 | }, 81 | "source": [ 82 | "* Data is session-specific, so quitting Python (i.e shutting down JupyterLab) removes the data from memory" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": { 88 | "slideshow": { 89 | "slide_type": "slide" 90 | } 91 | }, 92 | "source": [ 93 | "### Importing Data" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": { 99 | "tags": [] 100 | }, 101 | "source": [ 102 | "* Tabular data can be imported into DataFrames using the `pd.read_csv()` function - there are parameters for different options and other `pd.read_xxx()` functions." 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "slideshow": { 109 | "slide_type": "fragment" 110 | } 111 | }, 112 | "source": [ 113 | "* Other data formats like JSON (key-value pairs) and Pickle (native Python) can be imported using the `with` statement and respective functions:\n", 114 | " * JSON files use the `load()` function from the `json` library\n", 115 | " * Pickle files use the `load()` function from the `pickle` library" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "slideshow": { 122 | "slide_type": "slide" 123 | } 124 | }, 125 | "source": [ 126 | "## General Model" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": { 132 | "slideshow": { 133 | "slide_type": "slide" 134 | } 135 | }, 136 | "source": [ 137 | "### General Framework" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": { 143 | "tags": [] 144 | }, 145 | "source": [ 146 | "A general way to conceptualize data export from Python to Disk:" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": { 152 | "tags": [] 153 | }, 154 | "source": [ 155 | "1. Data sits in memory in the Python session" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "tags": [] 162 | }, 163 | "source": [ 164 | "2. Python code can be used to copy the data from Python's memory to an appropriate format on disk" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "slideshow": { 171 | "slide_type": "slide" 172 | } 173 | }, 174 | "source": [ 175 | "This framework can be visualized below:\n", 176 | "\n", 177 | "
\n", 178 | "\"export-framework.png\"\n", 179 | "
" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "metadata": { 185 | "slideshow": { 186 | "slide_type": "slide" 187 | } 188 | }, 189 | "source": [ 190 | "## Exporting DataFrames" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "tags": [] 197 | }, 198 | "source": [ 199 | "Remember that DataFrames are representations of tabular data -- therefore, knowing how to export DataFrames to tabular data files is important." 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": { 205 | "slideshow": { 206 | "slide_type": "slide" 207 | } 208 | }, 209 | "source": [ 210 | "### Exporting Setup" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": { 216 | "tags": [] 217 | }, 218 | "source": [ 219 | "We need data to export." 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": { 225 | "tags": [] 226 | }, 227 | "source": [ 228 | "Let's begin by revisiting the importing of tabular data into a DataFrame:" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 1, 234 | "metadata": { 235 | "slideshow": { 236 | "slide_type": "-" 237 | } 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "import pandas as pd\n", 242 | "planes_df = pd.read_csv('../data/planes.csv')" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": { 248 | "slideshow": { 249 | "slide_type": "fragment" 250 | } 251 | }, 252 | "source": [ 253 | "Next, let's do some manipulations on `planes_df`." 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "slideshow": { 260 | "slide_type": "slide" 261 | } 262 | }, 263 | "source": [ 264 | "
\n", 265 | "

Question

\n", 266 | "

How do we select the year and manufacturer variables while returning a DataFrame?

\n", 267 | "
" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 2, 273 | "metadata": { 274 | "slideshow": { 275 | "slide_type": "fragment" 276 | } 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "planes_df = planes_df[['year', 'manufacturer']]" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": { 286 | "slideshow": { 287 | "slide_type": "slide" 288 | } 289 | }, 290 | "source": [ 291 | "
\n", 292 | "

Question

\n", 293 | "

How do we compute the average year by manufacturer?

\n", 294 | "
" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 3, 300 | "metadata": { 301 | "slideshow": { 302 | "slide_type": "fragment" 303 | } 304 | }, 305 | "outputs": [], 306 | "source": [ 307 | "avg_year_by_man_df = (\n", 308 | " planes_df.groupby('manufacturer', as_index = False)\n", 309 | " .mean()\n", 310 | ")" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": { 316 | "slideshow": { 317 | "slide_type": "slide" 318 | } 319 | }, 320 | "source": [ 321 | "Let's view our result to find the manufacturers with the oldest planes:" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 4, 327 | "metadata": { 328 | "slideshow": { 329 | "slide_type": "-" 330 | } 331 | }, 332 | "outputs": [ 333 | { 334 | "data": { 335 | "text/html": [ 336 | "
\n", 337 | "\n", 350 | "\n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | "
manufactureryear
16DOUGLAS1956.000000
15DEHAVILLAND1959.000000
7BEECH1969.500000
13CESSNA1972.444444
12CANADAIR LTD1974.000000
\n", 386 | "
" 387 | ], 388 | "text/plain": [ 389 | " manufacturer year\n", 390 | "16 DOUGLAS 1956.000000\n", 391 | "15 DEHAVILLAND 1959.000000\n", 392 | "7 BEECH 1969.500000\n", 393 | "13 CESSNA 1972.444444\n", 394 | "12 CANADAIR LTD 1974.000000" 395 | ] 396 | }, 397 | "execution_count": 4, 398 | "metadata": {}, 399 | "output_type": "execute_result" 400 | } 401 | ], 402 | "source": [ 403 | "avg_year_by_man_df.sort_values('year').head()" 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": { 409 | "slideshow": { 410 | "slide_type": "slide" 411 | } 412 | }, 413 | "source": [ 414 | "### Exporting DataFrames with Pandas" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": { 420 | "slideshow": { 421 | "slide_type": "fragment" 422 | } 423 | }, 424 | "source": [ 425 | "DataFrames can be exported using a method built-in to the DataFrame object itself: `DataFrame.to_csv()`." 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 5, 431 | "metadata": { 432 | "slideshow": { 433 | "slide_type": "-" 434 | } 435 | }, 436 | "outputs": [], 437 | "source": [ 438 | "avg_year_by_man_df.to_csv('../data/avg_year_by_man.csv')" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": { 444 | "slideshow": { 445 | "slide_type": "slide" 446 | } 447 | }, 448 | "source": [ 449 | "Let's reimport to see the tabular data we just exported:" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 6, 455 | "metadata": { 456 | "slideshow": { 457 | "slide_type": "fragment" 458 | } 459 | }, 460 | "outputs": [ 461 | { 462 | "data": { 463 | "text/html": [ 464 | "
\n", 465 | "\n", 478 | "\n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | "
Unnamed: 0manufactureryear
00AGUSTA SPA2001.000000
11AIRBUS2007.201220
22AIRBUS INDUSTRIE1998.233333
33AMERICAN AIRCRAFT INCNaN
44AVIAT AIRCRAFT INC2007.000000
\n", 520 | "
" 521 | ], 522 | "text/plain": [ 523 | " Unnamed: 0 manufacturer year\n", 524 | "0 0 AGUSTA SPA 2001.000000\n", 525 | "1 1 AIRBUS 2007.201220\n", 526 | "2 2 AIRBUS INDUSTRIE 1998.233333\n", 527 | "3 3 AMERICAN AIRCRAFT INC NaN\n", 528 | "4 4 AVIAT AIRCRAFT INC 2007.000000" 529 | ] 530 | }, 531 | "execution_count": 6, 532 | "metadata": {}, 533 | "output_type": "execute_result" 534 | } 535 | ], 536 | "source": [ 537 | "pd.read_csv('../data/avg_year_by_man.csv').head()" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": { 543 | "slideshow": { 544 | "slide_type": "fragment" 545 | } 546 | }, 547 | "source": [ 548 | "
\n", 549 | "

Question?

\n", 550 | "

Notice the extra column named Unnamed: 0 . Where did this extra column come from?

\n", 551 | "
" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": { 557 | "slideshow": { 558 | "slide_type": "slide" 559 | }, 560 | "tags": [] 561 | }, 562 | "source": [ 563 | "This `Unnamed: 0` column is the index from the DataFrame. Despite it not being part of the original data, it's saved with the DataFrame by default." 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": { 569 | "slideshow": { 570 | "slide_type": "fragment" 571 | }, 572 | "tags": [] 573 | }, 574 | "source": [ 575 | "We can elect not to save the index with the DataFrame by passing `False` to the `index` parameter of `to_csv()`:" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 7, 581 | "metadata": { 582 | "slideshow": { 583 | "slide_type": "-" 584 | } 585 | }, 586 | "outputs": [], 587 | "source": [ 588 | "avg_year_by_man_df.to_csv('../data/avg_year_by_man.csv', index=False)" 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "metadata": { 594 | "slideshow": { 595 | "slide_type": "fragment" 596 | } 597 | }, 598 | "source": [ 599 | "And then check our result again:" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 8, 605 | "metadata": { 606 | "slideshow": { 607 | "slide_type": "-" 608 | } 609 | }, 610 | "outputs": [ 611 | { 612 | "data": { 613 | "text/html": [ 614 | "
\n", 615 | "\n", 628 | "\n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | "
manufactureryear
0AGUSTA SPA2001.000000
1AIRBUS2007.201220
2AIRBUS INDUSTRIE1998.233333
3AMERICAN AIRCRAFT INCNaN
4AVIAT AIRCRAFT INC2007.000000
\n", 664 | "
" 665 | ], 666 | "text/plain": [ 667 | " manufacturer year\n", 668 | "0 AGUSTA SPA 2001.000000\n", 669 | "1 AIRBUS 2007.201220\n", 670 | "2 AIRBUS INDUSTRIE 1998.233333\n", 671 | "3 AMERICAN AIRCRAFT INC NaN\n", 672 | "4 AVIAT AIRCRAFT INC 2007.000000" 673 | ] 674 | }, 675 | "execution_count": 8, 676 | "metadata": {}, 677 | "output_type": "execute_result" 678 | } 679 | ], 680 | "source": [ 681 | "pd.read_csv('../data/avg_year_by_man.csv').head()" 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "metadata": { 687 | "slideshow": { 688 | "slide_type": "slide" 689 | } 690 | }, 691 | "source": [ 692 | "The `to_csv()` method has similar parameters to `read_csv()`. A few examples:\n", 693 | "\n", 694 | "* `sep` - the data's delimter\n", 695 | "* `header` - whether or not to write out the column names" 696 | ] 697 | }, 698 | { 699 | "cell_type": "markdown", 700 | "metadata": { 701 | "slideshow": { 702 | "slide_type": "fragment" 703 | } 704 | }, 705 | "source": [ 706 | "Full documentation can be pulled up by running the method name followed by a question mark:" 707 | ] 708 | }, 709 | { 710 | "cell_type": "code", 711 | "execution_count": 9, 712 | "metadata": { 713 | "slideshow": { 714 | "slide_type": "-" 715 | } 716 | }, 717 | "outputs": [ 718 | { 719 | "name": "stdout", 720 | "output_type": "stream", 721 | "text": [ 722 | "\u001b[0;31mSignature:\u001b[0m\n", 723 | "\u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\u001b[0m\n", 724 | "\u001b[0;34m\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 725 | "\u001b[0;34m\u001b[0m \u001b[0mpath_or_buf\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 726 | "\u001b[0;34m\u001b[0m \u001b[0msep\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m','\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 727 | "\u001b[0;34m\u001b[0m \u001b[0mna_rep\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m''\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 728 | "\u001b[0;34m\u001b[0m \u001b[0mfloat_format\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str | Callable | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 729 | "\u001b[0;34m\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'Sequence[Hashable] | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 730 | "\u001b[0;34m\u001b[0m \u001b[0mheader\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool_t | list[str]'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 731 | "\u001b[0;34m\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool_t'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 732 | "\u001b[0;34m\u001b[0m \u001b[0mindex_label\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'IndexLabel | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 733 | "\u001b[0;34m\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0;34m'w'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 734 | "\u001b[0;34m\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 735 | "\u001b[0;34m\u001b[0m \u001b[0mcompression\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'CompressionOptions'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'infer'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 736 | "\u001b[0;34m\u001b[0m \u001b[0mquoting\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 737 | "\u001b[0;34m\u001b[0m \u001b[0mquotechar\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'\"'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 738 | "\u001b[0;34m\u001b[0m \u001b[0mlineterminator\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 739 | "\u001b[0;34m\u001b[0m \u001b[0mchunksize\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'int | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 740 | "\u001b[0;34m\u001b[0m \u001b[0mdate_format\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 741 | "\u001b[0;34m\u001b[0m \u001b[0mdoublequote\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'bool_t'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 742 | "\u001b[0;34m\u001b[0m \u001b[0mescapechar\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str | None'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 743 | "\u001b[0;34m\u001b[0m \u001b[0mdecimal\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'.'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 744 | "\u001b[0;34m\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'str'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'strict'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 745 | "\u001b[0;34m\u001b[0m \u001b[0mstorage_options\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'StorageOptions'\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\n", 746 | "\u001b[0;34m\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m'str | None'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 747 | "\u001b[0;31mDocstring:\u001b[0m\n", 748 | "Write object to a comma-separated values (csv) file.\n", 749 | "\n", 750 | "Parameters\n", 751 | "----------\n", 752 | "path_or_buf : str, path object, file-like object, or None, default None\n", 753 | " String, path object (implementing os.PathLike[str]), or file-like\n", 754 | " object implementing a write() function. If None, the result is\n", 755 | " returned as a string. If a non-binary file object is passed, it should\n", 756 | " be opened with `newline=''`, disabling universal newlines. If a binary\n", 757 | " file object is passed, `mode` might need to contain a `'b'`.\n", 758 | "\n", 759 | " .. 
versionchanged:: 1.2.0\n", 760 | "\n", 761 | " Support for binary file objects was introduced.\n", 762 | "\n", 763 | "sep : str, default ','\n", 764 | " String of length 1. Field delimiter for the output file.\n", 765 | "na_rep : str, default ''\n", 766 | " Missing data representation.\n", 767 | "float_format : str, Callable, default None\n", 768 | " Format string for floating point numbers. If a Callable is given, it takes\n", 769 | " precedence over other numeric formatting parameters, like decimal.\n", 770 | "columns : sequence, optional\n", 771 | " Columns to write.\n", 772 | "header : bool or list of str, default True\n", 773 | " Write out the column names. If a list of strings is given it is\n", 774 | " assumed to be aliases for the column names.\n", 775 | "index : bool, default True\n", 776 | " Write row names (index).\n", 777 | "index_label : str or sequence, or False, default None\n", 778 | " Column label for index column(s) if desired. If None is given, and\n", 779 | " `header` and `index` are True, then the index names are used. A\n", 780 | " sequence should be given if the object uses MultiIndex. If\n", 781 | " False do not print fields for index names. Use index_label=False\n", 782 | " for easier importing in R.\n", 783 | "mode : str, default 'w'\n", 784 | " Python write mode. The available write modes are the same as\n", 785 | " :py:func:`open`.\n", 786 | "encoding : str, optional\n", 787 | " A string representing the encoding to use in the output file,\n", 788 | " defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`\n", 789 | " is a non-binary file object.\n", 790 | "compression : str or dict, default 'infer'\n", 791 | " For on-the-fly compression of the output data. If 'infer' and 'path_or_buf' is\n", 792 | " path-like, then detect compression from the following extensions: '.gz',\n", 793 | " '.bz2', '.zip', '.xz', '.zst', '.tar', '.tar.gz', '.tar.xz' or '.tar.bz2'\n", 794 | " (otherwise no compression).\n", 795 | " Set to ``None`` for no compression.\n", 796 | " Can also be a dict with key ``'method'`` set\n", 797 | " to one of {``'zip'``, ``'gzip'``, ``'bz2'``, ``'zstd'``, ``'tar'``} and other\n", 798 | " key-value pairs are forwarded to\n", 799 | " ``zipfile.ZipFile``, ``gzip.GzipFile``,\n", 800 | " ``bz2.BZ2File``, ``zstandard.ZstdCompressor`` or\n", 801 | " ``tarfile.TarFile``, respectively.\n", 802 | " As an example, the following could be passed for faster compression and to create\n", 803 | " a reproducible gzip archive:\n", 804 | " ``compression={'method': 'gzip', 'compresslevel': 1, 'mtime': 1}``.\n", 805 | "\n", 806 | " .. versionadded:: 1.5.0\n", 807 | " Added support for `.tar` files.\n", 808 | "\n", 809 | " .. versionchanged:: 1.0.0\n", 810 | "\n", 811 | " May now be a dict with key 'method' as compression mode\n", 812 | " and other entries as additional compression options if\n", 813 | " compression mode is 'zip'.\n", 814 | "\n", 815 | " .. versionchanged:: 1.1.0\n", 816 | "\n", 817 | " Passing compression options as keys in dict is\n", 818 | " supported for compression modes 'gzip', 'bz2', 'zstd', and 'zip'.\n", 819 | "\n", 820 | " .. versionchanged:: 1.2.0\n", 821 | "\n", 822 | " Compression is supported for binary file objects.\n", 823 | "\n", 824 | " .. 
versionchanged:: 1.2.0\n", 825 | "\n", 826 | " Previous versions forwarded dict entries for 'gzip' to\n", 827 | " `gzip.open` instead of `gzip.GzipFile` which prevented\n", 828 | " setting `mtime`.\n", 829 | "\n", 830 | "quoting : optional constant from csv module\n", 831 | " Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`\n", 832 | " then floats are converted to strings and thus csv.QUOTE_NONNUMERIC\n", 833 | " will treat them as non-numeric.\n", 834 | "quotechar : str, default '\\\"'\n", 835 | " String of length 1. Character used to quote fields.\n", 836 | "lineterminator : str, optional\n", 837 | " The newline character or character sequence to use in the output\n", 838 | " file. Defaults to `os.linesep`, which depends on the OS in which\n", 839 | " this method is called ('\\\\n' for linux, '\\\\r\\\\n' for Windows, i.e.).\n", 840 | "\n", 841 | " .. versionchanged:: 1.5.0\n", 842 | "\n", 843 | " Previously was line_terminator, changed for consistency with\n", 844 | " read_csv and the standard library 'csv' module.\n", 845 | "\n", 846 | "chunksize : int or None\n", 847 | " Rows to write at a time.\n", 848 | "date_format : str, default None\n", 849 | " Format string for datetime objects.\n", 850 | "doublequote : bool, default True\n", 851 | " Control quoting of `quotechar` inside a field.\n", 852 | "escapechar : str, default None\n", 853 | " String of length 1. Character used to escape `sep` and `quotechar`\n", 854 | " when appropriate.\n", 855 | "decimal : str, default '.'\n", 856 | " Character recognized as decimal separator. E.g. use ',' for\n", 857 | " European data.\n", 858 | "errors : str, default 'strict'\n", 859 | " Specifies how encoding and decoding errors are to be handled.\n", 860 | " See the errors argument for :func:`open` for a full list\n", 861 | " of options.\n", 862 | "\n", 863 | " .. versionadded:: 1.1.0\n", 864 | "\n", 865 | "storage_options : dict, optional\n", 866 | " Extra options that make sense for a particular storage connection, e.g.\n", 867 | " host, port, username, password, etc. For HTTP(S) URLs the key-value pairs\n", 868 | " are forwarded to ``urllib.request.Request`` as header options. For other\n", 869 | " URLs (e.g. starting with \"s3://\", and \"gcs://\") the key-value pairs are\n", 870 | " forwarded to ``fsspec.open``. Please see ``fsspec`` and ``urllib`` for more\n", 871 | " details, and for more examples on storage options refer `here\n", 872 | " `_.\n", 874 | "\n", 875 | " .. versionadded:: 1.2.0\n", 876 | "\n", 877 | "Returns\n", 878 | "-------\n", 879 | "None or str\n", 880 | " If path_or_buf is None, returns the resulting csv format as a\n", 881 | " string. Otherwise returns None.\n", 882 | "\n", 883 | "See Also\n", 884 | "--------\n", 885 | "read_csv : Load a CSV file into a DataFrame.\n", 886 | "to_excel : Write DataFrame to an Excel file.\n", 887 | "\n", 888 | "Examples\n", 889 | "--------\n", 890 | ">>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'],\n", 891 | "... 'mask': ['red', 'purple'],\n", 892 | "... 'weapon': ['sai', 'bo staff']})\n", 893 | ">>> df.to_csv(index=False)\n", 894 | "'name,mask,weapon\\nRaphael,red,sai\\nDonatello,purple,bo staff\\n'\n", 895 | "\n", 896 | "Create 'out.zip' containing 'out.csv'\n", 897 | "\n", 898 | ">>> compression_opts = dict(method='zip',\n", 899 | "... archive_name='out.csv') # doctest: +SKIP\n", 900 | ">>> df.to_csv('out.zip', index=False,\n", 901 | "... 
compression=compression_opts) # doctest: +SKIP\n", 902 | "\n", 903 | "To write a csv file to a new folder or nested folder you will first\n", 904 | "need to create it using either Pathlib or os:\n", 905 | "\n", 906 | ">>> from pathlib import Path # doctest: +SKIP\n", 907 | ">>> filepath = Path('folder/subfolder/out.csv') # doctest: +SKIP\n", 908 | ">>> filepath.parent.mkdir(parents=True, exist_ok=True) # doctest: +SKIP\n", 909 | ">>> df.to_csv(filepath) # doctest: +SKIP\n", 910 | "\n", 911 | ">>> import os # doctest: +SKIP\n", 912 | ">>> os.makedirs('folder/subfolder', exist_ok=True) # doctest: +SKIP\n", 913 | ">>> df.to_csv('folder/subfolder/out.csv') # doctest: +SKIP\n", 914 | "\u001b[0;31mFile:\u001b[0m /usr/local/anaconda3/envs/uc-python/lib/python3.11/site-packages/pandas/core/generic.py\n", 915 | "\u001b[0;31mType:\u001b[0m function" 916 | ] 917 | } 918 | ], 919 | "source": [ 920 | "pd.DataFrame.to_csv?" 921 | ] 922 | }, 923 | { 924 | "cell_type": "markdown", 925 | "metadata": { 926 | "slideshow": { 927 | "slide_type": "slide" 928 | }, 929 | "tags": [] 930 | }, 931 | "source": [ 932 | "
\n", 933 | "

Note

\n", 934 | "

There are several other df.to_xxx() methods that allow you to export DataFrames to other data formats. See more options here.

\n", 935 | "
" 936 | ] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "metadata": { 941 | "slideshow": { 942 | "slide_type": "slide" 943 | } 944 | }, 945 | "source": [ 946 | "### Your Turn\n", 947 | "\n", 948 | "1. Exporting data is copying data from Python's ________ to the ________. \n", 949 | "2. Fill in the blanks to the following code to:\n", 950 | " - import the flights.csv file,\n", 951 | " - filter for flights with a destination to the 'CVG' airport,\n", 952 | " - write this subsetted data out to a new CSV file titled 'flights_to_cvg' (but don't save the index to the CSV). \n", 953 | "

\n", 954 | "\n", 955 | " ```python\n", 956 | " import pandas as pd\n", 957 | " flights_df = pd.________('../data/flights.csv')\n", 958 | " flights_to_cvg_df = flights_df[flights_df[________] == 'CVG']\n", 959 | " flights_to_cvg_df.________('../data/flights_to_cvg.csv', ________ = False)\n", 960 | " ```" 961 | ] 962 | }, 963 | { 964 | "cell_type": "markdown", 965 | "metadata": { 966 | "slideshow": { 967 | "slide_type": "slide" 968 | } 969 | }, 970 | "source": [ 971 | "## Exporting Other Files" 972 | ] 973 | }, 974 | { 975 | "cell_type": "markdown", 976 | "metadata": { 977 | "tags": [] 978 | }, 979 | "source": [ 980 | "Recall being exposed to the importing of JSON and Pickle files -- now we will see how to save them." 981 | ] 982 | }, 983 | { 984 | "cell_type": "markdown", 985 | "metadata": { 986 | "slideshow": { 987 | "slide_type": "slide" 988 | } 989 | }, 990 | "source": [ 991 | "### JSON Files" 992 | ] 993 | }, 994 | { 995 | "cell_type": "markdown", 996 | "metadata": { 997 | "tags": [] 998 | }, 999 | "source": [ 1000 | "Take a look at the below `dict`:" 1001 | ] 1002 | }, 1003 | { 1004 | "cell_type": "code", 1005 | "execution_count": 10, 1006 | "metadata": { 1007 | "slideshow": { 1008 | "slide_type": "-" 1009 | } 1010 | }, 1011 | "outputs": [], 1012 | "source": [ 1013 | "dict_example = {\n", 1014 | " \"first\": \"Guido\",\n", 1015 | " \"last\": \"van Rossum\"\n", 1016 | "}" 1017 | ] 1018 | }, 1019 | { 1020 | "cell_type": "markdown", 1021 | "metadata": { 1022 | "slideshow": { 1023 | "slide_type": "fragment" 1024 | } 1025 | }, 1026 | "source": [ 1027 | "And then we can save it as a JSON file using the `with` statement and the `dump` function from the `json` library:" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "code", 1032 | "execution_count": 11, 1033 | "metadata": { 1034 | "slideshow": { 1035 | "slide_type": "-" 1036 | } 1037 | }, 1038 | "outputs": [], 1039 | "source": [ 1040 | "import json\n", 1041 | "with open('../data/dict_example_export.json', 'w') as f:\n", 1042 | " f.write(json.dumps(dict_example))" 1043 | ] 1044 | }, 1045 | { 1046 | "cell_type": "markdown", 1047 | "metadata": { 1048 | "slideshow": { 1049 | "slide_type": "slide" 1050 | } 1051 | }, 1052 | "source": [ 1053 | "We can then reimport this to verify we saved it correctly:" 1054 | ] 1055 | }, 1056 | { 1057 | "cell_type": "code", 1058 | "execution_count": 12, 1059 | "metadata": { 1060 | "slideshow": { 1061 | "slide_type": "-" 1062 | } 1063 | }, 1064 | "outputs": [], 1065 | "source": [ 1066 | "with open('../data/dict_example_export.json', 'r') as f:\n", 1067 | " imported_json = json.load(f)" 1068 | ] 1069 | }, 1070 | { 1071 | "cell_type": "code", 1072 | "execution_count": 13, 1073 | "metadata": { 1074 | "slideshow": { 1075 | "slide_type": "fragment" 1076 | } 1077 | }, 1078 | "outputs": [ 1079 | { 1080 | "data": { 1081 | "text/plain": [ 1082 | "dict" 1083 | ] 1084 | }, 1085 | "execution_count": 13, 1086 | "metadata": {}, 1087 | "output_type": "execute_result" 1088 | } 1089 | ], 1090 | "source": [ 1091 | "type(imported_json)" 1092 | ] 1093 | }, 1094 | { 1095 | "cell_type": "code", 1096 | "execution_count": 14, 1097 | "metadata": { 1098 | "slideshow": { 1099 | "slide_type": "fragment" 1100 | } 1101 | }, 1102 | "outputs": [ 1103 | { 1104 | "data": { 1105 | "text/plain": [ 1106 | "{'first': 'Guido', 'last': 'van Rossum'}" 1107 | ] 1108 | }, 1109 | "execution_count": 14, 1110 | "metadata": {}, 1111 | "output_type": "execute_result" 1112 | } 1113 | ], 1114 | "source": [ 1115 | "imported_json" 1116 | ] 1117 | }, 1118 | { 1119 | 
"cell_type": "markdown", 1120 | "metadata": { 1121 | "slideshow": { 1122 | "slide_type": "slide" 1123 | } 1124 | }, 1125 | "source": [ 1126 | "### Pickle Files" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "markdown", 1131 | "metadata": { 1132 | "slideshow": { 1133 | "slide_type": "fragment" 1134 | } 1135 | }, 1136 | "source": [ 1137 | "
\n", 1138 | "

Question?

\n", 1139 | "

What are Pickle files?

\n", 1140 | "
" 1141 | ] 1142 | }, 1143 | { 1144 | "cell_type": "markdown", 1145 | "metadata": { 1146 | "slideshow": { 1147 | "slide_type": "fragment" 1148 | } 1149 | }, 1150 | "source": [ 1151 | "Python's native data files are known as **Pickle** files:" 1152 | ] 1153 | }, 1154 | { 1155 | "cell_type": "markdown", 1156 | "metadata": { 1157 | "slideshow": { 1158 | "slide_type": "fragment" 1159 | } 1160 | }, 1161 | "source": [ 1162 | "* All Pickle files have the `.pickle` extension" 1163 | ] 1164 | }, 1165 | { 1166 | "cell_type": "markdown", 1167 | "metadata": { 1168 | "slideshow": { 1169 | "slide_type": "fragment" 1170 | } 1171 | }, 1172 | "source": [ 1173 | "* Pickle files are great for saving native Python data that can't easily be represented by other file types\n", 1174 | " * Pre-processed data\n", 1175 | " * Models\n", 1176 | " * Any other Python object..." 1177 | ] 1178 | }, 1179 | { 1180 | "cell_type": "markdown", 1181 | "metadata": { 1182 | "slideshow": { 1183 | "slide_type": "slide" 1184 | } 1185 | }, 1186 | "source": [ 1187 | "### Exporting Pickle Files" 1188 | ] 1189 | }, 1190 | { 1191 | "cell_type": "markdown", 1192 | "metadata": { 1193 | "slideshow": { 1194 | "slide_type": "fragment" 1195 | } 1196 | }, 1197 | "source": [ 1198 | "Pickle files can be exported using the `pickle` library paired with the `with` statement and the `open()` function:" 1199 | ] 1200 | }, 1201 | { 1202 | "cell_type": "code", 1203 | "execution_count": 15, 1204 | "metadata": { 1205 | "slideshow": { 1206 | "slide_type": "-" 1207 | } 1208 | }, 1209 | "outputs": [], 1210 | "source": [ 1211 | "import pickle\n", 1212 | "with open('../data/pickle_example_export.pickle', 'wb') as f:\n", 1213 | " pickle.dump(dict_example, f)" 1214 | ] 1215 | }, 1216 | { 1217 | "cell_type": "markdown", 1218 | "metadata": { 1219 | "slideshow": { 1220 | "slide_type": "slide" 1221 | } 1222 | }, 1223 | "source": [ 1224 | "We can then reimport this to verify we saved it correctly:" 1225 | ] 1226 | }, 1227 | { 1228 | "cell_type": "code", 1229 | "execution_count": 16, 1230 | "metadata": { 1231 | "slideshow": { 1232 | "slide_type": "-" 1233 | } 1234 | }, 1235 | "outputs": [], 1236 | "source": [ 1237 | "with open('../data/pickle_example_export.pickle', 'rb') as f:\n", 1238 | " imported_pickle = pickle.load(f)" 1239 | ] 1240 | }, 1241 | { 1242 | "cell_type": "code", 1243 | "execution_count": 17, 1244 | "metadata": { 1245 | "slideshow": { 1246 | "slide_type": "fragment" 1247 | } 1248 | }, 1249 | "outputs": [ 1250 | { 1251 | "data": { 1252 | "text/plain": [ 1253 | "dict" 1254 | ] 1255 | }, 1256 | "execution_count": 17, 1257 | "metadata": {}, 1258 | "output_type": "execute_result" 1259 | } 1260 | ], 1261 | "source": [ 1262 | "type(imported_pickle)" 1263 | ] 1264 | }, 1265 | { 1266 | "cell_type": "code", 1267 | "execution_count": 18, 1268 | "metadata": { 1269 | "slideshow": { 1270 | "slide_type": "fragment" 1271 | } 1272 | }, 1273 | "outputs": [ 1274 | { 1275 | "data": { 1276 | "text/plain": [ 1277 | "{'first': 'Guido', 'last': 'van Rossum'}" 1278 | ] 1279 | }, 1280 | "execution_count": 18, 1281 | "metadata": {}, 1282 | "output_type": "execute_result" 1283 | } 1284 | ], 1285 | "source": [ 1286 | "imported_pickle" 1287 | ] 1288 | }, 1289 | { 1290 | "cell_type": "markdown", 1291 | "metadata": { 1292 | "slideshow": { 1293 | "slide_type": "slide" 1294 | } 1295 | }, 1296 | "source": [ 1297 | "# Questions\n", 1298 | "\n", 1299 | "Are there any questions before we move on?" 
1300 | ] 1301 | } 1302 | ], 1303 | "metadata": { 1304 | "celltoolbar": "Slideshow", 1305 | "kernelspec": { 1306 | "display_name": "Python 3 (ipykernel)", 1307 | "language": "python", 1308 | "name": "python3" 1309 | }, 1310 | "language_info": { 1311 | "codemirror_mode": { 1312 | "name": "ipython", 1313 | "version": 3 1314 | }, 1315 | "file_extension": ".py", 1316 | "mimetype": "text/x-python", 1317 | "name": "python", 1318 | "nbconvert_exporter": "python", 1319 | "pygments_lexer": "ipython3", 1320 | "version": "3.11.4" 1321 | }, 1322 | "rise": { 1323 | "autolaunch": true, 1324 | "transition": "none" 1325 | } 1326 | }, 1327 | "nbformat": 4, 1328 | "nbformat_minor": 4 1329 | } 1330 | -------------------------------------------------------------------------------- /notebooks/99-Conclusion.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "slideshow": { 7 | "slide_type": "slide" 8 | } 9 | }, 10 | "source": [ 11 | "# Conclusion" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "slideshow": { 18 | "slide_type": "fragment" 19 | } 20 | }, 21 | "source": [ 22 | "- We've done a lot in 2 days (4 half days)!\n", 23 | "\n", 24 | "- Much more to learn, but you have the tools to keep going\n", 25 | "\n", 26 | "- The internet (Google, Stackoverflow) is your friend" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "slideshow": { 33 | "slide_type": "slide" 34 | } 35 | }, 36 | "source": [ 37 | "## Resources" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": { 43 | "slideshow": { 44 | "slide_type": "fragment" 45 | } 46 | }, 47 | "source": [ 48 | "- **[The Python Data Science Handbook](https://www.amazon.com/Python-Data-Science-Handbook-Essential/dp/1491912057/ref=sr_1_3?gclid=CjwKCAiAvOeQBhBkEiwAxutUVMb5JDDzkdkZ7L6NU2ZszLMQ3HiF2jwfJcRWzgR82Jn7xHQoIvxBNhoCPUYQAvD_BwE&hvadid=295647768930&hvdev=c&hvlocphy=9015764&hvnetw=g&hvqmt=e&hvrand=14740440715626152632&hvtargid=kwd-569733732214&hydadcr=16433_10305449&keywords=python+for+data+science+handbook&qid=1645918824&sr=8-3)** by Jake VanderPlas.\n", 49 | " - Approachable, broad, well-written\n", 50 | " - Available free online\n", 51 | "- **[Python for Data Analysis](https://www.amazon.com/Python-Data-Analysis-Wrangling-IPython/dp/1491957662/ref=pd_bxgy_img_1/135-0822721-0203815?pd_rd_w=JxvrQ&pf_rd_p=6b3eefea-7b16-43e9-bc45-2e332cbf99da&pf_rd_r=WD9EEXM2VVP0ER90XSE6&pd_rd_r=f7ca71ce-a552-4acb-a083-e51193d09d8e&pd_rd_wg=8ngen&pd_rd_i=1491957662&psc=1)** by Wes McKinney\n", 52 | " - Dense but extremely thorough\n", 53 | " - Probably the most comprehensive guide to Pandas\n", 54 | "- **[Hands-On Machine Learning](https://www.amazon.com/Hands-Machine-Learning-Scikit-Learn-TensorFlow/dp/1492032646/ref=pd_bxgy_img_2/135-0822721-0203815?pd_rd_w=JxvrQ&pf_rd_p=6b3eefea-7b16-43e9-bc45-2e332cbf99da&pf_rd_r=WD9EEXM2VVP0ER90XSE6&pd_rd_r=f7ca71ce-a552-4acb-a083-e51193d09d8e&pd_rd_wg=8ngen&pd_rd_i=1492032646&psc=1)** by Aurélien Géron\n", 55 | " - Approachable but advances quickly\n", 56 | " - Most popular machine learning book for Python" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": { 62 | "jp-MarkdownHeadingCollapsed": true, 63 | "slideshow": { 64 | "slide_type": "slide" 65 | }, 66 | "tags": [] 67 | }, 68 | "source": [ 69 | "## Additional UC Python courses\n", 70 | "\n", 71 | "* **[Intermediate Python for Data 
Science](https://github.com/uc-python/intermediate-python-datasci)**\n", 72 | " - Learn to use control flow and custom functions to work with data more efficiently.\n", 73 | " - Build awareness and basic skills in working with Python from the shell and its environments.\n", 74 | " - Exposure to Python's data science ecosystem and modeling via scikit-learn.\n", 75 | "* **[Advanced Python for Data Science](https://github.com/uc-python/advanced-python-datasci)**\n", 76 | " - Develop an intuition for the machine learning workflow and Python tooling.\n", 77 | " - Build familiarity with common software engineering tooling and methodologies for implementing a machine learning project.\n", 78 | " - Gain a high-level understanding of the function of data science-adjacent technologies that students will encounter in the workplace, focusing on Git and GitHub." 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "slideshow": { 85 | "slide_type": "slide" 86 | } 87 | }, 88 | "source": [ 89 | "## Thank You" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "slideshow": { 96 | "slide_type": "fragment" 97 | } 98 | }, 99 | "source": [ 100 | "- We love being able to teach these workshops and we hope you enjoyed it as well!\n", 101 | "\n", 102 | "- We appreciate your feedback on the workshop. Constructive feedback allows us to continue to improve this course." 103 | ] 104 | } 105 | ], 106 | "metadata": { 107 | "celltoolbar": "Slideshow", 108 | "kernelspec": { 109 | "display_name": "Python 3 (ipykernel)", 110 | "language": "python", 111 | "name": "python3" 112 | }, 113 | "language_info": { 114 | "codemirror_mode": { 115 | "name": "ipython", 116 | "version": 3 117 | }, 118 | "file_extension": ".py", 119 | "mimetype": "text/x-python", 120 | "name": "python", 121 | "nbconvert_exporter": "python", 122 | "pygments_lexer": "ipython3", 123 | "version": "3.8.12" 124 | }, 125 | "rise": { 126 | "autolaunch": true, 127 | "transition": "none" 128 | } 129 | }, 130 | "nbformat": 4, 131 | "nbformat_minor": 4 132 | } 133 | -------------------------------------------------------------------------------- /notebooks/Case-Study.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Case Study" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Part 1\n", 15 | "*To be completed at the conclusion of Day 1*" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "For the following exercises, you should use the data stored at `../data/companies.csv`\n", 23 | "You aren't expected to finish all the exercises; just get through as many as time allows and we will review them together.\n", 24 | "\n", 25 | "1. Start by becoming familiar with the data. How many rows and how many columns does it have? What are the data types of the columns?\n", 26 | "2. Set the data's index to be the \"Symbol\" column.\n", 27 | "3. Look up the company with the symbol NCLH. What company is this? What sector is it in?\n", 28 | "4. Filter down to companies that *either* in the \"Consumer Discretionary\" or the \"Consumer Staples\" sectors.\n", 29 | "5. How many companies are left in the data now?\n", 30 | "6. Create a new column, \"Symbol_Length\", that is the length of the symbol of each company. *Hint: you may need to reset an index along the way.*\n", 31 | "7. 
Find the company named \"Kroger Co.\". Change its name to \"The Kroger Company\"." 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "**Bonus**: *For these two exercises, you won't find examples of the solution in our notebooks.\n", 39 | "You'll need to search for help on the internet.*\n", 40 | "\n", 41 | "*Don't worry if you aren't able to solve them.*\n", 42 | "\n", 43 | "1. Filter down to companies whose symbol starts with A. How many companies meet this criterion?\n", 44 | "2. What is the longest company name remaining in the dataset? You could just search the data visually, but try to find a programmatic solution." 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Part 2\n", 52 | "*To be completed at the conclusion of Day 2*" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "This section again uses the data at `../data/companies.csv`.\n", 60 | "\n", 61 | "1. Re-create the \"Symbol_Length\" column (see above).\n", 62 | "2. What is the average symbol length of companies in the data set?\n", 63 | "3. What is the average symbol length by sector? That is, after grouping by sector, what is the average symbol length for each group?\n", 64 | "4. How long is the longest company name? How long is the longest company name by sector?" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Now open the pricing data at `../data/prices.csv`.\n", 72 | "Note that this data is entirely fabricated and does not exhibit the qualities of real stock market data!\n", 73 | "\n", 74 | "1. Become familiar with this data. What is its shape? What are its data types?\n", 75 | "2. Get summary metrics (count, min, max, standard deviation, etc) for both the Price and Quarter columns. *Hint: we saw a method of DataFrames that will do this for you in a single line.*\n", 76 | "3. Perform an inner join between this data set and the companies data, on the Symbol column.\n", 77 | "4. How many rows does our data have now?\n", 78 | "5. What do you think this data represents? Form a hypothesis and look through the data more carefully until you are confident you understand what it is and how it is structured.\n", 79 | "6. Group the data by sector. What is the average first quarter price for a company in the Real Estate sector? What is the minimum fourth quarter price for a company in the Industrials sector?\n", 80 | "7. Filter the data down to just prices for Apple, Google, Microsoft, and Amazon.\n", 81 | "8. Save this data as big_4.csv in the `../data` directory.\n", 82 | "9. Using Seaborn, plot the price of these companies over 4 quarters. Encode the quarter as the x-axis, the price as the y-axis, and the company symbol as the hue." 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "**Bonus**:\n", 90 | "\n", 91 | "This data is in a form that is useful for plotting.\n", 92 | "But in this shape, it would be quite difficult to calculate the difference between each company's fourth quarter price and its first quarter price.\n", 93 | "\n", 94 | "Reshape this data so it is of a form like the below:\n", 95 | "\n", 96 | "| Symbol | Name | Sector | Q1 | Q2 | Q3 | Q4 |\n", 97 | "|--------|------|--------|----|----|----|----|\n", 98 | "| AAPL | Apple Inc. 
| Information Technology | 275.20 | 269.96 | 263.51 | 266.07\n", 99 | "\n", 100 | "From which we could easily calculate Q4 - Q1.\n", 101 | "\n", 102 | "*You will probably want to google something like \"python reshaping data\". This is a very challenging problem!*" 103 | ] 104 | } 105 | ], 106 | "metadata": { 107 | "kernelspec": { 108 | "display_name": "uc-python", 109 | "language": "python", 110 | "name": "uc-python" 111 | }, 112 | "language_info": { 113 | "codemirror_mode": { 114 | "name": "ipython", 115 | "version": 3 116 | }, 117 | "file_extension": ".py", 118 | "mimetype": "text/x-python", 119 | "name": "python", 120 | "nbconvert_exporter": "python", 121 | "pygments_lexer": "ipython3", 122 | "version": "3.7.3" 123 | } 124 | }, 125 | "nbformat": 4, 126 | "nbformat_minor": 4 127 | } 128 | -------------------------------------------------------------------------------- /notebooks/images/aggregate-series.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/aggregate-series.png -------------------------------------------------------------------------------- /notebooks/images/applied-data-science.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/applied-data-science.gif -------------------------------------------------------------------------------- /notebooks/images/binder-launching.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/binder-launching.png -------------------------------------------------------------------------------- /notebooks/images/brad.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/brad.jpg -------------------------------------------------------------------------------- /notebooks/images/combine-horizontally-key.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/combine-horizontally-key.png -------------------------------------------------------------------------------- /notebooks/images/combine-horizontally-unordered.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/combine-horizontally-unordered.png -------------------------------------------------------------------------------- /notebooks/images/combine-horizontally.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/combine-horizontally.png -------------------------------------------------------------------------------- /notebooks/images/combine-vertically.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/combine-vertically.png -------------------------------------------------------------------------------- /notebooks/images/concept_map.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/concept_map.jpg -------------------------------------------------------------------------------- /notebooks/images/data-science-and-tech.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/data-science-and-tech.png -------------------------------------------------------------------------------- /notebooks/images/data-science.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/data-science.png -------------------------------------------------------------------------------- /notebooks/images/dataframe-groups-unordered.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/dataframe-groups-unordered.png -------------------------------------------------------------------------------- /notebooks/images/dataframe-groups.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/dataframe-groups.png -------------------------------------------------------------------------------- /notebooks/images/dataframe-series.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/dataframe-series.png -------------------------------------------------------------------------------- /notebooks/images/ethan.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/ethan.jpg -------------------------------------------------------------------------------- /notebooks/images/export-framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/export-framework.png -------------------------------------------------------------------------------- /notebooks/images/full-outer-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/full-outer-join.png -------------------------------------------------------------------------------- /notebooks/images/gus.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/gus.jpg -------------------------------------------------------------------------------- /notebooks/images/import-framework.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/import-framework.png -------------------------------------------------------------------------------- /notebooks/images/inner-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/inner-join.png -------------------------------------------------------------------------------- /notebooks/images/insert-new-cell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/insert-new-cell.png -------------------------------------------------------------------------------- /notebooks/images/jay.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/jay.jpg -------------------------------------------------------------------------------- /notebooks/images/jupyter-file-structure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/jupyter-file-structure.png -------------------------------------------------------------------------------- /notebooks/images/left-outer-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/left-outer-join.png -------------------------------------------------------------------------------- /notebooks/images/markdown-cell-rendered.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/markdown-cell-rendered.png -------------------------------------------------------------------------------- /notebooks/images/markdown-cell-selection.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/markdown-cell-selection.png -------------------------------------------------------------------------------- /notebooks/images/markdown-cell-unrendered.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/markdown-cell-unrendered.png -------------------------------------------------------------------------------- /notebooks/images/model-for-grouped-aggs.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/model-for-grouped-aggs.png -------------------------------------------------------------------------------- /notebooks/images/navigator-jupyter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/navigator-jupyter.png -------------------------------------------------------------------------------- /notebooks/images/new-jupyter-notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/new-jupyter-notebook.png -------------------------------------------------------------------------------- /notebooks/images/open-jupyter-notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/open-jupyter-notebook.png -------------------------------------------------------------------------------- /notebooks/images/python-code-cell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/python-code-cell.png -------------------------------------------------------------------------------- /notebooks/images/python-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/python-logo.png -------------------------------------------------------------------------------- /notebooks/images/python_jupyter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/python_jupyter.png -------------------------------------------------------------------------------- /notebooks/images/right-outer-join.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/right-outer-join.png -------------------------------------------------------------------------------- /notebooks/images/selecting_columns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/selecting_columns.png -------------------------------------------------------------------------------- /notebooks/images/selecting_rows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/selecting_rows.png -------------------------------------------------------------------------------- /notebooks/images/selecting_rows_columns.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/selecting_rows_columns.png
--------------------------------------------------------------------------------
/notebooks/images/series-plus-series.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/series-plus-series.png
--------------------------------------------------------------------------------
/notebooks/images/so_dev_survey.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/so_dev_survey.png
--------------------------------------------------------------------------------
/notebooks/images/subsetting_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/subsetting_result.png
--------------------------------------------------------------------------------
/notebooks/images/summarizing-by-groups.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uc-python/intro-python-datasci/c242d88f97cdd126f46ae672b7f88680ae7021a5/notebooks/images/summarizing-by-groups.png
--------------------------------------------------------------------------------
/notebooks/rise.css:
--------------------------------------------------------------------------------
1 | .rise-enabled {
2 |     background-color: #ffffff !important;
3 |     border-top: 30px #919191 solid;
4 |     border-bottom: 30px #919191 solid;
5 | }
6 |
7 | .question {
8 |     color: #008;
9 | }
10 |
11 | .your_turn {
12 |     color: #e08414;
13 |     font-size: 150%;
14 |     font-weight: bold;
15 | }
16 |
17 | .rendered_html h1 {
18 |     color: #129628;
19 | }
20 |
21 | .rendered_html table, .rendered_html th, .rendered_html tr, .rendered_html td {
22 |     font-size: 100%;
23 | }
24 |
25 | .container.slides .celltoolbar, .container.slides .hide-in-slideshow {
26 |     display: none !important;
27 | }
--------------------------------------------------------------------------------
/scripts/generate_slides.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ ! -d ".git" ]; then
4 |     echo "Error: no .git directory detected"
5 |     echo "This script should be run from the base of the repo"
6 |     echo 'e.g. `bash scripts/generate_slides.sh`'
7 |     exit 1
8 | fi
9 |
10 | # We must be *in* the notebooks folder for relative links (e.g. to images) to
11 | # work correctly.
12 | cd notebooks
13 | # Images are copied over to slides/ by the Makefile.
14 | NB_PATH="$1"
15 | REL_NB=${NB_PATH/#notebooks\//}
16 | jupyter nbconvert --to slides "$REL_NB" --output-dir=../slides
--------------------------------------------------------------------------------
/scripts/prep_nb_for_ci.py:
--------------------------------------------------------------------------------
1 | # Rewrite a notebook in place, removing cells tagged ci-skip.
2 | # Adapted from https://stackoverflow.com/questions/62022603/how-to-delete-a-jupyter-notebook-input-cell-programmatically-using-its-tag
3 |
4 | import sys
5 | import nbformat
6 |
7 | SKIP_TAG = 'ci-skip'
8 |
9 | if len(sys.argv) != 2:
10 |     sys.exit('Usage: prep_nb_for_ci.py [notebook.ipynb]')
11 | nb_file = sys.argv[1]
12 |
13 | nb = nbformat.read(nb_file, as_version=nbformat.NO_CONVERT)
14 |
15 | tagged_cell_indices = []
16 |
17 | # Find the indices of all cells tagged ci-skip.
18 | for idx, cell in enumerate(nb.cells):
19 |     cell_tags = cell.metadata.get('tags', [])
20 |     if SKIP_TAG in cell_tags:
21 |         tagged_cell_indices.append(idx)
22 |
23 | # Remove tagged cells.
24 | # Iterate in reverse because deleting an earlier index would change which
25 | # cell is at each later one.
26 | for idx in reversed(tagged_cell_indices):
27 |     nb.cells.pop(idx)
28 |
29 | # Overwrite the original notebook.
30 | nbformat.write(nb, nb_file)
--------------------------------------------------------------------------------
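Note on tagging: cells are opted out of CI by adding a ci-skip entry to the
"tags" list in their metadata, either through Jupyter's cell tag editor or
programmatically. Below is a minimal sketch of the programmatic route using
nbformat; the notebook path and cell index are illustrative examples, not a
prescription.

import nbformat

# Hypothetical example: tag the first cell of one course notebook so that
# prep_nb_for_ci.py drops it before the notebook runs in CI.
path = 'notebooks/04-Importing-Data.ipynb'
nb = nbformat.read(path, as_version=nbformat.NO_CONVERT)

# Cell metadata behaves like a dict; create the tags list if it is missing.
tags = nb.cells[0].metadata.setdefault('tags', [])
if 'ci-skip' not in tags:
    tags.append('ci-skip')

nbformat.write(nb, path)

Running "python scripts/prep_nb_for_ci.py notebooks/04-Importing-Data.ipynb"
afterwards rewrites the file with the tagged cell removed.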