├── .gitignore ├── CITATION.cff ├── LICENSE.txt ├── Makefile ├── README.md ├── dsip ├── _config.yml ├── _toc.yml ├── assets │ └── logo.png ├── assignments │ ├── D1_Python.ipynb │ ├── D2_Pandas.ipynb │ ├── D3_DataExploration.ipynb │ ├── D4_DataPrivacy.ipynb │ ├── D5_DataAnalysis.ipynb │ └── D6_NaturalLanguageProcessing.ipynb ├── docs │ └── index.md ├── projects │ ├── ProjectProposal.ipynb │ ├── ProjectReport.ipynb │ ├── project_checklist.md │ └── project_guidelines.md └── tutorials │ ├── .gitignore │ ├── 00-Introduction.ipynb │ ├── 01-Python.ipynb │ ├── 02-JupyterNotebooks.ipynb │ ├── 03-DataAnalysis.ipynb │ ├── 04-ScientificPython.ipynb │ ├── 05-DataGathering.ipynb │ ├── 06-DataWrangling.ipynb │ ├── 07-DataCleaning.ipynb │ ├── 08-DataPrivacy&Anonymization.ipynb │ ├── 09-DataVisualization.ipynb │ ├── 10-Distributions.ipynb │ ├── 11-TestingDistributions.ipynb │ ├── 12-StatisticalComparisons.ipynb │ ├── 13-OrdinaryLeastSquares.ipynb │ ├── 14-LinearModels.ipynb │ ├── 15-Clustering.ipynb │ ├── 16-DimensionalityReduction.ipynb │ ├── 17-Classification.ipynb │ ├── 18-NaturalLanguageProcessing.ipynb │ ├── A1-PythonPackages.ipynb │ ├── A2-Git.ipynb │ ├── files │ ├── book10k.txt │ ├── data.csv │ ├── data.json │ ├── data.txt │ ├── messy_data.csv │ └── messy_data.json │ └── img │ ├── anaconda.png │ ├── git.png │ ├── github.png │ ├── jupyter.png │ ├── matplotlib.png │ ├── numpy.png │ ├── pandas.png │ ├── python.png │ ├── scipy.png │ ├── sklearn.png │ └── sourcetree.png ├── instructions.md └── paper ├── paper.bib └── paper.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore local builds of the textbook 2 | dsip/_build/* 3 | 4 | # Ignore any notebook checkpoint files 5 | *.ipynb_checkpoints* 6 | 7 | # Ignore materials that are copied in 8 | #dsip/tutorials/* 9 | #dsip/assignments/* 10 | #dsip/projects/* 11 | -------------------------------------------------------------------------------- /CITATION.cff: 
-------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: >- 3 | If you use this software, please cite it using the metadata from this file. 4 | type: software 5 | title: 'DataScienceInPractice' 6 | authors: 7 | - given-names: 'Thomas' 8 | family-names: 'Donoghue' 9 | orcid: 'https://orcid.org/0000-0001-5911-0472' 10 | - given-names: 'Bradley' 11 | family-names: 'Voytek' 12 | orcid: 'https://orcid.org/0000-0003-1640-2525' 13 | - given-names: 'Shannon' 14 | family-names: 'Ellis' 15 | orcid: 'https://orcid.org/0000-0002-9231-0481' 16 | repository-code: 'https://github.com/DataScienceInPractice/Site' 17 | url: 'https://datascienceinpractice.github.io/' 18 | license: CC-BY-4.0 # SPDX identifier matching LICENSE.txt and the README 19 | preferred-citation: 20 | type: article 21 | authors: 22 | - given-names: 'Thomas' 23 | family-names: 'Donoghue' 24 | orcid: 'https://orcid.org/0000-0001-5911-0472' 25 | - given-names: 'Bradley' 26 | family-names: 'Voytek' 27 | orcid: 'https://orcid.org/0000-0003-1640-2525' 28 | - given-names: 'Shannon' 29 | family-names: 'Ellis' 30 | orcid: 'https://orcid.org/0000-0002-9231-0481' 31 | doi: '10.21105/jose.00121' 32 | journal: 'The Journal of Open Source Education' 33 | title: 'Course Materials for Data Science in Practice' 34 | issue: 51 35 | volume: 5 36 | year: 2022 # publication year given in the README reference 37 | start: 1 38 | end: 3 39 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Attribution 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis.
Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. 
Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree 60 | to be bound by the terms and conditions of this Creative Commons 61 | Attribution 4.0 International Public License ("Public License"). To the 62 | extent this Public License may be interpreted as a contract, You are 63 | granted the Licensed Rights in consideration of Your acceptance of 64 | these terms and conditions, and the Licensor grants You such rights in 65 | consideration of benefits the Licensor receives from making the 66 | Licensed Material available under these terms and conditions. 67 | 68 | 69 | Section 1 -- Definitions. 70 | 71 | a. Adapted Material means material subject to Copyright and Similar 72 | Rights that is derived from or based upon the Licensed Material 73 | and in which the Licensed Material is translated, altered, 74 | arranged, transformed, or otherwise modified in a manner requiring 75 | permission under the Copyright and Similar Rights held by the 76 | Licensor. For purposes of this Public License, where the Licensed 77 | Material is a musical work, performance, or sound recording, 78 | Adapted Material is always produced where the Licensed Material is 79 | synched in timed relation with a moving image. 
80 | 81 | b. Adapter's License means the license You apply to Your Copyright 82 | and Similar Rights in Your contributions to Adapted Material in 83 | accordance with the terms and conditions of this Public License. 84 | 85 | c. Copyright and Similar Rights means copyright and/or similar rights 86 | closely related to copyright including, without limitation, 87 | performance, broadcast, sound recording, and Sui Generis Database 88 | Rights, without regard to how the rights are labeled or 89 | categorized. For purposes of this Public License, the rights 90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 91 | Rights. 92 | 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. 
Share means to provide material to the public by any means or 116 | process that requires permission under the Licensed Rights, such 117 | as reproduction, public display, public performance, distribution, 118 | dissemination, communication, or importation, and to make material 119 | available to the public including in ways that members of the 120 | public may access the material from a place and at a time 121 | individually chosen by them. 122 | 123 | j. Sui Generis Database Rights means rights other than copyright 124 | resulting from Directive 96/9/EC of the European Parliament and of 125 | the Council of 11 March 1996 on the legal protection of databases, 126 | as amended and/or succeeded, as well as other essentially 127 | equivalent rights anywhere in the world. 128 | 129 | k. You means the individual or entity exercising the Licensed Rights 130 | under this Public License. Your has a corresponding meaning. 131 | 132 | 133 | Section 2 -- Scope. 134 | 135 | a. License grant. 136 | 137 | 1. Subject to the terms and conditions of this Public License, 138 | the Licensor hereby grants You a worldwide, royalty-free, 139 | non-sublicensable, non-exclusive, irrevocable license to 140 | exercise the Licensed Rights in the Licensed Material to: 141 | 142 | a. reproduce and Share the Licensed Material, in whole or 143 | in part; and 144 | 145 | b. produce, reproduce, and Share Adapted Material. 146 | 147 | 2. Exceptions and Limitations. For the avoidance of doubt, where 148 | Exceptions and Limitations apply to Your use, this Public 149 | License does not apply, and You do not need to comply with 150 | its terms and conditions. 151 | 152 | 3. Term. The term of this Public License is specified in Section 153 | 6(a). 154 | 155 | 4. Media and formats; technical modifications allowed. 
The 156 | Licensor authorizes You to exercise the Licensed Rights in 157 | all media and formats whether now known or hereafter created, 158 | and to make technical modifications necessary to do so. The 159 | Licensor waives and/or agrees not to assert any right or 160 | authority to forbid You from making technical modifications 161 | necessary to exercise the Licensed Rights, including 162 | technical modifications necessary to circumvent Effective 163 | Technological Measures. For purposes of this Public License, 164 | simply making modifications authorized by this Section 2(a) 165 | (4) never produces Adapted Material. 166 | 167 | 5. Downstream recipients. 168 | 169 | a. Offer from the Licensor -- Licensed Material. Every 170 | recipient of the Licensed Material automatically 171 | receives an offer from the Licensor to exercise the 172 | Licensed Rights under the terms and conditions of this 173 | Public License. 174 | 175 | b. No downstream restrictions. You may not offer or impose 176 | any additional or different terms or conditions on, or 177 | apply any Effective Technological Measures to, the 178 | Licensed Material if doing so restricts exercise of the 179 | Licensed Rights by any recipient of the Licensed 180 | Material. 181 | 182 | 6. No endorsement. Nothing in this Public License constitutes or 183 | may be construed as permission to assert or imply that You 184 | are, or that Your use of the Licensed Material is, connected 185 | with, or sponsored, endorsed, or granted official status by, 186 | the Licensor or others designated to receive attribution as 187 | provided in Section 3(a)(1)(A)(i). 188 | 189 | b. Other rights. 190 | 191 | 1. 
Moral rights, such as the right of integrity, are not 192 | licensed under this Public License, nor are publicity, 193 | privacy, and/or other similar personality rights; however, to 194 | the extent possible, the Licensor waives and/or agrees not to 195 | assert any such rights held by the Licensor to the limited 196 | extent necessary to allow You to exercise the Licensed 197 | Rights, but not otherwise. 198 | 199 | 2. Patent and trademark rights are not licensed under this 200 | Public License. 201 | 202 | 3. To the extent possible, the Licensor waives any right to 203 | collect royalties from You for the exercise of the Licensed 204 | Rights, whether directly or through a collecting society 205 | under any voluntary or waivable statutory or compulsory 206 | licensing scheme. In all other cases the Licensor expressly 207 | reserves any right to collect such royalties. 208 | 209 | 210 | Section 3 -- License Conditions. 211 | 212 | Your exercise of the Licensed Rights is expressly made subject to the 213 | following conditions. 214 | 215 | a. Attribution. 216 | 217 | 1. If You Share the Licensed Material (including in modified 218 | form), You must: 219 | 220 | a. retain the following if it is supplied by the Licensor 221 | with the Licensed Material: 222 | 223 | i. identification of the creator(s) of the Licensed 224 | Material and any others designated to receive 225 | attribution, in any reasonable manner requested by 226 | the Licensor (including by pseudonym if 227 | designated); 228 | 229 | ii. a copyright notice; 230 | 231 | iii. a notice that refers to this Public License; 232 | 233 | iv. a notice that refers to the disclaimer of 234 | warranties; 235 | 236 | v. a URI or hyperlink to the Licensed Material to the 237 | extent reasonably practicable; 238 | 239 | b. indicate if You modified the Licensed Material and 240 | retain an indication of any previous modifications; and 241 | 242 | c. 
indicate the Licensed Material is licensed under this 243 | Public License, and include the text of, or the URI or 244 | hyperlink to, this Public License. 245 | 246 | 2. You may satisfy the conditions in Section 3(a)(1) in any 247 | reasonable manner based on the medium, means, and context in 248 | which You Share the Licensed Material. For example, it may be 249 | reasonable to satisfy the conditions by providing a URI or 250 | hyperlink to a resource that includes the required 251 | information. 252 | 253 | 3. If requested by the Licensor, You must remove any of the 254 | information required by Section 3(a)(1)(A) to the extent 255 | reasonably practicable. 256 | 257 | 4. If You Share Adapted Material You produce, the Adapter's 258 | License You apply must not prevent recipients of the Adapted 259 | Material from complying with this Public License. 260 | 261 | 262 | Section 4 -- Sui Generis Database Rights. 263 | 264 | Where the Licensed Rights include Sui Generis Database Rights that 265 | apply to Your use of the Licensed Material: 266 | 267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 268 | to extract, reuse, reproduce, and Share all or a substantial 269 | portion of the contents of the database; 270 | 271 | b. if You include all or a substantial portion of the database 272 | contents in a database in which You have Sui Generis Database 273 | Rights, then the database in which You have Sui Generis Database 274 | Rights (but not its individual contents) is Adapted Material; and 275 | 276 | c. You must comply with the conditions in Section 3(a) if You Share 277 | all or a substantial portion of the contents of the database. 278 | 279 | For the avoidance of doubt, this Section 4 supplements and does not 280 | replace Your obligations under this Public License where the Licensed 281 | Rights include other Copyright and Similar Rights. 282 | 283 | 284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 285 | 286 | a. 
UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 296 | 297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 306 | 307 | c. The disclaimer of warranties and limitation of liability provided 308 | above shall be interpreted in a manner that, to the extent 309 | possible, most closely approximates an absolute disclaimer and 310 | waiver of all liability. 311 | 312 | 313 | Section 6 -- Term and Termination. 314 | 315 | a. This Public License applies for the term of the Copyright and 316 | Similar Rights licensed here. However, if You fail to comply with 317 | this Public License, then Your rights under this Public License 318 | terminate automatically. 319 | 320 | b. Where Your right to use the Licensed Material has terminated under 321 | Section 6(a), it reinstates: 322 | 323 | 1. 
automatically as of the date the violation is cured, provided 324 | it is cured within 30 days of Your discovery of the 325 | violation; or 326 | 327 | 2. upon express reinstatement by the Licensor. 328 | 329 | For the avoidance of doubt, this Section 6(b) does not affect any 330 | right the Licensor may have to seek remedies for Your violations 331 | of this Public License. 332 | 333 | c. For the avoidance of doubt, the Licensor may also offer the 334 | Licensed Material under separate terms or conditions or stop 335 | distributing the Licensed Material at any time; however, doing so 336 | will not terminate this Public License. 337 | 338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 339 | License. 340 | 341 | 342 | Section 7 -- Other Terms and Conditions. 343 | 344 | a. The Licensor shall not be bound by any additional or different 345 | terms or conditions communicated by You unless expressly agreed. 346 | 347 | b. Any arrangements, understandings, or agreements regarding the 348 | Licensed Material not stated herein are separate from and 349 | independent of the terms and conditions of this Public License. 350 | 351 | 352 | Section 8 -- Interpretation. 353 | 354 | a. For the avoidance of doubt, this Public License does not, and 355 | shall not be interpreted to, reduce, limit, restrict, or impose 356 | conditions on any use of the Licensed Material that could lawfully 357 | be made without permission under this Public License. 358 | 359 | b. To the extent possible, if any provision of this Public License is 360 | deemed unenforceable, it shall be automatically reformed to the 361 | minimum extent necessary to make it enforceable. If the provision 362 | cannot be reformed, it shall be severed from this Public License 363 | without affecting the enforceability of the remaining terms and 364 | conditions. 365 | 366 | c. 
No term or condition of this Public License will be waived and no 367 | failure to comply consented to unless expressly agreed to by the 368 | Licensor. 369 | 370 | d. Nothing in this Public License constitutes or may be interpreted 371 | as a limitation upon, or waiver of, any privileges and immunities 372 | that apply to the Licensor or You, including from the legal 373 | processes of any jurisdiction or authority. 374 | 375 | 376 | ======================================================================= 377 | 378 | Creative Commons is not a party to its public 379 | licenses. Notwithstanding, Creative Commons may elect to apply one of 380 | its public licenses to material it publishes and in those instances 381 | will be considered the “Licensor.” The text of the Creative Commons 382 | public licenses is dedicated to the public domain under the CC0 Public 383 | Domain Dedication. Except for the limited purpose of indicating that 384 | material is shared under a Creative Commons public license or as 385 | otherwise permitted by the Creative Commons policies published at 386 | creativecommons.org/policies, Creative Commons does not authorize the 387 | use of the trademark "Creative Commons" or any other trademark or logo 388 | of Creative Commons without its prior written consent including, 389 | without limitation, in connection with any unauthorized modifications 390 | to any of its public licenses or any other arrangements, 391 | understandings, or agreements concerning use of licensed material. For 392 | the avoidance of doubt, this paragraph does not form part of the 393 | public licenses. 394 | 395 | Creative Commons may be contacted at creativecommons.org. 
396 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for the Data Science in Practice site 2 | 3 | 4 | ########################################################################## 5 | ## REQUIREMENTS 6 | # 7 | # This file requires `jupyter-book` for building the site. 8 | # 9 | 10 | ########################################################################## 11 | ## VARIABLES 12 | 13 | BOOK = dsip 14 | CONTENT-ORG = https://github.com/COGS108 15 | BOOK-ORG = https://github.com/datascienceinpractice 16 | SITE-LOC = datascienceinpractice.github.io 17 | 18 | 19 | ########################################################################## 20 | ## CLONING MATERIALS 21 | 22 | # Clone all materials (runs each clone target below as a prerequisite) 23 | clone: \ 24 | clone-tutorials \ 25 | clone-assignments \ 26 | clone-projects 27 | 28 | 29 | # Clone the tutorials 30 | clone-tutorials: 31 | 32 | # Copy tutorial materials 33 | @git clone --depth 1 $(CONTENT-ORG)/Tutorials $(BOOK)/tutorials 34 | @rm $(BOOK)/tutorials/README.md 35 | @rm -rf $(BOOK)/tutorials/.git 36 | 37 | # Clone the assignments 38 | clone-assignments: 39 | @mkdir -p $(BOOK)/assignments 40 | # Clone assignments demo repo, and copy out files (to re-org & rename) 41 | @git clone --depth 1 $(CONTENT-ORG)/Assign_Demo $(BOOK)/temp 42 | @mv $(BOOK)/temp/release/A1/A1_git_python.ipynb $(BOOK)/assignments/D1_Python.ipynb 43 | @mv $(BOOK)/temp/release/A2/A2_Pandas.ipynb $(BOOK)/assignments/D2_Pandas.ipynb 44 | @mv $(BOOK)/temp/release/A3/A3_DataExploration.ipynb $(BOOK)/assignments/D3_DataExploration.ipynb 45 | @mv $(BOOK)/temp/release/A4/A4_DataPrivacy.ipynb $(BOOK)/assignments/D4_DataPrivacy.ipynb 46 | @mv $(BOOK)/temp/release/A5/A5_DataAnalysis.ipynb $(BOOK)/assignments/D5_DataAnalysis.ipynb 47 | @mv $(BOOK)/temp/release/A6/A6_NaturalLanguageProcessing.ipynb $(BOOK)/assignments/D6_NaturalLanguageProcessing.ipynb 48 | @rm -rf $(BOOK)/temp 49 | 50 | # Clone the
project information 51 | clone-projects: 52 | 53 | # Copy over the project repositories into temporary repositories 54 | @git clone --depth 1 $(CONTENT-ORG)/Projects $(BOOK)/temp1 55 | @git clone --depth 1 $(CONTENT-ORG)/group_template $(BOOK)/temp2 56 | 57 | # Copy over the files we want 58 | @mkdir $(BOOK)/projects 59 | @mv $(BOOK)/temp1/README.md $(BOOK)/projects/project_checklist.md 60 | @mv $(BOOK)/temp1/FinalProject_Guidelines.md $(BOOK)/projects/project_guidelines.md 61 | @mv $(BOOK)/temp2/ProjectProposal_groupXXX.ipynb $(BOOK)/projects/ProjectProposal.ipynb 62 | @mv $(BOOK)/temp2/FinalProject_groupXXX.ipynb $(BOOK)/projects/ProjectReport.ipynb 63 | 64 | # Clear out the temporary folders 65 | @rm -rf $(BOOK)/temp1 66 | @rm -rf $(BOOK)/temp2 67 | 68 | 69 | ########################################################################## 70 | ## CLEAN UPS 71 | 72 | # Clear out the copied repositories (all cloned materials) 73 | clear: \ 74 | clear-tutorials \ 75 | clear-assignments \ 76 | clear-projects 77 | 78 | # Clear the individual sets of cloned materials 79 | 80 | clear-tutorials: 81 | rm -rf $(BOOK)/tutorials 82 | 83 | clear-assignments clear-assignemnts: # misspelled name kept as an alias for existing callers 84 | rm -rf $(BOOK)/assignments 85 | 86 | clear-projects: 87 | rm -rf $(BOOK)/projects 88 | 89 | # Clean out the built textbook for $(BOOK) 90 | clean: 91 | jupyter-book clean $(BOOK)/ 92 | 93 | 94 | ########################################################################## 95 | ## BUILDING SITE 96 | 97 | # Build the textbook 98 | build: 99 | jupyter-book build $(BOOK)/ 100 | 101 | 102 | ########################################################################## 103 | ## DEPLOYING SITE 104 | 105 | # Deploy the website 106 | deploy: 107 | 108 | # Create the textbook 109 | $(MAKE) build 110 | 111 | # Clone the website host repository 112 | rm -rf $(BOOK)/_build/deploy/ 113 | git clone --depth 1 $(BOOK-ORG)/$(SITE-LOC) $(BOOK)/_build/deploy/ 114 | 115 | # Add .nojekyll file to tell Github pages to bypass Jekyll processing 116 | touch $(BOOK)/_build/deploy/.nojekyll 117 | 118 | # Copy
site source (the contents of html/) into the host repo folder, then push to Github to deploy 119 | cd $(BOOK)/_build/ && \ 120 | cp -r html/. deploy && \ 121 | cd deploy && \ 122 | git add * && \ 123 | git add .nojekyll && \ 124 | git commit -a -m 'deploy site' && \ 125 | git push 126 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Science in Practice Site 2 | [![Project Status: Active](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active) 3 | [![Website](https://img.shields.io/badge/site-datascienceinpractice.github.io-informational.svg)](https://datascienceinpractice.github.io) 4 | [![License: CC-BY 4.0](https://img.shields.io/badge/License-CC--BY%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by/4.0/) 5 | [![DOI](https://jose.theoj.org/papers/10.21105/jose.00121/status.svg)](https://doi.org/10.21105/jose.00121) 6 | 7 | Data science in practice is a collection of materials for learning introductory data science. 8 | 9 | ## Overview 10 | 11 | This repository is the source repository for creating the [Data Science in Practice](https://datascienceinpractice.github.io/) website. 12 | 13 | Specifically, the website contains: 14 | - `tutorials`, which introduce key topics for doing data science 15 | - `assignments`, which are problem sets that can be worked through 16 | - `projects`, which describes how to pursue independent analysis projects 17 | 18 | These materials serve as a public version of materials from the [COGS108](https://github.com/COGS108) class. 19 | 20 | This repository has the tools for building the website. To do so, it copies materials from the 21 | COGS 108 organization, and then updates and organizes them for the public website. See the 22 | [instructions](https://github.com/DataScienceInPractice/Site/blob/main/instructions.md) 23 | page for notes on how this works.
24 | 25 | The built version of the book is then posted to the 26 | [website repository](https://github.com/DataScienceInPractice/datascienceinpractice.github.io) 27 | for hosting. 28 | 29 | ## Dependencies 30 | 31 | This project uses the Python programming language, and requires Python >= 3.6. 32 | 33 | Materials are written and available as [Jupyter Notebooks](https://jupyter.org/). 34 | 35 | Tutorials & assignments require packages from the scientific Python ecosystem. These dependencies can all be installed using the 36 | [Anaconda distribution](https://www.anaconda.com/products/individual). Details and instructions on the dependencies 37 | and how to get them are available in the materials. 38 | 39 | The website is created using [JupyterBook](https://github.com/executablebooks/jupyter-book). 40 | 41 | ## Organization 42 | 43 | This repository contains the following sections: 44 | 45 | - `dsip/` contains the content of the website, including sub-sections: 46 | - `docs/` contains the source for written sections of the site 47 | - `tutorials/` contains tutorial notebooks which introduce key topics for doing data science 48 | - `assignments/` contains assignment notebooks with problem sets that can be worked through 49 | - `projects/` contains information on how to pursue independent analysis projects 50 | - `paper/` contains a copy of the paper that describes these materials 51 | 52 | ## Reference 53 | 54 | This project is described in the following paper: 55 | 56 | Donoghue T, Voytek B, & Ellis S (2022). Course Materials for Data Science in 57 | Practice. Journal of Open Source Education, 5(51), 121. DOI: 10.21105/jose.00121 58 | 59 | Direct Link: https://doi.org/10.21105/jose.00121 60 | 61 | ## License 62 | 63 | These materials are made freely available, and are licensed under a [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) license.
64 | -------------------------------------------------------------------------------- /dsip/_config.yml: -------------------------------------------------------------------------------- 1 | ####################################################################################### 2 | # Book settings 3 | title : Data Science in Practice 4 | author : Thomas Donoghue, Bradley Voytek, & Shannon Ellis 5 | copyright : "2020-" 6 | logo : assets/logo.png 7 | 8 | ####################################################################################### 9 | # Execution settings 10 | execute: 11 | execute_notebooks : cache 12 | 13 | ####################################################################################### 14 | # HTML-specific settings (single `html` mapping; a repeated top-level key would override this one) 15 | html: 16 | home_page_in_navbar : false 17 | favicon: assets/logo.png 18 | use_repository_button: true 19 | use_issues_button: true 20 | use_edit_page_button: false 21 | 22 | # ####################################################################################### 23 | # Interact link settings 24 | notebook_interface : "notebook" 25 | 26 | ####################################################################################### 27 | # Launch button settings 28 | repository: 29 | url : https://github.com/datascienceinpractice/Site 30 | path_to_book : "dsip" 31 | 32 | binder: 33 | binderhub_url : "https://mybinder.org" 34 | text : "Launch binder" 35 | -------------------------------------------------------------------------------- /dsip/_toc.yml: -------------------------------------------------------------------------------- 1 | root: docs/index 2 | format: jb-book 3 | 4 | parts: 5 | - caption: Tutorials 6 | chapters: 7 | - file: tutorials/00-Introduction 8 | - file: tutorials/01-Python 9 | - file: tutorials/02-JupyterNotebooks 10 | - file: tutorials/03-DataAnalysis 11 | - file: tutorials/04-ScientificPython
12 | - file: tutorials/05-DataGathering 13 | - file: tutorials/06-DataWrangling 14 | - file: tutorials/07-DataCleaning 15 | - file: tutorials/08-DataPrivacy&Anonymization 16 | - file: tutorials/09-DataVisualization 17 | - file: tutorials/10-Distributions 18 | - file: tutorials/11-TestingDistributions 19 | - file: tutorials/12-StatisticalComparisons 20 | - file: tutorials/13-OrdinaryLeastSquares 21 | - file: tutorials/14-LinearModels 22 | - file: tutorials/15-Clustering 23 | - file: tutorials/16-DimensionalityReduction 24 | - file: tutorials/17-Classification 25 | - file: tutorials/18-NaturalLanguageProcessing 26 | - file: tutorials/A1-PythonPackages 27 | - file: tutorials/A2-Git 28 | 29 | - caption: Assignments 30 | chapters: 31 | - file: assignments/D1_Python 32 | - file: assignments/D2_Pandas 33 | - file: assignments/D3_DataExploration 34 | - file: assignments/D4_DataPrivacy 35 | - file: assignments/D5_DataAnalysis 36 | - file: assignments/D6_NaturalLanguageProcessing 37 | 38 | - caption: Project 39 | chapters: 40 | - file: projects/project_guidelines 41 | - file: projects/ProjectProposal 42 | - file: projects/ProjectReport 43 | - file: projects/project_checklist 44 | -------------------------------------------------------------------------------- /dsip/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataScienceInPractice/Site/3388685495c1e22e99727a50208704df0fbd5c81/dsip/assets/logo.png -------------------------------------------------------------------------------- /dsip/docs/index.md: -------------------------------------------------------------------------------- 1 | # Data Science in Practice 2 | 3 | Data Science in Practice is an open set of materials for learning introductory data science. 4 | 5 | This website is a public version of the Data Science in Practice course, taught as 6 | [COGS 108](https://github.com/COGS108/) 7 | at UC San Diego. 
8 | 9 | _If you are in the COGS108 class at UC San Diego, this website is **not** the same as the materials and coursework for the class._ 10 | 11 | ## Overview 12 | 13 | The goal of Data Science in Practice is to introduce the practical elements of _doing_ data science. 14 | 15 | Data science is an emerging and multidisciplinary field, organized around the practice of analyzing data, and all the questions, practices and problems that entails. 16 | 17 | These materials focus on the practical elements of finding, analyzing, interpreting and contextualizing data analysis, in order to practice answering questions with data. 18 | 19 | ## Requirements 20 | 21 | These materials use the Python programming language, and presume knowledge of standard library Python. 22 | 23 | The tutorials introduce how to get Python installed and which dependencies are needed. 24 | 25 | ## Content 26 | 27 | Available materials include: 28 | 29 | - **Tutorials** which introduce key topics for doing data science 30 | - These can be used to explore and learn about key topics 31 | - **Assignments** which are problem sets that can be worked through 32 | - These can be used to practice key skills and ideas with code 33 | - **Projects** which describe how to pursue independent analysis projects 34 | - This can be used as a guide for how to continue with real data science projects 35 | 36 | All the materials are listed in the table of contents in the left sidebar. 37 | 38 | Note that these materials are not created as fully detailed descriptions or formal descriptions of the topics they introduce. 39 | 40 | Rather, they seek to _introduce_ key topics, _demonstrate_ them in code, and allow for interaction, exploration and _practice_. 41 | 42 | Put another way, these materials are designed to be more of a map than an encyclopedia. 43 | 44 | For further information on topics we introduce, these materials link to external resources.
45 | 46 | ## How to Use These Materials 47 | 48 | These materials are created as [Jupyter Notebooks](https://jupyter.org), and are intended to be executed and explored in a hands-on manner. 49 | 50 | There is a download link at the top left of the page, that can be used to download each page as a notebook. This allows you to use the notebook locally, executing code, and answering questions. 51 | 52 | ## Issue Tracking 53 | 54 | If you find any bugs or issues, or have any suggestions for these materials, please open an 55 | [issue](https://github.com/DataScienceInPractice/Site/issues). 56 | 57 | ## Source Materials 58 | 59 | This set of materials is an openly available version of tutorials and coursework developed for and 60 | used in a university undergraduate course, 61 | [COGS 108](https://github.com/COGS108/), 62 | which is taught at UC San Diego. 63 | 64 | These materials may still contain some references to the university course or to grading, which can be ignored. 65 | 66 | You can find more information about the university course in the 67 | [overview repository](https://github.com/COGS108/Overview/). 68 | 69 | The materials for this open version of the course are managed through this 70 | [GitHub organization](https://github.com/DataScienceInPractice/). 71 | 72 | The source repository for this website is available [here](https://github.com/DataScienceInPractice/Site). 73 | 74 | ## Reference 75 | 76 | This project is described in the following paper: 77 | 78 | Donoghue T, Voytek B, & Ellis S (2022). Course Materials for Data Science in 79 | Practice. Journal of Open Source Education, 5(51), 121. DOI: 10.21105/jose.00121 80 | 81 | Direct Link: https://doi.org/10.21105/jose.00121 82 | 83 | ## License 84 | 85 | The materials on this website are openly available under a 86 | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) license.
87 | 88 | ## Acknowledgments 89 | 90 | The university course these materials are adapted from was originally created by 91 | [Bradley Voytek](https://voyteklab.com/), and is currently primarily taught by 92 | [Shannon Ellis](http://www.shanellis.com/). 93 | This website and many of the materials were developed by 94 | [Tom Donoghue](https://tomdonoghue.github.io/), with additional contributions from the 95 | [course staff](https://github.com/COGS108/Overview/blob/master/CONTRIBUTORS.md). 96 | -------------------------------------------------------------------------------- /dsip/projects/ProjectProposal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Project Proposal" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Research Question" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "*Fill in your research question here*" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Background and Prior Work" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "Fill in your background and prior work here. **Use inline citations through Markdown footnotes to specify which references support which statements** \n", 36 | "\n", 37 | "For example: After government genocide in the 20th century, real birds were replaced with surveillance drones designed to look just like birds[^lorenz]. \n", 38 | "Use a minimum of 2 or 3 citations, but we prefer more[^admonish]. You need enough to fully explain and back up important facts. \n", 39 | "\n", 40 | "[^lorenz]: Lorenz, T. (9 Dec 2021) Birds Aren’t Real, or Are They? Inside a Gen Z Conspiracy Theory. *The New York Times*.
https://www.nytimes.com/2021/12/09/technology/birds-arent-real-gen-z-misinformation.html \n", 41 | "[^admonish]: Also refs should be important to the background, not some randomly chosen vaguely related stuff. Include a web link if possible in refs as above.\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "# Hypothesis\n" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "*State and defend your hypotheses here.*" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "# Data" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "*Describe the ideal dataset you would want to answer this question. (This should include: What variables? How many observations? Who/what/how would these data be collected? How would these data be stored/organized?)*" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "# Ethics & Privacy" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "*Fill in your ethics & privacy discussion here*" 84 | ] 85 | } 86 | ], 87 | "metadata": { 88 | "kernelspec": { 89 | "display_name": "Python 3", 90 | "language": "python", 91 | "name": "python3" 92 | }, 93 | "language_info": { 94 | "codemirror_mode": { 95 | "name": "ipython", 96 | "version": 3 97 | }, 98 | "file_extension": ".py", 99 | "mimetype": "text/x-python", 100 | "name": "python", 101 | "nbconvert_exporter": "python", 102 | "pygments_lexer": "ipython3", 103 | "version": "3.8.10" 104 | } 105 | }, 106 | "nbformat": 4, 107 | "nbformat_minor": 2 108 | } 109 | -------------------------------------------------------------------------------- /dsip/projects/ProjectReport.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 |
"source": [ 7 | "# Project Report" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Overview" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "*Fill in your overview here*" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "\n", 29 | "# Research Question" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "*Fill in your research question here*" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "\n", 44 | "\n", 45 | "## Background & Prior Work" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "*Fill in your background and prior work here* \n", 53 | "\n", 54 | "References (include links):\n", 55 | "- 1)\n", 56 | "- 2)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "# Hypothesis\n" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "*Fill in your hypotheses here*" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "# Dataset(s)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "*Fill in your dataset information here*\n", 85 | "\n", 86 | "(Copy this information for each dataset)\n", 87 | "- Dataset Name:\n", 88 | "- Link to the dataset:\n", 89 | "- Number of observations:\n", 90 | "\n", 91 | "1-2 sentences describing each dataset. \n", 92 | "\n", 93 | "If you plan to use multiple datasets, add 1-2 sentences about how you plan to combine these datasets." 
94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "# Setup" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 1, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "## YOUR CODE HERE" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "# Data Cleaning" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "Describe your data cleaning steps here." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 2, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "## YOUR CODE HERE\n", 133 | "## FEEL FREE TO ADD MULTIPLE CELLS PER SECTION" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "# Data Analysis & Results" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "Include cells that describe the steps in your data analysis." 
148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 3, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "## YOUR CODE HERE\n", 157 | "## FEEL FREE TO ADD MULTIPLE CELLS PER SECTION" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "# Ethics & Privacy" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "*Fill in your ethics & privacy discussion here*" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "# Conclusion & Discussion" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "*Fill in your discussion information here*" 186 | ] 187 | } 188 | ], 189 | "metadata": { 190 | "kernelspec": { 191 | "display_name": "Python 3", 192 | "language": "python", 193 | "name": "python3" 194 | }, 195 | "language_info": { 196 | "codemirror_mode": { 197 | "name": "ipython", 198 | "version": 3 199 | }, 200 | "file_extension": ".py", 201 | "mimetype": "text/x-python", 202 | "name": "python", 203 | "nbconvert_exporter": "python", 204 | "pygments_lexer": "ipython3", 205 | "version": "3.8.10" 206 | } 207 | }, 208 | "nbformat": 4, 209 | "nbformat_minor": 2 210 | } 211 | -------------------------------------------------------------------------------- /dsip/projects/project_checklist.md: -------------------------------------------------------------------------------- 1 | # Project Checklist 2 | 3 | You can use this checklist to help guide your thinking on the final project. If you check off all the boxes below, you should be in good shape to get a perfect score on your final project. 
4 | 5 | ### Overview, Question & Background 6 | 7 | **Overview**: 8 | - [ ] Write a clear summary of what you did 9 | - [ ] Briefly describe the results of your project 10 | - [ ] Limit overview to 3-4 sentences 11 | 12 | **Research Question**: 13 | - [ ] Include a specific, clear data science question 14 | - [ ] Make sure what you're measuring (variables) to answer the question is clear 15 | 16 | **Background & Prior Work**: 17 | - [ ] Include a general introduction to your topic 18 | - [ ] Include explanation of what work has been done previously 19 | - [ ] Include citations or links to previous work 20 | 21 | **Hypothesis**: 22 | - [ ] Include the hypothesis 23 | - [ ] Ensure that this hypothesis is clear to readers 24 | - [ ] Explain why you think this will be the outcome (what was your thinking?) 25 | 26 | ### Dataset(s): 27 | - [ ] Include an explanation of dataset(s) used (i.e. features/variables included, number of observations, information in dataset) 28 | - [ ] Source included (if outside dataset(s) being used) 29 | 30 | ### Data Analysis: 31 | 32 | **Data Cleaning & Pre-processing** 33 | - [ ] Perform Data Cleaning and explain steps taken OR include an explanation as to why data cleaning was unnecessary (how did you determine your dataset was ready to go?) 
34 | - [ ] Dataset actually clean and usable after data wrangling steps carried out 35 | 36 | **Data Visualization**: 37 | - [ ] Include at least three visualizations 38 | - [ ] Clearly label all axes on plots 39 | - [ ] Type of all plots appropriate given data displayed 40 | - [ ] Interpretation of each visualization included in the text 41 | 42 | **Data Analysis & Results**: 43 | - [ ] EDA carried out with explanations of what was done and interpretations of output included 44 | - [ ] Appropriate analysis performed 45 | - [ ] Output of analysis interpreted and interpretation included in notebook 46 | 47 | ### Privacy/Ethics Considerations: 48 | - [ ] Thoughtful discussion of ethical concerns included 49 | - [ ] Ethical concerns consider the whole data science process (question asked, data collected, data being used, the bias in data, analysis, post-analysis, etc.) 50 | - [ ] How your group handled bias/ethical concerns clearly described 51 | 52 | ### Conclusion & Discussion: 53 | - [ ] Clear conclusion (answer to the question being asked) and discussion of results 54 | - [ ] Limitations of analysis discussed 55 | - [ ] Does not ramble on beyond providing necessary information 56 | 57 | ### Final Checks: 58 | - [ ] Edit all text for clarity 59 | - [ ] Remove all instructions 60 | - [ ] Be sure text included throughout to guide reader 61 | - [ ] Check to make sure all text and images are visible 62 | -------------------------------------------------------------------------------- /dsip/projects/project_guidelines.md: -------------------------------------------------------------------------------- 1 | # Project Guide 2 | 3 | This is an edited version of the project guidelines used for the course. 4 | 5 | If you wish to pursue an independent data science project, this outline may be a useful guide. 6 | 7 | ## Project Overview 8 | 9 | The Final Project will give you the chance to explore a topic of your choice and to expand your analytical skills. 
By working with real data of your choosing you can examine questions of particular interest to you. 10 | 11 | The broad objectives for the project are to: 12 | 13 | * Identify the problems and goals of a real situation and dataset. 14 | * Choose an appropriate approach for formalizing and testing the problems and goals, and be able to articulate the reasoning for that selection. 15 | * Implement your analysis choices on the dataset. 16 | * Interpret the results of the analyses. 17 | * Contextualize those results within a greater scientific and social context, acknowledging and addressing any potential issues related to privacy and ethics. 18 | 19 | The basic project steps (broken down in more detail below): 20 | 21 | * Find a real-world dataset and problem that you believe can be solved with one or more of the techniques we have learned in class. 22 | * After selecting a dataset and identifying the goal, write out a proposed analysis plan using the template provided and submit it through GitHub for review. 23 | * Apply the techniques outlined and come up with a result for the dataset that you proposed. 24 | * Assemble a Jupyter notebook that communicates your hypothesis, methods, and results. Submit this as your final project. 25 | * Submit feedback about group and individual group members. This is done individually. 26 | 27 | ## Project Components 28 | 29 | ### Project Proposal 30 | 31 | The project proposal includes the following sections: 32 | 33 | **RESEARCH QUESTION**: What is your research question? Include the specific question you're setting out to answer. This question should be specific, answerable with data, and clear. A general question with specific subquestions is permitted. (1-2 sentences) 34 | 35 | **BACKGROUND & PRIOR WORK**: This section will present the background and context of your topic and question in a few paragraphs.
Include a general introduction to your topic and then describe what information you currently know about the topic after doing your initial research. Include references to other projects that have asked similar questions or approached similar problems. Explain what others have learned in their projects. 36 | 37 | Find some relevant prior work, and reference those sources, summarizing what each did and what they learned. Even if you think you have a totally novel question, find the most similar prior work that you can and discuss how it relates to your project. 38 | 39 | References can be research publications, but they need not be. Blogs, GitHub repositories, company websites, etc., are all viable references if they are relevant to your project. It must be clear which information comes from which references. (2-3 paragraphs, including at least 2 references) 40 | 41 | **HYPOTHESIS**: What is your main hypothesis/predictions about what the answer to your question is? Briefly explain your thinking. (2-3 sentences) 42 | 43 | **DATA**: Here, you are to *think* about and *describe* the *ideal* dataset (or datasets) you would need to answer this question: 44 | 45 | * What variables would you have? 46 | * How would they be stored? 47 | * How many observations would you have? 48 | * What/who would the observations be? Over what time period? etc. 49 | * etc. 50 | 51 | Note: For the project proposal, you do NOT have to find the actual dataset(s) needed for your project. For the first checkpoint and onward, you will. 52 | 53 | **ETHICS & PRIVACY**: Acknowledge and address any ethics & privacy related issues of your question(s), proposed dataset(s), and/or analyses. Use the information provided in lecture to guide your group discussion and thinking. If you need further guidance, check out [Deon's Ethics Checklist](http://deon.drivendata.org/#data-science-ethics-checklist). In particular: 54 | 55 | * Are there any biases/privacy/terms of use issues with the data you proposed?
56 | * Are there potential biases in your dataset(s), in terms of who it composes, and how it was collected, that may be problematic in terms of it allowing for equitable analysis? (For example, does your data exclude particular populations, or is it likely to reflect particular human biases in a way that could be a problem?) 57 | * How will you set out to detect these specific biases before, during, and after/when communicating your analysis? 58 | * Are there any other issues related to your topic area, data, and/or analyses that are potentially problematic in terms of data privacy and equitable impact? 59 | * How will you handle issues you identified? 60 | 61 | (1-2 paragraphs) 62 | 63 | #### Project Proposal - Style Guidelines 64 | 65 | The proposal should be written clearly and at a level understandable by a typical undergraduate student. 66 | 67 | This is a short but detailed proposal meant to give us time to assess and critique your Final Project idea (further described below), in order to give you time to improve upon it throughout the quarter. 68 | 69 | Remember to proofread your Project Proposal. Do not use overly flowery and/or vague language. 70 | 71 | ### Final Project 72 | 73 | Time to put it all together! The main products of the final project are 1) a report submitted as single Jupyter Notebook on GitHub and 2) a 3-5 minute video communicating your group project. 74 | 75 | #### Final Report 76 | 77 | This single notebook should include all the code you used for all components of the project (cleaning, visualization, analysis). Because we won’t be running the code in your notebook, it is important to make sure your notebook as submitted to GitHub has the code evaluated and outputs present (e.g., plots) so that we can read the project as is. 78 | 79 | #### Report Sections - Instructions 80 | 81 | Each of the following sections corresponds to a section in the file FinalProject_groupXXX.ipynb (template is in your group's GitHub repo). 
82 | 83 | For sections included in your proposal and previous checkpoints, you can copy and paste into your final project, but be sure to edit these sections with feedback you received on your proposal or additional information you learned throughout the project. This report should read clearly from start to finish, explaining what you did, why you did it, and what you learned. This should be a concise and well-written report. 84 | 85 | **PERMISSIONS**: Specify whether you want your group project to be made publicly available. Place an X in the square brackets where appropriate. 86 | 87 | **OVERVIEW**: Include 3-4 sentences summarizing your group’s project and results. 88 | 89 | **NAMES**: See proposal specifications. 90 | 91 | **RESEARCH QUESTION**: See proposal specifications. 92 | 93 | **BACKGROUND & PRIOR WORK**: See proposal specifications. 94 | 95 | **HYPOTHESIS**: See proposal specifications. 96 | 97 | **DATASET(S)**: Same as Checkpoint #1. 98 | 99 | **SETUP**: See Checkpoint #1. 100 | 101 | **DATA CLEANING**: See Checkpoint #1. 102 | 103 | **DATA ANALYSIS & RESULTS**: This section should include markdown text and code walking us through the following: 104 | 105 | * EDA (Same as Checkpoint #2, but clean visualizations up and feel free to remove unnecessary visualizations) 106 | * What distributions do your variables take? 107 | * Are there any outliers? 108 | * Relationship between variables? 109 | 110 | * Analysis (Note that you will likely have to do some Googling for analytical approaches not discussed in class. This is expected for this project and an important skill for a data scientist to master.) 111 | * What approaches did you use? Why? 112 | * What were the results? 113 | * What was your interpretation of these findings? 114 | 115 | * Data Visualization - There must be at least three (3) appropriate data visualizations throughout these sections.
Each visualization must include an interpretation of what is displayed *and* what should be learned from that visualization. Be sure that the appropriate type of visualization is generated given the data that you have, axes are all labeled, and the visualizations clearly communicate the point you’re trying to make. 116 | 117 | **ETHICS & PRIVACY**: See proposal specifications. (be sure to update with what you actually did to take the ethical considerations into account for the analysis you did!) 118 | 119 | **CONCLUSION & DISCUSSION**: Discuss your project. Summarize your data and question. Briefly describe your analysis. Summarize your results and conclusions. Be sure to mention any limitations of your project. Discuss the impact of this work on society. (2-3 paragraphs) 120 | 121 | 122 | ## Previous Final Projects 123 | 124 | See Prof. Voytek’s write-up of excellent class projects from the Spring 2017 instance of COGS 108 [here](https://voyteklab.com/uc-san-diego-data-science-projects/), all of which received perfect scores. 125 | 126 | Additionally, previous projects can be viewed from when this course ran in [Spring 2017](https://github.com/COGS108/FinalProjects-Sp17), [Winter 2018](https://github.com/COGS108/FinalProjects-Wi18), [Spring 2019](https://github.com/COGS108/FinalProjects-Sp19), [Fall 2019](https://github.com/COGS108/FinalProjects-Fa19), [Winter 2020](https://github.com/COGS108/FinalProjects-Wi20), [Spring 2020](https://github.com/COGS108/FinalProjects-Sp20), [Fall 2020](https://github.com/COGS108/FinalProjects-Fa20), or [Winter 2021](https://github.com/COGS108/FinalProjects-Wi21). Note first, that these projects are of variable quality and second, that if you get inspiration or code from previous projects, this must be noted in your project, giving attribution to the former groups’ work. 127 | 128 | ## How to Find Datasets 129 | 130 | The purpose of this project is to find a real-world problem and dataset (or likely, datasets!)
that can be analyzed with the techniques learned in class and those you learn on your own. It is imperative that by doing so you believe extra information will be gained — that you believe you can discover something new! 131 | 132 | You must use at least one dataset containing at least approximately 1000 observations (unless your data are smaller but you feel they are sufficient). You are welcome (and in fact recommended) to find multiple datasets! 133 | 134 | The best datasets are the ones that can help you answer your question of interest. 135 | 136 | Your question could be just for fun: Using text mining of song lyric websites to identify the most commonly used phrases and sentiments by decade. 137 | 138 | Your question could be scientific: Scrape data from animal taxonomies and Wikipedia to figure out if larger animals are more likely to be carnivores. 139 | 140 | Or, ideally, your question can be aimed at civic or social good, for example, use mapping, transit, and car accident data to identify which parts of San Diego are most in need of dedicated bike lanes. 141 | 142 | To help you find datasets, we have collected a list of websites that have a considerable number of open source data sets and included them at the end of this document. 143 | 144 | ### Dataset Resource List 145 | 146 | Here is a list of potential locations to find datasets and problems to investigate. If you have another dataset or search location, that is great!
147 | 148 | * [Awesome Public Datasets](https://github.com/awesomedata/awesome-public-datasets/blob/master/README.rst) 149 | * [Data.gov](https://catalog.data.gov/dataset) 150 | * [Data Is Plural](https://docs.google.com/spreadsheets/d/1wZhPLMCHKJvwOkP4juclhjFgqIY8fQFMemwKL2c64vk/edit#gid=0) 151 | * [UCSD Datasets](https://ucsd.libguides.com/data-statistics/home) 152 | * [Datasets | Deep Learning](http://deeplearning.net/datasets/) 153 | * [Stanford | Social Science Data Collection](https://data.stanford.edu/) 154 | * [Eviction Lab (email required)](https://evictionlab.org/get-the-data/) 155 | * [San Diego Data](https://data.sandiego.gov/) 156 | * [US Census](https://www.census.gov/) 157 | * [Open Climate Data](http://openclimatedata.net/) 158 | * [Data and Story Library](https://dasl.datadescription.com/datafiles/) 159 | * [UCSD behavioral mobile data](http://extrasensory.ucsd.edu/) 160 | * [Kaggle](https://www.kaggle.com/) 161 | * [FiveThirtyEight](https://data.fivethirtyeight.com/) 162 | * [data.world](https://data.world/) 163 | * [Free Datasets - R and Data Mining ](http://www.rdatamining.com/resources/data) 164 | * [Data Sources for Cool Data Science Projects](https://blog.thedataincubator.com/2014/10/data-sources-for-cool-data-science-projects-part-1/) 165 | -------------------------------------------------------------------------------- /dsip/tutorials/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore ipynb checkpoint files 2 | *ipynb_checkpoints/* 3 | # Ignore Mac Folder Attribute files 4 | *DS_Store* 5 | -------------------------------------------------------------------------------- /dsip/tutorials/00-Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true, 7 | "nbpresent": { 8 | "id": "7fc0cefe-8b1c-4ca9-aa39-094614969842" 9 | } 10 | }, 11 | "source": [ 12 | "# 
Introduction\n", 13 | "\n", 14 | "Welcome to the hands-on materials for Data Science in Practice.\n", 15 | "\n", 16 | "This notebook will guide you through getting the tools you will need for working with these tutorials and assignments." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Alerts" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "Throughout these tutorials, you will see colored 'alert' text:\n", 31 | "\n", 32 | "
\n", 33 | "Green alerts provide key information and definitions.\n", 34 | "
\n", 35 | "\n", 36 | "
\n", 37 | "Blue alerts provide links out to further \n", 38 | "resources. \n", 39 | "
" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": { 45 | "nbpresent": { 46 | "id": "b6153143-e694-4e86-96b0-243f56bad8d5" 47 | } 48 | }, 49 | "source": [ 50 | "## What do you need for these tutorials?\n", 51 | "\n", 52 | "### Software\n", 53 | "\n", 54 | "- Working install of Python (>= 3.6), with the anaconda distribution\n", 55 | " - If you are in the official class, [datahub](http://datahub.ucsd.edu) satisfies this requirement\n", 56 | "- Jupyter Notebooks\n", 57 | " - Also satisfied by [datahub](http://datahub.ucsd.edu)\n", 58 | "- git and a GitHub account" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "### Prerequisites\n", 66 | "\n", 67 | "These tutorials presume that you do already have some basic knowledge of programming. \n", 68 | "\n", 69 | "In particular, it assumes knowledge of the Python programming language and standard library. \n", 70 | "\n", 71 | "If you are somewhat unfamiliar with Python, you can follow the links in the Python notebook to catch up." 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "### Computational Resources\n", 79 | "\n", 80 | "The examples throughout these tutorials, and in the assignments are not computationally heavy. \n", 81 | "\n", 82 | "You should be able to run all these materials on any computer you have access to, assuming it will run the aforementioned tools. " 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "### Installing Python\n", 90 | "\n", 91 | "- If you are running code locally, we recommend you install a new version of Python with Anaconda, as described below\n", 92 | " - If you are in the official course, you can use [datahub](http://datahub.ucsd.edu) for everything you need\n", 93 | "- If you are on Mac, you have a native installation of python. 
This native installation of Python may be older, will not include the extra packages that you will need for this class, and is best left untouched. \n", 94 | " - Downloading Anaconda will install a separate, independent install of Python, leaving your native install untouched. \n", 95 | "- Windows does not require Python natively and so it is not typically pre-installed." 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "## Tools\n", 103 | "\n", 104 | "The following are a series of tools that you will need for this class" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "
\n", 112 | "
\n", 113 | "\n", 114 | "
\n", 115 | "
" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "
\n", 123 | "Anaconda is an open-source distribution of Python, designed for scientific computing, data science and machine learning. \n", 124 | "
\n", 125 | "\n", 126 | "
\n", 127 | "The anaconda website is \n", 128 | "here,\n", 129 | "with the download page\n", 130 | "here.\n", 131 | "
" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "Anaconda itself is a distribution, meaning that it is a version of Python with a collection of packages that are curated and maintained together. \n", 139 | "\n", 140 | "Using a pre-built distribution is useful, as it comes with the packages that you need for data science.\n", 141 | "\n", 142 | "Anaconda also comes with `conda`, which is a package manager, allowing you to download, install, and manage other packages. \n", 143 | "\n", 144 | "The anaconda distribution includes all packages that are needed for these tutorials." 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "
\n", 152 | "
\n", 153 | "\n", 154 | "
\n", 155 | "
" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": { 161 | "nbpresent": { 162 | "id": "0f4dd046-4020-465c-85f6-3d92ac9fe145" 163 | } 164 | }, 165 | "source": [ 166 | "
\n", 167 | "Jupyter notebooks are a way to intermix code, outputs and plain text. \n", 168 | "They run in a web browser, and connect to a kernel to be able to execute code. \n", 169 | "
\n", 170 | "\n", 171 | "
\n", 172 | "The official Jupyter website is available \n", 173 | "here.\n", 174 | "
" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "Note that you do not need to download Jupyter separately, as it comes packaged with the Anaconda distribution." 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "#### Checking Your Python Version\n", 189 | "\n", 190 | "You can check which installation of Python you are using, and which version it is.\n", 191 | "\n", 192 | "Once you have installed anaconda, you should see you are using Python in an anaconda folder. \n", 193 | "\n", 194 | "The version number that is printed should also be 3.6 or greater. " 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 1, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "/opt/anaconda3/bin/python\n", 207 | "Python 3.7.4\n" 208 | ] 209 | } 210 | ], 211 | "source": [ 212 | "# Check the installed version of Python\n", 213 | "# Note: these are command-line functions that may not work on windows\n", 214 | "!which python\n", 215 | "!python --version" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "
\n", 223 | "
\n", 224 | "\n", 225 | "
\n", 226 | "
" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": { 232 | "nbpresent": { 233 | "id": "6576af9c-b0f3-4cbe-9a02-06feaa61d0b0" 234 | } 235 | }, 236 | "source": [ 237 | "
\n", 238 | "Git is a tool, a software package, for version control. \n", 239 | "
\n", 240 | "\n", 241 | "
\n", 242 | "Install \n", 243 | "git,\n", 244 | "if you don't already have it.\n", 245 | "
" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "
\n", 253 | "
\n", 254 | "\n", 255 | "
\n", 256 | "
" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": {}, 262 | "source": [ 263 | "
\n", 264 | "GitHub is an online hosting service that can be used with git, and offers online tools to use git. \n", 265 | "
\n", 266 | "\n", 267 | "
\n", 268 | "Create an account on \n", 269 | "Github.\n", 270 | "
" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "Git & GitHub are not the same thing, though, in practice, they are commonly used together, whereby git is used as a tool to version control code and manage multiple copies stored across your computer, as well as on remote repositories that are stored on Github.\n", 278 | "\n", 279 | "Note that while GitHub is a private company, git is an open-source tool, and can be used independent of GitHub." 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 2, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | "output_type": "stream", 290 | "text": [ 291 | "git version 2.20.1 (Apple Git-117)\r\n" 292 | ] 293 | } 294 | ], 295 | "source": [ 296 | "# Check that you have git installed (which version doesn't really matter)\n", 297 | "!git --version" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "
\n", 305 | "
\n", 306 | "\n", 307 | "
\n", 308 | "
" 309 | ] 310 | }, 311 | { 312 | "cell_type": "markdown", 313 | "metadata": {}, 314 | "source": [ 315 | "
\n", 316 | "Source Tree is a free graphical user interface (GUI) for managing repositories with git & GitHub. \n", 317 | "
\n", 318 | "\n", 319 | "
\n", 320 | "Source Tree is available \n", 321 | "here.\n", 322 | "You will need an account on \n", 323 | "Atlassian,\n", 324 | "who make Source Tree, but this is free.\n", 325 | "
\n", 326 | "\n", 327 | "You don't need to use SourceTree (or any other GUI) if you know, or want to learn to use git from the command line." 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "## Environments" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "
\n", 342 | "Environments are isolated, independent installations of a programming language and groups of packages, that don't interfere with each other. \n", 343 | "
\n", 344 | "\n", 345 | "
\n", 346 | "Anaconda has detailed instructions on using environments available \n", 347 | "here.\n", 348 | "
" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "You do not need to use environments, however, you may find them useful if you want or need to maintain multiple different versions of Python. \n", 356 | "\n", 357 | "If you want to use an environment, and already have conda, you can run this command from the command line:
\n", 358 | "\n", 359 | "``$ conda create --name *envname* python=3.7 anaconda``
\n", 360 | "\n", 361 | "^ Replace '*envname*' with a name to call this environment.
\n", 362 | "\n", 363 | "This will install a new environment, with Python 3.7 and the anaconda distribution.\n", 364 | "\n", 365 | "You will then need to activate this environment every time you want to use it. \n", 366 | "\n", 367 | "To activate your environment:
\n", 368 | "``$ conda activate *envname*``\n", 369 | "\n", 370 | "To deactivate your environment:
\n", 371 | "``$ conda deactivate``" 372 | ] 373 | } 374 | ], 375 | "metadata": { 376 | "anaconda-cloud": {}, 377 | "kernelspec": { 378 | "display_name": "Python 3", 379 | "language": "python", 380 | "name": "python3" 381 | }, 382 | "language_info": { 383 | "codemirror_mode": { 384 | "name": "ipython", 385 | "version": 3 386 | }, 387 | "file_extension": ".py", 388 | "mimetype": "text/x-python", 389 | "name": "python", 390 | "nbconvert_exporter": "python", 391 | "pygments_lexer": "ipython3", 392 | "version": "3.7.4" 393 | }, 394 | "nbpresent": { 395 | "slides": { 396 | "3d09dc46-88c8-44cb-bc57-259db78a0e70": { 397 | "id": "3d09dc46-88c8-44cb-bc57-259db78a0e70", 398 | "prev": "8d1b5def-2290-42c9-8b06-1c6e0e495521", 399 | "regions": { 400 | "4601423d-c94d-46da-885b-fe33b0216c22": { 401 | "attrs": { 402 | "height": 1, 403 | "width": 1, 404 | "x": 0, 405 | "y": 0 406 | }, 407 | "content": { 408 | "cell": "0f4dd046-4020-465c-85f6-3d92ac9fe145", 409 | "part": "whole" 410 | }, 411 | "id": "4601423d-c94d-46da-885b-fe33b0216c22" 412 | } 413 | } 414 | }, 415 | "8d1b5def-2290-42c9-8b06-1c6e0e495521": { 416 | "id": "8d1b5def-2290-42c9-8b06-1c6e0e495521", 417 | "prev": "bc666852-d015-42a1-b679-eaf92d5eb643", 418 | "regions": { 419 | "d0118c2f-7757-4efa-a276-96f162d312ae": { 420 | "attrs": { 421 | "height": 1, 422 | "width": 1, 423 | "x": 0, 424 | "y": 0 425 | }, 426 | "content": { 427 | "cell": "d9d878d6-230b-4f1e-b2aa-2f152cb3fe8e", 428 | "part": "whole" 429 | }, 430 | "id": "d0118c2f-7757-4efa-a276-96f162d312ae" 431 | } 432 | } 433 | }, 434 | "b039dd05-8357-462a-9525-7f8103de436c": { 435 | "id": "b039dd05-8357-462a-9525-7f8103de436c", 436 | "prev": "3d09dc46-88c8-44cb-bc57-259db78a0e70", 437 | "regions": { 438 | "9180ab3f-f784-45a2-b2cc-a18aad800fc5": { 439 | "attrs": { 440 | "height": 1, 441 | "width": 1, 442 | "x": 0, 443 | "y": 0 444 | }, 445 | "content": { 446 | "cell": "b57ed03a-8c01-4e48-95e8-9c6753e35088", 447 | "part": "whole" 448 | }, 449 | "id": 
"9180ab3f-f784-45a2-b2cc-a18aad800fc5" 450 | } 451 | } 452 | }, 453 | "bc666852-d015-42a1-b679-eaf92d5eb643": { 454 | "id": "bc666852-d015-42a1-b679-eaf92d5eb643", 455 | "layout": "grid", 456 | "prev": null, 457 | "regions": { 458 | "31cd776f-cc93-49d6-a40c-c590805cfb8f": { 459 | "attrs": { 460 | "height": 0.8333333333333334, 461 | "pad": 0.01, 462 | "width": 0.8333333333333334, 463 | "x": 0.08333333333333333, 464 | "y": 0.08333333333333333 465 | }, 466 | "content": { 467 | "cell": "7fc0cefe-8b1c-4ca9-aa39-094614969842", 468 | "part": "whole" 469 | }, 470 | "id": "31cd776f-cc93-49d6-a40c-c590805cfb8f" 471 | }, 472 | "e1612c29-0f61-4692-9d6e-112e8d378e46": { 473 | "attrs": { 474 | "height": 0.8333333333333334, 475 | "pad": 0.01, 476 | "width": 0.8333333333333334, 477 | "x": 0.08333333333333333, 478 | "y": 0.08333333333333333 479 | }, 480 | "content": { 481 | "cell": "7fc0cefe-8b1c-4ca9-aa39-094614969842", 482 | "part": "whole" 483 | }, 484 | "id": "e1612c29-0f61-4692-9d6e-112e8d378e46" 485 | } 486 | } 487 | } 488 | }, 489 | "themes": {} 490 | } 491 | }, 492 | "nbformat": 4, 493 | "nbformat_minor": 1 494 | } 495 | -------------------------------------------------------------------------------- /dsip/tutorials/01-Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "collapsed": true 14 | }, 15 | "source": [ 16 | "
\n", 17 | "
\n", 18 | "\n", 19 | "
\n", 20 | "
" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "source": [ 29 | "
\n", 30 | "Python is an \n", 31 | "open-source, \n", 32 | "high-level , \n", 33 | "general purpose, \n", 34 | "interpreted, \n", 35 | "programming language, \n", 36 | "one of the most popular for data science applications. \n", 37 | "
\n", 38 | "\n", 39 | "
\n", 40 | "The official Python\n", 41 | "website.\n", 42 | "
" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Why Python\n", 50 | "\n", 51 | "- As a general purpose language, Python supports a large range of tasks.\n", 52 | " - Or put another way: 'Python isn't the best at anything, but it's second best at everything'\n", 53 | " - This is useful. A data science project may include everything from scraping data from the web, analyzing a mixture of text and numerical data, computing features, training a model, creating high-quality graphs, and then hosting a website with your results. \n", 54 | "- Python is explicitly and by design, user-friendly.\n", 55 | "- Python also has a massive user community, who contribute to a large number of high-quality, well maintained open-source tools.\n", 56 | " - The best language for your project is one which has the things you need.\n", 57 | "- In part for the reasons listed above, Python is heavily used in industry\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "
\n", 65 | "The Python programming language is developed and maintained by the\n", 66 | "Python Software Foundation.\n", 67 | "
" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Python Versions\n", 75 | "\n", 76 | "This class uses Python3, the currently developed version of Python, and more specifically Python version 3.6 or above. \n", 77 | "\n", 78 | "Python2 has reached \"End of Life\" meaning it is no longer supported or maintained by the Python Organization. " 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## Python Resources\n", 86 | "\n", 87 | "These materials presume prior knowledge of the Python programming language. \n", 88 | "\n", 89 | "If you are not yet familiar, here are some entry level materials for learning Python:\n", 90 | "\n", 91 | "- [Codecademy](https://www.codecademy.com/tracks/python) is good for a beginner's introduction to the language.\n", 92 | "- [The Official Beginners Guide](https://wiki.python.org/moin/BeginnersGuide) is supported by the Python organization.\n", 93 | "- [Whirlwind Tour of Python](https://github.com/jakevdp/WhirlwindTourOfPython) is a free collection of Jupyter notebooks that takes you through Python. \n", 94 | " - This book is especially good (and specifically designed for) if you have some experience with programming in some other language, and want to quickly run through the specifics of Python." 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "
\n", 102 | "A much broader list of resources and guides for learning Python is available \n", 103 | "here.\n", 104 | "
" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Getting Un-Stuck\n", 112 | "\n", 113 | "At some point, you will get stuck. It happens. The internet is your friend. \n", 114 | "\n", 115 | "If you get an error, or aren't sure how to proceed, use {your favourite search engine} with specific search terms relating to what you are trying to do. Sometimes this just means searching the error that you got.\n", 116 | "\n", 117 | "You are likely to find responses on [StackOverflow](https://stackoverflow.com) - which is basically a forum for programming questions, and a good place to find answers. " 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "## Standard Library" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "
\n", 132 | "The Standard Library refers to everything that is part of the standard version and install of Python.\n", 133 | "
" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "
\n", 141 | "The Python \n", 142 | "Standard Library\n", 143 | "comes with a lot of basic functionality. \n", 144 | "
" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "Part of what makes Python a powerful language is the standard library itself, which is a rich set of tools for programming. However, the standard library itself does not include data science tools, and a lot of the power of Python stems from a rich ecosystem of packages that can be added and used with Python. " 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "## Packages" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "
\n", 166 | "Packages are collections of code. Packages from outside the standard library can be installed and added to Python.\n", 167 | "
\n", 168 | "\n", 169 | "
\n", 170 | "For managing and installing packages, Anaconda comes with the \n", 171 | "conda\n", 172 | "package manager.\n", 173 | "
" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## Scientific Python\n", 181 | "\n", 182 | "When we say that Python is good for data science, and scientific computing, what we really mean is that there is a rich ecosystem of available open-source external packages, that greatly expand the capacities of the language beyond the standard library. \n", 183 | "\n", 184 | "This set of packages, which we will introduce as we go through these materials, is sometimes referred to as 'Scientific Python', or the 'Scipy' ecosystem. \n", 185 | "\n", 186 | "For the purposes of these materials, the Anaconda distribution that we are using contains all the packages you need. " 187 | ] 188 | } 189 | ], 190 | "metadata": { 191 | "anaconda-cloud": {}, 192 | "kernelspec": { 193 | "display_name": "Python 3", 194 | "language": "python", 195 | "name": "python3" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 3 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython3", 207 | "version": "3.7.4" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 1 212 | } 213 | -------------------------------------------------------------------------------- /dsip/tutorials/02-JupyterNotebooks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Jupyter Notebooks\n", 8 | "\n", 9 | "
\n", 10 | "
\n", 11 | "\n", 12 | "
\n", 13 | "
\n", 14 | "\n", 15 | "This is a quick introduction to Jupyter notebooks." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "
\n", 23 | "Jupyter notebooks are a way to combine executable code, code outputs, and text into one connected file.\n", 24 | "
\n", 25 | "\n", 26 | "
\n", 27 | "The official documentation from project Jupyter is available \n", 28 | "here\n", 29 | "and they also have some example notebooks \n", 30 | "here.\n", 31 | "
" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## Menu Options & Shortcuts\n", 39 | "\n", 40 | "To get a quick tour of the Jupyter user-interface, click on the 'Help' menu, then click 'User Interface Tour'.\n", 41 | "\n", 42 | "There are also a large number of useful keyboard shortcuts. Click on the 'Help' menu, and then 'Keyboard Shortcuts' to see a list. " 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Cells" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "
\n", 57 | "The main organizational units of a notebook are 'cells'.\n", 58 | "
" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "Cells can be markdown (text) cells, like this one, or code cells (we'll get to those)." 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### Markdown cells" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": { 78 | "slideshow": { 79 | "slide_type": "fragment" 80 | } 81 | }, 82 | "source": [ 83 | "Markdown cells are useful for communicating information about our notebooks.\n", 84 | "\n", 85 | "They perform basic text formatting including italics, bold, headings, links and images.\n", 86 | "\n", 87 | "Double-click on any of the cells in this section to see what the plain-text looks like. Run the cell to then see what the formatted Markdown text looks like." 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "slideshow": { 94 | "slide_type": "slide" 95 | } 96 | }, 97 | "source": [ 98 | "# This is a heading\n", 99 | "\n", 100 | "## This is a smaller heading\n", 101 | "\n", 102 | "### This is a really small heading" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "slideshow": { 109 | "slide_type": "slide" 110 | } 111 | }, 112 | "source": [ 113 | "We can italicize text either like *this* or like _this_." 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": { 119 | "slideshow": { 120 | "slide_type": "fragment" 121 | } 122 | }, 123 | "source": [ 124 | "We can embolden text either like **this** or like __this__."
125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": { 130 | "slideshow": { 131 | "slide_type": "slide" 132 | } 133 | }, 134 | "source": [ 135 | "Here is an unordered list of items:\n", 136 | "* This is an item\n", 137 | "* This is an item\n", 138 | "* This is an item" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "slideshow": { 145 | "slide_type": "slide" 146 | } 147 | }, 148 | "source": [ 149 | "Here is an ordered list of items:\n", 150 | "1. This is my first item\n", 151 | "2. This is my second item\n", 152 | "3. This is my third item" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": { 158 | "slideshow": { 159 | "slide_type": "slide" 160 | } 161 | }, 162 | "source": [ 163 | "We can have a list of lists by using indentation:\n", 164 | "* This is an item\n", 165 | "* This is an item\n", 166 | "\t* This is an item\n", 167 | "\t* This is an item\n", 168 | "* This is an item" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": { 174 | "slideshow": { 175 | "slide_type": "slide" 176 | } 177 | }, 178 | "source": [ 179 | "We can also combine ordered and unordered lists:\n", 180 | "1. This is my first item\n", 181 | "2. This is my second item\n", 182 | "\t* This is an item\n", 183 | "\t* This is an item\n", 184 | "3. This is my third item" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": { 190 | "slideshow": { 191 | "slide_type": "slide" 192 | } 193 | }, 194 | "source": [ 195 | "We can make a link to this [useful markdown cheatsheet](https://www.markdownguide.org/cheat-sheet/) as such."
196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": { 201 | "slideshow": { 202 | "slide_type": "fragment" 203 | } 204 | }, 205 | "source": [ 206 | "If we don't use the markdown syntax for links, it will just show the link itself as the link text: https://www.markdownguide.org/cheat-sheet/" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "slideshow": { 213 | "slide_type": "slide" 214 | } 215 | }, 216 | "source": [ 217 | "### LaTeX-formatted text" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": { 223 | "slideshow": { 224 | "slide_type": "fragment" 225 | } 226 | }, 227 | "source": [ 228 | "$$ P(A \\mid B) = \\frac{P(B \\mid A) \\, P(A)}{P(B)} $$" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "### Code Cells\n", 236 | "\n", 237 | "Code cells are cells that contain code, that can be executed. \n", 238 | "\n", 239 | "Comments can also be written in code cells, indicated by '#'. 
" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 1, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "# In a code cell, comments can be typed\n", 249 | "a = 1\n", 250 | "b = 2" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 2, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "3\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "# Cells can also have output, that gets printed out below the cell.\n", 268 | "print(a + b)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 3, 274 | "metadata": { 275 | "slideshow": { 276 | "slide_type": "slide" 277 | } 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "# Define a variable in code\n", 282 | "my_string = 'hello world'" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 4, 288 | "metadata": { 289 | "slideshow": { 290 | "slide_type": "fragment" 291 | } 292 | }, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "hello world\n" 299 | ] 300 | } 301 | ], 302 | "source": [ 303 | "# Print out a variable\n", 304 | "print(my_string)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 5, 310 | "metadata": { 311 | "slideshow": { 312 | "slide_type": "slide" 313 | } 314 | }, 315 | "outputs": [ 316 | { 317 | "data": { 318 | "text/plain": [ 319 | "'HELLO WORLD'" 320 | ] 321 | }, 322 | "execution_count": 5, 323 | "metadata": {}, 324 | "output_type": "execute_result" 325 | } 326 | ], 327 | "source": [ 328 | "# Operations that return objects get printed out as output\n", 329 | "my_string.upper()" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 6, 335 | "metadata": { 336 | "slideshow": { 337 | "slide_type": "slide" 338 | } 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "# Define a list variable\n", 343 | "my_list = 
['a','b','c']" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 7, 349 | "metadata": { 350 | "slideshow": { 351 | "slide_type": "fragment" 352 | } 353 | }, 354 | "outputs": [ 355 | { 356 | "name": "stdout", 357 | "output_type": "stream", 358 | "text": [ 359 | "['a', 'b', 'c']\n" 360 | ] 361 | } 362 | ], 363 | "source": [ 364 | "# Print out our list variable\n", 365 | "print(my_list)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": {}, 371 | "source": [ 372 | "## Accessing Documentation" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "
\n", 380 | "Jupyter has useful shortcuts. Add a single '?' after a function or class to get a window with the documentation, or a double '??' to pull up the source code. \n", 381 | "
" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 8, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "# Import numpy for examples\n", 391 | "import numpy as np" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 9, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "# Check the docs for a numpy array\n", 401 | "np.array?" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 10, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "# Check the full source code for numpy append function\n", 411 | "np.append??" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 11, 417 | "metadata": { 418 | "slideshow": { 419 | "slide_type": "fragment" 420 | } 421 | }, 422 | "outputs": [], 423 | "source": [ 424 | "# Get information about a variable you've created\n", 425 | "my_string?" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": {}, 431 | "source": [ 432 | "## Autocomplete" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "
\n", 440 | "Jupyter also has \n", 441 | "tab complete\n", 442 | "capacities, which can autocomplete what you are typing, and/or be used to explore what code is available. \n", 443 | "
" 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 12, 449 | "metadata": {}, 450 | "outputs": [ 451 | { 452 | "ename": "SyntaxError", 453 | "evalue": "invalid syntax (, line 2)", 454 | "output_type": "error", 455 | "traceback": [ 456 | "\u001b[0;36m File \u001b[0;32m\"\"\u001b[0;36m, line \u001b[0;32m2\u001b[0m\n\u001b[0;31m np.\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | "# Move your cursor just after the period, press tab, and a drop menu will appear showing all possible completions\n", 462 | "np." 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "# Autocomplete does not have to be at a period. Move to the end of 'ra' and hit tab to see completion options. \n", 472 | "ra" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "# If there is only one option, tab-complete will auto-complete what you are typing\n", 482 | "ran" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "## Kernel & Namespace\n", 490 | "\n", 491 | "You do not need to run cells in order! This is useful for flexibly testing and developing code. \n", 492 | "\n", 493 | "The numbers in the square brackets to the left of a cell show which cells have been run, and in what order.\n", 494 | "\n", 495 | "However, it can also be easy to lose track of what has already been declared / imported, leading to unexpected behaviour from running cells.\n", 496 | "\n", 497 | "The kernel is what connects the notebook to your computer behind-the-scenes to execute the code. \n", 498 | "\n", 499 | "It can be useful to clear and re-launch the kernel. 
You can do this from the 'kernel' drop down menu, at the top, optionally also clearing all outputs." 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "## Magic Commands" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": {}, 512 | "source": [ 513 | "
\n", 514 | "'Magic Commands' are a special (command-line like) syntax in IPython/Jupyter to run special functionality. They can run on lines and/or entire cells. \n", 515 | "
\n", 516 | "\n", 517 | "
\n", 518 | "The IPython documentation has more information on magic commands.\n", 519 | "
" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": { 525 | "slideshow": { 526 | "slide_type": "slide" 527 | } 528 | }, 529 | "source": [ 530 | "Magic commands are designed to succinctly solve various common problems in standard data analysis. Magic commands come in two flavors: line magics, which are denoted by a single % prefix and operate on a single line of input, and cell magics, which are denoted by a double %% prefix and operate on multiple lines of input." 531 | ] 532 | }, 533 | { 534 | "cell_type": "code", 535 | "execution_count": null, 536 | "metadata": { 537 | "slideshow": { 538 | "slide_type": "slide" 539 | } 540 | }, 541 | "outputs": [], 542 | "source": [ 543 | "# Access quick reference sheet for interactive Python (this opens a reference guide)\n", 544 | "%quickref" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "# Check a list of available magic commands\n", 554 | "%lsmagic" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": { 561 | "slideshow": { 562 | "slide_type": "slide" 563 | } 564 | }, 565 | "outputs": [], 566 | "source": [ 567 | "# Check the current working directory\n", 568 | "%pwd" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": null, 574 | "metadata": { 575 | "slideshow": { 576 | "slide_type": "fragment" 577 | } 578 | }, 579 | "outputs": [], 580 | "source": [ 581 | "# Check all currently defined variables\n", 582 | "%who" 583 | ] 584 | }, 585 | { 586 | "cell_type": "code", 587 | "execution_count": null, 588 | "metadata": { 589 | "slideshow": { 590 | "slide_type": "fragment" 591 | } 592 | }, 593 | "outputs": [], 594 | "source": [ 595 | "# Check all variables, with more information about them\n", 596 | "%whos" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "metadata": { 603 | "slideshow": { 604 |
"slide_type": "slide" 605 | } 606 | }, 607 | "outputs": [], 608 | "source": [ 609 | "# Check code history\n", 610 | "%hist" 611 | ] 612 | }, 613 | { 614 | "cell_type": "markdown", 615 | "metadata": {}, 616 | "source": [ 617 | "### Line Magics\n", 618 | "\n", 619 | "\n", 620 | "Line magics use a single '%', and apply to a single line. " 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "# For example, we can time how long it takes to create a large list\n", 630 | "%timeit list(range(100000))" 631 | ] 632 | }, 633 | { 634 | "cell_type": "markdown", 635 | "metadata": {}, 636 | "source": [ 637 | "### Cell Magics\n", 638 | "\n", 639 | "Cell magics use a double '%%', and apply to the whole cell. " 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "%%timeit\n", 649 | "# For example, we could time a whole cell\n", 650 | "a = list(range(100000))\n", 651 | "b = [n + 1 for n in a]" 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "### Running terminal commands\n", 659 | "\n", 660 | "Another nice thing about notebooks is being able to run terminal commands" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "metadata": {}, 667 | "outputs": [], 668 | "source": [ 669 | "# You can run a terminal command by adding '!' to the start of the line\n", 670 | "!pwd\n", 671 | "\n", 672 | "# Note that in this case, '!pwd' is equivalent to line magic '%pwd'. \n", 673 | "# The '!'
syntax is more general though, allowing you to run anything you want through the command line " 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "%%bash\n", 683 | "# Equivalently, (for bash) use the %%bash cell magic to run a cell as bash (command-line)\n", 684 | "pwd" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": null, 690 | "metadata": { 691 | "slideshow": { 692 | "slide_type": "fragment" 693 | } 694 | }, 695 | "outputs": [], 696 | "source": [ 697 | "# List files in directory\n", 698 | "!ls" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": null, 704 | "metadata": { 705 | "slideshow": { 706 | "slide_type": "fragment" 707 | } 708 | }, 709 | "outputs": [], 710 | "source": [ 711 | "# Change current directory (note: '!cd' runs in a temporary subshell, so the change does not persist; use '%cd' to change the notebook's directory)\n", 712 | "!cd ." 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "
\n", 720 | "For more useful information, check out Jupyter Notebooks \n", 721 | "tips & tricks, and more information on how \n", 722 | "notebooks work.\n", 723 | "
" 724 | ] 725 | } 726 | ], 727 | "metadata": { 728 | "kernelspec": { 729 | "display_name": "Python 3", 730 | "language": "python", 731 | "name": "python3" 732 | }, 733 | "language_info": { 734 | "codemirror_mode": { 735 | "name": "ipython", 736 | "version": 3 737 | }, 738 | "file_extension": ".py", 739 | "mimetype": "text/x-python", 740 | "name": "python", 741 | "nbconvert_exporter": "python", 742 | "pygments_lexer": "ipython3", 743 | "version": "3.7.4" 744 | } 745 | }, 746 | "nbformat": 4, 747 | "nbformat_minor": 2 748 | } 749 | -------------------------------------------------------------------------------- /dsip/tutorials/05-DataGathering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Gathering" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "
\n", 15 | "Data Gathering is the process of accessing data and collecting it together.\n", 16 | "
" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "This notebook covers strategies for finding and gathering data.\n", 24 | "\n", 25 | "If you want to start by working on data analyses (with provided data) you can move onto the next tutorials, and come back to this one later.\n", 26 | "\n", 27 | "Data gathering can encompass many different strategies, including data collection, web scraping, accessing data from databases, and downloading data in bulk. Sometimes it even includes things like calling someone to ask if you can use some of their data, and asking them to send it over. " 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "## Where to get Data\n", 35 | "\n", 36 | "There are lots of ways to get data, and lots of places to get it from. Typically, most of this data will be accessed through the internet, in one way or another, especially when pursuing independent research projects. \n", 37 | "\n", 38 | "### Institutional Access\n", 39 | "\n", 40 | "If you are working with data as part of an institution, such as a company or research lab, the institution will typically have data it needs analyzing, that it collects in various ways. Keep in mind that even people working inside institutions, with access to local data, will still seek to find and incorporate external datasets. \n", 41 | "\n", 42 | "### Data Repositories\n", 43 | "\n", 44 | "**Data repositories** are databases from which you can download data. Some data repositories allow you to explore available datasets and download datasets in bulk. Others may also offer **APIs**, through which you can request specific data from particular databases.\n", 45 | "\n", 46 | "### Web Scraping\n", 47 | "\n", 48 | "The web itself is full of unstructured data. 
**Web scraping** can be used to extract and collect data directly from websites.\n", 49 | "\n", 50 | "### Asking People for Data\n", 51 | "\n", 52 | "Not all data is indexed or accessible on the web, at least not publicly. Sometimes finding data means figuring out if any data is available, figuring out where it might be, and then reaching out and asking people directly about data access. If there is some particular data you need, you can try to figure out who might have it, and get in touch to see if it might be available." 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "### Data Gathering Skills\n", 60 | "\n", 61 | "Depending on your gathering method, you will likely have to do some combination of the following:\n", 62 | "\n", 63 | "- Directly download data files from repositories\n", 64 | "- Query databases & use APIs to extract and collect data of interest\n", 65 | "- Ask people for data, and go pick up data with a hard drive\n", 66 | "\n", 67 | "Ultimately, the goal is to collect and curate data files, hopefully structured, that you can read into Python." 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "## Definitions: Databases & Query Languages\n", 75 | "\n", 76 | "Here, we will introduce some useful definitions you will likely encounter when exploring how to gather data. \n", 77 | "\n", 78 | "Other than these definitions, we will not cover databases & query languages further in these tutorials. " 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "
\n", 86 | "A database is an organized collection of data. More formally, 'database' refers to a set of related data, and the way it is organized. \n", 87 | "
" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "
\n", 95 | "A query language is a language for operating with databases, such as retrieving, and sometimes modifying, information from databases.\n", 96 | "
" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "
\n", 104 | "SQL (pronounced 'sequel') is a common query language used to interact with databases, and request data.\n", 105 | "
\n", 106 | "\n", 107 | "
\n", 108 | "If you are interested, there is a useful introduction and tutorial to SQL\n", 109 | "here\n", 110 | "as well as some useful 'cheat sheets' \n", 111 | "here\n", 112 | "and\n", 113 | "here.\n", 114 | "
" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "## Data Repositories" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "
\n", 129 | "A Data Repository is basically just a place that data is stored. For our purposes, it is a place you can download data from. \n", 130 | "
\n", 131 | "\n", 132 | "
\n", 133 | "There is a curated list of good data sources included in the \n", 134 | "project materials.\n", 135 | "
" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "For our purposes, data repositories are places you can download data directly from, for example [data.gov](https://www.data.gov/)." 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "## Application Program Interfaces (APIs)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "
\n", 157 | "APIs are basically a way for software to talk to software - an API is an interface into an application / website / database designed for software.\n", 158 | "
\n", 159 | "\n", 160 | "
\n", 161 | "For a simple explanation of APIs go\n", 162 | "here\n", 163 | "or for a much broader, more technical, overview try\n", 164 | "here.\n", 165 | "
\n", 166 | "\n", 167 | "
\n", 168 | "This\n", 169 | "list\n", 170 | "includes a collection of commonly used and available APIs. \n", 171 | "
" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "APIs offer a lot of functionality - you can send requests to the application to do all kinds of actions. In fact, any application interface that is designed to be used programmatically is an API, including, for example, interfaces for using packages of code. \n", 179 | "\n", 180 | "One of the many things that APIs do, and offer, is a way to query and access data from particular applications / databases. For example, there is an API for Google Maps that allows for programmatically querying the latitude & longitude positions of given addresses. \n", 181 | "\n", 182 | "The benefit of using APIs for data gathering purposes is that they typically return data in nicely structured formats, that are relatively easy to analyze." 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "### Launching URL Requests from Python\n", 190 | "\n", 191 | "In order to use APIs, and for other approaches to collecting data, it may be useful to launch URL requests from Python.\n", 192 | "\n", 193 | "Note that by `URL`, we just mean a file or application that can be reached by a web address. Python can be used to organize and launch URL requests, triggering actions and collecting any returned data. \n", 194 | "\n", 195 | "In practice, APIs are usually special URLs that return raw data, such as `json` or `XML` files. This is compared to URLs we are typically more used to that return web pages as `html`, which can be rendered for human viewers. The key difference is that APIs return structured data files, whereas `html` files are typically unstructured (more on that later, with web scraping). \n", 196 | "\n", 197 | "If you wish to use an API, try to find its documentation to see how to send requests to access whatever data you want. 
\n", 198 | "\n", 199 | "#### API Example\n", 200 | "\n", 201 | "For our example here, we will use the Github API. Note that the URL we use is `api.github.com`. This URL accesses the API, and will return structured data files, instead of the html that would be returned by the standard URL (github.com)." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 10, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "import pandas as pd\n", 211 | "\n", 212 | "# We will use the `requests` library to launch URL requests from Python\n", 213 | "import requests" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 11, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "# Request data from the Github API on a particular user\n", 223 | "page = requests.get('https://api.github.com/users/tomdonoghue')" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 12, 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/plain": [ 234 | 
"b'{\"login\":\"TomDonoghue\",\"id\":7727566,\"node_id\":\"MDQ6VXNlcjc3Mjc1NjY=\",\"avatar_url\":\"https://avatars0.githubusercontent.com/u/7727566?v=4\",\"gravatar_id\":\"\",\"url\":\"https://api.github.com/users/TomDonoghue\",\"html_url\":\"https://github.com/TomDonoghue\",\"followers_url\":\"https://api.github.com/users/TomDonoghue/followers\",\"following_url\":\"https://api.github.com/users/TomDonoghue/following{/other_user}\",\"gists_url\":\"https://api.github.com/users/TomDonoghue/gists{/gist_id}\",\"starred_url\":\"https://api.github.com/users/TomDonoghue/starred{/owner}{/repo}\",\"subscriptions_url\":\"https://api.github.com/users/TomDonoghue/subscriptions\",\"organizations_url\":\"https://api.github.com/users/TomDonoghue/orgs\",\"repos_url\":\"https://api.github.com/users/TomDonoghue/repos\",\"events_url\":\"https://api.github.com/users/TomDonoghue/events{/privacy}\",\"received_events_url\":\"https://api.github.com/users/TomDonoghue/received_events\",\"type\":\"User\",\"site_admin\":false,\"name\":\"Tom\",\"company\":\"UC San Diego\",\"blog\":\"https://tomdonoghue.github.io\",\"location\":\"San Diego\",\"email\":null,\"hireable\":null,\"bio\":\"Cognitive Science Grad Student @ UC San Diego working on analyzing electrical brain activity. Also teaching Python & Data Science. 
\\\\r\\\\n\\\\r\\\\n\",\"twitter_username\":null,\"public_repos\":13,\"public_gists\":0,\"followers\":97,\"following\":83,\"created_at\":\"2014-05-28T20:20:48Z\",\"updated_at\":\"2020-06-19T21:35:12Z\"}'" 235 | ] 236 | }, 237 | "execution_count": 12, 238 | "metadata": {}, 239 | "output_type": "execute_result" 240 | } 241 | ], 242 | "source": [ 243 | "# In this case, the content we get back is a json file\n", 244 | "page.content" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 13, 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/plain": [ 255 | "login TomDonoghue\n", 256 | "id 7727566\n", 257 | "node_id MDQ6VXNlcjc3Mjc1NjY=\n", 258 | "avatar_url https://avatars0.githubusercontent.com/u/77275...\n", 259 | "gravatar_id \n", 260 | "url https://api.github.com/users/TomDonoghue\n", 261 | "html_url https://github.com/TomDonoghue\n", 262 | "followers_url https://api.github.com/users/TomDonoghue/follo...\n", 263 | "following_url https://api.github.com/users/TomDonoghue/follo...\n", 264 | "gists_url https://api.github.com/users/TomDonoghue/gists...\n", 265 | "starred_url https://api.github.com/users/TomDonoghue/starr...\n", 266 | "subscriptions_url https://api.github.com/users/TomDonoghue/subsc...\n", 267 | "organizations_url https://api.github.com/users/TomDonoghue/orgs\n", 268 | "repos_url https://api.github.com/users/TomDonoghue/repos\n", 269 | "events_url https://api.github.com/users/TomDonoghue/event...\n", 270 | "received_events_url https://api.github.com/users/TomDonoghue/recei...\n", 271 | "type User\n", 272 | "site_admin False\n", 273 | "name Tom\n", 274 | "company UC San Diego\n", 275 | "blog https://tomdonoghue.github.io\n", 276 | "location San Diego\n", 277 | "email None\n", 278 | "hireable None\n", 279 | "bio Cognitive Science Grad Student @ UC San Diego ...\n", 280 | "twitter_username None\n", 281 | "public_repos 13\n", 282 | "public_gists 0\n", 283 | "followers 97\n", 284 | "following 83\n", 285 | 
"created_at 2014-05-28T20:20:48Z\n", 286 | "updated_at 2020-06-19T21:35:12Z\n", 287 | "dtype: object" 288 | ] 289 | }, 290 | "execution_count": 13, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "# We can read in the json data with pandas\n", 297 | "pd.read_json(page.content, typ='series')" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "As we can see above, in a couple lines of code, we can collect a lot of structured data about a particular user.\n", 305 | "\n", 306 | "If we wanted to do analyses of Github profiles and activity, we could use the Github API to collect information about a group of users, and then analyze and compare the collected data. " 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": { 312 | "collapsed": true 313 | }, 314 | "source": [ 315 | "## Web Scraping" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "
\n", 323 | "Web scraping is when you (programmatically) extract data from websites.\n", 324 | "
\n", 325 | "\n", 326 | "
\n", 327 | "Wikipedia\n", 328 | "has a useful page on web scraping.\n", 329 | "
" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "metadata": {}, 335 | "source": [ 336 | "By web scraping, we typically mean something distinct from using the internet to access an API. Rather, web scraping refers to using code to systematically navigate the internet, and extract information from the internet, from html or other available files. Note that in this case one is not interacting directly with a database, but simply exploring and collecting whatever is available on web pages.\n", 337 | "\n", 338 | "Note that the following section uses the 'BeautifulSoup' module, which is not part of the standard anaconda distribution. \n", 339 | "\n", 340 | "If you do not have BeautifulSoup, and want to get it to run this section, you can uncomment the cell below, and run it, to install BeautifulSoup in your current Python environment. You only have to do this once." 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 5, 346 | "metadata": { 347 | "collapsed": true 348 | }, 349 | "outputs": [], 350 | "source": [ 351 | "#import sys\n", 352 | "#!conda install --yes --prefix {sys.prefix} beautifulsoup4" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 6, 358 | "metadata": { 359 | "collapsed": true 360 | }, 361 | "outputs": [], 362 | "source": [ 363 | "# Import BeautifulSoup\n", 364 | "from bs4 import BeautifulSoup" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 7, 370 | "metadata": { 371 | "collapsed": true 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "# Set the URL for the page we wish to scrape\n", 376 | "site_url = 'https://en.wikipedia.org/wiki/Data_science'\n", 377 | "\n", 378 | "# Launch the URL request, to get the page\n", 379 | "page = requests.get(site_url)" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 8, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "data": { 389 | "text/plain": [ 390 | "b'\\n\\n\\n\\nData science - 
Wikipedia\\n\\n