├── .gitignore
├── CITATION.cff
├── LICENSE.txt
├── Makefile
├── README.md
├── dsip
    ├── _config.yml
    ├── _toc.yml
    ├── assets
    │   └── logo.png
    ├── assignments
    │   ├── D1_Python.ipynb
    │   ├── D2_Pandas.ipynb
    │   ├── D3_DataExploration.ipynb
    │   ├── D4_DataPrivacy.ipynb
    │   ├── D5_DataAnalysis.ipynb
    │   └── D6_NaturalLanguageProcessing.ipynb
    ├── docs
    │   └── index.md
    ├── projects
    │   ├── ProjectProposal.ipynb
    │   ├── ProjectReport.ipynb
    │   ├── project_checklist.md
    │   └── project_guidelines.md
    └── tutorials
    │   ├── .gitignore
    │   ├── 00-Introduction.ipynb
    │   ├── 01-Python.ipynb
    │   ├── 02-JupyterNotebooks.ipynb
    │   ├── 03-DataAnalysis.ipynb
    │   ├── 04-ScientificPython.ipynb
    │   ├── 05-DataGathering.ipynb
    │   ├── 06-DataWrangling.ipynb
    │   ├── 07-DataCleaning.ipynb
    │   ├── 08-DataPrivacy&Anonymization.ipynb
    │   ├── 09-DataVisualization.ipynb
    │   ├── 10-Distributions.ipynb
    │   ├── 11-TestingDistributions.ipynb
    │   ├── 12-StatisticalComparisons.ipynb
    │   ├── 13-OrdinaryLeastSquares.ipynb
    │   ├── 14-LinearModels.ipynb
    │   ├── 15-Clustering.ipynb
    │   ├── 16-DimensionalityReduction.ipynb
    │   ├── 17-Classification.ipynb
    │   ├── 18-NaturalLanguageProcessing.ipynb
    │   ├── A1-PythonPackages.ipynb
    │   ├── A2-Git.ipynb
    │   ├── files
    │       ├── book10k.txt
    │       ├── data.csv
    │       ├── data.json
    │       ├── data.txt
    │       ├── messy_data.csv
    │       └── messy_data.json
    │   └── img
    │       ├── anaconda.png
    │       ├── git.png
    │       ├── github.png
    │       ├── jupyter.png
    │       ├── matplotlib.png
    │       ├── numpy.png
    │       ├── pandas.png
    │       ├── python.png
    │       ├── scipy.png
    │       ├── sklearn.png
    │       └── sourcetree.png
├── instructions.md
└── paper
    ├── paper.bib
    └── paper.md


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Ignore local builds of the textbook
 2 | dsip/_build/*
 3 | 
 4 | # Ignore any notebook checkpoint files
 5 | *.ipynb_checkpoints*
 6 | 
 7 | # Ignore materials that are copied in
 8 | #dsip/tutorials/*
 9 | #dsip/assignments/*
10 | #dsip/projects/*
11 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | message: >-
 3 |   If you use this software, please cite it using the metadata from this file.
 4 | type: software
 5 | title: 'DataScienceInPractice'
 6 | authors:
 7 | - given-names: 'Thomas'
 8 |   family-names: 'Donoghue'
 9 |   orcid: 'https://orcid.org/0000-0001-5911-0472'
10 | - given-names: 'Bradley'
11 |   family-names: 'Voytek'
12 |   orcid: 'https://orcid.org/0000-0003-1640-2525'
13 | - given-names: 'Shannon'
14 |   family-names: 'Ellis'
15 |   orcid: 'https://orcid.org/0000-0002-9231-0481'
16 | repository-code: 'https://github.com/DataScienceInPractice/Site'
17 | url: 'https://datascienceinpractice.github.io/'
18 | license: CC-BY-4.0
19 | preferred-citation:
20 |   type: article
21 |   authors:
22 |   - given-names: 'Thomas'
23 |     family-names: 'Donoghue'
24 |     orcid: 'https://orcid.org/0000-0001-5911-0472'
25 |   - given-names: 'Bradley'
26 |     family-names: 'Voytek'
27 |     orcid: 'https://orcid.org/0000-0003-1640-2525'
28 |   - given-names: 'Shannon'
29 |     family-names: 'Ellis'
30 |     orcid: 'https://orcid.org/0000-0002-9231-0481'
31 |   doi: '10.21105/jose.00121'
32 |   journal: 'The Journal of Open Source Education'
33 |   title: 'Course Materials for Data Science in Practice'
34 |   issue: 51
35 |   volume: 5
36 |   year: 2022
37 |   start: 1
38 |   end: 3
39 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
  1 | Attribution 4.0 International
  2 | 
  3 | =======================================================================
  4 | 
  5 | Creative Commons Corporation ("Creative Commons") is not a law firm and
  6 | does not provide legal services or legal advice. Distribution of
  7 | Creative Commons public licenses does not create a lawyer-client or
  8 | other relationship. Creative Commons makes its licenses and related
  9 | information available on an "as-is" basis. Creative Commons gives no
 10 | warranties regarding its licenses, any material licensed under their
 11 | terms and conditions, or any related information. Creative Commons
 12 | disclaims all liability for damages resulting from their use to the
 13 | fullest extent possible.
 14 | 
 15 | Using Creative Commons Public Licenses
 16 | 
 17 | Creative Commons public licenses provide a standard set of terms and
 18 | conditions that creators and other rights holders may use to share
 19 | original works of authorship and other material subject to copyright
 20 | and certain other rights specified in the public license below. The
 21 | following considerations are for informational purposes only, are not
 22 | exhaustive, and do not form part of our licenses.
 23 | 
 24 |      Considerations for licensors: Our public licenses are
 25 |      intended for use by those authorized to give the public
 26 |      permission to use material in ways otherwise restricted by
 27 |      copyright and certain other rights. Our licenses are
 28 |      irrevocable. Licensors should read and understand the terms
 29 |      and conditions of the license they choose before applying it.
 30 |      Licensors should also secure all rights necessary before
 31 |      applying our licenses so that the public can reuse the
 32 |      material as expected. Licensors should clearly mark any
 33 |      material not subject to the license. This includes other CC-
 34 |      licensed material, or material used under an exception or
 35 |      limitation to copyright. More considerations for licensors:
 36 |     wiki.creativecommons.org/Considerations_for_licensors
 37 | 
 38 |      Considerations for the public: By using one of our public
 39 |      licenses, a licensor grants the public permission to use the
 40 |      licensed material under specified terms and conditions. If
 41 |      the licensor's permission is not necessary for any reason--for
 42 |      example, because of any applicable exception or limitation to
 43 |      copyright--then that use is not regulated by the license. Our
 44 |      licenses grant only permissions under copyright and certain
 45 |      other rights that a licensor has authority to grant. Use of
 46 |      the licensed material may still be restricted for other
 47 |      reasons, including because others have copyright or other
 48 |      rights in the material. A licensor may make special requests,
 49 |      such as asking that all changes be marked or described.
 50 |      Although not required by our licenses, you are encouraged to
 51 |      respect those requests where reasonable. More considerations
 52 |      for the public:
 53 |     wiki.creativecommons.org/Considerations_for_licensees
 54 | 
 55 | =======================================================================
 56 | 
 57 | Creative Commons Attribution 4.0 International Public License
 58 | 
 59 | By exercising the Licensed Rights (defined below), You accept and agree
 60 | to be bound by the terms and conditions of this Creative Commons
 61 | Attribution 4.0 International Public License ("Public License"). To the
 62 | extent this Public License may be interpreted as a contract, You are
 63 | granted the Licensed Rights in consideration of Your acceptance of
 64 | these terms and conditions, and the Licensor grants You such rights in
 65 | consideration of benefits the Licensor receives from making the
 66 | Licensed Material available under these terms and conditions.
 67 | 
 68 | 
 69 | Section 1 -- Definitions.
 70 | 
 71 |   a. Adapted Material means material subject to Copyright and Similar
 72 |      Rights that is derived from or based upon the Licensed Material
 73 |      and in which the Licensed Material is translated, altered,
 74 |      arranged, transformed, or otherwise modified in a manner requiring
 75 |      permission under the Copyright and Similar Rights held by the
 76 |      Licensor. For purposes of this Public License, where the Licensed
 77 |      Material is a musical work, performance, or sound recording,
 78 |      Adapted Material is always produced where the Licensed Material is
 79 |      synched in timed relation with a moving image.
 80 | 
 81 |   b. Adapter's License means the license You apply to Your Copyright
 82 |      and Similar Rights in Your contributions to Adapted Material in
 83 |      accordance with the terms and conditions of this Public License.
 84 | 
 85 |   c. Copyright and Similar Rights means copyright and/or similar rights
 86 |      closely related to copyright including, without limitation,
 87 |      performance, broadcast, sound recording, and Sui Generis Database
 88 |      Rights, without regard to how the rights are labeled or
 89 |      categorized. For purposes of this Public License, the rights
 90 |      specified in Section 2(b)(1)-(2) are not Copyright and Similar
 91 |      Rights.
 92 | 
 93 |   d. Effective Technological Measures means those measures that, in the
 94 |      absence of proper authority, may not be circumvented under laws
 95 |      fulfilling obligations under Article 11 of the WIPO Copyright
 96 |      Treaty adopted on December 20, 1996, and/or similar international
 97 |      agreements.
 98 | 
 99 |   e. Exceptions and Limitations means fair use, fair dealing, and/or
100 |      any other exception or limitation to Copyright and Similar Rights
101 |      that applies to Your use of the Licensed Material.
102 | 
103 |   f. Licensed Material means the artistic or literary work, database,
104 |      or other material to which the Licensor applied this Public
105 |      License.
106 | 
107 |   g. Licensed Rights means the rights granted to You subject to the
108 |      terms and conditions of this Public License, which are limited to
109 |      all Copyright and Similar Rights that apply to Your use of the
110 |      Licensed Material and that the Licensor has authority to license.
111 | 
112 |   h. Licensor means the individual(s) or entity(ies) granting rights
113 |      under this Public License.
114 | 
115 |   i. Share means to provide material to the public by any means or
116 |      process that requires permission under the Licensed Rights, such
117 |      as reproduction, public display, public performance, distribution,
118 |      dissemination, communication, or importation, and to make material
119 |      available to the public including in ways that members of the
120 |      public may access the material from a place and at a time
121 |      individually chosen by them.
122 | 
123 |   j. Sui Generis Database Rights means rights other than copyright
124 |      resulting from Directive 96/9/EC of the European Parliament and of
125 |      the Council of 11 March 1996 on the legal protection of databases,
126 |      as amended and/or succeeded, as well as other essentially
127 |      equivalent rights anywhere in the world.
128 | 
129 |   k. You means the individual or entity exercising the Licensed Rights
130 |      under this Public License. Your has a corresponding meaning.
131 | 
132 | 
133 | Section 2 -- Scope.
134 | 
135 |   a. License grant.
136 | 
137 |        1. Subject to the terms and conditions of this Public License,
138 |           the Licensor hereby grants You a worldwide, royalty-free,
139 |           non-sublicensable, non-exclusive, irrevocable license to
140 |           exercise the Licensed Rights in the Licensed Material to:
141 | 
142 |             a. reproduce and Share the Licensed Material, in whole or
143 |                in part; and
144 | 
145 |             b. produce, reproduce, and Share Adapted Material.
146 | 
147 |        2. Exceptions and Limitations. For the avoidance of doubt, where
148 |           Exceptions and Limitations apply to Your use, this Public
149 |           License does not apply, and You do not need to comply with
150 |           its terms and conditions.
151 | 
152 |        3. Term. The term of this Public License is specified in Section
153 |           6(a).
154 | 
155 |        4. Media and formats; technical modifications allowed. The
156 |           Licensor authorizes You to exercise the Licensed Rights in
157 |           all media and formats whether now known or hereafter created,
158 |           and to make technical modifications necessary to do so. The
159 |           Licensor waives and/or agrees not to assert any right or
160 |           authority to forbid You from making technical modifications
161 |           necessary to exercise the Licensed Rights, including
162 |           technical modifications necessary to circumvent Effective
163 |           Technological Measures. For purposes of this Public License,
164 |           simply making modifications authorized by this Section 2(a)
165 |           (4) never produces Adapted Material.
166 | 
167 |        5. Downstream recipients.
168 | 
169 |             a. Offer from the Licensor -- Licensed Material. Every
170 |                recipient of the Licensed Material automatically
171 |                receives an offer from the Licensor to exercise the
172 |                Licensed Rights under the terms and conditions of this
173 |                Public License.
174 | 
175 |             b. No downstream restrictions. You may not offer or impose
176 |                any additional or different terms or conditions on, or
177 |                apply any Effective Technological Measures to, the
178 |                Licensed Material if doing so restricts exercise of the
179 |                Licensed Rights by any recipient of the Licensed
180 |                Material.
181 | 
182 |        6. No endorsement. Nothing in this Public License constitutes or
183 |           may be construed as permission to assert or imply that You
184 |           are, or that Your use of the Licensed Material is, connected
185 |           with, or sponsored, endorsed, or granted official status by,
186 |           the Licensor or others designated to receive attribution as
187 |           provided in Section 3(a)(1)(A)(i).
188 | 
189 |   b. Other rights.
190 | 
191 |        1. Moral rights, such as the right of integrity, are not
192 |           licensed under this Public License, nor are publicity,
193 |           privacy, and/or other similar personality rights; however, to
194 |           the extent possible, the Licensor waives and/or agrees not to
195 |           assert any such rights held by the Licensor to the limited
196 |           extent necessary to allow You to exercise the Licensed
197 |           Rights, but not otherwise.
198 | 
199 |        2. Patent and trademark rights are not licensed under this
200 |           Public License.
201 | 
202 |        3. To the extent possible, the Licensor waives any right to
203 |           collect royalties from You for the exercise of the Licensed
204 |           Rights, whether directly or through a collecting society
205 |           under any voluntary or waivable statutory or compulsory
206 |           licensing scheme. In all other cases the Licensor expressly
207 |           reserves any right to collect such royalties.
208 | 
209 | 
210 | Section 3 -- License Conditions.
211 | 
212 | Your exercise of the Licensed Rights is expressly made subject to the
213 | following conditions.
214 | 
215 |   a. Attribution.
216 | 
217 |        1. If You Share the Licensed Material (including in modified
218 |           form), You must:
219 | 
220 |             a. retain the following if it is supplied by the Licensor
221 |                with the Licensed Material:
222 | 
223 |                  i. identification of the creator(s) of the Licensed
224 |                     Material and any others designated to receive
225 |                     attribution, in any reasonable manner requested by
226 |                     the Licensor (including by pseudonym if
227 |                     designated);
228 | 
229 |                 ii. a copyright notice;
230 | 
231 |                iii. a notice that refers to this Public License;
232 | 
233 |                 iv. a notice that refers to the disclaimer of
234 |                     warranties;
235 | 
236 |                  v. a URI or hyperlink to the Licensed Material to the
237 |                     extent reasonably practicable;
238 | 
239 |             b. indicate if You modified the Licensed Material and
240 |                retain an indication of any previous modifications; and
241 | 
242 |             c. indicate the Licensed Material is licensed under this
243 |                Public License, and include the text of, or the URI or
244 |                hyperlink to, this Public License.
245 | 
246 |        2. You may satisfy the conditions in Section 3(a)(1) in any
247 |           reasonable manner based on the medium, means, and context in
248 |           which You Share the Licensed Material. For example, it may be
249 |           reasonable to satisfy the conditions by providing a URI or
250 |           hyperlink to a resource that includes the required
251 |           information.
252 | 
253 |        3. If requested by the Licensor, You must remove any of the
254 |           information required by Section 3(a)(1)(A) to the extent
255 |           reasonably practicable.
256 | 
257 |        4. If You Share Adapted Material You produce, the Adapter's
258 |           License You apply must not prevent recipients of the Adapted
259 |           Material from complying with this Public License.
260 | 
261 | 
262 | Section 4 -- Sui Generis Database Rights.
263 | 
264 | Where the Licensed Rights include Sui Generis Database Rights that
265 | apply to Your use of the Licensed Material:
266 | 
267 |   a. for the avoidance of doubt, Section 2(a)(1) grants You the right
268 |      to extract, reuse, reproduce, and Share all or a substantial
269 |      portion of the contents of the database;
270 | 
271 |   b. if You include all or a substantial portion of the database
272 |      contents in a database in which You have Sui Generis Database
273 |      Rights, then the database in which You have Sui Generis Database
274 |      Rights (but not its individual contents) is Adapted Material; and
275 | 
276 |   c. You must comply with the conditions in Section 3(a) if You Share
277 |      all or a substantial portion of the contents of the database.
278 | 
279 | For the avoidance of doubt, this Section 4 supplements and does not
280 | replace Your obligations under this Public License where the Licensed
281 | Rights include other Copyright and Similar Rights.
282 | 
283 | 
284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability.
285 | 
286 |   a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE
287 |      EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS
288 |      AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF
289 |      ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS,
290 |      IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION,
291 |      WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR
292 |      PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS,
293 |      ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT
294 |      KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT
295 |      ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU.
296 | 
297 |   b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE
298 |      TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION,
299 |      NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT,
300 |      INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES,
301 |      COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR
302 |      USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN
303 |      ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR
304 |      DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR
305 |      IN PART, THIS LIMITATION MAY NOT APPLY TO YOU.
306 | 
307 |   c. The disclaimer of warranties and limitation of liability provided
308 |      above shall be interpreted in a manner that, to the extent
309 |      possible, most closely approximates an absolute disclaimer and
310 |      waiver of all liability.
311 | 
312 | 
313 | Section 6 -- Term and Termination.
314 | 
315 |   a. This Public License applies for the term of the Copyright and
316 |      Similar Rights licensed here. However, if You fail to comply with
317 |      this Public License, then Your rights under this Public License
318 |      terminate automatically.
319 | 
320 |   b. Where Your right to use the Licensed Material has terminated under
321 |      Section 6(a), it reinstates:
322 | 
323 |        1. automatically as of the date the violation is cured, provided
324 |           it is cured within 30 days of Your discovery of the
325 |           violation; or
326 | 
327 |        2. upon express reinstatement by the Licensor.
328 | 
329 |      For the avoidance of doubt, this Section 6(b) does not affect any
330 |      right the Licensor may have to seek remedies for Your violations
331 |      of this Public License.
332 | 
333 |   c. For the avoidance of doubt, the Licensor may also offer the
334 |      Licensed Material under separate terms or conditions or stop
335 |      distributing the Licensed Material at any time; however, doing so
336 |      will not terminate this Public License.
337 | 
338 |   d. Sections 1, 5, 6, 7, and 8 survive termination of this Public
339 |      License.
340 | 
341 | 
342 | Section 7 -- Other Terms and Conditions.
343 | 
344 |   a. The Licensor shall not be bound by any additional or different
345 |      terms or conditions communicated by You unless expressly agreed.
346 | 
347 |   b. Any arrangements, understandings, or agreements regarding the
348 |      Licensed Material not stated herein are separate from and
349 |      independent of the terms and conditions of this Public License.
350 | 
351 | 
352 | Section 8 -- Interpretation.
353 | 
354 |   a. For the avoidance of doubt, this Public License does not, and
355 |      shall not be interpreted to, reduce, limit, restrict, or impose
356 |      conditions on any use of the Licensed Material that could lawfully
357 |      be made without permission under this Public License.
358 | 
359 |   b. To the extent possible, if any provision of this Public License is
360 |      deemed unenforceable, it shall be automatically reformed to the
361 |      minimum extent necessary to make it enforceable. If the provision
362 |      cannot be reformed, it shall be severed from this Public License
363 |      without affecting the enforceability of the remaining terms and
364 |      conditions.
365 | 
366 |   c. No term or condition of this Public License will be waived and no
367 |      failure to comply consented to unless expressly agreed to by the
368 |      Licensor.
369 | 
370 |   d. Nothing in this Public License constitutes or may be interpreted
371 |      as a limitation upon, or waiver of, any privileges and immunities
372 |      that apply to the Licensor or You, including from the legal
373 |      processes of any jurisdiction or authority.
374 | 
375 | 
376 | =======================================================================
377 | 
378 | Creative Commons is not a party to its public
379 | licenses. Notwithstanding, Creative Commons may elect to apply one of
380 | its public licenses to material it publishes and in those instances
381 | will be considered the “Licensor.” The text of the Creative Commons
382 | public licenses is dedicated to the public domain under the CC0 Public
383 | Domain Dedication. Except for the limited purpose of indicating that
384 | material is shared under a Creative Commons public license or as
385 | otherwise permitted by the Creative Commons policies published at
386 | creativecommons.org/policies, Creative Commons does not authorize the
387 | use of the trademark "Creative Commons" or any other trademark or logo
388 | of Creative Commons without its prior written consent including,
389 | without limitation, in connection with any unauthorized modifications
390 | to any of its public licenses or any other arrangements,
391 | understandings, or agreements concerning use of licensed material. For
392 | the avoidance of doubt, this paragraph does not form part of the
393 | public licenses.
394 | 
395 | Creative Commons may be contacted at creativecommons.org.
396 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for the Data Science in Practice site
  2 | 
  3 | 
  4 | ##########################################################################
  5 | ## REQUIREMENTS
  6 | #
  7 | # This file requires `jupyter-book` for building the site.
  8 | #
  9 | 
 10 | ##########################################################################
 11 | ## VARIABLES
 12 | 
 13 | BOOK     	    = dsip
 14 | CONTENT-ORG   	= https://github.com/COGS108
 15 | BOOK-ORG  	    = https://github.com/datascienceinpractice
 16 | SITE-LOC        = datascienceinpractice.github.io
 17 | 
 18 | 
 19 | ##########################################################################
 20 | ## CLONING MATERIALS
 21 | 
 22 | # Clone all materials.
 23 | # The sub-targets are prerequisites: written as recipe lines they would be
 24 | # passed to the shell as commands, which fails ("command not found").
 25 | .PHONY: clone
 26 | clone: clone-tutorials clone-assignments clone-projects
 27 | 
 28 | 
 29 | # Clone the tutorials: shallow-clone the Tutorials repo into the book's tutorials folder, then drop its README and git metadata
 30 | clone-tutorials:
 31 | 
 32 | 	# Copy tutorial materials
 33 | 	@git clone --depth 1 $(CONTENT-ORG)/Tutorials $(BOOK)/tutorials
 34 | 	@rm $(BOOK)/tutorials/README.md
 35 | 	@rm -rf $(BOOK)/tutorials/.git
 36 | 
 37 | # Clone the assignments (the target folder is created first, since the clean-up target deletes it)
 38 | clone-assignments:
 39 | 	@mkdir -p $(BOOK)/assignments
 40 | 	# Clone assignments demo repo, and copy out files (to re-org & rename)
 41 | 	@git clone --depth 1 $(CONTENT-ORG)/Assign_Demo $(BOOK)/temp
 42 | 	@mv $(BOOK)/temp/release/A1/A1_git_python.ipynb $(BOOK)/assignments/D1_Python.ipynb
 43 | 	@mv $(BOOK)/temp/release/A2/A2_Pandas.ipynb $(BOOK)/assignments/D2_Pandas.ipynb
 44 | 	@mv $(BOOK)/temp/release/A3/A3_DataExploration.ipynb $(BOOK)/assignments/D3_DataExploration.ipynb
 45 | 	@mv $(BOOK)/temp/release/A4/A4_DataPrivacy.ipynb $(BOOK)/assignments/D4_DataPrivacy.ipynb
 46 | 	@mv $(BOOK)/temp/release/A5/A5_DataAnalysis.ipynb $(BOOK)/assignments/D5_DataAnalysis.ipynb
 47 | 	@mv $(BOOK)/temp/release/A6/A6_NaturalLanguageProcessing.ipynb $(BOOK)/assignments/D6_NaturalLanguageProcessing.ipynb
 48 | 	@rm -rf $(BOOK)/temp
 49 | 
 50 | # Clone the project information (pulls selected files out of two temporary clones)
 51 | clone-projects:
 52 | 
 53 | 	# Copy over the project repositories into temporary repositories
 54 | 	@git clone --depth 1 $(CONTENT-ORG)/Projects $(BOOK)/temp1
 55 | 	@git clone --depth 1 $(CONTENT-ORG)/group_template $(BOOK)/temp2
 56 | 
 57 | 	# Copy over the files we want (-p: do not abort if the folder already exists)
 58 | 	@mkdir -p $(BOOK)/projects
 59 | 	@mv $(BOOK)/temp1/README.md $(BOOK)/projects/project_checklist.md
 60 | 	@mv $(BOOK)/temp1/FinalProject_Guidelines.md $(BOOK)/projects/project_guidelines.md
 61 | 	@mv $(BOOK)/temp2/ProjectProposal_groupXXX.ipynb $(BOOK)/projects/ProjectProposal.ipynb
 62 | 	@mv $(BOOK)/temp2/FinalProject_groupXXX.ipynb $(BOOK)/projects/ProjectReport.ipynb
 63 | 
 64 | 	# Clear out the temporary folders
 65 | 	@rm -rf $(BOOK)/temp1
 66 | 	@rm -rf $(BOOK)/temp2
 67 | 
 68 | 
 69 | ##########################################################################
 70 | ## CLEAN UPS
 71 | 
 72 | # Clear out the copied repositories (all cloned materials).
 73 | # The sub-targets are prerequisites: written as recipe lines they would be
 74 | # passed to the shell as commands, which fails ("command not found").
 75 | clear: clear-tutorials clear-assignments clear-projects
 76 | 
 77 | clear-tutorials:
 78 | 	rm -rf $(BOOK)/tutorials
 79 | 
 80 | # `clear-assignemnts` is the original (misspelled) name, kept as an alias
 81 | clear-assignments clear-assignemnts:
 82 | 	rm -rf $(BOOK)/assignments
 83 | 
 84 | clear-projects:
 85 | 	rm -rf $(BOOK)/projects
 86 | 
 87 | .PHONY: clear clear-tutorials clear-assignments clear-assignemnts clear-projects
 88 | 
 89 | # Clean out the built textbook (uses BOOK; the previously referenced BOOK_NAME is never defined)
 90 | clean:
 91 | 	jupyter-book clean $(BOOK)/
 92 | 
 93 | 
 94 | ##########################################################################
 95 | ## BUILDING SITE
 96 | 
 97 | # Build the textbook: runs `jupyter-book build` on the book source folder (the BOOK variable)
 98 | build:
 99 | 	jupyter-book build $(BOOK)/
100 | 
101 | 
102 | ##########################################################################
103 | ## DEPLOYING SITE
104 | 
105 | # Deploy the website: build the book, then push its HTML to the host repository
106 | deploy:
107 | # (sub-make goes through the MAKE variable; `html/.` copies the folder contents on both GNU and BSD cp)
108 | 	# Create the textbook
109 | 	$(MAKE) build
110 | 
111 | 	# Clone the website host repository
112 | 	rm -rf $(BOOK)/_build/deploy/
113 | 	git clone --depth 1 $(BOOK-ORG)/$(SITE-LOC) $(BOOK)/_build/deploy/
114 | 
115 | 	# Add .nojekyll file to tell Github pages to bypass Jekyll processing
116 | 	touch $(BOOK)/_build/deploy/.nojekyll
117 | 
118 | 	# Copy site source into the host repo folder, then push to Github to deploy
119 | 	cd $(BOOK)/_build/ && \
120 | 	cp -r html/. deploy && \
121 | 	cd deploy && \
122 | 	git add * && \
123 | 	git add .nojekyll && \
124 | 	git commit -a -m 'deploy site' && \
125 | 	git push
126 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Data Science in Practice Site
 2 | [![Project Status: Active](https://www.repostatus.org/badges/latest/active.svg)](https://www.repostatus.org/#active)
 3 | [![Website](https://img.shields.io/badge/site-datascienceinpractice.github.io-informational.svg)](https://datascienceinpractice.github.io)
 4 | [![License: CC-BY 4.0](https://img.shields.io/badge/License-CC--BY%204.0-lightgrey.svg)](https://creativecommons.org/licenses/by/4.0/)
 5 | [![DOI](https://jose.theoj.org/papers/10.21105/jose.00121/status.svg)](https://doi.org/10.21105/jose.00121)
 6 | 
 7 | Data science in practice is a collection of materials for learning introductory data science.
 8 | 
 9 | ## Overview
10 | 
11 | This repository is the source repository for creating the [Data Science in Practice](https://datascienceinpractice.github.io/) website.
12 | 
13 | Specifically, the website contains:
14 | - `tutorials`, which introduce key topics for doing data science
15 | - `assignments`, which are problem sets that can be worked through
16 | - `projects`, which describe how to pursue independent analysis projects
17 | 
18 | These materials serve as a public version of materials from the [COGS108](https://github.com/COGS108) class.
19 | 
20 | This repository has the tools for building the website. To do so, it copies materials from the 
21 | COGS 108 organization, and then updates and organizes them for the public website. See the 
22 | [instructions](https://github.com/DataScienceInPractice/Site/blob/main/instructions.md) 
23 | page for notes on how this works.
24 | 
25 | The built version of the book is then posted to the
26 | [website repository](https://github.com/DataScienceInPractice/datascienceinpractice.github.io)
27 | for hosting.
28 | 
29 | ## Dependencies
30 | 
31 | This project uses the Python programming language, and requires Python >= 3.6. 
32 | 
33 | Materials are written and available as [Jupyter Notebooks](https://jupyter.org/). 
34 | 
35 | Tutorials & assignments require packages from the scientific Python ecosystem. These dependencies can all be installed using the 
36 | [Anaconda distribution](https://www.anaconda.com/products/individual). Details and instructions on the dependencies
37 | and how to get them are available in the materials.
38 | 
39 | The website is created using [JupyterBook](https://github.com/executablebooks/jupyter-book).
40 | 
41 | ## Organization
42 | 
43 | This repository contains the following sections:
44 | 
45 | - `dsip/` contains the content of the website, including sub-sections:
46 |     - `docs/` contains the source for written sections of the site
47 |     - `tutorials/` contains tutorial notebooks which introduce key topics for doing data science
48 |     - `assignments/` contains assignment notebooks with problem sets that can be worked through
49 |     - `projects/` contains information on how to pursue independent analysis projects
50 | - `paper/` contains a copy of the paper that describes these materials
51 | 
52 | ## Reference
53 | 
54 | This project is described in the following paper:
55 | 
56 |     Donoghue T, Voytek B, & Ellis S (2022). Course Materials for Data Science in 
57 |     Practice. Journal of Open Source Education, 5(51), 121. DOI: 10.21105/jose.00121
58 | 
59 | Direct Link: https://doi.org/10.21105/jose.00121
60 | 
61 | ## License
62 | 
63 | These materials are made freely available, and are licensed under a [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) license.
64 | 


--------------------------------------------------------------------------------
/dsip/_config.yml:
--------------------------------------------------------------------------------
 1 | #######################################################################################
 2 | # Book settings
 3 | title     : Data Science in Practice
 4 | author    : Thomas Donoghue, Bradley Voytek, & Shannon Ellis
 5 | copyright : "2020-"
 6 | logo      : assets/logo.png
 7 | 
 8 | #######################################################################################
 9 | # Execution settings
10 | execute:
11 |   execute_notebooks           : cache
12 | 
13 | #######################################################################################
14 | # HTML-specific settings
15 | html:
16 |   home_page_in_navbar         : false
17 |   favicon                     : assets/logo.png
18 |   use_repository_button       : true
19 |   use_issues_button           : true
20 |   use_edit_page_button        : false
21 | 
22 | #######################################################################################
23 | # Interact link settings
24 | notebook_interface            : "notebook"
25 | 
26 | #######################################################################################
27 | # Launch button settings
28 | repository:
29 |   url                         : https://github.com/datascienceinpractice/Site
30 |   path_to_book                : "dsip"
31 | 
32 | binder:
33 |   binderhub_url               : "https://mybinder.org"
34 |   text                        : "Launch binder"
35 | 


--------------------------------------------------------------------------------
/dsip/_toc.yml:
--------------------------------------------------------------------------------
 1 | root: docs/index
 2 | format: jb-book
 3 | 
 4 | parts:
 5 | - caption: Tutorials
 6 |   chapters:
 7 |   - file: tutorials/00-Introduction
 8 |   - file: tutorials/01-Python
 9 |   - file: tutorials/02-JupyterNotebooks
10 |   - file: tutorials/03-DataAnalysis
11 |   - file: tutorials/04-ScientificPython
12 |   - file: tutorials/05-DataGathering
13 |   - file: tutorials/06-DataWrangling
14 |   - file: tutorials/07-DataCleaning
15 |   - file: tutorials/08-DataPrivacy&Anonymization
16 |   - file: tutorials/09-DataVisualization
17 |   - file: tutorials/10-Distributions
18 |   - file: tutorials/11-TestingDistributions
19 |   - file: tutorials/12-StatisticalComparisons
20 |   - file: tutorials/13-OrdinaryLeastSquares
21 |   - file: tutorials/14-LinearModels
22 |   - file: tutorials/15-Clustering
23 |   - file: tutorials/16-DimensionalityReduction
24 |   - file: tutorials/17-Classification
25 |   - file: tutorials/18-NaturalLanguageProcessing
26 |   - file: tutorials/A1-PythonPackages
27 |   - file: tutorials/A2-Git
28 | 
29 | - caption: Assignments
30 |   chapters:
31 |   - file: assignments/D1_Python
32 |   - file: assignments/D2_Pandas
33 |   - file: assignments/D3_DataExploration
34 |   - file: assignments/D4_DataPrivacy
35 |   - file: assignments/D5_DataAnalysis
36 |   - file: assignments/D6_NaturalLanguageProcessing
37 | 
38 | - caption: Project
39 |   chapters:
40 |   - file: projects/project_guidelines
41 |   - file: projects/ProjectProposal
42 |   - file: projects/ProjectReport
43 |   - file: projects/project_checklist
44 | 


--------------------------------------------------------------------------------
/dsip/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataScienceInPractice/Site/3388685495c1e22e99727a50208704df0fbd5c81/dsip/assets/logo.png


--------------------------------------------------------------------------------
/dsip/docs/index.md:
--------------------------------------------------------------------------------
 1 | # Data Science in Practice
 2 | 
 3 | Data Science in Practice is an open set of materials for learning introductory data science.
 4 | 
 5 | This website is a public version of the Data Science in Practice course, taught as
 6 | [COGS 108](https://github.com/COGS108/)
 7 | at UC San Diego.
 8 | 
 9 | _If you are in the COGS108 class at UC San Diego, this website is **not** the same as the materials and coursework for the class._
10 | 
11 | ## Overview
12 | 
13 | The goal of Data Science in Practice is to introduce the practical elements of _doing_ data science.
14 | 
15 | Data science is an emerging and multidisciplinary field, organized around the practice of analyzing data, and all the questions, practices and problems that entails.
16 | 
17 | These materials focus on the practical elements of finding, analyzing, interpreting and contextualizing data analysis, in order to practice answering questions with data.
18 | 
19 | ## Requirements
20 | 
21 | These materials use the Python programming language, and presume knowledge of standard library Python.
22 | 
23 | The tutorials introduce how to get Python installed and which dependencies are needed.
24 | 
25 | ## Content
26 | 
27 | Available materials include:
28 | 
29 | - **Tutorials** which introduce key topics for doing data science
30 |     - These can be used to explore and learn about key topics
31 | - **Assignments** which are problem sets that can be worked through
32 |     - These can be used to practice key skills and ideas with code
33 | - **Projects** which describes how to pursue independent analysis projects
34 |     - This can be used as a guide for how to continue with real data science projects
35 | 
36 | All the materials are listed in the table of contents in the left sidebar.
37 | 
38 | Note that these materials are not created as fully detailed descriptions or formal descriptions of the topics they introduce.
39 | 
40 | Rather, they seek to _introduce_ key topics, _demonstrate_ them in code, and allow for interaction, exploration and _practice_.
41 | 
42 | Put another way, these materials are designed to be more of a map than an encyclopedia.
43 | 
44 | For further information on topics we introduce, these materials link to external resources.
45 | 
46 | ## How to Use These Materials
47 | 
48 | These materials are created as [Jupyter Notebooks](https://jupyter.org), and are intended to be executed and explored in a hands-on manner.
49 | 
50 | There is a download link at the top left of the page, that can be used to download each page as a notebook. This allows you to use the notebook locally, executing code, and answering questions.
51 | 
52 | ## Issue Tracking
53 | 
54 | If you find any bugs or issues, or have any suggestions for these materials, please open an
55 | [issue](https://github.com/DataScienceInPractice/Site/issues).
56 | 
57 | ## Source Materials
58 | 
59 | This set of materials is an openly available version of tutorials and coursework developed for and 
60 | used in a university undergraduate course,
61 | [COGS 108](https://github.com/COGS108/),
62 | which is taught at UC San Diego.
63 | 
64 | These materials may still contain some references to the university course or to grading, which can be ignored.
65 | 
66 | You can find more information about the university course in the
67 | [overview repository](https://github.com/COGS108/Overview/).
68 | 
69 | The materials for this open version of the course are managed through this
70 | [Github organization](https://github.com/DataScienceInPractice/). 
71 | 
72 | The source repository for this website is available [here](https://github.com/DataScienceInPractice/Site).
73 | 
74 | ## Reference
75 | 
76 | This project is described in the following paper:
77 | 
78 |     Donoghue T, Voytek B, & Ellis S (2022). Course Materials for Data Science in 
79 |     Practice. Journal of Open Source Education, 5(51), 121. DOI: 10.21105/jose.00121
80 | 
81 | Direct Link: https://doi.org/10.21105/jose.00121
82 | 
83 | ## License
84 | 
85 | The materials on this website are openly available under a
86 | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) license.
87 | 
88 | ## Acknowledgments
89 | 
90 | The original university course these materials are adapted from was originally created by
91 | [Bradley Voytek](https://voyteklab.com/), and is currently primarily taught by
92 | [Shannon Ellis](http://www.shanellis.com/).
93 | This website and many of the materials were developed by
94 | [Tom Donoghue](https://tomdonoghue.github.io/), with additional contributions from the
95 | [course staff](https://github.com/COGS108/Overview/blob/master/CONTRIBUTORS.md).
96 | 


--------------------------------------------------------------------------------
/dsip/projects/ProjectProposal.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Project Proposal"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "# Research Question"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "*Fill in your research question here*"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "## Background and Prior Work"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "markdown",
 33 |    "metadata": {},
 34 |    "source": [
 35 |     "Fill in your background and prior work here. **Use inline citation through Markdown footnotes to specify which references support which statements** \n",
 36 |     "\n",
 37 |     "For example: After government genocide in the 20th century, real birds were replaced with surveillance drones designed to look just like birds[^lorenz]. \n",
 38 |     "Use a minimum of 2 or 3 citations, but we prefer more[^admonish]. You need enough to fully explain and back up important facts. \n",
 39 |     "\n",
 40 |     "[^lorenz]: Lorenz, T. (9 Dec 2021) Birds Aren’t Real, or Are They? Inside a Gen Z Conspiracy Theory. *The New York Times*. https://www.nytimes.com/2021/12/09/technology/birds-arent-real-gen-z-misinformation.html \n",
 41 |     "[^admonish]: Also refs should be important to the background, not some randomly chosen vaguely related stuff. Include a web link if possible in refs as above.\n"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "markdown",
 46 |    "metadata": {},
 47 |    "source": [
 48 |     "# Hypothesis\n"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "markdown",
 53 |    "metadata": {},
 54 |    "source": [
 55 |     "*State and defend your hypotheses here.*"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "markdown",
 60 |    "metadata": {},
 61 |    "source": [
 62 |     "# Data"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "markdown",
 67 |    "metadata": {},
 68 |    "source": [
 69 |     "*Describe the ideal dataset you would want to answer this question. (This should include: What variables? How many observations? Who/what/how would these data be collected? How would these data be stored/organized?)*"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "markdown",
 74 |    "metadata": {},
 75 |    "source": [
 76 |     "# Ethics & Privacy"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "markdown",
 81 |    "metadata": {},
 82 |    "source": [
 83 |     "*Fill in your ethics & privacy discussion here*"
 84 |    ]
 85 |   }
 86 |  ],
 87 |  "metadata": {
 88 |   "kernelspec": {
 89 |    "display_name": "Python 3",
 90 |    "language": "python",
 91 |    "name": "python3"
 92 |   },
 93 |   "language_info": {
 94 |    "codemirror_mode": {
 95 |     "name": "ipython",
 96 |     "version": 3
 97 |    },
 98 |    "file_extension": ".py",
 99 |    "mimetype": "text/x-python",
100 |    "name": "python",
101 |    "nbconvert_exporter": "python",
102 |    "pygments_lexer": "ipython3",
103 |    "version": "3.8.10"
104 |   }
105 |  },
106 |  "nbformat": 4,
107 |  "nbformat_minor": 2
108 | }
109 | 


--------------------------------------------------------------------------------
/dsip/projects/ProjectReport.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Project Report"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "# Overview"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "*Fill in your overview here*"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "markdown",
 26 |    "metadata": {},
 27 |    "source": [
 28 |     "<a id='research_question'></a>\n",
 29 |     "# Research Question"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "markdown",
 34 |    "metadata": {},
 35 |    "source": [
 36 |     "*Fill in your research question here*"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "markdown",
 41 |    "metadata": {},
 42 |    "source": [
 43 |     "<a id='background'></a>\n",
 44 |     "\n",
 45 |     "## Background & Prior Work"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "markdown",
 50 |    "metadata": {},
 51 |    "source": [
 52 |     "*Fill in your background and prior work here* \n",
 53 |     "\n",
 54 |     "References (include links):\n",
 55 |     "- 1)\n",
 56 |     "- 2)"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "# Hypothesis\n"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "markdown",
 68 |    "metadata": {},
 69 |    "source": [
 70 |     "*Fill in your hypotheses here*"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "markdown",
 75 |    "metadata": {},
 76 |    "source": [
 77 |     "# Dataset(s)"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "markdown",
 82 |    "metadata": {},
 83 |    "source": [
 84 |     "*Fill in your dataset information here*\n",
 85 |     "\n",
 86 |     "(Copy this information for each dataset)\n",
 87 |     "- Dataset Name:\n",
 88 |     "- Link to the dataset:\n",
 89 |     "- Number of observations:\n",
 90 |     "\n",
 91 |     "1-2 sentences describing each dataset. \n",
 92 |     "\n",
 93 |     "If you plan to use multiple datasets, add 1-2 sentences about how you plan to combine these datasets."
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "markdown",
 98 |    "metadata": {},
 99 |    "source": [
100 |     "# Setup"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": 1,
106 |    "metadata": {},
107 |    "outputs": [],
108 |    "source": [
109 |     "## YOUR CODE HERE"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "markdown",
114 |    "metadata": {},
115 |    "source": [
116 |     "# Data Cleaning"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "markdown",
121 |    "metadata": {},
122 |    "source": [
123 |     "Describe your data cleaning steps here."
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 2,
129 |    "metadata": {},
130 |    "outputs": [],
131 |    "source": [
132 |     "## YOUR CODE HERE\n",
133 |     "## FEEL FREE TO ADD MULTIPLE CELLS PER SECTION"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "markdown",
138 |    "metadata": {},
139 |    "source": [
140 |     "# Data Analysis & Results"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "markdown",
145 |    "metadata": {},
146 |    "source": [
147 |     "Include cells that describe the steps in your data analysis."
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": 3,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "## YOUR CODE HERE\n",
157 |     "## FEEL FREE TO ADD MULTIPLE CELLS PER SECTION"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "markdown",
162 |    "metadata": {},
163 |    "source": [
164 |     "# Ethics & Privacy"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "markdown",
169 |    "metadata": {},
170 |    "source": [
171 |     "*Fill in your ethics & privacy discussion here*"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "markdown",
176 |    "metadata": {},
177 |    "source": [
178 |     "# Conclusion & Discussion"
179 |    ]
180 |   },
181 |   {
182 |    "cell_type": "markdown",
183 |    "metadata": {},
184 |    "source": [
185 |     "*Fill in your discussion information here*"
186 |    ]
187 |   }
188 |  ],
189 |  "metadata": {
190 |   "kernelspec": {
191 |    "display_name": "Python 3",
192 |    "language": "python",
193 |    "name": "python3"
194 |   },
195 |   "language_info": {
196 |    "codemirror_mode": {
197 |     "name": "ipython",
198 |     "version": 3
199 |    },
200 |    "file_extension": ".py",
201 |    "mimetype": "text/x-python",
202 |    "name": "python",
203 |    "nbconvert_exporter": "python",
204 |    "pygments_lexer": "ipython3",
205 |    "version": "3.8.10"
206 |   }
207 |  },
208 |  "nbformat": 4,
209 |  "nbformat_minor": 2
210 | }
211 | 


--------------------------------------------------------------------------------
/dsip/projects/project_checklist.md:
--------------------------------------------------------------------------------
 1 | # Project Checklist
 2 | 
 3 | You can use this checklist to help guide your thinking on the final project. If you check off all the boxes below, you should be in good shape to get a perfect score on your final project.
 4 | 
 5 | ### Overview, Question & Background
 6 | 
 7 | **Overview**:
 8 | - [ ] Write a clear summary of what you did
 9 | - [ ] Briefly describe the results of your project
10 | - [ ] Limit overview to 3-4 sentences
11 | 
12 | **Research Question**:
13 | - [ ] Include a specific, clear data science question
14 | - [ ] Make sure what you're measuring (variables) to answer the question is clear
15 | 
16 | **Background & Prior Work**:
17 | - [ ] Include a general introduction to your topic
18 | - [ ] Include explanation of what work has been done previously
19 | - [ ] Include citations or links to previous work
20 | 
21 | **Hypothesis**:
22 | - [ ] Include the hypothesis
23 | - [ ] Ensure that this hypothesis is clear to readers
24 | - [ ] Explain why you think this will be the outcome (what was your thinking?)
25 | 
26 | ### Dataset(s):
27 | - [ ] Include an explanation of dataset(s) used (i.e. features/variables included, number of observations, information in dataset)
28 | - [ ] Source included (if outside dataset(s) being used)
29 | 
30 | ### Data Analysis:
31 | 
32 | **Data Cleaning & Pre-processing**
33 | - [ ] Perform Data Cleaning and explain steps taken OR include an explanation as to why data cleaning was unnecessary (how did you determine your dataset was ready to go?)
34 | - [ ] Dataset actually clean and usable after data wrangling steps carried out
35 | 
36 | **Data Visualization**:
37 | - [ ] Include at least three visualizations
38 | - [ ] Clearly label all axes on plots
39 | - [ ] Type of all plots appropriate given data displayed
40 | - [ ] Interpretation of each visualization included in the text
41 | 
42 | **Data Analysis & Results**:
43 | - [ ] EDA carried out with explanations of what was done and interpretations of output included
44 | - [ ] Appropriate analysis performed
45 | - [ ] Output of analysis interpreted and interpretation included in notebook
46 | 
47 | ### Privacy/Ethics Considerations:
48 | - [ ] Thoughtful discussion of ethical concerns included
49 | - [ ] Ethical concerns consider the whole data science process (question asked, data collected, data being used, the bias in data, analysis, post-analysis, etc.)
50 | - [ ] How your group handled bias/ethical concerns clearly described
51 | 
52 | ### Conclusion & Discussion:
53 | - [ ] Clear conclusion (answer to the question being asked) and discussion of results
54 | - [ ] Limitations of analysis discussed
55 | - [ ] Does not ramble on beyond providing necessary information
56 | 
57 | ### Final Checks:
58 | - [ ] Edit all text for clarity
59 | - [ ] Remove all instructions
60 | - [ ] Be sure text is included throughout to guide the reader
61 | - [ ] Check to make sure all text and images are visible
62 | 


--------------------------------------------------------------------------------
/dsip/projects/project_guidelines.md:
--------------------------------------------------------------------------------
  1 | # Project Guide
  2 | 
  3 | This is an edited version of the project guidelines used for the course.
  4 | 
  5 | If you wish to pursue an independent data science project, this outline may be a useful guide.
  6 | 
  7 | ## Project Overview
  8 | 
  9 | The Final Project will give you the chance to explore a topic of your choice and to expand your analytical skills. By working with real data of your choosing you can examine questions of particular interest to you.
 10 | 
 11 | The broad objectives for the project are to:
 12 | 
 13 | * Identify the problems and goals of a real situation and dataset.
 14 | * Choose an appropriate approach for formalizing and testing the problems and goals, and be able to articulate the reasoning for that selection.
 15 | * Implement your analysis choices on the dataset.
 16 | * Interpret the results of the analyses.
 17 | * Contextualize those results within a greater scientific and social context, acknowledging and addressing any potential issues related to privacy and ethics.
 18 | 
 19 | The basic project steps (broken down in more detail below):
 20 | 
 21 | * Find a real world dataset and problem that you believe can be solved with one or more of the techniques we have learned in class.
 22 | * After selecting a dataset and identifying the goal, write out a proposed analysis plan using the template provided and submit it through GitHub for review.
 23 | * Apply the techniques outlined and come up with a result for the dataset that you proposed.
 24 | * Assemble a Jupyter notebook that communicates your hypothesis, methods, and results. Submit this as your final project.
 25 | * Submit feedback about group and individual group members. This is done individually.
 26 | 
 27 | ## Project Components
 28 | 
 29 | ### Project Proposal
 30 | 
 31 | The project proposal includes the following sections:
 32 | 
 33 | **RESEARCH QUESTION**: What is your research question? Include the specific question you're setting out to answer. This question should be specific, answerable with data, and clear. A general question with specific subquestions is permitted. (1-2 sentences)
 34 | 
 35 | **BACKGROUND & PRIOR WORK**: This section will present the background and context of your topic and question in a few paragraphs. Include a general introduction to your topic and then describe what information you currently know about the topic after doing your initial research. Include references to other projects who have asked similar questions or approached similar problems. Explain what others have learned in their projects.
 36 | 
 37 | Find some relevant prior work, and reference those sources, summarizing what each did and what they learned. Even if you think you have a totally novel question, find the most similar prior work that you can and discuss how it relates to your project.
 38 | 
 39 | References can be research publications, but they need not be. Blogs, GitHub repositories, company websites, etc., are all viable references if they are relevant to your project. It must be clear which information comes from which references. (2-3 paragraphs, including at least 2 references)
 40 | 
 41 | **HYPOTHESIS**: What is your main hypothesis/predictions about what the answer to your question is? Briefly explain your thinking. (2-3 sentences)
 42 | 
 43 | **DATA**: Here, you are to *think* about and *describe* the *ideal* dataset (or datasets) you would need to answer this question:
 44 | 
 45 | * What variables would you have?
 46 | * How would they be stored?
 47 | * How many observations would you have?
 48 | * What/who would the observations be? Over what time period? etc.
 49 | * etc.
 50 | 
 51 | Note: For the project proposal, you do NOT have to find the actual dataset(s) needed for your project. For the first checkpoint and onward, you will.
 52 | 
 53 | **ETHICS & PRIVACY**: Acknowledge and address any ethics & privacy related issues of your question(s), proposed dataset(s), and/or analyses. Use the information provided in lecture to guide your group discussion and thinking. If you need further guidance, check out [Deon's Ethics Checklist](http://deon.drivendata.org/#data-science-ethics-checklist). In particular:
 54 | 
 55 | * Are there any biases/privacy/terms of use issues with the data you proposed?
 56 | * Are there potential biases in your dataset(s), in terms of who it composes, and how it was collected, that may be problematic in terms of it allowing for equitable analysis? (For example, does your data exclude particular populations, or is it likely to reflect particular human biases in a way that could be a problem?)
 57 | * How will you set out to detect these specific biases before, during, and after/when communicating your analysis?
 58 | * Are there any other issues related to your topic area, data, and/or analyses that are potentially problematic in terms of data privacy and equitable impact?
 59 | * How will you handle issues you identified?
 60 | 
 61 | (1-2 paragraphs)
 62 | 
 63 | #### Project Proposal - Style Guidelines
 64 | 
 65 | The proposal should be written clearly and at a level understandable by a typical undergraduate student.
 66 | 
 67 | This is a short but detailed proposal meant to give us time to assess and critique your Final Project idea (further described below), in order to give you time to improve upon it throughout the quarter.
 68 | 
 69 | Remember to proofread your Project Proposal. Do not use overly flowery and/or vague language.
 70 | 
 71 | ### Final Project
 72 | 
 73 | Time to put it all together! The main products of the final project are 1) a report submitted as a single Jupyter Notebook on GitHub and 2) a 3-5 minute video communicating your group project.
 74 | 
 75 | #### Final Report
 76 | 
 77 | This single notebook should include all the code you used for all components of the project (cleaning, visualization, analysis). Because we won’t be running the code in your notebook, it is important to make sure your notebook as submitted to GitHub has the code evaluated and outputs present (e.g., plots) so that we can read the project as is.
 78 | 
 79 | #### Report Sections - Instructions
 80 | 
 81 | Each of the following sections corresponds to a section in the file FinalProject_groupXXX.ipynb (template is in your group's GitHub repo).
 82 | 
 83 | For sections included in your proposal and previous checkpoints, you can copy and paste into your final project, but be sure to edit these sections with feedback you received on your proposal or additional information you learned throughout the project. This report should read clearly from start to finish, explaining what you did, why you did it, and what you learned. This should be a concise and well-written report.
 84 | 
 85 | **PERMISSIONS**: Specify whether you want your group project to be made publicly available. Place an X in the square brackets where appropriate.
 86 | 
 87 | **OVERVIEW**: Include 3-4 sentences summarizing your group’s project and results.
 88 | 
 89 | **NAMES**: See proposal specifications.
 90 | 
 91 | **RESEARCH QUESTION**: See proposal specifications.
 92 | 
 93 | **BACKGROUND & PRIOR WORK**: See proposal specifications.
 94 | 
 95 | **HYPOTHESIS**: See proposal specifications.
 96 | 
 97 | **DATASET(S)**: Same as Checkpoint #1.
 98 | 
 99 | **SETUP**: See Checkpoint #1.
100 | 
101 | **DATA CLEANING**: See  Checkpoint #1.
102 | 
103 | **DATA ANALYSIS & RESULTS**: This section should include markdown text and code walking us through the following:
104 | 
105 | * EDA (Same as Checkpoint #2, but clean visualizations up and feel free to remove unnecessary visualizations)
106 |   * What distributions do your variables take?
107 |   * Are there any outliers?
108 |   * Relationship between variables?
109 | 
110 | * Analysis (Note that you will likely have to do some Googling for analytical approaches not discussed in class. This is expected for this project and an important skill for a data scientist to master.)
111 |   * What approaches did you use? Why?
112 |   * What were the results?
113 |   * What was your interpretation of these findings?
114 | 
115 | * Data Visualization - There must be at least three (3) appropriate data visualizations throughout these sections. Each visualization must include an interpretation of what is displayed *and* what should be learned from that visualization. Be sure that the appropriate type of visualization is generated given the data that you have, axes are all labeled, and the visualizations clearly communicate the point you’re trying to make.
116 | 
117 | **ETHICS & PRIVACY**: See proposal specifications. (be sure to update with what you actually did to take the ethical considerations into account for the analysis you did!)
118 | 
119 | **CONCLUSION & DISCUSSION**: Discuss your project. Summarize your data and question. Briefly describe your analysis. Summarize your results and conclusions. Be sure to mention any limitations of your project. Discuss the impact of this work on society. (2-3 paragraphs)
120 | 
121 | 
122 | ## Previous Final Projects
123 | 
124 | See Prof. Voytek’s write-up of excellent class projects from the Spring 2017 instance of COGS 108 [here](https://voyteklab.com/uc-san-diego-data-science-projects/), all of which received perfect scores.
125 | 
126 | Additionally, previous projects can be viewed from when this course ran in [Spring 2017](https://github.com/COGS108/FinalProjects-Sp17), [Winter 2018](https://github.com/COGS108/FinalProjects-Wi18), [Spring 2019](https://github.com/COGS108/FinalProjects-Sp19), [Fall 2019](https://github.com/COGS108/FinalProjects-Fa19), [Winter 2020](https://github.com/COGS108/FinalProjects-Wi20), [Spring 2020](https://github.com/COGS108/FinalProjects-Sp20), [Fall 2020](https://github.com/COGS108/FinalProjects-Fa20), or [Winter 2021](https://github.com/COGS108/FinalProjects-Wi21). Note first, that these projects are of variable quality and second, that if you get inspiration or code from previous projects, this must be noted in your project, giving attribution to the former groups’ work.
127 | 
128 | ## How to Find Datasets
129 | 
130 | The purpose of this project is to find a real-world problem and dataset (or likely, datasets!) that can be analyzed with the techniques learned in class and those you learn on your own. It is imperative that by doing so you believe extra information will be gained — that you believe you can discover something new!
131 | 
132 | You must use at least one dataset containing at least approximately 1000 observations (unless your data are smaller but you feel they are sufficient). You are welcome (and in fact encouraged) to find multiple datasets!
133 | 
134 | The best datasets are the ones that can help you answer your question of interest.
135 | 
136 | Your question could be just for fun: Using text mining of song lyric websites to identify the most commonly used phrases and sentiments by decade.
137 | 
138 | Your question could be scientific: Scrape data from animal taxonomies and Wikipedia to figure out if larger animals are more likely to be carnivores.
139 | 
140 | Or, ideally, your question can be aimed at civic or social good, for example, use mapping, transit, and car accident data to identify which parts of San Diego are most in need of dedicated bike lanes.
141 | 
142 | To help you find datasets, we have collected a list of websites that have a considerable number of open source data sets and included them at the end of this document.
143 | 
144 | ### Dataset Resource List
145 | 
146 | Here is a list of potential locations to find datasets and problems to investigate. If you have another dataset or search location, that is great!
147 | 
148 | * [Awesome Public Datasets](https://github.com/awesomedata/awesome-public-datasets/blob/master/README.rst)
149 | * [Data.gov](https://catalog.data.gov/dataset)
150 | * [Data Is Plural](https://docs.google.com/spreadsheets/d/1wZhPLMCHKJvwOkP4juclhjFgqIY8fQFMemwKL2c64vk/edit#gid=0)
151 | * [UCSD Datasets](https://ucsd.libguides.com/data-statistics/home)
152 | * [Datasets | Deep Learning](http://deeplearning.net/datasets/)
153 | * [Stanford | Social Science Data Collection](https://data.stanford.edu/)
154 | * [Eviction Lab (email required)](https://evictionlab.org/get-the-data/)
155 | * [San Diego Data](https://data.sandiego.gov/)
156 | * [US Census](https://www.census.gov/)
157 | * [Open Climate Data](http://openclimatedata.net/)
158 | * [Data and Story Library](https://dasl.datadescription.com/datafiles/)
159 | * [UCSD behavioral mobile data](http://extrasensory.ucsd.edu/)
160 | * [Kaggle](https://www.kaggle.com/)
161 | * [FiveThirtyEight](https://data.fivethirtyeight.com/)
162 | * [data.world](https://data.world/)
163 | * [Free Datasets - R and Data Mining ](http://www.rdatamining.com/resources/data)
164 | * [Data Sources for Cool Data Science Projects](https://blog.thedataincubator.com/2014/10/data-sources-for-cool-data-science-projects-part-1/)
165 | 


--------------------------------------------------------------------------------
/dsip/tutorials/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore ipynb checkpoint files
2 | *ipynb_checkpoints/*
3 | # Ignore Mac Folder Attribute files
4 | *DS_Store*
5 | 


--------------------------------------------------------------------------------
/dsip/tutorials/00-Introduction.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true,
  7 |     "nbpresent": {
  8 |      "id": "7fc0cefe-8b1c-4ca9-aa39-094614969842"
  9 |     }
 10 |    },
 11 |    "source": [
 12 |     "# Introduction\n",
 13 |     "\n",
 14 |     "Welcome to the hands on materials for Data Science in Practice.\n",
 15 |     "\n",
 16 |     "This notebook will guide you through getting the tools you will need for working with these tutorials and assignments."
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "## Alerts"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "markdown",
 28 |    "metadata": {},
 29 |    "source": [
 30 |     "Throughout these tutorials, you will see colored 'alert' text:\n",
 31 |     "\n",
 32 |     "<div class=\"alert alert-success\">\n",
 33 |     "Green alerts provide key information and definitions.\n",
 34 |     "</div>\n",
 35 |     "\n",
 36 |     "<div class=\"alert alert-info\">\n",
 37 |     "Blue alerts provide links out to further \n",
 38 |     "<a href=https://google.com class=alert-link>resources</a>. \n",
 39 |     "</div>"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "markdown",
 44 |    "metadata": {
 45 |     "nbpresent": {
 46 |      "id": "b6153143-e694-4e86-96b0-243f56bad8d5"
 47 |     }
 48 |    },
 49 |    "source": [
 50 |     "## What do you need for these tutorials?\n",
 51 |     "\n",
 52 |     "### Software\n",
 53 |     "\n",
 54 |     "- Working install of Python (>= 3.6), with the anaconda distribution\n",
 55 |     "    - If you are in the official class, [datahub](http://datahub.ucsd.edu) satisfies this requirement\n",
 56 |     "- Jupyter Notebooks\n",
 57 |     "    - Also satisfied by [datahub](http://datahub.ucsd.edu)\n",
 58 |     "- git and a GitHub account"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "### Prerequisites\n",
 66 |     "\n",
 67 |     "These tutorials presume that you do already have some basic knowledge of programming. \n",
 68 |     "\n",
 69 |     "In particular, it assumes knowledge of the Python programming language and standard library. \n",
 70 |     "\n",
 71 |     "If you are somewhat unfamiliar with Python, you can follow the links in the Python notebook to catch up."
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "markdown",
 76 |    "metadata": {},
 77 |    "source": [
 78 |     "### Computational Resources\n",
 79 |     "\n",
 80 |     "The examples throughout these tutorials and in the assignments are not computationally heavy. \n",
 81 |     "\n",
 82 |     "You should be able to run all these materials on any computer you have access to, assuming it will run the aforementioned tools. "
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "markdown",
 87 |    "metadata": {},
 88 |    "source": [
 89 |     "### Installing Python\n",
 90 |     "\n",
 91 |     "- If you are running code locally, we recommend you install a new version of Python with Anaconda, as described below\n",
 92 |     "    - If you are in the official course, you can use [datahub](http://datahub.ucsd.edu) for everything you need\n",
 93 |     "- If you are on Mac, you have a native installation of python. This native installation of Python may be older, will not include the extra packages that you will need for this class, and is best left untouched. \n",
 94 |     "    - Downloading Anaconda will install a separate, independent install of Python, leaving your native install untouched. \n",
 95 |     "- Windows does not require Python natively and so it is not typically pre-installed."
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "markdown",
100 |    "metadata": {},
101 |    "source": [
102 |     "## Tools\n",
103 |     "\n",
104 |     "The following are a series of tools that you will need for this class"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "<br>\n",
112 |     "<br>\n",
113 |     "<img src=\"https://raw.githubusercontent.com/COGS108/Tutorials/master/img/anaconda.png\" width=\"350px\">\n",
114 |     "<br>\n",
115 |     "<br>"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "markdown",
120 |    "metadata": {},
121 |    "source": [
122 |     "<div class=\"alert alert-success\">\n",
123 |     "Anaconda is an open-source distribution of Python, designed for scientific computing, data science and machine learning. \n",
124 |     "</div>\n",
125 |     "\n",
126 |     "<div class=\"alert alert-info\">\n",
127 |     "The anaconda website is \n",
128 |     "<a href=\"https://www.anaconda.com\" class=\"alert-link\">here</a>,\n",
129 |     "with the download page\n",
130 |     "<a href=\"https://www.anaconda.com/download\" class=\"alert-link\">here</a>.\n",
131 |     "</div>"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "markdown",
136 |    "metadata": {},
137 |    "source": [
138 |     "Anaconda itself is a distribution, meaning that it is a version of Python with a collection of packages that are curated and maintained together. \n",
139 |     "\n",
140 |     "Using a pre-built distribution is useful, as it comes with the packages that you need for data science.\n",
141 |     "\n",
142 |     "Anaconda also comes with `conda`, which is a package manager, allowing you to download, install, and manage other packages. \n",
143 |     "\n",
144 |     "The anaconda distribution includes all packages that are needed for these tutorials."
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "markdown",
149 |    "metadata": {},
150 |    "source": [
151 |     "<br>\n",
152 |     "<br>\n",
153 |     "<img src=\"https://raw.githubusercontent.com/COGS108/Tutorials/master/img/jupyter.png\" width=\"250px\">\n",
154 |     "<br>\n",
155 |     "<br>"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "markdown",
160 |    "metadata": {
161 |     "nbpresent": {
162 |      "id": "0f4dd046-4020-465c-85f6-3d92ac9fe145"
163 |     }
164 |    },
165 |    "source": [
166 |     "<div class=\"alert alert-success\">\n",
167 |     "Jupyter notebooks are a way to intermix code, outputs and plain text. \n",
168 |     "They run in a web browser, and connect to a kernel to be able to execute code. \n",
169 |     "</div>\n",
170 |     "\n",
171 |     "<div class=\"alert alert-info\">\n",
172 |     "The official Jupyter website is available \n",
173 |     "<a href=\"http://jupyter.org\" class=\"alert-link\">here</a>.\n",
174 |     "</div>"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "markdown",
179 |    "metadata": {},
180 |    "source": [
181 |     "Note that you do not need to download Jupyter separately, as it comes packaged with the Anaconda distribution."
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "markdown",
186 |    "metadata": {},
187 |    "source": [
188 |     "#### Checking Your Python Version\n",
189 |     "\n",
190 |     "You can check which installation of Python you are using, and which version it is.\n",
191 |     "\n",
192 |     "Once you have installed anaconda, you should see you are using Python in an anaconda folder. \n",
193 |     "\n",
194 |     "The version number that is printed should also be 3.6 or greater. "
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": 1,
200 |    "metadata": {},
201 |    "outputs": [
202 |     {
203 |      "name": "stdout",
204 |      "output_type": "stream",
205 |      "text": [
206 |       "/opt/anaconda3/bin/python\n",
207 |       "Python 3.7.4\n"
208 |      ]
209 |     }
210 |    ],
211 |    "source": [
212 |     "# Check the installed version of Python\n",
213 |     "#   Note: these are command-line functions that may not work on windows\n",
214 |     "!which python\n",
215 |     "!python --version"
216 |    ]
217 |   },
218 |   {
219 |    "cell_type": "markdown",
220 |    "metadata": {},
221 |    "source": [
222 |     "<br>\n",
223 |     "<br>\n",
224 |     "<img src=\"https://raw.githubusercontent.com/COGS108/Tutorials/master/img/git.png\" width=\"300px\">\n",
225 |     "<br>\n",
226 |     "<br>"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "markdown",
231 |    "metadata": {
232 |     "nbpresent": {
233 |      "id": "6576af9c-b0f3-4cbe-9a02-06feaa61d0b0"
234 |     }
235 |    },
236 |    "source": [
237 |     "<div class=\"alert alert-success\">\n",
238 |     "Git is a tool, a software package, for version control. \n",
239 |     "</div>\n",
240 |     "\n",
241 |     "<div class=\"alert alert-info\">\n",
242 |     "Install \n",
243 |     "<a href=\"https://git-scm.com/book/en/v2/Getting-Started-Installing-Git\" class=\"alert-link\">git</a>,\n",
244 |     "if you don't already have it.\n",
245 |     "</div>"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "markdown",
250 |    "metadata": {},
251 |    "source": [
252 |     "<br>\n",
253 |     "<br>\n",
254 |     "<img src=\"https://raw.githubusercontent.com/COGS108/Tutorials/master/img/github.png\" width=\"300px\">\n",
255 |     "<br>\n",
256 |     "<br>"
257 |    ]
258 |   },
259 |   {
260 |    "cell_type": "markdown",
261 |    "metadata": {},
262 |    "source": [
263 |     "<div class=\"alert alert-success\">\n",
264 |     "Github is an online hosting service that can be used with git, and offers online tools to use git. \n",
265 |     "</div>\n",
266 |     "\n",
267 |     "<div class=\"alert alert-info\">\n",
268 |     "Create an account on \n",
269 |     "<a href=\"https://github.com/\" class=\"alert-link\">Github</a>.\n",
270 |     "</div>"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "markdown",
275 |    "metadata": {},
276 |    "source": [
277 |     "Git & GitHub are not the same thing, though, in practice, they are commonly used together, whereby git is used as a tool to version control code and manage multiple copies stored across your computer, as well as on remote repositories that are stored on Github.\n",
278 |     "\n",
279 |     "Note that while GitHub is a private company, git is an open-source tool, and can be used independent of GitHub."
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": 2,
285 |    "metadata": {},
286 |    "outputs": [
287 |     {
288 |      "name": "stdout",
289 |      "output_type": "stream",
290 |      "text": [
291 |       "git version 2.20.1 (Apple Git-117)\r\n"
292 |      ]
293 |     }
294 |    ],
295 |    "source": [
296 |     "# Check that you have git installed (which version doesn't really matter)\n",
297 |     "!git --version"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "markdown",
302 |    "metadata": {},
303 |    "source": [
304 |     "<br>\n",
305 |     "<br>\n",
306 |     "<img src=\"https://raw.githubusercontent.com/COGS108/Tutorials/master/img/sourcetree.png\" width=\"500px\">\n",
307 |     "<br>\n",
308 |     "<br>"
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "markdown",
313 |    "metadata": {},
314 |    "source": [
315 |     "<div class=\"alert alert-success\">\n",
316 |     "Source Tree is a free graphical user interface (GUI) for managing repositories with git & Github. \n",
317 |     "</div>\n",
318 |     "\n",
319 |     "<div class=\"alert alert-info\">\n",
320 |     "Source Tree is available \n",
321 |     "<a href=\"https://www.sourcetreeapp.com\" class=\"alert-link\">here</a>.\n",
322 |     "You will need an account on \n",
323 |     "<a href=\"https://www.atlassian.com\" class=\"alert-link\">Atlassian</a>,\n",
324 |     "who make Source Tree, but this is free.\n",
325 |     "</div>\n",
326 |     "\n",
327 |     "You don't need to use SourceTree (or any other GUI) if you know, or want to learn to use git from the command line."
328 |    ]
329 |   },
330 |   {
331 |    "cell_type": "markdown",
332 |    "metadata": {},
333 |    "source": [
334 |     "## Environments"
335 |    ]
336 |   },
337 |   {
338 |    "cell_type": "markdown",
339 |    "metadata": {},
340 |    "source": [
341 |     "<div class=\"alert alert-success\">\n",
342 |     "Environments are isolated, independent installations of a programming language and groups of packages, that don't interfere with each other. \n",
343 |     "</div>\n",
344 |     "\n",
345 |     "<div class=\"alert alert-info\">\n",
346 |     "Anaconda has detailed instructions on using environments available \n",
347 |     "<a href=\"https://conda.io/docs/using/envs.html\" class=\"alert-link\">here</a>.\n",
348 |     "</div>"
349 |    ]
350 |   },
351 |   {
352 |    "cell_type": "markdown",
353 |    "metadata": {},
354 |    "source": [
355 |     "You do not need to use environments, however you may find it useful if you want or need to maintain multiple different versions of Python. \n",
356 |     "\n",
357 |     "If you want to use an environment, and already have conda, you can run this command from command line: <br>\n",
358 |     "\n",
359 |     "``$ conda create --name *envname* python=3.7 anaconda`` <br>\n",
360 |     "\n",
361 |     "^ Replace '*envname*' with a name to call this environment.<br>\n",
362 |     "\n",
363 |     "This will install a new environment, with Python 3.7 and the anaconda distribution.\n",
364 |     "\n",
365 |     "You will then need to activate this environment every time you want to use it. \n",
366 |     "\n",
367 |     "To activate your environment: <br>\n",
368 |     "``$ conda activate *envname*``\n",
369 |     "\n",
370 |     "To deactivate your environment: <br>\n",
371 |     "``$ conda deactivate``"
372 |    ]
373 |   }
374 |  ],
375 |  "metadata": {
376 |   "anaconda-cloud": {},
377 |   "kernelspec": {
378 |    "display_name": "Python 3",
379 |    "language": "python",
380 |    "name": "python3"
381 |   },
382 |   "language_info": {
383 |    "codemirror_mode": {
384 |     "name": "ipython",
385 |     "version": 3
386 |    },
387 |    "file_extension": ".py",
388 |    "mimetype": "text/x-python",
389 |    "name": "python",
390 |    "nbconvert_exporter": "python",
391 |    "pygments_lexer": "ipython3",
392 |    "version": "3.7.4"
393 |   },
394 |   "nbpresent": {
395 |    "slides": {
396 |     "3d09dc46-88c8-44cb-bc57-259db78a0e70": {
397 |      "id": "3d09dc46-88c8-44cb-bc57-259db78a0e70",
398 |      "prev": "8d1b5def-2290-42c9-8b06-1c6e0e495521",
399 |      "regions": {
400 |       "4601423d-c94d-46da-885b-fe33b0216c22": {
401 |        "attrs": {
402 |         "height": 1,
403 |         "width": 1,
404 |         "x": 0,
405 |         "y": 0
406 |        },
407 |        "content": {
408 |         "cell": "0f4dd046-4020-465c-85f6-3d92ac9fe145",
409 |         "part": "whole"
410 |        },
411 |        "id": "4601423d-c94d-46da-885b-fe33b0216c22"
412 |       }
413 |      }
414 |     },
415 |     "8d1b5def-2290-42c9-8b06-1c6e0e495521": {
416 |      "id": "8d1b5def-2290-42c9-8b06-1c6e0e495521",
417 |      "prev": "bc666852-d015-42a1-b679-eaf92d5eb643",
418 |      "regions": {
419 |       "d0118c2f-7757-4efa-a276-96f162d312ae": {
420 |        "attrs": {
421 |         "height": 1,
422 |         "width": 1,
423 |         "x": 0,
424 |         "y": 0
425 |        },
426 |        "content": {
427 |         "cell": "d9d878d6-230b-4f1e-b2aa-2f152cb3fe8e",
428 |         "part": "whole"
429 |        },
430 |        "id": "d0118c2f-7757-4efa-a276-96f162d312ae"
431 |       }
432 |      }
433 |     },
434 |     "b039dd05-8357-462a-9525-7f8103de436c": {
435 |      "id": "b039dd05-8357-462a-9525-7f8103de436c",
436 |      "prev": "3d09dc46-88c8-44cb-bc57-259db78a0e70",
437 |      "regions": {
438 |       "9180ab3f-f784-45a2-b2cc-a18aad800fc5": {
439 |        "attrs": {
440 |         "height": 1,
441 |         "width": 1,
442 |         "x": 0,
443 |         "y": 0
444 |        },
445 |        "content": {
446 |         "cell": "b57ed03a-8c01-4e48-95e8-9c6753e35088",
447 |         "part": "whole"
448 |        },
449 |        "id": "9180ab3f-f784-45a2-b2cc-a18aad800fc5"
450 |       }
451 |      }
452 |     },
453 |     "bc666852-d015-42a1-b679-eaf92d5eb643": {
454 |      "id": "bc666852-d015-42a1-b679-eaf92d5eb643",
455 |      "layout": "grid",
456 |      "prev": null,
457 |      "regions": {
458 |       "31cd776f-cc93-49d6-a40c-c590805cfb8f": {
459 |        "attrs": {
460 |         "height": 0.8333333333333334,
461 |         "pad": 0.01,
462 |         "width": 0.8333333333333334,
463 |         "x": 0.08333333333333333,
464 |         "y": 0.08333333333333333
465 |        },
466 |        "content": {
467 |         "cell": "7fc0cefe-8b1c-4ca9-aa39-094614969842",
468 |         "part": "whole"
469 |        },
470 |        "id": "31cd776f-cc93-49d6-a40c-c590805cfb8f"
471 |       },
472 |       "e1612c29-0f61-4692-9d6e-112e8d378e46": {
473 |        "attrs": {
474 |         "height": 0.8333333333333334,
475 |         "pad": 0.01,
476 |         "width": 0.8333333333333334,
477 |         "x": 0.08333333333333333,
478 |         "y": 0.08333333333333333
479 |        },
480 |        "content": {
481 |         "cell": "7fc0cefe-8b1c-4ca9-aa39-094614969842",
482 |         "part": "whole"
483 |        },
484 |        "id": "e1612c29-0f61-4692-9d6e-112e8d378e46"
485 |       }
486 |      }
487 |     }
488 |    },
489 |    "themes": {}
490 |   }
491 |  },
492 |  "nbformat": 4,
493 |  "nbformat_minor": 1
494 | }
495 | 


--------------------------------------------------------------------------------
/dsip/tutorials/01-Python.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Python"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {
 13 |     "collapsed": true
 14 |    },
 15 |    "source": [
 16 |     "<br>\n",
 17 |     "<br>\n",
 18 |     "<img src=\"https://raw.githubusercontent.com/COGS108/Tutorials/master/img/python.png\" width=\"400px\">\n",
 19 |     "<br>\n",
 20 |     "<br>"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "markdown",
 25 |    "metadata": {
 26 |     "collapsed": true
 27 |    },
 28 |    "source": [
 29 |     "<div class=\"alert alert-success\"> \n",
 30 |     "Python is an \n",
 31 |     "<a href=https://en.wikipedia.org/wiki/Open-source_model class=\"alert-link\">open-source</a>, \n",
 32 |     "<a href=https://en.wikipedia.org/wiki/High-level_programming_language class=\"alert-link\">high-level </a>, \n",
 33 |     "<a href=https://en.wikipedia.org/wiki/General-purpose_programming_language class=\"alert-link\">general purpose</a>, \n",
 34 |     "<a href=https://en.wikipedia.org/wiki/Interpreted_language class=\"alert-link\">interpreted</a>, \n",
 35 |     "<a href=https://en.wikipedia.org/wiki/Programming_language class=\"alert-link\">programming language</a>, \n",
 36 |     "one of the most popular for data science applications. \n",
 37 |     "</div>\n",
 38 |     "\n",
 39 |     "<div class=\"alert alert-info\">\n",
 40 |     "The official Python\n",
 41 |     "<a href=https://www.python.org class=\"alert-link\">website</a>.\n",
 42 |     "</div>"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "markdown",
 47 |    "metadata": {},
 48 |    "source": [
 49 |     "## Why Python\n",
 50 |     "\n",
 51 |     "- As a general purpose language, Python supports a large range of tasks.\n",
 52 |     "    - Or put another way: 'Python isn't the best at anything, but it's second best at everything'\n",
 53 |     "    - This is useful. A data science project may include everything from scraping data from the web, analyzing a mixture of text and numerical data, computing features, training a model, creating high-quality graphs, and then hosting a website with your results. \n",
 54 |     "- Python is explicitly and by design, user-friendly.\n",
 55 |     "- Python also has a massive user community, who contribute to a large number of high-quality, well maintained open-source tools.\n",
 56 |     "    - The best language for your project is one which has the things you need.\n",
 57 |     "- In part for the reasons listed above, Python is heavily used in industry\n"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "markdown",
 62 |    "metadata": {},
 63 |    "source": [
 64 |     "<div class=\"alert alert-info\">\n",
 65 |     "The Python programming language is developed and maintained by the\n",
 66 |     "<a href=https://www.python.org/psf/ class=\"alert-link\">Python Software Foundation</a>.\n",
 67 |     "</div>"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "markdown",
 72 |    "metadata": {},
 73 |    "source": [
 74 |     "## Python Versions\n",
 75 |     "\n",
 76 |     "This class uses Python3, the currently developed version of Python, and more specifically Python version 3.6 or above. \n",
 77 |     "\n",
 78 |     "Python2 has reached \"End of Life\" meaning it is no longer supported or maintained by the Python Organization. "
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "markdown",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "## Python Resources\n",
 86 |     "\n",
 87 |     "These materials presume prior knowledge of the Python programming language. \n",
 88 |     "\n",
 89 |     "If you are not yet familiar, here are some entry level materials for learning Python:\n",
 90 |     "\n",
 91 |     "- [Codecademy](https://www.codecademy.com/tracks/python) is good for a beginner's introduction to the language.\n",
 92 |     "- [The Official Beginners Guide](https://wiki.python.org/moin/BeginnersGuide) is supported by the Python organization.\n",
 93 |     "- [Whirlwind Tour of Python](https://github.com/jakevdp/WhirlwindTourOfPython) is a free collection of Jupyter notebooks that takes you through Python. \n",
 94 |     "    - This book is especially good (and specifically designed for) if you have some experience with programming in some other language, and want to quickly run through the specifics of Python."
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "markdown",
 99 |    "metadata": {},
100 |    "source": [
101 |     "<div class=\"alert alert-info\">\n",
102 |     "A much broader list of resources and guides for learning Python is available \n",
103 |     "<a href=https://github.com/openlists/PythonResources class=\"alert-link\">here</a>.\n",
104 |     "</div>"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "## Getting Un-Stuck\n",
112 |     "\n",
113 |     "At some point, you will get stuck. It happens. The internet is your friend. \n",
114 |     "\n",
115 |     "If you get an error, or aren't sure how to proceed, use {your favourite search engine} with specific search terms relating to what you are trying to do. Sometimes this just means searching the error that you got.\n",
116 |     "\n",
117 |     "You are likely to find responses on [StackOverflow](https://stackoverflow.com) - which is basically a forum for programming questions, and a good place to find answers. "
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "## Standard Library"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "<div class=\"alert alert-success\"> \n",
132 |     "The Standard Library refers to everything in Python that is part of the standard version and install of Python.\n",
133 |     "</div>"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "markdown",
138 |    "metadata": {},
139 |    "source": [
140 |     "<div class=\"alert alert-info\"> \n",
141 |     "The Python \n",
142 |     "<a href=https://docs.python.org/3/library/ class=\"alert-link\">Standard Library</a>\n",
143 |     "comes with a lot of basic functionality. \n",
144 |     "</div>"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "markdown",
149 |    "metadata": {},
150 |    "source": [
151 |     "Part of what makes Python a powerful language is the standard library itself, which is a rich set of tools for programming. However, the standard library itself does not include data science tools, and a lot of the power of Python stems from a rich ecosystem of packages that can be added and used with Python. "
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "markdown",
156 |    "metadata": {},
157 |    "source": [
158 |     "## Packages"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "markdown",
163 |    "metadata": {},
164 |    "source": [
165 |     "<div class=\"alert alert-success\"> \n",
166 |     "Packages are collections of code. Packages from outside the standard library can be installed and added to Python.\n",
167 |     "</div>\n",
168 |     "\n",
169 |     "<div class=\"alert alert-info\"> \n",
170 |     "For managing and installing packages, Anaconda comes with the \n",
171 |     "<a href=\"https://conda.io/docs/using/pkgs.html\" class=\"alert-link\">conda</a>\n",
172 |     "package manager.\n",
173 |     "</div>"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "metadata": {},
179 |    "source": [
180 |     "## Scientific Python\n",
181 |     "\n",
182 |     "When we say that Python is good for data science, and scientific computing, what we really mean is that there is a rich ecosystem of available open-source external packages, that greatly expand the capacities of the language beyond the standard library. \n",
183 |     "\n",
184 |     "This set of packages, which we will introduce as we go through these materials, is sometimes referred to as 'Scientific Python', or the 'Scipy' ecosystem. \n",
185 |     "\n",
186 |     "For the purposes of these materials, the Anaconda distribution that we are using contains all the packages you need. "
187 |    ]
188 |   }
189 |  ],
190 |  "metadata": {
191 |   "anaconda-cloud": {},
192 |   "kernelspec": {
193 |    "display_name": "Python 3",
194 |    "language": "python",
195 |    "name": "python3"
196 |   },
197 |   "language_info": {
198 |    "codemirror_mode": {
199 |     "name": "ipython",
200 |     "version": 3
201 |    },
202 |    "file_extension": ".py",
203 |    "mimetype": "text/x-python",
204 |    "name": "python",
205 |    "nbconvert_exporter": "python",
206 |    "pygments_lexer": "ipython3",
207 |    "version": "3.7.4"
208 |   }
209 |  },
210 |  "nbformat": 4,
211 |  "nbformat_minor": 1
212 | }
213 | 


--------------------------------------------------------------------------------
/dsip/tutorials/02-JupyterNotebooks.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Jupyter Notebooks\n",
  8 |     "\n",
  9 |     "<br>\n",
 10 |     "<br>\n",
 11 |     "<img src=\"https://raw.githubusercontent.com/COGS108/Tutorials/master/img/jupyter.png\" width=\"200px\">\n",
 12 |     "<br>\n",
 13 |     "<br>\n",
 14 |     "\n",
 15 |     "This is a quick introduction to Jupyter notebooks."
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "markdown",
 20 |    "metadata": {},
 21 |    "source": [
 22 |     "<div class=\"alert alert-success\">\n",
 23 |     "Jupyter notebooks are a way to combine executable code, code outputs, and text into one connected file.\n",
 24 |     "</div>\n",
 25 |     "\n",
 26 |     "<div class=\"alert alert-info\">\n",
 27 |     "The official documentation from project Jupyter is available \n",
 28 |     "<a href=\"https://jupyter-notebook.readthedocs.io/en/stable/\" class=\"alert-link\">here</a>\n",
 29 |     "and they also have some example notebooks \n",
 30 |     "<a href=\"https://github.com/jupyter/notebook/tree/master/docs/source/examples/Notebook\" class=\"alert-link\">here</a>.\n",
 31 |     "</div>"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "markdown",
 36 |    "metadata": {},
 37 |    "source": [
 38 |     "## Menu Options & Shortcuts\n",
 39 |     "\n",
 40 |     "To get a quick tour of the Jupyter user-interface, click on the 'Help' menu, then click 'User Interface Tour'.\n",
 41 |     "\n",
 42 |     "There are also a large number of useful keyboard shortcuts. Click on the 'Help' menu, and then 'Keyboard Shortcuts' to see a list. "
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "markdown",
 47 |    "metadata": {},
 48 |    "source": [
 49 |     "## Cells"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "<div class=\"alert alert-success\">\n",
 57 |     "The main organizational units of the notebook are 'cells'.\n",
 58 |     "</div>"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "markdown",
 63 |    "metadata": {},
 64 |    "source": [
 65 |     "Cells can be markdown (text), like this one, or code cells (we'll get to those)."
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {},
 71 |    "source": [
 72 |     "### Markdown cells"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {
 78 |     "slideshow": {
 79 |      "slide_type": "fragment"
 80 |     }
 81 |    },
 82 |    "source": [
 83 |     "Markdown cells are useful for communicating information about our notebooks.\n",
 84 |     "\n",
 85 |     "They perform basic text formatting including italics, bold, headings, links and images.\n",
 86 |     "\n",
 87 |     "Double-click on any of the cells in this section to see what the plain-text looks like. Run the cell to then see what the formatted Markdown text looks like."
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "markdown",
 92 |    "metadata": {
 93 |     "slideshow": {
 94 |      "slide_type": "slide"
 95 |     }
 96 |    },
 97 |    "source": [
 98 |     "# This is a heading\n",
 99 |     "\n",
100 |     "## This is a smaller heading\n",
101 |     "\n",
102 |     "### This is a really small heading"
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "markdown",
107 |    "metadata": {
108 |     "slideshow": {
109 |      "slide_type": "slide"
110 |     }
111 |    },
112 |    "source": [
113 |     "We can italicize my text either like *this* or like _this_."
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "markdown",
118 |    "metadata": {
119 |     "slideshow": {
120 |      "slide_type": "fragment"
121 |     }
122 |    },
123 |    "source": [
124 |     "We can embolden my text either like **this** or like __this__."
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {
130 |     "slideshow": {
131 |      "slide_type": "slide"
132 |     }
133 |    },
134 |    "source": [
135 |     "Here is an unordered list of items:\n",
136 |     "* This is an item\n",
137 |     "* This is an item\n",
138 |     "* This is an item"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "markdown",
143 |    "metadata": {
144 |     "slideshow": {
145 |      "slide_type": "slide"
146 |     }
147 |    },
148 |    "source": [
149 |     "Here is an ordered list of items:\n",
150 |     "1. This is my first item\n",
151 |     "2. This is my second item\n",
152 |     "3. This is my third item"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "markdown",
157 |    "metadata": {
158 |     "slideshow": {
159 |      "slide_type": "slide"
160 |     }
161 |    },
162 |    "source": [
163 |     "We can have a list of lists by using indentation:\n",
164 |     "* This is an item\n",
165 |     "* This is an item\n",
166 |     "\t* This is an item\n",
167 |     "\t* This is an item\n",
168 |     "* This is an item"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "markdown",
173 |    "metadata": {
174 |     "slideshow": {
175 |      "slide_type": "slide"
176 |     }
177 |    },
178 |    "source": [
179 |     "We can also combine ordered and unordered lists:\n",
180 |     "1. This is my first item\n",
181 |     "2. This is my second item\n",
182 |     "\t* This is an item\n",
183 |     "\t* This is an item\n",
184 |     "3. This is my third item"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "markdown",
189 |    "metadata": {
190 |     "slideshow": {
191 |      "slide_type": "slide"
192 |     }
193 |    },
194 |    "source": [
195 |     "We can make a link to this [useful markdown cheatsheet](https://www.markdownguide.org/cheat-sheet/) as such."
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "markdown",
200 |    "metadata": {
201 |     "slideshow": {
202 |      "slide_type": "fragment"
203 |     }
204 |    },
205 |    "source": [
206 |     "If we don't use the markdown syntax for links, it will just show the link itself as the link text: https://www.markdownguide.org/cheat-sheet/"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "markdown",
211 |    "metadata": {
212 |     "slideshow": {
213 |      "slide_type": "slide"
214 |     }
215 |    },
216 |    "source": [
217 |     "### LaTeX-formatted text"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "markdown",
222 |    "metadata": {
223 |     "slideshow": {
224 |      "slide_type": "fragment"
225 |     }
226 |    },
227 |    "source": [
228 |     "$$ P(A \\mid B) = \\frac{P(B \\mid A) \\, P(A)}{P(B)} $$"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "markdown",
233 |    "metadata": {},
234 |    "source": [
235 |     "### Code Cells\n",
236 |     "\n",
237 |     "Code cells are cells that contain code, that can be executed. \n",
238 |     "\n",
239 |     "Comments can also be written in code cells, indicated by '#'. "
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": 1,
245 |    "metadata": {},
246 |    "outputs": [],
247 |    "source": [
248 |     "# In a code cell, comments can be typed\n",
249 |     "a = 1\n",
250 |     "b = 2"
251 |    ]
252 |   },
253 |   {
254 |    "cell_type": "code",
255 |    "execution_count": 2,
256 |    "metadata": {},
257 |    "outputs": [
258 |     {
259 |      "name": "stdout",
260 |      "output_type": "stream",
261 |      "text": [
262 |       "3\n"
263 |      ]
264 |     }
265 |    ],
266 |    "source": [
267 |     "# Cells can also have output, that gets printed out below the cell.\n",
268 |     "print(a + b)"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": 3,
274 |    "metadata": {
275 |     "slideshow": {
276 |      "slide_type": "slide"
277 |     }
278 |    },
279 |    "outputs": [],
280 |    "source": [
281 |     "# Define a variable in code\n",
282 |     "my_string = 'hello world'"
283 |    ]
284 |   },
285 |   {
286 |    "cell_type": "code",
287 |    "execution_count": 4,
288 |    "metadata": {
289 |     "slideshow": {
290 |      "slide_type": "fragment"
291 |     }
292 |    },
293 |    "outputs": [
294 |     {
295 |      "name": "stdout",
296 |      "output_type": "stream",
297 |      "text": [
298 |       "hello world\n"
299 |      ]
300 |     }
301 |    ],
302 |    "source": [
303 |     "# Print out a variable\n",
304 |     "print(my_string)"
305 |    ]
306 |   },
307 |   {
308 |    "cell_type": "code",
309 |    "execution_count": 5,
310 |    "metadata": {
311 |     "slideshow": {
312 |      "slide_type": "slide"
313 |     }
314 |    },
315 |    "outputs": [
316 |     {
317 |      "data": {
318 |       "text/plain": [
319 |        "'HELLO WORLD'"
320 |       ]
321 |      },
322 |      "execution_count": 5,
323 |      "metadata": {},
324 |      "output_type": "execute_result"
325 |     }
326 |    ],
327 |    "source": [
328 |     "# Operations that return objects get printed out as output\n",
329 |     "my_string.upper()"
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "code",
334 |    "execution_count": 6,
335 |    "metadata": {
336 |     "slideshow": {
337 |      "slide_type": "slide"
338 |     }
339 |    },
340 |    "outputs": [],
341 |    "source": [
342 |     "# Define a list variable\n",
343 |     "my_list = ['a','b','c']"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "code",
348 |    "execution_count": 7,
349 |    "metadata": {
350 |     "slideshow": {
351 |      "slide_type": "fragment"
352 |     }
353 |    },
354 |    "outputs": [
355 |     {
356 |      "name": "stdout",
357 |      "output_type": "stream",
358 |      "text": [
359 |       "['a', 'b', 'c']\n"
360 |      ]
361 |     }
362 |    ],
363 |    "source": [
364 |     "# Print out our list variable\n",
365 |     "print(my_list)"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "markdown",
370 |    "metadata": {},
371 |    "source": [
372 |     "## Accessing Documentation"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "markdown",
377 |    "metadata": {},
378 |    "source": [
379 |     "<div class=\"alert alert-success\">\n",
380 |     "Jupyter has useful shortcuts. Add a single '?' after a function or class to get a window with the documentation, or a double '??' to pull up the source code. \n",
381 |     "</div>"
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "code",
386 |    "execution_count": 8,
387 |    "metadata": {},
388 |    "outputs": [],
389 |    "source": [
390 |     "# Import numpy for examples\n",
391 |     "import numpy as np"
392 |    ]
393 |   },
394 |   {
395 |    "cell_type": "code",
396 |    "execution_count": 9,
397 |    "metadata": {},
398 |    "outputs": [],
399 |    "source": [
400 |     "# Check the docs for a numpy array\n",
401 |     "np.array?"
402 |    ]
403 |   },
404 |   {
405 |    "cell_type": "code",
406 |    "execution_count": 10,
407 |    "metadata": {},
408 |    "outputs": [],
409 |    "source": [
410 |     "# Check the full source code for numpy append function\n",
411 |     "np.append??"
412 |    ]
413 |   },
414 |   {
415 |    "cell_type": "code",
416 |    "execution_count": 11,
417 |    "metadata": {
418 |     "slideshow": {
419 |      "slide_type": "fragment"
420 |     }
421 |    },
422 |    "outputs": [],
423 |    "source": [
424 |     "# Get information about a variable you've created\n",
425 |     "my_string?"
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "markdown",
430 |    "metadata": {},
431 |    "source": [
432 |     "## Autocomplete"
433 |    ]
434 |   },
435 |   {
436 |    "cell_type": "markdown",
437 |    "metadata": {},
438 |    "source": [
439 |     "<div class=\"alert alert-success\">\n",
440 |     "Jupyter also has \n",
441 |     "<a href=\"https://en.wikipedia.org/wiki/Command-line_completion\" class=\"alert-link\">tab complete</a>\n",
442 |     "capacities, which can autocomplete what you are typing, and/or be used to explore what code is available.  \n",
443 |     "</div>"
444 |    ]
445 |   },
446 |   {
447 |    "cell_type": "code",
448 |    "execution_count": 12,
449 |    "metadata": {},
450 |    "outputs": [
451 |     {
452 |      "ename": "SyntaxError",
453 |      "evalue": "invalid syntax (<ipython-input-12-06f09a953826>, line 2)",
454 |      "output_type": "error",
455 |      "traceback": [
456 |       "\u001b[0;36m  File \u001b[0;32m\"<ipython-input-12-06f09a953826>\"\u001b[0;36m, line \u001b[0;32m2\u001b[0m\n\u001b[0;31m    np.\u001b[0m\n\u001b[0m       ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
457 |      ]
458 |     }
459 |    ],
460 |    "source": [
461 |     "# Move your cursor just after the period, press tab, and a drop menu will appear showing all possible completions\n",
462 |     "np."
463 |    ]
464 |   },
465 |   {
466 |    "cell_type": "code",
467 |    "execution_count": null,
468 |    "metadata": {},
469 |    "outputs": [],
470 |    "source": [
471 |     "# Autocomplete does not have to be at a period. Move to the end of 'ra' and hit tab to see completion options. \n",
472 |     "ra"
473 |    ]
474 |   },
475 |   {
476 |    "cell_type": "code",
477 |    "execution_count": null,
478 |    "metadata": {},
479 |    "outputs": [],
480 |    "source": [
481 |     "# If there is only one option, tab-complete will auto-complete what you are typing\n",
482 |     "ran"
483 |    ]
484 |   },
485 |   {
486 |    "cell_type": "markdown",
487 |    "metadata": {},
488 |    "source": [
489 |     "## Kernel & Namespace\n",
490 |     "\n",
491 |     "You do not need to run cells in order! This is useful for flexibly testing and developing code. \n",
492 |     "\n",
493 |     "The numbers in the square brackets to the left of a cell show which cells have been run, and in what order.\n",
494 |     "\n",
495 |     "However, it can also be easy to lose track of what has already been declared / imported, leading to unexpected behaviour from running cells.\n",
496 |     "\n",
497 |     "The kernel is what connects the notebook to your computer behind-the-scenes to execute the code. \n",
498 |     "\n",
499 |     "It can be useful to clear and re-launch the kernel. You can do this from the 'kernel' drop down menu, at the top, optionally also clearing all outputs."
500 |    ]
501 |   },
502 |   {
503 |    "cell_type": "markdown",
504 |    "metadata": {},
505 |    "source": [
506 |     "## Magic Commands"
507 |    ]
508 |   },
509 |   {
510 |    "cell_type": "markdown",
511 |    "metadata": {},
512 |    "source": [
513 |     "<div class=\"alert alert-success\">\n",
514 |     "'Magic Commands' are a special (command-line like) syntax in IPython/Jupyter to run special functionality. They can run on lines and/or entire cells. \n",
515 |     "</div>\n",
516 |     "\n",
517 |     "<div class=\"alert alert-info\">\n",
518 |     "The iPython <a href=\"http://ipython.readthedocs.io/en/stable/interactive/magics.html\" class=\"alert-link\">documentation</a> has more information on magic commands.\n",
519 |     "</div>"
520 |    ]
521 |   },
522 |   {
523 |    "cell_type": "markdown",
524 |    "metadata": {
525 |     "slideshow": {
526 |      "slide_type": "slide"
527 |     }
528 |    },
529 |    "source": [
530 |     "Magic commands are designed to succinctly solve various common problems in standard data analysis. Magic commands come in two flavors: line magics, which are denoted by a single % prefix and operate on a single line of input, and cell magics, which are denoted by a double %% prefix and operate on multiple lines of input."
531 |    ]
532 |   },
533 |   {
534 |    "cell_type": "code",
535 |    "execution_count": null,
536 |    "metadata": {
537 |     "slideshow": {
538 |      "slide_type": "slide"
539 |     }
540 |    },
541 |    "outputs": [],
542 |    "source": [
543 |     "# Access quick reference sheet for interactive Python (this opens a reference guide)\n",
544 |     "%quickref"
545 |    ]
546 |   },
547 |   {
548 |    "cell_type": "code",
549 |    "execution_count": null,
550 |    "metadata": {},
551 |    "outputs": [],
552 |    "source": [
553 |     "# Check a list of available magic commands\n",
554 |     "%lsmagic"
555 |    ]
556 |   },
557 |   {
558 |    "cell_type": "code",
559 |    "execution_count": null,
560 |    "metadata": {
561 |     "slideshow": {
562 |      "slide_type": "slide"
563 |     }
564 |    },
565 |    "outputs": [],
566 |    "source": [
567 |     "# Check the current working directory\n",
568 |     "%pwd"
569 |    ]
570 |   },
571 |   {
572 |    "cell_type": "code",
573 |    "execution_count": null,
574 |    "metadata": {
575 |     "slideshow": {
576 |      "slide_type": "fragment"
577 |     }
578 |    },
579 |    "outputs": [],
580 |    "source": [
581 |     "# Check all currently defined variables\n",
582 |     "%who"
583 |    ]
584 |   },
585 |   {
586 |    "cell_type": "code",
587 |    "execution_count": null,
588 |    "metadata": {
589 |     "slideshow": {
590 |      "slide_type": "fragment"
591 |     }
592 |    },
593 |    "outputs": [],
594 |    "source": [
595 |     "# Check all variables, with more information about them\n",
596 |     "%whos"
597 |    ]
598 |   },
599 |   {
600 |    "cell_type": "code",
601 |    "execution_count": null,
602 |    "metadata": {
603 |     "slideshow": {
604 |      "slide_type": "slide"
605 |     }
606 |    },
607 |    "outputs": [],
608 |    "source": [
609 |     "# Check code history\n",
610 |     "%hist"
611 |    ]
612 |   },
613 |   {
614 |    "cell_type": "markdown",
615 |    "metadata": {},
616 |    "source": [
617 |     "### Line Magics\n",
618 |     "\n",
619 |     "\n",
620 |     "Line magics use a single '%', and apply to a single line. "
621 |    ]
622 |   },
623 |   {
624 |    "cell_type": "code",
625 |    "execution_count": null,
626 |    "metadata": {},
627 |    "outputs": [],
628 |    "source": [
629 |     "# For example, we can time how long it takes to create a large list\n",
630 |     "%timeit list(range(100000))"
631 |    ]
632 |   },
633 |   {
634 |    "cell_type": "markdown",
635 |    "metadata": {},
636 |    "source": [
637 |     "### Cell Magics\n",
638 |     "\n",
639 |     "Cell magics use a double '%%', and apply to the whole cell. "
640 |    ]
641 |   },
642 |   {
643 |    "cell_type": "code",
644 |    "execution_count": null,
645 |    "metadata": {},
646 |    "outputs": [],
647 |    "source": [
648 |     "%%timeit\n",
649 |     "# For example, we could time a whole cell\n",
650 |     "a = list(range(100000))\n",
651 |     "b = [n + 1 for n in a]"
652 |    ]
653 |   },
654 |   {
655 |    "cell_type": "markdown",
656 |    "metadata": {},
657 |    "source": [
658 |     "### Running terminal commands\n",
659 |     "\n",
660 |     "Another nice thing about notebooks is being able to run terminal commands"
661 |    ]
662 |   },
663 |   {
664 |    "cell_type": "code",
665 |    "execution_count": null,
666 |    "metadata": {},
667 |    "outputs": [],
668 |    "source": [
669 |     "# You can run a terminal command by adding '!' to the start of the line\n",
670 |     "!pwd\n",
671 |     "\n",
672 |     "# Note that in this case, '!pwd' is equivalent to line magic '%pwd'. \n",
673 |     "# The '!' syntax is more general though, allowing you to run anything you want through command-line "
674 |    ]
675 |   },
676 |   {
677 |    "cell_type": "code",
678 |    "execution_count": null,
679 |    "metadata": {},
680 |    "outputs": [],
681 |    "source": [
682 |     "%%bash\n",
683 |     "# Equivalently, (for bash) use the %%bash cell magic to run a cell as bash (command-line)\n",
684 |     "pwd"
685 |    ]
686 |   },
687 |   {
688 |    "cell_type": "code",
689 |    "execution_count": null,
690 |    "metadata": {
691 |     "slideshow": {
692 |      "slide_type": "fragment"
693 |     }
694 |    },
695 |    "outputs": [],
696 |    "source": [
697 |     "# List files in directory\n",
698 |     "!ls"
699 |    ]
700 |   },
701 |   {
702 |    "cell_type": "code",
703 |    "execution_count": null,
704 |    "metadata": {
705 |     "slideshow": {
706 |      "slide_type": "fragment"
707 |     }
708 |    },
709 |    "outputs": [],
710 |    "source": [
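711 |     "# Change current directory (note: '!cd' runs in a temporary subshell, so the change does not persist; use the '%cd' magic for that)\n",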
712 |     "!cd ."
713 |    ]
714 |   },
715 |   {
716 |    "cell_type": "markdown",
717 |    "metadata": {},
718 |    "source": [
719 |     "<div class=\"alert alert-info\">\n",
720 |     "For more useful information, check out Jupyter Notebooks \n",
721 |     "<a href=\"https://www.dataquest.io/blog/jupyter-notebook-tips-tricks-shortcuts/\" class=\"alert-link\">tips & tricks</a>, and more information on how \n",
722 |     "<a href=\"http://jupyter.readthedocs.io/en/latest/architecture/how_jupyter_ipython_work.html\" class=\"alert-link\">notebooks work</a>.\n",
723 |     "</div>"
724 |    ]
725 |   }
726 |  ],
727 |  "metadata": {
728 |   "kernelspec": {
729 |    "display_name": "Python 3",
730 |    "language": "python",
731 |    "name": "python3"
732 |   },
733 |   "language_info": {
734 |    "codemirror_mode": {
735 |     "name": "ipython",
736 |     "version": 3
737 |    },
738 |    "file_extension": ".py",
739 |    "mimetype": "text/x-python",
740 |    "name": "python",
741 |    "nbconvert_exporter": "python",
742 |    "pygments_lexer": "ipython3",
743 |    "version": "3.7.4"
744 |   }
745 |  },
746 |  "nbformat": 4,
747 |  "nbformat_minor": 2
748 | }
749 | 


--------------------------------------------------------------------------------
/dsip/tutorials/05-DataGathering.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Data Gathering"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "<div class=\"alert alert-success\">\n",
 15 |     "Data Gathering is the process of accessing data and collecting it together.\n",
 16 |     "</div>"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "This notebook covers strategies for finding and gathering data.\n",
 24 |     "\n",
 25 |     "If you want to start by working on data analyses (with provided data) you can move onto the next tutorials, and come back to this one later.\n",
 26 |     "\n",
 27 |     "Data gathering can encompass many different strategies, including data collection, web scraping, accessing data from databases, and downloading data in bulk. Sometimes it even includes things like calling someone to ask if you can use some of their data, and asking them to send it over. "
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Where to get Data\n",
 35 |     "\n",
 36 |     "There are lots of ways to get data, and lots of places to get it from. Typically, most of this data will be accessed through the internet, in one way or another, especially when pursuing independent research projects. \n",
 37 |     "\n",
 38 |     "### Institutional Access\n",
 39 |     "\n",
 40 |     "If you are working with data as part of an institution, such as a company or research lab, the institution will typically have data it needs analyzing, that it collects in various ways. Keep in mind that even people working inside institutions, with access to local data, will still seek to find and incorporate external datasets. \n",
 41 |     "\n",
 42 |     "### Data Repositories\n",
 43 |     "\n",
 44 |     "**Data repositories** are databases from which you can download data. Some data repositories allow you to explore available datasets and download datasets in bulk. Others may also offer **APIs**, through which you can request specific data from particular databases.\n",
 45 |     "\n",
 46 |     "### Web Scraping\n",
 47 |     "\n",
 48 |     "The web itself is full of unstructured data. **Web scraping** can be done to extract and collect data directly from websites.\n",
 49 |     "\n",
 50 |     "### Asking People for Data\n",
 51 |     "\n",
 52 |     "Not all data is indexed or accessible on the web, at least not publicly. Sometimes finding data means figuring out if any data is available, figuring out where it might be, and then reaching out and asking people directly about data access. If there is some particular data you need, you can try to figure out who might have it, and get in touch to see if it might be available."
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "metadata": {},
 58 |    "source": [
 59 |     "### Data Gathering Skills\n",
 60 |     "\n",
 61 |     "Depending on your gathering method, you will likely have to do some combination of the following:\n",
 62 |     "\n",
 63 |     "- Direct download data files from repositories\n",
 64 |     "- Query databases & use APIs to extract and collect data of interest\n",
 65 |     "- Ask people for data, and go to pick up data with a hard drive\n",
 66 |     "\n",
 67 |     "Ultimately, the goal is to collect and curate data files, hopefully structured, that you can read into Python."
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "markdown",
 72 |    "metadata": {},
 73 |    "source": [
 74 |     "## Definitions: Databases & Query Languages\n",
 75 |     "\n",
 76 |     "Here, we will introduce some useful definitions you will likely encounter when exploring how to gather data. \n",
 77 |     "\n",
 78 |     "Other than these definitions, we will not cover databases & query languages more in these tutorials. "
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "markdown",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "<div class=\"alert alert-success\">\n",
 86 |     "A database is an organized collection of data. More formally, 'database' refers to a set of related data, and the way it is organized. \n",
 87 |     "</div>"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "markdown",
 92 |    "metadata": {},
 93 |    "source": [
 94 |     "<div class=\"alert alert-success\">\n",
 95 |     "A query language is a language for operating with databases, such as retrieving, and sometimes modifying, information from databases.\n",
 96 |     "</div>"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "markdown",
101 |    "metadata": {},
102 |    "source": [
103 |     "<div class=\"alert alert-success\">\n",
104 |     "SQL (pronounced 'sequel') is a common query language used to interact with databases, and request data.\n",
105 |     "</div>\n",
106 |     "\n",
107 |     "<div class=\"alert alert-info\">\n",
108 |     "If you are interested, there is a useful introduction and tutorial to SQL\n",
109 |     "<a href=\"http://www.sqlcourse.com/intro.html\" class=\"alert-link\">here</a>\n",
110 |     "as well as some useful 'cheat sheets' \n",
111 |     "<a href=\"http://www.cheat-sheets.org/sites/sql.su/\" class=\"alert-link\">here</a>\n",
112 |     "and\n",
113 |     "<a href=\"http://www.sqltutorial.org/wp-content/uploads/2016/04/SQL-cheat-sheet.pdf\" class=\"alert-link\">here</a>.\n",
114 |     "</div>"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "markdown",
119 |    "metadata": {},
120 |    "source": [
121 |     "## Data Repositories"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "markdown",
126 |    "metadata": {},
127 |    "source": [
128 |     "<div class=\"alert alert-success\">\n",
129 |     "A Data Repository is basically just a place that data is stored. For our purposes, it is a place you can download data from. \n",
130 |     "</div>\n",
131 |     "\n",
132 |     "<div class=\"alert alert-info\">\n",
133 |     "There is a curated list of good data sources included in the \n",
134 |     "<a href=\"https://github.com/COGS108/Projects\" class=\"alert-link\">project materials</a>.\n",
135 |     "</div>"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "markdown",
140 |    "metadata": {},
141 |    "source": [
142 |     "For our purposes, data repositories are places you can download data directly from, for example [data.gov](https://www.data.gov/)."
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "markdown",
147 |    "metadata": {},
148 |    "source": [
149 |     "## Application Program Interfaces (APIs)"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "markdown",
154 |    "metadata": {},
155 |    "source": [
156 |     "<div class=\"alert alert-success\">\n",
157 |     "APIs are basically a way for software to talk to software - it is an interface into an application / website / database designed for software.\n",
158 |     "</div>\n",
159 |     "\n",
160 |     "<div class=\"alert alert-info\">\n",
161 |     "For a simple explanation of APIs go\n",
162 |     "<a href=\"https://medium.freecodecamp.com/what-is-an-api-in-english-please-b880a3214a82\" class=\"alert-link\">here</a>\n",
163 |     "or for a much broader, more technical, overview try\n",
164 |     "<a href=\"https://medium.com/@mattburgess/apis-a-basic-primer-f8250602597d\" class=\"alert-link\">here</a>.\n",
165 |     "</div>\n",
166 |     "\n",
167 |     "<div class=\"alert alert-info\">\n",
168 |     "This\n",
169 |     "<a href=\"http://www.webopedia.com/TERM/A/API.html\" class=\"alert-link\">list</a>\n",
170 |     "includes a collection of commonly used and available APIs. \n",
171 |     "</div>"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "markdown",
176 |    "metadata": {},
177 |    "source": [
178 |     "APIs offer a lot of functionality - you can send requests to the application to do all kinds of actions. In fact, any application interface that is designed to be used programmatically is an API, including, for example, interfaces for using packages of code. \n",
179 |     "\n",
180 |     "One of the many things that APIs do, and offer, is a way to query and access data from particular applications / databases. For example, there is an API for Google maps that allows for programmatically querying the latitude & longitude positions of given addresses. \n",
181 |     "\n",
182 |     "The benefit of using APIs for data gathering purposes is that they typically return data in nicely structured formats, that are relatively easy to analyze."
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "markdown",
187 |    "metadata": {},
188 |    "source": [
189 |     "### Launching URL Requests from Python\n",
190 |     "\n",
191 |     "In order to use APIs, and for other approaches to collecting data, it may be useful to launch URL requests from Python.\n",
192 |     "\n",
193 |     "Note that by `URL`, we just mean a file or application that can be reached by a web address. Python can be used to organize and launch URL requests, triggering actions and collecting any returned data. \n",
194 |     "\n",
195 |     "In practice, APIs are usually special URLs that return raw data, such as `json` or `XML` files. This is compared to URLs we are typically more used to that return web pages as `html`, which can be rendered for human viewers. The key difference is that APIs return structured data files, whereas `html` files are typically unstructured (more on that later, with web scraping). \n",
196 |     "\n",
197 |     "If you wish to use an API, try to find its documentation to see how to send requests to access whatever data you want. \n",
198 |     "\n",
199 |     "#### API Example\n",
200 |     "\n",
201 |     "For our example here, we will use the Github API. Note that the URL we use is `api.github.com`. This URL accesses the API, and will return structured data files, instead of the html that would be returned by the standard URL (github.com)."
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": 10,
207 |    "metadata": {},
208 |    "outputs": [],
209 |    "source": [
210 |     "import pandas as pd\n",
211 |     "\n",
212 |     "# We will use the `requests` library to launch URL requests from Python\n",
213 |     "import requests"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": 11,
219 |    "metadata": {},
220 |    "outputs": [],
221 |    "source": [
222 |     "# Request data from the Github API on a particular user\n",
223 |     "page = requests.get('https://api.github.com/users/tomdonoghue')"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": 12,
229 |    "metadata": {},
230 |    "outputs": [
231 |     {
232 |      "data": {
233 |       "text/plain": [
234 |        "b'{\"login\":\"TomDonoghue\",\"id\":7727566,\"node_id\":\"MDQ6VXNlcjc3Mjc1NjY=\",\"avatar_url\":\"https://avatars0.githubusercontent.com/u/7727566?v=4\",\"gravatar_id\":\"\",\"url\":\"https://api.github.com/users/TomDonoghue\",\"html_url\":\"https://github.com/TomDonoghue\",\"followers_url\":\"https://api.github.com/users/TomDonoghue/followers\",\"following_url\":\"https://api.github.com/users/TomDonoghue/following{/other_user}\",\"gists_url\":\"https://api.github.com/users/TomDonoghue/gists{/gist_id}\",\"starred_url\":\"https://api.github.com/users/TomDonoghue/starred{/owner}{/repo}\",\"subscriptions_url\":\"https://api.github.com/users/TomDonoghue/subscriptions\",\"organizations_url\":\"https://api.github.com/users/TomDonoghue/orgs\",\"repos_url\":\"https://api.github.com/users/TomDonoghue/repos\",\"events_url\":\"https://api.github.com/users/TomDonoghue/events{/privacy}\",\"received_events_url\":\"https://api.github.com/users/TomDonoghue/received_events\",\"type\":\"User\",\"site_admin\":false,\"name\":\"Tom\",\"company\":\"UC San Diego\",\"blog\":\"https://tomdonoghue.github.io\",\"location\":\"San Diego\",\"email\":null,\"hireable\":null,\"bio\":\"Cognitive Science Grad Student @ UC San Diego working on analyzing electrical brain activity. Also teaching Python & Data Science. \\\\r\\\\n\\\\r\\\\n\",\"twitter_username\":null,\"public_repos\":13,\"public_gists\":0,\"followers\":97,\"following\":83,\"created_at\":\"2014-05-28T20:20:48Z\",\"updated_at\":\"2020-06-19T21:35:12Z\"}'"
235 |       ]
236 |      },
237 |      "execution_count": 12,
238 |      "metadata": {},
239 |      "output_type": "execute_result"
240 |     }
241 |    ],
242 |    "source": [
243 |     "# In this case, the content we get back is a json file\n",
244 |     "page.content"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": 13,
250 |    "metadata": {},
251 |    "outputs": [
252 |     {
253 |      "data": {
254 |       "text/plain": [
255 |        "login                                                        TomDonoghue\n",
256 |        "id                                                               7727566\n",
257 |        "node_id                                             MDQ6VXNlcjc3Mjc1NjY=\n",
258 |        "avatar_url             https://avatars0.githubusercontent.com/u/77275...\n",
259 |        "gravatar_id                                                             \n",
260 |        "url                             https://api.github.com/users/TomDonoghue\n",
261 |        "html_url                                  https://github.com/TomDonoghue\n",
262 |        "followers_url          https://api.github.com/users/TomDonoghue/follo...\n",
263 |        "following_url          https://api.github.com/users/TomDonoghue/follo...\n",
264 |        "gists_url              https://api.github.com/users/TomDonoghue/gists...\n",
265 |        "starred_url            https://api.github.com/users/TomDonoghue/starr...\n",
266 |        "subscriptions_url      https://api.github.com/users/TomDonoghue/subsc...\n",
267 |        "organizations_url          https://api.github.com/users/TomDonoghue/orgs\n",
268 |        "repos_url                 https://api.github.com/users/TomDonoghue/repos\n",
269 |        "events_url             https://api.github.com/users/TomDonoghue/event...\n",
270 |        "received_events_url    https://api.github.com/users/TomDonoghue/recei...\n",
271 |        "type                                                                User\n",
272 |        "site_admin                                                         False\n",
273 |        "name                                                                 Tom\n",
274 |        "company                                                     UC San Diego\n",
275 |        "blog                                       https://tomdonoghue.github.io\n",
276 |        "location                                                       San Diego\n",
277 |        "email                                                               None\n",
278 |        "hireable                                                            None\n",
279 |        "bio                    Cognitive Science Grad Student @ UC San Diego ...\n",
280 |        "twitter_username                                                    None\n",
281 |        "public_repos                                                          13\n",
282 |        "public_gists                                                           0\n",
283 |        "followers                                                             97\n",
284 |        "following                                                             83\n",
285 |        "created_at                                          2014-05-28T20:20:48Z\n",
286 |        "updated_at                                          2020-06-19T21:35:12Z\n",
287 |        "dtype: object"
288 |       ]
289 |      },
290 |      "execution_count": 13,
291 |      "metadata": {},
292 |      "output_type": "execute_result"
293 |     }
294 |    ],
295 |    "source": [
296 |     "# We can read in the json data with pandas\n",
297 |     "pd.read_json(page.content, typ='series')"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "markdown",
302 |    "metadata": {},
303 |    "source": [
304 |     "As we can see above, in a couple lines of code, we can collect a lot of structured data about a particular user.\n",
305 |     "\n",
306 |     "If we wanted to do analyses of Github profiles and activity, we could use the Github API to collect information about a group of users, and then analyze and compare the collected data. "
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "markdown",
311 |    "metadata": {
312 |     "collapsed": true
313 |    },
314 |    "source": [
315 |     "## Web Scraping"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "markdown",
320 |    "metadata": {},
321 |    "source": [
322 |     "<div class=\"alert alert-success\">\n",
323 |     "Web scraping is when you (programmatically) extract data from websites.\n",
324 |     "</div>\n",
325 |     "\n",
326 |     "<div class=\"alert alert-info\">\n",
327 |     "<a href=\"https://en.wikipedia.org/wiki/Web_scraping\" class=\"alert-link\">Wikipedia</a>\n",
328 |     "has a useful page on web scraping.\n",
329 |     "</div>"
330 |    ]
331 |   },
332 |   {
333 |    "cell_type": "markdown",
334 |    "metadata": {},
335 |    "source": [
336 |     "By web scraping, we typically mean something distinct from using the internet to access an API. Rather, web scraping refers to using code to systematically navigate the internet, and extract information of interest, from html or other available files. Note that in this case one is not interacting directly with a database, but simply exploring and collecting whatever is available on web pages.\n",
337 |     "\n",
338 |     "Note that the following section uses the 'BeautifulSoup' module, which is not part of the standard anaconda distribution. \n",
339 |     "\n",
340 |     "If you do not have BeautifulSoup, and want to get it to run this section, you can uncomment the cell below, and run it, to install BeautifulSoup in your current Python environment. You only have to do this once."
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "code",
345 |    "execution_count": 5,
346 |    "metadata": {
347 |     "collapsed": true
348 |    },
349 |    "outputs": [],
350 |    "source": [
351 |     "#import sys\n",
352 |     "#!conda install --yes --prefix {sys.prefix} beautifulsoup4"
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "code",
357 |    "execution_count": 6,
358 |    "metadata": {
359 |     "collapsed": true
360 |    },
361 |    "outputs": [],
362 |    "source": [
363 |     "# Import BeautifulSoup\n",
364 |     "from bs4 import BeautifulSoup"
365 |    ]
366 |   },
367 |   {
368 |    "cell_type": "code",
369 |    "execution_count": 7,
370 |    "metadata": {
371 |     "collapsed": true
372 |    },
373 |    "outputs": [],
374 |    "source": [
375 |     "# Set the URL for the page we wish to scrape\n",
376 |     "site_url = 'https://en.wikipedia.org/wiki/Data_science'\n",
377 |     "\n",
378 |     "# Launch the URL request, to get the page\n",
379 |     "page = requests.get(site_url)"
380 |    ]
381 |   },
382 |   {
383 |    "cell_type": "code",
384 |    "execution_count": 8,
385 |    "metadata": {},
386 |    "outputs": [
387 |     {
388 |      "data": {
389 |       "text/plain": [
390 |        "b'<!DOCTYPE html>\\n<html class=\"client-nojs\" lang=\"en\" dir=\"ltr\">\\n<head>\\n<meta charset=\"UTF-8\"/>\\n<title>Data science - Wikipedia</title>\\n<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\\\s)client-nojs(\\\\s|$)/, \"$1client-js$2\" );</script>\\n<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({\"wgCanonicalNamespace\":\"\",\"wgCanonicalSpecialPageName\":false,\"wgNamespaceNumber\":0,\"wgPageName\":\"Data_science\",\"wgTitle\":\"Data science\",\"wgCurRevisionId\":822535327,\"wgRevisionId\":822535327,\"wgArticleId\":35458904,\"wgIsArticle\":true,\"wgIsRedirect\":false,\"wgAction\":\"view\",\"wgUserName\":null,\"wgUserGroups\":[\"*\"],\"wgCategories\":[\"Use dmy dates from December 2012\",\"Information science\",\"Computer occupations\",\"Computational fields of study\",\"Data analysis\"],\"wgBreakFrames\":false,\"wgPageContentLanguage\":\"en\",\"wgPageContentModel\":\"wikitext\",\"wgSeparatorTransformTable\":[\"\",\"\"],\"wgDigitTransformTable\":[\"\",\"\"],\"wgDefaultDateFormat\":\"dmy\",\"wgMonthNames\":[\"\",\"Ja'"
391 |       ]
392 |      },
393 |      "execution_count": 8,
394 |      "metadata": {},
395 |      "output_type": "execute_result"
396 |     }
397 |    ],
398 |    "source": [
399 |     "# Print out the first 1000 characters of the scraped web page\n",
400 |     "page.content[0:1000]"
401 |    ]
402 |   },
403 |   {
404 |    "cell_type": "markdown",
405 |    "metadata": {},
406 |    "source": [
407 |     "Note that the source of the scraped web-page is a messy pile of HTML. \n",
408 |     "\n",
409 |     "There is a lot of information in there, but with no clear organization. There is some structure in the page though, delineated by HTML tags, etc, we just need to use them to parse out the data. We can do that with BeautifulSoup, which takes in messy documents like this, and parses them based on a specified format. "
410 |    ]
411 |   },
412 |   {
413 |    "cell_type": "code",
414 |    "execution_count": 9,
415 |    "metadata": {
416 |     "collapsed": true
417 |    },
418 |    "outputs": [],
419 |    "source": [
420 |     "# Parse the webpage with Beautiful Soup, using a html parser\n",
421 |     "soup = BeautifulSoup(page.content, 'html.parser')"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": 10,
427 |    "metadata": {
428 |     "scrolled": false
429 |    },
430 |    "outputs": [
431 |     {
432 |      "name": "stdout",
433 |      "output_type": "stream",
434 |      "text": [
435 |       "TITLE: \n",
436 |       "\n",
437 |       "<title>Data science - Wikipedia</title>\n",
438 |       "\n",
439 |       "P-TAG:\n",
440 |       "\n",
441 |       "<p><b>Data science</b>, also known as <b>data-driven science</b>, is an interdisciplinary field of scientific methods, processes, and systems to extract <a href=\"/wiki/Knowledge\" title=\"Knowledge\">knowledge</a> or insights from <a href=\"/wiki/Data\" title=\"Data\">data</a> in various forms, either structured or unstructured,<sup class=\"reference\" id=\"cite_ref-:0_1-0\"><a href=\"#cite_note-:0-1\">[1]</a></sup><sup class=\"reference\" id=\"cite_ref-2\"><a href=\"#cite_note-2\">[2]</a></sup> similar to <a href=\"/wiki/Data_mining\" title=\"Data mining\">data mining</a>.</p>\n"
442 |      ]
443 |     }
444 |    ],
445 |    "source": [
446 |     "# With the parsed soup object, we can select particular segments of the web page\n",
447 |     "\n",
448 |     "# Print out the page title\n",
449 |     "print('TITLE: \\n')\n",
450 |     "print(soup.title)\n",
451 |     "\n",
452 |     "# Print out the first p-tag\n",
453 |     "print('\\nP-TAG:\\n')\n",
454 |     "print(soup.find('p'))"
455 |    ]
456 |   },
457 |   {
458 |    "cell_type": "markdown",
459 |    "metadata": {},
460 |    "source": [
461 |     "From the soup object, you can explore the page in a more organized way, and start to extract particular components of interest.\n",
462 |     "\n",
463 |     "Note that it is still 'messy' in other ways, in that there might or might not be a systematic structure to how the page is laid out, and it still might take a lot of work to extract the particular information you want from it."
464 |    ]
465 |   },
466 |   {
467 |    "cell_type": "markdown",
468 |    "metadata": {},
469 |    "source": [
470 |     "### APIs vs. Web Scraping\n",
471 |     "\n",
472 |     "Web scraping is distinct from using an API, even though many APIs may be accessed over the internet. Web scraping is different in that you are (programmatically) navigating through the internet, and extracting data of interest. \n",
473 |     "\n",
474 |     "Note:\n",
475 |     "Be aware that scraping data from websites (without using APIs) can often be an involved project itself. Web scraping itself can take a considerable amount of time and work to get the data you want. \n",
476 |     "\n",
477 |     "Be aware that data presented on websites may not be well structured, and may not be in an organized format that lends itself to easy collection and analysis.\n",
478 |     "\n",
479 |     "If you try scraping websites, you should also check to make sure you are allowed to scrape the data, and follow the website's terms of service. "
480 |    ]
481 |   }
482 |  ],
483 |  "metadata": {
484 |   "anaconda-cloud": {},
485 |   "kernelspec": {
486 |    "display_name": "Python 3",
487 |    "language": "python",
488 |    "name": "python3"
489 |   },
490 |   "language_info": {
491 |    "codemirror_mode": {
492 |     "name": "ipython",
493 |     "version": 3
494 |    },
495 |    "file_extension": ".py",
496 |    "mimetype": "text/x-python",
497 |    "name": "python",
498 |    "nbconvert_exporter": "python",
499 |    "pygments_lexer": "ipython3",
500 |    "version": "3.7.4"
501 |   }
502 |  },
503 |  "nbformat": 4,
504 |  "nbformat_minor": 1
505 | }
506 | 


--------------------------------------------------------------------------------
/dsip/tutorials/06-DataWrangling.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Data Wrangling"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {
 13 |     "collapsed": true
 14 |    },
 15 |    "source": [
 16 |     "<div class=\"alert alert-success\">\n",
 17 |     "'Data Wrangling' generally refers to transforming raw data into a useable form for your analyses of interest, including loading, aggregating and formatting. \n",
 18 |     "</div>"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "In this notebook, we will focus on loading different types of data files. Other aspects of 'wrangling' such as combining different datasets will be covered in future tutorials, and are explored in the assignments.\n",
 26 |     "\n",
 27 |     "Note: Throughout this notebook, we will be using `!` to run the shell command `cat` to print out the contents of example data files."
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "markdown",
 32 |    "metadata": {},
 33 |    "source": [
 34 |     "## Python I/O\n",
 35 |     "\n",
 36 |     "Let's start with basic Python utilities for reading and loading data files. "
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "markdown",
 41 |    "metadata": {},
 42 |    "source": [
 43 |     "<div class=\"alert alert-info\">\n",
 44 |     "Official Python \n",
 45 |     "<a href=\"https://docs.python.org/3/library/io.html\" class=\"alert-link\">documentation</a> \n",
 46 |     "on input / output.\n",
 47 |     "</div>"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 13,
 53 |    "metadata": {},
 54 |    "outputs": [
 55 |     {
 56 |      "name": "stdout",
 57 |      "output_type": "stream",
 58 |      "text": [
 59 |       "First line of data\r\n",
 60 |       "Second line of data"
 61 |      ]
 62 |     }
 63 |    ],
 64 |    "source": [
 65 |     "# Check out an example data file\n",
 66 |     "!cat files/data.txt"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": 14,
 72 |    "metadata": {},
 73 |    "outputs": [
 74 |     {
 75 |      "name": "stdout",
 76 |      "output_type": "stream",
 77 |      "text": [
 78 |       "First line of data\n",
 79 |       "Second line of data\n"
 80 |      ]
 81 |     }
 82 |    ],
 83 |    "source": [
 84 |     "# First, explicitly open the file object for reading\n",
 85 |     "file_obj = open('files/data.txt', 'r')\n",
 86 |     "\n",
 87 |     "# You can then loop through the file object, grabbing each line of data\n",
 88 |     "for line in file_obj:\n",
 89 |     "    # Here we explicitly remove the new line marker at the end of each line (the '\\n')\n",
 90 |     "    print(line.strip('\\n'))\n",
 91 |     "\n",
 92 |     "# File objects then have to be closed when you are finished with them\n",
 93 |     "file_obj.close()"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "markdown",
 98 |    "metadata": {},
 99 |    "source": [
100 |     "Since opening and closing files basically always go together, there is a shortcut to do both of them together, which is the `with` keyword. \n",
101 |     "\n",
102 |     "By using `with`, file objects will be opened, and then automatically closed at the end of the code block. "
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 15,
108 |    "metadata": {},
109 |    "outputs": [
110 |     {
111 |      "name": "stdout",
112 |      "output_type": "stream",
113 |      "text": [
114 |       "First line of data\n",
115 |       "Second line of data\n"
116 |      ]
117 |     }
118 |    ],
119 |    "source": [
120 |     "# Use 'with' keyword to open, read, and then close a file\n",
121 |     "with open('files/data.txt', 'r') as file_obj:\n",
122 |     "    for line in file_obj:\n",
123 |     "        print(line.strip('\\n'))"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "markdown",
128 |    "metadata": {},
129 |    "source": [
130 |     "Using input / output functionality from standard library Python is a pretty 'low level' way to read data files. This strategy often takes a lot of work to organize and define the details of how files are organized and how to read them. For example, in the above simple example, we had to deal with the new line character explicitly. \n",
131 |     "\n",
132 |     "As long as you have reasonably well structured data files, using standardized file types, you can use higher-level functions that will take care of a lot of these details - loading data straight into `pandas` data objects, for example."
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "markdown",
137 |    "metadata": {},
138 |    "source": [
139 |     "## Pandas I/O"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "markdown",
144 |    "metadata": {},
145 |    "source": [
146 |     "<div class=\"alert alert-success\">\n",
147 |     "Pandas has a range of functions that will automatically read in whole files of standard file types into pandas objects. \n",
148 |     "</div>\n",
149 |     "\n",
150 |     "<div class=\"alert alert-info\">\n",
151 |     "Official Pandas\n",
152 |     "<a href=\"http://pandas.pydata.org/pandas-docs/stable/io.html\" class=\"alert-link\">documentation</a> \n",
153 |     "on input / output. \n",
154 |     "</div>"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": 16,
160 |    "metadata": {},
161 |    "outputs": [],
162 |    "source": [
163 |     "import pandas as pd"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": null,
169 |    "metadata": {},
170 |    "outputs": [],
171 |    "source": [
172 |     "# Tab complete to check out all the read functions available\n",
173 |     "pd.read_"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "markdown",
178 |    "metadata": {},
179 |    "source": [
180 |     "## File types\n",
181 |     "\n",
182 |     "There are many different file types in which data may be stored. \n",
183 |     "\n",
184 |     "Here, we will start by examining CSV and JSON files. "
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "markdown",
189 |    "metadata": {},
190 |    "source": [
191 |     "### CSV Files"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "markdown",
196 |    "metadata": {},
197 |    "source": [
198 |     "<div class=\"alert alert-success\">\n",
199 |     "'Comma Separated Value' files store data, separated by commas. Think of them like lists.\n",
200 |     "</div>\n",
201 |     "\n",
202 |     "<div class=\"alert alert-info\">\n",
203 |     "More information on CSV files from\n",
204 |     "<a href=\"https://en.wikipedia.org/wiki/Comma-separated_values\" class=\"alert-link\">wikipedia</a>. \n",
205 |     "</div>"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "code",
210 |    "execution_count": 17,
211 |    "metadata": {},
212 |    "outputs": [
213 |     {
214 |      "name": "stdout",
215 |      "output_type": "stream",
216 |      "text": [
217 |       "1, 2, 3, 4\r\n",
218 |       "5, 6, 7, 8\r\n",
219 |       "9, 10, 11, 12"
220 |      ]
221 |     }
222 |    ],
223 |    "source": [
224 |     "# Let's have a look at a csv file (printed out in plain text)\n",
225 |     "!cat files/data.csv"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "markdown",
230 |    "metadata": {},
231 |    "source": [
232 |     "#### CSV Files with Python"
233 |    ]
234 |   },
235 |   {
236 |    "cell_type": "code",
237 |    "execution_count": 18,
238 |    "metadata": {},
239 |    "outputs": [],
240 |    "source": [
241 |     "# Python has a module devoted to working with csv's\n",
242 |     "import csv"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": 19,
248 |    "metadata": {},
249 |    "outputs": [
250 |     {
251 |      "name": "stdout",
252 |      "output_type": "stream",
253 |      "text": [
254 |       "1,  2,  3,  4\n",
255 |       "5,  6,  7,  8\n",
256 |       "9,  10,  11,  12\n"
257 |      ]
258 |     }
259 |    ],
260 |    "source": [
261 |     "# We can read through our file with the csv module\n",
262 |     "with open('files/data.csv') as csv_file:\n",
263 |     "    csv_reader = csv.reader(csv_file, delimiter=',')\n",
264 |     "    for row in csv_reader:\n",
265 |     "        print(', '.join(row))"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "markdown",
270 |    "metadata": {},
271 |    "source": [
272 |     "#### CSV Files with Pandas"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": 20,
278 |    "metadata": {},
279 |    "outputs": [],
280 |    "source": [
281 |     "# Pandas also has functions to directly load csv data\n",
282 |     "pd.read_csv?"
283 |    ]
284 |   },
285 |   {
286 |    "cell_type": "code",
287 |    "execution_count": 21,
288 |    "metadata": {},
289 |    "outputs": [
290 |     {
291 |      "data": {
292 |       "text/html": [
293 |        "<div>\n",
294 |        "<style scoped>\n",
295 |        "    .dataframe tbody tr th:only-of-type {\n",
296 |        "        vertical-align: middle;\n",
297 |        "    }\n",
298 |        "\n",
299 |        "    .dataframe tbody tr th {\n",
300 |        "        vertical-align: top;\n",
301 |        "    }\n",
302 |        "\n",
303 |        "    .dataframe thead th {\n",
304 |        "        text-align: right;\n",
305 |        "    }\n",
306 |        "</style>\n",
307 |        "<table border=\"1\" class=\"dataframe\">\n",
308 |        "  <thead>\n",
309 |        "    <tr style=\"text-align: right;\">\n",
310 |        "      <th></th>\n",
311 |        "      <th>0</th>\n",
312 |        "      <th>1</th>\n",
313 |        "      <th>2</th>\n",
314 |        "      <th>3</th>\n",
315 |        "    </tr>\n",
316 |        "  </thead>\n",
317 |        "  <tbody>\n",
318 |        "    <tr>\n",
319 |        "      <td>0</td>\n",
320 |        "      <td>1</td>\n",
321 |        "      <td>2</td>\n",
322 |        "      <td>3</td>\n",
323 |        "      <td>4</td>\n",
324 |        "    </tr>\n",
325 |        "    <tr>\n",
326 |        "      <td>1</td>\n",
327 |        "      <td>5</td>\n",
328 |        "      <td>6</td>\n",
329 |        "      <td>7</td>\n",
330 |        "      <td>8</td>\n",
331 |        "    </tr>\n",
332 |        "    <tr>\n",
333 |        "      <td>2</td>\n",
334 |        "      <td>9</td>\n",
335 |        "      <td>10</td>\n",
336 |        "      <td>11</td>\n",
337 |        "      <td>12</td>\n",
338 |        "    </tr>\n",
339 |        "  </tbody>\n",
340 |        "</table>\n",
341 |        "</div>"
342 |       ],
343 |       "text/plain": [
344 |        "   0   1   2   3\n",
345 |        "0  1   2   3   4\n",
346 |        "1  5   6   7   8\n",
347 |        "2  9  10  11  12"
348 |       ]
349 |      },
350 |      "execution_count": 21,
351 |      "metadata": {},
352 |      "output_type": "execute_result"
353 |     }
354 |    ],
355 |    "source": [
356 |     "# Let's read in our csv file\n",
357 |     "pd.read_csv(open('files/data.csv'), header=None)"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "markdown",
362 |    "metadata": {},
363 |    "source": [
364 |     "As we can see, using `Pandas` saves us from having to do more work (write more code) to load the file. "
365 |    ]
366 |   },
367 |   {
368 |    "cell_type": "markdown",
369 |    "metadata": {},
370 |    "source": [
371 |     "### JSON Files"
372 |    ]
373 |   },
374 |   {
375 |    "cell_type": "markdown",
376 |    "metadata": {},
377 |    "source": [
378 |     "<div class=\"alert alert-success\">\n",
379 |     "JavaScript Object Notation files can store hierarchical key/value pairings. Think of them like dictionaries.\n",
380 |     "</div>\n",
381 |     "\n",
382 |     "<div class=\"alert alert-info\">\n",
383 |     "More information on JSON files from\n",
384 |     "<a href=\"https://en.wikipedia.org/wiki/JSON\" class=\"alert-link\">wikipedia</a>.\n",
385 |     "</div>"
386 |    ]
387 |   },
388 |   {
389 |    "cell_type": "code",
390 |    "execution_count": 22,
391 |    "metadata": {},
392 |    "outputs": [
393 |     {
394 |      "name": "stdout",
395 |      "output_type": "stream",
396 |      "text": [
397 |       "{\r\n",
398 |       "  \"firstName\": \"John\",\r\n",
399 |       "  \"age\": 53\r\n",
400 |       "}\r\n"
401 |      ]
402 |     }
403 |    ],
404 |    "source": [
405 |     "# Let's have a look at a json file (printed out in plain text)\n",
406 |     "!cat files/data.json"
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": 23,
412 |    "metadata": {},
413 |    "outputs": [
414 |     {
415 |      "name": "stdout",
416 |      "output_type": "stream",
417 |      "text": [
418 |       "{'firstName': 'John', 'age': '53'}\n"
419 |      ]
420 |     }
421 |    ],
422 |    "source": [
423 |     "# Think of json's as similar to dictionaries\n",
424 |     "d = {'firstName': 'John', 'age': '53'}\n",
425 |     "print(d)"
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "markdown",
430 |    "metadata": {},
431 |    "source": [
432 |     "#### JSON Files with Python"
433 |    ]
434 |   },
435 |   {
436 |    "cell_type": "code",
437 |    "execution_count": 24,
438 |    "metadata": {},
439 |    "outputs": [],
440 |    "source": [
441 |     "# Python also has a module for dealing with json\n",
442 |     "import json"
443 |    ]
444 |   },
445 |   {
446 |    "cell_type": "code",
447 |    "execution_count": 25,
448 |    "metadata": {},
449 |    "outputs": [],
450 |    "source": [
451 |     "# Load a json file\n",
452 |     "with open('files/data.json') as dat_file:    \n",
453 |     "    dat = json.load(dat_file)"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "code",
458 |    "execution_count": 26,
459 |    "metadata": {},
460 |    "outputs": [
461 |     {
462 |      "name": "stdout",
463 |      "output_type": "stream",
464 |      "text": [
465 |       "<class 'dict'>\n"
466 |      ]
467 |     }
468 |    ],
469 |    "source": [
470 |     "# Check what data type this gets loaded as\n",
471 |     "print(type(dat))"
472 |    ]
473 |   },
474 |   {
475 |    "cell_type": "markdown",
476 |    "metadata": {},
477 |    "source": [
478 |     "#### JSON Files with Pandas"
479 |    ]
480 |   },
481 |   {
482 |    "cell_type": "code",
483 |    "execution_count": 27,
484 |    "metadata": {},
485 |    "outputs": [],
486 |    "source": [
487 |     "# Pandas also has support for reading in json files\n",
488 |     "pd.read_json?"
489 |    ]
490 |   },
491 |   {
492 |    "cell_type": "code",
493 |    "execution_count": 28,
494 |    "metadata": {},
495 |    "outputs": [
496 |     {
497 |      "data": {
498 |       "text/plain": [
499 |        "first          Alan\n",
500 |        "place    Manchester\n",
501 |        "dtype: object"
502 |       ]
503 |      },
504 |      "execution_count": 28,
505 |      "metadata": {},
506 |      "output_type": "execute_result"
507 |     }
508 |    ],
509 |    "source": [
510 |     "# You can read in json formatted strings with pandas\n",
511 |     "#  Note that here I am specifying to read it in as a pd.Series, as there is a single line of data\n",
512 |     "pd.read_json('{ \"first\": \"Alan\", \"place\": \"Manchester\"}', typ='series')"
513 |    ]
514 |   },
515 |   {
516 |    "cell_type": "code",
517 |    "execution_count": 29,
518 |    "metadata": {},
519 |    "outputs": [
520 |     {
521 |      "data": {
522 |       "text/plain": [
523 |        "firstName    John\n",
524 |        "age            53\n",
525 |        "dtype: object"
526 |       ]
527 |      },
528 |      "execution_count": 29,
529 |      "metadata": {},
530 |      "output_type": "execute_result"
531 |     }
532 |    ],
533 |    "source": [
534 |     "# Read in our json file with pandas\n",
535 |     "pd.read_json(open('files/data.json'), typ='series')"
536 |    ]
537 |   },
538 |   {
539 |    "cell_type": "markdown",
540 |    "metadata": {},
541 |    "source": [
542 |     "## Conclusion\n",
543 |     "\n",
544 |     "As a general guideline, for loading and wrangling data files, using standardized data files, and loading them with 'higher-level' tools such as `Pandas` makes it easier to work with data files. "
545 |    ]
546 |   }
547 |  ],
548 |  "metadata": {
549 |   "anaconda-cloud": {},
550 |   "kernelspec": {
551 |    "display_name": "Python 3",
552 |    "language": "python",
553 |    "name": "python3"
554 |   },
555 |   "language_info": {
556 |    "codemirror_mode": {
557 |     "name": "ipython",
558 |     "version": 3
559 |    },
560 |    "file_extension": ".py",
561 |    "mimetype": "text/x-python",
562 |    "name": "python",
563 |    "nbconvert_exporter": "python",
564 |    "pygments_lexer": "ipython3",
565 |    "version": "3.7.4"
566 |   }
567 |  },
568 |  "nbformat": 4,
569 |  "nbformat_minor": 1
570 | }
571 | 


--------------------------------------------------------------------------------
/dsip/tutorials/08-DataPrivacy&Anonymization.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "# Data Privacy & Anonymization\n",
 10 |     "\n",
 11 |     "A lot of data, perhaps the vast majority of data typically used in data science, is, directly or indirectly, about people. \n",
 12 |     "\n",
 13 |     "Individuals have privacy rights regarding who can know or share information about specifically identified individuals. This is true in particular about certain classes of sensitive information. For example, health-related information has special protections. Regardless of the data type, data privacy and security should also be a key concern when analyzing human data."
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "markdown",
 18 |    "metadata": {},
 19 |    "source": [
 20 |     "## Information Privacy"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "markdown",
 25 |    "metadata": {},
 26 |    "source": [
 27 |     "<div class=\"alert alert-success\">\n",
 28 |     "Information (or Data) Privacy refers to the legal, ethical, and practical issues of collecting, using and releasing data in which there is identifiable information about people included in the dataset. It also deals with when and how to deal with data privacy issues, and how to protect users' privacy.\n",
 29 |     "</div>\n",
 30 |     "\n",
 31 |     "<div class=\"alert alert-info\">\n",
 32 |     "<a href=\"https://en.wikipedia.org/wiki/Information_privacy\" class=\"alert-link\">Wikipedia</a>\n",
 33 |     "has an overview of information privacy.\n",
 34 |     "</div>"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "markdown",
 39 |    "metadata": {},
 40 |    "source": [
 41 |     "## Anonymization"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "markdown",
 46 |    "metadata": {},
 47 |    "source": [
 48 |     "<div class=\"alert alert-success\">\n",
 49 |     "Data Anonymization is a type of information sanitization - that is the removal of sensitive information - for the purpose of privacy protection. It is a procedure to modify a data set such that the individuals it reflects are anonymous. Typically this means the removal of personally identifiable information from data sets such that the identities of individuals contained in the data set are anonymous.\n",
 50 |     "</div>\n",
 51 |     "\n",
 52 |     "<div class=\"alert alert-info\">\n",
 53 |     "<a href=\"https://en.wikipedia.org/wiki/Data_anonymization\" class=\"alert-link\">Wikipedia</a>\n",
 54 |     "also has an overview of data anonymization.\n",
 55 |     "</div>"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "markdown",
 60 |    "metadata": {},
 61 |    "source": [
 62 |     "Data protection and anonymization are interdisciplinary components of data science and data practice. Data protection includes everything from considerations of the ethics & legalities of data use, to the practical and technical challenges of protecting and anonymizing data. \n",
 63 |     "\n",
 64 |     "Anonymizing data typically comes down to removing any personally identifiable data from a dataset, or, if this information must be kept, separating the identifiable data from sensitive information. \n",
 65 |     "\n",
 66 |     "Part of the difficulty of data anonymization is that while we can provably demonstrate that a given dataset is anonymized, this rests on particular assumptions. Most notably, datasets are only provably anonymized under the assumption that no extra external information is available to be used to attempt to re-identify the individuals in them. In practice, this means that de-anonymization of data can often be done by combining multiple datasets. By using information from multiple information sources, one can often use processes of elimination to decode the individuals included in a particular dataset."
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "markdown",
 71 |    "metadata": {},
 72 |    "source": [
 73 |     "# Regulations\n",
 74 |     "\n",
 75 |     "There are many official guidelines, rules and standards for data privacy and user identity protection, although much of it is case specific. \n",
 76 |     "\n",
 77 |     "At the minimum, what is legally required in terms of data protection depends, amongst other things, on:\n",
 78 |     "- What the data is / contains, and who it is about, \n",
 79 |     "    - Certain data types, and/or populations may have special protections, for example health-related information.\n",
 80 |     "- Who owns the data and in what capacity they are acting (company, university, etc.)\n",
 81 |     "    - For example, regulations for scientific research are different than those for companies\n",
 82 |     "- User agreements / consent procedures that were in place when the data were collected. \n",
 83 |     "    - Individuals have a right to self-determination in terms of what their data is used for. Data should only be used for things that are covered by its terms of use / terms of collection / consent procedures.\n",
 84 |     "- What the data is to be used for.\n",
 85 |     "    - Often a combination of the what and the who, there may be specific regulations about how you must deal with the data, and what you can do with it, based on the goal of having and using the data.\n",
 86 |     "- Where the data was collected and where it is stored, and who it is about.\n",
 87 |     "    - Different regions (countries, etc) often have different regulations.\n",
 88 |     "\n",
 89 |     "Many of these regulations apply more directly to the collection, storage, and release of datasets (rather than analysis), but aspects also apply to the use of datasets, including publicly available datasets. Available datasets often have a user agreement for using the data, and, in particular, attempting to identify individuals from datasets may at a minimum break user agreements, and/or (depending on the nature of the data) be illegal based on consumer and research subject protection laws. "
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "markdown",
 94 |    "metadata": {},
 95 |    "source": [
 96 |     "## Research Standards"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "markdown",
101 |    "metadata": {},
102 |    "source": [
103 |     "<div class=\"alert alert-success\">\n",
104 |     "Data collected and used for research purposes has its own set of guidelines and requirements regarding the treatment of human subjects, and the collection, storage, use, and dissemination of data. These regulations focus, among other things, on the right to self-determination of human subjects to consent to what data is collected, and how it is used. Data collected for research purposes must follow restrictions based on these consent procedures. \n",
105 |     "</div>\n",
106 |     "\n",
107 |     "<div class=\"alert alert-info\">\n",
108 |     "Research protections under the\n",
109 |     "<a href=\"https://en.wikipedia.org/wiki/Declaration_of_Helsinki\" class=\"alert-link\">Declaration of Helsinki</a>.\n",
110 |     "</div>"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "markdown",
115 |    "metadata": {},
116 |    "source": [
117 |     "## HIPAA - Protection for Health Related Information"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "<div class=\"alert alert-success\">\n",
125 |     "The Health Insurance Portability and Accountability Act (HIPAA) is a US federal government regulation that standardizes and protects individuals' medical records and health related data. It includes terms for how data must be stored, and how they can be used & shared.\n",
126 |     "</div>\n",
127 |     "\n",
128 |     "<div class=\"alert alert-info\">\n",
129 |     "The official US federal government HIPAA information\n",
130 |     "<a href=\"https://www.hhs.gov/hipaa/\" class=\"alert-link\">guidelines</a>\n",
131 |     "include an overview of HIPAA.\n",
132 |     "</div>"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "markdown",
137 |    "metadata": {},
138 |    "source": [
139 |     "## Safe Harbor Method"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "markdown",
144 |    "metadata": {},
145 |    "source": [
146 |     "<div class=\"alert alert-success\">\n",
147 |     "Safe Harbor is an official agreement regarding how to deal with datasets that have personal data. It describes specific guidelines on what information to remove from datasets in order to anonymize them. It is a single set of data protection requirements shared across many contexts and countries.\n",
148 |     "</div>\n",
149 |     "\n",
150 |     "<div class=\"alert alert-info\">\n",
151 |     "The \n",
152 |     "<a href=\"https://www.hhs.gov/hipaa/for-professionals/privacy/special-topics/de-identification/\" class=\"alert-link\">official documentation</a>\n",
153 |     "for Safe Harbor includes guidelines on how to anonymize data.\n",
154 |     "</div>"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "markdown",
159 |    "metadata": {},
160 |    "source": [
161 |     "The Safe Harbor method requires that the following identifiers of the individuals be removed:\n",
162 |     "- Names\n",
163 |     "- Geographic Subdivisions smaller than a state**\n",
164 |     "- Dates (such as birth dates, etc), and all ages above 90\n",
165 |     "- Telephone Numbers\n",
166 |     "- Vehicle Identification Numbers\n",
167 |     "- Fax numbers\n",
168 |     "- Device identifiers and serial numbers\n",
169 |     "- Email addresses\n",
170 |     "- Web Universal Resource Locators (URLs)\n",
171 |     "- Social security numbers\n",
172 |     "- Internet Protocol (IP) addresses\n",
173 |     "- Medical record numbers\n",
174 |     "- Biometric identifiers, including finger and voice prints\n",
175 |     "- Health plan beneficiary numbers\n",
176 |     "- Full-face photographs and any comparable images\n",
177 |     "- Account numbers\n",
178 |     "- Certificate/license numbers\n",
179 |     "- Any other unique identifying number, characteristic, or code\n",
180 |     "\n",
181 |     "** The first three numbers of the zip code can be kept, provided that more than 20,000 people live in the region covered by all the zip codes that share the same initial three digits (the same geographic subdivision). "
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "markdown",
186 |    "metadata": {},
187 |    "source": [
188 |     "### Unique Identifiers\n",
189 |     "\n",
190 |     "The goal of Safe Harbor, and Data Anonymization in general, is to remove any unique information that could be used to identify you. \n",
191 |     "\n",
192 |     "This is perhaps most obvious for things like names. Other, perhaps less obvious specifications of Safe Harbor, are also based on the idea that having this information in a dataset creates a risk for identification of individuals contained in the dataset. \n",
193 |     "\n",
194 |     "For example, while it may be innocuous to talk about a 37 year old male who lives in Los Angeles (as there are many candidates, such that the specific individual is not revealed), it might actually be quite obvious who the person is when talking about a 37 year old male who lives in Potrero, California, a town of about 700 people. This is the same reason ages above 90 have to be removed - even in a fairly large area, say San Diego, it may be fairly obvious who the 98 year old female participant is. \n",
195 |     "\n",
196 |     "Basically - any information that makes you stand out is liable to identify you. Anonymization attempts to remove these kinds of indications from the data, such that individuals do not stand out in a way that lets someone figure out who they are.\n",
197 |     "\n",
198 |     "This also underlies the difficulty in protecting data in the face of multiple data sources, since collecting observations together makes it much easier to start to pick out people more uniquely. It may still be relatively easy to identify the 37 year old male from LA if you also happen to know (or figure out) that he has a poodle, is 5'6\", works at UCLA, and was at Griffith Park on Saturday, April 15th. All of this extra information may be relatively easy to figure out by combining publicly available, or easily obtainable, data."
199 |    ]
200 |   }
201 |  ],
202 |  "metadata": {
203 |   "kernelspec": {
204 |    "display_name": "Python 3",
205 |    "language": "python",
206 |    "name": "python3"
207 |   },
208 |   "language_info": {
209 |    "codemirror_mode": {
210 |     "name": "ipython",
211 |     "version": 3
212 |    },
213 |    "file_extension": ".py",
214 |    "mimetype": "text/x-python",
215 |    "name": "python",
216 |    "nbconvert_exporter": "python",
217 |    "pygments_lexer": "ipython3",
218 |    "version": "3.7.4"
219 |   }
220 |  },
221 |  "nbformat": 4,
222 |  "nbformat_minor": 2
223 | }
224 | 


--------------------------------------------------------------------------------
/dsip/tutorials/12-StatisticalComparisons.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Statistical Comparisons\n",
  8 |     "\n",
  9 |     "Whenever we have data, we often want to use statistical analyses to explore, compare, and quantify our data. \n",
 10 |     "\n",
 11 |     "In this notebook, we will briefly introduce and explore some common statistical tests that can be applied to data. \n",
 12 |     "\n",
 13 |     "As with many of the topics in data analysis and machine learning, this tutorial is focused on introducing some related topics for data science, and demonstrating their application in Python, but it is out of scope of these tutorials to systematically introduce and describe the topic at hand, which in this case is statistics. If the topics here are unfamiliar, we recommend you follow the links or look for other resources to learn more about these topics. "
 14 |    ]
 15 |   },
 16 |   {
 17 |    "cell_type": "code",
 18 |    "execution_count": 1,
 19 |    "metadata": {},
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "%matplotlib inline\n",
 23 |     "\n",
 24 |     "import numpy as np\n",
 25 |     "import matplotlib.pyplot as plt\n",
 26 |     "\n",
 27 |     "from scipy.stats import norm"
 28 |    ]
 29 |   },
 30 |   {
 31 |    "cell_type": "code",
 32 |    "execution_count": 2,
 33 |    "metadata": {},
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "# Set random seed, for consistency simulating data\n",
 37 |     "np.random.seed(21)"
 38 |    ]
 39 |   },
 40 |   {
 41 |    "cell_type": "markdown",
 42 |    "metadata": {},
 43 |    "source": [
 44 |     "## On Causality\n",
 45 |     "\n",
 46 |     "Before we dive into particular statistical tests, just a general reminder that though we would often like to understand the _causal structure_ of the data that we are interested in, this is generally not directly interpretable from statistical tests themselves. \n",
 47 |     "\n",
 48 |     "In the following, we will explore some statistical tests for investigating if and when distributions of data are the same or different, and if and how related they are. These tests, by themselves, do not tell us about what causes what. Correlation is not causation.\n",
 49 |     "\n",
 50 |     "In the context of data science, this can be a limitation as we are often using previously collected datasets of convenience and observational datasets. Though we can explore the structure of the data, such datasets typically do not allow for causal interpretations."
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "markdown",
 55 |    "metadata": {},
 56 |    "source": [
 57 |     "## Correlations\n",
 58 |     "\n",
 59 |     "A common question we may be interested in is if two datasets, or two features of data, are related to each other. \n",
 60 |     "\n",
 61 |     "If they are, we would also like to know _how_ related they are to each other. \n",
 62 |     "\n",
 63 |     "For this, we can calculate correlations between features. "
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "markdown",
 68 |    "metadata": {},
 69 |    "source": [
 70 |     "<div class=\"alert alert-success\">\n",
 71 |     "Correlations are statistical dependencies or relationships between variables. \n",
 72 |     "</div>\n",
 73 |     "\n",
 74 |     "<div class=\"alert alert-info\">\n",
 75 |     "Correlation on \n",
 76 |     "<a href=https://en.wikipedia.org/wiki/Correlation_and_dependence class=\"alert-link\">wikipedia</a>, \n",
 77 |     "including for the \n",
 78 |     "<a href=https://en.wikipedia.org/wiki/Pearson_correlation_coefficient class=\"alert-link\">pearson</a>, \n",
 79 |     "and \n",
 80 |     "<a href=https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient class=\"alert-link\">spearman</a>\n",
 81 |     "correlation measures.     \n",
 82 |     "</div>"
 83 |    ]
 84 |   },
 85 |   {
 86 |    "cell_type": "code",
 87 |    "execution_count": 3,
 88 |    "metadata": {},
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "from scipy.stats import pearsonr, spearmanr"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "markdown",
 96 |    "metadata": {},
 97 |    "source": [
 98 |     "### Simulate Data\n",
 99 |     "\n",
100 |     "First, let's simulate some data. \n",
101 |     "\n",
102 |     "For this example, we will simulate two arrays of data that do have a relationship to each other. "
103 |    ]
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": 4,
108 |    "metadata": {},
109 |    "outputs": [],
110 |    "source": [
111 |     "# Settings for simulated data\n",
112 |     "corr = 0.75\n",
113 |     "covs = [[1, corr], [corr, 1]]\n",
114 |     "means = [0, 0]\n",
115 |     "\n",
116 |     "# Generate the data\n",
117 |     "d1, d2 = np.random.multivariate_normal(means, covs, 1000).T"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "markdown",
122 |    "metadata": {},
123 |    "source": [
124 |     "### Calculate Correlations\n",
125 |     "\n",
126 |     "Next, we can calculate the correlation coefficient between our data arrays, using the `pearsonr` function from `scipy.stats`. "
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 5,
132 |    "metadata": {},
133 |    "outputs": [],
134 |    "source": [
135 |     "# Calculate a pearson correlation between two arrays of data\n",
136 |     "r_val, p_val = pearsonr(d1, d2)"
137 |    ]
138 |   },
139 |   {
140 |    "cell_type": "code",
141 |    "execution_count": 6,
142 |    "metadata": {},
143 |    "outputs": [
144 |     {
145 |      "name": "stdout",
146 |      "output_type": "stream",
147 |      "text": [
148 |       "The correlation coefficient is 0.7732 with a p-value of 0.00.\n"
149 |      ]
150 |     }
151 |    ],
152 |    "source": [
153 |     "print(\"The correlation coefficient is {:1.4f} with a p-value of {:1.2f}.\".format(r_val, p_val))"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "markdown",
158 |    "metadata": {},
159 |    "source": [
160 |     "In this case, we have a high correlation coefficient, with a very low p-value. \n",
161 |     "\n",
162 |     "This suggests our data are strongly correlated!\n",
163 |     "\n",
164 |     "In this case, since we simulated the data, we know that this is a good estimation of the relationship between the data."
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "markdown",
169 |    "metadata": {},
170 |    "source": [
171 |     "### Rank Correlations\n",
172 |     "\n",
173 |     "One thing to keep in mind is that the `pearson` correlation used above assumes that both data distributions are normally distributed.\n",
174 |     "\n",
175 |     "These assumptions should also be tested in data to be analyzed. \n",
176 |     "\n",
177 |     "Sometimes these assumptions will not be met. In that case, one option is to use a different kind of correlation measure. For example, the `spearman` correlation is a rank correlation that does not have the same assumptions as pearson."
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": 7,
183 |    "metadata": {},
184 |    "outputs": [
185 |     {
186 |      "name": "stdout",
187 |      "output_type": "stream",
188 |      "text": [
189 |       "The correlation coefficient is 0.7595 with a p-value of 0.00.\n"
190 |      ]
191 |     }
192 |    ],
193 |    "source": [
194 |     "# Calculate the spearman rank correlation between our data\n",
195 |     "r_val, p_val = spearmanr(d1, d2)\n",
196 |     "\n",
197 |     "# Check the results of the spearman correlation\n",
198 |     "print(\"The correlation coefficient is {:1.4f} with a p-value of {:1.2f}.\".format(r_val, p_val))"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "markdown",
203 |    "metadata": {},
204 |    "source": [
205 |     "In this case, the measured values for `pearson` and `spearman` correlations are about the same, since both are appropriate for the properties of this data."
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "markdown",
210 |    "metadata": {},
211 |    "source": [
212 |     "## T-Tests\n",
213 |     "\n",
214 |     "Another question we might often want to ask about data is to check and detect when there is a significant difference between collections of data. \n",
215 |     "\n",
216 |     "For example, we might want to analyze if there is a significant difference in measured feature values between some groups of interest. \n",
217 |     "\n",
218 |     "To do so, we can use t-tests. \n",
219 |     "\n",
220 |     "There are different variants of t-test, including:\n",
221 |     "- one sample t-test\n",
222 |     "    - test the mean of one group of data\n",
223 |     "- independent samples t-test\n",
224 |     "    - test for a difference of means between two independent samples of data\n",
225 |     "- related samples t-test\n",
226 |     "    - test for a difference of means between two related samples of data\n",
227 |     "    \n",
228 |     "For this example, we will explore using the independent samples t-test. \n",
229 |     "\n",
230 |     "Functions for the other versions are also available in `scipy.stats`. "
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "markdown",
235 |    "metadata": {},
236 |    "source": [
237 |     "<div class=\"alert alert-success\">\n",
238 |     "T-tests are statistical hypothesis tests for examining mean values and differences of groups of data. \n",
239 |     "</div>\n",
240 |     "\n",
241 |     "<div class=\"alert alert-info\">\n",
242 |     "T-tests on\n",
243 |     "<a href=https://en.wikipedia.org/wiki/Student%27s_t-test class=\"alert-link\">wikipedia</a>. \n",
244 |     "</div>"
245 |    ]
246 |   },
247 |   {
248 |    "cell_type": "code",
249 |    "execution_count": 8,
250 |    "metadata": {},
251 |    "outputs": [],
252 |    "source": [
253 |     "from scipy.stats import ttest_ind"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "markdown",
258 |    "metadata": {},
259 |    "source": [
260 |     "### Simulate Data\n",
261 |     "\n",
262 |     "First, let's simulate some data. \n",
263 |     "\n",
264 |     "For this example, we will simulate two samples of normally distributed data. "
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": 9,
270 |    "metadata": {},
271 |    "outputs": [],
272 |    "source": [
273 |     "# Settings for data simulation\n",
274 |     "n_samples = 250\n",
275 |     "\n",
276 |     "# Simulate some data\n",
277 |     "d1 = norm.rvs(loc=0.5, scale=1, size=n_samples)\n",
278 |     "d2 = norm.rvs(loc=0.75, scale=1, size=n_samples)"
279 |    ]
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": 10,
284 |    "metadata": {},
285 |    "outputs": [
286 |     {
287 |      "data": {
288 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXAAAAD4CAYAAAD1jb0+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAWR0lEQVR4nO3dfXBV9Z3H8feXEAhFWxSTLCXSgBVaFYU1VZCOrSjWVkexBbe1UlQctrN1R0BHsWUdFWekPsXO0HYn40MBbStqBdqObVIWZXYKVrCUh4KoGYpRNklT8amEx+/+kYNivLnnJLn33PzC5zXD3Jxzv/eezzDw5fA7v/M75u6IiEh4+hQ6gIiIdI0auIhIoNTARUQCpQYuIhIoNXARkUD1TfNgJ5xwgldWVqZ5SOnl1q9vez3zzCxF/4iKjs9WJNJzrV+//u/uXtp+v6U5jbCqqsrXrVuX2vGk9zNre836x/jnUdGVmjIrYTKz9e5e1X6/hlBERAKlBi4iEig1cBGRQKV6EVNEJF/2799PQ0MDra2thY7SZSUlJVRUVFBcXJyoXg1cgpboGrwuXh4VGhoaOPbYY6msrMQOX90OiLvT0tJCQ0MDw4cPT/QZDaGISK/Q2trK4MGDg2zeAGbG4MGDO/U/CDVwEek1Qm3eh3U2vxq4BO3MM2Nu4gF49sy2XyK9jMbAJWgvvZSg6K0kRdLbVNdtz+n3zZ40MramqKiI0aNHs3//fvr27cv06dOZNWsWffr0oaWlhSlTpvDiiy9y9dVXs3Dhwm5nUgOXHqXzf+lGxn5udoLvTvKXUyTOgAED2LBhAwBNTU1ceeWVvP3229xxxx2UlJQwf/58Nm/ezObNm3NyPA2hiIjkQVlZGTU1NSxcuBB3Z+DAgXzxi1+kpKQkZ8dQAxcRyZMRI0Zw6NAhmpqa8vL9auAiInmUzwUD1cBFRPKkvr6eoqIiysrK8vL9uogpQRv31d2xNZtKrkghichHNTc3893vfpfrr78+b/PT1cAlaFfMjh9b/MOx81NIIj1NIWYW7dmzhzFjxnwwjXDatGnMmTPng/crKyt555132LdvH8uWLaO2tpZTTjmly8dTAxcRyZGDBw9mfX/Hjh05PV7sGLiZjTKzDUf8esfMZpnZ8WZWZ2avRK/H5TSZSAKvb+/P69v7Z60p27+Zsv25mXcr0pPENnB3f9ndx7j7GOBM4J/AM8BcYKW7nwysjLZFUlV9/Weovv4zWWu+vfsbfHv3N1JKJJKezs5COR94zd3/BlwGLIr2LwIm5zKYiIhk19kG/k3gF9HP5e6+CyB6zThPxsxmmtk6M1vX3Nzc9aQiIvIRiRu4mfUDLgWe7MwB3L3G3avcvaq0tLSz+UREpAOdOQP/KvCSuzdG241mNgQges3PvaIiIpJRZ6YRfosPh08AVgDTgQXR6/Ic5hIR6Z5Vd+f2+867NbYk23KydXV1zJ07l3379tGvXz/uvfdeJk6c2K1IiRq4mX0CmAT8+xG7FwBLzWwGsBOY2q0kIiKBy7ac7AknnMCvf/1rPv3pT7N582a+8pWv8MYbb3TreIkauLv/Exjcbl8LbbNSRApm9sK/xdY8PujpFJKIfNTh5WS/8IUvcPvttzN27NgP3jv11FNpbW1l79699O+f/T6GbHQnpgTtxJF7Y2uaik9LIYnIxx25nGx5efkH+59++mnGjh3breYNauAiInnVfjnZLVu2cMstt1BbW9vt79ZyshK0pdVlLK3OvlTnBe/+Fxe8+18pJRL5UPvlZBsaGrj88stZvHgxJ510Ure/Xw1cgrb22UGsfXZQ1prRrUsZ3bo0pUQibdovJ7t7924uvvhi7r77biZMmJCTY2gIRUR6pwTT/nIt23KyCxcu5NVXX2X+/PnMn9+2xHFtbW23HvagBi4ikiPZlpOdN28e8+bNy+nxNIQiIhIoNXARkUCpgY
tIr5HPJ8CnobP5NQYuQav4bGtsTWPfU1NIIoVWUlJCS0sLgwcPzttDhPPJ3WlpaaGkpCTxZ9TAJWhzfrIztubnx/0qhSRSaBUVFTQ0NBDycwdKSkqoqKhIXK8GLiK9QnFxMcOHDy90jFRpDFxEJFBq4BK0OReOZM6FI7PWzG4exezmUSklEkmPGriISKDUwEVEAqUGLiISKDVwEZFAqYGLiAQqUQM3s0Fm9pSZbTOzrWY23syON7M6M3slej0u32FFRORDSW/k+RHwO3efYmb9gE8A3wdWuvsCM5sLzAVuyVNOkYym3tAYW/OHY+5MIYlI+mIbuJl9EjgXuBrA3fcB+8zsMuDLUdki4DnUwCVl4y9+O7Zm04B/SyGJSPqSnIGPAJqBR83sDGA9cANQ7u67ANx9l5llfKyEmc0EZgIMGzYsJ6FFumLczpqO31w1ODcHKcBTYOTolWQMvC/wr8BP3X0s8D5twyWJuHuNu1e5e1VpaWkXY4pktua3n2LNbz+VtWb0nicYveeJlBKJpCfJGXgD0ODuL0TbT9HWwBvNbEh09j0EaMpXSJGOPPmjciD7UMoF790GwBpuTCWTSFpiz8Dd/f+A183s8GIS5wN/BVYA06N904HleUkoIiIZJZ2F8p/A49EMlHrgGtqa/1IzmwHsBKbmJ6KIiGSSqIG7+wagKsNb5+c2joiIJKU7MUVEAqUGLiISKDVwEZFA6ZmYErQHarfH1lSXvgzE3MgjEiCdgYuIBEoNXEQkUBpCkaA98B9t6+vM+cnODmuufOvrANRzUSqZRNKiBi5Ba3i1JLam/MAWQA1ceh81cOk5Vt3NuJ0tnfzQfUDMBcoBXY8k0pNpDFxEJFBq4CIigVIDFxEJlBq4iEigdBFTgnbpuWtjaxoPjE4hiUj61MAlaHO/81RsTf3+C1NIIpI+DaGIiARKDVyCtm3HULbtGJq1ZqA1MtAaU0okkh4NoUjQrr1rNgB/fOimDmtOL3kMgDV79FBj6V10Bi4iEqhEZ+BmtgN4FzgIHHD3KjM7HngCqAR2AFe4+1v5iSkiIu115gz8PHcf4+6HH248F1jp7icDK6NtERFJSXeGUC4DFkU/LwImdz+OiIgklbSBO1BrZuvNbGa0r9zddwFEr2WZPmhmM81snZmta25u7n5iEREBks9CmeDub5pZGVBnZtuSHsDda4AagKqqKu9CRhERySBRA3f3N6PXJjN7BjgLaDSzIe6+y8yGAE15zCmS0SPzqmNrNrZelUISkfTFNnAzGwj0cfd3o58vBO4EVgDTgQXR6/J8BpX0VNfFP+k9Hzr/MAf4XOUbsTXve3lszZr6zh87k7UHOv97N3vSyJwcW44+Sc7Ay4FnzOxw/c/d/Xdm9iKw1MxmADuBqfmLKSIi7cU2cHevB87IsL8FOD8foUSSWrB4CpB9UasRxbWAFrWS3kd3YkrQVqwex4rV47LWlPfdRHnfTSklEkmPGriISKDUwEVEAqUGLiISKDVwEZFAqYGLiARKD3SQoI0a1hBb896hjMv0iARPDVyC9uhtD8bWbNo7LYUkIunTEIqISKDUwEVEAqUGLkE757r7OOe6+7LWjB9wP+MH3J9SIpH0qIGLiARKDVxEJFBq4CIigVIDFxEJlBq4iEig1MBFRAKlOzElaDdPezK25rV9k1JIIpI+NXAJ2uQvvRBb03Tw9BSSiKQv8RCKmRWZ2Z/N7DfR9nAze8HMXjGzJ8ysX/5iiohIe50ZA78B2HrE9g+Banc/GXgLmJHLYCJJLHv+bJY9f3bWmrKijZQVbUwpkUh6EjVwM6sALgYeirYNmAgcfhT4ImByPgKKZHPPkqncs2Rq1pqT+tVxUr+6lBKJpCfpGfiDwM3AoWh7MLDb3Q9E2w3A0EwfNLOZZrbOzNY1Nzd3K6yIiHwotoGb2SVAk7uvP3J3hlLP9Hl3r3H3KnevKi0t7WJMERFpL8kslAnApWb2NaAE+CRtZ+SDzKxvdBZeAb
yZv5giItJebAN391uBWwHM7MvATe7+bTN7EpgC/BKYDizPY04psHE7awodQUTa6c6dmLcAc8zsVdrGxB/OTSQREUmiUzfyuPtzwHPRz/XAWbmPJCIiSehOTAnaHx+6KbZmzZ4bU0gikj4tZiUiEig1cBGRQKmBS9CuuXMW19w5K2vN6P5LGN1/SUqJRNKjMXAJ2ss7K2JrjunTlEISkfTpDFxEJFBq4CIigVIDFxEJlMbARXKoS0sOrBrcufrzbu38MaRX0hm4iEigdAYuQbv03LWxNY0HRqeQRCR9auAStLnfeSq2pn7/hSkkEUmfhlBERAKlBi5B27ZjKNt2ZHya3wcGWiMDrTGlRCLp0RCKBO3au2YD2VclPL3kMUCrEkrvozNwEZFAqYGLiARKDVxEJFBq4CIigYpt4GZWYmZ/MrO/mNkWM7sj2j/czF4ws1fM7Akz65f/uCIicliSM/C9wER3PwMYA1xkZuOAHwLV7n4y8BYwI38xRUSkvdhphO7uwHvRZnH0y4GJwJXR/kXA7cBPcx9RpGOPzKuOrdnYelUKSUTSl2geuJkVAeuBzwI/Bl4Ddrv7gaikAch4N4WZzQRmAgwbNqy7eUU+4nOVb8TWvO/lKSQRSV+ii5juftDdxwAVwFnA5zOVdfDZGnevcveq0tLSricVEZGP6NQsFHffDTwHjAMGmdnhM/gK4M3cRhOJt2DxFBYsnpK1ZkRxLSOKa1NKJJKeJLNQSs1sUPTzAOACYCuwCjj8N2c6sDxfIUU6smL1OFasHpe1przvJsr7bkopkUh6koyBDwEWRePgfYCl7v4bM/sr8Eszuwv4M/BwHnOKiEg7SWahbATGZthfT9t4uIiIFIDuxBQRCZQauIhIoLQeuEiBralv6VT92gPbc3Lc2ZNG5uR7pHDUwCVoo4Y1xNa8d6gshSQi6VMDl6A9etuDsTWb9k5LIYlI+jQGLiISKDVwEZFAqYFL0M657j7Oue6+rDXjB9zP+AH3p5RIJD0aA+/BqutyM9tARHonnYGLiARKDVxEJFBq4CIigVIDFxEJlBq4iEigNAtFgnbztCdja17bNymFJCLpUwOXoE3+0guxNU0HT08hiUj6NIQiIhIoNXAJ2rLnz2bZ82dnrSkr2khZ0caUEomkR0MoErR7lkwFsg+lnNSvDoCmPRpKkd4lyVPpTzSzVWa21cy2mNkN0f7jzazOzF6JXo/Lf1wRETksyRDKAeBGd/88MA74npmdAswFVrr7ycDKaFtERFIS28DdfZe7vxT9/C6wFRgKXAYsisoWAZPzFVJERD6uUxcxzawSGAu8AJS7+y5oa/JAxudWmdlMM1tnZuuam5u7l1ZERD6QuIGb2THA08Asd38n6efcvcbdq9y9qrS0tCsZRUQkg0QN3MyKaWvej7v7r6LdjWY2JHp/CNCUn4giIpJJ7DRCMzPgYWCruz9wxFsrgOnAguh1eV4SimTxx4duiq1Zs+fGFJKIpC/JPPAJwDRgk5ltiPZ9n7bGvdTMZgA7gan5iSgiIpnENnB3/1/AOnj7/NzGka4Yt7Om0BFEpAB0K70E7Zo7Z3HNnbOy1ozuv4TR/ZeklEgkPbqVXoL28s6K2Jpj+uj6uvROOgMXEQmUGriISKDUwEVEAqUGLiISKDVwEZFAaRaKBO3Sc9fG1jQeGJ1CEpH0qYFL0OZ+56nYmvr9F6aQRCR9GkIREQmUGrgEbduOoWzbMTRrzUBrZKA1ppRIJD0aQpGgXXvXbCD7qoSnlzwG9J5VCXO29s2qwR2/d96tuTmG5JXOwEVEAqUz8BjVddsLHUFEJCOdgYuIBEoNXEQkUBpCEZGPW3V3fr9fF0lzQmfgIiKB0hm4BO2RedWxNRtbr0ohiUj6kjyV/hHgEqDJ3U+L9h0PPAFUAjuAK9z9rfzFDJeeV5lfn6t8I7bmfS9PIYlI+pIMofwMuKjdvrnASnc/GVgZbYuISIpiG7i7rwb+0W73ZcCi6OdFwO
Qc5xJJZMHiKSxYPCVrzYjiWkYU16aUSCQ9Xb2IWe7uuwCi17KOCs1sppmtM7N1zc3NXTycSGYrVo9jxepxWWvK+26ivO+mlBKJpCfvs1Dcvcbdq9y9qrS0NN+HExE5anS1gTea2RCA6LUpd5FERCSJrk4jXAFMBxZEr8tzlkhEUrGmvqVgxx5/XsEO3avEnoGb2S+ANcAoM2swsxm0Ne5JZvYKMCnaFhGRFMWegbv7tzp46/wcZxERkU7QnZgStFHDGmJr3jvU4SQpkaCpgUvQHr3twdiaTXunpZBEJH1azEpEJFBq4CIigVIDl6Cdc919nHPdfVlrxg+4n/ED7k8pkUh61MBFRAKlBi4iEig1cBGRQKmBi4gESg1cRCRQauAiIoHSnZgStJunPRlb89q+SSkkEUmfGrgEbfKXXoitaTp4egpJRNIXTAOvrtue8+9M8sT47A/rEpEuWXV3/o9x3q35P0aBaQxcgrbs+bNZ9vzZWWvKijZSVrQxpUQi6QnmDFwkk3uWTAWyD6Wc1K8OgKY9GkqR3kVn4CIigVIDFxEJlBq4iEigujUGbmYXAT8CioCH3F0PNxaRHisfs9mSmD1pZF6+t8tn4GZWBPwY+CpwCvAtMzslV8FERCS77gyhnAW86u717r4P+CVwWW5iiYhIHHP3rn3QbApwkbtfF21PA8529+vb1c0EZkabo4CXux43Z04A/l7oEJ2kzPkXWl4IL3NoeaFnZP6Mu5e239mdMXDLsO9j/xq4ew0Qf8tjisxsnbtXFTpHZyhz/oWWF8LLHFpe6NmZuzOE0gCceMR2BfBm9+KIiEhS3WngLwInm9lwM+sHfBNYkZtYIiISp8tDKO5+wMyuB35P2zTCR9x9S86S5VePGtJJSJnzL7S8EF7m0PJCD87c5YuYIiJSWLoTU0QkUGrgIiKBOmobuJnda2bbzGyjmT1jZoMKnSmOmU01sy1mdsjMeuS0JmhbYsHMXjazV81sbqHzxDGzR8ysycw2FzpLEmZ2opmtMrOt0Z+HGwqdKY6ZlZjZn8zsL1HmOwqdKQkzKzKzP5vZbwqdJZOjtoEDdcBp7n46sB0I4fEdm4GvA6sLHaQjgS6x8DPgokKH6IQDwI3u/nnaHhr1vQB+j/cCE939DGAMcJGZhfDAqxuArYUO0ZGjtoG7e627H4g219I2j71Hc/et7t4T7mTNJrglFtx9NfCPQudIyt13uftL0c/v0tZghhY2VXbe5r1oszj61aNnUJhZBXAx8FChs3TkqG3g7VwLPFvoEL3EUOD1I7Yb6OHNJWRmVgmMBeKf7lxg0XDEBqAJqHP3np75QeBm4FChg3SkVz9Szcz+APxLhrd+4O7Lo5of0PZf0sfTzNaRJJl7uERLLEj3mdkxwNPALHd/p9B54rj7QWBMdL3pGTM7zd175HUHM7sEaHL39Wb25ULn6UivbuDufkG2981sOnAJcL73kAnxcZkDoCUWUmBmxbQ178fd/VeFztMZ7r7bzJ6j7bpDj2zgwATgUjP7GlACfNLMHnP3qwqc6yOO2iGU6GEUtwCXuvs/C52nF9ESC3lmZgY8DGx19wcKnScJMys9PNPLzAYAFwDbCpuqY+5+q7tXuHslbX+G/6enNW84ihs4sBA4Fqgzsw1m9t+FDhTHzC43swZgPPBbM/t9oTO1F10YPrzEwlZgaU9fYsHMfgGsAUaZWYOZzSh0phgTgGnAxOjP7oboTLEnGwKsMrONtP0jX+fuPXJqXkh0K72ISKCO5jNwEZGgqYGLiARKDVxEJFBq4CIigVIDFxEJlBq4iEig1MBFRAL1/0yJ7PORGPE8AAAAAElFTkSuQmCC\n",
289 |       "text/plain": [
290 |        "<Figure size 432x288 with 1 Axes>"
291 |       ]
292 |      },
293 |      "metadata": {
294 |       "needs_background": "light"
295 |      },
296 |      "output_type": "display_data"
297 |     }
298 |    ],
299 |    "source": [
300 |     "# Visualize our data comparison\n",
301 |     "plt.hist(d1, alpha=0.5, label='D1');\n",
302 |     "plt.axvline(np.mean(d1), linestyle='--', linewidth=2, color='blue')\n",
303 |     "plt.hist(d2, alpha=0.5, label='D2');\n",
304 |     "plt.axvline(np.mean(d2), linestyle='--', linewidth=2, color='orange')\n",
305 |     "plt.legend();"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "markdown",
310 |    "metadata": {},
311 |    "source": [
312 |     "### Calculate T-Tests\n",
313 |     "\n",
314 |     "Now that we have some data, let's use a t-test to statistically compare the two groups of data. \n",
315 |     "\n",
316 |     "For this example, we will test whether the two distributions have significantly different means. "
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "code",
321 |    "execution_count": 11,
322 |    "metadata": {},
323 |    "outputs": [],
324 |    "source": [
325 |     "# Run independent samples t-test\n",
326 |     "t_val, p_val = ttest_ind(d1, d2)"
327 |    ]
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": 12,
332 |    "metadata": {},
333 |    "outputs": [
334 |     {
335 |      "name": "stdout",
336 |      "output_type": "stream",
337 |      "text": [
338 |       "T-Test comparison of D1 & D2:\n",
339 |       "\tT-value \t -2.2502\n",
340 |       "\tP-value \t 2.49e-02\n"
341 |      ]
342 |     }
343 |    ],
344 |    "source": [
345 |     "# Check the results of the t-test\n",
346 |     "print('T-Test comparison of D1 & D2:')\n",
347 |     "print('\\tT-value \\t {:1.4f}'.format(t_val))\n",
348 |     "print('\\tP-value \\t {:1.2e}'.format(p_val))"
349 |    ]
350 |   },
351 |   {
352 |    "cell_type": "markdown",
353 |    "metadata": {},
354 |    "source": [
355 |     "In this case, the t-test shows that there is a significant difference in the mean of the two arrays of data!"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "markdown",
360 |    "metadata": {},
361 |    "source": [
362 |     "### Assumptions of T-Tests\n",
363 |     "\n",
364 |     "Note, again, that t-tests assume normally distributed data. This is again a property of the data that should be examined before applying statistical tests. If this assumption is not met, other approaches for comparing the data may be needed."
365 |    ]
366 |   },
367 |   {
368 |    "cell_type": "markdown",
369 |    "metadata": {},
370 |    "source": [
371 |     "## Effect Size\n",
372 |     "\n",
373 |     "One thing to keep in mind about hypothesis tests such as the t-test above is that while they can be used to answer the question of _is there a difference_ between two sets of data, they do not answer the question of _how different are they_.\n",
374 |     "\n",
375 |     "Often, we would also like to measure how different groups of data are.\n",
376 |     "\n",
377 |     "To do so, we can use effect size measures, which can be used to estimate the magnitude of changes or differences. \n",
378 |     "\n",
379 |     "There are many methods and approaches to measuring effect sizes across different contexts. \n",
380 |     "\n",
381 |     "For this example, we will use the cohens-d effect size estimate for differences in means."
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "markdown",
386 |    "metadata": {},
387 |    "source": [
388 |     "<div class=\"alert alert-success\">\n",
389 |     "Effect size measurements are measurements of the magnitude of a particular effect.\n",
390 |     "</div>\n",
391 |     "\n",
392 |     "<div class=\"alert alert-info\">\n",
393 |     "Effect sizes on \n",
394 |     "<a href=https://en.wikipedia.org/wiki/Effect_size class=\"alert-link\">wikipedia</a>.\n",
395 |     "</div>"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "markdown",
400 |    "metadata": {},
401 |    "source": [
402 |     "### Defining Effect Size Code\n",
403 |     "\n",
404 |     "Often, when analyzing data, we will want to apply some measure that we may not find already available, in which case we may need to implement a version ourselves. \n",
405 |     "\n",
406 |     "For this example, we will implement cohens-d, an effect size measure for differences of means. Briefly, it is a calculation of the difference of means between two distributions, divided by the pooled standard deviation. As such, cohens-d is a standardized measure, meaning the output value is independent of the units of the inputs. \n",
407 |     "\n",
408 |     "Note that `math` and `statistics` are standard library modules that contain some useful basic numerical functionality. "
409 |    ]
410 |   },
411 |   {
412 |    "cell_type": "code",
413 |    "execution_count": 13,
414 |    "metadata": {},
415 |    "outputs": [],
416 |    "source": [
417 |     "from math import sqrt\n",
418 |     "from statistics import mean, stdev\n",
419 |     "\n",
420 |     "def compute_cohens_d(data_1, data_2):\n",
421 |     "    \"\"\"Compute cohens-d effect size.\n",
422 |     "    \n",
423 |     "    Parameters\n",
424 |     "    ----------\n",
425 |     "    data_1, data_2 : 1d array\n",
426 |     "        Array of data to compute the effect size between.\n",
427 |     "        \n",
428 |     "    Returns\n",
429 |     "    -------\n",
430 |     "    cohens_d : float\n",
431 |     "        The computed effect size measure. \n",
432 |     "    \"\"\"\n",
433 |     "\n",
434 |     "    # Calculate group means\n",
435 |     "    d1_mean = mean(data_1)\n",
436 |     "    d2_mean = mean(data_2)\n",
437 |     "    \n",
438 |     "    # Calculate group standard deviations\n",
439 |     "    d1_std = stdev(data_1)\n",
440 |     "    d2_std = stdev(data_2)\n",
441 |     "    \n",
442 |     "    # Calculate the pooled standard deviation\n",
443 |     "    pooled_std = sqrt((d1_std ** 2 + d2_std ** 2) / 2)\n",
444 |     "    \n",
445 |     "    # Calculate cohens-d\n",
446 |     "    cohens_d = (d1_mean - d2_mean) / pooled_std\n",
447 |     "\n",
448 |     "    return cohens_d"
449 |    ]
450 |   },
451 |   {
452 |    "cell_type": "code",
453 |    "execution_count": 14,
454 |    "metadata": {},
455 |    "outputs": [],
456 |    "source": [
457 |     "# Calculate the cohens-d effect size for our simulated data from before\n",
458 |     "cohens_d = compute_cohens_d(d2, d1)"
459 |    ]
460 |   },
461 |   {
462 |    "cell_type": "code",
463 |    "execution_count": 15,
464 |    "metadata": {},
465 |    "outputs": [
466 |     {
467 |      "name": "stdout",
468 |      "output_type": "stream",
469 |      "text": [
470 |       "The cohens-d effect size is 0.20.\n"
471 |      ]
472 |     }
473 |    ],
474 |    "source": [
475 |     "# Check the measured value of the effect size\n",
476 |     "print('The cohens-d effect size is {:1.2f}.'.format(cohens_d))"
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "markdown",
481 |    "metadata": {},
482 |    "source": [
483 |     "A cohens-d effect size of ~0.2 is a small or modest effect. \n",
484 |     "\n",
485 |     "In combination with our t-test above, we can conclude that there is a difference of means between the two groups of data, but that the magnitude of this difference is relatively small. "
486 |    ]
487 |   },
488 |   {
489 |    "cell_type": "markdown",
490 |    "metadata": {},
491 |    "source": [
492 |     "## Conclusion\n",
493 |     "\n",
494 |     "Here we have briefly explored some statistical tests and comparisons for numerical data. \n",
495 |     "\n",
496 |     "For more information on statistical tests of data, check out courses and resources focused on statistics."
497 |    ]
498 |   }
499 |  ],
500 |  "metadata": {
501 |   "kernelspec": {
502 |    "display_name": "Python 3",
503 |    "language": "python",
504 |    "name": "python3"
505 |   },
506 |   "language_info": {
507 |    "codemirror_mode": {
508 |     "name": "ipython",
509 |     "version": 3
510 |    },
511 |    "file_extension": ".py",
512 |    "mimetype": "text/x-python",
513 |    "name": "python",
514 |    "nbconvert_exporter": "python",
515 |    "pygments_lexer": "ipython3",
516 |    "version": "3.7.4"
517 |   }
518 |  },
519 |  "nbformat": 4,
520 |  "nbformat_minor": 2
521 | }
522 | 


--------------------------------------------------------------------------------
/dsip/tutorials/A1-PythonPackages.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Appendix: Python Packages\n",
  8 |     "\n",
  9 |     "<div class=\"alert alert-success\">\n",
 10 |     "The following is a general overview of packages available in Python that may be useful for data science.\n",
 11 |     "</div>\n",
 12 |     "\n",
 13 |     "<div class=\"alert alert-info\">\n",
 14 |     "For a much broader / fuller list of the Python ecosystem, check out the \n",
 15 |     "<a href=\"https://github.com/vinta/awesome-python\" class=\"alert-link\">Awesome Python</a> list. \n",
 16 |     "</div>"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "## Data-Science Modules\n",
 24 |     "\n",
 25 |     "These are all external (non-standard library) packages. Many of them are available in the Anaconda distribution. \n",
 26 |     "\n",
 27 |     "### Core Packages\n",
 28 |     "\n",
 29 |     "- [scipy](https://www.scipy.org) - mathematics, science, and engineering.\n",
 30 |     "- [numpy](http://www.numpy.org) - numerical computing with arrays & array operations.\n",
 31 |     "- [pandas](https://pandas.pydata.org) - data structures and data analysis.\n",
 32 |     "- [scikit-learn](http://scikit-learn.org/stable/) - machine learning and data analysis.\n",
 33 |     "    \n",
 34 |     "### Text Mining\n",
 35 |     "\n",
 36 |     "- [nltk](http://www.nltk.org) - natural language processing.\n",
 37 |     "- [gensim](https://radimrehurek.com/gensim/) - topic modelling.\n",
 38 |     "\n",
 39 |     "### Mathematics & Statistics\n",
 40 |     "\n",
 41 |     "- [sympy](http://www.sympy.org/en/index.html) - symbolic mathematics.\n",
 42 |     "- [statsmodels](http://www.statsmodels.org/stable/index.html) - statistical modelling.\n",
 43 |     "\n",
 44 |     "### Web Scraping\n",
 45 |     "\n",
 46 |     "- [requests](https://requests.readthedocs.io/) - HTTP requests.\n",
 47 |     "- [scrapy](https://scrapy.org) - web scraping.\n",
 48 |     "    \n",
 49 |     "### Plotting / Visualization Libraries\n",
 50 |     "\n",
 51 |     "- [matplotlib](https://matplotlib.org) - 2D plotting library.\n",
 52 |     "- [seaborn](https://seaborn.pydata.org/) - visualization (based on matplotlib).\n",
 53 |     "- [bokeh](http://bokeh.pydata.org/en/latest/) - interactive visualizations.\n",
 54 |     "    \n",
 55 |     "### Graph Theory / Networks\n",
 56 |     "\n",
 57 |     "- [networkx](https://networkx.github.io/) - network analysis.\n",
 58 |     "- [graph-tool](https://graph-tool.skewed.de/) - manipulation and analysis of graphs.\n",
 59 |     "    \n",
 60 |     "### Deep Learning\n",
 61 |     "\n",
 62 |     "- [theano](http://deeplearning.net/software/theano/) - mathematical operations on multi-dimensional arrays.\n",
 63 |     "- [tensorflow](https://www.tensorflow.org/) - numerical computation using data flow graphs.\n",
 64 |     "- [keras](https://keras.io) - a high-level neural network library."
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "markdown",
 69 |    "metadata": {},
 70 |    "source": [
 71 |     "## Useful parts of the standard library\n",
 72 |     "\n",
 73 |     "The full list of packages in the standard library is available [here](https://docs.python.org/3/library/index.html)."
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "markdown",
 78 |    "metadata": {},
 79 |    "source": [
 80 |     "### Basic Utilities\n",
 81 |     "\n",
 82 |     "- [os](https://docs.python.org/3/library/os.html) - miscellaneous operating system operations.\n",
 83 |     "- [sys](https://docs.python.org/3/library/sys.html) - system operations.\n",
 84 |     "- [datetime](https://docs.python.org/3/library/datetime.html) - manipulating dates & times.\n",
 85 |     "- [glob](https://docs.python.org/3/library/glob.html) - searching path names.\n",
 86 |     "\n",
 87 |     "### Useful Functions\n",
 88 |     "\n",
 89 |     "- [math](https://docs.python.org/3/library/math.html) - mathematical functions.\n",
 90 |     "- [random](https://docs.python.org/3/library/random.html) - (pseudo) random number generators.\n",
 91 |     "- [re](https://docs.python.org/3/library/re.html) - regular expressions.\n",
 92 |     "\n",
 93 |     "### File Formats\n",
 94 |     "\n",
 95 |     "- [json](https://docs.python.org/3/library/json.html) - support for working with JSON files.\n",
 96 |     "- [csv](https://docs.python.org/3/library/csv.html) - support for working with CSV files.\n",
 97 |     "\n",
 98 |     "### Data Objects\n",
 99 |     "\n",
100 |     "- [collections](https://docs.python.org/3/library/collections.html) - container data types.\n",
101 |     "- [pickle](https://docs.python.org/3/library/pickle.html) - serializing & de-serializing (saving and loading complex objects)."
102 |    ]
103 |   }
104 |  ],
105 |  "metadata": {
106 |   "anaconda-cloud": {},
107 |   "kernelspec": {
108 |    "display_name": "Python 3",
109 |    "language": "python",
110 |    "name": "python3"
111 |   },
112 |   "language_info": {
113 |    "codemirror_mode": {
114 |     "name": "ipython",
115 |     "version": 3
116 |    },
117 |    "file_extension": ".py",
118 |    "mimetype": "text/x-python",
119 |    "name": "python",
120 |    "nbconvert_exporter": "python",
121 |    "pygments_lexer": "ipython3",
122 |    "version": "3.7.4"
123 |   }
124 |  },
125 |  "nbformat": 4,
126 |  "nbformat_minor": 1
127 | }
128 | 


--------------------------------------------------------------------------------
/dsip/tutorials/A2-Git.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Appendix: Version Control\n",
  8 |     "\n",
  9 |     "This notebook explores version control, and using git & Github."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "<div class=\"alert alert-success\">\n",
 17 |     "Version Control reflects a set of practices related to managing files, in particular managing different versions of files. \n",
 18 |     "</div>\n",
 19 |     "\n",
 20 |     "<div class=\"alert alert-info\">\n",
 21 |     "You can read more about version control on\n",
 22 |     "<a href=\"https://en.wikipedia.org/wiki/Version_control\" class=\"alert-link\">Wikipedia</a>\n",
 23 |     " or on the \n",
 24 |     "<a href=\"https://git-scm.com/book/en/v2/Getting-Started-About-Version-Control\" class=\"alert-link\">git docs</a>.\n",
 25 |     "</div>"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "markdown",
 30 |    "metadata": {},
 31 |    "source": [
 32 |     "<br>\n",
 33 |     "<br>\n",
 34 |     "<img src=\"https://raw.githubusercontent.com/COGS108/Tutorials/master/img/git.png\" width=\"300px\" class=\"center\">\n",
 35 |     "<br>\n",
 36 |     "<br>"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "markdown",
 41 |    "metadata": {},
 42 |    "source": [
 43 |     "<div class=\"alert alert-success\">\n",
 44 |     "Git is a version control system: a tool to track changes in files, across multiple locations. \n",
 45 |     "</div>\n",
 46 |     "\n",
 47 |     "<div class=\"alert alert-info\">\n",
 48 |     "Information for using git is available in the \n",
 49 |     "<a href=https://git-scm.com/doc class=\"alert-link\">official documentation</a>, \n",
 50 |     "as well as in many external resources, collected in \n",
 51 |     "<a href=https://github.com/openlists/GitResources class=\"alert-link\">this list</a>.\n",
 52 |     "</div>"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "markdown",
 57 |    "metadata": {},
 58 |    "source": [
 59 |     "<br>\n",
 60 |     "<br>\n",
 61 |     "<img src=\"https://raw.githubusercontent.com/COGS108/Tutorials/master/img/github.png\" width=\"300px\" class=\"center\">\n",
 62 |     "<br>\n",
 63 |     "<br>"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "markdown",
 68 |    "metadata": {},
 69 |    "source": [
 70 |     "<div class=\"alert alert-success\">\n",
 71 |     "Github is a company that offers version control services, using git.\n",
 72 |     "    \n",
 73 |     "Github is a hosting service, with git built in, or basically, a place on the internet to put code that is tracked with git. \n",
 74 |     "</div>\n",
 75 |     "\n",
 76 |     "<div class=\"alert alert-info\">\n",
 77 |     "Click through to \n",
 78 |     "<a href=\"https://github.com/\" class=\"alert-link\">Github</a>.\n",
 79 |     "</div>"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "markdown",
 84 |    "metadata": {},
 85 |    "source": [
 86 |     "## Using Git"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "markdown",
 91 |    "metadata": {},
 92 |    "source": [
 93 |     "There are several ways to use git, including:\n",
 94 |     "- from the command line, typing in commands directly\n",
 95 |     "- using a graphical program to launch git commands\n",
 96 |     "    - This kind of program is referred to as a 'graphical user interface' (GUI).\n",
 97 |     "        - It basically just means you can click buttons to do things, rather than writing out commands\n",
 98 |     "        \n",
 99 |     "Either way the underlying commands, and code that gets executed, are the same. Behind the scenes, it all reduces to the same thing. \n",
100 |     "\n",
101 |     "You should use whichever approach you are most comfortable with. If you already know some command line programming, using git from command line can be useful, as generally slightly more functionality is available to you, with more specific control. Using a GUI is much more straightforward if you are not very familiar with the command line."
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "markdown",
106 |    "metadata": {},
107 |    "source": [
108 |     "### Git Graphical User Interfaces (GUIs)"
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "markdown",
113 |    "metadata": {},
114 |    "source": [
115 |     "<br>\n",
116 |     "<br>\n",
117 |     "<img src=\"https://raw.githubusercontent.com/COGS108/Tutorials/master/img/sourcetree.png\" width=\"500px\" class=\"center\">\n",
118 |     "<br>\n",
119 |     "<br>"
120 |    ]
121 |   },
122 |   {
123 |    "cell_type": "markdown",
124 |    "metadata": {},
125 |    "source": [
126 |     "If you plan to use a GUI, one option is SourceTree. \n",
127 |     "\n",
128 |     "<div class=\"alert alert-info\">\n",
129 |     "See the Source Tree\n",
130 |     "<a href=\"https://confluence.atlassian.com/get-started-with-sourcetree\" class=\"alert-link\">documentation</a>.\n",
131 |     "</div>"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "markdown",
136 |    "metadata": {},
137 |    "source": [
138 |     "## Git repositories\n",
139 |     "\n",
140 |     "A repository is simply a collection of files, like a folder on a computer. \n",
141 |     "\n",
142 |     "What git is mainly doing is keeping two (or more) versions of the same repository consistent."
143 |    ]
144 |   },
145 |   {
146 |    "cell_type": "markdown",
147 |    "metadata": {},
148 |    "source": [
149 |     "### Local vs. Remote\n",
150 |     "\n",
151 |     "Given a repository with multiple copies (or versions), we can refer to these copies:\n",
152 |     "\n",
153 |     "- the 'local' copy, which is a copy of the repository on your computer (the one 'local' to you)\n",
154 |     "- the 'remote' copy, which is a copy of the repository somewhere else, such as on Github\n",
155 |     "\n",
156 |     "There is often a particular copy of the code that is referred to as the 'master', which just means it is the master version of the repository in question. Most typically, this is the copy of the code on Github - so there is one 'master' copy of the code on Github, and one or more people also have local copies of the code, with local updates. When local updates want to be shared, they can be sent to master, to update the main version of the code, for everyone.\n",
157 |     "\n",
158 |     "Here we will consider the case of having two copies of the repository. What is described here can all be extended to multiple copies of the code, including multiple different remote copies of the repository.\n",
159 |     "\n",
160 |     "The main function of git, as a version control system, is to automatically check all of the files in each copy of the repository, tracking any changes that happen. It then provides tools to synchronize between different copies when there are changes."
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "markdown",
165 |    "metadata": {},
166 |    "source": [
167 |     "## Getting Code from Github\n",
168 |     "\n",
169 |     "As a starting point, it is often the case that there is code available on Github, and you want to get a local copy. This allows you to access the code, to use, and potentially update it. Then, if you want, you can contribute your code updates back to the main version of the code up on Github. \n",
170 |     "\n",
171 |     "First, you need to get a local copy of the code. Git calls making a copy of a repository 'cloning'.\n",
172 |     "\n",
173 |     "From command line, to clone a Github repository to your computer, use the clone command with the URL to the repository.\n",
174 |     "- $ ``git clone 'repo_url'``\n",
175 |     "    - For example, you can clone these materials with:\n",
176 |     "        - ``git clone https://github.com/COGS108/Tutorials``"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "markdown",
181 |    "metadata": {},
182 |    "source": [
183 |     "## Tracking & Propagating Changes\n",
184 |     "\n",
185 |     "Once you have a local copy, connected to a remote repository, changes can go in two directions:\n",
186 |     "\n",
187 |     "- sending changes you make locally to the remote\n",
188 |     "    - this is called a `push`\n",
189 |     "- updating your local copy with changes from the remote\n",
190 |     "    - this is called a `pull`"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "markdown",
195 |    "metadata": {},
196 |    "source": [
197 |     "### Tracking and Sending Changes from local -> remote\n",
198 |     "\n",
199 |     "When working on local files, git has hierarchical 'levels' of how it is tracking files (or, of what it is paying attention to). These multiple levels are useful to organize groups of changes into well organized actions.\n",
200 |     "\n",
201 |     "The repository can have many files in it, within which there are:\n",
202 |     "- tracked files\n",
203 |     "    - these are files that git is tracking\n",
204 |     "    - any files that are not tracked by git are untracked, and won't be version controlled\n",
205 |     "- staged files\n",
206 |     "    - these are tracked files that have been indicated to be added to an update of the code\n",
207 |     "    - staging allows for specifying which files to save updates to, and how to group files\n",
208 |     "\n",
209 |     "The typical workflow is:\n",
210 |     "- ``add``: select which changes, on which files, you would like git to add to staging area\n",
211 |     "    - You can add multiple files together, each with their own changes\n",
212 |     "    - If a file is untracked, this adds it to tracking\n",
213 |     "- ``commit``: make a checkpoint, saving all files that have been added together\n",
214 |     "    - These changes will be 'saved' together with a message (a commit log) about what the changes are\n",
215 |     "- ``push``: send the changes to the remote\n",
216 |     "    - These changes will now be available in the remote copy of the repository"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "markdown",
221 |    "metadata": {},
222 |    "source": [
223 |     "#### Example Workflow\n",
224 |     "\n",
225 |     "First, in a local git repository, you make some changes to a file, or perhaps many files. \n",
226 |     "\n",
227 |     "You can now add files. This needs to be done for each file you changed that you want to add to staging:\n",
228 |     "- $ ``git add 'f_name'``\n",
229 |     "\n",
230 |     "After you've added one or more files, you use a commit to save the status of these files:\n",
231 |     "- $ ``git commit -m 'Commit message'``\n",
232 |     "\n",
233 |     "The '-m' flag is an option to write your commit message directly with the command. If you don't add it, git will send you into a text editor for you to write a commit message there. You should always add an informative message about what you have changed / added.\n",
234 |     "\n",
235 |     "Making small, incremental changes and committing often, with detailed messages means that your Github log can serve as a history for your project. This is useful as a way to use version control to keep track of a project's history. This also allows you to step back to an older version of the code if everything suddenly breaks.\n",
236 |     "\n",
237 |     "A committed change is still only stored in your local copy. To update the remote repository, you have to push:\n",
238 |     "- $ ``git push``\n",
239 |     "\n",
240 |     "You do not have to push after each commit, you can push many commits at once. \n",
241 |     "\n",
242 |     "Once you have pushed, you should see your updates have been added to the remote repository, on Github."
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "markdown",
247 |    "metadata": {},
248 |    "source": [
249 |     "### Getting Changes from remote -> local\n",
250 |     "\n",
251 |     "Git refers to updating a local copy with changes from a remote copy as 'pulling'\n",
252 |     "\n",
253 |     "On command line, if there are changes on the remote branch, use 'git pull' to copy those changes to your local copy.\n",
254 |     "\n",
255 |     "- $ ``git pull``"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "markdown",
260 |    "metadata": {},
261 |    "source": [
262 |     "## Git Cheatsheet\n",
263 |     "\n",
264 |     "The most common git functions are:\n",
265 |     "\n",
266 |     "- ``git status``\n",
267 |     "    - Check the status of a git repository\n",
268 |     "- ``git add 'file'``\n",
269 |     "    - Add a file to staging area\n",
270 |     "- ``git commit -m 'message'``\n",
271 |     "    - Log a 'save point' of all changes in the staging area.\n",
272 |     "- ``git push``\n",
273 |     "    - Copy commits to remote\n",
274 |     "- ``git diff 'file'``\n",
275 |     "    - Check what has changed in file since last commit\n",
276 |     "- ``git clone 'repo'``\n",
277 |     "    - Create a local copy of a git repository\n",
278 |     "- ``git pull``\n",
279 |     "    - Update your local copy of a git repository from the remote\n",
280 |     "- ``git log``\n",
281 |     "    - Check the log of git commits"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "markdown",
286 |    "metadata": {},
287 |    "source": [
288 |     "## Advanced Git: Branching & Merging\n",
289 |     "\n",
290 |     "Git has many other functionalities, including [branching](https://git-scm.com/book/en/v1/Git-Branching/) and [merging](https://git-scm.com/docs/git-merge), that are worth exploring as you get used to using git, and work on larger projects."
291 |    ]
292 |   },
293 |   {
294 |    "cell_type": "markdown",
295 |    "metadata": {},
296 |    "source": [
297 |     "## External Resources and Tutorials"
298 |    ]
299 |   },
300 |   {
301 |    "cell_type": "markdown",
302 |    "metadata": {},
303 |    "source": [
304 |     "<div class=\"alert alert-info\">\n",
305 |     "There are many tutorials for working with Github, including an \n",
306 |     "<a href=\"https://try.github.io/levels/1/challenges/1\" class=\"alert-link\">interactive tutorial</a>, \n",
307 |     "and\n",
308 |     "<a href=\"https://guides.github.com/activities/hello-world/\" class=\"alert-link\">Hello World</a>,\n",
309 |     "made by Github, and many other guides and tutorials, for example those available from\n",
310 |     "<a href=\"http://lifehacker.com/5983680/how-the-heck-do-i-use-github\" class=\"alert-link\">LifeHacker</a>, \n",
311 |     "<a href=\"https://www.atlassian.com/git/tutorials\" class=\"alert-link\">Atlassian</a>, and\n",
312 |     "<a href=\"http://neuroplausible.com/github\" class=\"alert-link\">Neuroplausible</a>.\n",
313 |     "For a list of many resources, see\n",
314 |     "<a href=https://github.com/openlists/GitResources class=\"alert-link\">this list</a>.\n",
315 |     "</div>"
316 |    ]
317 |   }
318 |  ],
319 |  "metadata": {
320 |   "kernelspec": {
321 |    "display_name": "Python 3",
322 |    "language": "python",
323 |    "name": "python3"
324 |   },
325 |   "language_info": {
326 |    "codemirror_mode": {
327 |     "name": "ipython",
328 |     "version": 3
329 |    },
330 |    "file_extension": ".py",
331 |    "mimetype": "text/x-python",
332 |    "name": "python",
333 |    "nbconvert_exporter": "python",
334 |    "pygments_lexer": "ipython3",
335 |    "version": "3.7.4"
336 |   }
337 |  },
338 |  "nbformat": 4,
339 |  "nbformat_minor": 2
340 | }
341 | 


--------------------------------------------------------------------------------
/dsip/tutorials/files/data.csv:
--------------------------------------------------------------------------------
1 | 1, 2, 3, 4
2 | 5, 6, 7, 8
3 | 9, 10, 11, 12


--------------------------------------------------------------------------------
/dsip/tutorials/files/data.json:
--------------------------------------------------------------------------------
1 | {
2 |   "firstName": "John",
3 |   "age": 53
4 | }
5 | 


--------------------------------------------------------------------------------
/dsip/tutorials/files/data.txt:
--------------------------------------------------------------------------------
1 | First line of data
2 | Second line of data


--------------------------------------------------------------------------------
/dsip/tutorials/files/messy_data.csv:
--------------------------------------------------------------------------------
1 | id,age,weight
2 | 001,20,11
3 | 002,27,
4 | 003,25,14
5 | 004,-999,12


--------------------------------------------------------------------------------
/dsip/tutorials/files/messy_data.json:
--------------------------------------------------------------------------------
1 | [{"id": 001,"height": 168},
2 | {"id": 002,"height": 155},
3 | {"id": 003,"height": null},
4 | {"id": 004,"height": 173}]


--------------------------------------------------------------------------------
/dsip/tutorials/img/anaconda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataScienceInPractice/Site/3388685495c1e22e99727a50208704df0fbd5c81/dsip/tutorials/img/anaconda.png


--------------------------------------------------------------------------------
/dsip/tutorials/img/git.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataScienceInPractice/Site/3388685495c1e22e99727a50208704df0fbd5c81/dsip/tutorials/img/git.png


--------------------------------------------------------------------------------
/dsip/tutorials/img/github.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataScienceInPractice/Site/3388685495c1e22e99727a50208704df0fbd5c81/dsip/tutorials/img/github.png


--------------------------------------------------------------------------------
/dsip/tutorials/img/jupyter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataScienceInPractice/Site/3388685495c1e22e99727a50208704df0fbd5c81/dsip/tutorials/img/jupyter.png


--------------------------------------------------------------------------------
/dsip/tutorials/img/matplotlib.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataScienceInPractice/Site/3388685495c1e22e99727a50208704df0fbd5c81/dsip/tutorials/img/matplotlib.png


--------------------------------------------------------------------------------
/dsip/tutorials/img/numpy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataScienceInPractice/Site/3388685495c1e22e99727a50208704df0fbd5c81/dsip/tutorials/img/numpy.png


--------------------------------------------------------------------------------
/dsip/tutorials/img/pandas.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataScienceInPractice/Site/3388685495c1e22e99727a50208704df0fbd5c81/dsip/tutorials/img/pandas.png


--------------------------------------------------------------------------------
/dsip/tutorials/img/python.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataScienceInPractice/Site/3388685495c1e22e99727a50208704df0fbd5c81/dsip/tutorials/img/python.png


--------------------------------------------------------------------------------
/dsip/tutorials/img/scipy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataScienceInPractice/Site/3388685495c1e22e99727a50208704df0fbd5c81/dsip/tutorials/img/scipy.png


--------------------------------------------------------------------------------
/dsip/tutorials/img/sklearn.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataScienceInPractice/Site/3388685495c1e22e99727a50208704df0fbd5c81/dsip/tutorials/img/sklearn.png


--------------------------------------------------------------------------------
/dsip/tutorials/img/sourcetree.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DataScienceInPractice/Site/3388685495c1e22e99727a50208704df0fbd5c81/dsip/tutorials/img/sourcetree.png


--------------------------------------------------------------------------------
/instructions.md:
--------------------------------------------------------------------------------
 1 | # Building the Site
 2 | 
 3 | Notes and instructions for building the public
 4 | [Data Science in Practice](https://datascienceinpractice.github.io/)
 5 | site.
 6 | 
 7 | ## Overview
 8 | 
 9 | This repository and associated website serve as a public version of materials for the
10 | [COGS108](https://github.com/cogs108) course from UC San Diego.
11 | 
12 | This repository,
13 | [Site](https://github.com/DataScienceInPractice/Site),
14 | is used to organize and create the website, which is then pushed to be hosted from the
15 | [source](https://github.com/DataScienceInPractice/datascienceinpractice.github.io) repository.
16 | 
17 | ## Building the Site
18 | 
19 | Functionality to manage content and build the site is all controlled by a
20 | [Makefile](https://github.com/DataScienceInPractice/Site/blob/master/Makefile).
21 | 
22 | The site is created with the
23 | [Jupyter-book](https://github.com/executablebooks/jupyter-book)
24 | tool, which is a requirement for running the Makefile.
25 | 
26 | ## Content
27 | 
28 | Content for the website is copied in from the
29 | [COGS108 organization](https://github.com/cogs108).
30 | 
31 | The idea here is that this repository copies in materials created for COGS108, and hosts a public version.
32 | 
33 | Note that as this procedure copies in files from the course organization, the original source
34 | files for these materials are not stored in this repository. Any development of these materials
35 | should be done on the original files in the COGS108 organization.
36 | 
37 | This repository does contain copies of the materials, as are presented on the website.
38 | Note that these files are likely to have been edited for public release, and so may differ
39 | from the original files.
40 | 
41 | Note that for rebuilding the public website, there are manual edits to some files that are needed,
42 | as described below.
43 | 
44 | #### Tutorials
45 | 
46 | Tutorial materials are copied from the [Tutorials](https://github.com/COGS108/Tutorials) repository.
47 | 
48 | No updates to the materials should be needed for hosting these materials on the public site.
49 | 
50 | #### Assignments
51 | 
52 | Assignment materials are copied from the [Demo Assignments](https://github.com/COGS108/Assign_Demo) repository.
53 | 
54 | Assignments need to be updated to replace the header of the assignments to indicate that they are not valid assignments for 108.
55 | 
56 | Currently, the Makefile will copy over the files and rename them to DX (for "demo"). Manual tuning is then needed to remove course information and add a note that these are not valid 108 files.
57 | 
58 | Note that with any updates, if any data files change, this needs to be coordinated with the
59 | [Data](https://github.com/DataScienceInPractice/Data) repository.
60 | 
61 | #### Project
62 | 
63 | Project materials are copied over from the
64 | [Projects](https://github.com/COGS108/Projects) repository, and from the
65 | [group template](https://github.com/COGS108/group_template) repository.
66 | 
67 | The project files need manual editing, to remove sections that are specific to the course.
68 | 
69 | ## Build Schedule
70 | 
71 | The website should be rebuilt periodically, to add updates to the materials to the public website.
72 | 
73 | A proposed schedule is to re-build this site at the end of every semester in which the course is taught, in order to integrate updated materials into the public release.
74 | 


--------------------------------------------------------------------------------
/paper/paper.bib:
--------------------------------------------------------------------------------
 1 | @article{jupyter_nbgrader_2019,
 2 |   title = {nbgrader: {A} {Tool} for {Creating} and {Grading} {Assignments} in the {Jupyter} {Notebook}},
 3 |   volume = {2},
 4 |   issn = {2577-3569},
 5 |   url = {https://jose.theoj.org/papers/10.21105/jose.00032},
 6 |   doi = {10.21105/jose.00032},
 7 |   number = {11},
 8 |   journal = {Journal of Open Source Education},
 9 |   author = {{Jupyter Project} and Blank, Douglas and Bourgin, David and Brown, Alexander and Bussonnier, Matthias and Frederic, Jonathan and Granger, Brian and Griffiths, Thomas and Hamrick, Jessica and Kelley, Kyle and Pacer, M and Page, Logan and Pérez, Fernando and Ragan-Kelley, Benjamin and Suchow, Jordan and Willing, Carol},
10 |   year = {2019},
11 | }
12 | 
13 | @software{executable_books_community_2020_4539666,
14 |   author       = {{Executable Books Community}},
15 |   title        = {Jupyter Book},
16 |   year         = 2020,
17 |   publisher    = {Zenodo},
18 |   version      = {v0.10},
19 |   doi          = {10.5281/zenodo.4539666},
20 |   url          = {https://doi.org/10.5281/zenodo.4539666}
21 | }
22 | 
23 | @article{donoghue_teaching_2020,
24 |   title = {Teaching {Creative} and {Practical} {Data} {Science} at {Scale}},
25 |   volume = {29},
26 |   issn = {1069-1898},
27 |   url = {https://www.tandfonline.com/doi/full/10.1080/10691898.2020.1860725},
28 |   doi = {10.1080/10691898.2020.1860725},
29 |   number = {sup1},
30 |   journal = {Journal of Statistics and Data Science Education},
31 |   author = {Donoghue, Thomas and Voytek, Bradley and Ellis, Shannon E.},
32 |   year = {2021},
33 |   pages = {S27--S39},
34 | }
35 | 


--------------------------------------------------------------------------------
/paper/paper.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: 'Course Materials for Data Science in Practice'
 3 | tags:
 4 |   - data science
 5 |   - python
 6 | authors:
 7 |   - name: Thomas Donoghue
 8 |     orcid: 0000-0001-5911-0472
 9 |     affiliation: "1"
10 |   - name: Bradley Voytek
11 |     orcid: 0000-0003-1640-2525
12 |     affiliation: "1, 2, 3"
13 |   - name: Shannon Ellis
14 |     orcid: 0000-0002-9231-0481
15 |     affiliation: "1, 2"
16 | affiliations:
17 |  - name: Department of Cognitive Science, UC San Diego
18 |    index: 1
19 |  - name: Halıcıoğlu Data Science Institute, UC San Diego
20 |    index: 2
21 |  - name: Neurosciences Graduate Program, UC San Diego
22 |    index: 3
23 | 
24 | date: 1 July 2020
25 | bibliography: paper.bib
26 | ---
27 | 
28 | # Summary
29 | 
30 | Data Science in Practice is a collection of openly available materials, including tutorials and assignments, for learning how to integrate the many skills of data science. The course materials focus on the day-to-day practicalities of hands-on data science, with a particular emphasis on gaining a working familiarity with real-world applications and gaining a 'data intuition'. This collection of materials was originally developed for a course at UC San Diego designed to teach creative and practical data science at scale [@donoghue_teaching_2020]. The materials for this course have been updated and made publicly available, and are hosted at: https://datascienceinpractice.github.io/.
31 | 
32 | Topics covered in the data science in practice tutorials include:
33 | 
34 | * An introduction to relevant tooling, including Python, the scientific Python ecosystem, version control, GitHub, and Jupyter notebooks.
35 | * Practicalities of working with data, including finding data, evaluating data sources, gathering data, data wrangling and data cleaning.
36 | * Concepts and concerns for using and interpreting data, including data ethics, privacy, and security.
37 | * Introductory statistical concepts, including distributions, statistical tests, and testing statistical properties and assumptions in real datasets.
38 | * Simple data analyses, such as linear models, clustering, dimensionality reduction and classification.
39 | * How to report on data and analysis, through data visualization and narrative text, with a focus on clear explanations.
40 | 
41 | These topics are further explored in available assignments, which cover:
42 | 
43 | * Wrangling, cleaning, and combining multiple messy and heterogeneous datasets.
44 | * Collecting web data, applying data protection policies, anonymizing data, and adversarial attacks for deanonymizing data.
45 | * Data analyses, including statistical analyses, applying linear models, and creating visualizations.
46 | 
47 | The materials are developed in the Python (>= 3.6) programming language, using a standard collection of packages in the scientific Python environment, which can be installed using the Anaconda distribution. All materials are built as Jupyter notebooks, with the assignments being built with the nbgrader extension [@jupyter_nbgrader_2019]. All the materials are hosted online, using the Jupyter Book tool [@executable_books_community_2020_4539666], from which all the source notebooks can be downloaded to be run locally.
48 | 
49 | # Statement of Need
50 | 
51 | The field of data science has been rapidly expanding, creating a need for accessible and scalable materials. There is high interest in instruction in data science, and a need in both academia and industry for trained and skilled practitioners. Developing such skills requires hands-on experience and expertise. To address this need, the materials here are focused on practical code-based tutorials, and guided assignments that allow users to practice applying the topics and ideas under study.
52 | 
53 | There are many available resources for topics within and related to data science, including dedicated tutorials for data science tools and software packages. What can still be difficult, for the novice, is learning how to find and navigate through these materials. A key goal of this course and these materials is to offer a curated introduction to the many topics and available tools, and some initial guided work to make sure users can start to engage with the many aspects of data science. Throughout the course materials, there are many links to other resources. The goal is that these materials be a starting place for the potential user, and a launching off point to the many other more specific resources and tutorials available.
54 | 
55 | Data science is an interdisciplinary field, requiring expertise from across a range of relevant fields - including technical aspects such as software, computation, statistics, mathematics and machine learning, as well as topics such as research design, contextual understanding of data, ethics, and an understanding of the potential impacts. These materials aim to encompass these multiple elements of data science, focusing not only on the technical aspects of doing data science, but also acknowledging and emphasizing the social impacts and responsibilities of practicing data scientists. These materials are part of an emerging field of integrated data science, as compared to more traditional courses and materials that focus on, for example, detailed machine learning or computation.
56 | 
57 | # Instructional Design
58 | 
59 | This set of materials was originally created as core materials for a university course, Data Science in Practice, taught at UC San Diego [@donoghue_teaching_2020]. This course was first taught in the Spring of 2017 and has about 400 students per iteration. The scale of this course originally prompted the development of standalone materials and assignments, that we are now making more generally available.
60 | 
61 | The full course is supplemented by lectures and lab sections, and is designed as a project-based course. Students work through the materials and assignments presented here, with the goal of building towards doing realistic data science projects. In these projects, students must find openly available datasets, develop a proposal, and then execute analyses to come to an answer. Students must then contextualize the results as a computational notebook that lists their questions and hypotheses, background, ethical considerations, data sources and reliability, results, and conclusion, intermixed with the code and visualizations used to perform the analyses.
62 | 
63 | In order to encourage users of the public website to also continue to pursue independent data science projects, the website also includes a description of the project outline from the course. This includes guidelines for how to complete data-driven projects using openly available datasets and tools, including listings of available data repositories.
64 | 
65 | # Conclusion
66 | 
67 | Altogether, these materials offer a general, hands-on introduction to the practice of data science, and its many facets. These materials serve as a complement to many other resources dedicated more specifically to technical skills, and aim to introduce and contextualize the interdisciplinary nature and practical components of working with real-world data.
68 | 
69 | # Acknowledgments
70 | 
71 | We would like to thank the course staff, including the instructional assistants, teaching assistants, and instructors who have contributed time, work, and ideas to this course and these materials, as well as all the students who have taken this course and provided valuable feedback.
72 | 
73 | # References
74 | 


--------------------------------------------------------------------------------