├── README.md
├── data
├── Daily-Ballot-Returns.pdf
├── battleground-state-changes.csv
├── la-homes.csv
├── or_df_1.csv
├── oregon_ballot_returns.rda
├── penguins.csv
├── uselections.csv
└── vote_cov.csv
├── homework
├── figs
│ ├── adelie-hist.png
│ ├── dragons-love-tacos.png
│ ├── elements-as-types.png
│ ├── rubiks-cube-keyed.png
│ ├── rubiks-cube.png
│ └── seaborn-overview.png
├── hw-01.md
├── hw-02.md
├── hw-02.pdf
├── hw-03.md
├── hw-03.pdf
├── hw-03_files
│ └── figure-gfm
│ │ └── unnamed-chunk-1-1.png
├── hw-04.md
├── hw-04.pdf
├── hw-04_files
│ └── figure-gfm
│ │ └── unnamed-chunk-1-1.png
├── hw-05.md
├── hw-05.pdf
├── hw-05_files
│ └── figure-gfm
│ │ └── unnamed-chunk-8-1.png
├── hw-06.md
├── hw-06.pdf
├── hw-07.md
├── hw-07.pdf
├── hw-10.md
├── hw-10.pdf
├── hw-10_files
│ └── figure-gfm
│ │ ├── unnamed-chunk-10-1.png
│ │ ├── unnamed-chunk-3-1.png
│ │ ├── unnamed-chunk-4-1.png
│ │ ├── unnamed-chunk-5-1.png
│ │ ├── unnamed-chunk-6-1.png
│ │ ├── unnamed-chunk-7-1.png
│ │ ├── unnamed-chunk-8-1.png
│ │ └── unnamed-chunk-9-1.png
├── hw-12.md
├── hw-12.pdf
├── hw-13.md
└── hw-13.pdf
├── labs
├── figs
│ ├── david-mansion-LA.jpg
│ ├── oak-pdx-flight.png
│ ├── or-plot-1.png
│ ├── or-plot-2.png
│ ├── or-plot-3.png
│ ├── oregon-votes.png
│ ├── penguin-ice-cubes.png
│ ├── r4ds-ds-cycle.png
│ ├── square-grid.png
│ ├── tiny-airplane.png
│ ├── trump-biden-votes.png
│ ├── turtle.png
│ └── tux-penguin.jpg
├── lab-01.Rmd
├── lab-01.md
├── lab-01.pdf
├── lab-02.md
├── lab-02.pdf
├── lab-02_files
│ └── figure-gfm
│ │ ├── unnamed-chunk-5-1.png
│ │ └── unnamed-chunk-5-2.png
├── lab-03.md
├── lab-03.pdf
├── lab-04.md
├── lab-04.pdf
├── lab-05.md
├── lab-05.pdf
├── lab-05_files
│ └── figure-gfm
│ │ ├── unnamed-chunk-1-1.png
│ │ ├── unnamed-chunk-2-1.png
│ │ └── unnamed-chunk-2-2.png
├── lab-06.md
├── lab-06.pdf
├── lab-07.md
├── lab-07.pdf
├── lab-08.md
├── lab-08.pdf
├── lab-09.md
├── lab-09.pdf
├── lab-10.md
├── lab-10.pdf
├── lab-11.md
└── lab-11.pdf
├── lectures
├── 01-peek-into-python-code.Rmd
├── 01-peek-into-python-slides.Rmd
├── 01-peek-into-python-slides.html
├── 01-peek-into-python-slides.pdf
├── 02-types-and-structures-code.Rmd
├── 02-types-and-structures-slides.Rmd
├── 02-types-and-structures-slides.html
├── 02-types-and-structures-slides.pdf
├── 03-functions-methods-code.Rmd
├── 03-functions-methods-slides.Rmd
├── 03-functions-methods-slides.html
├── 03-functions-methods-slides.pdf
├── 03-functions-methods-slides_files
│ └── figure-html
│ │ ├── unnamed-chunk-12-1.png
│ │ ├── unnamed-chunk-13-1.png
│ │ ├── unnamed-chunk-16-1.png
│ │ ├── unnamed-chunk-18-1.png
│ │ ├── unnamed-chunk-18-2.png
│ │ ├── unnamed-chunk-18-3.png
│ │ ├── unnamed-chunk-18-4.png
│ │ ├── unnamed-chunk-20-1.png
│ │ ├── unnamed-chunk-20-2.png
│ │ ├── unnamed-chunk-20-3.png
│ │ └── unnamed-chunk-20-4.png
├── 04-control-flow-notes.Rmd
├── 04-control-flow-slides.Rmd
├── 04-control-flow-slides.html
├── 04-control-flow-slides.pdf
├── 05-numpy-arrays-code.Rmd
├── 05-numpy-arrays-slides.Rmd
├── 05-numpy-arrays-slides.html
├── 05-numpy-arrays-slides.pdf
├── 06-pandas-dataframes-code.Rmd
├── 06-pandas-dataframes-slides.Rmd
├── 06-pandas-dataframes-slides.html
├── 06-pandas-dataframes-slides.pdf
├── 06-pandas-dataframes-slides_files
│ └── figure-html
│ │ └── unnamed-chunk-2-1.png
├── 06-pandas-dataframes.html
├── 06-pandas-dataframes_files
│ └── figure-html
│ │ ├── unnamed-chunk-1-1.png
│ │ └── unnamed-chunk-2-1.png
├── 07-pandas-2-notebook.Rmd
├── 07-pandas-2-practice.Rmd
├── 07-pandas-2-slides.Rmd
├── 08-pandas-3-notebook.Rmd
├── 08-pandas-3-practice.Rmd
├── 09-reshape-matplotlib-notebook.Rmd
├── 09-reshape-matplotlib-practice.Rmd
├── 10-matplotlib-seaborn-notebook.Rmd
├── 10-matplotlib-seaborn-notebook.ipynb
├── 10-matplotlib-seaborn-practice.Rmd
├── 11-linear-models-notebook.Rmd
├── 12-predictive-modeling.Rmd
├── 12-predictive-modeling.ipynb
├── 13-classes-and-methods_files
│ └── figure-html
│ │ ├── unnamed-chunk-10-1.png
│ │ ├── unnamed-chunk-10-2.png
│ │ ├── unnamed-chunk-10-3.png
│ │ ├── unnamed-chunk-10-4.png
│ │ └── unnamed-chunk-8-1.png
├── 13-classes-and-objects-practice.Rmd
├── 13-classes-and-objects-slides.Rmd
├── 13-classes-and-objects-slides.html
├── 13-classes-and-objects-slides_files
│ └── figure-html
│ │ ├── .unnamed-chunk-10-3.png-V1UM
│ │ ├── unnamed-chunk-10-1.png
│ │ ├── unnamed-chunk-10-2.png
│ │ ├── unnamed-chunk-10-3.png
│ │ ├── unnamed-chunk-10-4.png
│ │ └── unnamed-chunk-8-1.png
├── 14-classes-and-objects-2-slides.Rmd
├── 14-classes-and-objects-2-slides.html
├── cal.css
├── figs
│ ├── 2-fold-CV.png
│ ├── 5-fold-CV.png
│ ├── bonneville.jpg
│ ├── class-hierarchies.png
│ ├── comparison-operators.png
│ ├── data-structures.png
│ ├── equality.png
│ ├── flight-delay-evidence.png
│ ├── hw12-ii.png
│ ├── hw13-10.png
│ ├── hw3-turtle.png
│ ├── identity.png
│ ├── join-inner.png
│ ├── join-outer.png
│ ├── lab-3-sc1.png
│ ├── lab-3-sc2.png
│ ├── learning-languages.png
│ ├── list-indexing-bah.gif
│ ├── list-indexing.png
│ ├── math-operators.png
│ ├── mic-drop.gif
│ ├── mutatr.png
│ ├── numpy-logo.png
│ ├── oop-cars.png
│ ├── or-plot-1.png
│ ├── or-plot-2.png
│ ├── pandas-logo.png
│ ├── pascals-triangle.png
│ ├── plotly-gallery.png
│ ├── polyforce.png
│ ├── r-python-diagram.png
│ ├── seaborn-overview.png
│ ├── slido-qr.png
│ └── tidy-1.png
└── libs
│ ├── anchor-sections-1.0
│ ├── anchor-sections.css
│ └── anchor-sections.js
│ ├── header-attrs-2.5
│ └── header-attrs.js
│ └── remark-css-0.0.1
│ └── metropolis.css
├── libs
└── remark-css-0.0.1
│ └── metropolis.css
├── small-notes
├── pointers-identity-equality.Rmd
└── pointers-identity-equality.md
└── syllabus.md
/README.md:
--------------------------------------------------------------------------------
1 | # Python for R Users
2 |
3 | **STAT 198/298**
4 | **UC Berkeley Fall 2020**
5 |
6 | This repository contains all of the course materials for a short course that aims to introduce students adept at programming in R to the Python language.
7 |
8 | 
This work is licensed under a Creative Commons Attribution-NonCommercial 4.0 International License.
--------------------------------------------------------------------------------
/data/Daily-Ballot-Returns.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/data/Daily-Ballot-Returns.pdf
--------------------------------------------------------------------------------
/data/or_df_1.csv:
--------------------------------------------------------------------------------
1 | ,Statewide Ballot Returns by,Oct 16,Oct 19,Oct 20,Oct 21,Oct 22,Oct 23,Oct 26,Oct 27,Oct 28,Oct 29,Oct 30,Nov 2,Nov 3
2 | 0,Baker,105,912,"1,428",843,613,675,"1,202",264,770,0,,,
3 | 1,Benton,"1,991","3,713","7,568","7,276","4,315","2,969","5,723","4,729","1,218",0,,,
4 | 2,Clackamas,"4,653","39,591","14,104","36,995","16,239","24,960","17,188","23,202","9,462",78,,,
5 | 3,Clatsop,51,"3,845","1,357","3,200","1,993","2,048","2,769",647,"1,519",1,,,
6 | 4,Columbia,"1,032","3,745","2,265","4,023","1,701","2,092","3,965",872,"1,196",0,,,
7 | 5,Coos,179,"2,944","4,069","1,907","5,212","3,614","4,592","1,449","2,983",0,,,
8 | 6,Crook,386,"1,729","1,503","1,119","1,272","1,603","1,885","1,146",263,0,,,
9 | 7,Curry,"2,263","1,909","1,243",960,"1,551","1,243","1,282",734,657,0,,,
10 | 8,Deschutes,"2,754","13,221","4,692","12,826","3,819","18,633","15,597","8,367","9,906","2,359",,,
11 | 9,Douglas,154,"5,664","5,278","4,682","9,274","6,606","7,173","4,839","3,989",0,,,
12 | 10,Gilliam,83,172,64,60,123,41,150,42,101,0,,,
13 | 11,Grant,13,493,674,644,255,317,513,342,166,0,,,
14 | 12,Harney,255,661,369,387,230,254,561,293,261,25,,,
15 | 13,Hood River,149,"1,600","1,563","1,234","1,294",839,"1,602",564,738,0,,,
16 | 14,Jackson,848,"9,888","12,439","14,186","9,859","11,736","14,485","8,270","7,159",0,,,
17 | 15,Jefferson,180,"1,040","1,008",865,"1,077",853,"1,699",617,936,0,,,
18 | 16,Josephine,123,11,"4,446","5,206","5,791","3,637","5,527","6,010","3,633",211,,,
19 | 17,Klamath,"1,122","3,935","2,444","3,783","3,114","2,705","2,727","3,008","2,286",478,,,
20 | 18,Lake,54,460,359,390,371,376,445,190,311,0,,,
21 | 19,Lane,"3,162","26,115","18,103","24,333","22,154","23,435","15,964","11,895","13,515",0,,,
22 | 20,Lincoln,"2,900","4,403","2,784","2,819","2,216","2,483","2,994",834,"1,700",93,,,
23 | 21,Linn,"1,522","5,358","7,344","8,097","6,003","8,517","4,454","3,428","5,019",0,,,
24 | 22,Malheur,402,931,794,"1,342",909,"1,362",711,800,791,63,,,
25 | 23,Marion,"8,749","19,552","10,308","16,185","10,685","18,912","9,390","10,457","9,126",0,,,
26 | 24,Morrow,191,685,369,413,380,443,530,309,277,17,,,
27 | 25,Multnomah,"59,937","49,738","34,286","39,769","27,503","43,123","30,499","31,130","26,348",0,,,
28 | 26,Polk,248,"1,288","4,200","6,900","5,363","3,787","6,152","2,183","3,092",273,,,
29 | 27,Sherman,70,143,161,76,74,57,149,78,68,0,,,
30 | 28,Tillamook,149,"3,293","1,082","2,303",626,"1,738","1,487",925,831,12,,,
31 | 29,Umatilla,548,"2,773","3,314","2,583","2,382","2,862","5,169","2,073",62,2,,,
32 | 30,Union,193,"1,828",803,"1,011","1,823",870,"1,847","1,688",507,0,,,
33 | 31,Wallowa,0,377,621,421,288,586,732,43,690,0,,,
34 | 32,Wasco,161,"1,020","2,278","1,275",811,"1,105","1,694",590,944,0,,,
35 | 33,Washington,"1,246","44,779","19,894","25,244","33,789","42,543","22,795","21,961","15,210",18,,,
36 | 34,Wheeler,105,152,66,66,38,44,111,70,33,0,,,
37 | 35,Yamhill,"4,626","6,013","4,117","5,894","3,565","1,791","7,989","2,775",167,0,,,
38 | 36,,,,,,,,,,,,,,
39 | 37,Number of ballots returned on this day,"100,604","263,981","177,397","239,317","186,712","238,859","201,752","156,824","125,934","3,630",,,
40 | 38,Daily return as % of total ballots,3.4%,8.9%,6.0%,8.1%,6.3%,8.1%,6.8%,5.3%,4.3%,0.1%,,,
41 | 39,Daily return as % of total return,5.9%,15.6%,10.5%,14.1%,11.0%,14.1%,11.9%,9.3%,7.4%,0.2%,,,
42 | 40,Cumulative number of ballots returned,"100,604","364,585","541,982","781,299","968,011","1,206,870","1,408,622","1,565,446","1,691,380","1,695,010",,,
43 | 41,Cumulative return as % of total ballots,3.4%,12.4%,18.4%,26.5%,32.8%,40.9%,47.8%,53.1%,57.3%,57.5%,,,
44 | 42,Cumulative return as % of total return,5.9%,21.5%,32.0%,46.1%,57.1%,71.2%,83.1%,92.4%,99.8%,100.0%,,,
45 |
--------------------------------------------------------------------------------
/data/oregon_ballot_returns.rda:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/data/oregon_ballot_returns.rda
--------------------------------------------------------------------------------
/homework/figs/adelie-hist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/figs/adelie-hist.png
--------------------------------------------------------------------------------
/homework/figs/dragons-love-tacos.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/figs/dragons-love-tacos.png
--------------------------------------------------------------------------------
/homework/figs/elements-as-types.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/figs/elements-as-types.png
--------------------------------------------------------------------------------
/homework/figs/rubiks-cube-keyed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/figs/rubiks-cube-keyed.png
--------------------------------------------------------------------------------
/homework/figs/rubiks-cube.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/figs/rubiks-cube.png
--------------------------------------------------------------------------------
/homework/figs/seaborn-overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/figs/seaborn-overview.png
--------------------------------------------------------------------------------
/homework/hw-01.md:
--------------------------------------------------------------------------------
1 | Homework 1
2 | ================
3 |
4 | ## Getting started
5 |
6 | In order to configure your machine to run both R and Python, you’ll need
7 | to do the following steps:
8 |
9 | 1. Download and install R (or update if you already have it)
10 | 2. Download and install RStudio (or update if you already have it)
11 | 3. Download and install the `reticulate` package.
12 | 4. If it doesn’t prompt you to install miniconda, run
13 | `reticulate::install_miniconda()`.
14 |
15 | If you have issues running Python commands from an Rmarkdown file,
16 | please come by office hours or get in touch with me.
17 |
--------------------------------------------------------------------------------
/homework/hw-02.md:
--------------------------------------------------------------------------------
1 | Homework 2
2 | ================
3 |
4 |
5 |
6 | This homework will provide some practice with aspects of Python’s types
7 | and data structures. Give each of these an earnest try in your Rmd
8 | before turning to stack overflow. You’re also encouraged to post your
9 | question to Piazza - collaboration there on homeworks and labs is
10 | encouraged, including sharing approaches to solving questions. If you do
11 | use an external resource, be sure to cite it (include the url).
12 |
13 | ## Types
14 |
15 | Types or simple types are the primary forms that a single object in
16 | Python can take. They form the building blocks of more complex data
17 | structures.
18 |
19 | 1. What are the legal values that a Boolean type object can take in
20 | Python to indicate *true* and *false*? How about in R?
21 |
22 | 2. How would you check to see if a particular integer in Python is odd?
23 | (there are a few ways to skin this cat)
24 |
25 | 3. We’ve seen how you can use `and` and `or` to compose two Boolean
26 | type objects and output another Boolean. `xor` or “exclusive or” is
27 | a third option and though it exists in R as `xor()`, it doesn’t in
28 | Python. Build up an expression that performs `xor` in Python using
29 | `and` and `or`.
30 |
31 | 4. Provide both the Python and R approaches of using `"hello"` and `"
32 | world"` to output
33 |
34 | 1. `"hello world"`
35 | 2. `"hellohellohello"`
36 | 3. `"w"`
37 | 4. `"olleh"`
38 |
39 | 5. We saw that in Python it is possible to concatenate and replicate
40 | strings using arithmetic operators. Do the update assignment
41 | operators also work on strings? Try it.
42 |
43 | 6. See Piazza for the note related to string comparison in Python.
44 |
45 | 1. What does the output of `"Hi" < "lo"` and `"hi" < "Lo"` suggest
46 | about how the integers in the ASCII table associated with capital
47 | letters compare to those of lowercase letters? Check your answer
48 | with `ord()`.
49 | 2. Place the following characters in order from least to greatest:
50 | `$`, `+`, `-`, `~`.
51 |
52 | 7. We discussed the important distinction between the notion of
53 | `equality` and `identity`. How do you check for these in R?
54 |
55 | ## Data Structures
56 |
57 | Data structures can also be called compound types or, frustratingly,
58 | just types (thus we still use `type()` to query them). The important
59 | distinction from the simple types is that they serve as structured
60 | containers for multiple simple objects. Each one is defined by its
61 | particular structure in terms of how indexing can be done, if at all,
62 | and if they are mutable, or capable of being changed without reassigning
63 | them.
64 |
65 | 1. Consider the list, `a = [1, 3, 5, ["a", "b"]]`.
66 |
67 | 1. Replace the first element with the integer `99`.
68 |
69 | 2. Extract the fourth element as a list using two approaches:
70 | positive and negative indexing.
71 |
72 | 3. Extract `"a"` as a list with one element, a string.
73 |
74 | 4. Extract from the list the elements and sub-element needed to
75 | form a new list: `[1, 3, "b"]`.
76 |
77 | 5. In one line of code, reverse the order of the elements and
78 | subelements of the list so that the result is `[["b",
79 | "a"], 5, 3, 1]`.
80 |
81 | 2. What happens if you try to use negative indexing on a vector in R?
82 | Please show an example.
83 |
84 | 3. What are the closest R analogs to each of the four basic data
85 | structures in Python? Are they perfect matches? If not, how do they
86 | differ? You can get `reticulate`’s opinion on this by bringing
87 | different data structures back and forth between R and Python using
88 | `py$` and `r.`
89 |
90 | 4. The central data structure in R is the dataframe. Construct an
91 | analog in Python to this snippet of the `mtcars` dataframe in R
92 | (ignore the row names): `mtcars[1:3, 1:3]`.
93 |
94 | 5. Dictionaries look an awful lot like sets but with a key associated
95 | with each value in the set. Do set operations work on dictionaries?
96 | Try it out.
97 |
98 | ## Farther afield
99 |
100 | 1. Why does Python use zero-based indexing? Propose a possible
101 | rationale of your own, then look online to see what other people
102 | think.
103 |
104 | 2. What is a factor in R? A type? A data structure?
105 |
106 | 3. What are the type heterogeneous data structures and type homogeneous
107 | structures in R? Why do you think R in particular has so many type
108 | homogeneous structures? (feel free to speculate)
109 |
--------------------------------------------------------------------------------
/homework/hw-02.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-02.pdf
--------------------------------------------------------------------------------
/homework/hw-03.md:
--------------------------------------------------------------------------------
1 | Homework 3
2 | ================
3 |
4 |
5 |
6 | ## Functions
7 |
8 | 1. The [Laplace
9 | Distribution](https://en.wikipedia.org/wiki/Laplace_distribution) is
10 | a continuous probability distribution with a shape that’s determined
11 | by a center parameter \(\mu\) and a spread parameter \(b\). Write a
12 | function that returns the probability density of the Laplace
13 | distribution evaluated at some real number value `x` and call the
14 | function `lap_pdf`. Allow the user to pass two optional arguments
15 | specifying the parameters of the distribution that are otherwise set
16 | to default values of 0 and 1 respectively. Test out your function at
17 | a few values of `x` and a couple parameter values to test if it’s
18 | working the way you expect it to.
19 |
20 | 2. Consider the following function:
21 |
22 | ``` python
23 | x = 1
24 | def f(y):
25 | return x + y
26 | ```
27 |
28 | 1. Does this function have access to the value of `x` in the global
29 | namespace even though you haven’t passed it in as an argument?
30 | Write an analogous function in R and compare the behavior.
31 | 2. Add a line to the function before the `return` where you
32 | reassign `x` to have a new value. Does running the function
33 | change the value that `x` takes in the global namespace? Compare
34 | this behavior to R.
35 | 3. Add one more line above the previous one to declare that `x` is
36 | a global variable (as opposed to a local variable): `global x`.
37 | How does this change the behavior of the function? Compare it to
38 | a similar modification in R where you instead reassign `x` with
39 | the super-assignment operator: `<<-`.
40 |
41 | 3. In class I gave an ill-conceived poll that asked you to, “Write a
42 | function that takes a given string, and outputs it as a single
43 | string repeated `n` times, each one separated by a `.`.” We saw that
44 | a naive implementation does not work:
45 |
46 | ``` python
47 | def dotted_print(x, n):
48 | return print(x * n, sep = ".")
49 |
50 | dotted_print("hello", 3)
51 | ```
52 |
53 | ## hellohellohello
54 |
55 | The reason is that `x * n` is evaluated first, which smushes the
56 | repeated strings together to form a single scalar string. With only
57 | one string there (albeit one with repetitions inside it), `print()`
58 | has nothing to glue together with a `.`.
59 |
60 | You can fix this behavior by being sure that each repeated string
61 | remains a separate element before getting passed to `print()` and
62 | that print recognizes each of those elements to be separate. You can
63 | get there by incorporating two ideas:
64 |
65 | - As we’ve seen, you can use `*` and `+` operators with string
66 | types to repeat and concatenate them. They can also be used on
67 | list types with a similar functionality. The list type is one
68 | way to preserve the separation between the repeated strings.
69 | - The `*` character can also be used before a variable passed to a
70 | function (this character keeps very busy in Python\!). This can
71 | be read to mean, “expand this as a sequence” (see p. 43 in
72 | *Whirlwind*). If you look in `?print`, this allows you to pass a
73 | compound data structure not as a single `value`, but as multiple
74 | values that take advantage of the `...`.
75 |
76 | Use these ideas to fix `dotted_print()`.
77 |
78 | ## Methods
79 |
80 | 1. Consider the following list of strings:
81 |
82 | ``` python
83 | l = ["my", "it's", "smokey", "out"]
84 | ```
85 |
86 | 1. Query two attributes of this list that are of interest to you.
87 | 2. Use the `.append()` list method to add the string `"today"` to
88 | this list.
89 | 3. Use list comprehension (see Lab 2) and string methods to
90 | capitalize each of the words in this list.
91 | 4. Construct one additional list comprehension that uses a string
92 | method of your choice and asserts a logical condition for the
93 | operation using the `if` keyword.
94 |
95 | 2. One of the most common places that you’ll find object-oriented
96 | programming in R is in print methods. Any time you type the name of
97 | a variable at the console, it often doesn’t actually print out the
98 | value of that object, but rather calls on its particular print
99 | method.
100 |
101 | 1. Create a linear model object using `lm()` (you’re welcome to use
102 | the `mtcars` example from the slides) and call it `m1`. What is
103 | the class of `m1`? What is its type (use `typeof()`)? What do
104 | you get when you print it to the console by either just typing
105 | the name of the object or using `print()`? Compare that with
106 | what you get if you print the object using `print.default()`.
107 | 2. Run the `summary()` of the model object and save it as `s1`.
108 | Answer the same questions as above for `s1`.
109 | 3. Create a ggplot object and save it as `p1`. Answer the same
110 | questions as above for `p1`. What command would you use to pull
111 | up the help file for the print method of a ggplot?
112 |
--------------------------------------------------------------------------------
/homework/hw-03.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-03.pdf
--------------------------------------------------------------------------------
/homework/hw-03_files/figure-gfm/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-03_files/figure-gfm/unnamed-chunk-1-1.png
--------------------------------------------------------------------------------
/homework/hw-04.md:
--------------------------------------------------------------------------------
1 | Homework 4
2 | ================
3 |
4 |
5 |
6 | 1. Consider the list `l = [2, 8, 45, 11, 5, 0]`. Check whether each
7 | element in the list is even and return the result, either true or
8 | false, using …
9 |
10 | 1. a `for` loop where you assign your variable to take the *index*
11 | of the loop,
12 | 2. a `for` loop where you assign your variable to take the *values*
13 | in the list directly, and
14 | 3. Also implement this in R.
15 |
16 | 2. Consider the lists `animals = ["penguins", "turtles", "dragons"]`
17 | and `foods = ["sandwiches", "ice cream", "tacos"]`. Use two for
18 | loops, one nested within another, and assigning the variables to
19 | take the *values* of the lists, to print the full set of sentence
20 | combinations of the form “penguins love sandwiches”, “penguins
21 | love ice cream”, …, “dragons love tacos”.
22 |
23 | 3. Repeat this exercise but this time use a single for loop where the
24 | variable takes the *index* of the loop.
25 |
26 | 4. We saw in lecture that Python has a structure called an *iterator*
27 | that acts like a list for the purpose of iteration, but it doesn’t
28 | actually allocate that full list. The example we saw was `range()`.
29 | Use the `enumerate()` iterator (see p. 55 in *Whirlwind*) to print
30 | out the index of the elements of `l`, their value, and if they’re
31 | even.
32 |
33 | 5. Although the result won’t be quite the same, repeat exercise 2 using
34 | the `zip()` iterator (see p. 56 in *Whirlwind*).
35 |
36 | 6. In R, create a simple linear model using whatever data set you like
37 | and call it `m1`. In addition to being an object of class `lm`, it
38 | is, more fundamentally, a named list.
39 |
40 | 1. Extract the names of the elements in the list.
41 | 2. Extract the model coefficients in two ways: dollar-sign notation
42 | and square bracket notation by name. Are the results different?
43 | If so, how?
44 | 3. Use the `summary()` method on `m1` to create a new list and call
45 | it `s1`. Extract the names of the elements of this list.
46 | 4. Extract the `coefficients` element of `s1`. What is its type? Its
47 | class?
48 | 5. Using the fact that “Everything in R is a vector”, use single
49 | bracket vector subsetting to extract the p-value corresponding
50 | to the slope coefficient.
51 |
--------------------------------------------------------------------------------
/homework/hw-04.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-04.pdf
--------------------------------------------------------------------------------
/homework/hw-04_files/figure-gfm/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-04_files/figure-gfm/unnamed-chunk-1-1.png
--------------------------------------------------------------------------------
/homework/hw-05.md:
--------------------------------------------------------------------------------
1 | Homework 5
2 | ================
3 |
4 |
5 |
6 | **Slicing, Viewing, and Copying Arrays**
7 |
8 | 1. Create two numpy arrays that are 3 x 3 identity matrices:
9 |
10 | - `I_3A`: a coerced list of lists
11 | - `I_3B`: built using the constructor method `eye()`.
12 |
13 | Be sure that both arrays are of type integer
14 | ([Section 2.1](https://jakevdp.github.io/PythonDataScienceHandbook/02.01-understanding-data-types.html)
15 | in Python Data Science Handbook may be helpful here. You can query
16 | the data type attribute of an array by calling the relevant
17 | attribute). Check your work by testing the equality of the two
18 | arrays.
19 |
20 | 2. Use slice indexing on `I_3A` to create a new matrix `I_2_UL`, a 2 x
21 | 2 identity matrix, from the upper-left corner of `I_3A`. Does
22 | creating this object change the nature of `I_3A`?
23 |
24 | 3. Use slice indexing to change in place the values of `I_2_UL` so that
25 | it is a matrix of all `1`s. Does this change the nature of `I_3A`?
26 |
27 | 4. Similar to exercises 2 and 3, use slice indexing on `I_3A` to create
28 | a new matrix `I_2_LR` from the lower right corner of `I_3A`. This
29 | time, however, append the `.copy()` method to your slice indexing
30 | when you’re creating the new array. Use indexing to change in place
31 | the values of `I_2_LR` so that it is a matrix of all 1s. Does this
32 | change the nature of `I_3A`?
33 |
34 | **Broadcasting and vector recycling**
35 |
36 | 5. Consider the following arrays:
37 |
38 | ``` python
39 | A = np.array([1])
40 | B = np.array([1, 2, 3])
41 | C = np.array([[4],
42 | [5],
43 | [6]])
44 | ```
45 |
46 | 1. What is the *shape* of each array?
47 | 2. Try adding each pair of arrays together and observe the result.
48 | In a few sentences, describe precisely how Python carries out
49 | operations on arrays that differ in their shape.
50 | 3. Demonstrate what happens when you add an array of shape `(2, 3)`
51 | to its transpose.
52 |
53 | 6. While that behavior in Python is called *broadcasting*, the nearest
54 | analog in R is called *vector recycling*. Try adding each pair of
55 | the following objects together; some of them are vectors, some are
56 | matrices (special cases of arrays). Based upon the results, precisely
57 | describe how/if R carries out operations on vectors and matrices of
58 | different shape.
59 |
60 | ``` r
61 | A_vec <- 1
62 | A2_vec <- c(1, 2)
63 | B_vec <- c(1, 2, 3)
64 | B_mat <- matrix(c(1, 2, 3), nrow = 1)
65 | C_vec <- c(4, 5, 6)
66 | C_mat <- matrix(c(4, 5, 6), ncol = 1)
67 | ```
68 |
69 | 7. What follows is a simple scatterplot that features one of the most
70 | common errors made by new users of `{ggplot2}`. Note, though, that it
71 | doesn’t throw an error\! How can this behavior be explained by
72 | vector recycling?
73 |
74 | ``` r
75 | library(ggplot2)
76 | ggplot(mtcars, aes(x = hp, y = mpg, color = "blue")) +
77 | geom_point()
78 | ```
79 |
80 |
81 |
82 | **Aggregated Operations**
83 |
84 | 8. Use `x = np.random.normal(loc = 7, scale = 2, size = (12, 4))` to
85 | generate a 12 x 4 array of random variables, each drawn from
86 | \(N(\mu = 7, \sigma = 2)\). Treat this as 12 observations on 4
87 | variables.
88 |
89 | 1. Use `dir()` to remind yourself of the methods available to a
90 | numpy array. Using the appropriate method, find the mean,
91 | standard deviation, and maximum values within each column.
92 | 2. Using the results of these computations, create a new 12 x 4
93 | array called `Z` that is the z-scores corresponding to each of
94 | the observations. Check that it is correct by computing its
95 | columnwise mean and standard deviation.
96 | 3. Explain how broadcasting was involved in the computations from
97 | the previous exercise.
98 |
--------------------------------------------------------------------------------
/homework/hw-05.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-05.pdf
--------------------------------------------------------------------------------
/homework/hw-05_files/figure-gfm/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-05_files/figure-gfm/unnamed-chunk-8-1.png
--------------------------------------------------------------------------------
/homework/hw-06.md:
--------------------------------------------------------------------------------
1 | Homework 6
2 | ================
3 |
4 |
5 |
6 | 1. At this point, we’ve seen a slew of different data structures in
7 | Python and R. It’s worth taking a moment to sit down and lay them
8 | out in a manner that hopefully reveals their similarities and
9 | differences.
10 |
11 | Construct a visual representation of the properties and relationships
12 | between the data structures in both languages. This could take the
13 | form of a [mind map](https://en.wikipedia.org/wiki/Mind_map) or a
14 | [flowchart](https://en.wikipedia.org/wiki/Flowchart), or whatever
15 | other representation that you think best conveys the picture. You
16 | may want to start with a draft to try out your layout before
17 | spending time on the final version. You can use any technology you
18 | like: pencil and paper, powerpoint, R (there is a useful package
19 | called `diagrammer`), etc., but include a picture of the final
20 | version in your Rmd/pdf document.
21 |
22 | Include the following data structures:
23 |
24 | **Python**: list, tuple, dictionary, numpy array, pandas series,
25 | pandas dataframe.
26 |
27 | **R**: atomic vectors, matrix/array, list, data frame, tibble.
28 |
29 | 2. In lecture we demonstrated how you can build a pandas dataframe from
30 | a dictionary, but looked at fairly well-behaved examples. Consider
31 | the following dataframe and describe what rules pandas appears to
32 | rely upon to turn an unruly dictionary into a dataframe. Some
33 | suggestions for things to try:
34 |
35 | - Query the types of the columns of a dataframe using the
36 | `.dtypes` attribute.
37 | - Like an array, you can check shape with `.shape`.
38 | - Try pulling the resulting dataframe apart with subsetting and
39 | checking `type()`.
40 |
41 |
42 |
43 | ``` python
44 | import pandas as pd
45 | df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
46 | print(df)
47 | ```
48 |
49 | ## A B
50 | ## 0 [1, 2, 3] 1
51 | ## 1 foo 1
52 | ## 2 [] 1
53 | ## 3 [3, 4] 1
54 |
55 | 3. In addition to building dataframes from dictionaries, you can also
56 | build them from arrays. Convert the following arrays from homework 5
57 | into dataframes and use them to provide examples as you answer the
58 | following questions.
59 |
60 |
61 |
62 | ``` python
63 | import numpy as np
64 | A = np.array([1])
65 | B = np.array([1, 2, 3])
66 | C = np.array([[4],
67 | [5],
68 | [6]])
69 | D = np.array([[4, 1],
70 | [5, 0],
71 | [6, 1]])
72 | ```
73 |
74 | - What happens when you convert an array into a dataframe? How can you
75 | add row and column names? (try looking through `dir()` as well as
76 | consulting the textbook/internet)
77 | - What happens when you add an array to a dataframe that share the
78 | same shape?
79 | - What happens when you add two dataframes that do not share the same
80 | shape? Consider both cases: different number of rows and different
81 | number of columns.
82 |
83 |
84 |
85 | 4. The last problem of the previous homework had you standardize random
86 | normally distributed data using z-scores. Repeat that exercise, but
87 | work with `X` as a dataframe with variables/columns called X1, X2,
88 | X3, and X4.
89 |
--------------------------------------------------------------------------------
/homework/hw-06.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-06.pdf
--------------------------------------------------------------------------------
/homework/hw-07.md:
--------------------------------------------------------------------------------
1 | Homework 7
2 | ================
3 |
4 | For the following exercises, use the data frame created below.
5 |
6 | ``` python
7 | import pandas as pd
8 | df = pd.DataFrame({"var_1": [1, 2, 1, 1], "var_2": [1, 2, 3, 4], "var_3": ["a", "a", "b", "c"]})
9 | ```
10 |
11 | 1. Data frames and series have a method called `.isin()` that makes
12 | certain boolean masks easier to construct. Use `.isin()` to extract
13 | the rows of `df` where `var_3` is either `"a"` or `"b"`.
14 |
15 | 2. Does there appear to be any difference between `.mean()` and
16 | `df.agg(mean)`? Tinker with a few applications of each to `df` and
17 | read the help files to formulate your answer.
18 |
19 | 3. Write a new function called `cube_root` that takes a series and
20 | returns its cube root. Use that within `.agg()` to take the cube
21 | root of `var_2`.
22 |
23 | 4. A very common construction is to follow a `groupby` with an
24 | aggregation operation. Often it’s just a single column of output
25 | that you’re interested in, but you can get there by selecting the
26 | column at the beginning of the operation or at the end. Demonstrate
27 | both approaches using `df` and explain which you think is preferable
28 | and why.
29 |
30 | 5. When you studied numpy arrays, you learned that it makes a
31 | distinction between *viewing* a sub-array versus *copying* it (you
32 | used `.copy()`). Using `df`, determine if the same distinction
33 | applies with pandas data frames.
34 |
35 | 6. Consider the four different approaches to setting the value in the
36 | 1st row, 1st column to `2`. The final method is the preferred
37 | approach, but for the other three, see if you can figure out why
38 | you’re getting those warning messages.
39 |
40 |
41 |
42 | ``` python
43 | df[0:1]["var_1"] = 2
44 | df["var_1"][0:1] = 2
45 | df[["var_1"]][0:1] = 2
46 | df.loc[0, "var_1"] = 2
47 | ```
48 |
--------------------------------------------------------------------------------
/homework/hw-07.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-07.pdf
--------------------------------------------------------------------------------
/homework/hw-10.md:
--------------------------------------------------------------------------------
1 | Homework 10
2 | ================
3 |
4 |
5 |
6 | This homework features a gallery of several canonical plot types made in
7 | `ggplot2`. Your task is to recreate them using `seaborn` and
8 | `matplotlib`. As you work, I recommend using as a reference seaborn’s
9 | [excellent web documentation](https://seaborn.pydata.org/).
10 |
11 | - You’re welcome to use the atomic `seaborn` plot functions
12 | (`scatterplot()`, `boxplot()`, etc.) or their wrapper equivalents
13 | (`relplot()`, `displot()`, and `catplot()`).
14 |
15 | - In terms of aesthetics, do try to match the main elements of the
16 | ggplot: the plot type, line types, the colors, the labels, and the
17 | title. Don’t worry about getting the sizes identical or all of the
18 | axis tick marks to match up.
19 |
20 | - Your code for these plots might involve bits of `pandas`, `seaborn`,
21 | and `matplotlib`. You’re at a good point now where you should be
22 | able to get your code *working* fairly easily, so pay some attention
23 | to getting your code simple, readable, and well-formatted.
24 |
25 | - While reproducing all five of these plots in `seaborn` will give you
26 | the most practice with its functionality, if you are short on time,
27 | it is fine to select three of them to do.
28 |
29 | As you begin this journey from `ggplot2` users to `seaborn` users, I
30 | encourage you to read a [blog
31 | post](https://robinsones.github.io/Better-Plotting-in-Python-with-Seaborn/)
32 | of data scientist Emily Robinson, as she went on a similar journey.
33 |
34 | You’ll want to start by prepping your R session,
35 |
36 | ``` r
37 | library(tidyverse)
38 | library(palmerpenguins)
39 | data(penguins)
40 | penguins <- drop_na(penguins)
41 | ```
42 |
43 | as well as your Python session.
44 |
45 | ``` python
46 | import pandas as pd
47 | import seaborn as sns
48 | import matplotlib.pyplot as plt
49 | # penguins = pd.read_csv("https://raw.githubusercontent.com/andrewpbray/python-for-r-users/master/data/penguins.csv")
50 | # plt.style.available
51 | plt.style.use('ggplot')
52 | ```
53 |
54 | You may want to set up a parallel notebook to allow you to iterate on
55 | your seaborn plots a little more easily. If you do that, it will be
56 | easiest to read in the penguins data from the csv file. Importantly:
57 | currently [Berkeley’s
58 | DataHub](https://datahub.berkeley.edu/user/andrewbray/tree) is using an
59 | older version of `seaborn` and there have been several important updates
60 | for current version `0.11.0`. To use the current version in your
61 | notebook, add a cell at the top with `!pip install seaborn==0.11.*`.
62 |
63 | #### Example: Single histogram
64 |
65 | Below is the `ggplot2` code to generate a histogram and an analog using
66 | `seaborn`. You’ll note that even though they both have 13 bins, the
67 | plots end up slightly different from one another, despite the fact that
68 | they’re using the same data. The difference is that they use two
69 | different rules for determining when to start the first and last bins
70 | after the first and last observations, respectively. You may come across
71 | similar slight differences in the course of this homework; don’t worry
72 | about ironing them out perfectly.
73 |
74 | ``` r
75 | penguins %>%
76 | slice(-1) %>%
77 | filter(species == "Adelie") %>%
78 | ggplot(aes(x = body_mass_g)) +
79 | geom_histogram(bins = 13,
80 | color = "white",
81 | fill = "goldenrod",
82 | alpha = .8) +
83 | xlab("body mass (g)") +
84 | ggtitle("Adelie Penguin Body Mass")
85 | ```
86 |
87 | 
88 |
89 | ``` python
90 | p = sns.histplot(x = "body_mass_g",
91 | data = r.penguins[r.penguins["species"] == "Adelie"].iloc[1:, :],
92 | bins = 13,
93 | color = "goldenrod")
94 | p.set_xlabel("body mass (g)")
95 | p.set_title("Adelie Penguin Body Mass", loc = "left")
96 | ```
97 |
98 |
99 |
100 | 1.
101 |
102 |
103 | ``` r
104 | penguins %>%
105 | ggplot(aes(x = bill_length_mm,
106 | y = flipper_length_mm,
107 | color = species)) +
108 | geom_point(size = 1.5, alpha = .5) +
109 | labs(x = "bill length (mm)",
110 | y = "flipper length (mm)",
111 | title = "Penguin physiology by species")
112 | ```
113 |
114 | 
115 |
116 | 2.
117 |
118 |
119 | ``` r
120 | penguins %>%
121 | ggplot(aes(x = island,
122 | y = bill_length_mm)) +
123 | geom_boxplot() +
124 | coord_flip() +
125 | labs(y = "bill length (mm)",
126 | x = "island",
127 | title = "Penguin bill length by island")
128 | ```
129 |
130 | 
131 |
132 | 3.
133 |
134 |
135 | ``` r
136 | penguins %>%
137 | ggplot(aes(x = body_mass_g,
138 | fill = species)) +
139 | geom_density(alpha = .5) +
140 | facet_grid(rows = vars(island))
141 | ```
142 |
143 | 
144 |
145 | 4.
146 |
147 |
148 | ``` r
149 | penguins %>%
150 | ggplot(aes(x = species,
151 | fill = sex)) +
152 | geom_bar(position = "fill")
153 | ```
154 |
155 | 
156 |
157 | 5.
158 |
159 |
160 | ``` r
161 | library(lubridate)
162 | penguins %>%
163 | mutate(year = ymd(year, truncated = 2L)) %>%
164 | group_by(species, year) %>%
165 | summarize(avg_body_mass = mean(body_mass_g)) %>%
166 | ggplot(aes(x = year,
167 | y = avg_body_mass,
168 | color = species)) +
169 | geom_line() +
170 | geom_point() +
171 | lims(y = c(0, 6000)) +
172 | labs(x = "average body mass (g)") +
173 | scale_x_date(date_breaks = "1 year")
174 | ```
175 |
176 | 
177 |
--------------------------------------------------------------------------------
/homework/hw-10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-10.pdf
--------------------------------------------------------------------------------
/homework/hw-10_files/figure-gfm/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-10_files/figure-gfm/unnamed-chunk-10-1.png
--------------------------------------------------------------------------------
/homework/hw-10_files/figure-gfm/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-10_files/figure-gfm/unnamed-chunk-3-1.png
--------------------------------------------------------------------------------
/homework/hw-10_files/figure-gfm/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-10_files/figure-gfm/unnamed-chunk-4-1.png
--------------------------------------------------------------------------------
/homework/hw-10_files/figure-gfm/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-10_files/figure-gfm/unnamed-chunk-5-1.png
--------------------------------------------------------------------------------
/homework/hw-10_files/figure-gfm/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-10_files/figure-gfm/unnamed-chunk-6-1.png
--------------------------------------------------------------------------------
/homework/hw-10_files/figure-gfm/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-10_files/figure-gfm/unnamed-chunk-7-1.png
--------------------------------------------------------------------------------
/homework/hw-10_files/figure-gfm/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-10_files/figure-gfm/unnamed-chunk-8-1.png
--------------------------------------------------------------------------------
/homework/hw-10_files/figure-gfm/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-10_files/figure-gfm/unnamed-chunk-9-1.png
--------------------------------------------------------------------------------
/homework/hw-12.md:
--------------------------------------------------------------------------------
1 | Homework 12
2 | ================
3 |
4 | This homework will give you a peek into the expansive world of machine
5 | learning as implemented in the Python package `scikit-learn`. You won’t
6 | emerge an expert by any means, but it will give you some exposure to a
7 | framework that’s likely quite different from what you’re used to. In the
8 | classic statistical / scientific modeling framework, it is common to
9 | spend your effort thinking through how to craft your model based on
10 | your understanding of the phenomenon at hand; e.g. “I want to control
11 | for X, so I’ll include that in the model”, “I know incomes are right
12 | skewed, so I’ll take a log-transformation”.
13 |
14 | In the machine learning framework, these concerns fall away and are
15 | replaced by a strong focus on predictive accuracy. Once you’ve settled
16 | on a specific measure of accuracy, it becomes an optimization problem:
17 | finding the particular model specification that maximizes predictive
18 | accuracy. In this homework you’ll use the tool of simulation to
19 | investigate the process of conducting model selection via
20 | cross-validation. Along the way you will:
21 |
22 | - Generate data from a linear model,
23 | - Fit several polynomial regression models,
24 | - Create validation curves to select the optimal model, and
25 | - Consider the effect of different measures of predictive accuracy.
26 |
27 | ## Simulation
28 |
29 | ### Generating data from the true model
30 |
31 | The great advantage to studying models via simulation is that you’re
32 | able to define the true model a priori and generate as much data as you
33 | like. You can then fit a wide range of candidate models and see which
34 | one does the best job of recovering the true model. The true model that
35 | you’ll be working with is:
36 |
37 | \[
38 | y = -1 - 6x + 5x^2 + 5x^3 - 5x^4 + x^5 + \epsilon; \quad \epsilon \sim \textrm{N}(0, 1)
39 | \]
40 |
41 | You can break the process of simulating data from this model into four
42 | parts.
43 |
44 | #### I. Specify parameters
45 |
46 | This model has 7 parameters: the intercept, the coefficients in front of
47 | each \(x\), and the standard deviation of the error. Go ahead and code
48 | them in.
49 |
50 | ``` python
51 | b_0 = -1
52 | b_1 = -6
53 | b_2 = 5
54 | b_3 = 5
55 | b_4 = -5
56 | b_5 = 1
57 | sigma = 1
58 | ```
59 |
60 | #### II. Generate `x`
61 |
62 | A key part of any regression model is that it models the *conditional*
63 | distribution of the \(y\) given an \(x\). Therefore we can choose any
64 | distribution for the \(x\) that we like. We do need to set `n`, however,
65 | which in a way acts like another parameter.
66 |
67 | 1. Set the value of `n` to 30 and use `np.random.RandomState()` to
68 | create a random state object. Use that object’s appropriate method
69 | (call `dir()` on it or use tab-complete to see your options) to
70 | generate `n` random uniform numbers between 0 and 1. Rescale those
71 | numbers so that they are between 0 and 4, then subtract 1 from all
72 | of them.
73 |
74 | #### III. Calculate \(E(y)\)
75 |
76 | Now that you have your parameters specified and a set of `x` in hand,
77 | you can go ahead and calculate what your model says should be the
78 | expected value of \(y\) given each of those `x`. Another way to think of
79 | these are as the values of \(\hat{y}\) for each `x`.
80 |
81 | 2. Calculate and save the array `Ey` as the linear combination of each
82 | of your parameters with `x`, `x^2`, etc.
83 |
84 | 3. Construct a scatterplot of `x` and `Ey`. You may run into an error
85 | that can be solved by calling the `.ravel()` method on `x` at the
86 | outset. What was the error caused by and how did `ravel()` fix it?
87 |
88 | #### IV. Simulate `y`
89 |
90 | Everything up until this point is just laying the groundwork. The actual
91 | *random* part of the simulation is the generation of the error terms
92 | that make up your “observed” \(y\) values.
93 |
94 | 4. Create a new array `y` as `Ey` plus an array of random normal
95 | variables with mean 0 and standard deviation `sigma`.
96 |
97 | 5. Remake the scatterplot from question 3 but this time use `y` instead
98 | of `Ey`.
99 |
100 | ## Fitting Models
101 |
102 | With your data set of `x` and `y` in hand, you’re now at the point that
103 | a data analysis normally begins: deciding which model to fit to a data
104 | set. Even though we know the correct model form (a quintic polynomial),
105 | normally we have no idea what the model form is. Start off by trying two
106 | models: a linear model and a 6th degree polynomial model. From here on
107 | out you’ll be using `sklearn`, the interface to which can be found in
108 | the lecture notes.
109 |
110 | 6. Use the `LinearRegression` function within the `linear_model` module
111 | of `sklearn` to fit a linear model to your data.
112 |
113 | 7. Using the `PolynomialFeatures` function in the `preprocessing`
114 | module of `sklearn`, create a two dimensional array of polynomial terms of `x` up to degree 6, then fit a linear model to it.
115 |
116 | 8. Construct a scatterplot of your original data that has each of these
117 | models overlaid. Note that this requires creating a grid of x values
118 | that you then call each model’s `.predict()` method on to get your
119 | predicted y-values. See the lecture notes and the [textbook
120 | ch. 5](https://jakevdp.github.io/PythonDataScienceHandbook/05.03-hyperparameters-and-model-validation.html)
121 | for reference.
122 |
123 | 9. Using the `metrics` module, calculate the *training* MSE on each of
124 | these models. Which model has better accuracy predicting back into
125 | the training set?
126 |
127 | 10. Using the `cross_validate()` function in the `model_selection`
128 | module, calculate the (negative) *testing* MSE for each model as
129 | estimated through 5-fold cross validation. Take the mean test MSE
130 | across all 5 folds to get a single estimate. Which model has the
131 | smaller testing error (higher negative MSE)?
132 |
133 | ## Validation curves
134 |
135 | The `cross_validate()` function is an efficient way to implement a
136 | procedure that would otherwise require an awful lot of error-prone
137 | typing to code up. It has allowed you to evaluate two potential models
138 | for your simulated data, and it should be clear that the 6th degree
139 | polynomial is preferred over the linear model.
140 |
141 | To be thorough, though, we’d want to consider a wide range of models,
142 | say every polynomial between degree 1 and 20 and track the training and
143 | testing scores of each model. These results can be used to construct a
144 | *validation curve*, which can then be consulted to select the model with
145 | the highest testing score.
146 |
147 | 11. Adapt the code found in the section “Validation curves in
148 | Scikit-Learn” in Ch. 5 of the textbook to construct a validation
149 | curve for polynomial models of degree 1 through 20 using 7-fold
150 | cross-validation. Some notes on using this function:
151 |
152 |
153 |
154 | - Depending on the version of `sklearn` that you’re using, the
155 | `validation_curve` function may be in the `model_selection` module.
156 | - It is safer to name each of the arguments that you pass to the
157 | function rather than relying upon their position as the book does.
158 | - You will need to make a judgment call on what appropriate y axis
159 | limits are to reveal the most important structure in the curves.
160 | - You can specify that you’re interested in the (negative) MSE by
161 | passing it as a string to the `scoring` argument. See the scoring
162 | documentation to see what other options are available
163 | (<https://scikit-learn.org/stable/modules/model_evaluation.html>).
164 |
165 |
166 |
167 | 12. Which model does the validation curve suggest is the best one?
168 |
169 | 13. Why is the training score a strictly increasing function of the
170 | degree?
171 |
172 | 14. Change your scoring metric to the mean *absolute* error. Does that
173 | change which model is selected? If so, why do you think this
174 | particular metric would favor that particular model?
175 |
--------------------------------------------------------------------------------
/homework/hw-12.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-12.pdf
--------------------------------------------------------------------------------
/homework/hw-13.md:
--------------------------------------------------------------------------------
1 | Homework 13
2 | ================
3 |
4 | ### Building a car factory in Python
5 |
6 | Pick up where we left off in lecture with the definition of the `car`
7 | class, with five attributes and three methods.
8 |
9 | ``` python
10 | class car:
11 | """ Create a new car """
12 | def __init__(self, make = "honda", model = "civic",
13 | year = 2007, mpg = 30, gas = 12):
14 | """ Create a new car with attributes """
15 | self.make = make
16 | self.model = model
17 | self.year = year
18 | self.mpg = mpg
19 | self.gas = gas
20 |
21 | def drive(self, distance):
22 | """ Drive a distance (in miles) and deplete the gas """
23 | self.gas = self.gas - distance / self.mpg
24 |
25 | def age(self, years):
26 | """ Age a car by a number of years and decrease mpg """
27 | self.mpg += -years / 4
28 |
29 | def greet(self, name):
30 | """ Greet the user with car characteristics """
31 | print("Hello, I am a " + self.make.capitalize() +
32 | " " + self.model.capitalize() + " and my name is " +
33 | name + ".")
34 | ```
35 |
36 | 1. The `.age()` function is currently problematic: it’s possible to age
37 | your car into negative mpg. It also doesn’t keep track of the age
38 | explicitly. Modify the `.__init__` method to add an age attribute
39 | and modify the `.age()` method so that mpg is not a strictly
40 | decreasing linear function of years aged.
41 |
42 | 2. Modify the `.drive()` method to respond more usefully if the user
43 | specifies the car to drive farther than is possible given the gas in
44 | the tank. If this happens, prevent the gas from going negative and
45 | print a message when the tank is empty. Be sure your method has a
46 | docstring.
47 |
48 | 3. Add a `.fill()` method that can be used to fill the tank back up
49 | with gas. To make this realistic add an attribute for the size of
50 | the gas tank and add a printed message to `.fill()` for when the tank is
51 | full. Be sure your method has a docstring.
52 |
53 | 4. Right now, the notion of driving only involves the spending of gas.
54 | More importantly, though, it changes the location of the car. Add an
55 | attribute that stores the distance of the car from Cal. Modify the
56 | `.drive()` method to take an additional argument `to_cal` that takes
57 | boolean values and will affect the distance from Cal attribute. Add
58 | a message that prints whenever the car has returned to Cal.
59 |
60 | ### Building a car factory in R
61 |
62 | R’s S3 approach to OOP is a bit different. The new `sloop` package has
63 | some useful tools for understanding how it works.
64 |
65 | 5. Use `s3_methods_generic()` to print out all of the methods that stem
66 | from the generic function `summary()`. Which ones have you used
67 | before?
68 |
69 | 6. Use `s3_methods_class()` to print out all of the methods that have been
70 | written for the `lm()` class. Which ones have you used before?
71 |
72 | In lecture we created an S3 R class called “car” by writing a
73 | *constructor* function that sets the attributes of an instance of that
74 | object. Instead of including the methods in the definition of the class,
75 | they’re written separately, and are associated with a *generic* version
76 | of the function. Here, we replaced `.greet()` with `summary.car()`.
77 |
78 | ``` r
79 | new_car <- function(make = "honda", model = "civic",
80 | year = 2007, mpg = 30, gas = 12) {
81 | out <- list(make = make,
82 | model = model,
83 | year = year,
84 | mpg = mpg,
85 | gas = gas)
86 | structure(out, class = c("car", "list"))
87 | }
88 |
89 | summary.car <- function(obj, ...) {
90 | cat(paste0("Hello, I am a ", obj$make, " ", obj$model, "."))
91 | }
92 | ```
93 |
94 | 7. Run the above code then call the same two `sloop` functions to check
95 | which methods the `car` class has and to check if `summary.car()` shows
96 | up as one of the methods for the `summary()` generic.
97 |
98 | 8. Implement a `age.car()`, `drive.car()`, and `fill.car()` method with
99 | the same functionality as in Python, adding attributes to your
100 | constructor as necessary. Writing each of these methods actually
101 | requires writing two functions: a generic function, e.g. `drive()`,
102 | as well as the class-specific methods. These generics are very
103 | simple pass-through functions that serve to be sure the correct
104 | method is called. See <https://adv-r.hadley.nz/s3.html>
105 | to learn their structure. Note that a method must have the same
106 | arguments as its generic. There is one exception to this rule: if
107 | the generic has `...`, the method can contain a superset of the
108 | arguments. This allows methods to take arbitrary additional
109 | arguments (see:
110 | <https://adv-r.hadley.nz/s3.html>).
111 |
112 | 9. Use `sloop` to verify the methods that are available for an object
113 | of class `car`.
114 |
115 | 10. This exercise is to be done twice: first in R then in Python.
116 | Construct a car that gets 20 mpg and starts 210 miles from Cal with
117 | 10 gallons in its gas tank. Write lines of code that describe each
118 | of the following:
119 |
120 | - The car drives straight back to Cal.
121 | - The car drives for awhile back to Cal, stops for gas, then
122 | continues back to Cal.
123 |
124 | Each line of Python code should be a single line using dot notation
125 | to pass methods. Each line of R code should concatenate the
126 | methods using the pipe operator `%>%`.
127 |
128 | 11. An alternative to writing R code as methods for an S3 object is to
129 | just write independent functions `age()`, `drive()` and `fill()`.
130 | What are the advantages and disadvantages of the object-oriented
131 | approach?
132 |
--------------------------------------------------------------------------------
/homework/hw-13.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/homework/hw-13.pdf
--------------------------------------------------------------------------------
/labs/figs/david-mansion-LA.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/figs/david-mansion-LA.jpg
--------------------------------------------------------------------------------
/labs/figs/oak-pdx-flight.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/figs/oak-pdx-flight.png
--------------------------------------------------------------------------------
/labs/figs/or-plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/figs/or-plot-1.png
--------------------------------------------------------------------------------
/labs/figs/or-plot-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/figs/or-plot-2.png
--------------------------------------------------------------------------------
/labs/figs/or-plot-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/figs/or-plot-3.png
--------------------------------------------------------------------------------
/labs/figs/oregon-votes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/figs/oregon-votes.png
--------------------------------------------------------------------------------
/labs/figs/penguin-ice-cubes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/figs/penguin-ice-cubes.png
--------------------------------------------------------------------------------
/labs/figs/r4ds-ds-cycle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/figs/r4ds-ds-cycle.png
--------------------------------------------------------------------------------
/labs/figs/square-grid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/figs/square-grid.png
--------------------------------------------------------------------------------
/labs/figs/tiny-airplane.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/figs/tiny-airplane.png
--------------------------------------------------------------------------------
/labs/figs/trump-biden-votes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/figs/trump-biden-votes.png
--------------------------------------------------------------------------------
/labs/figs/turtle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/figs/turtle.png
--------------------------------------------------------------------------------
/labs/figs/tux-penguin.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/figs/tux-penguin.jpg
--------------------------------------------------------------------------------
/labs/lab-01.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Lab 1: Your Data in R"
3 | output: github_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE, echo = FALSE)
8 | ```
9 |
10 | One of the best ways to learn a new computer language is to use it for a project that *matters to you*. Here at the beginning of the semester, you'll select a data analysis that you've done in the past and set it as a goal for something that you will be able to re-implement in Python by the end of the semester. The main guideline is that your analysis touches on as many of the steps of the data science cycle as possible.
11 |
12 | ```{r fig.align='center'}
13 | knitr::include_graphics("figs/r4ds-ds-cycle.png")
14 | ```
15 |
16 | Some further guidelines:
17 |
18 | 1. This analysis could come from a course project, a particularly challenging lab, a research project, or a fun hobby project that you've worked on. If you don't have any candidates from your own work that seem suited to the task, you can use an analysis that you've found elsewhere; Rpubs is a good place to look. Be sure to cite any work that is not your own.
19 | 2. The source file can either be an R Markdown document or a well-commented R script. The file that you will be submitting to gradescope will need to be a pdf.
20 | 3. In terms of length, 1 - 3 pages of R code (on a pdf) and potentially 1 - 5 pages of output would be appropriate, but your mileage may vary. It's possible you were able to do a rich data analysis in only a half page of code.
21 | 4. Before you compile to pdf, add very visible section labels (through comments or markdown) to demarcate which step of the data science cycle that span of code represents.
22 | 5. In terms of sophistication of the data analysis, you should be shooting for moderate. It could be fine to use an analysis that fits a very sophisticated statistical model, but know that that may be a component that you may need to leave in R (and pass objects to Python using reticulate).
23 |
24 | This data analysis will be due as a pdf to gradescope by Sunday at 8 pm.
25 |
--------------------------------------------------------------------------------
/labs/lab-01.md:
--------------------------------------------------------------------------------
1 | Lab 1: Your Data in R
2 | ================
3 |
4 | One of the best ways to learn a new computer language is to use it for a
5 | project that *matters to you*. Here at the beginning of the semester,
6 | you’ll select a data analysis that you’ve done in the past and set it as
7 | a goal for something that you will be able to re-implement in Python by
8 | the end of the semester. The main guideline is that your analysis
9 | touches on as many of the steps of the data science cycle as possible.
10 |
11 |
12 |
13 | Some further guidelines:
14 |
15 | 1. This analysis could come from a course project, a particularly
16 | challenging lab, a research project, or a fun hobby project that
17 | you’ve worked on. If you don’t have any candidates from your own
18 | work that seem suited to the task, you can use an analysis that
19 | you’ve found elsewhere; Rpubs is a good place to look. Be sure to
20 | cite any work that is not your own.
21 | 2. The source file can either be an R Markdown document or a
22 | well-commented R script. The file that you will be submitting to
23 | gradescope will need to be a pdf.
24 | 3. In terms of length, 1 - 3 pages of R code (on a pdf) and potentially
25 | 1 - 5 pages of output would be appropriate, but your mileage may
26 | vary. It’s possible you were able to do a rich data analysis in only
27 | a half page of code.
28 | 4. Before you compile to pdf, add very visible section labels (through
29 | comments or markdown) to demarcate which step of the data science
30 | cycle that span of code represents.
31 | 5. In terms of sophistication of the data analysis, you should be
32 | shooting for moderate. It could be fine to use an analysis that fits
33 | a very sophisticated statistical model, but know that that may be a
34 | component that you may need to leave in R (and pass objects to
35 | Python using reticulate).
36 |
37 | This data analysis will be due as a pdf to gradescope by Sunday at 8 pm.
38 |
--------------------------------------------------------------------------------
/labs/lab-01.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-01.pdf
--------------------------------------------------------------------------------
/labs/lab-02.md:
--------------------------------------------------------------------------------
1 | Lab 2: Practice with Penguins
2 | ================
3 |
4 |
5 |
6 | The `penguins` data set in the `{palmerpenguins}` R package will provide
7 | you your first data-centric tour between R and Python.
8 |
9 | 1. Load the data in R, bring up the helpfile (with `?`) and query
10 | its structure (`str`). What is the data structure? What is the
11 | observational unit in this data set? How many observations are there
12 | and how many variables measured on each?
13 |
14 | 2. Use `{ggplot2}` to make two scatter plots of bill length as a
15 | function of bill depth. For the first, map color to species. For the
16 | second, map color to island. Which covariate more cleanly separates
17 | the penguins based on bill size?
18 |
19 | 3. Bring the data into a Python environment using `{reticulate}` and
20 | access the `r` object and assign it the name `pypenguins`. What is
21 | the `type()` of that data set?
22 |
23 | 4. You can find the length of an object with `len()`. Try calling that
24 | command on `pypenguins`. What is it counting? Try calling the
25 | analogous function `length()` on `penguins` in R. Why does it return
26 | that value? Isn’t `penguins` a rectangular data frame?
27 |
28 | 5. You can access the keys in `pypenguins` by appending the object with
29 | `.keys()` and the values by appending with `.values()`. If you print
30 | them to the console, you’ll see the resulting objects are structured
31 | like lists. Use the keyword `in` to check if the keys
32 | `bill_length_mm`, `bill_depth_mm` and `species` are in `pypenguins`.
33 |
34 | 6. You can remove elements of a dictionary by using the `del` keyword
35 | followed by the element in the dictionary. Use this approach to
36 | remove the `year` element, then check that it’s been removed by
37 | printing out the keys again. This illustrates the mutable nature of
38 | dictionaries: you can change them without reassigning them.
39 |
40 | 7. *List comprehension* is a concise way to filter out particular
41 | elements of a list to create a new list. It is analogous to the
42 | following logical subsetting of a vector in base R, done to retain
43 | only the names of variables that are longer than 8 characters.
44 |
45 | ``` r
46 | name_vec <- names(penguins)
47 | name_vec[nchar(name_vec) > 8]
48 | ```
49 |
50 | ## [1] "bill_length_mm" "bill_depth_mm" "flipper_length_mm"
51 | ## [4] "body_mass_g"
52 |
53 | In Python, it takes the form:
54 |
55 | ``` python
56 | l = list(pypenguins.keys())
57 | [x for x in l if len(x) > 8]
58 | ```
59 |
60 | ## ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']
61 |
62 | Which can be read as: form a new list (`[]`) of all elements `x`
63 | from `l` that meet the condition that `len(x) > 8`. We can use the
64 | fact that since those objects passing through `x` will all be
65 | strings, we can also apply the `.upper()` method on the way out:
66 |
67 | ``` python
68 | [x.upper() for x in l if len(x) > 8]
69 | ```
70 |
71 | ## ['BILL_LENGTH_MM', 'BILL_DEPTH_MM', 'FLIPPER_LENGTH_MM', 'BODY_MASS_G']
72 |
73 | Using the fact that each element in `pypenguins` is a list, use list
74 | comprehension to create a new list called `short_bills` that
75 | contains the bill lengths of all penguins with lengths less than 40
76 | mm. Bring that list back into R and use ggplot2 to create a
77 | histogram of its distribution.
78 |
--------------------------------------------------------------------------------
/labs/lab-02.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-02.pdf
--------------------------------------------------------------------------------
/labs/lab-02_files/figure-gfm/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-02_files/figure-gfm/unnamed-chunk-5-1.png
--------------------------------------------------------------------------------
/labs/lab-02_files/figure-gfm/unnamed-chunk-5-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-02_files/figure-gfm/unnamed-chunk-5-2.png
--------------------------------------------------------------------------------
/labs/lab-03.md:
--------------------------------------------------------------------------------
1 | Lab 3: Practice with Turtles
2 | ================
3 |
4 |
5 |
6 | To get more practice with functions and methods, you’ll be making some
7 | very rudimentary drawings using a package called `turtle`. A note about
8 | this lab: it uses an external graphics window and may be a bit fussy to
9 | run through RStudio and `reticulate`. I recommend that you also open a
10 | terminal window to run the code at the command line in python directly.
11 | This will have smoother performance while you’re tinkering around, then
12 | copy your final work into your Rmd. Be sure to review your pdf to be
13 | sure it looks clean before submitting it.
14 |
15 | If you run into questions or problems, please post to Piazza.
16 |
17 | -----
18 |
19 | Start by importing the `turtle` package with the `import` keyword.
20 |
21 | ``` python
22 | import turtle
23 | t = turtle.Turtle()
24 | ```
25 |
26 | This should open a graphics window for you with a small black arrow in
27 | the center. That is your turtle, and you’re able to move it around the
28 | canvas to draw pictures using its methods.
29 |
30 | Before you start drawing with your turtle, consider that last command
31 | that you ran; the dot syntax here is illuminating. It suggests that
32 | `turtle` is an object of a particular class and that `Turtle` is one of
33 | its methods.
34 |
35 | 1. What class is the object `turtle`? You can find out either with the
36 | `type()` function or the `__class__` attribute of that object.
37 |
38 | 2. Name three methods besides `.Turtle()` that are available to
39 | `turtle`.
40 |
41 | This returns us to the notion that “Everything is an object”, including
42 | the package / module `turtle` that we just imported, and to the practice
43 | of associating objects with specific methods.
44 |
45 | You can think of `turtle.Turtle()` acting like `turtle::Turtle()` in R -
46 | it’s calling the `Turtle()` function in the `turtle` package - but the
47 | Python does have an extra layer of coherence because it’s within the
48 | same object-oriented programming framework. And although the help file
49 | for the `.Turtle()` method doesn’t say it very clearly, the method
50 | serves to create a new blank object of class `turtle.Turtle`, your
51 | drawing turtle.
52 |
53 | Let’s take the turtle for a test drive. Run the following code.
54 |
55 | ``` python
56 | t.right(90)
57 | t.forward(100)
58 | t.goto(100, 100)
59 | t.goto(0, 0)
60 | ```
61 |
62 | 3. Based on the performance of the turtle after running each of these
63 | commands and after looking at their help files, describe what each
64 | of the three different methods does.
65 |
66 | You’ll notice that you can either move the turtle relatively (e.g. turn
67 | left, walk forward) or in terms of the absolute cartesian coordinates.
68 |
69 | 4. Write the commands that will make the turtle draw a second triangle
70 | that’s a reflection of the first about the line x = 0. This should
71 | create a large arrow of a similar shape to the turtle, facing down.
72 |
73 | 5. This would make a much more convincing large turtle shape if it were
74 | filled in with the color green. Look through the available turtle
75 | methods for ones with helpful looking names and bring up their help
76 | files, then compose several of them together to fill in the empty
77 | arrow with the color green.
78 |
79 | For your final pdf that you submit, you’ll want to be sure to take a
80 | screenshot of your graphics window and include it. Run the following
81 | chunk once to take the screenshot and convert it to a png. After that,
82 | you can set `echo` and `eval` to `FALSE` in your chunk options.
83 |
84 | ``` python
85 | # Set this chunk to eval = FALSE after having run it once.
86 | ts = turtle.getscreen()
87 | ts.getcanvas().postscript(file = "green-turtle.eps")
88 |
89 | from PIL import Image
90 | img = Image.open("green-turtle.eps")
91 | img.save("green-turtle.png", "png")
92 | ```
93 |
94 | Then, to include that png in your Rmd, you can include:
95 |
96 | ``` r
97 | knitr::include_graphics("green-turtle.png")
98 | ```
99 |
100 | We’ll be moving on to a new shape, so go ahead and clear your canvas
101 | using `t.clear()`.
102 |
103 | 6. Write a function that draws a golden equilateral triangle, taking as
104 | input the length of a side.
105 |
106 | 7. Use that function to draw the
107 | [Triforce](https://en.wikipedia.org/wiki/Triforce). Go through the
108 | same process as above to take a screenshot, save it as a png, and
109 | include it in your final pdf.
110 |
--------------------------------------------------------------------------------
/labs/lab-03.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-03.pdf
--------------------------------------------------------------------------------
/labs/lab-04.md:
--------------------------------------------------------------------------------
1 | Lab 4: More practice with Turtles
2 | ================
3 |
4 |
5 |
6 | Now that we have most of the fundamentals of the Python language in our
7 | toolbox, we’re able to stitch together a more complex program. There are
8 | only two questions for this lab, but each requires more careful coding
9 | than previous labs. I recommend starting on paper to sketch out in
10 | pseudo-code how you’ll solve the problem, then turn to thinking how
11 | you’ll implement it in Python.
12 |
13 | As always, if you have questions, please ask over Piazza.
14 |
15 | 1. Each morning after her cup of coffee, the turtle goes for a walk
16 | through her neighborhood, which is defined by a regular square
17 | street grid where each block is of length `s`. Write a program using
18 | `turtle` to draw one of her walks according to the following
19 | guidelines
20 |
21 | - She lives right on the corner (at the vertex in our regular
22 | street grid), and selects which direction she will head
23 | uniformly at random.
24 | - Every time she approaches an intersection, she chooses randomly
25 | between turning left, turning right, and proceeding straight
26 | ahead, each with probability 1/3.
27 | - Two useful tools for simulating the random process are the
28 | `random()` function imported using `from random import random`
29 | and the `randint()` function imported using `from random import
30 | randint`.
31 | - She always walks 25 block faces, each of length `s`.
32 |
33 | Provide in your lab report the code necessary to draw a walk and
34 | include the images of two random walks.
35 |
36 | 2. Write a function to draw a generalized *Poly*force that takes as an
37 | argument `n`, the number of golden triangles to include (note that
38 | the [Triforce](https://en.wikipedia.org/wiki/Triforce) sets this
39 | argument to 3). Some guidelines:
40 |
41 | - Reuse the function that you wrote last lab to draw a single
42 | triangle.
43 | - Construct the Polyforce either using `for` or `while` loops.
44 | - Your function should return only complete Polyforces; i.e. the
45 | final shape should be a full equilateral triangle. Do this by
46 | raising an exception depending on the value `n` that the user
47 | provides (see p. 49 in *Whirlwind*).
48 | - Use a similar approach to prevent your user from supplying an
49 | `n` so large that it would crash the program.
50 | - Your Polyforce should be oriented upright, with the peak of the
51 | triangle facing straight up.
52 |
53 | Please include in your lab report your code as well as an image of a
54 | Polyforce with `n = 10`.
55 |
--------------------------------------------------------------------------------
/labs/lab-04.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-04.pdf
--------------------------------------------------------------------------------
/labs/lab-05.md:
--------------------------------------------------------------------------------
1 | Lab 5: More practice with Penguins
2 | ================
3 |
4 |
5 |
6 | Consider a plot that’s a riff on the one you made in Lab 2. Here we plot
7 | penguin bill length vs bill depth, separated out by year.
8 |
9 | ``` r
10 | library(palmerpenguins)
11 | library(ggplot2)
12 | ggplot(penguins, aes(x = bill_depth_mm,
13 | y = bill_length_mm,
14 | color = factor(year))) +
15 | geom_point()
16 | ```
17 |
18 |
19 |
20 | Previously, we used the strained functionality of a dictionary of lists
21 | to subset this data and replot it. In this lab, we’ll be working with
22 | the data as a numpy array, which will give us a bit more power.
23 |
24 | 1. Load the `penguins` data from `{palmerpenguins}` in R and run it
25 | through `tidyr::drop_na()` to remove rows with missing values. Next,
26 | bring it into Python as `pypenguins`. Just to remind ourselves: what
27 | is the type of `pypenguins`? What is the type of the values of each
28 | of its elements (this you can just assess visually)?
29 |
30 | 2. Our goal is to change the type of `pypenguins` to be a numpy array,
31 | but we have a problem: currently it is type *heterogeneous*. Let’s
32 | solve that by working only with the numerical data that it contains
33 | and drop the strings. Our tool for this will be a dictionary
34 | comprehension.
35 |
36 | A dictionary comprehension is very similar to a list comprehension -
37 | it’s essentially short-hand for a for loop - but it returns a
38 | dictionary instead. Here is an example of such a construction:
39 |
40 | ``` python
41 | d = {'a': 1, 'b': 2, 'c': 3, 'd': 4}
42 | {k:v*2 for (k,v) in d.items() if k == 'd'}
43 | ```
44 |
45 | ## {'d': 8}
46 |
47 | Recall that you can access three forms of iterable elements in a
48 | dictionary `d` using `d.keys()`, `d.values()`, `d.items()`. 1. Using
49 | a dictionary comprehension, create a new dictionary from
50 | `pypenguins` that contains only numerical data. 1. Extract the
51 | values from this dictionary and put them into a numpy array called
52 | `pg_array`. You may need to fiddle with it to get it just right, but
53 | aim for a 344 x 5 array of floats (the number of rows may be less
54 | depending on how you removed NAs).
55 |
56 | 3. Split the array into three separate arrays, each one containing data
57 | from a different year. This combines two steps: making a *boolean
58 | mask* (the familiar structure of, say, `x[x<7]`) and then slice
59 | indexing.
60 |
61 | 4. Compute the mean beak and bill length for each of the three years of
62 | data. Has there been any change over time?
63 |
64 | 5. In which year was the maximum bill length observed? What about the
65 | minimum bill length? Check your answer by looking at the plot
66 | above.
67 |
68 | 6. Bring the 2007 data back into R and make a scatterplot of bill
69 | length vs bill depth. Though this procedure we went through seems
70 | trivial, note that it actually wasn’t possible with the dictionary
71 | techniques we used in Lab 2. To subset the rows of `bill_length` and
72 | `bill_depth` based on values of `year` requires a link between the
73 | `i` and `j` indices of a matrix, which is something made possible by
74 | the numpy array.
75 |
--------------------------------------------------------------------------------
/labs/lab-05.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-05.pdf
--------------------------------------------------------------------------------
/labs/lab-05_files/figure-gfm/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-05_files/figure-gfm/unnamed-chunk-1-1.png
--------------------------------------------------------------------------------
/labs/lab-05_files/figure-gfm/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-05_files/figure-gfm/unnamed-chunk-2-1.png
--------------------------------------------------------------------------------
/labs/lab-05_files/figure-gfm/unnamed-chunk-2-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-05_files/figure-gfm/unnamed-chunk-2-2.png
--------------------------------------------------------------------------------
/labs/lab-06.md:
--------------------------------------------------------------------------------
1 | Lab 6: Wrangling Takeoff
2 | ================
3 |
4 |
5 |
6 | The `boxofdata` package contains a data set called `flights` that
7 | contains *all* of the flights that left from San Francisco International
8 | Airport and Oakland International Airport between July 1st and December
9 | 31st 2019. This will serve as a rich data set to practice your data
10 | wrangling skills with `pandas`. This lab focuses on four fundamental
11 | tasks in wrangling:
12 |
13 | - Getting the lay of the land
14 | - Accessing rows and columns
15 | - Boolean masks (logical subsetting)
16 | - Calculating aggregate statistics
17 |
18 | Start this lab in R, where you will load the `flights` data frame from
19 | the package. Note that this is a *large* data set, so if you’re working
20 | on a machine with limited memory, you may want to opt for working
21 | through this lab with `small_flights` instead of `flights`.
22 |
23 | ``` r
24 | # install.packages("remotes")
25 | remotes::install_github("andrewpbray/boxofdata")
26 | library(boxofdata)
27 | library(reticulate)
28 | library(dplyr)
29 | data(flights)
30 | small_flights <- flights %>%
31 | sample_frac(.10)
32 | ```
33 |
34 | ## Getting the lay of the land
35 |
36 | A convenient part of working with a data set stored in an R package is
37 | that it will often have an associated help file. Read through the help
38 | file for `flights` to acquaint yourself with the variables it contains.
39 | You may want to consult with this throughout the lab.
40 |
41 | 1. Bring `flights` into python, coerce it to a pandas dataframe, and
42 | call `dir()` on it to print out a lengthy list of methods. Select
43 | two methods whose behavior you can guess based on their name, and
44 | describe that guess. Check their help file to see if your guesses
45 | are correct. Repeat this with two other methods that have, by
46 | contrast, very puzzling names.
47 |
48 | 2. Often the first stop when looking at a new data set is getting a
49 | birds eye view of the structure. Test out `.head()`, `.tail()`,
50 | `.describe()`, `.columns`, and `.dtypes`. Which R functions most
51 | closely mirror this functionality?
52 |
53 | 3. While this behavior is quite similar between the two languages, the
54 | dot syntax has some behaviors that will be unexpected to an R user.
55 | Try `.year` or `.dest`. What would you use to get similar behavior
56 | in R?
57 |
58 | ## Accessing rows and columns
59 |
60 | 4. Pandas has many different methods by which you can access rows and
61 | columns, but probably the most common is to use the `[]` operator.
62 | Print out the values of the `dest` variable by passing it as a
63 | string inside square brackets. To keep things clean, only actually
64 | print out the first 4 rows by appending your commands with
65 | `.head(4)`.
66 |
67 | 5. Repeat the exercise but print out the first four values of both
68 | `dest` and `origin` by passing them as a list into the square brackets. Try
69 | also passing only `dest` as a list. In general, what happens when
70 | you subset the columns with a string versus with a list?
71 |
72 | 6. The `[]` operator can also be used to access rows of the data frame
73 | if you pass it a slice instead of a string or list of strings. This
74 | accesses the implicit indices (the integer order) of the rows of the
75 | dataframe. Use this approach to print out the second and third rows
76 | of the data.
77 |
78 | 7. In addition to applying `[]` directly to a dataframe, you can also
79 | apply it to the explicit (named) or implicit (integer ordered)
80 | indices of the dataframe. Print out the `dest` and `origin` of the
81 | second and third rows using two methods: a) first subset the columns
82 | using `[]`, then subset that object using `[]` b) subset both
83 | simultaneously by applying a single `[]` to the explicit indices in
84 | `flights.loc`.
85 |
86 | ## Boolean masks
87 |
88 | 8. Create a boolean mask object called `is_oak` that is `True` when the
89 | flight came from Oakland International Airport and `False` when the
90 | flight came from San Francisco International Airport. What is the
91 | type of that object?
92 |
93 | 9. Boolean masks can be strung together if you wrap them in `()` and
94 | combine them using `&` and `|`. If you flew on a flight that
95 | departed from either SFO or OAK in the time window covered by this
96 | data, find that flight’s record. Did you land at your destination on
97 | time? Bonus points (not really) for including a photograph of the
98 | actual airplane that made this flight. Bonus bonus points (still not
99 | really) for including first-hand documentation that you were indeed
100 | early, on time, or late.
101 |
102 | Some advice: if you can’t find your flight on first pass, try
103 | searching using different criteria. I have yet to take a
104 | flight that doesn’t show up in the database that this data set pulls
105 | from. Often data sets have a bit of messiness in them\!
106 |
107 | If you didn’t fly within this window, find the record corresponding
108 | to a flight that I was on.
109 |
110 |
111 |
112 | 10. Create a new data frame that includes flights headed from the Bay
113 | Area to New York in November and save this data frame as
114 | `nyc_nov_flights` (note that the city has two major airports: JFK
115 | and LaGuardia). Peek at the resulting columns to check if your
116 | subsetting operation worked correctly. How many flights meet these
117 | criteria?
118 |
119 | ## Calculating aggregate statistics
120 |
121 | 11. Calculate the mean and median departure delay for flights heading to
122 | NYC in November. Does a positive number imply that the flight left
123 | early or late? What can you infer by the relative magnitude of the
124 | mean and median?
125 |
126 | 12. Create a second data set called `nyc_jul_flights` that is similar,
127 | but contains data from July. Calculate the mean and median departure
128 | delay. Use any knowledge you have about air travel and weather to
129 | speculate why you see this difference between July and November.
130 |
131 | 13. Bring these two small dataframes back into R and use them to
132 | visualize the difference in the distribution of departure delays to
133 | NYC in July and November (`dplyr::bind_rows` is an efficient way to
134 | combine two dataframes into one).
135 |
--------------------------------------------------------------------------------
/labs/lab-06.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-06.pdf
--------------------------------------------------------------------------------
/labs/lab-07.md:
--------------------------------------------------------------------------------
1 | Lab 7: Wrangling at Cruising Altitude
2 | ================
3 |
4 |
5 |
6 | In this lab, you’ll continue your work with the flights data and apply a
7 | new set of operations that greatly empower your ability to deftly
8 | wrangle a data set with remarkably concise syntax.
9 |
10 | - Adding columns
11 | - Sorting a data frame
12 | - Finding unique values
13 | - Grouped Operations
14 |
15 | Begin, as before, by loading in the data from the `boxofdata` package
16 | and then into Python.
17 |
18 | ``` r
19 | # install.packages("remotes")
20 | # remotes::install_github("andrewpbray/boxofdata")
21 | library(boxofdata)
22 | library(reticulate)
23 | library(dplyr)
24 | data(flights)
25 | small_flights <- flights %>%
26 | sample_frac(.10)
27 | ```
28 |
29 | ``` python
30 | import pandas as pd
31 | flights = r.flights
32 | ```
33 |
34 | These wrangling puzzles are challenging. I recommend starting with
35 | pencil and paper, laying out each of the steps, and keeping handy the
36 | help file that describes each of the variables.
37 |
38 | 1. Start by revisiting the questions from the end of the previous lab.
39 | Use `.isin`, `.groupby`, and `agg()` to find the mean and median
40 | departure delay from the Bay Area to the NYC area in July and
41 | November.
42 |
43 | 2. What proportion of the flights in the full data set are
44 | [red-eyes](https://en.wikipedia.org/wiki/Red-eye_flight)? As part of
45 | answering this question, add a new boolean column to the data frame
46 | indicating whether each flight is a red-eye.
47 |
48 | 3. On average, are red-eye flights from SFO to JFK more or less likely
49 | to arrive (on time or early) or (late)?
50 |
51 | 4. On showing this result to a friend, they reply:
52 |
53 | > Yes, but that’s not really due to the flights being red-eyes. As the day
54 | > goes on, flights at an airport get more and more backed-up. Flights
55 | > that leave in the evening, like red-eyes, are just more likely to be
56 | > delayed.
57 |
58 | Evaluate this explanation using this data set in two ways: looking at
59 | (the proportion of flights that are delayed) and (the average delay) at
60 | different points in the day. How do you interpret these results and what
61 | does it suggest about red-eyes and delays?
62 |
63 | ## Various and sundry
64 |
65 | The remaining questions provide a sense of the diverse questions that
66 | can be answered with this data set. Some guidelines and hints:
67 |
68 | - For each question, describe any wrinkles/surprises that came up and
69 | how you addressed them. Relatedly…
70 | - Some of these questions are not fully answerable given the data in
71 | `flights`. If that’s the case, be clear about the special case of
72 | the question that you’re focusing on or the assumptions that you’re
73 | using (do not bring in other data).
74 | - It can be easiest to work these problems backwards: what does the
75 | data frame look like that will answer this question? What is the
76 | unit of observation of that data frame? Usually the answer to that
77 | question is a hint about any variable you may need to group by.
78 | - If in your work you get a `SettingWithCopyWarning`, read through the
79 | explanation of that warning
80 | [here](https://www.dataquest.io/blog/settingwithcopywarning/).
81 |
82 |
83 |
84 | 5. Return to the single flight that you studied in the previous lab.
85 | Compared to other flights to that destination, how does your arrival
86 | delay compare? Answer this by comparing your flight to other flights
87 | on the same route taken within two weeks of the day of the flight.
88 | Calculate the percentile of your arrival delay in that distribution
89 | and then bring the data into R to visualize where your flight is in
90 | the context of that distribution.
91 |
92 | 6. Which routes are served by the greatest number of carriers? List the
93 | top five.
94 |
95 | 7. Which airplane has traveled the greatest distance over the six
96 | months covered by this data set? How long in total was it airborne?
97 | How many distinct routes did it fly?
98 |
99 | 8. Certain airports are notorious for the amount of time you have to
100 | spend on the tarmac waiting to taxi to your gate. What are the top
101 | five airports (the worst offenders) in terms of time spent on the
102 | tarmac taxiing upon arrival?
103 |
--------------------------------------------------------------------------------
/labs/lab-07.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-07.pdf
--------------------------------------------------------------------------------
/labs/lab-08.md:
--------------------------------------------------------------------------------
1 | Lab 8: Wrangling Arrival
2 | ================
3 |
4 |
5 |
6 | This is the third and final installment in the data wrangling series.
7 | You’ll wrap things up by practicing three last skills common in data
8 | science.
9 |
10 | - Date-time methods
11 | - String methods
12 | - Joins
13 |
14 | Begin, as before, by loading in the data from the `boxofdata` package
15 | and then into Python.
16 |
17 | ``` r
18 | # install.packages("remotes")
19 | # remotes::install_github("andrewpbray/boxofdata")
20 | library(boxofdata)
21 | library(reticulate)
22 | library(dplyr)
23 | data(flights)
24 | small_flights <- flights %>%
25 | sample_frac(.10)
26 | ```
27 |
28 | ``` python
29 | import pandas as pd
30 | flights = r.flights
31 | ```
32 |
33 | 1. You saw in lecture that if you have a column that is a Series of
34 | strings, you can append the *string accessor* `.str` to have access
35 | to a host of useful element-wise operations for strings. You can
36 | access analogous functionality for a Series of date-time data using
37 | accessor `.dt`. Take a look at the available methods either in the
38 | [Pandas docs
39 | online](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#time-date-components)
40 | or by calling `dir(flights["time_hour"].dt)` (note that when you
41 | call `flights.dtypes`, `time_hour` is of type `datetime64`).
42 |
43 | Use date-time methods to determine which day of the week is best to
44 | fly from the Bay Area to Seattle if you want to minimize delays.
45 | Does the answer differ between Oakland and San Francisco Airports?
46 |
47 | 2. Return to a question from the second data wrangling lab:
48 |
49 | > Certain airports are notorious for the amount of time you have to
50 | > spend on the tarmac waiting to taxi to your gate. What are the top
51 | > five airports (the worst offenders) in terms of time spent on the
52 | > tarmac taxiing upon arrival?
53 |
54 | The main challenge was the calculation of total time (total time
55 | minus air time gives you what can be interpreted as tarmac time).
56 | Arrival and departure times are always listed in local time, so it
57 | doesn’t work to simply find the difference between these two if the
58 | plane changed time zones. You can now account for this
59 | programmatically because the timezone of all of the airports is
60 | available in the data set in `boxofdata` called `airports`.
61 |
62 | Use your knowledge of how to join multiple tables to re-answer this
63 | question in a more satisfactory way.
64 |
65 | 3. Return to a question from the first data wrangling lab:
66 |
67 | > Create a second data set called `nyc_jul_flights` that is similar,
68 | > but contains data from July. Calculate the mean and median
69 | > departure delay. Use any knowledge you have about air travel and
70 | > weather to speculate why you see this difference between July and
71 | > November.
72 |
73 | You found that, perhaps surprisingly, delays were more common in
74 | July than they are in November. One of the explanations offered by
75 | several students is that during the summer, San Francisco Bay often
76 | gets fog that can delay flights at both airports. Evaluate this
77 | explanation using the data found in the `weather` data set in
78 | `boxofdata`. Explain clearly how you are using the weather data to
79 | infer the presence of fog.
80 |
81 | 4. The two biggest manufacturers of commercial aircraft are Boeing and
82 | Airbus, followed by Embraer and Bombardier. For each of these
83 | manufacturers, which airline schedules the greatest proportion of
84 | their flights out of SFO and OAK using planes made by that
85 | manufacturer? The `planes` data frame in `boxofdata` is useful here.
86 | Note that the manufacturers sometimes use slight variants on their
87 | names; you will want to collapse those variants into a single name.
88 |
--------------------------------------------------------------------------------
/labs/lab-08.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-08.pdf
--------------------------------------------------------------------------------
/labs/lab-09.md:
--------------------------------------------------------------------------------
1 | Lab 9: Reshaping Votes
2 | ================
3 |
4 |
5 |
6 | In 1998, Oregon became the first state to conduct its elections
7 | exclusively by mail. Every election, all registered voters are
8 | automatically mailed ballots to their home address roughly a month
9 | before the election. Voters then vote at their leisure and mail back or
10 | drop off their ballot sometime before the election closes. Due to
11 | concerns about the Covid-19 pandemic, many other states have implemented
12 | some version of Oregon’s voting system in preparation for the November
13 | 3rd, 2020 election next Tuesday.
14 |
15 | Since mid-October, when the first completed ballots started returning to
16 | Oregon election offices, the Secretary of State’s office has been
17 | posting the ballot return data on [their
18 | website](https://sos.oregon.gov/voting/Pages/current-election.aspx). In
19 | this lab you’ll be using this data to construct three visualizations
20 | that will help understand some of the dynamics of the 2020 election in
21 | Oregon.
22 |
23 | ### Set-up
24 |
25 | The election returns data is stored in a pdf on the website linked above
26 | and titled, “Unofficial Daily Ballot Returns”. This is a common way that
27 | public data is distributed and it is . . . unfortunate. As a file
28 | format, a pdf is only meaningful to a program that is designed to
29 | display that format. You can’t, for example, open it up in a text editor
30 | and pull out anything intelligible. It is meant as a *display* format,
31 | not a format to encourage further engagement and analysis.
32 |
33 | Some years ago, this would be a dead end for data analysis unless you
34 | wanted to transcribe those data tables manually. Now, however, there is
35 | software that reads the pdfs and does its best to extract the tabular
36 | data. The `tabula` package is an example of such software and works
37 | quite well. It can be installed within an R chunk using
38 | `reticulate::py_install("tabula-py", pip = TRUE)` (although you install
39 | the package called `tabula-py`, you’ll be importing a package called
40 | `tabula`).
41 |
42 | 1. Download the pdf from the Secretary of State’s website to your
43 | machine and put it in the same directory as your lab .Rmd file. Use
44 | the function `tabula.read_pdf()` to read in the pdf file and tinker
45 | with its arguments to figure out how it works. What data structure
46 | does the function return? How many tables total does the function
47 | identify in the pdf?
48 |
49 | #### Plot 1: Cumulative ballot returns for Multnomah and Deschutes counties
50 |
51 |
52 |
53 | There are several steps needed to clean up and process the data before
54 | you can plot it.
55 |
56 | 2. You’ll notice that there are several rows at the bottom of the table
57 | that contain aggregate information. Since you won’t need them for
58 | your plot, strip out these rows.
59 |
60 | 3. The first column has a lengthy name, so `.rename()` it to
61 | `"County"`.
62 |
63 | 4. The unit of observation in your data set at this point is a single
64 | county and on that county you observe a count at each of several
65 | dates. Reshape this data frame so that the unit of observation is a
66 | county on a particular date and the variable recorded is the
67 | returned ballot count. Your resulting data frame should have three
68 | columns: `"County", "Date", and "Count"`.
69 |
70 | 5. When the data frame was read in, the `Count` column read in as a
71 | string Series with commas. Strip out those commas. There are
72 | multiple ways this can be done, but a good starting place is a data
73 | frame method called `.replace()` or string methods.
74 |
75 | 6. Now that the counts are in good shape, you need to be sure the data
76 | types of all of the columns are as you want them. `"County"` should
77 | be strings, so that one’s all set. `"Count"`, however, should be
78 | either integer or numeric and `"Date"` should be a datetime. There
79 | are, as always, multiple ways to skin this cat. I recommend looking
80 | into a handful of Pandas functions named `to_XXX` where `XXX` is the
81 | type that you want to convert to. There is also a `.astype()` data
82 | frame method that works.
83 |
84 | The conversion of `"Date"` to a datetime can be tricky, so approach
85 | that one carefully. You’ll need to specify the `format`, which you
86 | do as a string using a particular [format
87 | code](https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior)
88 | that indicates what you’re giving it (in this case, the abbreviation
89 | of the month name and the day of the month). Post on Piazza if you
90 | run into problems.
91 |
92 | 7. Instead of plotting the raw count, the plot that you’re aiming for
93 | has on the y-axis the *cumulative* count. Create that new column -
94 | the cumulative count within each county.
95 |
96 | 8. From this parent data frame, create two child data frames: one with
97 | the data from Multnomah County (Portland metro area) and the other
98 | containing the data from Deschutes County (Bend metro area).
99 |
100 | 9. Use `matplotlib` to construct the final plot shown above. Your main
101 | guide through this will be the [first few pages in Python Data
102 | Science
103 | Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/04.00-introduction-to-matplotlib.html).
104 | A few notes:
105 |
106 | - You’ll want to make a single figure with two subplots.
107 | - Rotate the labels on the x-axis so that they’re legible.
108 | - Label the axes and add a title to each subplot.
109 | - Save the figure using `plt.savefig()` and include it in your Rmd
110 | by adding an R chunk containing
111 | `knitr::include_graphics("myfig.png")`.
112 |
113 | #### Plot 2: Cumulative ballot returns for all Oregon counties
114 |
115 |
116 |
117 | 10. Plot the cumulative ballot count curves for *all* Oregon counties on
118 | a single plot. This requires two steps: a) reshaping your data so
119 | that each row corresponds to a date time and there is a column for
120 | each county containing its cumulative counts and b) iterating over
121 | `plot()` to draw a line for each county onto the same plot. For step
122 | b), mimic the for-loop structure found in the example
123 | [here](https://python-graph-gallery.com/124-spaghetti-plot/). A few
124 | notes:
125 |
126 | - Make all lines the same color (there are too many to discern
127 | anyway).
128 | - Shine up the labels and add a title as before.
129 | - Save the file as before and include it in your Rmd.
130 |
131 | Yes, this is an ineffective plot for displaying this structure.
132 | We’ll fix it at a later date.
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 | #### Plot 3: Comparing ballot returns by party
151 |
152 |
153 |
154 | The final page of the pdf adds in another piece of important
155 | information: it breaks out the ballot return numbers by party. Use it to
156 | create the plot above, which shows, as of October 29th, the voter
157 | turnout among Democrat, Republican, and Unaffiliated voters in
158 | Deschutes County. Deschutes County is particularly interesting because
159 | it is fairly evenly split between voters of all three affiliations. As
160 | in plot 2, the form of this plot has some deficiencies that we’ll
161 | discuss, but it’s a place to start for now.
162 |
163 | Some notes:
164 |
165 | - You can recycle much of your code from plot 1.
166 | - Pdf reading is not an exact science. Be sure to spot check the
167 | resulting data frame and make any necessary corrections.
168 | - Focus only on Republican, Democrat, and Unaffiliated voters and only
169 | on Deschutes County.
170 | - Instead of using `.plot()`, you’ll be using `.bar()`. Read its
171 | documentation to understand the structure of the data that it
172 | expects.
173 |
--------------------------------------------------------------------------------
/labs/lab-09.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-09.pdf
--------------------------------------------------------------------------------
/labs/lab-10.md:
--------------------------------------------------------------------------------
1 | Lab 10: Scraping By
2 | ================
3 |
4 |
5 |
6 | Your objective for this lab is to use `seaborn` to visualize the
7 | cumulative vote totals for Donald Trump and Joe Biden in the days
8 | following the elections. The data you’ll be using are displayed here:
9 |
10 |
11 |
12 | and can be downloaded here:
13 |
14 |
15 |
16 | This data set has been scraped from the New York Times website as a JSON
17 | file and saved as a .csv, as described
18 | [here](https://alex.github.io/nyt-2020-election-scraper/).
19 |
20 | Produce two plots, one for Pennsylvania and one for Georgia. For each
21 | plot,
22 |
23 | - Be sure the dates on the x-axis are easily legible.
24 | - Color line for Donald Trump red and the line for Joe Biden blue.
25 | - Add titles
26 | - [Annotate the
27 | point](https://jakevdp.github.io/PythonDataScienceHandbook/04.09-text-and-annotation.html)
28 | in time when the AP declared Biden the winner (`plt.axvline` may
29 | also be useful).
30 |
--------------------------------------------------------------------------------
/labs/lab-10.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-10.pdf
--------------------------------------------------------------------------------
/labs/lab-11.md:
--------------------------------------------------------------------------------
1 | Lab 11: Model Homes
2 | ================
3 |
4 |
5 |
6 | If you’ve spent any time on either *zillow.com* or *redfin.com*, you
7 | have likely seen the price estimate that they give for any home. This
8 | prediction is generated by a regression model that uses covariates on
9 | the house - square footage, number of bedrooms, neighborhood, lot size,
10 | etc - to estimate its price. The models that they use are non-linear,
11 | but you can fairly easily construct a linear model that will produce
12 | similar predictions.
13 |
14 | That is your big-picture goal for this lab: to fit a multiple linear
15 | regression model to a housing data set with the goal of predicting home
16 | prices. Your main Python resource will be the `statsmodels` package,
17 | which has [very good
18 | documentation](https://www.statsmodels.org/stable/index.html). The data
19 | that you’ll use to fit your model come from four neighborhoods and
20 | cities in the Los Angeles area. It lists all of the homes that were sold
21 | in a one month interval. The data set is stored on the course website as
22 | a .csv.
23 |
24 | `https://raw.githubusercontent.com/andrewpbray/python-for-r-users/master/data/la-homes.csv`
25 |
26 | ### Data Import, Wrangling, and EDA
27 |
28 | 1. Load the data set into Python. How many observations are there? How
29 | many variables?
30 |
31 | 2. Before you get to building your model, there are several data
32 | wrangling tasks that you need to take care of first. Calculate the
33 | number of missing values (`NaN`) using the `.isnull()` data frame
34 | method combined with the `.sum()` method. If any variables are
35 | entirely null, drop them from the data frame.
36 |
37 | 3. How many different unique values are there in the `type` column? How
38 | many observations of each one? Filter the data set so that it only
39 | includes `SFR` (single family residences).
40 |
41 | 4. Plot the distribution of the number of bedrooms.
42 |
43 | 5. Plot the distribution of `garage`. What type of data is this being
44 | stored as? Coerce it into a Series of type integer.
45 |
46 | 6. Create a scatterplot of the relationship between square footage and
47 | home price. How would you describe this relationship? For this plot,
48 | you can either use `seaborn` or experiment with `plotly`.
49 |
50 | ### Modelling Take I
51 |
52 | 7. Use `statsmodels` to fit a simple linear regression model that
53 | predicts price as a function of square footage. Add the regression
54 | line to your scatterplot. Does your model do a good job of capturing
55 | the mean function of the data (does it go through the center of the
56 | cloud of points)?
57 |
58 | 8. Print the summary table of the regression model. How many
59 | observations was this model fit to? Compare this to the size of the
60 | data set - what does this suggest about how this model handles `NaN`
61 | values that might appear in certain columns?
62 |
63 | 9. Create a plot of the residuals against `x`. Does it appear that the
64 | residuals have equal variance or does the variance appear to be a
65 | function of the `x` variable?
66 |
67 | 10. Create a plot of the Cook’s Distance, a measure of how influential a
68 | single observation is to the coefficient estimates. Which homes does
69 | it flag as the most influential?
70 |
71 | ### Modelling Take II
72 |
73 | 11. Remove those most influential observations and make one other
74 | structural change to your model: create two columns that are the
75 | natural log of square footage and price. Refit your model to this
76 | smaller data set and using these new variables. How do the residual
77 | plot and the influence plot change?
78 |
79 | 12. Augment this model by adding a categorical predictor: city. Based on
80 | their estimated coefficients, list the cities from most to least
81 | expensive (controlling for the size of the house). Are these
82 | coefficients found to be statistically significantly different than
83 | zero?
84 |
85 | 13. Add `bed` to your model. What is the sign of its estimated
86 | coefficient? Speculate as to why this structure might exist using
87 | your knowledge of the way that people value homes.
88 |
89 | 14. Which is the most undervalued house in this data set according to
90 | your model?
91 |
92 | 15. Use the `.predict()` method to estimate the price of a 1550 square
93 | foot house in Long Beach with 3 bedrooms (see the `statsmodels`
94 | documentation for `.predict()`).
95 | Spot check this by hand by plugging those x values into your
96 | regression equation using the coefficient estimates.
97 |
--------------------------------------------------------------------------------
/labs/lab-11.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/labs/lab-11.pdf
--------------------------------------------------------------------------------
/lectures/01-peek-into-python-code.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "A Peek into Python"
3 | output: html_document
4 | editor_options:
5 | chunk_output_type: console
6 | ---
7 |
8 | ```{r include = FALSE}
9 | knitr::opts_chunk$set(error = TRUE, echo = TRUE)
10 | ```
11 |
12 | ## Accessing and installing Python
13 |
14 | ```{r eval = FALSE}
15 | install.packages("reticulate")
16 | ```
17 |
18 | ```{r}
19 | library(reticulate)
20 | ```
21 |
22 | > **Poll**: Were you able to install `reticulate` and `miniconda`?
23 |
24 |
25 | ## Python as a calculator
26 |
27 | ```{python}
28 | 1 + 3
29 | 1 - 3
30 | 2 * 3
31 | 2 / 3
32 | 2 // 3
33 | 3 // 2
34 | 3 % 2
35 | 11 % 3
36 | 2 ^ 3
37 | 2 ** 3
38 | ```
39 |
40 |
41 | ## Python basics: syntax
42 |
43 | ```{r}
44 | a <- 1 + 3
45 | a + 1
46 | ```
47 |
48 | ```{python}
49 | # Does python have access to the object a?
50 | a
51 | ```
52 |
53 | ```{python}
54 | b <- 1 + 3
55 | # What's going on?
56 | ```
57 |
58 | ```{python}
59 | # Objects are assigned with `=`
60 | b = 1 + 3
61 | # Comments are still marked by `#`
62 | print(b)
63 | ```
64 |
65 | > **Poll:** Which of the following will throw an error?
66 |
67 | ```{r whitespace-poll, eval = FALSE}
68 | # A
69 | a = 1 + 3
70 |
71 | # B
72 | a = 1 + 3
73 |
74 | # C
75 | if(a > 2) {
76 |
77 | "so big!"
78 | }
79 |
80 | # D
81 | if(a > 2) {
82 | "so big!"
83 | }
84 | ```
85 |
86 | ```{python}
87 | # A
88 | b = 1 + 3
89 |
90 | # B
91 | b = 1 + 3
92 |
93 | # C
94 | if ( b > 2 ) :
95 | print( "so low!")
96 |
97 | # D
98 | if (b > 2):
99 | print("so low!")
100 |
101 | ```
102 |
103 |
104 |
105 | ## Python basics: semantics
106 |
107 | ### Variables
108 |
109 | ```{r}
110 | a <- 5
111 | a2 <- a
112 | a <- 6
113 | a2
114 | ```
115 |
116 | ```{python}
117 | b = 5
118 | b2 = b
119 | b = 6
120 | print(b2)
121 | ```
122 |
123 | ```{r}
124 | a <- c(1, 5, 3)
125 | sort(a)
126 | a
127 | ```
128 |
129 | ```{python}
130 | b = [1, 5, 3]
131 | b.sort()
132 | print(b)
133 | ```
134 |
135 | ```{python}
136 | b = [1, 5, 3]
137 | b2 = b
138 | b.sort()
139 | b2
140 | ```
141 |
142 | ```{r}
143 | a <- c(1, 5, 3)
144 | a2 <- a
145 | a <- sort(a)
146 | a2
147 | ```
148 |
149 | ### Objects
150 |
151 | ```{python}
152 | # These pointers have types
153 | anumber = 3.4
154 | type(anumber)
155 | aword = "hello"
156 | type(aword)
157 |
158 | # They also have attributes, accessible with `.`
159 | anumber.real
160 | anumber.imag
161 |
162 | # They also have associated methods
163 | print(b)
164 | type(b)
165 | b.append(9)
166 | print(b)
167 |
168 | # everything is an object
169 | type(b.append)
170 | ```
171 |
172 | ## A note about reticulate
173 |
174 | ```{python}
175 | b
176 | ```
177 |
178 | ```{r}
179 | py$b
180 | a
181 | ```
182 |
183 | ```{python}
184 | r.a
185 | ```
186 |
187 |
188 |
--------------------------------------------------------------------------------
/lectures/01-peek-into-python-slides.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Week 1: A Peek Into Python"
3 | subtitle: "reticulate, basic syntax and semantics"
4 | author: "STAT 198/298 Fall 2020"
5 | output:
6 | xaringan::moon_reader:
7 | css: ["metropolis", "cal.css"]
8 | lib_dir: libs
9 | nature:
10 | highlightStyle: atelier-forest-light
11 | highlightLines: true
12 | highlightSpans: true
13 | countIncrementalSlides: false
14 | ---
15 |
16 | ```{r include = FALSE}
17 | knitr::opts_chunk$set(message = FALSE, fig.align = "center")
18 | library(tidyverse)
19 | ```
20 |
21 | # Something to start things off
22 |
23 | In a browser on your laptop or phone, go to slido.com and enter event code #Z837.
24 |
25 | > **Poll**: Describe your first week of classes in one word.
26 |
27 | --
28 | * * *
29 |
30 | ## Slido
31 |
32 | - Polls
33 | - Q & A (with upvotes)
34 |
35 |
36 | ---
37 | # By the end of this course you will...
38 |
39 | --
40 |
41 | - Use Python to conduct the full data science life cycle on a simple project, including data import, wrangling, visualization, and modeling.
42 |
43 | --
44 |
45 | - Have a sense of the software ecosystem involving R and Python.
46 |
47 | --
48 |
49 | - Be able to describe the relative strengths of Python and R.
50 |
51 | --
52 |
53 | - Understand more about the structure of R.
54 |
55 |
56 | ---
57 | # Disclaimer
58 |
59 | --
60 |
61 | I am not a Python expert. I'm an R User that's (re-)learning Python.
62 |
63 |
64 | ---
65 | # Course Structure
66 |
67 | ## Course website
68 |
69 | - `www.github.com/andrewpbray/python-for-r-users`
70 |
71 | ## Course communication
72 |
73 | - `www.piazza.com/`
74 |
75 | ## Assignments
76 |
77 | - Submitted and graded on `www.gradescope.com`
78 |
79 | ---
80 | # Lecture Style
81 |
82 | - Slides + live coding
83 |
84 | --
85 |
86 | - Feel free to play along, but don't get tunnel vision
87 |
88 | --
89 |
90 | - If you encounter an issue / error, please ask over Slido.
91 |
92 |
93 | ---
94 | # Accessing and installing Python
95 |
96 | ## How will we access Python?
97 |
98 | --
99 |
100 | We'll start off using `{reticulate}`, an R package.
101 |
102 | --
103 |
104 | ## Which installation will we use?
105 |
106 | --
107 |
108 | `miniconda`, a lightweight installation that includes Python and conda, a package management system.
109 |
110 | --
111 |
112 | > **Poll**: Were you able to install `reticulate` and `miniconda`?
113 |
114 | ---
115 | # An expanded ecosystem
116 |
117 | ## Where does `{reticulate}` fit in?
118 |
119 | --
120 |
121 | ```{r echo = FALSE, out.width="100%"}
122 | knitr::include_graphics("figs/r-python-diagram.png")
123 | ```
124 |
125 | --
126 |
127 | `{reticulate}` is an R package that starts a Python session within your R session.
128 |
129 | ---
130 | # Alternatives to {reticulate}
131 |
132 | 1. Running `python` at the terminal
133 |
134 | --
135 |
136 | 2. Running `ipython` at the terminal
137 |
138 | --
139 |
140 | 3. Running `python` within a Jupyter notebook
141 |
142 |
143 | ---
144 | # Python as a calculator
145 |
146 | --
147 |
148 | ```{r echo = FALSE, out.width="100%"}
149 | knitr::include_graphics("figs/math-operators.png")
150 | ```
151 |
152 | ---
153 | # Python basics: syntax
154 |
155 | ## How do you assign objects?
156 | -
157 |
158 |
159 | ## How do you print?
160 | -
161 |
162 |
163 | ## How do you comment?
164 | -
165 |
166 |
167 | ## How does whitespace work?
168 | -
169 |
170 |
171 | ---
172 | # Whitespace
173 |
174 | > **Poll:** Which of the following will throw an error?
175 |
176 | ```{r whitespace-poll, eval = FALSE, echo = TRUE}
177 | # A
178 | a = 1 + 3
179 |
180 | # B
181 | a = 1 + 3
182 |
183 | # C
184 | if(a > 2) {
185 |
186 | "so big!"
187 | }
188 |
189 | # D
190 | if(a > 2) {
191 | "so big!"
192 | }
193 | ```
194 |
195 | ---
196 | # Python basics: syntax
197 |
198 | ## How do you assign objects?
199 | - Assign with `=`
200 |
201 | ## How do you print?
202 | - `print()` or type name of object
203 |
204 | ## How do you comment?
205 | - Comments marked by `#`
206 |
207 | ## Does whitespace matter?
208 | - Yes (indentation) and no (midline)
209 |
210 |
211 | ---
212 | # Python basics: semantics
213 |
214 | --
215 |
216 | ## What are variables?
217 |
218 | -
219 | -
220 |
221 | ## What are objects?
222 |
223 | -
224 |
225 | ---
226 | # Python basics: semantics
227 |
228 | ## What are variables?
229 |
230 | - Variables do not need to be declared
231 | - Variables are *pointers* -> functions can change their arguments.
232 |
233 | ## What are objects?
234 |
235 | -
236 |
237 |
238 | ---
239 | # John Chambers on S (2008)
240 |
241 | > The central computation in R is a function call, defined by the function object itself and the objects that are supplied as the arguments. In the functional programming model, the result is defined by another object, the value of the call. Hence the traditional motto of the S language: **everything is an object**—the arguments, the value, and in fact the function and the call itself: All of these are defined as objects. Think of objects as collections of data of all kinds. The data contained and the way the data is organized depend on the class from which the object was generated.
242 |
243 | ---
244 | # Python basics: semantics
245 |
246 | ## What are variables?
247 |
248 | - Variables do not need to be declared
249 | - Variables are *pointers* -> functions can change their arguments.
250 |
251 | ## What are objects?
252 |
253 | - Everything is an object: data with *attributes* and *methods*
254 |
255 |
256 | ---
257 | # A note about {reticulate}
258 |
259 |
260 | ---
261 |
262 | ```{r echo = FALSE}
263 | knitr::include_graphics("figs/mic-drop.gif")
264 | ```
265 |
266 |
267 | ---
268 | # A note about {reticulate}
269 |
270 | `{reticulate}` can pass objects between your Python and R session.
271 |
272 | - Python object `b` can be accessed in R using py$b
273 | - R object `a` can be accessed in Python using r.a
274 |
275 | ---
276 | # Until next time
277 |
278 | --
279 |
280 | - Schedule a meeting if you need help configuring your system (link in chat)
281 |
282 | --
283 |
284 | - Homework 01 / Lab 01 will be posted by midnight tonight and due Sunday at 8 pm
285 | - Submit work as pdfs to gradescope
286 |
--------------------------------------------------------------------------------
/lectures/01-peek-into-python-slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/01-peek-into-python-slides.pdf
--------------------------------------------------------------------------------
/lectures/02-types-and-structures-code.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Week 2"
3 | subtitle: "operators, types, and data structures"
4 | output: html_document
5 | editor_options:
6 | chunk_output_type: console
7 | ---
8 |
9 | # Operators
10 |
11 | ```{r}
12 | a <- a + 1
13 | `+`(a, 1)
14 | `<-`(a,)
15 | ```
16 |
17 |
18 | ## Update
19 |
20 | ```{python}
21 | a = 2
22 | b = 3
23 |
24 | # addition
25 | a += b
26 |
27 | # subtraction
28 | a -= b
29 |
30 | # multiplication
31 | a *= b
32 |
33 | # division
34 | a /= b
35 | ```
36 |
37 | ## Comparison
38 |
39 | ```{python}
40 | 2 < 3
41 | 2 <- 3
42 | (2 <- 3) == False
43 | "lo" < "hi"
44 | "lo" == "hi"
45 | "lo" != "hi"
46 | ```
47 |
48 | ## Boolean
49 |
50 | ```{python}
51 | # and
52 | x = 4
53 | (x < 6) and (x > 2)
54 |
55 | # boolean algebra
56 | True and True
57 | True and False
58 | False and False
59 |
60 | # and is like * of 1 and 0
61 | 1 * 1
62 | 1 * 0
63 | 0 * 0
64 | ```
65 |
66 | ```{python}
67 | # or
68 | (x < 6) or (x > 2)
69 | (x < 6) or (x == 2)
70 |
71 | # or is like + of 1 and 0
72 | 1 + 1
73 | min(1 + 1, 1)
74 | 1 + 0
75 | 0 + 0
76 | ```
77 |
78 | ```{python}
79 | (x < 6) | (x > 2)
80 | ```
81 |
82 | # Sets
83 |
84 | ```{python}
85 | # equality
86 | a = [1, 2]
87 | b = [1, 2]
88 | a == b
89 | a is b
90 | ```
91 |
92 | ```{python}
93 | # identity
94 | a = [1, 2]
95 | b = a
96 | a == b
97 | a is b
98 | ```
99 |
100 |
101 | # Types
102 |
103 | ## Integers
104 |
105 | ```{python}
106 | a = 3
107 | type(a)
108 |
109 | # An operation on objects of type integer can yield a non-integer.
110 | 3 / 2
111 | ```
112 |
113 | ## Floats
114 |
115 | ```{python}
116 | # anything with a decimal or in exponential notation
117 | type(3.2)
118 | type(1e6)
119 |
120 | # remember floats can only ever be so precise
121 | .1 + .2 == .3
122 | .1 + .2 < .3
123 | .1 + .2 > .3
124 |
125 | # don't check for exact equality with floats.
126 | import math
127 | math.isclose(.1 + .2, .3)
128 | ```
129 |
130 | ## Strings
131 |
132 | # POLL
133 |
134 | ```{python}
135 | # creating strings
136 | type("hello")
137 | type('hello')
138 |
139 | # some unexpected behavior
140 | a = "hello"
141 | b = " world"
142 | a + b
143 | a - b
144 | a * b
145 | a * 5
146 | a / b
147 | a[2]
148 | ```
149 |
150 |
151 |
152 | # Data Structures
153 |
154 | ## Lists
155 |
156 | ```{python}
157 | # Create with brackets
158 | a = [2, 3, 5, 7, 11]
159 | type(a)
160 |
161 | # Ordered? Yes
162 | a[1]
163 | a[0] # uses zero-based indexing
164 | a[-1] # counts backwards
165 | a[0:1] # use : for slicing
166 | a[:2] # omitting before implies 0
167 | a[:] # omitting after implies len(a)
168 | a[::2]
169 |
170 | # Mutable? Yes
171 | a.append(1)
172 | a.sort()
173 | a[0] = 99
174 |
175 | # note: you can also concatenate two lists
176 | a + [1] # not a mutation since a is unchanged
177 |
178 | # what R data structure does this most resemble? a vector?
179 | # Type heterogeneous? Yes
180 | a = [1, "hello", 3.2]
181 | type(a)
182 | a = [1, ["hello", " world"]] # can also be nested
183 | type(a)
184 |
185 | # a list!
186 | ```
187 |
188 | ## Tuples
189 |
190 | ```{python}
191 | # Create with parens
192 | t = (1, 2, 3)
193 | type(t)
194 | t2 = 1, 2, 3
195 | t == t2
196 |
197 | # Ordered? Yes
198 | t[0:1]
199 |
200 | # notes: nestable
201 | t3 = (1, 2, (4, 5))
202 | type(t3[2])
203 | type(t3[3])
204 |
205 | # Mutable? No
206 | t.append(4)
207 | t[0] = 0
208 |
209 | # Type heterogeneous? Yes
210 | t = (1, 2, 3, "hello")
211 | type(t)
212 | ```
213 |
214 | ## Sets
215 |
216 | ```{python}
217 | # created with {}
218 | s1 = {1, 3, 5}
219 | s2 = {1, 2}
220 | type(s1)
221 | type(s2)
222 |
223 | # Ordered? No
224 | s1[0]
225 |
226 | # Mutable? Yes
227 | s1.append(s2)
228 | s1.add(5)
229 |
230 | # note: support set operations
231 | s1 | s2
232 | s1 & s2
233 | s1 - s2
234 | s1 ^ s2
235 | s3 = {1, 3, {1, 3, 5}} # but not nestable
236 |
237 | # Type heterogeneous? Yes
238 | s1 = {1, 3, "hello"}
239 | type(s1)
240 | ```
241 |
242 |
243 | ## Dictionaries
244 |
245 | ```{python}
246 | # created with {} and keys
247 | d = {"one":1, "two":2, "three":3}
248 | type(d)
249 |
250 | # Ordered? No, but still indexed by key
251 | d["two"]
252 |
253 | # Mutable? Yes
254 | d["two"] = 22
255 |
256 | # note: can be nested
257 | d2 = {"one":1, "two":2, "three":{"four":4, "five":5}}
258 | type(d2)
259 |
260 | # Type heterogeneous? Yes
261 | d["two"] = "hello"
262 | ```
263 |
264 |
265 |
--------------------------------------------------------------------------------
/lectures/02-types-and-structures-slides.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Week 2"
3 | subtitle: "operators, types, and data structures"
4 | author: "STAT 198/298 Fall 2020"
5 | output:
6 | xaringan::moon_reader:
7 | css: ["metropolis", "cal.css"]
8 | lib_dir: libs
9 | nature:
10 | highlightStyle: atelier-forest-light
11 | highlightLines: true
12 | highlightSpans: true
13 | countIncrementalSlides: false
14 | ---
15 |
16 | ```{r include = FALSE}
17 | knitr::opts_chunk$set(message = FALSE, fig.align = "center")
18 | library(tidyverse)
19 | library(reticulate)
20 | ```
21 |
22 | # Send your phone here
23 |
24 | ```{r echo = FALSE, out.width="60%"}
25 | knitr::include_graphics("figs/slido-qr.png")
26 | ```
27 |
28 | Or send a browser to `slido.com`, event `#Z837`.
29 |
30 | ---
31 | # Learning a second language
32 |
33 | --
34 |
35 | ```{r echo = FALSE, out.width="80%"}
36 | knitr::include_graphics("figs/learning-languages.png")
37 | ```
38 |
39 | --
40 |
41 | > Learn Python to learn R.
42 |
43 | ---
44 | # Poll
45 |
46 | We can create the object `a` in R:
47 |
48 | ```{r}
49 | a <- 1
50 | ```
51 |
52 | What line of code will augment `a` by 1? (increase its value by 1)
53 |
54 | --
55 |
56 | ```{r}
57 | a <- a + 1
58 | a
59 | ```
60 |
61 | ---
62 | # Operators: what is it?
63 | --
64 |
65 | Take whatever object is on either side of the operator, perform an operation on it, and return the result...
66 |
67 | --
68 |
69 | Sounds like a *function* with two arguments.
70 |
71 | --
72 |
73 | ```{r}
74 | `+`(a, 1)
75 | ```
76 |
77 |
78 | ---
79 | # Operators: Assignment
80 |
81 | Python has *update* operators to make common assignment tasks more streamlined.
82 |
83 | --
84 |
85 | ```{python}
86 | r.a += 1
87 | r.a
88 | ```
89 |
90 | --
91 |
92 | For any operator `#`, the expression `a #= b` is equivalent to `a = a # b`.
93 |
94 |
95 | ---
96 | # Poll
97 |
98 | What will this code return?
99 |
100 | ```{python, eval = FALSE}
101 | a = 2
102 | b = 3
103 | a -= b
104 | a
105 | ```
106 |
107 | --
108 |
109 | ```{python, eval = TRUE, echo = FALSE}
110 | a = 2
111 | b = 3
112 | a -= b
113 | a
114 | ```
115 |
116 |
117 | ---
118 | # Poll
119 |
120 | What will this code return?
121 |
122 | ```{python, eval = FALSE}
123 | a = 2
124 | b = 3
125 | a **= b
126 | a
127 | ```
128 |
129 | --
130 |
131 | ```{python, eval = TRUE, echo = FALSE}
132 | a = 2
133 | b = 3
134 | a **= b
135 | a
136 | ```
137 |
138 |
139 | ---
140 | # Operators: Comparison
141 |
142 | A comparison of two objects yields `True` or `False`.
143 |
144 | ```{r echo = FALSE, out.width="60%"}
145 | knitr::include_graphics("figs/comparison-operators.png")
146 | ```
147 |
148 |
149 | ---
150 | # Operators: Comparison
151 |
152 | ## Notes
153 |
154 | - Boolean values: `True` and `False` and nothing else
155 | - Group operations with `(` and `)`
156 | - Validity of comparisons depends on object type
157 |
158 |
159 | ---
160 | # Operators: Boolean
161 |
162 | Operations that compose values of `True` and `False`.
163 |
164 | --
165 |
166 | ## Notes
167 |
168 | - Three operators: `and`, `or`, and `not`
169 | - Useful in conditionals (if-then)
170 | - *Not* the same as `&` and `|` (bitwise operations)
171 |
172 | ---
173 | # Operators: Sets
174 |
175 | `code`
176 |
177 | ---
178 | # Operators: Sets
179 |
180 | ## Equality
181 |
182 | Two variables are *equal* if they point to two objects that have the same value.
183 |
184 | --
185 |
186 | ```{r echo = FALSE, out.width="32%"}
187 | knitr::include_graphics("figs/equality.png")
188 | ```
189 |
190 | --
191 |
192 | ## Identity
193 |
194 | Two variables are *identical* if they point to the same object.
195 |
196 | --
197 |
198 | ```{r echo = FALSE, out.width="32%"}
199 | knitr::include_graphics("figs/identity.png")
200 | ```
201 |
202 | ---
203 | # Poll
204 |
205 | ```{r}
206 | a <- c(1, 3, 5)
207 | b <- 3
208 | ```
209 |
210 |
211 | Write the code to check to see if `b` is an element of set `a`.
212 |
213 |
214 | ---
215 | # Poll
216 |
217 | ```{r}
218 | a <- c(1, 3, 5)
219 | b <- 3
220 | ```
221 |
222 |
223 | Write the code to check to see if `b` is an element of set `a`.
224 |
225 | ```{r}
226 | b %in% a
227 | ```
228 |
229 |
230 | ---
231 | # Operators: Membership
232 |
233 | `b in a` checks if `b` is in `a`
234 |
235 | ```{python}
236 | 3 in [1, 3, 5]
237 | ```
238 |
239 | --
240 |
241 | `b not in a` checks if `b` is not in `a`
242 |
243 | ```{python}
244 | 3 not in [1, 3, 5]
245 | ```
246 |
247 |
248 | ---
249 | # Check for Q & A
250 |
251 | ---
252 | # Types
253 |
254 | The most basic form of how a piece of data can be stored.
255 |
256 | --
257 |
258 | - Integer
259 | - Floating-point number
260 | - String
261 | - Boolean (logical)
262 | - [Complex]
263 | - [NoneType]
264 |
265 | ---
266 | # Types
267 |
268 | ## Notes
269 |
270 | - *Integers* are precise numbers
271 |
272 | --
273 |
274 | - *Floats* are approximate fractional numbers, so only check for approximate equality.
275 |
276 | --
277 |
278 | - *Strings* allow some arithmetic operations, direct indexing.
279 |
280 | ---
281 | # Data Structures
282 |
283 | Data structures are *compound types* that act as containers for simple types. The ones built into Python 3:
284 |
285 | --
286 |
287 | - List
288 | - Tuple
289 | - Dictionary
290 | - Set
291 |
292 | --
293 |
294 | When working with a data structure, ask:
295 |
296 | 1. Is it ordered? (index by integer)
297 | 2. Is it heterogeneous? (different types)
298 | 3. Is it mutable? (change elements)
299 |
300 | ---
301 | # Code
302 |
303 | ---
304 | # List indexing
305 |
306 | ```{r echo = FALSE, out.width="100%"}
307 | knitr::include_graphics("figs/list-indexing.png")
308 | ```
309 |
310 | --
311 |
312 | ```{r echo = FALSE, out.width="40%"}
313 | knitr::include_graphics("figs/list-indexing-bah.gif")
314 | ```
315 |
316 | ---
317 | # code
318 |
319 | ---
320 | # Data Structures Summary
321 |
322 | ```{r echo = FALSE, out.width="80%"}
323 | knitr::include_graphics("figs/data-structures.png")
324 | ```
325 |
326 |
327 | ---
328 | # Assignments
329 |
330 | ## Homework 2
331 |
332 | Posted end of the day today, due Friday 8 pm
333 |
334 | ## Lab 2
335 |
336 | Posted end of the day today, due Sunday 8 pm
--------------------------------------------------------------------------------
/lectures/02-types-and-structures-slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/02-types-and-structures-slides.pdf
--------------------------------------------------------------------------------
/lectures/03-functions-methods-code.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Week 3"
3 | subtitle: "functions and methods"
4 | author: "STAT 198/298 Fall 2020"
5 | output: html_document
6 | ---
7 |
8 | ```{r include = FALSE}
9 | knitr::opts_chunk$set(message = FALSE, fig.align = "center")
10 | library(tidyverse)
11 | library(reticulate)
12 | ```
13 |
14 |
15 | ## Review
16 |
17 | # Equal vs identical copies
18 |
19 | ```{python}
20 | a = [1, 3, 5]
21 | y = a
22 |
23 | a is y
24 |
25 | a[1] = 99
26 | a is y
27 |
28 | a = [1, 3, 5]
29 | y = list(a)
30 | a is y
31 |
32 | a = [1, 3, 5]
33 | y = a[:]
34 | a is y
35 | ```
36 |
37 |
38 | # Functions
39 |
40 | # Documentation and arguments
41 |
42 | ```{python}
43 | print(a)
44 | ?
45 | print("hello", "world", sep = " whole wide ")
46 | print(a, end = "victory!")
47 | print(end = "victory!", value = a)
48 | ```
49 |
50 | ```{python}
51 | round(3.141)
52 | ?round
53 | round(3.141, ndigits = 2)
54 | ```
55 |
56 |
57 | # Classes and Methods
58 |
59 | ## Strings
60 |
61 | ```{python}
62 | a = "hello"
63 | type(a)
64 | dir(a)
65 |
66 | # Attributes
67 | a.__class__
68 | a.__len__()
69 | a.__contains__("h")
70 |
71 | # Methods
72 | a.upper()
73 | ?a.replace
74 | a.replace("ll", "r")
75 | ?a.index
76 | a.index("ll")
77 | ```
78 |
79 | ## Lists
80 |
81 | ```{python}
82 | a = [1, 3, 5]
83 | dir(a)
84 |
85 | # Attributes
86 | a.__class__
87 | a.__len__()
88 |
89 | # Methods
90 | a.index(5)
91 | a.count(3)
92 | a.append(3)
93 | a.count(3)
94 | a.sort()
95 |
96 | a.__doc__
97 | ?a.__sizeof__
98 | ```
99 |
100 |
101 | ## OOP in R
102 |
103 | ```{r}
104 | x <- rnorm(100)
105 | m1 <- lm(mpg ~ hp, data = mtcars)
106 |
107 | class(x)
108 | class(m1)
109 |
110 | ?plot
111 | ?plot.lm()
112 |
113 | # Look up {forecast} documentation
114 |
115 | summary(x)
116 | s <- summary(m1)
117 | typeof(s)
118 | typeof(m1)
119 | ?print.summary.lm
120 | ```
121 |
122 |
123 |
124 |
125 |
126 |
127 |
--------------------------------------------------------------------------------
/lectures/03-functions-methods-slides.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Week 3"
3 | subtitle: "functions and methods"
4 | author: "STAT 198/298 Fall 2020"
5 | output:
6 | xaringan::moon_reader:
7 | css: ["metropolis", "cal.css"]
8 | lib_dir: libs
9 | nature:
10 | highlightStyle: atelier-forest-light
11 | highlightLines: true
12 | highlightSpans: true
13 | countIncrementalSlides: false
14 | ---
15 |
16 | ```{r include = FALSE}
17 | knitr::opts_chunk$set(message = FALSE, fig.align = "center")
18 | library(tidyverse)
19 | library(reticulate)
20 | ```
21 |
22 |
23 | --
24 | # Send your phone here
25 |
26 | ```{r echo = FALSE, out.width="60%"}
27 | knitr::include_graphics("figs/slido-qr.png")
28 | ```
29 |
30 | Or send a browser to `slido.com`, event `#Z837`.
31 |
32 | ---
33 | # Poll:
34 |
35 | If two objects are identical, that means...
36 |
37 | ---
38 |
39 | # Review
40 |
41 | ## Equal vs identical copies
42 |
43 | - **Equal**: Same value
44 |
45 | - **Identical**: Same object in memory
46 |
47 | --
48 |
49 | `CODE`
50 |
51 |
52 | ---
53 | # Review
54 |
55 | ## Equal vs identical copies
56 |
57 | - **Equal**: Same value
58 |
59 | - **Identical**: Same object in memory
60 |
61 | If you want to create a non-identical copy, use:
62 |
63 | - `list(object)`
64 | - `object[:]`
65 |
66 |
67 | ---
68 | # Functions
69 | --
70 |
71 | **Function:** an encapsulated, reusable piece of code.
72 |
73 | --
74 |
75 | It's important to understand how a function deals with
76 |
77 | - Documentation
78 | - Arguments
79 |
80 | `CODE`
81 |
82 | ---
83 | ## Documentation
84 |
85 | - Access with `?` or `help()`
86 |
87 | ## Arguments
88 |
89 | - Specify by position (required for those args preceding `/`) or by name, when available.
90 | - Can access args at console using tab.
91 | - Defaults are indicated either in named arguments with `=` or in the accompanying text (the optional args listed inside `[, ]`)
92 |
93 |
94 | ---
95 | # Writing functions
96 |
97 | ## Python
98 |
99 | ```{python, eval = FALSE}
100 | a = [1, 3, 5]
101 | def victorious_print(object):
102 | return print(object, end = "victory!")
103 | victorious_print(a)
104 | ```
105 |
106 | --
107 |
108 | ```{python, echo = FALSE}
109 | a = [1, 3, 5]
110 | def victorious_print(object):
111 | return print(object, end = "victory!")
112 | victorious_print(a)
113 | ```
114 |
115 | --
116 |
117 | ## R
118 |
119 | ```{r eval = FALSE}
120 | victorious_print <- function(object) {
121 | paste(object, "victory!")
122 | }
123 | victorious_print(py$a)
124 | ```
125 |
126 | --
127 |
128 | ```{r echo = FALSE}
129 | victorious_print <- function(object) {
130 | paste(object, "victory!")
131 | }
132 | victorious_print(py$a)
133 | ```
134 |
135 | ---
136 | # Poll
137 |
138 | Write a function that takes a given string, and outputs it as a single string repeated `n` times, each one separated by a `.`.
139 |
140 | ---
141 | # Anonymous functions in Python
142 |
143 | --
144 |
145 | It can be helpful to have one-liner functions for use within another function, not to be reused or called by name. In Python these are called
146 | *lambda functions*.
147 |
148 | --
149 |
150 | ```{python eval = FALSE}
151 | list(map(lambda x: x * 2 , a))
152 | ```
153 |
154 | --
155 |
156 | ```{python echo = FALSE}
157 | list(map(lambda x: x * 2 , a))
158 | ```
159 |
160 | --
161 |
162 | These also exist in R.
163 |
164 | ---
165 | # Anonymous functions in R
166 |
167 | ```{r eval = FALSE}
168 | py$a %>%
169 | map(function(x) x * 2)
170 | ```
171 |
172 | --
173 |
174 | ```{r echo = FALSE}
175 | py$a %>%
176 | map(function(x) x * 2)
177 | ```
178 |
179 |
180 | ---
181 | # Anonymous functions in R
182 |
183 | ```{r, eval = FALSE, fig.height = 5}
184 | library(ggplot2)
185 | base <- ggplot() +
186 | xlim(-5, 5)
187 | base +
188 | geom_function(fun = dnorm)
189 | ```
190 |
191 | --
192 |
193 | ```{r, echo = FALSE, fig.height = 5}
194 | library(ggplot2)
195 | base <- ggplot() +
196 | xlim(-5, 5)
197 | base +
198 | geom_function(fun = dnorm)
199 | ```
200 |
201 |
202 | ---
203 | # Anonymous functions in R
204 |
205 | ```{r, fig.height = 5}
206 | base +
207 | geom_function(fun = function(x) 0.5*exp(-abs(x)))
208 | ```
209 |
210 |
211 | ---
212 | # Methods
213 |
214 | --
215 |
216 | *Methods* are functions associated with a particular kind of object. A particular kind of object is called a *class*.
217 |
218 | --
219 |
220 | ```{r echo = FALSE, out.width="60%"}
221 | knitr::include_graphics("figs/oop-cars.png")
222 | # from: https://www.viralpatel.net/object-oriented-programming-with-javascript/
223 | ```
224 |
225 | --
226 |
227 | - Class: car
228 | - Objects: VW Beetle, Ford Escort, Buick Landyacht, etc
229 | - All of these have their own associated *attributes* and *methods*.
230 |
231 | ---
232 | # Attributes vs Methods
233 |
234 | *Attributes* are properties that distinguish one instance of an object from others in its class. *Methods* are functions specific to that class that take the object as an argument.
235 |
236 | --
237 |
238 | ## Car attributes
239 | - Make
240 | - Model
241 | - Year of Manufacture
242 |
243 | --
244 |
245 | ## Car methods
246 | - Drive
247 | - Brake
248 | - Fill with gas
249 |
250 | ---
251 | `CODE`
252 |
253 | ---
254 | # Attributes vs Methods in Python
255 |
256 | Query both on an object with `dir()` or use tab completion.
257 |
258 | - Attributes take the format `__attribute__` and methods just `method`.
259 | - Both can be called by prefixing with `object`.
260 | - Some methods change the object, others do not.
261 |
262 |
263 |
264 | ---
265 | # Recall: Poll
266 |
267 | Write a function that takes a given string, and outputs it as a single string repeated `n` times, each one separated by a `.`.
268 |
269 | --
270 |
271 | *Could have restricted this to work on strings either by checking the `type()` inside the function, or by making it a method for strings.*
272 |
273 |
274 |
275 | ---
276 | # Object-oriented Programming in R
277 |
278 | --
279 |
280 | Consider what happens when I use `plot()` in two different scenarios.
281 |
282 | --
283 |
284 | ```{r, eval = FALSE, fig.height=4}
285 | x <- rnorm(100)
286 | plot(x)
287 | ```
288 |
289 | --
290 |
291 | ```{r, echo = FALSE, fig.height=4}
292 | x <- rnorm(100)
293 | plot(x)
294 | ```
295 |
296 | ---
297 | # Object-oriented Programming in R
298 |
299 | Consider what happens when I use `plot()` in two different scenarios.
300 |
301 | --
302 |
303 | ```{r, eval = FALSE, fig.height=4}
304 | m1 <- lm(mpg ~ hp, data = mtcars)
305 | plot(m1)
306 | ```
307 |
308 | --
309 |
310 | ```{r, echo = FALSE, fig.height=4}
311 | m1 <- lm(mpg ~ hp, data = mtcars)
312 | plot(m1)
313 | ```
314 |
315 |
316 | ---
317 | # Object-oriented Programming in R
318 |
319 | --
320 |
321 | ```{r, eval = FALSE, fig.height=4}
322 | class(m1)
323 | class(x)
324 | ```
325 |
326 | --
327 |
328 | ```{r, echo = FALSE, fig.height=4}
329 | class(m1)
330 | class(x)
331 | ```
332 |
333 | --
334 |
335 | `CODE`
336 |
337 | ---
338 | # Assignments this week
339 |
340 | - Homework 3 will be due Friday 8 pm
341 | - Lab 3 will be due Sunday 8 pm
--------------------------------------------------------------------------------
/lectures/03-functions-methods-slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/03-functions-methods-slides.pdf
--------------------------------------------------------------------------------
/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-12-1.png
--------------------------------------------------------------------------------
/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-13-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-13-1.png
--------------------------------------------------------------------------------
/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-16-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-16-1.png
--------------------------------------------------------------------------------
/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-18-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-18-1.png
--------------------------------------------------------------------------------
/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-18-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-18-2.png
--------------------------------------------------------------------------------
/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-18-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-18-3.png
--------------------------------------------------------------------------------
/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-18-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-18-4.png
--------------------------------------------------------------------------------
/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-20-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-20-1.png
--------------------------------------------------------------------------------
/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-20-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-20-2.png
--------------------------------------------------------------------------------
/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-20-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-20-3.png
--------------------------------------------------------------------------------
/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-20-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/03-functions-methods-slides_files/figure-html/unnamed-chunk-20-4.png
--------------------------------------------------------------------------------
/lectures/04-control-flow-notes.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Week 4"
3 | subtitle: "control flow"
4 | output: html_document
5 | ---
6 |
7 | ```{r setup, include=FALSE}
8 | knitr::opts_chunk$set(echo = TRUE)
9 | ```
10 |
11 | # Iterators
12 |
13 | ```{python}
14 | for i in range(10):
15 | print(i, end = " ")
16 |
17 | range(10)
18 | type(range(10))
19 | list(range(10))
20 | ```
21 |
22 | ```{python}
23 | N = 10 ** 12
24 | for i in range(N):
25 | if i >= 10: break
26 | print(i, end = ", ")
27 | ```
28 |
29 |
30 |
31 | # What is an R data frame?
32 |
33 | ```{r}
34 | mtcars
35 | typeof(mtcars)
36 | ```
37 |
38 | ## It's a list!
39 |
40 | ```{r}
41 | l <- list("hello", pi)
42 | l
43 | length(l)
44 | l[[2]]
45 | l <- list("hello", c(pi, 3))
46 | l
47 | l[[2]][2]
48 | l <- list("hello", c(pi, 3), matrix(1:4, nrow = 2))
49 | typeof(l[[1]])
50 | typeof(l[[2]])
51 | typeof(l[[3]])
52 | l <- list("greeting" = "hello",
53 | "numbers" = c(pi, 3),
54 | "matrix" = matrix(1:4, nrow = 2))
55 | typeof(l)
56 | l[[1]]
57 | l$greeting
58 | names(l)
59 | l <- list("greeting" = "hello",
60 | "numbers" = c(pi, 3),
61 | "matrix" = matrix(1:4, nrow = 2),
62 | "list" = list("world", 2))
63 | ```
64 |
65 | Can I turn a list into a data frame?
66 |
67 | ```{r}
68 | as.data.frame(l)
69 | ```
70 |
71 |
72 | ## It's a data frame!
73 |
74 | ```{r}
75 | class(mtcars)
76 | ?print.data.frame()
77 | print(mtcars, row.names = FALSE)
78 | mtcars
79 | mtcars[1, 3]
80 | mtcars[1, "cyl"]
81 | mtcars["Volvo 142E", "cyl"]
82 | ```
83 |
84 |
85 | ## What is a tibble?
86 |
87 | ```{r}
88 | library(tidyverse)
89 | mtcars_tbl <- as_tibble(mtcars)
90 | mtcars
91 | mtcars2 <- data.frame(mtcars, names = row.names(mtcars))
92 | as_tibble(mtcars2)
93 | as_tibble(mtcars, rownames = "names")
94 | ```
95 |
96 |
97 | ## Review: so what's a ggplot?
98 |
99 | ```{r}
100 | p1 <- ggplot(mtcars, aes(x = hp, y = mpg)) +
101 | geom_point()
102 | print.default(p1)
103 | ```
104 |
105 | It's a list.
106 |
107 |
--------------------------------------------------------------------------------
/lectures/04-control-flow-slides.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Week 4"
3 | subtitle: "control flow"
4 | author: "STAT 198/298 Fall 2020"
5 | output:
6 | xaringan::moon_reader:
7 | css: ["metropolis", "cal.css"]
8 | lib_dir: libs
9 | nature:
10 | highlightStyle: atelier-forest-light
11 | highlightLines: true
12 | highlightSpans: true
13 | countIncrementalSlides: false
14 | ---
15 |
16 | ```{r include = FALSE}
17 | knitr::opts_chunk$set(message = FALSE, fig.align = "center")
18 | library(tidyverse)
19 | library(reticulate)
20 | ```
21 |
22 | # Send your phone here
23 |
24 | ```{r echo = FALSE, out.width="60%"}
25 | knitr::include_graphics("figs/slido-qr.png")
26 | ```
27 |
28 | Or send a browser to `slido.com`, event `#Z837`.
29 |
30 | ---
31 | # Agenda
32 |
33 | 1. Tips for assignments
34 | 2. Control Flow
35 | - if - else
36 | - for loops
37 | 3. What is a dataframe?
38 |
39 | ---
40 | # Writing Code, Writing English
41 |
42 | ```{r echo = FALSE, out.width="100%"}
43 | knitr::include_graphics("figs/lab-3-sc1.png")
44 | ```
45 |
46 |
47 | ---
48 | # Markdown vs Comments
49 |
50 | ```{r echo = FALSE, out.width="100%"}
51 | knitr::include_graphics("figs/lab-3-sc2.png")
52 | ```
53 |
54 |
55 | ---
56 | `CODE`
57 |
58 | ---
59 | # Assignment tips
60 |
61 | - Use `output: pdf_document`
62 | - Suppress unneeded output with chunk options
63 | - `message = FALSE`: suppress package loading messages
64 | - `warning = FALSE`: suppress warnings
65 | - `echo = FALSE`: suppress your code showing up
66 | - `eval = FALSE`: don't run the code, so no output shows up
67 | - Answer questions in full / thoughtful sentences
68 | - Scan your pdf before you submit to be sure it looks right
69 |
70 |
71 | ---
72 | # Control Flow in Python
73 |
74 |
75 | --
76 |
77 | ```{r echo = FALSE, out.width="35%"}
78 | # after showing google earth metaphor of bonneville dam
79 | knitr::include_graphics("figs/bonneville.jpg")
80 | ```
81 |
82 | Elements of a script that redirect the flow of running commands
83 |
84 | - If - then
85 | - For loops
86 | - While loops, etc.
87 |
88 |
89 | ---
90 | # if
91 |
92 | --
93 |
94 | ```{python eval = FALSE}
95 | if <condition>:
96 |     <expression>
97 | ```
98 |
99 | --
100 |
101 | - keyword: `if`
102 | - condition can be single or compound
103 | - use the `:` to start expression
104 | - expression can be multi-line but be sure to indent
105 |
106 | ---
107 | # if example
108 |
109 | ### Example 1
110 |
111 | What do you think will happen when this is run?
112 |
113 | ```{python eval = FALSE}
114 | x = 15
115 | if x == 0:
116 | print(x, "is zero.")
117 | ```
118 |
119 | --
120 |
121 | ```{python echo = FALSE}
122 | x = 15
123 | if x == 0:
124 | print(x, "is zero.")
125 | ```
126 |
127 | Nothing! (well, we now have a new object `x = 15`)
128 |
129 | --
130 |
131 | ### Example 2
132 |
133 | ```{python eval = FALSE}
134 | x = 0
135 | if x == 0:
136 | print(x, "is zero.")
137 | ```
138 |
139 | --
140 |
141 | ```{python echo = FALSE}
142 | x = 0
143 | if x == 0:
144 | print(x, "is zero.")
145 | ```
146 |
147 |
148 | ---
149 | # else
150 |
151 | ### Example 3
152 |
153 | ```{python eval = FALSE}
154 | x = 15
155 | if x == 0:
156 | print(x, "is zero.")
157 | else:
158 | print(x, "is non-zero.")
159 | ```
160 |
161 | --
162 |
163 | ```{python echo = FALSE}
164 | x = 15
165 | if x == 0:
166 | print(x, "is zero.")
167 | else:
168 | print(x, "is non-zero.")
169 | ```
170 |
171 | ---
172 | # elif
173 |
174 | ### Example 4
175 |
176 | ```{python eval = FALSE}
177 | x = 15
178 | if x == 0:
179 | print(x, "is zero.")
180 | elif x > 0:
181 | print(x, "is positive.")
182 | elif x < 0:
183 | print(x, "is negative")
184 | else:
185 | print(x, "confuses me.")
186 | ```
187 |
188 | --
189 |
190 | ```{python echo = FALSE}
191 | x = 15  # hidden chunk (echo = FALSE): runs the example so only its output appears on the slide
192 | if x == 0:
193 |     print(x, "is zero.")
194 | elif x > 0:
195 |     print(x, "is positive.")
196 | elif x < 0:
197 |     print(x, "is negative")
198 | else:
199 |     print(x, "confuses me.")
200 | ```
201 |
202 | ---
203 | # for loop
204 |
205 | --
206 |
207 | ```{python, eval = FALSE}
208 | for <variable> in <iterator>:
209 |     <expression>
210 | ```
211 |
212 | - keyword: `for` and `in`
213 | - variable can be any legal name that you'll refer to in the loop
214 | - iterator is a generalized sequence
215 | - start loop with `:`
216 | - expression can be multi-line and must be indented
217 |
218 |
219 | ---
220 | # for loop example
221 |
222 | ### Example 1
223 |
224 | What do you think will happen when this is run?
225 |
226 | ```{python eval = FALSE}
227 | for i in range(10):
228 |     print(i, end = " ")
229 | ```
230 |
231 | --
232 |
233 | ```{python echo = FALSE}
234 | for i in range(10):  # hidden chunk (echo = FALSE): runs the example so only its output appears on the slide
235 |     print(i, end = " ")
236 | ```
237 |
238 |
239 | ---
240 | # for loop example
241 |
242 | ### Example 2
243 |
244 | ```{python eval = FALSE}
245 | l = [1, 3, 99]
246 | for i in range(3):
247 |     print(l[i], end = " ")
248 | ```
249 |
250 | --
251 |
252 | ```{python echo = FALSE}
253 | l = [1, 3, 99]  # hidden chunk (echo = FALSE): runs the example so only its output appears on the slide
254 | for i in range(3):
255 |     print(l[i], end = " ")
256 | ```
257 |
258 | --
259 |
260 | ### Example 3
261 |
262 | ```{python eval = FALSE}
263 | for i in [1, 3, 99]:
264 |     print(i, end = " ")
265 | ```
266 |
267 | --
268 |
269 | ```{python echo = FALSE}
270 | for i in [1, 3, 99]:  # hidden chunk (echo = FALSE): runs the example so only its output appears on the slide
271 |     print(i, end = " ")
272 | ```
273 |
274 | --
275 |
276 | - Loop on value or index - which is better?
277 |
278 | ---
279 | `CODE`
280 |
281 | ---
282 | # Iterators
283 |
284 | Python has a class of objects called *iterators* that behave like a list in terms of iteration but never actually create the full list.
285 |
286 | ---
287 | # POLL: What is a dataframe?
288 |
289 | ---
290 | # What is a dataframe?
291 |
292 | `CODE`
293 |
294 |
295 | ---
296 | # What is a dataframe?
297 |
298 | ... it's a list!
299 |
300 | A compound data structure that
301 |
302 | - is *type heterogeneous*
303 | - can contain elements of different sizes
304 | - is subsetted using `[[]]`
305 | - can also use key:value pairings (named elements)
306 | - can be indexed with `$`
307 | - can be nested
308 |
309 | --
310 |
311 | Holy smokes, lists are VERY flexible.
312 |
313 | *What's the difference between an R list and a Python dictionary?*
314 |
315 | ---
316 | # R list vs Python Dictionary
317 |
318 | Unlike the R list, the Python dictionary:
319 |
320 | - can't be indexed by position (it preserves insertion order as of Python 3.7, but has no positional index)
321 | - every element must be named
322 |
323 | R lists combine elements of the Python list (index by position) and the Python dictionary (index by key)
324 |
325 | ---
326 | `CODE`
327 |
328 | ---
329 | # What is a dataframe?
330 |
331 | ... it's a dataframe!
332 |
333 | - a list with elements (vectors/lists) of equal length
334 | - has specific methods
335 | - can add `row.names`
336 | - can be subsetted like a matrix `[row, column]` with indexing done by position or name
337 |
338 | --
339 |
340 | ... so what's a tibble?
341 |
342 | ---
343 | `CODE`
344 |
345 | ---
346 | # What is a tibble?
347 |
348 | ... it's an "opinionated dataframe" for the `tidyverse`.
349 |
350 | - a dataframe with a refined print method
351 | - shows `dim()` and `typeof()`
352 | - limits number of rows printed
353 | - doesn't change characters to factors
354 | - can be created by coercion or by passing through `dplyr`
355 |
356 | ---
357 | # So what's a ggplot?
358 |
359 | ```{r}
360 | p1 <- ggplot(mtcars, aes(x = hp, y = mpg)) +
361 | geom_point()
362 | ```
363 |
364 | `CODE`
365 |
366 |
--------------------------------------------------------------------------------
/lectures/04-control-flow-slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/04-control-flow-slides.pdf
--------------------------------------------------------------------------------
/lectures/05-numpy-arrays-code.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Week 5"
3 | subtitle: "numpy arrays"
4 | author: "STAT 198/298 Fall 2020"
5 | output:
6 |   xaringan::moon_reader:  # the options below must be nested under the output format to take effect
7 |     css: ["metropolis", "cal.css"]
8 |     lib_dir: libs
9 |     nature:
10 |       highlightStyle: atelier-forest-light
11 |       highlightLines: true
12 |       highlightSpans: true
13 |       countIncrementalSlides: false
14 | ---
15 |
16 | ```{r include = FALSE}
17 | knitr::opts_chunk$set(message = FALSE, fig.align = "center")
18 | library(tidyverse)
19 | library(reticulate)
20 | ```
21 |
22 | ```{python echo = FALSE}
23 | import numpy as np  # required: every `np.` call below raises NameError without it
24 | a = np.array([1, 5, 3])
25 | # a new object!
26 | type(a)
27 |
28 | # with its own attributes
29 | a.shape
30 | a.size
31 |
32 | # some methods are very different
33 | l1 = [1, 5, 3]
34 | l2 = [4, 5, 6]
35 | print(l1 + l2)  # lists: + concatenates
36 |
37 | a1 = np.array(l1)
38 | a2 = np.array(l2)
39 | print(a1 + a2)  # arrays: + adds element-wise
40 |
41 | # others are the same
42 | l1.sort()
43 | print(l1)
44 |
45 | a1.sort()
46 | print(a1)
47 | ```
48 |
49 |
50 |
--------------------------------------------------------------------------------
/lectures/05-numpy-arrays-slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/05-numpy-arrays-slides.pdf
--------------------------------------------------------------------------------
/lectures/06-pandas-dataframes-code.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Week 6"
3 | subtitle: "pandas dataframes"
4 | author: "STAT 198/298 Fall 2020"
5 | output: html_document  # rmarkdown has no `html_output` format
6 | ---
7 |
8 | ```{r}
9 | library(palmerpenguins)
10 | library(tidyverse)  # attaches ggplot2 plus dplyr (slice) and tibble (rownames_to_column), both used in this file
11 | penguins <- as.data.frame(unclass(penguins))
12 | small_penguins <- slice(penguins, 1:5)
13 | ```
14 |
15 | ## Colnames vs names
16 |
17 | ```{r}
18 | ?colnames
19 | ?names
20 |
21 | vec <- 1:4
22 | names(vec) <- c("one", "two", "three", "four")
23 | vec["two"]
24 | ```
25 |
26 |
27 | ## Advice on rownames
28 |
29 | ```{r}
30 | data(penguins)
31 | small_penguins <- slice(penguins, 1:5)
32 | rownames(small_penguins) <- c("janet", "phyllis", "jose", "benny", "marty")
33 | small_penguins <- rownames_to_column(small_penguins, var = "given_name")
34 | small_penguins[small_penguins$given_name == "phyllis", "island"]
35 | ```
36 |
37 |
38 |
39 | ## Pandas dataframes
40 |
41 | ```{python}
42 | import pandas as pd  # required: `pd` is not imported anywhere else in this file
43 | d_pop = {"California": 38332521, "Texas": 26448193,
44 |          "New York": 19651127, "Florida": 19552860,
45 |          "Illinois": 12882135}
46 | # both Series are keyed by state name; the DataFrame below lines them up on that index
47 | s_pop = pd.Series(d_pop)
48 | s_area = pd.Series({"California": 423967,
49 |                     "Florida": 170312,
50 |                     "Illinois": 149995,
51 |                     "New York": 141297,
52 |                     "Texas": 695662})
53 | states = pd.DataFrame({"population": s_pop, "area": s_area})
54 | states
55 | ```
56 |
57 | ## Subsetting
58 |
59 | ```{python}
60 | states["area"]     # "area" is a column label, so this returns that column
61 | states["Florida"]  # "Florida" is a row label, not a column: expect a KeyError (presumably the point of the demo)
62 | states[1]          # also a column-label lookup, not positional: expect a KeyError
63 | ```
64 |
65 | ## Use implicit indices
66 |
67 | ```{python}
68 | states.values
69 | states.index
70 | states.columns
71 | states.iloc
72 | ```
73 |
74 | ```{python}
75 | states.iloc[1, 0]
76 | states.iloc[1, 0:2]
77 | ```
78 |
79 | ## Use explicit indices
80 |
81 | ```{python}
82 | states.loc["Florida", "area"]
83 | states.loc[:"Florida", "area"]
84 | ```
85 |
86 |
87 | ## Subset via boolean masks
88 |
89 | ```{python}
90 | states.loc[:, "area"]
91 | mask = states.loc[:, "area"] < 400000
92 | states.loc[mask, :]
93 | ```
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
--------------------------------------------------------------------------------
/lectures/06-pandas-dataframes-slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/06-pandas-dataframes-slides.pdf
--------------------------------------------------------------------------------
/lectures/06-pandas-dataframes-slides_files/figure-html/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/06-pandas-dataframes-slides_files/figure-html/unnamed-chunk-2-1.png
--------------------------------------------------------------------------------
/lectures/06-pandas-dataframes_files/figure-html/unnamed-chunk-1-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/06-pandas-dataframes_files/figure-html/unnamed-chunk-1-1.png
--------------------------------------------------------------------------------
/lectures/06-pandas-dataframes_files/figure-html/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/06-pandas-dataframes_files/figure-html/unnamed-chunk-2-1.png
--------------------------------------------------------------------------------
/lectures/07-pandas-2-notebook.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Pandas II"
3 | subtitle: "group by"
4 | author: "Andrew Bray"
5 | output: html_document
6 | editor_options:
7 |   chunk_output_type: inline  # must be nested under editor_options to be picked up
8 | ---
9 |
10 | # Agenda
11 |
12 | 1. Review
13 | 2. Stuff
14 | 3. Group by
15 |
16 | # US Elections Data
17 |
18 | This is county-level data from the presidential elections between 2000 and 2016, provided by the MIT Elections Lab.
19 |
20 | ```{r}
21 | # remotes::install_github("andrewpbray/boxofdata")
22 | library(boxofdata)
23 | library(tidyverse)
24 | data(uselections)
25 | dim(uselections)
26 | names(uselections)
27 | str(uselections)
28 | ```
29 |
30 | ### Bringing the data into Python
31 |
32 | We bring the data into Python and do our Python equivalent of greeting our data.
33 |
34 | ```{python}
35 | import pandas as pd
36 | uselections = r.uselections
37 | uselections.shape
38 | uselections.columns
39 | uselections.dtypes
40 | ```
41 |
42 | ### Select columns
43 |
44 | **Method 1:** Pass a *string* into `[]`...
45 |
46 | ```{python}
47 | uselections["county"]
48 | ```
49 |
50 | ... get out a series.
51 |
52 | **Method 2:** Pass a *list* into `[]`...
53 |
54 | ```{python}
55 | uselections[["county"]]
56 | ```
57 |
58 | ... get out a data frame.
59 |
60 | ### Slicing rows
61 |
62 | Pass a *slice* into `[]`...
63 |
64 | ```{python}
65 | uselections[0:5]
66 | ```
67 |
68 | ... get out a slice of a data frame. Sound familiar?
69 |
70 | ```{r}
71 | slice(uselections, 1:2)
72 | ```
73 |
74 | ### Selecting and slicing
75 |
76 | **Method 1:** (preferred) access labels with `.loc`.
77 |
78 | ```{python}
79 | uselections.loc[0:5, ["county"]]
80 | ```
81 |
82 | **Method 2:** access integer indices with `.iloc`.
83 |
84 | ```{python}
85 | uselections.columns
86 | uselections.iloc[0:5, 3]
87 | ```
88 |
89 | ### Filtering rows
90 |
91 | You can apply a Boolean series as a mask.
92 |
93 | ```{python boolrows, eval = FALSE}
94 | mask = uselections["year"] == 2016
95 | uselections[mask]
96 | ```
97 |
98 | ```{python ref.label = "boolrows", echo = FALSE}
99 | ```
100 |
101 | ## Filtering rows and selecting columns
102 |
103 | Boolean mask plus a list of columns.
104 |
105 | ```{python boolandcolsa, eval = FALSE}
106 | mask = uselections["year"].isin([2012, 2016])
107 | uselections[mask, ["county", "state"]]
108 | ```
109 |
110 | Will this run?
111 |
112 | > Need to use `.loc`
113 |
114 | ```{python boolandcolsb, eval = FALSE}
115 | mask = uselections["year"].isin([2012, 2016])
116 | uselections.loc[mask, ["county", "state"]] #<<
117 | ```
118 |
119 | ```{python ref.label = "boolandcolsb", echo = FALSE}
120 | ```
121 |
122 | ### Let's shine that up.
123 |
124 | 1. Form data frame.
125 | 2. Apply `.agg()` method.
126 | 3. Pass the string method `.join` as the aggregation function.
127 |
128 | ```{python shincol, eval = TRUE}
129 |
130 | ```
131 |
132 | # Pandas Inventory
133 |
134 | Now we know how to:
135 |
136 | 1. Select columns
137 |
138 | 2. Slice rows
139 |
140 | 3. Do both simultaneously
141 |
142 | 4. Filter rows using boolean masks
143 |
144 | 5. Add columns
145 |
146 | # Practice: Question 1
147 |
148 | # Handy utility functions
149 |
150 | **`sort_values()`**
151 |
152 | ```{python eval = FALSE}
153 | uselections.sort_values("state")
154 | ```
155 |
156 | **`value_counts()`**
157 |
158 | ```{python, eval = FALSE}
159 | uselections["year"].value_counts()
160 | ```
161 |
162 | **`unique()`**
163 |
164 | ```{python}
165 |
166 | ```
167 |
168 | **`sample()`**
169 |
170 | ```{python}
171 |
172 | ```
173 |
--------------------------------------------------------------------------------
/lectures/07-pandas-2-practice.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Practice"
3 | output: html_document
4 | ---
5 |
6 | ```{r r-setup}
7 | # remotes::install_github("andrewpbray/boxofdata")
8 | library(boxofdata)
9 | library(reticulate)
10 | data(uselections)
11 | ```
12 |
13 | ```{python py-setup}
14 | import pandas as pd
15 | uselections = r.uselections
16 | ```
17 |
18 | ### Question 1
19 |
20 | Extract the first three rows where the candidate got more than 90% of the vote.
21 |
22 | ```{python}
23 |
24 | ```
25 |
26 |
27 | ### Question 2
28 |
29 | Which candidates were on the ballot in California in 2016?
30 |
31 | ```{python}
32 |
33 | ```
34 |
35 |
36 | ### Question 3
37 |
38 | Which were the top 5 counties in California in 2016 in the proportion of the vote won by Hillary Clinton?
39 |
40 | ```{python}
41 |
42 | ```
43 |
44 |
45 | ### Question 4
46 |
47 | For each county in California in 2016, calculate the proportion of votes for each major party candidate.
48 |
49 | ```{python}
50 |
51 | ```
52 |
53 |
--------------------------------------------------------------------------------
/lectures/07-pandas-2-slides.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Pandas II"
3 | subtitle: "group by"
4 | author: "Andrew Bray"
5 | output:
6 |   xaringan::moon_reader:  # the options below must be nested under the output format to take effect
7 |     css: xaringan-themer.css
8 |     lib_dir: libs
9 |     nature:
10 |       highlightStyle: atelier-forest-light
11 |       highlightLines: true
12 |       highlightSpans: true
13 |       countIncrementalSlides: false
14 | ---
15 |
16 | ```{r setup, include=FALSE}
17 | library(knitr)
18 | knitr::opts_chunk$set(warning = FALSE,
19 | message = FALSE)
20 | source("https://raw.githubusercontent.com/andrewpbray/stitched/master/demos/slides_css.R")
21 | ```
22 |
23 | # Agenda
24 |
25 | 1. Review
26 | 2. Stuff
27 | 3. Group by
28 |
29 | ---
30 |
31 | # US Elections Data
32 | --
33 |
34 | - From MIT Elections Lab
35 | - County-level Data
36 | - Presidential elections 2000 - 2016
37 |
38 | ```{r r-setup}
39 | # remotes::install_github("andrewpbray/boxofdata")
40 | library(boxofdata)
41 | library(tidyverse)
42 | data(uselections)
43 | dim(uselections)
44 | names(uselections)
45 | ```
46 |
47 | ---
48 |
49 | # US Elections Data, cont.
50 | --
51 |
52 | ```{r, echo = FALSE}
53 | select(uselections, -state_po, -FIPS, -office, -version) %>%
54 | head(8) %>%
55 | kable()
56 | ```
57 |
58 |
59 | ---
60 |
61 | # Into Python
62 | --
63 |
64 | ```{python}
65 | import pandas as pd
66 | uselections = r.uselections
67 | uselections.shape
68 | uselections.columns
69 | uselections.dtypes
70 | ```
71 |
72 |
73 | ---
74 |
75 | # Select columns
76 |
77 | **Method 1:** Pass a *string* into `[]`...
78 |
79 | --
80 |
81 | ```{python select1, eval = FALSE}
82 | uselections["county"]
83 | ```
84 |
85 | --
86 |
87 | ```{python ref.label = "select1", echo = FALSE}
88 | ```
89 |
90 | --
91 |
92 | ... get out a series.
93 |
94 |
95 | ---
96 |
97 | # Select columns
98 |
99 | **Method 2:** Pass a *list* into `[]`...
100 |
101 | --
102 |
103 | ```{python select2, eval = FALSE}
104 | uselections[["county"]]
105 | ```
106 |
107 | --
108 |
109 | ```{python ref.label = "select2", echo = FALSE}
110 | ```
111 |
112 | --
113 |
114 | ... get out a data frame.
115 |
116 |
117 | ---
118 |
119 | # Slicing rows
120 | --
121 |
122 | **Method 1:** Pass a *slice* into `[]`...
123 |
124 | --
125 |
126 | ```{python slice1, eval = FALSE}
127 | uselections[0:5]
128 | ```
129 |
130 | --
131 |
132 | ```{python ref.label = "slice1", echo = FALSE}
133 | ```
134 |
135 | --
136 |
137 | ... get out a slice of a data frame. Sound familiar?
138 |
139 | --
140 |
141 | ```{r}
142 | slice(uselections, 1:2)
143 | ```
144 |
145 |
146 | ---
147 |
148 | # Selecting and slicing
149 | --
150 |
151 | **Method 1:** (preferred) access labels with `.loc`.
152 |
153 | ```{python colandrow, eval = FALSE}
154 | uselections.loc[0:5, ["county"]]
155 | ```
156 |
157 | --
158 |
159 | ```{python ref.label = "colandrow", echo = FALSE}
160 | ```
161 |
162 |
163 | ---
164 |
165 | # Selecting and slicing
166 | --
167 |
168 | **Method 2:** access integer indices with `.iloc`.
169 |
170 | ```{python checkcols, eval = FALSE}
171 | uselections.columns
172 | ```
173 |
174 | --
175 |
176 | ```{python ref.label = "checkcols", echo = FALSE}
177 | ```
178 |
179 | --
180 |
181 | ```{python colandrow2, eval = FALSE}
182 | uselections.iloc[0:5, 3]
183 | ```
184 |
185 | --
186 |
187 | ```{python ref.label = "colandrow2", echo = FALSE}
188 | ```
189 |
190 |
191 | ---
192 |
193 | # Filtering rows
194 | --
195 |
196 | You can apply a Boolean series as a mask.
197 |
198 | ```{python boolrows, eval = FALSE}
199 | mask = uselections["year"] == 2016
200 | uselections[mask]
201 | ```
202 |
203 | --
204 |
205 | ```{python ref.label = "boolrows", echo = FALSE}
206 | ```
207 |
208 |
209 | ---
210 |
211 | ## Filtering rows and selecting columns
212 |
213 | Boolean mask plus a list of columns.
214 |
215 | ```{python boolandcolsa, eval = FALSE}
216 | mask = uselections["year"].isin([2012, 2016])
217 | uselections[mask, ["county", "state"]]
218 | ```
219 |
220 | Will this run?
221 |
222 | --
223 |
224 | > Need to use `.loc`
225 |
226 | ```{python boolandcolsb, eval = FALSE}
227 | mask = uselections["year"].isin([2012, 2016])
228 | uselections.loc[mask, ["county", "state"]] #<<
229 | ```
230 |
231 | --
232 |
233 | ```{python ref.label = "boolandcolsb", echo = FALSE}
234 | ```
235 |
236 |
237 | ---
238 |
239 | ### Let's shine that up.
240 |
241 | 1. Form data frame.
242 | 2. Apply `.agg()` method.
243 | 3. Pass the string method `.join` as the aggregation function.
244 |
245 | --
246 |
247 | ```{python shincol, eval = TRUE}
248 |
249 | ```
250 |
251 | ---
252 |
253 | # Pandas Inventory
254 |
255 | Now we know how to:
256 |
257 | 1. Select columns
258 | --
259 |
260 | 2. Slice rows
261 | --
262 |
263 | 3. Do both simultaneously
264 | --
265 |
266 | 4. Filter rows using boolean masks
267 | --
268 |
269 | 5. Add columns
270 |
271 |
272 | ---
273 |
274 | # Practice: Question 1
275 |
276 | ---
277 |
278 | # Practice: Question 1
279 |
280 | Extract the first three rows where the candidate got more than 90% of the vote.
281 |
282 | ```{python q1}
283 | uselections["prop"] = uselections["candidatevotes"]/uselections["totalvotes"]
284 | ```
285 |
286 | ---
287 |
288 | # Handy utility functions
289 |
290 | **`sort_values()`**
291 |
292 | ```{python eval = FALSE}
293 | uselections.sort_values("state")
294 | ```
295 |
296 | **`value_counts()`**
297 |
298 | ```{python, eval = FALSE}
299 | uselections["year"].value_counts()
300 | ```
301 |
302 | **`unique()`**
303 |
304 | ```{python}
305 |
306 | ```
307 |
308 | **`sample()`**
309 |
310 | ```{python}
311 |
312 | ```
313 |
314 |
315 | ---
316 |
317 | # Practice: Question 2
318 |
319 |
320 | ---
321 |
322 | # Practice: Question 2
323 |
324 | Which candidates were on the ballot in California in 2016?
325 |
326 | ```{python q2}
327 | uselections["candidate"].unique()
328 | mask = (uselections["year"] == 2016) & (uselections["state_po"] == "CA")
329 | uselections[mask]["candidate"].unique()
330 | ```
331 |
332 |
333 | ---
334 |
335 | # Practice: Question 3
336 |
337 | Which were the top 5 counties in California in 2016 in the proportion of the vote won by Hillary Clinton?
338 |
339 | ```{python}
340 | mask = (uselections["year"] == 2016) & (uselections["state_po"] == "CA") & (uselections["candidate"] == "Hillary Clinton")
341 | df = uselections[mask]
342 | df.sort_values("prop", ascending = False)["county"].head(5)
343 | ```
344 |
345 |
346 | ---
347 |
348 | # Groupby
349 | --
350 |
351 | For separate operations on subsets of the data frame, use *grouped* operations.
352 |
353 | ```{python gb, eval = FALSE}
354 | uselections.groupby("year")
355 | ```
356 |
357 | --
358 |
359 | ```{python ref.label = "gb", echo = FALSE}
360 | ```
361 |
362 | --
363 |
364 | ```{python gbtot, eval = FALSE}
365 | uselections.groupby("year").agg(sum)
366 | ```
367 |
368 | --
369 |
370 | ```{python ref.label = "gbtot", echo = FALSE}
371 | ```
372 |
373 |
374 | ---
375 |
376 | # Practice: Question 4
377 |
378 | For each county in California in 2016, calculate the proportion of votes for each major party candidate.
379 |
380 | ```{python}
381 |
382 | ```
383 |
384 |
385 |
386 |
387 |
--------------------------------------------------------------------------------
/lectures/08-pandas-3-notebook.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Pandas III"
3 | subtitle: ""
4 | author: "Andrew Bray"
5 | output: html_document
6 | ---
7 |
8 | # Agenda
9 |
10 | 1. Comments on Assignments
11 | 2. Reading in Data
12 | 3. String methods
13 | 4. Joins
14 |
15 | ------------------------------------------------------------------------
16 |
17 | ## Comments on Assignments
18 |
19 |
20 |
21 | 1. Be sure your code is visible in your pdf (no overflowing lines).
22 | 2. If you get at least 2/3 of the points, you get credit on the assignment.
23 | 3. Please tag the pages to the questions, including multiple pages for a single question.
24 | 4. Don't forget to read thoroughly to find the question.
25 |
26 | Example of a question requiring code and a written answer:
27 |
28 | > 5. Repeat the exercise but print out the first four values of both `dest` and `origin` but pass them as a list into the square brackets. Try also passing only `dest` as a list. In general, what happens when you subset the columns with a string versus with a list?
29 |
30 | ```{python}
31 | flights[["dest", "origin"]].head(4)
32 | flights[["dest"]].head(4)
33 | # subsetting with a string will yield a series; with a list yields a data frame
34 | ```
35 |
36 | ------------------------------------------------------------------------
37 |
38 | ## Reading / Writing Data
39 |
40 | Let's return to the example of county level data from presidential elections between 2000 and 2016.
41 |
42 | ```{python}
43 | import pandas as pd
44 | url = "https://raw.githubusercontent.com/andrewpbray/python-for-r-users/master/data/uselections.csv"
45 | uselections = pd.read_csv(url)
46 | uselections.head()  # quick look at the first rows (View() is an R/RStudio function, not a Python one)
47 | uselections.info()  # column names, dtypes and non-null counts (`?name` is not valid Python syntax)
48 | ```
49 |
50 | Useful arguments to keep an eye out for:
51 |
52 | 1. `sep`: what is each data field separated by (can also use `delimiter`)
53 | 2. `header`: which row number has the column names (or `None`)
54 | 3. `names`: list of column names to use
55 | 4. `dtype`: dictionary of colname: type pairs for each column.
56 | 5. `index_col`: column to use as row labels when indexing.
57 |
58 | To *write* a data frame to a .csv file, use `df.to_csv()` .
59 |
60 | ## String Methods
61 |
62 | Whenever you find yourself wanting to do operations on columns that contain strings, consider the methods available to you.
63 |
64 | ```{python}
65 | uselections["county"].str
66 | dir(uselections["county"].str)
67 | uselections["county"].str.swapcase()
68 | ```
69 |
70 | When you append `.str` to a Series, the methods available will operate element-wise down the columns.
71 |
72 | #### Question 1
73 |
74 | How many counties are called "District"?
75 |
76 | ```{python}
77 | sum(uselections["county"].str.contains("District"))
78 | ```
79 |
80 | #### Question 2
81 |
82 | Return a new data frame that removes all rows containing data from a "District".
83 |
84 | ```{python}
85 | uselections[~uselections["county"].str.contains("District", na = False)]  # ~ inverts the mask so District rows are dropped, not kept
86 | ```
87 |
88 | #### Question 3
89 |
90 | Which counties have the greatest number of words in them? (recall that you can apply a normal function over an iterable using `.apply`)
91 |
92 | ```{python}
93 | split_counties = uselections["county"].str.split()
94 | n_words = split_counties.apply(len)
95 | new_df = pd.DataFrame({"county": uselections["county"], "n_words": n_words})
96 | new_df.sort_values("n_words", ascending = False).head(25)
97 | ```
98 |
99 | ## Joins
100 |
101 | Joins come into play when we want to utilize variables that are stored across multiple rectangular data files. Consider the following two mock data frames that each contain a different variable, `x` or `y`.
102 |
103 | ```{python}
104 | df_1 = pd.DataFrame({"id": [1, 2, 3], "x":["x1", "x2", "x3"]})
105 | df_2 = pd.DataFrame({"id": [1, 2, 4], "y":["y1", "y2", "y4"]})  # shares the `id` key with df_1 so .merge() can infer the join column
106 | df_1
107 | df_2
108 | ```
109 |
110 | We can collect the observations that have been made on both the `x` and `y` variable using `.merge()`.
111 |
112 | ```{python}
113 | df_1.merge(df_2)
114 | ```
115 |
116 |
117 |
118 | ```{python}
119 | help(pd.DataFrame.merge)  # plain-Python help lookup; `?name` is not Python syntax and the module is imported here as `pd`
120 | ```
121 |
122 | This is an example of an **inner join**. It's one of the four most common database-style data joins. The remaining three are examples of *outer joins*, and they generally result in more rows in the resulting data frame, but often induce missing values.
123 |
124 | 1. **Left Join**: Keeps everything in the "left" data frame and adds on columns containing any additional data in the "right" data frame.
125 |
126 | ```{python}
127 | df_1.merge(df_2, how = "left")
128 | ```
129 |
130 | 2. **Right Join**: Keeps everything in the "right" data frame and adds on columns containing any additional data in the "left" data frame.
131 |
132 | ```{python}
133 | df_1.merge(df_2, how = "right")
134 | ```
135 |
136 | Note that this is pretty much the same result as a left join with "left" and "right" swapped.
137 |
138 | ```{python}
139 | df_2.merge(df_1, how = "left")
140 | ```
141 |
142 | 3. **Outer Join**: Also known as a full join. Keeps any row found in at least one of the two data sets and fills in whatever columns of data are available.
143 |
144 | ```{python}
145 | df_1.merge(df_2, how = "outer")
146 | ```
147 |
148 |
149 |
150 | #### Question 4
151 |
152 | Bring in 2015 county-level demographic data from the `choroplethr` package.
153 |
154 | ```{r}
155 | # install.packages("choroplethr")
156 | library(choroplethr)
157 | data(df_county_demographics)
158 | ```
159 |
160 | Using this data, create a scatterplot of each county's percent vote in favor of Hillary Clinton as a function of its per capita income. Construct the scatterplot with ggplot2 but do the data wrangling in Python.
161 |
--------------------------------------------------------------------------------
/lectures/08-pandas-3-practice.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Practice"
3 | output: html_document
4 | ---
5 |
6 | ```{python py-setup}
7 | import pandas as pd
8 | url = "https://raw.githubusercontent.com/andrewpbray/python-for-r-users/master/data/uselections.csv"
9 | uselections = pd.read_csv(url)
10 | ```
11 |
12 | ### Question 1
13 |
14 | How many counties are called "District"?
15 |
16 |
17 | ### Question 2
18 |
19 | Return a new data frame that removes all rows containing data from a "District".
20 |
21 |
22 | ### Question 3
23 |
24 | Which counties have the greatest number of words in them? (recall that you can apply a normal function over an iterable using `.apply`)
25 |
26 |
27 | ### Question 4
28 |
29 | Bring in 2015 county-level demographic data from the `choroplethr` package.
30 |
31 | ```{r}
32 | # install.packages("choroplethr")
33 | library(choroplethr)
34 | data(df_county_demographics)
35 | ```
36 |
37 | Using this data, create a scatterplot of each county's percent vote in favor of Hillary Clinton as a function of its per capita income. Construct the scatterplot with ggplot2 but do the data wrangling in Python.
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/lectures/09-reshape-matplotlib-notebook.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Reshaping & Matplotlib"
3 | subtitle: ""
4 | author: "Andrew Bray"
5 | output: html_document
6 | editor_options:
7 |   chunk_output_type: console  # must be nested under editor_options to be picked up
8 | ---
9 |
10 | ```{r}
11 | library(tidyverse)
12 | library(boxofdata)
13 | library(palmerpenguins)
14 | data(penguins)
15 | data(flights)
16 | ```
17 |
18 | # Agenda
19 |
20 | 1. Note on data wrangling
21 |
22 | 2. Note on groupby
23 |
24 | 3. Reshaping Data
25 |
26 | i. Tidy Data
27 | ii. Pivoting
28 | iii. Stacking
29 |
30 | 4. Plotting
31 |
32 | i. Pandas
33 | ii. Matplotlib
34 |
35 | ------------------------------------------------------------------------
36 |
37 | ## Note on Data Wrangling
38 |
39 | Data wrangling is a very rich area of computing with data and we've just skimmed the surface. As you encounter a new data wrangling task in Python, I recommend you take a moment to think about the data type you'll be manipulating, the general operation that you'll be using, then read up on a tutorial in that area. The recommended books for this class are good places to start. For example:
40 |
41 | Python Data Science Handbook (Jake VanderPlas):
42 |
43 | - [Working with Time Series](https://jakevdp.github.io/PythonDataScienceHandbook/03.11-working-with-time-series.html)
44 | - [Visualization with Matplotlib](https://jakevdp.github.io/PythonDataScienceHandbook/04.00-introduction-to-matplotlib.html)
45 |
46 | Python for Data Analysis (Wes McKinney):
47 |
48 | - [Time Series](https://nbviewer.jupyter.org/github/pydata/pydata-book/blob/2nd-edition/ch11.ipynb)
49 | - [Plotting and Visualization](https://nbviewer.jupyter.org/github/pydata/pydata-book/blob/2nd-edition/ch09.ipynb)
50 |
51 | ------------------------------------------------------------------------
52 |
53 | ## Note on `groupby`
54 |
55 | For Lab 8, question 1:
56 |
57 | > Use date-time methods to determine which day of the week is best to fly from the Bay Area to Seattle if you want to minimize delays. Does the answer differ between Oakland and San Francisco Airports?
58 |
59 | ```{python}
60 | flights = r.flights  # pull the R data frame across through reticulate's `r` object
61 | sea_flights = flights[flights["dest"] == "SEA"].copy()  # own copy, so the column added below is not written to a slice of `flights`
62 | dow = sea_flights["time_hour"].dt.day_name()
63 | sea_flights["day_of_week"] = dow
64 | sea_flights[["day_of_week","dep_delay"]].groupby("day_of_week").agg("mean").sort_values("dep_delay")
65 | sea2 = (sea_flights[["day_of_week","dep_delay", "origin"]]
66 |         .groupby(["origin", "day_of_week"])
67 |         .agg("mean")
68 |         .sort_values(["origin", "dep_delay"]))
69 |
70 | ```
71 |
72 | In Pandas, after a grouped-by aggregation the result is a data frame, where each row corresponds to a value of the Cartesian product of the grouping variables.
73 |
74 | ```{python}
75 | sea2["origin"]
76 | sea2["day_of_week"]
77 | sea2["dep_delay"]
78 | sea2.index
79 | sea2.index[0]
80 |
81 | ```
82 |
83 | Those grouping variable values are stored as indices (row names), whereas in R they're stored as ordinary variables.
84 |
85 | ```{r}
86 | library(lubridate)
87 | flights %>%
88 | filter(dest == "SEA") %>%
89 | mutate(day_of_week = wday(time_hour, label = TRUE)) %>%
90 | group_by(origin, day_of_week) %>%
91 | summarize(mean_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
92 | arrange(mean_dep_delay)
93 | ```
94 |
95 | ------------------------------------------------------------------------
96 |
97 | ## Reshaping Data
98 |
99 | ### Tidy Data (Wickham 2014)
100 |
101 | There are three interrelated rules which make a dataset tidy:
102 |
103 | 1. Each variable must have its own column.
104 |
105 | 2. Each observation must have its own row.
106 |
107 | 3. Each value must have its own cell.
108 |
109 |
110 |
111 | #### Question 1
112 |
113 | ```{python}
114 | A = r.table3
115 | B = r.table2
116 | C = r.table1
117 | D1 = r.table4a
118 | D2 = r.table4b
119 | ```
120 |
121 | ### Reshaping Data
122 |
123 | **Example**: Consider `D1`. How can this be tidied?
124 |
125 | ```{python}
126 | D1
127 | ```
128 |
129 | - Column names are *values* not *variables*.
130 | - We need to gather the columns into a new pair of variables.
131 | - We want to pivot "wide" to "long" with `.melt()`
132 |
133 | ```{python}
134 | ?D1.melt
135 | D1.melt(id_vars = "country",
136 | value_vars = ["1999", "2000"],
137 | var_name = "year",
138 | value_name = "cases")
139 | ```
140 |
141 | #### Question 2
142 |
143 | **Example**: Consider `B`. How can this be tidied?
144 |
145 | ```{python}
146 | B
147 | ```
148 |
149 | - Each *observation* is spread across multiple rows.
150 | - We need to spread the values of a variable across multiple columns.
151 | - We want to pivot "long" to "wide" with `.pivot()`.
152 |
153 | ```{python}
154 | ?B.pivot
155 | B.pivot(index = ["country", "year"],
156 | columns = "type",
157 | values = "count")
158 | ```
159 |
160 | ------------------------------------------------------------------------
161 |
162 | ## Plotting
163 |
164 | ### `Pandas`
165 |
166 | Pandas has some bare-bones plotting functionality if you want to go straight from your data frame to a quick plot. It actually calls `matplotlib` under the hood, so let's install and load that.
167 |
168 | ```{r}
169 | #reticulate::py_install("matplotlib")
170 | ```
171 |
172 | ```{python}
173 | import matplotlib.pyplot as plt
174 | sea_by_day = sea_flights[["day_of_week","dep_delay"]].groupby("day_of_week").agg("mean").sort_values("dep_delay")
175 | sea_by_day.plot.barh()
176 | ```
177 |
178 | These are generally just wrappers for functions within `matplotlib`, so probably better to call those directly.
179 |
180 | ### `matplotlib`
181 |
182 | ```{python}
183 | plt.bar(sea_by_day.index, sea_by_day["dep_delay"])
184 | plt.barh(sea_by_day.index, sea_by_day["dep_delay"])
185 | ```
186 |
187 | In your Rmd, you will need to run `plt.show()` whenever you want to see your plot, but in a Jupyter notebook, you can get them to render automatically.
188 |
189 | ### An "Active" Interface
190 |
191 | Consider the following code.
192 |
193 | ```{python}
194 | import numpy as np
195 | plt.figure() # create a plot figure
196 |
197 | # create the first of two panels
198 | plt.subplot(2, 1, 1) # (rows, columns, panel number)
199 | plt.plot(np.random.normal(scale = 1, size = 100))
200 |
201 | # create the second panel
202 | plt.subplot(2, 1, 2)
203 | plt.plot(np.random.normal(scale = 2, size = 100))
204 | ```
205 |
206 | In this interface, `matplotlib` is keeping track of which plot (or subplot) is the active one. Any plotting commands called will update that plot. This is similar to base R.
207 |
208 | ```{r}
209 | par(mfrow = c(2, 1))
210 | plot(rnorm(100, sd = 1))
211 | plot(rnorm(100, sd = 2))
212 | ```
213 |
214 | This interface is brittle and error-prone. Better to use...
215 |
216 | ### An Object-oriented Interface
217 |
218 | ```{python}
219 | # First create a grid of plots
220 | # ax will be an array of two Axes objects
221 | fig, ax = plt.subplots(2)
222 |
223 | # Call plot() method on the appropriate object
224 | ax[0].plot(np.random.normal(scale = 1, size = 100))
225 | ax[1].plot(np.random.normal(scale = 2, size = 100))
226 | ```
227 |
228 | Here, we're calling the `.plot()` method on a particular `ax` object (a subplot), so we can be explicit about what we're modifying. `plt.plot()`, by contrast, will modify whatever the active plot is.
229 |
230 | ### Simple line plot
231 |
232 | Let's build up a plot from scratch. Start by creating the figure as well as defining the axis.
233 |
234 | ```{python}
235 | fig = plt.figure()
236 | ax = plt.axes()
237 | ?plt.figure
238 | ?plt.axes
239 | ```
240 |
241 | To plot a function along the axis, we call the `.plot()` method on that ax.
242 |
243 | ```{python}
244 | x = np.linspace(0, 10, 1000)
245 | ax.plot(x, np.sin(x))
246 | # plt.plot(x, np.sin(x)) # "active" interface approach
247 | ```
248 |
249 | Let's add another line.
250 |
251 | ```{python}
252 | ax.plot(x, np.cos(x))
253 | ```
254 |
255 | #### Colors and Line Styles
256 |
257 | Aesthetic preferences can be passed as arguments to `.plot()`.
258 |
259 | ```{python}
260 | ax.plot(x, np.cos(x), color = "g")
261 | ```
262 |
263 | Can also use `color = "green"`, as well as hex codes like `color = "#FFDD44"`. To adjust the line style, specify the `linestyle` with a string.
264 |
265 | ```{python}
266 | ax.plot(x, np.cos(x), color = "g", linestyle = "dashed")
267 | ```
268 |
269 | You'll also sometimes see a very terse (and not recommended) combination:
270 |
271 | ```{python}
272 | ax.plot(x, np.cos(x), '-.g')
273 | ```
274 |
275 | #### Other settings
276 |
277 | #### Question 4
278 |
279 | #### Question 5
280 |
--------------------------------------------------------------------------------
/lectures/09-reshape-matplotlib-practice.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Reshaping & Matplotlib"
3 | subtitle: ""
4 | author: "Andrew Bray"
5 | output: html_document
6 | ---
7 |
8 | ```{r}
9 | library(tidyverse)
10 | ```
11 |
12 | #### Question 1
13 |
14 | Take a look at the following 4 representations of the same data set concerning occurrences of Tuberculosis in various countries.
15 |
16 | ```{python}
17 | A = r.table3
18 | B = r.table2
19 | C = r.table1
20 | D1 = r.table4a
21 | D2 = r.table4b
22 | ```
23 |
24 | For each of the representations, what are the *observations* or *cases*? These refer to specific, unique, and similar sorts of things. What are the *variables*? These should take the same sort of *value* for each case.
25 |
26 |
27 | #### Question 2
28 |
29 | Extend the approach to tidying `D1` that we demonstrated to form `C` from `D1` and `D2`.
30 |
31 | ```{python}
32 | ```
33 |
34 | #### Question 3
35 |
36 | ```{python}
37 | ```
38 |
39 |
40 | Modify the title, labels, and axis limits with
41 |
42 | - `ax.set_xlabel()`
43 | - `ax.set_ylabel()`
44 | - `ax.set_xlim()`
45 | - `ax.set_ylim()`
46 | - `ax.set_title()`
47 |
48 | #### Question 4
49 |
50 | ```{r}
51 | library(palmerpenguins)
52 | data(penguins)
53 | ```
54 |
55 | ```{python}
56 | penguins = r.penguins
57 | ```
58 |
59 | Create a scatter plot of bill length on bill depth by passing each as a series argument to `.plot()`.
60 |
61 |
62 |
--------------------------------------------------------------------------------
/lectures/10-matplotlib-seaborn-practice.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Matplotlib & Seaborn"
3 | subtitle: ""
4 | author: "Andrew Bray"
5 | output: html_document
6 | ---
7 |
8 | ```{r}
9 | library(tidyverse)
10 | library(palmerpenguins)
11 | data(penguins)
12 | ```
13 |
14 | ```{python}
15 | penguins = r.penguins
16 | ```
17 |
18 | #### Question 1
19 |
20 | Use `sns.relplot()` to create a scatter plot of bill length on bill depth with faceting based on species.
21 |
22 | #### Question 2
23 |
24 | Use `sns.relplot()` or `sns.lineplot()` to recreate plot 2 from lab 9.
25 |
26 | #### Question 3
27 |
28 | Use seaborn and the penguins data to create a new plot type.
29 |
--------------------------------------------------------------------------------
/lectures/13-classes-and-methods_files/figure-html/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/13-classes-and-methods_files/figure-html/unnamed-chunk-10-1.png
--------------------------------------------------------------------------------
/lectures/13-classes-and-methods_files/figure-html/unnamed-chunk-10-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/13-classes-and-methods_files/figure-html/unnamed-chunk-10-2.png
--------------------------------------------------------------------------------
/lectures/13-classes-and-methods_files/figure-html/unnamed-chunk-10-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/13-classes-and-methods_files/figure-html/unnamed-chunk-10-3.png
--------------------------------------------------------------------------------
/lectures/13-classes-and-methods_files/figure-html/unnamed-chunk-10-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/13-classes-and-methods_files/figure-html/unnamed-chunk-10-4.png
--------------------------------------------------------------------------------
/lectures/13-classes-and-methods_files/figure-html/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/13-classes-and-methods_files/figure-html/unnamed-chunk-8-1.png
--------------------------------------------------------------------------------
/lectures/13-classes-and-objects-practice.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Week 13"
3 | subtitle: "classes and objects"
4 | author: "STAT 198/298 Fall 2020"
5 | output: html_document
6 | ---
7 |
8 | ```{r include = FALSE}
9 | knitr::opts_chunk$set(message = FALSE, fig.align = "center")
10 | library(tidyverse)
11 | library(reticulate)
12 | ```
13 |
14 |
15 | Load in class definition.
16 |
17 | ```{python}
18 | class car:
19 | def __init__(self, make = "honda", model = "civic",
20 | year = 2007, mpg = 30):
21 | self.make = make
22 | self.model = model
23 | self.year = year
24 | self.mpg = mpg
25 |
26 | def greet(self, name):
27 | print("Hello, I am a " + self.make.capitalize() +
28 | " " + self.model.capitalize() + " and my name is " +
29 | name + ".")
30 |
31 | def age(self, years):
32 | self.mpg += -years / 4
33 |
34 | ```
35 |
36 | ### Question 1
37 |
38 | Create a new instance of `car()` that corresponds to your favorite car.
39 |
40 |
41 | ### Question 2
42 |
43 | Add an attribute that stores the amount of gas in the tank and add a method `drive()` that depletes the gas.
44 |
45 |
46 |
--------------------------------------------------------------------------------
/lectures/13-classes-and-objects-slides_files/figure-html/.unnamed-chunk-10-3.png-V1UM:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/13-classes-and-objects-slides_files/figure-html/.unnamed-chunk-10-3.png-V1UM
--------------------------------------------------------------------------------
/lectures/13-classes-and-objects-slides_files/figure-html/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/13-classes-and-objects-slides_files/figure-html/unnamed-chunk-10-1.png
--------------------------------------------------------------------------------
/lectures/13-classes-and-objects-slides_files/figure-html/unnamed-chunk-10-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/13-classes-and-objects-slides_files/figure-html/unnamed-chunk-10-2.png
--------------------------------------------------------------------------------
/lectures/13-classes-and-objects-slides_files/figure-html/unnamed-chunk-10-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/13-classes-and-objects-slides_files/figure-html/unnamed-chunk-10-3.png
--------------------------------------------------------------------------------
/lectures/13-classes-and-objects-slides_files/figure-html/unnamed-chunk-10-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/13-classes-and-objects-slides_files/figure-html/unnamed-chunk-10-4.png
--------------------------------------------------------------------------------
/lectures/13-classes-and-objects-slides_files/figure-html/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/13-classes-and-objects-slides_files/figure-html/unnamed-chunk-8-1.png
--------------------------------------------------------------------------------
/lectures/cal.css:
--------------------------------------------------------------------------------
1 | .large { /* bigger text */
2 |   font-size: 35px;
3 | }
4 |
5 | .small { /* smaller text */
6 |   font-size: 15px;
7 | }
8 |
9 | /* Default look of every slide body */
10 | .remark-slide-content {
11 |   padding: 1em 3em;
12 |   font-size: 30px;
13 |   font-weight: 300;
14 |   line-height: 1.2;
15 |   background-color: #FAFAFA;
16 |   border-bottom: 5px solid #041E42;
17 | }
18 |
19 | /* Code blocks are set smaller than inline code */
20 | .remark-code {
21 |   font-size: 20px;
22 | }
23 |
24 | .remark-inline-code {
25 |   font-size: 30px;
26 | }
--------------------------------------------------------------------------------
/lectures/figs/2-fold-CV.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/2-fold-CV.png
--------------------------------------------------------------------------------
/lectures/figs/5-fold-CV.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/5-fold-CV.png
--------------------------------------------------------------------------------
/lectures/figs/bonneville.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/bonneville.jpg
--------------------------------------------------------------------------------
/lectures/figs/class-hierarchies.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/class-hierarchies.png
--------------------------------------------------------------------------------
/lectures/figs/comparison-operators.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/comparison-operators.png
--------------------------------------------------------------------------------
/lectures/figs/data-structures.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/data-structures.png
--------------------------------------------------------------------------------
/lectures/figs/equality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/equality.png
--------------------------------------------------------------------------------
/lectures/figs/flight-delay-evidence.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/flight-delay-evidence.png
--------------------------------------------------------------------------------
/lectures/figs/hw12-ii.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/hw12-ii.png
--------------------------------------------------------------------------------
/lectures/figs/hw13-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/hw13-10.png
--------------------------------------------------------------------------------
/lectures/figs/hw3-turtle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/hw3-turtle.png
--------------------------------------------------------------------------------
/lectures/figs/identity.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/identity.png
--------------------------------------------------------------------------------
/lectures/figs/join-inner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/join-inner.png
--------------------------------------------------------------------------------
/lectures/figs/join-outer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/join-outer.png
--------------------------------------------------------------------------------
/lectures/figs/lab-3-sc1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/lab-3-sc1.png
--------------------------------------------------------------------------------
/lectures/figs/lab-3-sc2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/lab-3-sc2.png
--------------------------------------------------------------------------------
/lectures/figs/learning-languages.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/learning-languages.png
--------------------------------------------------------------------------------
/lectures/figs/list-indexing-bah.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/list-indexing-bah.gif
--------------------------------------------------------------------------------
/lectures/figs/list-indexing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/list-indexing.png
--------------------------------------------------------------------------------
/lectures/figs/math-operators.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/math-operators.png
--------------------------------------------------------------------------------
/lectures/figs/mic-drop.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/mic-drop.gif
--------------------------------------------------------------------------------
/lectures/figs/mutatr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/mutatr.png
--------------------------------------------------------------------------------
/lectures/figs/numpy-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/numpy-logo.png
--------------------------------------------------------------------------------
/lectures/figs/oop-cars.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/oop-cars.png
--------------------------------------------------------------------------------
/lectures/figs/or-plot-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/or-plot-1.png
--------------------------------------------------------------------------------
/lectures/figs/or-plot-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/or-plot-2.png
--------------------------------------------------------------------------------
/lectures/figs/pandas-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/pandas-logo.png
--------------------------------------------------------------------------------
/lectures/figs/pascals-triangle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/pascals-triangle.png
--------------------------------------------------------------------------------
/lectures/figs/plotly-gallery.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/plotly-gallery.png
--------------------------------------------------------------------------------
/lectures/figs/polyforce.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/polyforce.png
--------------------------------------------------------------------------------
/lectures/figs/r-python-diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/r-python-diagram.png
--------------------------------------------------------------------------------
/lectures/figs/seaborn-overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/seaborn-overview.png
--------------------------------------------------------------------------------
/lectures/figs/slido-qr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/slido-qr.png
--------------------------------------------------------------------------------
/lectures/figs/tidy-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andrewpbray/python-for-r-users/d9624a1b3a2ac29b2643342a1b88a3626a794cb5/lectures/figs/tidy-1.png
--------------------------------------------------------------------------------
/lectures/libs/anchor-sections-1.0/anchor-sections.css:
--------------------------------------------------------------------------------
1 | /* Styles for section anchors */
2 | a.anchor-section {margin-left: 10px; visibility: hidden; color: inherit;}
3 | a.anchor-section::before {content: '#';}
4 | .hasAnchor:hover a.anchor-section {visibility: visible;}
5 |
--------------------------------------------------------------------------------
/lectures/libs/anchor-sections-1.0/anchor-sections.js:
--------------------------------------------------------------------------------
1 | // Anchor sections v1.0 written by Atsushi Yasumoto on Oct 3rd, 2020.
2 | document.addEventListener('DOMContentLoaded', function() {  // once the DOM is parsed, append an <a> link to every heading that has, or can borrow, an id
3 |   // Do nothing if AnchorJS is used
4 |   if (typeof window.anchors === 'object' && anchors.hasOwnProperty('hasAnchorJSLink')) {
5 |     return;
6 |   }
7 |
8 |   const h = document.querySelectorAll('h1, h2, h3, h4, h5, h6');
9 |
10 |   // Do nothing if sections are already anchored
11 |   if (Array.from(h).some(x => x.classList.contains('hasAnchor'))) {
12 |     return null;
13 |   }
14 |
15 |   // Use section id when pandoc runs with --section-divs
16 |   const section_id = function(x) {  // returns x.id for a .section / <section> element, '' for anything else
17 |     return ((x.classList.contains('section') || (x.tagName === 'SECTION'))
18 |             ? x.id : '');
19 |   };
20 |
21 |   // Add anchors
22 |   h.forEach(function(x) {
23 |     const id = x.id || section_id(x.parentElement);  // prefer the heading's own id, else the enclosing section's
24 |     if (id === '') {  // nothing to link to: leave this heading without an anchor
25 |       return null;
26 |     }
27 |     let anchor = document.createElement('a');
28 |     anchor.href = '#' + id;
29 |     anchor.classList = ['anchor-section'];  // assignment is forwarded to classList.value; the one-element array stringifies to 'anchor-section'
30 |     x.classList.add('hasAnchor');  // marker class checked by the "already anchored" guard above
31 |     x.appendChild(anchor);
32 |   });
33 | });
34 |
--------------------------------------------------------------------------------
/lectures/libs/header-attrs-2.5/header-attrs.js:
--------------------------------------------------------------------------------
1 | // Pandoc 2.9 adds attributes on both header and div. We remove the former (to
2 | // be compatible with the behavior of Pandoc < 2.8).
3 | document.addEventListener('DOMContentLoaded', function(e) {
4 |   var hs = document.querySelectorAll("div.section[class*='level'] > :first-child");  // first child of every div.section whose class attribute contains "level"
5 |   var i, h, a;
6 |   for (i = 0; i < hs.length; i++) {
7 |     h = hs[i];
8 |     if (!/^h[1-6]$/i.test(h.tagName)) continue; // it should be a header h1-h6
9 |     a = h.attributes;
10 |     while (a.length > 0) h.removeAttribute(a[0].name);  // `a` is a live collection, so keep removing its first entry until it is empty
11 |   }
12 | });
13 |
--------------------------------------------------------------------------------
/lectures/libs/remark-css-0.0.1/metropolis.css:
--------------------------------------------------------------------------------
1 | .remark-slide-content {
2 | background-color: #FAFAFA;
3 | border-top: 80px solid #23373B;
4 | font-size: 20px;
5 | font-weight: 300;
6 | line-height: 1.5;
7 | padding: 1em 2em 1em 2em
8 | }
9 |
10 | .title-slide .inverse .remark-slide-content {
11 | background-color: #FAFAFA;
12 | }
13 |
14 | .inverse {
15 | background-color: #23373B;
16 | text-shadow: none;
17 | }
18 |
19 | /* Removes colored bar from top of the slide resulting in a clear slide */
20 | .clear{
21 | border-top: 0px solid #FAFAFA;
22 | }
23 |
24 | h1 {
25 | font-weight: normal;
26 | margin-top: -95px;
27 | margin-left: -00px;
28 | color: #FAFAFA;
29 | }
30 |
31 | h2, h3, h4 {
32 |   padding-top: -15px; /* NOTE(review): padding cannot be negative, so browsers discard this declaration */
33 |   padding-bottom: 00px;
34 |   color: #1A292C;
35 |   text-shadow: none;
36 |   font-weight: 400;
37 |   text-align: left;
38 |   margin-left: 00px;
39 |   margin-bottom: -10px; /* negative margins, unlike negative padding, are valid */
40 | }
41 |
42 | .remark-slide-content h1 {
43 | font-size: 45px;
44 | }
45 |
46 | .remark-slide-content h2 {
47 | font-size: 35px;
48 | }
49 |
50 | .remark-slide-content h3 {
51 | font-size: 30px;
52 | }
53 |
54 | .left-column h2, .left-column h3, .left-column h4 {
55 | color: #777;
56 | }
57 |
58 | .left-column h2:last-of-type, .left-column h3:last-child {
59 | color: #1A292C;
60 | }
61 |
62 | .title-slide {
63 | background-color: #FAFAFA;
64 | border-top: 80px solid #FAFAFA;
65 | }
66 |
67 | .title-slide h1 {
68 | color: #1A292C;
69 | font-size: 40px;
70 | text-shadow: none;
71 | font-weight: 400;
72 | text-align: left;
73 | margin-left: 15px;
74 | padding-top: 80px;
75 | }
76 | .title-slide h2 {
77 |   margin-top: -25px;
78 |   padding-bottom: -20px; /* NOTE(review): padding cannot be negative, so browsers discard this declaration */
79 |   color: #1A292C;
80 |   text-shadow: none;
81 |   font-weight: 300;
82 |   font-size: 35px;
83 |   text-align: left;
84 |   margin-left: 15px;
85 | }
86 | .title-slide h3 {
87 | color: #1A292C;
88 | text-shadow: none;
89 | font-weight: 300;
90 | font-size: 25px;
91 | text-align: left;
92 | margin-left: 15px;
93 | margin-bottom: -30px;
94 | }
95 |
96 | hr, .title-slide h2::after, .mline h1::after {
97 | content: '';
98 | display: block;
99 | border: none;
100 | background-color: #EB811B;
101 | color: #EB811B;
102 | height: 1px;
103 | }
104 |
105 | hr, .mline h1::after {
106 | margin: 1em 15px 0 15px;
107 | }
108 |
109 | .title-slide h2::after {
110 | margin: 10px 15px 35px 0;
111 | }
112 |
113 | .mline h1::after {
114 | margin: 10px 15px 0 15px;
115 | }
116 |
117 | .remark-slide-number {
118 | font-size: 13pt;
119 | color: #272822;
120 | opacity: 1;
121 | }
122 | .inverse .remark-slide-number {
123 | font-size: 13pt;
124 | color: #FAFAFA;
125 | opacity: 1;
126 | }
127 |
128 | /* turns off slide numbers for title page: https://github.com/gnab/remark/issues/298 */
129 | .title-slide .remark-slide-number {
130 | display: none;
131 | }
132 |
133 | .remark-inline-code {
134 | /* background: #F5F5F5; /* lighter */
135 | background: #e7e8e2; /* darker */
136 | border-radius: 3px;
137 | padding: 4px;
138 | }
139 |
140 | .code10 .remark-code {
141 | font-size: 10%;
142 | }
143 |
144 | .code20 .remark-code {
145 | font-size: 20%;
146 | }
147 |
148 | .code30 .remark-code {
149 | font-size: 30%;
150 | }
151 |
152 | .code40 .remark-code {
153 | font-size: 40%;
154 | }
155 |
156 | .code50 .remark-code {
157 | font-size: 50%;
158 | }
159 |
160 | .code60 .remark-code {
161 | font-size: 60%;
162 | }
163 |
164 | .code70 .remark-code {
165 | font-size: 70%;
166 | }
167 |
168 | .code80 .remark-code {
169 | font-size: 80%;
170 | }
171 |
172 | .code90 .remark-code {
173 | font-size: 90%;
174 | }
175 |
176 | .code100 .remark-code {
177 | font-size: 100%;
178 | }
179 |
180 | .font10 {
181 | font-size: 10%;
182 | }
183 |
184 | .font20 {
185 | font-size: 20%;
186 | }
187 |
188 | .font30 {
189 | font-size: 30%;
190 | }
191 |
192 | .font40 {
193 | font-size: 40%;
194 | }
195 |
196 | .font50 {
197 | font-size: 50%;
198 | }
199 |
200 | .font60 {
201 | font-size: 60%;
202 | }
203 |
204 | .font70 {
205 | font-size: 70%;
206 | }
207 |
208 | .font80 {
209 | font-size: 80%;
210 | }
211 |
212 | .font90 {
213 | font-size: 90%;
214 | }
215 |
216 | .font100 {
217 | font-size: 100%;
218 | }
219 |
220 | .font110 {
221 | font-size: 110%;
222 | }
223 |
224 | .font120 {
225 | font-size: 120%;
226 | }
227 |
228 | .font130 {
229 | font-size: 130%;
230 | }
231 |
232 | .font140 {
233 | font-size: 140%;
234 | }
235 |
236 | .font150 {
237 | font-size: 150%;
238 | }
239 |
240 | .font160 {
241 | font-size: 160%;
242 | }
243 | .font170 {
244 | font-size: 170%;
245 | }
246 | .font180 {
247 | font-size: 180%;
248 | }
249 | .font190 {
250 | font-size: 190%;
251 | }
252 | .font200 {
253 | font-size: 200%;
254 | }
255 |
--------------------------------------------------------------------------------
/libs/remark-css-0.0.1/metropolis.css:
--------------------------------------------------------------------------------
1 | .remark-slide-content { /* base slide: light background with an 80px dark border along the top */
2 | background-color: #FAFAFA;
3 | border-top: 80px solid #23373B;
4 | font-size: 20px;
5 | font-weight: 300;
6 | line-height: 1.5;
7 | padding: 1em 2em 1em 2em; /* terminated explicitly so later declarations can be appended safely */
8 | }
9 |
10 | .title-slide .inverse .remark-slide-content {
11 | background-color: #FAFAFA;
12 | }
13 |
14 | .inverse {
15 | background-color: #23373B;
16 | text-shadow: none;
17 | }
18 |
19 | /* Removes colored bar from top of the slide resulting in a clear slide */
20 | .clear{
21 | border-top: 0px solid #FAFAFA;
22 | }
23 |
24 | h1 { /* light-coloured heading; the negative top margin presumably lifts it into the 80px top border - confirm visually */
25 | font-weight: normal;
26 | margin-top: -95px;
27 | margin-left: 0;
28 | color: #FAFAFA;
29 | }
30 |
31 | h2, h3, h4 { /* sub-headings: dark, left-aligned text with a negative bottom margin */
32 | /* padding-top: -15px; disabled: negative padding is invalid CSS, so browsers already ignored it */
33 | padding-bottom: 0;
34 | color: #1A292C;
35 | text-shadow: none;
36 | font-weight: 400;
37 | text-align: left;
38 | margin-left: 0;
39 | margin-bottom: -10px;
40 | }
41 |
42 | .remark-slide-content h1 {
43 | font-size: 45px;
44 | }
45 |
46 | .remark-slide-content h2 {
47 | font-size: 35px;
48 | }
49 |
50 | .remark-slide-content h3 {
51 | font-size: 30px;
52 | }
53 |
54 | .left-column h2, .left-column h3, .left-column h4 {
55 | color: #777;
56 | }
57 |
58 | .left-column h2:last-of-type, .left-column h3:last-child {
59 | color: #1A292C;
60 | }
61 |
62 | .title-slide {
63 | background-color: #FAFAFA;
64 | border-top: 80px solid #FAFAFA;
65 | }
66 |
67 | .title-slide h1 {
68 | color: #1A292C;
69 | font-size: 40px;
70 | text-shadow: none;
71 | font-weight: 400;
72 | text-align: left;
73 | margin-left: 15px;
74 | padding-top: 80px;
75 | }
76 | .title-slide h2 { /* h2 on the title slide: lighter weight, pulled up toward the h1 by a negative top margin */
77 | margin-top: -25px;
78 | /* padding-bottom: -20px; disabled: negative padding is invalid CSS, so browsers already ignored it */
79 | color: #1A292C;
80 | text-shadow: none;
81 | font-weight: 300;
82 | font-size: 35px;
83 | text-align: left;
84 | margin-left: 15px;
85 | }
86 | .title-slide h3 {
87 | color: #1A292C;
88 | text-shadow: none;
89 | font-weight: 300;
90 | font-size: 25px;
91 | text-align: left;
92 | margin-left: 15px;
93 | margin-bottom: -30px;
94 | }
95 |
96 | hr, .title-slide h2::after, .mline h1::after {
97 | content: '';
98 | display: block;
99 | border: none;
100 | background-color: #EB811B;
101 | color: #EB811B;
102 | height: 1px;
103 | }
104 |
105 | hr, .mline h1::after {
106 | margin: 1em 15px 0 15px;
107 | }
108 |
109 | .title-slide h2::after {
110 | margin: 10px 15px 35px 0;
111 | }
112 |
113 | .mline h1::after {
114 | margin: 10px 15px 0 15px;
115 | }
116 |
117 | .remark-slide-number {
118 | font-size: 13pt;
119 | color: #272822;
120 | opacity: 1;
121 | }
122 | .inverse .remark-slide-number {
123 | font-size: 13pt;
124 | color: #FAFAFA;
125 | opacity: 1;
126 | }
127 |
128 | /* turns off slide numbers for title page: https://github.com/gnab/remark/issues/298 */
129 | .title-slide .remark-slide-number {
130 | display: none;
131 | }
132 |
133 | .remark-inline-code {
134 | /* background: #F5F5F5; (lighter alternative, disabled) */
135 | background: #e7e8e2; /* darker; this is the value in effect */
136 | border-radius: 3px;
137 | padding: 4px;
138 | }
139 |
140 | .code10 .remark-code {
141 | font-size: 10%;
142 | }
143 |
144 | .code20 .remark-code {
145 | font-size: 20%;
146 | }
147 |
148 | .code30 .remark-code {
149 | font-size: 30%;
150 | }
151 |
152 | .code40 .remark-code {
153 | font-size: 40%;
154 | }
155 |
156 | .code50 .remark-code {
157 | font-size: 50%;
158 | }
159 |
160 | .code60 .remark-code {
161 | font-size: 60%;
162 | }
163 |
164 | .code70 .remark-code {
165 | font-size: 70%;
166 | }
167 |
168 | .code80 .remark-code {
169 | font-size: 80%;
170 | }
171 |
172 | .code90 .remark-code {
173 | font-size: 90%;
174 | }
175 |
176 | .code100 .remark-code {
177 | font-size: 100%;
178 | }
179 |
180 | .font10 {
181 | font-size: 10%;
182 | }
183 |
184 | .font20 {
185 | font-size: 20%;
186 | }
187 |
188 | .font30 {
189 | font-size: 30%;
190 | }
191 |
192 | .font40 {
193 | font-size: 40%;
194 | }
195 |
196 | .font50 {
197 | font-size: 50%;
198 | }
199 |
200 | .font60 {
201 | font-size: 60%;
202 | }
203 |
204 | .font70 {
205 | font-size: 70%;
206 | }
207 |
208 | .font80 {
209 | font-size: 80%;
210 | }
211 |
212 | .font90 {
213 | font-size: 90%;
214 | }
215 |
216 | .font100 {
217 | font-size: 100%;
218 | }
219 |
220 | .font110 {
221 | font-size: 110%;
222 | }
223 |
224 | .font120 {
225 | font-size: 120%;
226 | }
227 |
228 | .font130 {
229 | font-size: 130%;
230 | }
231 |
232 | .font140 {
233 | font-size: 140%;
234 | }
235 |
236 | .font150 {
237 | font-size: 150%;
238 | }
239 |
240 | .font160 {
241 | font-size: 160%;
242 | }
243 | .font170 {
244 | font-size: 170%;
245 | }
246 | .font180 {
247 | font-size: 180%;
248 | }
249 | .font190 {
250 | font-size: 190%;
251 | }
252 | .font200 {
253 | font-size: 200%;
254 | }
255 |
--------------------------------------------------------------------------------
/small-notes/pointers-identity-equality.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Pointers, Equality, and Identity"
3 | output: github_document
4 | ---
5 |
6 |
7 | ## Pointers in R
8 |
9 | ```{r setup, include=FALSE}
10 | knitr::opts_chunk$set(echo = TRUE, message = FALSE)
11 | library(reticulate)
12 | ```
13 |
14 | Let's start by creating a vector of the integers 1 to 10 in R.
15 |
16 | ```{r}
17 | x <- 1:10
18 | x
19 | ```
20 |
21 | We can ask for the location of this object in memory using `obj_addr()`.
22 |
23 | ```{r}
24 | library(lobstr)
25 | obj_addr(x)
26 | ```
27 |
28 | Let's point another variable `y` at `x` and see where it is stored.
29 |
30 | ```{r}
31 | y <- x
32 | obj_addr(y)
33 | ```
34 |
35 | We can see that they're actually pointing to the same object. This is a sensible thing to do to save memory - why copy over something that's the same?
36 |
37 | Let's see what happens if we change `x` in place:
38 |
39 | ```{r}
40 | x[1] <- 99
41 | obj_addrs(list(x, y))
42 | ```
43 |
44 | We see that while `y` is still pointing to the original object, R has copied over the contents of `x` into a new location, then made the change. This behavior is helpful if you want to be sure to not inadvertently overwrite objects, but it does make me wonder why we call that syntax "change in place"! Perhaps "change in object" would be better.
45 |
46 |
47 | ## Pointers in Python
48 |
49 | Let's go through the same process with Python.
50 |
51 | ```{python}
52 | x = list(range(1, 11))
53 | x
54 | y = x
55 | [id(x), id(y)]
56 | ```
57 |
58 | So far they're behaving the same way: both `x` and `y` are pointing to the same object in memory. Let's see what happens when we change `x`.
59 |
60 | ```{python}
61 | x[0] = 99
62 | [id(x), id(y)]
63 | ```
64 |
65 | *That's* different. Python truly does change `x` in place; it maintains the object's location in memory so that anything else pointing to that memory location also sees the change.
66 |
67 |
68 | ## Equality and Identity
69 |
70 | Two objects are *equal* if they share the same value. Two objects are *identical* if they're actually the same object in memory. In Python, `x` and `y` are both identical and equal.
71 |
72 | ```{python}
73 | x is y # identical
74 | x == y # equal
75 | ```
76 |
77 | We can create a third variable `z`, that's equal,
78 |
79 | ```{python}
80 | z = [99, 2, 3, 4, 5, 6, 7, 8, 9, 10]
81 | z == x
82 | ```
83 |
84 | But because it's not referring to the same object in memory, it's not identical.
85 |
86 | ```{python}
87 | [id(z), id(x)]
88 | z is x
89 | ```
90 |
91 | In R, `x` and `y` were both equal and identical initially, but when we changed `x`, we broke both the equality and the identity.
92 |
93 | ```{r}
94 | x == y # not equal
95 | obj_addr(x) == obj_addr(y) # not identical
96 | ```
97 |
98 | Note that there is a function called `identical()` in R that isn't using the same definition of *identical* that we're using here: it compares the values of the two objects, not their memory addresses. Instead of outputting a vector of `TRUE` and `FALSE` for each pairwise logical comparison that we're making between `x` and `y`, it only returns `TRUE` if all values are `TRUE` and `FALSE` otherwise.
99 |
100 | ```{r}
101 | identical(x, y)
102 | ```
103 |
104 |
105 |
106 |
--------------------------------------------------------------------------------
/small-notes/pointers-identity-equality.md:
--------------------------------------------------------------------------------
1 | Pointers, Equality, and Identity
2 | ================
3 |
4 | ## Pointers in R
5 |
6 | Let’s start by creating a vector of the integers 1 to 10 in R.
7 |
8 | ``` r
9 | x <- 1:10
10 | x
11 | ```
12 |
13 | ## [1] 1 2 3 4 5 6 7 8 9 10
14 |
15 | We can ask for the location of this object in memory using `obj_addr()`.
16 |
17 | ``` r
18 | library(lobstr)
19 | obj_addr(x)
20 | ```
21 |
22 | ## [1] "0x7fcd27330420"
23 |
24 | Let’s point another variable `y` at `x` and see where it is stored.
25 |
26 | ``` r
27 | y <- x
28 | obj_addr(y)
29 | ```
30 |
31 | ## [1] "0x7fcd27330420"
32 |
33 | We can see that they’re actually pointing to the same object. This is
34 | a sensible thing to do to save memory - why copy over something that’s
35 | the same?
36 |
37 | Let’s see what happens if we change `x` in place:
38 |
39 | ``` r
40 | x[1] <- 99
41 | obj_addrs(list(x, y))
42 | ```
43 |
44 | ## [1] "0x7fcd10b96388" "0x7fcd27330420"
45 |
46 | We see that while `y` is still pointing to the original object, R has
47 | copied over the contents of `x` into a new location, then made the
48 | change. This behavior is helpful if you want to be sure to not
49 | inadvertently overwrite objects, but it does make me wonder why we call
50 | that syntax “change in place”\! Perhaps “change in object” would be
51 | better.
52 |
53 | ## Pointers in Python
54 |
55 | Let’s go through the same process with Python.
56 |
57 | ``` python
58 | x = list(range(1, 11))
59 | x
60 | ```
61 |
62 | ## [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
63 |
64 | ``` python
65 | y = x
66 | [id(x), id(y)]
67 | ```
68 |
69 | ## [140518311388936, 140518311388936]
70 |
71 | So far they’re behaving the same way: both `x` and `y` are pointing to
72 | the same object in memory. Let’s see what happens when we change `x`.
73 |
74 | ``` python
75 | x[0] = 99
76 | [id(x), id(y)]
77 | ```
78 |
79 | ## [140518311388936, 140518311388936]
80 |
81 | *That’s* different. Python truly does change `x` in place; it maintains
82 | the object’s location in memory so that anything else pointing to that
83 | memory location also sees the change.
84 |
85 | ## Equality and Identity
86 |
87 | Two objects are *equal* if they share the same value. Two objects are
88 | *identical* if they’re actually the same object in memory. In Python,
89 | `x` and `y` are both identical and equal.
90 |
91 | ``` python
92 | x is y # identical
93 | ```
94 |
95 | ## True
96 |
97 | ``` python
98 | x == y # equal
99 | ```
100 |
101 | ## True
102 |
103 | We can create a third variable `z`, that’s equal,
104 |
105 | ``` python
106 | z = [99, 2, 3, 4, 5, 6, 7, 8, 9, 10]
107 | z == x
108 | ```
109 |
110 | ## True
111 |
112 | But because it’s not referring to the same object in memory, it’s not
113 | identical.
114 |
115 | ``` python
116 | [id(z), id(x)]
117 | ```
118 |
119 | ## [140518311389384, 140518311388936]
120 |
121 | ``` python
122 | z is x
123 | ```
124 |
125 | ## False
126 |
127 | In R, `x` and `y` were both equal and identical initially, but when we
128 | changed `x`, we broke both the equality and the identity.
129 |
130 | ``` r
131 | x == y # not equal
132 | ```
133 |
134 | ## [1] FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
135 |
136 | ``` r
137 | obj_addr(x) == obj_addr(y) # not identical
138 | ```
139 |
140 | ## [1] FALSE
141 |
142 | Note that there is a function called `identical()` in R that isn’t using
143 | the same definition of *identical* that we’re using here. Instead of
144 | outputting a vector of `TRUE` and `FALSE` for each pairwise logical
145 | comparison that we’re making between `x` and `y`, it only returns `TRUE`
146 | if all values are `TRUE` and `FALSE` otherwise.
147 |
148 | ``` r
149 | identical(x, y)
150 | ```
151 |
152 | ## [1] FALSE
153 |
--------------------------------------------------------------------------------
/syllabus.md:
--------------------------------------------------------------------------------
1 | Python for R Users
2 | ================
3 |
4 | This course is designed for students who are adept at using R to analyze
5 | data and are looking to expand their toolbox by gaining familiarity with
6 | Python. The emphasis will be on learning Python’s capacity to do data
7 | science, both as an analog to R and as a distinct language with
8 | particular strengths and weaknesses.
9 |
10 | **Instructor**: Andrew Bray (andrewbray@berkeley.edu)
11 |
12 | **Lecture**: Tu 9-10am
13 |
14 | **Office Hours**: W 2-3 pm, Th 4-5pm (see Piazza for link)
15 |
16 | **Materials**
17 |
18 | - [A Whirlwind Tour of
19 | Python](https://www.oreilly.com/programming/free/files/a-whirlwind-tour-of-python.pdf),
20 | Jake VanderPlas
21 |
22 | - [Python Data Science
23 | Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/),
24 | Jake VanderPlas
25 |
26 | ### Course Communication
27 |
28 | This term we will be using Piazza for class discussion. The system is
29 | highly catered to getting you help fast and efficiently from classmates
30 | and myself. Rather than emailing questions to me, I encourage you to
31 | post your questions on Piazza.
32 |
33 | ### Grading
34 |
35 | This is a 1 unit Credit/No Credit class. Your assessment is based on two
36 | items:
37 |
38 | **Homework**: Simple practice assignments that serve to cement your
39 | familiarity with the basic aspects of coding in Python. There will be a
40 | homework following each lecture and they will generally be due on
41 | gradescope at 8 pm on Friday. These will be graded on a credit/no credit
42 | basis.
43 |
44 | **Lab**: More involved analyses due on gradescope 8pm Sunday. These will
45 | be graded on a credit/no credit basis.
46 |
47 | To get credit for the course, you must get credit for 2/3 of the homeworks
48 | and 2/3 of the labs.
49 |
50 | ### Tentative list of topics
51 |
52 | 1. Using Python with R
53 | 2. Python basics I: syntax, semantics, and types
54 | 3. Python basics II: control flow, functions, iteration
55 | 4. Computations and aggregations on arrays with NumPy
56 | 5. Data wrangling with Pandas
57 | 6. Visualization with Matplotlib and Plotly
58 | 7. Modeling with Scikit-Learn
59 | 8. Object-oriented programming in Python and R
60 |
--------------------------------------------------------------------------------