├── docs
│   ├── _config.yml
│   └── index.md
├── python
│   ├── images
│   │   ├── Building1.tif
│   │   ├── Building2.tif
│   │   ├── Building3.tif
│   │   ├── Building4.tif
│   │   ├── Building5.tif
│   │   ├── Building6.tif
│   │   ├── Building7.tif
│   │   └── Building8.tif
│   ├── covid19
│   │   ├── data
│   │   │   ├── eurostat_hlth_rs_bds.csv
│   │   │   ├── eurostat-population.csv
│   │   │   ├── eurostat_population_by_age_2019.csv
│   │   │   └── movie-critics.json
│   │   └── images
│   │       └── datiaperti_small.png
│   ├── atn
│   │   ├── taxonomy
│   │   │   └── signal_chain_charts.xlsx
│   │   ├── capacitors.ipynb
│   │   ├── resistors.ipynb
│   │   ├── logfiles
│   │   │   └── Users_Navigation_Data.doc
│   │   └── microcircuites_and_descretes.ipynb
│   ├── physics
│   │   └── images
│   │       └── pollastrini_flooding.png
│   ├── finance
│   │   └── data
│   │       ├── persons.xml
│   │       ├── world_companies.html
│   │       └── ENI.MI.csv
│   ├── README.md
│   ├── stats
│   │   └── data
│   │       └── italy
│   │           ├── unemployment_rate_istat_province.csv
│   │           └── unemployment_rate_istat_province_cod_den_uts.csv
│   ├── recommendations.ipynb
│   ├── parsing-data.ipynb
│   ├── python_oop.ipynb
│   └── linalgebra
│       └── linalgebra_ch1.ipynb
├── .gitignore
├── r
│   ├── stat_learning
│   │   ├── data
│   │   │   ├── 5.R.RData
│   │   │   ├── 7.R.RData
│   │   │   ├── 10.R.RData
│   │   │   └── Auto.csv
│   │   ├── test_roc.R
│   │   └── chapter1.ipynb
│   ├── rethinking
│   │   ├── simulated_science_distortion.R
│   │   ├── quadratic_approximation.R
│   │   ├── monte_carlo_globe_tossing.R
│   │   ├── quadratic_approximation_height.R
│   │   ├── ch10_maximum_entropy.R
│   │   ├── simulations.R
│   │   ├── dbinom_grid.R
│   │   ├── collider_bias.R
│   │   ├── ch9_easy_hmc.R
│   │   ├── ch9_king_markov_decision_procedure.R
│   │   ├── kruschke.R
│   │   ├── categorical_variables.R
│   │   ├── posterior_predictive_distribution.R
│   │   ├── b_spline.R
│   │   ├── normal_distribution.R
│   │   ├── ch11_binomial_regression.R
│   │   ├── sampling_from_grid.R
│   │   ├── waic_information_criteria.R
│   │   ├── ch9_hamiltonian_monte_carlo.R
│   │   ├── percentile_intervals.R
│   │   ├── post_treatment_bias.R
│   │   ├── polynomial_regression.R
│   │   ├── gaussian_model_of_height.R
│   │   ├── multicollineariry.R
│   │   ├── binomial_distribution.R
│   │   ├── ch8_continuous_interactions.R
│   │   ├── linear_prediction.R
│   │   ├── overfitting.R
│   │   ├── spurious_association.R
│   │   ├── interaction_model.R
│   │   └── masked_relationship.R
│   └── migration_policy
│       └── data
│           └── policy_database_switzerland.csv
├── README.md
├── julia
│   └── learn_julia.ipynb
└── datasets.md
/docs/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-minimal
--------------------------------------------------------------------------------
/python/images/Building1.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building1.tif
--------------------------------------------------------------------------------
/python/images/Building2.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building2.tif
--------------------------------------------------------------------------------
/python/images/Building3.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building3.tif
--------------------------------------------------------------------------------
/python/images/Building4.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building4.tif
--------------------------------------------------------------------------------
/python/images/Building5.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building5.tif
--------------------------------------------------------------------------------
/python/images/Building6.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building6.tif
--------------------------------------------------------------------------------
/python/images/Building7.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building7.tif
--------------------------------------------------------------------------------
/python/images/Building8.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building8.tif
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints
2 | __pycache__
3 | .Rproj.user
4 | r.Rproj
5 | .Rhistory
6 | .RData
7 | .Ruserdata
8 |
--------------------------------------------------------------------------------
/r/stat_learning/data/5.R.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/r/stat_learning/data/5.R.RData
--------------------------------------------------------------------------------
/r/stat_learning/data/7.R.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/r/stat_learning/data/7.R.RData
--------------------------------------------------------------------------------
/r/stat_learning/data/10.R.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/r/stat_learning/data/10.R.RData
--------------------------------------------------------------------------------
/python/covid19/data/eurostat_hlth_rs_bds.csv:
--------------------------------------------------------------------------------
1 | country_code,2017,2018
2 | IT,192548,N/A
3 | DE,661448,N/A
4 | FR,399865,N/A
5 | ES,138511,N/A
6 |
--------------------------------------------------------------------------------
/python/covid19/images/datiaperti_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/covid19/images/datiaperti_small.png
--------------------------------------------------------------------------------
/python/atn/taxonomy/signal_chain_charts.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/atn/taxonomy/signal_chain_charts.xlsx
--------------------------------------------------------------------------------
/python/physics/images/pollastrini_flooding.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/physics/images/pollastrini_flooding.png
--------------------------------------------------------------------------------
/python/finance/data/persons.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Pippo
5 | Pippo
6 | United States
7 |
8 |
9 | Mickey
10 | Mouse
11 | United States
12 |
13 |
14 |
--------------------------------------------------------------------------------
/r/rethinking/simulated_science_distortion.R:
--------------------------------------------------------------------------------
1 | # R code 6.1 rethinking book
2 | set.seed(1914)
3 | N <- 200 # num grant proposals
4 | P <- 0.1 # proportion to select
5 | # uncorrelated newsworthiness and trustworthiness
6 | nw <- rnorm(N)
7 | tw <- rnorm(N)
8 | # select top 10 % of combined scores
9 | s <- nw + tw # total score
10 | q <- quantile(s, 1 - P) # top 10 % threshold
11 | selected <- ifelse(s >= q, TRUE, FALSE)
12 | cor(tw[selected], nw[selected])
13 | plot(tw[selected], nw[selected])
14 |
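15 | # Quick check: across all proposals nw and tw are independent draws, so their
16 | # correlation is near zero; the negative correlation above is created by the
17 | # selection step alone (the selection-distortion effect).
18 | cor(tw, nw)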
--------------------------------------------------------------------------------
/r/rethinking/quadratic_approximation.R:
--------------------------------------------------------------------------------
1 | # R code 2.6, 2.7 rethinking book
2 |
3 | library(rethinking)
4 |
5 | globe.qa <- quap(
6 | alist(
7 | W ~ dbinom(W + L, p), # binomial likelihood
8 | p ~ dunif(0,1) # uniform prior
9 | ),
10 | data = list(W = 6, L = 3)
11 | )
12 |
13 | precis(globe.qa)
14 |
15 | # analytical calculation
16 | W <- 6
17 | L <- 3
18 |
19 | curve(dbeta(x, W + 1, L + 1), from = 0, to = 1)
20 | #quadratic approximation
21 | curve(dnorm(x, 0.67, 0.16), lty = 2, add = TRUE)
22 |
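23 | # Quick check: the analytic Beta(W+1, L+1) posterior has mode W/(W+L) = 2/3,
24 | # which matches the centre of the Gaussian (quadratic) approximation above.
25 | W / (W + L)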
--------------------------------------------------------------------------------
/r/rethinking/monte_carlo_globe_tossing.R:
--------------------------------------------------------------------------------
1 | # R code 2.8 rethinking book
2 | n_samples <- 1000
3 | p <- rep( NA , n_samples )
4 | p[1] <- 0.5
5 | W <- 6
6 | L <- 3
7 | library(rethinking) # for dens()
8 | for ( i in 2:n_samples ) {
9 | p_new <- rnorm( 1 , p[i-1] , 0.1 )
10 | if ( p_new < 0 ) p_new <- abs( p_new )
11 | if ( p_new > 1 ) p_new <- 2 - p_new
12 | q0 <- dbinom( W , W+L , p[i-1] )
13 | q1 <- dbinom( W , W+L , p_new )
14 | p[i] <- ifelse( runif(1) < q1/q0 , p_new , p[i-1] )
15 | }
16 |
17 | dens( p , xlim=c(0,1) )
18 | curve( dbeta( x , W+1 , L+1 ) , lty=2 , add=TRUE )
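19 |
20 | # This loop is a simple Metropolis sampler: each proposal p_new is accepted with
21 | # probability q1/q0, and the resulting samples converge to the Beta(W+1, L+1)
22 | # posterior drawn as the dashed curve.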
--------------------------------------------------------------------------------
/r/rethinking/quadratic_approximation_height.R:
--------------------------------------------------------------------------------
1 | # R code 4.26 rethinking book
2 | library(rethinking)
3 | data("Howell1")
4 | d <- Howell1
5 | d2 <- d[d$age >= 18,]
6 |
7 | # model definition
8 | flist <- alist(
9 | height ~ dnorm(mu, sigma),
10 | mu ~ dnorm(156, 10),
11 | sigma ~ dunif(0, 50)
12 | )
13 |
14 | m4.1 <- quap(flist, data = d2)
15 | precis(m4.1)
16 |
17 | # variance-covariance matrix
18 | vcov(m4.1)
19 | # variances
20 | diag(vcov(m4.1))
21 | cov2cor(vcov(m4.1))
22 |
23 | # samples from the multi-dimensional posterior
24 | post <- extract.samples(m4.1, n = 1e4)
25 | head(post)
26 | precis(post)
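27 |
28 | # extract.samples() draws from a multivariate Gaussian defined by the quap means
29 | # and the variance-covariance matrix above, so the sampled mu and sigma keep
30 | # their posterior correlation.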
--------------------------------------------------------------------------------
/r/rethinking/ch10_maximum_entropy.R:
--------------------------------------------------------------------------------
1 | # R code 10.1 rethinking book
2 | # Example 5 buckets, 10 pebbles.
3 | p <- list()
4 | # we define 5 different distributions
5 | p$A <- c(0, 0, 10, 0, 0)
6 | p$B <- c(0, 1, 8, 1, 0)
7 | p$C <- c(0, 2, 6, 2, 0)
8 | p$D <- c(1, 2, 4, 2, 1)
9 | p$E <- c(2, 2, 2, 2, 2)
10 |
11 | # we define the probability distribution by normalizing them
12 | p_norm <- lapply(p, function(q) q / sum(q))
13 |
14 | # We can compute the information entropy. We can see from it that distribution E
15 | # is the most likely and has the biggest entropy.
16 | (H <- sapply(p_norm, function(q) - sum(ifelse(q == 0, 0, q * log(q)))))
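16 |
17 | # The uniform distribution E attains the maximum possible entropy for 5 buckets,
18 | # log(5) ~ 1.61; the more concentrated distributions have lower entropy.
19 | log(5)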
--------------------------------------------------------------------------------
/python/finance/data/world_companies.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |   <title>World's Companies</title>
 5 | </head>
 6 | <body>
 7 |   <h1>World's Companies</h1>
 8 |   <table>
 9 |     <tr>
10 |       <th>Company</th>
11 |       <th>Contact</th>
12 |       <th>Country</th>
13 |     </tr>
14 |     <tr>
15 |       <td>Alfreds Futterkiste</td>
16 |       <td>Maria Anders</td>
17 |       <td>Germany</td>
18 |     </tr>
19 |     <tr>
20 |       <td>Centro comercial Moctezuma</td>
21 |       <td>Francisco Chang</td>
22 |       <td>Mexico</td>
23 |     </tr>
24 |   </table>
25 | </body>
26 | </html>
--------------------------------------------------------------------------------
/python/covid19/data/eurostat-population.csv:
--------------------------------------------------------------------------------
1 | country_code,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
2 | CH,7593494,7701856,7785806,7870134,7954662,8039060,8139631,8237666,8327126,8419550,8484130,8544527
3 | DE,82217837,82002356,81802257,80222065,80327900,80523746,80767463,81197537,82175684,82521653,82792351,83019213
4 | ES,45668939,46239273,46486619,46667174,46818219,46727890,46512199,46449565,46440099,46528024,46658447,46937060
5 | FR,64007193,64350226,64658856,64978721,65276983,65600350,66165980,66458153,66638391,66809816,66918941,67012883
6 | IT,58652875,59000586,59190143,59364690,59394207,59685227,60782668,60795612,60665551,60589445,60483973,60359546
7 | UK,61571647,62042343,62510197,63022532,63495088,63905342,64351203,64853393,65379044,65844142,66273576,66647112
8 |
--------------------------------------------------------------------------------
/r/rethinking/simulations.R:
--------------------------------------------------------------------------------
1 | # R code from snippet 3.23 (rethinking book).
2 | # We use the binomial distribution with, e.g. parameter p = 0.7,
3 | # to create simulations.
4 | library(rethinking) # for simplehist()
5 | obs <- 1e5
6 | size <- 9
7 | # Generates `obs` observations, each from a sample of `size` tosses.
8 | # Each observation tells how many waters we got in that sample of `size` tosses.
9 | dummy_w <- rbinom(obs, size = size, prob = 0.7)
10 |
11 | # computes how many times we had one of the possible
12 | # values from each sample, e.g. from samples of two tosses (size = 2) we can have
13 | # water 0, 1 or 2 times. The values are normalized.
14 | table(dummy_w) / obs
15 |
16 | # plot the histogram of the distribution in the simulation data
17 | simplehist(dummy_w, xlab = 'dummy water count')
18 |
19 |
20 |
--------------------------------------------------------------------------------
/python/covid19/data/eurostat_population_by_age_2019.csv:
--------------------------------------------------------------------------------
1 | country_code,total,0_5,5_9,10_14,15_19,20_24,25_29,30_34,35_39,40_44,45_49,50_54,55_59,60_64,65_69,70_74,75_79,80_84,85_over
2 | DE,83019213,3926397,3662238,3702180,4003477,4607272,5193335,5409029,5237416,4841738,5584519,6875948,6598218,5493914,4808497,3596545,4089384,3111597,2277509
3 | IT,60359546,2367686,2722796,2871733,2897141,2990245,3211025,3369346,3704872,4418357,4824297,4934336,4417895,3846237,3490973,3233852,2728681,2176582,2153492
4 | FR,67012883,3741707,4122551,4183612,4136255,3736133,3790316,4062485,4236160,4107779,4555756,4460465,4327805,4081786,3917445,3266695,2179268,1874151,2232514
5 | ES,46937060,2067503,2356886,2505728,2332161,2288322,2525120,2801658,3436405,3984581,3812140,3640087,3273516,2807378,2406253,2187986,1630452,1363069,1517815
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/r/migration_policy/data/policy_database_switzerland.csv:
--------------------------------------------------------------------------------
1 | Year,Group
2 | 2000,3
3 | 2000,1
4 | 2000,1
5 | 2000,3
6 | 2000,2
7 | 2001,0
8 | 2001,1
9 | 2001,0
10 | 2002,1
11 | 2002,3
12 | 2002,2
13 | 2002,1
14 | 2003,1
15 | 2004,1
16 | 2004,3
17 | 2004,2
18 | 2005,3
19 | 2005,3
20 | 2005,2
21 | 2005,3
22 | 2005,0
23 | 2005,1
24 | 2007,3
25 | 2007,2
26 | 2007,2
27 | 2007,2
28 | 2007,2
29 | 2007,2
30 | 2007,3
31 | 2008,0
32 | 2008,1
33 | 2008,1
34 | 2008,3
35 | 2008,3
36 | 2008,2
37 | 2008,3
38 | 2008,0
39 | 2008,1
40 | 2008,3
41 | 2008,3
42 | 2008,2
43 | 2009,3
44 | 2009,1
45 | 2009,1
46 | 2010,3
47 | 2010,0
48 | 2010,1
49 | 2010,0
50 | 2010,3
51 | 2010,2
52 | 2011,1
53 | 2011,2
54 | 2011,3
55 | 2011,3
56 | 2011,1
57 | 2011,0
58 | 2011,1
59 | 2011,3
60 | 2012,1
61 | 2012,2
62 | 2012,2
63 | 2012,2
64 | 2012,1
65 | 2013,3
--------------------------------------------------------------------------------
/r/rethinking/dbinom_grid.R:
--------------------------------------------------------------------------------
1 | # R code 2.3 rethinking book. The purpose of
2 | # this script is to build a model and evaluate the probability
3 | # distribution (posterior) of the model parameter
4 | # define grid
5 | p_grid <- seq(from = 0, to = 1, length.out = 20)
6 |
7 | # define prior
8 | #prior <- rep(1, 20)
9 | #prior <- ifelse(p_grid < 0.5, 0, 1)
10 | prior <- exp(-5*abs(p_grid - 0.5))
11 |
12 | # compute likelihood at each value in grid
13 | likelihood <-dbinom(6, size = 9, prob = p_grid)
14 |
15 | # compute product of likelihood and prior
16 | unstd.posterior <- likelihood * prior
17 |
18 | # standardize the posterior, so it sums to 1
19 | posterior <- unstd.posterior / sum(unstd.posterior)
20 |
21 | # plot the posterior distribution
22 | plot(p_grid, posterior, type = 'b',
23 | xlab = 'probability of water', ylab = 'posterior probability')
24 | mtext('20 points')
25 |
--------------------------------------------------------------------------------
/r/rethinking/collider_bias.R:
--------------------------------------------------------------------------------
1 | # R code 6.22 rethinking book
2 | library(rethinking)
3 | d <- sim_happiness(seed = 1977, N_years = 1000)
4 | precis(d)
5 | d2 <- d[d$age > 17, ] # only adults
6 | d2$A <- (d2$age - 18) / (65 - 18) # rescale age so that the 18-65 interval maps onto 0-1
7 | d2$mid <- d2$married + 1 # married = 2, not married = 1
8 | m6.9 <- quap(
9 | alist(
10 | happiness ~ dnorm( mu , sigma ) ,
11 | mu <- a[mid] + bA * A , # here we consider marriage status
12 | a[mid] ~ dnorm(0, 1) ,
13 | bA ~ dnorm(0, 2),
14 | sigma ~ dexp(1)
15 | ) ,
16 | data = d2
17 | )
18 |
19 | precis(m6.9, depth = 2)
20 |
21 | m6.10 <- quap(
22 | alist(
23 | happiness ~ dnorm( mu , sigma ) ,
24 | mu <- a + bA * A , # here we do not consider marriage status
25 | a ~ dnorm(0, 1) ,
26 | bA ~ dnorm(0, 2),
27 | sigma ~ dexp(1)
28 | ) ,
29 | data = d2
30 | )
31 |
32 | precis(m6.10)
33 |
34 |
35 |
36 |
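37 | # Marriage status is a collider: both age and happiness influence it, so
38 | # conditioning on it (m6.9) induces a spurious negative association between
39 | # age and happiness, while m6.10, which omits it, shows none.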
--------------------------------------------------------------------------------
/r/rethinking/ch9_easy_hmc.R:
--------------------------------------------------------------------------------
1 | # R code 9.9 rethinking book
2 | # Example using Hamiltonian Monte Carlo
3 | library(rethinking)
4 | data("rugged")
5 | d <- rugged
6 | d$log_gdp <- log(d$rgdppc_2000)
7 | dd <- d[complete.cases(d$rgdppc_2000), ]
8 | dd$log_gdp_std <- dd$log_gdp / mean(dd$log_gdp)
9 | dd$rugged_std <- dd$rugged / max(dd$rugged)
10 | dd$cid <- ifelse(dd$cont_africa == 1, 1, 2)
11 |
12 | dat_slim <- list(
13 | log_gdp_std = dd$log_gdp_std,
14 | rugged_std = dd$rugged_std,
15 | cid = as.integer(dd$cid)
16 | )
17 |
18 | str(dat_slim)
19 |
20 | m9.1 <- ulam(
21 | alist(
22 | log_gdp_std ~ dnorm( mu , sigma ) ,
23 | mu <- a[cid] + b[cid] * (rugged_std - 0.215),
24 | a[cid] ~ dnorm(1, 0.1) ,
25 | b[cid] ~ dnorm(0, 0.3),
26 | sigma ~ dexp(1)
27 | ) ,
28 | data = dat_slim,
29 | chains = 4,
30 | cores = 4,
31 | iter = 1000
32 | )
33 |
34 | precis(m9.1, depth = 2)
35 |
36 | pairs(m9.1)
37 | traceplot(m9.1, chains = 1)
38 |
39 |
--------------------------------------------------------------------------------
/python/covid19/data/movie-critics.json:
--------------------------------------------------------------------------------
1 | {"Lisa Rose": {"Lady in the Water": 2.5, "Snakes on a Plane": 3.5,
2 | "Just My Luck": 3.0, "Superman Returns": 3.5, "You, Me and Dupree": 2.5,
3 | "The Night Listener": 3.0},
4 | "Gene Seymour": {"Lady in the Water": 3.0, "Snakes on a Plane": 3.5,
5 | "Just My Luck": 1.5, "Superman Returns": 5.0, "The Night Listener": 3.0,
6 | "You, Me and Dupree": 3.5},
7 | "Michael Phillips": {"Lady in the Water": 2.5, "Snakes on a Plane": 3.0,
8 | "Superman Returns": 3.5, "The Night Listener": 4.0},
9 | "Claudia Puig": {"Snakes on a Plane": 3.5, "Just My Luck": 3.0,
10 | "The Night Listener": 4.5, "Superman Returns": 4.0,
11 | "You, Me and Dupree": 2.5},
12 | "Mick LaSalle": {"Lady in the Water": 3.0, "Snakes on a Plane": 4.0,
13 | "Just My Luck": 2.0, "Superman Returns": 3.0, "The Night Listener": 3.0,
14 | "You, Me and Dupree": 2.0},
15 | "Jack Matthews": {"Lady in the Water": 3.0, "Snakes on a Plane": 4.0,
16 | "The Night Listener": 3.0, "Superman Returns": 5.0, "You, Me and Dupree": 3.5},
17 | "Toby": {"Snakes on a Plane":4.5,"You, Me and Dupree":1.0,"Superman Returns":4.0}}
18 |
--------------------------------------------------------------------------------
/python/README.md:
--------------------------------------------------------------------------------
1 | Python Cookbook
2 | ===============
3 | A collection of notebooks with examples about the main Python packages: NumPy, Pandas, Matplotlib, SciPy.
4 |
5 | * [NumPy and SciPy](scipy-numpy-cheat-sheet.ipynb)
6 | * [Jupyter widgets](jupyter_widgets.ipynb)
7 | * [Pandas](intro_to_pandas.ipynb)
8 | * [Object-Oriented Python](python_oop.ipynb)
9 |
10 | Good references for Python programming are:
11 | * Jake VanderPlas' [Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/)
12 | * Wes McKinney's [Python for Data Analysis, 3rd Edition](https://wesmckinney.com/book/)
13 | * Scopatz and Huff's [Effective Computation in Physics](https://www.amazon.com/Effective-Computation-Physics-Research-Python-ebook/dp/B010ORQ8DG)
14 | * [Gorelick - High Performance Python, 2nd Edition](https://www.amazon.com/High-Performance-Python-Performant-Programming/dp/1492055026)
15 |
16 | ## Python packages
17 | [pytest](https://docs.pytest.org/en/stable/), framework for tests
18 | [Python Packaging](https://packaging.python.org/en/latest/), Python code distribution
19 | [Read the Docs](https://about.readthedocs.com/), open source software documentation
20 |
--------------------------------------------------------------------------------
/r/rethinking/ch9_king_markov_decision_procedure.R:
--------------------------------------------------------------------------------
1 | # R code 9.1 rethinking book
2 | # King Markov decision procedure
3 | num_weeks <- 1e5
4 | positions <- rep(0, num_weeks)
5 | current <- 10
6 | for (i in 1:num_weeks) {
7 | # record current position
8 | positions[i] <- current
9 |
10 | # flip coin to generate proposal
11 | proposal <- current + sample(c(-1,1), size = 1)
12 | # make sure he loops around the archipelago
13 | if (proposal < 1) proposal <- 10
14 | if (proposal > 10) proposal <- 1
15 |
16 | # move ?
17 | prob_move <- proposal / current
18 | current <- ifelse(runif(1) < prob_move, proposal, current)
19 | }
20 | plot( 1:1000 , positions[1:1000] )
21 | plot( table( positions ) )
22 |
23 | # The many-dimensions problem: for a probability density in a high-dimensional
24 | # space most of the probability mass lies in a thin shell far from the mean,
25 | # so sampling close to the mean won't be efficient.
26 | library(rethinking)
27 | D <- 1000
28 | T <- 1e4
29 | Y <- rmvnorm(T, rep(0, D), diag(D)) # transforms univariate normal distribution, with 0 mean, to a multivariate distribution
30 | rad_dist <- function(Y) sqrt(sum(Y^2))
31 | Rd <- sapply(1:T, function(i) rad_dist(Y[i, ]))
32 | dens(Rd)
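33 |
34 | # With D = 1000 the radial distances concentrate around sqrt(D) ~ 31.6, far from
35 | # the mode at the origin, which is why simple Metropolis proposals made near the
36 | # current point become inefficient in high dimensions.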
--------------------------------------------------------------------------------
/r/rethinking/kruschke.R:
--------------------------------------------------------------------------------
1 | N = 500 # Specify the total number of flips, denoted N.
2 | pHeads = 0.5 # Specify underlying probability of heads.
3 | # Generate a random sample of N flips (heads=1, tails=0):
4 | flipSequence = sample( x=c(0,1), prob=c(1-pHeads,pHeads), size=N, replace=TRUE)
5 | # Compute the running proportion of heads:
6 | r = cumsum( flipSequence ) # Cumulative sum: Number of heads at each step.
7 | n = 1:N # Number of flips at each step.
8 | runProp = r / n # Component by component division.
9 | # Graph the running proportion:
10 | plot( n , runProp , type="o" , log="x" , col="skyblue" ,
11 | xlim=c(1,N) , ylim=c(0.0,1.0) , cex.axis=1.5 ,
12 | xlab="Flip Number" , ylab="Proportion Heads" , cex.lab=1.5 ,
13 | main="Running Proportion of Heads" , cex.main=1.5 )
14 | # Plot a dotted horizontal reference line:
15 | abline( h=pHeads , lty="dotted" )
16 | # Display the beginning of the flip sequence:
17 | flipLetters = paste( c("T","H")[flipSequence[1:10]+1] , collapse="" )
18 | displayString = paste0( "Flip Sequence = " , flipLetters , "..." )
19 | text( N , .9 , displayString , adj=c(1,0.5) , cex=1.3 )
20 | # Display the relative frequency at the end of the sequence.
21 | text( N , .8 , paste("End Proportion =",runProp[N]) , adj=c(1,0.5) , cex=1.3 )
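22 | # By the law of large numbers the running proportion converges to pHeads = 0.5
23 | # as the number of flips grows, which is what the plot shows.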
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ## Welcome to the Data Science repository!
2 |
3 | This repository contains code, mostly Jupyter notebooks in Python and R, about data science and statistical inference. The main projects are
4 |
5 | - [covid19 monitoring notebook](https://github.com/luigiselmi/datascience/blob/master/python/covid19/covid19-monitoring-notebook.ipynb) is a single notebook in Python that monitors the spread of the
6 | virus in some European countries.
7 | - [Statistical Learning](https://github.com/luigiselmi/datascience/blob/master/r/stat_learning/chapter1.ipynb) is a collection of notebooks in R with my notes about a course on
8 | statistical inference and worked out examples taken from the book used in the course.
9 | - [Bayesian Inference](https://github.com/luigiselmi/datascience/blob/master/r/rethinking/probability.ipynb) is a collection
10 | of notes about probability and Bayesian inference, and R scripts taken
11 | from the books I am using for this project.
12 | - [copernicus](https://github.com/luigiselmi/datascience/blob/master/python/copernicus/copernicus_services.ipynb) notebooks about climate change using the Copernicus
13 | data.
14 | - [digital image processing](https://github.com/luigiselmi/datascience/blob/master/python/imaging/digital_image_processing.ipynb) notebooks about digital image processing
15 |
--------------------------------------------------------------------------------
/r/rethinking/categorical_variables.R:
--------------------------------------------------------------------------------
1 | # R code 5.34 rethinking book
2 | library(rethinking)
3 | data("Howell1")
4 | d <- Howell1
5 | str(d)
6 |
7 | # we create an index variable to represent the sex
8 | d$sex <- ifelse(d$male == 1, 2, 1)
9 |
10 | # We build a model that uses the index variable for the intercepts
11 | m5.8 <- quap(
12 | alist(
13 | height ~ dnorm( mu , sigma ) ,
14 | mu <- a[sex] ,
15 | a[sex] ~ dnorm( 178 , 20 ) ,
16 | sigma ~ dunif(0, 50)
17 | ) ,
18 | data = d
19 | )
20 | precis(m5.8, depth = 2)
21 |
22 | # Let's extract a sample from the posterior distribution
23 | post <- extract.samples(m5.8)
24 | post$diff_fm <- post$a[, 1] - post$a[, 2]
25 | precis(post, depth = 2)
26 |
27 | # Let's use the primate milk dataset again
28 | data("milk")
29 | d <- milk
30 | unique((d$clade))
31 | d$clade_id <- as.integer(d$clade)
32 | # we build a model to measure the average milk energy in each clade
33 | d$K <- scale(d$kcal.per.g)
34 | m5.9 <- quap(
35 | alist(
36 | K ~ dnorm( mu , sigma ) ,
37 | mu <- a[clade_id] ,
38 | a[clade_id] ~ dnorm( 0 , 0.5 ) ,
39 | sigma ~ dexp(1)
40 | ) ,
41 | data = d
42 | )
43 | labels <- paste("a[", 1:4, "]:", levels(d$clade), sep = "")
44 | plot(precis(m5.9, depth = 2, pars = "a"), labels = labels)
45 |
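46 | # With an index variable each category gets its own intercept a[k], so the same
47 | # prior can be assigned to every category and contrasts (such as diff_fm above)
48 | # are computed directly from the posterior samples.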
--------------------------------------------------------------------------------
/r/rethinking/posterior_predictive_distribution.R:
--------------------------------------------------------------------------------
1 | # R code 3.26 rethinking book
2 | # Once we have a model we want to use it to make predictions.
3 | # In this script we create a model then we collect samples from it
4 | data_size <- 1000 # size of the dataset
5 | # creates a sequence of values between 0 and 1,
6 | # that represent possible values of a parameter
7 | p_grid <- seq(from = 0, to = 1, length.out = data_size)
8 | library(rethinking) # for simplehist()
9 | # creates a sequence of ones that represent the
10 | # prior probability of each possible value of the parameter
11 | prior <- rep(1, data_size)
12 |
13 | # computes the likelihood of each value of the parameter
14 | # under the binomial distribution and the observations
15 | likelihood <- dbinom(6, size = 9, prob = p_grid)
16 | #plot(likelihood)
17 |
18 | # computes the posterior distribution
19 | posterior <- likelihood * prior
20 |
21 | # normalization of the posterior distribution
22 | posterior <- posterior / sum(posterior)
23 |
24 | # creates samples from the posterior distribution
25 | samples_size = 1e4
26 | samples <- sample(p_grid, prob = posterior, size = samples_size, replace = TRUE)
27 |
28 | # there is uncertainty over the parameter so we propagate through all
29 | # the possible values to create predictions
30 | w <- rbinom(1e4, size = 9, prob = samples)
31 |
32 | # plot the predictions
33 | simplehist(w)
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/r/rethinking/b_spline.R:
--------------------------------------------------------------------------------
1 | # R code 4.72 rethinking book
2 | library(rethinking)
3 | data("cherry_blossoms")
4 | d <- cherry_blossoms
5 | precis(d)
6 | # plot temperature against year
7 | plot(d$temp ~ d$year)
8 |
9 | d2 <- d[complete.cases(d$temp), ] # complete cases on temp
10 | num_knots <- 15
11 | knot_list <- quantile(d2$year, probs = seq(0, 1, length.out = num_knots))
12 |
13 | # constructs degree 3, cubic, spline
14 | library(splines)
15 | B <- bs(d2$year,
16 | knots = knot_list[-c(1, num_knots)],
17 | degree = 3, intercept = TRUE)
18 |
19 | # plot the basis functions
20 | plot( NULL , xlim=range(d2$year) , ylim=c(0,1) , xlab="year" , ylab="basis value")
21 | for ( i in 1:ncol(B) ) lines( d2$year , B[,i] )
22 |
23 | # defines the model
24 | m4.7 <- quap(
25 | alist(
26 | T ~ dnorm( mu , sigma ) ,
27 | mu <- a + B %*% w ,
28 | a ~ dnorm(6,10),
29 | w ~ dnorm(0,1),
30 | sigma ~ dexp(1)
31 | ),
32 | data=list( T=d2$temp , B=B ) ,
33 | start=list( w=rep( 0 , ncol(B) ) ) )
34 |
35 | # plot the weighted basis functions
36 | post <- extract.samples(m4.7)
37 | w <- apply( post$w , 2 , mean )
38 | plot( NULL , xlim=range(d2$year) , ylim=c(-2,2) ,
39 | xlab="year" , ylab="basis * weight" )
40 | for ( i in 1:ncol(B) ) lines( d2$year , w[i]*B[,i] )
41 |
42 | # plot mu with 97 % posterior interval
43 | mu <- link( m4.7 )
44 | mu_PI <- apply(mu,2,PI,0.97)
45 | plot( d2$year , d2$temp , col=col.alpha(rangi2,0.3) , pch=16 )
46 | shade( mu_PI , d2$year , col=col.alpha("black",0.5) )
--------------------------------------------------------------------------------
/r/rethinking/normal_distribution.R:
--------------------------------------------------------------------------------
1 | # R code from snippet 4.1 (rethinking book).
2 | # We look at different processes that end up in a normal distribution.
3 | # 1) Normal by addition
4 | # We imagine an individual that takes a certain number of steps,
5 | # whose length is taken from the uniform distribution within an
6 | # interval between -1 and 1, and finally, we look at its final
7 | # position that means we sum up all the steps.
8 | # We repeat this experiment 1000 times. All this can be
9 | # done in R with one single line of code.
10 | pos <- replicate(1000, sum(runif(16, -1, 1)))
11 |
12 | # The important result is that "adding together random values
13 | # from the same distribution, uniform or others, converges to
14 | # a normal distribution.
15 | #hist(pos) # uncomment to plot
16 |
17 | # 2) Normal by multiplication
18 | # Other processes can be described by a multiplication of values
19 | # that fluctuate randomly by a small amount, e.g. 0.1, about a
20 | # central point
21 | deviation <- 0.1
22 | growth <- replicate(10000, prod(1 + runif(12, 0, deviation)))
23 |
24 | # We still get a normal distribution for the process.
25 | library(rethinking)
26 | #dens(growth, norm.comp = TRUE) # uncomment to plot
27 |
28 | # 3) Normal by log-multiplication
29 | # Products of large random deviations are not normally distributed,
30 | # but their logarithms are, since multiplication becomes addition on the log scale.
31 | deviation <- 0.5
32 | log.big <- replicate(10000, log(prod(1 + runif(12, 0, deviation))))
33 | #dens(log.big, norm.comp = TRUE) # uncomment to plot
34 |
35 |
36 | curve(exp(-x^2), from = -3, to = 3) # bell curve, proportional to a Gaussian density
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Data Science
2 | ============
3 | I use this repository for my projects about data science and statistical inference. The main projects are
4 |
5 | - [Python Cookbook](python/README.md) Python data structures, and code examples using the Python core packages for data science: NumPy, Pandas, Matplotlib, SciPy.
6 | - [covid19 monitoring notebook](python/covid19/covid19-monitoring-notebook.ipynb) is a single notebook in Python that monitors the spread of the
7 | virus in some European countries.
8 | - [Statistical Learning](r/stat_learning/chapter1.ipynb) is a collection of notebooks in R with my notes about a course on
9 | statistical inference and worked out examples taken from the book used in the course.
10 | - [Bayesian Inference](r/rethinking/probability.ipynb) is a collection
11 | of notes about probability and Bayesian inference, and R scripts taken
12 | from the books I am using for this project.
13 |
14 | The Digital Image Processing folder has been moved to a new repository [dip](https://github.com/luigiselmi/dip). The copernicus folder has been moved to the new repository [copernicus](https://github.com/luigiselmi/copernicus). The iia folder has been moved to the new [climate]() repository.
15 |
16 | ## Open data
17 | A list of open data sets about science, economics, finance, and health is available [here](datasets.md).
18 |
19 | ## Python packages for data science
20 | [PyMC](https://www.pymc.io/welcome.html), Bayesian probabilistic programming in Python
21 | [KDEpy](https://kdepy.readthedocs.io/en/latest/#), Kernel density estimation for Python
22 | [Pyro](https://pyro.ai/), deep universal probabilistic programming
23 | [pyFFTW](https://pyfftw.readthedocs.io/en/latest/index.html#), fast fourier transform
24 |
--------------------------------------------------------------------------------
/r/stat_learning/test_roc.R:
--------------------------------------------------------------------------------
1 | # Create the data set and plot
2 | set.seed(1)
3 |
4 | # Create the data set and Split in training and test set
5 | X <- matrix(rnorm(200 * 2), ncol = 2) # random sample matrix from a normal distribution with mean = 0 and standard deviation = 1
6 | y <- c(rep(1, 100), rep(2, 100)) # two classes y = 1 and y = 2
7 | X[y == 1, ] <- X[y == 1, ] + 1 # move apart the class with y = 1
8 | train <- sample(200, 100) # random integers for index of train data
9 | dat <- data.frame(x = X, y = as.factor(y))
10 | plot(X, col = y, xlab = "X1", ylab = "X2")
11 |
12 | library(e1071)
13 | # Fit the training data using svm with radial kernel
14 | #svmfit <- svm(y ~ ., data = dat[train, ], kernel = "radial", gamma = 2, cost = 1)
15 | # Fit the training data using svm with linear kernel
16 | svmfit <- svm(y ~ ., data = dat[train, ], kernel = "linear", cost = 6, scale = TRUE)
17 |
18 | plot(svmfit, dat[train, ])
19 |
20 | # Function definition to Plot the ROC curves
21 | library(ROCR)
22 | rocplot <- function(pred, truth, ...) {
23 | predob <- prediction(pred, truth)
24 | perf <- performance(predob, measure = "tpr", x.measure = "fpr")
25 | plot(perf, col = "blue", colorize = TRUE, ...)
26 | auc <- performance(predob, measure = "auc")
27 | abline(a=0, b= 1)
28 | return(auc)
29 | }
30 |
31 | par(mfrow = c(1,2))
32 | # Predict the training data and plot the ROC curve
33 | fitted <- attributes(predict(svmfit, dat[train, ], decision.values = TRUE))$decision.values
34 | auc <- rocplot(fitted, dat[train, "y"], main = "Training Data")
35 | auc@y.values
36 |
37 | # Predict the test data and plot the ROC curve
38 | fitted <- attributes(predict(svmfit, dat[-train, ], decision.values = TRUE))$decision.values
39 | auc <-rocplot(fitted, dat[-train,"y"], main = "Test Data")
40 | auc@y.values
41 |
42 |
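43 | # The AUC is about 0.5 for a random classifier and 1 for a perfect one; comparing
44 | # the training AUC with the test AUC shows how well the fitted SVM generalizes.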
--------------------------------------------------------------------------------
/r/rethinking/ch11_binomial_regression.R:
--------------------------------------------------------------------------------
1 | # R code 11.1 rethinking book
2 | # Binomial regression - Prosocial chimpanzees experiment.
3 | # We want to see whether a chimpanzee shares food with others or not.
4 | library(rethinking)
5 | data("chimpanzees")
6 | d <- chimpanzees
7 |
8 | d$treatment <- 1 + d$prosoc_left + 2 * d$condition
9 |
10 | xtabs(~ treatment + prosoc_left + condition, d)
11 |
12 | m11.1 <- quap(
13 | alist(
14 | pulled_left ~ dbinom( 1 , p ) ,
15 | logit(p) <- a,
16 | a ~ dnorm(0, 1.5)
17 | ) ,
18 | data = d
19 | )
20 |
21 | m11.2 <- quap(
22 | alist(
23 | pulled_left ~ dbinom( 1 , p ) ,
24 | logit(p) <- a + b[treatment] ,
25 | a ~ dnorm(0, 1.5) ,
26 | b[treatment] ~ dnorm(0, 10)
27 | ) ,
28 | data = d
29 | )
30 |
31 | m11.3 <- quap(
32 | alist(
33 | pulled_left ~ dbinom( 1 , p ) ,
34 | logit(p) <- a + b[treatment] ,
35 | a ~ dnorm(0, 1.5) ,
36 | b[treatment] ~ dnorm(0, 0.5)
37 | ) ,
38 | data = d
39 | )
40 |
41 | set.seed(1999)
42 | prior <- extract.prior(m11.3, n = 1e4)
43 | p <- sapply(1:4, function(k) inv_logit(prior$a + prior$b[, k]))
44 | dens(abs(p[, 1] - p[, 2]), adj = 0.1)
45 |
46 | # prior trimmed data list
47 | dat_list <- list(
48 | pulled_left = d$pulled_left ,
49 | actor = d$actor ,
50 | treatment = as.integer(d$treatment)
51 | )
52 |
53 | # particles in 11-dimensional space
54 | m11.4 <- ulam(
55 | alist(
56 | pulled_left ~ dbinom(1, p) ,
57 | logit(p) <- a[actor] + b[treatment] ,
58 | a[actor] ~ dnorm(0, 1.5) ,
59 | b[treatment] ~ dnorm(0, 0.5)
60 | ) ,
61 | data = dat_list, chains = 4
62 | )
63 |
64 | precis(m11.4, depth = 2)
65 |
66 | post <- extract.samples(m11.4)
67 | p_left <- inv_logit(post$a)
68 | labs <- c("R/N", "L/N", "R/P", "L/P")
69 | plot(precis(m11.4, depth = 2, pars = "b"), labels = labs) # not what expected
70 | #plot(precis(as.data.frame(p_left)), xlim = c(0,1)) # error
71 |
--------------------------------------------------------------------------------
/r/rethinking/sampling_from_grid.R:
--------------------------------------------------------------------------------
1 | # R code from snippet 3.2 (rethinking book)
2 | # Usually a model has many parameters and cannot be handled analytically
3 | # so the parameters are computed numerically using samples taken from
4 | # posterior distribution. This script shows how to build a sample from a
5 | # statistical model with only one parameter (like in the Globe example)
6 | # and how to summarize the posterior distribution computing for example
7 | # the value with highest probability or the probability for the parameter
8 | # to have any value below(above) a defined one.
9 | data_size <- 1000 # size of the dataset
10 |
11 | # creates a sequence of values between 0 and 1,
12 | # that represent possible values of a parameter
13 | p_grid <- seq(from = 0, to = 1, length.out = data_size)
14 |
15 | # creates a sequence of ones that represent the
16 | # prior probability of each possible value of the parameter
17 | prob_p <- rep(1, data_size)
18 |
19 | # computes the likelihood of each value of the parameter
20 | # under the binomial distribution and the observations
21 | prob_data <- dbinom(6, size = 9, prob = p_grid)
22 |
23 | # computes the posterior distribution
24 | posterior <- prob_data * prob_p
25 |
26 | # normalization of the posterior distribution
27 | posterior <- posterior / sum(posterior)
28 | #plot(posterior)
29 |
30 | # creates a sample of values from the distribution
31 | samples_size = 1e4
32 | samples <- sample(p_grid, prob = posterior, size = samples_size, replace = TRUE)
33 | #plot(samples) # uncomment to plot the data
34 |
35 | library(rethinking)
36 | # plot the (density) posterior distribution
37 | #dens(samples) # uncomment to plot the data
38 |
39 | # Sampling to summarize
40 | # posterior probability for the parameter to be below 0.5 (without sampling)
41 | sum(posterior[p_grid < 0.5])
42 | # same using the samples
43 | sum(samples < 0.5) / samples_size
44 |
45 | # posterior probability for the parameter between 0.5 and 0.75, using the samples
46 | sum(samples > 0.5 & samples < 0.75) / samples_size
47 |
48 |
49 |
--------------------------------------------------------------------------------
/r/rethinking/waic_information_criteria.R:
--------------------------------------------------------------------------------
1 | # R code 7.33 rethinking book
2 | library(rethinking)
3 | # We want to investigate the influence of body mass (M) and
4 | # brain volume (B) on longevity (L)
5 | data("Primates301")
6 | d <- Primates301
7 | # we standardize the three variables we are going to use
8 | d$log_L <- scale(log(d$longevity))
9 | d$log_B <- scale(log(d$brain))
10 | d$log_M <- scale(log(d$body))
11 | # we look for missing values first
12 | sapply(d[, c("log_L", "log_B", "log_M")], function(x) sum(is.na(x)))
13 | d2 <- d[complete.cases(d$log_L, d$log_M, d$log_B), ] # removed rows with missing values
14 | # Let's define the model to infer the direct influence of brain volume (B) on longevity (L).
15 | # According to the causal graph we have to control the body mass variable (M) to close a
16 | # (pipe) backdoor M->B->L. Controlling a variable means adding it to the model.
17 | m7.8 <- quap(
18 | alist(
19 | log_L ~ dnorm( mu , sigma ) ,
20 | mu <- a + bM * log_M + bB * log_B,
21 | a ~ dnorm(0, 0.1) ,
22 | bM ~ dnorm(0, 0.5),
23 | bB ~ dnorm(0, 0.5),
24 | sigma ~ dexp(1)
25 | ) ,
26 | data = d2
27 | )
28 | # We also define two simpler models to evaluate the accuracy of each one.
29 | m7.9 <- quap(
30 | alist(
31 | log_L ~ dnorm( mu , sigma ) ,
32 | mu <- a + bB * log_B,
33 | a ~ dnorm(0, 0.1) ,
34 | bB ~ dnorm(0, 0.5),
35 | sigma ~ dexp(1)
36 | ) ,
37 | data = d2
38 | )
39 | m7.10 <- quap(
40 | alist(
41 | log_L ~ dnorm( mu , sigma ) ,
42 | mu <- a + bM * log_M ,
43 | a ~ dnorm(0, 0.1) ,
44 | bM ~ dnorm(0, 0.5),
45 | sigma ~ dexp(1)
46 | ) ,
47 | data = d2
48 | )
49 | # We compare the Widely Applicable Information Criterion (WAIC) of all the
50 | # models. The WAIC provides an approximation of the out-of-sample deviance
51 | # of a model. The smaller the WAIC value, the better, because it means the model
52 | # is closer to the target one.
53 | set.seed(301)
54 | compare(m7.8, m7.9, m7.10)
55 | # Let's compare the posterior distributions of the models' parameters
56 | coeftab(m7.8, m7.9, m7.10)
57 | # Let's see how body mass and brain volume correlate
58 | cor(d2$log_B, d2$log_M)
59 |
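60 | # Brain volume and body mass are strongly correlated on the log scale, which is
61 | # why it is hard to separate their individual contributions to longevity.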
--------------------------------------------------------------------------------
/python/stats/data/italy/unemployment_rate_istat_province.csv:
--------------------------------------------------------------------------------
1 | DEN_UTS,unemployment
2 | Imperia,11.13
3 | Taranto,14.959
4 | Cuneo,4.62
5 | Rovigo,9.092
6 | Savona,7.572
7 | Oristano,14.832
8 | Piacenza,6.065
9 | Lecco,5.493
10 | Potenza,8.442
11 | Trieste,5.648
12 | Sondrio,6.484
13 | Brescia,4.891
14 | Matera,7.922
15 | Verbano-Cusio-Ossola,5.808
16 | Grosseto,8.041
17 | Ragusa,15.011
18 | Firenze,6.184
19 | Mantova,4.583
20 | Caltanissetta,16.32
21 | Sassari,13.022
22 | Cremona,4.975
23 | Trento,4.807
24 | Forli-Cesena,5.487
25 | Messina,23.909
26 | Barletta-Andria-Trani,15.602
27 | Como,7.501
28 | Palermo,19.449
29 | Rimini,7.409
30 | Perugia,6.458
31 | La Spezia,9.782
32 | Cagliari,16.409
33 | Siena,5.878
34 | Sud Sardegna,13.095
35 | Bari,9.968
36 | Siracusa,21.373
37 | Padova,5.558
38 | Salerno,15.123
39 | Campobasso,10.46
40 | Alessandria,6.867
41 | Belluno,4.239
42 | Aosta,7.26
43 | Venezia,6.026
44 | Ancona,8.406
45 | Cosenza,18.775
46 | Brindisi,15.8
47 | Genova,7.543
48 | Varese,6.556
49 | Reggio di Calabria,16.597
50 | Fermo,4.822
51 | Latina,10.941
52 | Bologna,4.574
53 | Macerata,7.449
54 | Torino,8.256
55 | Bolzano,3.837
56 | Caserta,15.349
57 | Pescara,11.203
58 | Asti,7.447
59 | Vibo Valentia,19.223
60 | L'Aquila,9.493
61 | Monza Brianza,6.586
62 | Lodi,5.277
63 | Enna,18.169
64 | Chieti,9.651
65 | Roma,9.763
66 | Pavia,6.953
67 | Ferrara,7.299
68 | Catania,15.452
69 | Biella,6.016
70 | Pistoia,9.776
71 | Trapani,15.749
72 | Nuoro,7.269
73 | Vercelli,8.231
74 | Novara,7.676
75 | Benevento,12.774
76 | Arezzo,7.051
77 | Napoli,23.672
78 | Verona,4.67
79 | Udine,6.692
80 | Treviso,4.903
81 | Pesaro e Urbino,5.532
82 | Foggia,21.767
83 | Terni,7.189
84 | Gorizia,7.67
85 | Crotone,20.33
86 | Lecce,15.351
87 | Pordenone,3.253
88 | Parma,5.752
89 | Viterbo,10.851
90 | Isernia,12.119
91 | Avellino,14.471
92 | Vicenza,4.628
93 | Pisa,6.755
94 | Massa-Carrara,11.271
95 | Ravenna,6.204
96 | Bergamo,3.545
97 | Milano,6.467
98 | Catanzaro,16.795
99 | Frosinone,10.876
100 | Ascoli Piceno,8.474
101 | Reggio nell'Emilia,5.146
102 | Rieti,10.378
103 | Prato,7.061
104 | Modena,4.393
105 | Lucca,11.927
106 | Agrigento,21.777
107 | Livorno,5.453
108 | Teramo,6.929
109 |
--------------------------------------------------------------------------------
/r/rethinking/ch9_hamiltonian_monte_carlo.R:
--------------------------------------------------------------------------------
1 | # R code 9.3 rethinking book
2 | # Hamiltonian Monte Carlo example.
3 | # In this example we'll use a two variable model.
4 | # We need:
5 | # 1) a function of the log-probability
6 | # 2) the gradient
7 | # 3) the step size
8 | # 4) the number of leapfrog steps
9 |
10 | # function U of the log-probability of the data.
11 | # Returns the neg-log-probability
12 | U <- function(q, a = 0, b = 1, k = 0, d = 1) {
13 | muy <- q[1]
14 | mux <- q[2]
15 | U <- sum(dnorm(y, muy, 1, log = TRUE)) + sum(dnorm(x, mux, 1, log = TRUE)) +
16 | dnorm(muy, a, b, log = TRUE) + dnorm(mux, k, d, log = TRUE)
17 | return(-U) # negative because energy is the negative log-probability
18 | }
19 | # U gradient function
20 | # Sum of partial derivatives with respect to parameters mux and muy
21 | U_gradient <- function(q, a = 0, b = 1, k = 0, d = 1) {
22 | muy <- q[1]
23 | mux <- q[2]
24 | G1 <- sum(y - muy) + (a - muy) / b^2 # dU/d(muy)
25 | G2 <- sum(x - mux) + (k - mux) / d^2 # dU/d(mux)
26 | return(c(-G1, -G2)) # negative because energy is neg-log-prob
27 | }
28 | # test data
29 | y <- rnorm(50)
30 | x <- rnorm(50)
31 | x <- as.numeric(scale(x))
32 | y <- as.numeric(scale(y))
33 |
34 | library(rethinking); library(shape) # rethinking for HMC2() and col.alpha(), shape for fancy arrows
35 | Q <- list()
36 | Q$q <- c(-0.1,0.2)
37 | pr <- 0.3
38 | plot( NULL , ylab="muy" , xlab="mux" , xlim=c(-pr,pr) , ylim=c(-pr,pr) )
39 | step <- 0.03
40 |
41 | L <- 11 # 0.03/28 for U-turns --- 11 for working example
42 | n_samples <- 4
43 | path_col <- col.alpha("black",0.5)
44 | points( Q$q[1] , Q$q[2] , pch=4 , col="black" )
45 | for ( i in 1:n_samples ) {
46 | Q <- HMC2( U , U_gradient , step , L , Q$q )
47 | if ( n_samples < 10 ) {
48 | for ( j in 1:L ) {
49 | K0 <- sum(Q$ptraj[j,]^2)/2 # kinetic energy
50 | lines( Q$traj[j:(j+1),1] , Q$traj[j:(j+1),2] , col=path_col , lwd=1+2*K0 )
51 | }
52 | points( Q$traj[1:L+1,] , pch=16 , col="white" , cex=0.35 )
53 | Arrows( Q$traj[L,1] , Q$traj[L,2] , Q$traj[L+1,1] , Q$traj[L+1,2] ,
54 | arr.length=0.35 , arr.adj = 0.7 )
55 | text( Q$traj[L+1,1] , Q$traj[L+1,2] , i , cex=0.8 , pos=4 , offset=0.4 )
56 | }
57 | points( Q$traj[L+1,1] , Q$traj[L+1,2] , pch=ifelse( Q$accept==1 , 16 , 1 ) ,
58 | col=ifelse( abs(Q$dH)>0.1 , "red" , "black" ) )
59 | }
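60 |
61 | # HMC2() is provided by the rethinking package; points drawn in red mark proposals
62 | # where the Hamiltonian was not well conserved (|dH| > 0.1).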
--------------------------------------------------------------------------------
/r/rethinking/percentile_intervals.R:
--------------------------------------------------------------------------------
1 | # R code 3.11 rethinking book
2 | # This script shows how to summarize a posterior distributions
3 | # by computing point estimates (mode, mean, median) and
4 | # intervals
5 |
6 | data_size <- 1000 # size of the dataset
7 | # creates a sequence of values between 0 and 1,
8 | # that represent possible values of a parameter
9 | p_grid <- seq(from = 0, to = 1, length.out = data_size)
10 |
11 | # creates a sequence of ones that represent the
12 | # prior probability of each possible value of the parameter
13 | prior <- rep(1, data_size)
14 |
15 | # computes the likelihood of each value of the parameter
16 | # under the binomial distribution and the observations
17 | likelihood <- dbinom(3, size = 3, prob = p_grid)
18 | #plot(likelihood)
19 |
20 | # computes the posterior distribution
21 | posterior <- likelihood * prior
22 |
23 | # normalization of the posterior distribution
24 | posterior <- posterior / sum(posterior)
25 |
26 | # creates a sample of values from the distribution
27 | samples_size = 1e4
28 | samples <- sample(p_grid, prob = posterior, size = samples_size, replace = TRUE)
29 | #plot(samples)
30 | library(rethinking)
31 | dens(samples)
32 |
33 | # computes the probabilities between 25 % and 75 % of the interval
34 | PI(samples, prob = 0.5)
35 |
36 | # computes the 50 % highest density interval, the narrowest interval
37 | # containing the specified probability mass, e.g. 50 %
38 | HPDI(samples, prob = 0.5)
39 |
40 | # computes the parameter value with highest posterior probability (grid approximation)
41 | # it is called mode or maximum a posteriori (MAP)
42 | p_grid[which.max(posterior)]
43 |
44 | # same but from samples taken from the posterior distribution
45 | chainmode(samples, adj = 0.01)
46 |
47 | # computes the mean
48 | mean(samples)
49 |
50 | # computes the median
51 | median(samples)
52 |
53 | # defines a loss function that computes the weighted distance between
54 | # a parameter value x and any other value in the distribution
55 | loss <- function(x) sum(posterior*abs(x - p_grid))
56 |
57 | # computes the loss in assuming 0.5 as the value of the parameter
58 | loss(0.5)
59 |
60 | # computes the total loss for any possible value of the parameter
61 | tot_loss <- sapply(p_grid, loss)
62 |
63 | # finds the parameter value with minimum loss (for absolute loss this is the posterior median)
64 | p_grid[which.min(tot_loss)]
65 |
66 |
67 |
--------------------------------------------------------------------------------
/r/rethinking/post_treatment_bias.R:
--------------------------------------------------------------------------------
1 | # R code 6.16 rethinking book
2 | # We first build a set of simulated data then we build the model
3 | # that will use the data.
4 | # 1) data simulation
5 | set.seed(71)
6 | # number of plants
7 | N <- 100
8 | library(rethinking) # provides quap(), precis() and WAIC()
9 | # simulates initial heights
10 | h0 <- rnorm(N, 10, 2)
11 |
12 | # assign treatments and simulate fungus and growth
13 | treatment <- rep(0:1, each = N / 2)
14 | fungus <- rbinom(N, size = 1, prob = 0.5 - treatment * 0.4)
15 | h1 <- h0 + rnorm(N, 5 - 3 * fungus)
16 |
17 | # compose a dataframe
18 | d <- data.frame(h0 = h0, h1 = h1, treatment = treatment, fungus = fungus)
19 |
20 | precis(d)
21 |
22 | # 2) build the model that contains our hypothetical rule for the plants' height.
23 | # p represents the proportion of growth. In this model it doesn't depend on
24 | # other predictors.
25 | m6.6 <- quap(
26 | alist(
27 | h1 ~ dnorm( mu , sigma ) ,
28 | mu <- p * h0 ,
29 | p ~ dlnorm( 0 , 0.25 ) ,
30 | sigma ~ dexp(1)
31 | ) ,
32 | data = d
33 | )
34 |
35 | precis(m6.6)
36 |
37 | # now we add the two predictors: treatment and fungus
38 | m6.7 <- quap(
39 | alist(
40 | h1 ~ dnorm( mu , sigma ) ,
41 | mu <- p * h0 ,
42 | p <- a + bt * treatment + bf * fungus ,
43 | a ~ dlnorm( 0 , 0.2 ) ,
44 | bt ~ dnorm(0, 0.5) ,
45 | bf ~ dnorm(0, 0.5) ,
46 | sigma ~ dexp(1)
47 | ) ,
48 | data = d
49 | )
50 | # from this model it looks like treatment doesn't have any effect on the plant's growth
51 | precis(m6.7)
52 | # If we want to know the impact of treatment on growth we must remove fungus as
53 | # a predictor
54 | m6.8 <- quap(
55 | alist(
56 | h1 ~ dnorm( mu , sigma ) ,
57 | mu <- p * h0 ,
58 | p <- a + bt * treatment ,
59 | a ~ dlnorm( 0 , 0.2 ) ,
60 | bt ~ dnorm(0, 0.5) ,
61 | sigma ~ dexp(1)
62 | ) ,
63 | data = d
64 | )
65 |
66 | precis(m6.8)
67 | # Let's plot the directed acyclic graph (DAG) of the model
68 | library(dagitty)
69 | plant_dag <- dagitty("dag {
70 | H0 -> H1
71 | F -> H1
72 | T -> F
73 | }"
74 | )
75 | coordinates(plant_dag) <- list(x = c(H0 = 0, T = 2, F = 1.5, H1 = 1),
76 | y = c(H0 = 0, T = 0, F = 1, H1 = 2))
77 | plot(plant_dag)
78 | dseparated(plant_dag, "T", "H1")
79 | dseparated(plant_dag, "T", "H1", "F")
80 |
81 | # We evaluate the model using the WAIC framework (ch.7)
82 | # WAIC provides an estimation of the deviance of the model
83 | # from the target (best) model. The lower it is, the better.
84 | set.seed(11)
85 | WAIC(m6.7)
86 |
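87 | # Note: m6.7 will typically score a better (lower) WAIC than m6.8 even though
88 | # including the post-treatment variable fungus gives the wrong causal answer;
89 | # WAIC measures predictive accuracy, not causal validity.
90 | WAIC(m6.8)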
--------------------------------------------------------------------------------
/python/stats/data/italy/unemployment_rate_istat_province_cod_den_uts.csv:
--------------------------------------------------------------------------------
1 | DEN_UTS,unemployment,COD_UTS
2 | Agrigento,21.777,84
3 | Alessandria,6.867,6
4 | Ancona,8.406,42
5 | Aosta,7.26,7
6 | Arezzo,7.051,51
7 | Ascoli Piceno,8.474,44
8 | Asti,7.447,5
9 | Avellino,14.471,64
10 | Bari,9.968,72
11 | Barletta-Andria-Trani,15.602,110
12 | Belluno,4.239,25
13 | Benevento,12.774,62
14 | Bergamo,3.545,16
15 | Biella,6.016,96
16 | Bologna,4.574,37
17 | Bolzano,3.837,21
18 | Brescia,4.891,17
19 | Brindisi,15.8,74
20 | Cagliari,16.409,92
21 | Caltanissetta,16.32,85
22 | Campobasso,10.46,70
23 | Caserta,15.349,61
24 | Catania,15.452,87
25 | Catanzaro,16.795,79
26 | Chieti,9.651,69
27 | Como,7.501,13
28 | Cosenza,18.775,78
29 | Cremona,4.975,19
30 | Crotone,20.33,101
31 | Cuneo,4.62,4
32 | Enna,18.169,86
33 | Fermo,4.822,109
34 | Ferrara,7.299,38
35 | Firenze,6.184,48
36 | Foggia,21.767,71
37 | Forli-Cesena,5.487,40
38 | Frosinone,10.876,60
39 | Genova,7.543,10
40 | Gorizia,7.67,31
41 | Grosseto,8.041,53
42 | Imperia,11.13,8
43 | Isernia,12.119,94
44 | L'Aquila,9.493,66
45 | La Spezia,9.782,11
46 | Latina,10.941,59
47 | Lecce,15.351,75
48 | Lecco,5.493,97
49 | Livorno,5.453,49
50 | Lodi,5.277,98
51 | Lucca,11.927,46
52 | Macerata,7.449,43
53 | Mantova,4.583,20
54 | Massa-Carrara,11.271,45
55 | Matera,7.922,77
56 | Messina,23.909,83
57 | Milano,6.467,15
58 | Modena,4.393,36
59 | Monza Brianza,6.586,108
60 | Napoli,23.672,63
61 | Novara,7.676,3
62 | Nuoro,7.269,91
63 | Oristano,14.832,95
64 | Padova,5.558,28
65 | Palermo,19.449,82
66 | Parma,5.752,34
67 | Pavia,6.953,18
68 | Perugia,6.458,54
69 | Pesaro e Urbino,5.532,41
70 | Pescara,11.203,68
71 | Piacenza,6.065,33
72 | Pisa,6.755,50
73 | Pistoia,9.776,47
74 | Pordenone,3.253,93
75 | Potenza,8.442,76
76 | Prato,7.061,100
77 | Ragusa,15.011,88
78 | Ravenna,6.204,39
79 | Reggio Calabria,16.597,80
80 | Reggio nell'Emilia,5.146,35
81 | Rieti,10.378,57
82 | Rimini,7.409,99
83 | Roma,9.763,58
84 | Rovigo,9.092,29
85 | Salerno,15.123,65
86 | Sassari,13.022,90
87 | Savona,7.572,9
88 | Siena,5.878,52
89 | Siracusa,21.373,89
90 | Sondrio,6.484,14
91 | Sud Sardegna,13.095,111
92 | Taranto,14.959,73
93 | Teramo,6.929,67
94 | Terni,7.189,55
95 | Torino,8.256,1
96 | Trapani,15.749,81
97 | Trento,4.807,22
98 | Treviso,4.903,26
99 | Trieste,5.648,32
100 | Udine,6.692,30
101 | Varese,6.556,12
102 | Venezia,6.026,27
103 | Verbano-Cusio-Ossola,5.808,103
104 | Vercelli,8.231,2
105 | Verona,4.67,23
106 | Vibo Valentia,19.223,102
107 | Vicenza,4.628,24
108 | Viterbo,10.851,56
109 |
--------------------------------------------------------------------------------
/r/rethinking/polynomial_regression.R:
--------------------------------------------------------------------------------
1 | # R code 4.64 rethinking book
2 | # The same approach used to compute the posterior distribution
3 | # of a variable that depends linearly on another can be used when
4 | # the dependency is not linear. Just as in the linear case the model
5 | # learns from the data the values of the two parameters, slope and
6 | # intercept, of the linear relationship, here it learns the parameters
7 | # of a basis of polynomials that represents the non-linear dependency.
8 | # e.g. mu = a + b1x + b2x^2
9 | library(rethinking)
10 | data("Howell1")
11 | d <- Howell1
12 | str(d)
13 | # plot height against weight
14 | plot(d$height ~ d$weight)
15 |
16 | # defines the model
17 | d$weight_s <- (d$weight - mean(d$weight)) / sd(d$weight) # standardize the predictor variable
18 | d$weight_s2 <- d$weight_s^2
19 | # 2nd order polynomial
20 | m4.5 <- quap(
21 | alist(
22 | height ~ dnorm( mu , sigma ) ,
23 | mu <- a + b1*weight_s + b2*weight_s2 ,
24 | a ~ dnorm( 178 , 20 ) ,
25 | b1 ~ dlnorm( 0 , 1 ) ,
26 | b2 ~ dnorm(0, 1) ,
27 | sigma ~ dunif( 0 , 50 )
28 | ) ,
29 | data=d )
30 |
31 | # 3rd order polynomial
32 | d$weight_s3 = d$weight_s^3
33 | m4.6 <- quap(
34 | alist(
35 | height ~ dnorm( mu , sigma ) ,
36 | mu <- a + b1*weight_s + b2*weight_s2 + b3*weight_s3 ,
37 | a ~ dnorm( 178 , 20 ) ,
38 | b1 ~ dlnorm( 0 , 1 ) ,
39 | b2 ~ dnorm(0, 1) ,
40 | b3 ~ dnorm(0, 1) ,
41 | sigma ~ dunif( 0 , 50 )
42 | ) ,
43 | data=d )
44 |
45 | precis(m4.5)
46 |
47 | # plot the 2nd order polynomial model
48 | weight.seq <- seq(from = -2.2, to = 2, length.out = 30)
49 | pred_dat <- list(weight_s = weight.seq, weight_s2 = weight.seq^2)
50 | mu <- link(m4.5, data = pred_dat)
51 | mu.mean <- apply(mu, 2, mean)
52 | mu.PI <- apply(mu, 2, PI, prob = 0.89)
53 | sim.height <- sim(m4.5, data = pred_dat)
54 | height.PI <- apply(sim.height, 2, PI, prob = 0.89)
55 |
56 | plot(height ~ weight_s, d, col = col.alpha(rangi2, 0.5))
57 | lines(weight.seq, mu.mean)
58 | shade(mu.PI, weight.seq)
59 | shade(height.PI, weight.seq)
60 |
61 |
62 | # plot the 3rd order polynomial model
63 | weight.seq <- seq(from = -2.2, to = 2, length.out = 30)
64 | pred_dat <- list(weight_s = weight.seq, weight_s2 = weight.seq^2, weight_s3 = weight.seq^3)
65 | mu <- link(m4.6, data = pred_dat)
66 | mu.mean <- apply(mu, 2, mean)
67 | mu.PI <- apply(mu, 2, PI, prob = 0.89)
68 | sim.height <- sim(m4.6, data = pred_dat)
69 | height.PI <- apply(sim.height, 2, PI, prob = 0.89)
70 |
71 | plot(height ~ weight_s, d, col = col.alpha(rangi2, 0.5))
72 | lines(weight.seq, mu.mean)
73 | shade(mu.PI, weight.seq)
74 | shade(height.PI, weight.seq)
--------------------------------------------------------------------------------
/r/rethinking/gaussian_model_of_height.R:
--------------------------------------------------------------------------------
1 | # R code 4.7 rethinking book
2 | library(rethinking)
3 | data("Howell1")
4 | d <- Howell1 # dataframe
5 | precis(d)
6 |
7 | d2 <- d[d$age >= 18, ]
8 | # plot the height distribution
9 | #dens(d2$height)
10 |
11 | #curve(dnorm(x, 178, 20), from = 100, to = 250)
12 |
13 | #curve(dunif(x, 0, 50), from = -10, to = 60)
14 |
15 | # prior probability distribution for the mean
16 | sample_mu <- rnorm(1e4, 178, 20)
17 | # prior probability distribution for the standard deviation
18 | sample_sigma <- runif(1e4, 0, 50)
19 | # joint prior probability distribution of height, before seeing the data
20 | prior_h <- rnorm(1e4, sample_mu, sample_sigma)
21 | # plot the joint prior probability distribution
22 | #dens(prior_h)
23 |
24 | # grid approximation of the posterior distribution
25 | mu.list <- seq(from = 150, to = 160, length.out = 100)
26 | sigma.list <- seq(from = 7, to = 9, length.out = 100)
27 | post <- expand.grid(mu = mu.list, sigma = sigma.list)
28 | post$LL <- sapply(1:nrow(post), function(i) sum(
29 | dnorm(d2$height, post$mu[i], post$sigma[i], log = TRUE)))
30 | post$prod <- post$LL + dnorm(post$mu, 178, 20, TRUE) + dunif(post$sigma, 0, 50, TRUE)
31 | post$prob <- exp(post$prod - max(post$prod))
32 |
33 | # plot the posterior distribution
34 | #contour_xyz(post$mu, post$sigma, post$prob)
35 | #image_xyz(post$mu, post$sigma, post$prob)
36 |
37 | sample.rows <- sample(1:nrow(post), size = 1e4, replace = TRUE, prob = post$prob)
38 | sample.mu <- post$mu[sample.rows]
39 | sample.sigma <- post$sigma[sample.rows]
40 | #plot(sample.mu, sample.sigma, cex = 0.5, pch = 16, col = col.alpha(rangi2, 0.5))
41 |
42 | #dens(sample.mu)
43 | #dens(sample.sigma)
44 | HPDI(sample.mu)
45 |
46 | # test the standard deviation with a small sample size
47 | # to see that the sigma posterior distribution is not Gaussian
48 | d3 <- sample(d2$height, size = 20)
49 | # grid approximation of the posterior distribution
50 | mu.list <- seq(from = 150, to = 160, length.out = 100)
51 | sigma.list <- seq(from = 7, to = 9, length.out = 100)
52 | post2 <- expand.grid(mu = mu.list, sigma = sigma.list)
53 | post2$LL <- sapply(1:nrow(post2), function(i) sum(
54 | dnorm(d3, post2$mu[i], post2$sigma[i], log = TRUE)))
55 | post2$prod <- post2$LL + dnorm(post2$mu, 178, 20, TRUE) + dunif(post2$sigma, 0, 50, TRUE)
56 | post2$prob <- exp(post2$prod - max(post2$prod))
57 |
58 | # plot the posterior distribution
59 | #contour_xyz(post$mu, post$sigma, post$prob)
60 | image_xyz(post2$mu, post2$sigma, post2$prob)
61 |
62 | sample2.rows <- sample(1:nrow(post2), size = 1e4, replace = TRUE, prob = post2$prob)
63 | sample2.mu <- post2$mu[sample2.rows]
64 | sample2.sigma <- post2$sigma[sample2.rows]
65 | plot(sample2.mu, sample2.sigma, cex = 0.5, pch = 16, col = col.alpha(rangi2, 0.5),
66 | xlab = 'mu', ylab = 'sigma')
67 | dens(sample2.sigma, norm.comp = TRUE)
68 |
--------------------------------------------------------------------------------
/python/recommendations.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Movie Recommendations\n",
8 | "An example from the book by Toby Segaran \"Programming Collective Intelligence\""
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 2,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import json\n",
18 | "critics = json.load(open('covid19/data/movie-critics.json'))"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 3,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "data": {
28 | "text/plain": [
29 | "4.5"
30 | ]
31 | },
32 | "execution_count": 3,
33 | "metadata": {},
34 | "output_type": "execute_result"
35 | }
36 | ],
37 | "source": [
38 | "critics['Toby']['Snakes on a Plane']"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 4,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "from math import sqrt\n",
48 | "# Returns a distance-based similarity score for person1 and person2\n",
49 | "def sim_distance(prefs,person1,person2):\n",
50 | " # Get the list of shared_items\n",
51 | " si={}\n",
52 | " for item in prefs[person1]:\n",
53 | " if item in prefs[person2]:\n",
54 | " si[item]=1\n",
55 | " # if they have no ratings in common, return 0\n",
56 | " if len(si)==0: return 0\n",
57 | " # Add up the squares of all the differences\n",
58 | " sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2)\n",
59 | " for item in prefs[person1] if item in prefs[person2]])\n",
60 | " return 1/(1+sum_of_squares)"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 5,
66 | "metadata": {},
67 | "outputs": [
68 | {
69 | "data": {
70 | "text/plain": [
71 | "0.14814814814814814"
72 | ]
73 | },
74 | "execution_count": 5,
75 | "metadata": {},
76 | "output_type": "execute_result"
77 | }
78 | ],
79 | "source": [
80 | "sim_distance(critics,'Lisa Rose','Gene Seymour')"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {
87 | "collapsed": true
88 | },
89 | "outputs": [],
90 | "source": []
91 | }
92 | ],
93 | "metadata": {
94 | "kernelspec": {
95 | "display_name": "Python 3 (ipykernel)",
96 | "language": "python",
97 | "name": "python3"
98 | },
99 | "language_info": {
100 | "codemirror_mode": {
101 | "name": "ipython",
102 | "version": 3
103 | },
104 | "file_extension": ".py",
105 | "mimetype": "text/x-python",
106 | "name": "python",
107 | "nbconvert_exporter": "python",
108 | "pygments_lexer": "ipython3",
109 | "version": "3.9.13"
110 | }
111 | },
112 | "nbformat": 4,
113 | "nbformat_minor": 2
114 | }
115 |
--------------------------------------------------------------------------------
/r/rethinking/multicollineariry.R:
--------------------------------------------------------------------------------
1 | # R code 6.2 rethinking book
2 | # We set up a simulation to showcase the problem when using predictors
3 | # that are strongly correlated.
4 | N <- 100
5 | set.seed(909)
6 | height <- rnorm(N, 10, 2) # total height
7 | leg_prop <- runif(N, 0.4, 0.5) # leg as proportion of height
8 | leg_left <- leg_prop * height + rnorm(N, 0, 0.02) # left leg as proportion + error
9 | leg_right <- leg_prop * height + rnorm(N, 0, 0.02) # right leg as proportion + error
10 | d <- data.frame(height, leg_left, leg_right)
11 |
12 | m6.1 <- quap(
13 | alist(
14 | height ~ dnorm( mu , sigma ) ,
15 | mu <- a + bl * leg_left + br * leg_right ,
16 | a ~ dnorm( 10 , 100 ) ,
17 | bl ~ dnorm( 2 , 10 ) ,
18 | br ~ dnorm( 2 , 10 ) ,
19 | sigma ~ dexp(1)
20 | ) ,
21 | data = d
22 | )
23 |
24 | #plot(precis(m6.1)) # doesn't show what it should
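# A possible workaround (a sketch, not from the book): pull the numeric summary
# out of precis() and draw the point estimates by hand with base R. The bl and br
# rows should show the wide, strongly overlapping marginal posteriors.
p <- as.data.frame(precis(m6.1))
dotchart(p$mean, labels = rownames(p), xlab = "posterior mean")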
25 | post <- extract.samples(m6.1)
26 | plot(bl ~ br, post, col = col.alpha(rangi2, 0.1), pch = 16)
27 |
28 | sum_blbr <- post$bl + post$br
29 | dens(sum_blbr, col = rangi2, lwd = 2, xlab = "sum of bl and br")
30 |
31 | # Now we show the problem using real data, when one might not know in
32 | # advance that two predictors are in fact strongly correlated.
33 | # We will model the dependency of the milk total energy content (K)
34 | # from fat (F) and lactose (L).
35 | library(rethinking)
36 | data(milk)
37 | d <- milk
38 | d$K <- scale(d$kcal.per.g) # Kilocalories (energy content)
39 | d$F <- scale(d$perc.fat) # Fat
40 | d$L <- scale(log(d$perc.lactose)) # lactose (a carbohydrate)
41 | # we start by creating two bivariate models. The first investigates
42 | # the dependency of kilocalories (K) from fat (F)
43 | m6.3 <- quap(
44 | alist(
45 | K ~ dnorm( mu , sigma ) ,
46 | mu <- a + bF * F ,
47 | a ~ dnorm( 0 , 0.2 ) ,
48 | bF ~ dnorm( 0 , 0.5 ) ,
49 | sigma ~ dexp(1)
50 | ) ,
51 | data = d
52 | )
53 | # The 2nd model investigates the dependency of kilocalories (K) from lactose (L)
54 | m6.4 <- quap(
55 | alist(
56 | K ~ dnorm( mu , sigma ) ,
57 | mu <- a + bL * L,
58 | a ~ dnorm( 0 , 0.2 ) ,
59 | bL ~ dnorm( 0 , 0.5 ) ,
60 | sigma ~ dexp(1)
61 | ) ,
62 | data = d
63 | )
64 | # mean values show strong correlation of fat (positive) and lactose (negative)
65 | # with kilocalories.
66 | precis(m6.3)
67 | precis(m6.4)
68 | # Now we build a multivariate regression model for kilocalories using fat and lactose together.
69 | m6.5 <- quap(
70 | alist(
71 | K ~ dnorm( mu , sigma ) ,
72 | mu <- a + bF * F + bL * L,
73 | a ~ dnorm( 0 , 0.2 ) ,
74 | bF ~ dnorm( 0 , 0.5 ) ,
75 | bL ~ dnorm( 0 , 0.5 ) ,
76 | sigma ~ dexp(1)
77 | ) ,
78 | data = d
79 | )
80 | # mean values of the two parameters are now smaller, which would imply a weaker contribution to energy,
81 | # and also less precise, as the standard deviations are larger
82 | precis(m6.5)
83 | pairs(~ kcal.per.g + perc.fat + perc.lactose, data = d, col = rangi2) # plot the correlations between each pair of variables
84 | cor(d$perc.fat, d$perc.lactose) # computes the correlation between fat and lactose
85 |
86 |
87 |
--------------------------------------------------------------------------------
/julia/learn_julia.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "9b24ffbf-c78d-4d10-b2c6-78738edd946d",
6 | "metadata": {},
7 | "source": [
8 | "# Learning Julia\n",
9 | "A short introduction to Julia following the book [Think Julia: How to Think Like a Computer Scientist](https://benlauwens.github.io/ThinkJulia.jl/latest/book.html#chap06), by Ben Lauwens and Allen Downey.\r\n"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 2,
15 | "id": "ab83fa29-1392-4e7f-b788-2bd44c50b96b",
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "using Pkg"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "id": "429a410f-195e-48dd-9cda-329b966add5a",
25 | "metadata": {},
26 | "source": [
27 | "## Conditionals"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 17,
33 | "id": "80961d42-be0d-409c-a114-aac2eba71315",
34 | "metadata": {},
35 | "outputs": [
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "x is positive"
41 | ]
42 | }
43 | ],
44 | "source": [
45 | "x = 9\n",
46 | "if x > 0\n",
47 | " print(\"x is positive\")\n",
48 | "end"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 33,
54 | "id": "8768cea0-01a4-4608-a36e-bc57ffece643",
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "name": "stdout",
59 | "output_type": "stream",
60 | "text": [
61 | "a"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "choice = 'a' # this is a Char\n",
67 | "if choice == 'a'\n",
68 | " print(\"a\")\n",
69 | "elseif choice == 'b'\n",
70 | " print(\"b\")\n",
71 | "elseif choice == 'c'\n",
72 | " print(\"c\")\n",
73 | "end"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "id": "0757b71f-3774-4ee2-b61b-8e7ea11eff66",
79 | "metadata": {},
80 | "source": [
81 | "## Functions"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 19,
87 | "id": "48b12e47-ed05-451e-a359-b4706fb5dd25",
88 | "metadata": {},
89 | "outputs": [
90 | {
91 | "data": {
92 | "text/plain": [
93 | "square (generic function with 1 method)"
94 | ]
95 | },
96 | "execution_count": 19,
97 | "metadata": {},
98 | "output_type": "execute_result"
99 | }
100 | ],
101 | "source": [
102 | "function square(a)\n",
103 |     "    # Returns the argument and its square\n",
104 | " b = a * a\n",
105 | " return a, b\n",
106 | "end "
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 23,
112 | "id": "f822833d-f14b-4da7-a8cb-d2ae8cfc4e27",
113 | "metadata": {},
114 | "outputs": [
115 | {
116 | "data": {
117 | "text/plain": [
118 | "(4, 16)"
119 | ]
120 | },
121 | "execution_count": 23,
122 | "metadata": {},
123 | "output_type": "execute_result"
124 | }
125 | ],
126 | "source": [
127 | "square(4)"
128 | ]
129 | }
130 | ],
131 | "metadata": {
132 | "kernelspec": {
133 | "display_name": "Julia 1.5.1",
134 | "language": "julia",
135 | "name": "julia-1.5"
136 | },
137 | "language_info": {
138 | "file_extension": ".jl",
139 | "mimetype": "application/julia",
140 | "name": "julia",
141 | "version": "1.5.1"
142 | }
143 | },
144 | "nbformat": 4,
145 | "nbformat_minor": 5
146 | }
147 |
--------------------------------------------------------------------------------
/r/rethinking/binomial_distribution.R:
--------------------------------------------------------------------------------
1 | # Binomial distribution
2 | # Estimation of the bias of a coin.
3 | # We assume that a coin can be biased so that the probability
4 | # theta that after a toss it shows head or tail
5 | # may not be 0.5. We assume at the beginning that theta can have any
6 | # value between 0 and 1 but with different probabilities. We have some
7 | # sound, even if incomplete, information about the plausible values of
8 | # theta. In our example we assume a triangular prior p(theta). We use
9 | # the binomial distribution as the likelihood. Then we set the sample
10 | # size n and we set the number of successes k. Finally, we compute the
11 | # posterior distribution as the product of the prior and the likelihood
12 | # distributions.
13 |
14 | # Example 1 from Kruschke ch.5 par. 5.3
15 | # range of values of the parameter
16 | theta <- seq(from = 0, to = 1, by = 0.1)
17 |
18 | # define the prior distribution for each value of theta according to our
19 | # knowledge before seeing the data.
20 | p1 <- 0.4 * theta[1:6]
21 | p2 <- 0.4 - 0.4 * theta[7:11]
22 | prior <- c(p1,p2)
23 |
24 | plot(theta, prior, type = "h", col = "skyblue")
25 |
26 | # sample the likelihood at each value of the parameter theta
27 | # for one toss. The binomial distribution used as the likelihood
28 | # is also called Bernoulli distribution when the sample size n = 1.
29 | # The way in which we extract the sample from the likelihood distribution
30 | # is called grid approximation because the elements of the sample are taken
31 | # from one data point and a set of equally spaced values of theta. This
32 | # approximation works because we are dealing with only one parameter and
33 | # values in a limited interval. Other approximation methods are quadratic approximation and MCMC.
34 | n = 1 # sample size
35 | k = 1 # number of success events in the sample
36 | likelihood <-dbinom(k, size = n, prob = theta)
37 | plot(theta, likelihood, ylab = "likelihood p(x | theta)", type = "h", col = "skyblue")
38 |
39 | # compute the marginal likelihood p(D)
40 | marginal <- sum(likelihood * prior)
41 |
42 | # compute the posterior distribution for theta using the Bayes rule
43 | posterior <- likelihood * prior / marginal
44 |
45 | # compute the posterior mode (value with most occurrences)
46 | mode_posterior <- theta[which.max(posterior)]
47 |
48 | plot(theta, posterior, ylab = "posterior p(theta | x)", type = "h", col = "skyblue")
49 |
50 | # Example 2 from Kruschke ch.5 par. 5.3.1
51 | # Influence of sample size.
52 | theta <- seq(from = 0, to = 1, by = 0.001)
53 | # define the prior distribution for each value of theta
54 | p1 <- 0.4 * theta[1:500]
55 | p2 <- 0.4 - 0.4 * theta[501:1001]
56 | prior <- c(p1,p2)
57 | plot(theta, prior, ylab = "prior_1000", type = "h", col = "skyblue")
58 |
59 | # compute likelihood at each value of the parameter theta
60 | n = 40 # sample size
61 | k = 10 # number of success events in the sample
62 | likelihood <-dbinom(k, size = n, prob = theta)
63 |
64 | # compute the likelihood mode (value with most occurrences)
65 | mode_likelihood <- theta[which.max(likelihood)]
66 |
67 | plot(theta, likelihood, ylab = "likelihood_1000 p(x | theta)", type = "h", col = "skyblue")
68 | text( .5 , 0.1 , paste("mode =", mode_likelihood))
69 |
70 | # compute the marginal likelihood p(D)
71 | marginal <- sum(likelihood * prior)
72 |
73 | # compute the posterior distribution for theta
74 | posterior <- likelihood * prior / marginal
75 |
76 | # compute the posterior mode (value with most occurrences)
77 | mode_posterior <- theta[which.max(posterior)]
78 |
79 | plot(theta, posterior, ylab = "posterior_1000 p(theta | x)", type = "h", col = "skyblue")
80 | text( .7 , 0.0020 , paste("mode =", mode_posterior))
81 |
82 |
--------------------------------------------------------------------------------
/r/rethinking/ch8_continuous_interactions.R:
--------------------------------------------------------------------------------
1 | # R code 8.19 rethinking book
2 | # We create two models of the bloom of flowers that depends
3 | # on water and shade. The first model is a linear combination
4 | # of the two, the second model contains an interaction term
5 | # between water and shade.
6 | library(rethinking)
7 | data("tulips")
8 | d <- tulips
9 | str(d)
10 | # standardize the variables
11 | d$blooms_std <- d$blooms / max(d$blooms)
12 | d$water_cent <- d$water - mean(d$water)
13 | d$shade_cent <- d$shade - mean(d$shade)
14 |
15 | # after some reasoning about the plausible values of the parameters'
16 | # prior distribution, we have the 1st model (without interaction)
17 | m8.6 <- quap(
18 | alist(
19 | blooms_std ~ dnorm( mu , sigma ) ,
20 | mu <- a + bw * water_cent + bs * shade_cent,
21 | a ~ dnorm(0.5, 0.25) ,
22 | bw ~ dnorm(0, 0.25),
23 | bs ~ dnorm(0, 0.25),
24 | sigma ~ dexp(1)
25 | ) ,
26 | data = d
27 | )
28 |
29 | # after some additional thinking about the parameter of the interaction
30 | # term we have the 2nd model
31 | m8.7 <- quap(
32 | alist(
33 | blooms_std ~ dnorm( mu , sigma ) ,
34 | mu <- a + bw * water_cent + bs * shade_cent + bws * water_cent * shade_cent,
35 | a ~ dnorm(0.5, 0.25) ,
36 | bw ~ dnorm(0, 0.25),
37 | bs ~ dnorm(0, 0.25),
38 | bws ~ dnorm(0, 0.25),
39 | sigma ~ dexp(1)
40 | ) ,
41 | data = d
42 | )
43 |
44 | # Let's plot a triptych ("trittico" in Italian) of the posterior prediction for blooms for the
45 | # 1st model (without interaction). The plot will show the relation between water and blooms for
46 | # three different values of shade.
47 | par(mfrow = c(1,3)) # 3 plots in a row
48 | for (s in -1:1) {
49 | idx <- which(d$shade_cent == s)
50 | plot(d$water_cent[idx], d$blooms_std[idx], xlim = c(-1,1), ylim = c(0,1),
51 | xlab = "water", ylab = "blooms", pch = 16, col = rangi2)
52 | title(paste("shade ", s))
53 | mu <- link(m8.6, data = data.frame(shade_cent = s, water_cent = -1:1))
54 | for (i in 1:20)
55 | lines(-1:1, mu[i, ], col = col.alpha("black", 0.3))
56 | }
57 |
58 | # Now we plot a triptych of the posterior prediction for blooms for the 2nd model (with interaction).
59 | # Again, the plot will show the relation between water and blooms for three different values of shade.
60 | par(mfrow = c(1,3)) # 3 plots in a row
61 | for (s in -1:1) {
62 | idx <- which(d$shade_cent == s)
63 | plot(d$water_cent[idx], d$blooms_std[idx], xlim = c(-1,1), ylim = c(0,1),
64 | xlab = "water", ylab = "blooms", pch = 16, col = rangi2)
65 | title(paste("shade ", s))
66 | mu <- link(m8.7, data = data.frame(shade_cent = s, water_cent = -1:1))
67 | for (i in 1:20)
68 | lines(-1:1, mu[i, ], col = col.alpha("black", 0.3))
69 | }
70 |
71 | # Now we plot the prior predictive simulations
72 | set.seed(7)
73 | prior <- extract.prior(m8.6)
74 | par(mfrow = c(1,3)) # 3 plots in a row
75 | for (s in -1:1) {
76 | idx <- which(d$shade_cent == s)
77 | plot(d$water_cent[idx], d$blooms_std[idx], xlim = c(-1,1), ylim = c(0,1),
78 | xlab = "water", ylab = "blooms", pch = 16, col = rangi2)
79 | title(paste("shade ", s))
80 | mu <- link(m8.6, data = data.frame(shade_cent = s, water_cent = -1:1), post = prior)
81 | for (i in 1:20)
82 | lines(-1:1, mu[i, ], col = col.alpha("black", 0.3))
83 | }
84 |
85 |
86 | prior <- extract.prior(m8.7)
87 | par(mfrow = c(1,3)) # 3 plots in a row
88 | for (s in -1:1) {
89 | idx <- which(d$shade_cent == s)
90 | plot(d$water_cent[idx], d$blooms_std[idx], xlim = c(-1,1), ylim = c(0,1),
91 | xlab = "water", ylab = "blooms", pch = 16, col = rangi2)
92 | title(paste("shade ", s))
93 | mu <- link(m8.7, data = data.frame(shade_cent = s, water_cent = -1:1), post = prior)
94 | for (i in 1:20)
95 | lines(-1:1, mu[i, ], col = col.alpha("black", 0.3))
96 | }
--------------------------------------------------------------------------------
/r/rethinking/linear_prediction.R:
--------------------------------------------------------------------------------
1 | # R code 4.37 rethinking book
2 | library(rethinking)
3 | data("Howell1")
4 | d <- Howell1
5 | d2 <- d[d$age >= 18,]
6 | #plot(d2$height ~ d2$weight) # uncomment to plot
7 |
8 | set.seed(2971)
9 | N <- 100
10 | a <- rnorm(N, 178, 20)
11 | #b <- rnorm(N, 0, 10)
12 |
13 | b <- rlnorm(1e4, 0, 1)
14 | dens(b, xlim = c(0,5), adj = 0.1) # plot the log-normal distribution
15 |
16 | # plot N lines with random slopes and intercepts
17 | b <- rlnorm(N, 0, 1) # limits the slope to positive values (log-normal distribution)
18 | plot( NULL , xlim=range(d2$weight) , ylim=c(-100,400) , xlab="weight" , ylab="height" )
19 | abline( h=0 , lty=2) # horizontal line
20 | abline( h=272 , lty=1 , lwd=0.5 ) # horizontal line, height of the tallest known person
21 | mtext( "log(b) ~ dnorm(0,1)" )
22 | xbar <- mean(d2$weight)
23 | for ( i in 1:N ) curve( a[i] + b[i]*(x - xbar) ,
24 | from=min(d2$weight) , to=max(d2$weight) , add=TRUE ,
25 | col=col.alpha("black",0.2) )
26 |
27 | # load data again, since it's a long way back
28 | library(rethinking)
29 | data(Howell1)
30 | d <- Howell1
31 | d2 <- d[ d$age >= 18 , ]
32 | # define the average weight, x-bar
33 | xbar <- mean(d2$weight)
34 | # fit model
35 | m4.3 <- quap(
36 | alist(
37 | height ~ dnorm( mu , sigma ) ,
38 | mu <- a + b*( weight - xbar ) ,
39 | a ~ dnorm( 178 , 20 ) ,
40 | b ~ dlnorm( 0 , 1 ) ,
41 | sigma ~ dunif( 0 , 50 )
42 | ) ,
43 | data=d2 )
44 |
45 | # plots the line using mean values for alpha (a) and beta (b)
46 | plot(height ~ weight, data = d2, col = rangi2)
47 | post <- extract.samples(m4.3)
48 | a_map <- mean(post$a)
49 | b_map <- mean(post$b)
50 | curve(a_map + b_map*(x - xbar), add = TRUE)
51 |
52 | # we look at the marginal posterior distributions of the parameters
53 | precis(m4.3) # mean and standard deviations
54 | round(vcov(m4.3), 3) # covariances
55 | #pairs(m4.3) #
56 | # samples from the posterior distribution (height)
57 | post <- extract.samples(m4.3)
58 |
59 | N <- 352
60 | dN <- d2[1:N, ]
61 | mN <- quap(
62 | alist(
63 | height ~ dnorm(mu, sigma),
64 | mu <- a + b*(weight - mean(weight)),
65 | a ~ dnorm(178, 20),
66 | b ~ dlnorm(0, 1),
67 | sigma ~ dunif(0, 50)
68 | ), data = dN
69 | )
70 | # extract 20 samples from posterior
71 | post <- extract.samples(mN, n = 20)
72 | # display raw data and sample size
73 | plot(dN$weight, dN$height,
74 | xlim = range(d2$weight), ylim = range(d2$height),
75 | col = rangi2, xlab = "weight", ylab = "height")
76 | mtext(concat("N = ", N))
77 |
78 | # plot the lines from the sample, with transparency
79 | for (i in 1:20)
80 | curve(post$a[i] + post$b[i]*(x - mean(dN$weight)),
81 | col = col.alpha("black", 0.3), add = TRUE)
82 |
83 | # plotting regression intervals and contours
84 | # first for weight = 50
85 | post <- extract.samples(m4.3)
86 | mu_at_50 <- post$a + post$b * (50 - xbar)
87 | dens(mu_at_50, col = rangi2, lwd = 2, xlab = "mu|weight=50")
88 | PI(mu_at_50, prob = 0.89)
89 |
90 | mu <- link(m4.3)
91 | str(mu)
92 |
93 | # we want to compute the distribution of the height (mu) for each
94 | # value of the weight
95 | # define sequence of weights to compute predictions for
96 | # these values will be on the horizontal axis
97 | weight.seq <- seq(from = 25, to = 70, by = 1) # 46 weight values
98 | # use link to compute mu
99 | # for each sample from posterior and for each weight in weight.seq
100 | mu <- link(m4.3, data = data.frame(weight = weight.seq))
101 | #str(mu)
102 | plot(height ~ weight, d2, type = "n")
103 | for (i in 1:100)
104 | points(weight.seq, mu[i,], pch = 16, col = col.alpha(rangi2, 0.1))
105 |
106 | # summarize the distribution for each weight
107 | mu.mean <- apply(mu, 2, mean)
108 | mu.PI <- apply(mu, 2, PI, prob = 0.89)
109 | mu.HPDI <- apply(mu, 2, HPDI, prob = 0.89)
110 |
111 | # plot the summaries on top of the data
112 | # fading out points to make line and interval more visible
113 | plot(height ~ weight, data = d2, col = col.alpha(rangi2, 0.5))
114 | # plot the MAP line, aka the mean mu for each weight
115 | lines(weight.seq, mu.mean)
116 | # plot a shaded region for 89 % PI
117 | shade(mu.PI, weight.seq)
118 |
119 | # Prediction intervals R code 4.60
120 | sim.height <- sim(m4.3, data = list(weight = weight.seq))
121 | str(sim.height)
122 | height.PI <- apply(sim.height, 2, PI, prob = 0.89)
123 | # plot raw data
124 | plot(height ~ weight, d2, col = col.alpha(rangi2, 0.5))
125 | # draw MAP line
126 | lines(weight.seq, mu.mean)
127 | # draw HPDI region for line
128 | shade(mu.HPDI, weight.seq)
129 | # draw PI region for simulated heights
130 | shade(height.PI, weight.seq)
131 |
132 |
--------------------------------------------------------------------------------
/r/stat_learning/chapter1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# An Introduction to Statistical Learning\n",
8 |     "This is a collection of Jupyter notebooks from the course [Statistical Learning](https://courses.edx.org/courses/course-v1:StanfordOnline+STATSX0001+1T2020/course/) taught by Prof. Trevor Hastie and Prof. Robert Tibshirani, offered by StanfordOnline on the edX platform. The course is based on the book [An Introduction to Statistical Learning with Applications in R](http://faculty.marshall.usc.edu/gareth-james/ISL/code.html) by Gareth James et al. The notebooks are based on the Lab exercises at the end of each chapter in the book. The programming language is R. Each notebook begins with a short summary of the topics discussed in the online course and in the related chapters of the book, and then proceeds with the exercises. I always need more than one book to learn a subject; in this case my main additional textbooks have been \n",
9 | " \n",
10 | " - [McElreath - Statistical Rethinking, 2nd edition](https://github.com/rmcelreath/statrethinking_winter2019)\n",
11 |     " - [Gelman et al. - Bayesian Data Analysis, 3rd edition](http://www.stat.columbia.edu/~gelman/book/)\n",
12 | " - [DeGroot et al. - Probability and Statistics, 4th edition](https://www.amazon.com/Probability-Statistics-4th-Morris-DeGroot/dp/0321500466)\n",
13 | "\n",
14 |     "I wrote these notebooks because I think that writing helps clarify the subject under discussion. I hope that my summaries of the chapters still make sense and will help me quickly recall the material learned. \n",
15 | "\n",
16 | "---\n",
17 | "\n",
18 | "The course is mostly about **supervised learning**, in which we have a data set of observations $x_i$ with labels $y_i$ that we can use to fit our model \n",
19 | "\n",
20 | "$$y = f(x)$$\n",
21 | "\n",
22 |     "in order to predict the label of a new observation. The label, or response, can be a numerical value, as in **regression problems**, or a category, as in **classification problems**. The last chapter of the book provides an introduction to two **unsupervised learning** methods in which we have observations but no labels and our goal is to see whether there is some structure or pattern in the data. As can be seen from the chapters, half of the book is devoted to **linear models**, in which we represent the relationship between the $p$ predictors of the $i$-th observation and the response by a linear function\n",
23 | "\n",
24 | "$$y_i = \\beta_0 + \\beta_1 x_{i1} + \\beta_2 x_{i2} + ... + \\beta_p x_{ip}$$\n",
25 | "\n",
26 |     "and our goal is to use the available observations to learn the values of the parameters $\\beta$. The 2nd part of the book presents different ways to overcome the limits of linear models by adding higher order terms to the linear functions (polynomials and splines). The goal with models that include higher order terms will be to learn their parameters in addition to the parameters of the linear terms \n",
27 | "\n",
28 | "$$y_i = \\beta_0 + \\beta_1 x_i + \\beta_2 x_i^2 + \\beta_3 x_i^3 + ... + \\beta_d x_i^d$$\n",
29 | "\n",
30 |     "A different approach is to use non-parametric methods, which find rules or similarities in the data without an explicit mathematical model, as in **decision trees**, **splines** or **K-Means**. Support vector machines offer a geometrical approach mostly used in classification tasks. A theme that cuts across all the techniques discussed in the course and in the book is **overfitting**, when our model performs well on the training data but much worse on new observations, and how to overcome it.\n",
31 | "\n",
32 | "---\n",
33 | "\n",
34 | "1. Introduction\n",
35 | "2. [Statistical Learning](chapter2.ipynb)\n",
36 | "3. [Linear Regression](chapter3.ipynb)\n",
37 | "4. [Classification](chapter4.ipynb)\n",
38 | "5. [Resampling Methods](chapter5.ipynb)\n",
39 | "6. [Linear Model Selection and Regularization](chapter6.ipynb)\n",
40 | "7. [Moving Beyond Linearity](chapter7.ipynb)\n",
41 | "8. [Tree-Based Methods](chapter8.ipynb)\n",
42 | "9. [Support Vector Machines](chapter9.ipynb)\n",
43 | "10. [Unsupervised Learning](chapter10.ipynb)"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": []
52 | }
53 | ],
54 | "metadata": {
55 | "kernelspec": {
56 | "display_name": "R",
57 | "language": "R",
58 | "name": "ir"
59 | },
60 | "language_info": {
61 | "codemirror_mode": "r",
62 | "file_extension": ".r",
63 | "mimetype": "text/x-r-source",
64 | "name": "R",
65 | "pygments_lexer": "r",
66 | "version": "3.6.1"
67 | }
68 | },
69 | "nbformat": 4,
70 | "nbformat_minor": 4
71 | }
72 |
--------------------------------------------------------------------------------
/r/rethinking/overfitting.R:
--------------------------------------------------------------------------------
1 | # R code 7.1 rethinking book
2 | sppnames <- c("afarensis", "africanus", "habilis", "boisei",
3 | "rudolfensis", "ergaster", "sapiens")
4 | brainvolcc <- c(438, 452, 612, 521, 752, 871, 1350)
5 | masskg <- c(37.0, 35.5, 34.5, 41.5, 55.5, 61.0, 53.5)
6 | d <- data.frame(species = sppnames, brain = brainvolcc, mass = masskg)
7 | # We start modeling the relationship between brain volumes and body mass
8 | # as a linear function.
9 | # We standardize the variables
10 | library(rethinking)
11 | d$mass_std <- (d$mass - mean(d$mass)) / sd(d$mass)
12 | d$brain_std <- d$brain / max(d$brain)
13 | m7.1 <- quap(
14 | alist(
15 | brain_std ~ dnorm( mu , exp(log_sigma) ) ,
16 | mu <- a + b * mass_std ,
17 | a ~ dnorm(0.5, 1) ,
18 | b ~ dnorm(0, 10),
19 | log_sigma ~ dnorm(0, 1)
20 | ) ,
21 | data = d
22 | )
23 |
24 | # We see how close the model gets to the data. The frequentist idea
25 | # is that the model is better the smaller the variance of the residuals
26 | # is relative to the variance of the data itself. A residual is the
27 | # difference between an observation and the value predicted by the model.
28 | set.seed(12)
29 | s <- sim(m7.1)
30 | r <- apply(s, 2, mean) - d$brain_std
31 | resid_var <- var2(r)
32 | outcome_var <- var2(d$brain_std)
33 | 1 - resid_var / outcome_var # computes R^2
34 |
35 | R2_is_bad <- function(quap_fit) {
36 | s <- sim(quap_fit, refresh = 0)
37 | r <- apply(s, 2, mean) - d$brain_std # residuals
38 |   1 - var2(r) / var2(d$brain_std)
39 | }
40 |
41 | # We want to compare how different models fit the data. We build
42 | # them using polynomials
43 | m7.2 <- quap(
44 | alist(
45 | brain_std ~ dnorm( mu , exp(log_sigma) ) ,
46 | mu <- a + b[1] * mass_std + b[2] * mass_std^2 ,
47 | a ~ dnorm(0.5, 1) ,
48 | b ~ dnorm(0, 10),
49 | log_sigma ~ dnorm(0, 1)
50 | ) ,
51 | data = d, start = list(b = rep(0, 2))
52 | )
53 | m7.3 <- quap(
54 | alist(
55 | brain_std ~ dnorm( mu , exp(log_sigma) ) ,
56 | mu <- a + b[1] * mass_std + b[2] * mass_std^2 + b[3] * mass_std^3,
57 | a ~ dnorm(0.5, 1) ,
58 | b ~ dnorm(0, 10),
59 | log_sigma ~ dnorm(0, 1)
60 | ) ,
61 | data = d, start = list(b = rep(0, 3))
62 | )
63 | m7.4 <- quap(
64 | alist(
65 | brain_std ~ dnorm( mu , exp(log_sigma) ) ,
66 | mu <- a + b[1] * mass_std + b[2] * mass_std^2 + b[3] * mass_std^3 + b[4] * mass_std^4,
67 | a ~ dnorm(0.5, 1) ,
68 | b ~ dnorm(0, 10),
69 | log_sigma ~ dnorm(0, 1)
70 | ) ,
71 | data = d, start = list(b = rep(0, 4))
72 | )
73 | m7.5 <- quap(
74 | alist(
75 | brain_std ~ dnorm( mu , exp(log_sigma) ) ,
76 | mu <- a + b[1] * mass_std + b[2] * mass_std^2 + b[3] * mass_std^3 + b[4] * mass_std^4 + b[5] * mass_std^5 ,
77 | a ~ dnorm(0.5, 1) ,
78 | b ~ dnorm(0, 10),
79 | log_sigma ~ dnorm(0, 1)
80 | ) ,
81 | data = d, start = list(b = rep(0, 5))
82 | )
83 | m7.6 <- quap(
84 | alist(
85 | brain_std ~ dnorm( mu , 0.001 ) ,
86 | mu <- a + b[1] * mass_std + b[2] * mass_std^2 +
87 | b[3] * mass_std^3 + b[4] * mass_std^4 +
88 | b[5] * mass_std^5 + b[6] * mass_std^6 ,
89 | a ~ dnorm(0.5, 1) ,
90 | b ~ dnorm(0, 10),
91 | log_sigma ~ dnorm(0, 1)
92 | ) ,
93 | data = d, start = list(b = rep(0, 6))
94 | )
95 | # plot the models
96 | # linear model
97 | post <- extract.samples(m7.1)
98 | mass_seq <- seq(from = min(d$mass_std), to = max(d$mass_std), length.out = 100)
99 | l <- link(m7.1, data = list(mass_std = mass_seq))
100 | mu <- apply(l, 2, mean)
101 | ci <- apply(l, 2, PI)
102 | plot(brain_std ~ mass_std, data = d)
103 | lines(mass_seq, mu)
104 | shade(ci, mass_seq)
105 | R2_is_bad(m7.1)
106 | # order 2 polynomial model
107 | post <- extract.samples(m7.2)
108 | l <- link(m7.2, data = list(mass_std = mass_seq))
109 | mu <- apply(l, 2, mean)
110 | ci <- apply(l, 2, PI)
111 | plot(brain_std ~ mass_std, data = d)
112 | lines(mass_seq, mu)
113 | shade(ci, mass_seq)
114 | R2_is_bad(m7.2)
115 | # order 5 polynomial model
116 | post <- extract.samples(m7.5)
117 | l <- link(m7.5, data = list(mass_std = mass_seq))
118 | mu <- apply(l, 2, mean)
119 | ci <- apply(l, 2, PI)
120 | plot(brain_std ~ mass_std, data = d)
121 | lines(mass_seq, mu)
122 | shade(ci, mass_seq)
123 | R2_is_bad(m7.5)
124 | # order 6 polynomial model
125 | post <- extract.samples(m7.6)
126 | l <- link(m7.6, data = list(mass_std = mass_seq))
127 | mu <- apply(l, 2, mean)
128 | ci <- apply(l, 2, PI)
129 | plot(brain_std ~ mass_std, data = d)
130 | lines(mass_seq, mu)
131 | shade(ci, mass_seq)
132 | R2_is_bad(m7.6)
133 | # ordinary least squares (uses the R lm() function)
134 | m7.1_OLS <- lm(brain_std ~ mass_std, data = d)
135 | post <- extract.samples(m7.1_OLS) # how to plot this ?
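# A possible answer to the question above (a sketch, not from the book): base R's
# abline() accepts an lm fit directly and draws its least-squares line, so the
# OLS fit can be compared visually with the quap fits plotted earlier.
plot(brain_std ~ mass_std, data = d)
abline(m7.1_OLS, lty = 2)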
136 |
137 | # Measure distance from target model
138 | set.seed(1)
139 | sapply(list(m7.1, m7.2, m7.3, m7.4, m7.5, m7.6), function(m) sum(lppd(m)))
--------------------------------------------------------------------------------
/r/rethinking/spurious_association.R:
--------------------------------------------------------------------------------
1 | # R code 5.1 rethinking book
2 | # load data
3 | library(rethinking)
4 | data(WaffleDivorce)
5 | d <- WaffleDivorce
6 |
7 | # standardize variables
8 | d$A <- scale(d$MedianAgeMarriage)
9 | d$D <- scale(d$Divorce)
10 |
11 | # 1) builds the model divorce rate D - age of marriage A
12 | m5.1 <- quap(
13 | alist(
14 | D ~ dnorm( mu , sigma ) ,
15 | mu <- a + bA * A ,
16 | a ~ dnorm( 0 , 0.2 ) ,
17 | bA ~ dnorm( 0 , 0.5 ) ,
18 | sigma ~ dexp(1)
19 | ) ,
20 | data = d
21 | )
22 |
23 | # plot the priors
24 | set.seed(10)
25 | prior <- extract.prior(m5.1)
26 | mu <- link(m5.1, post = prior, data = list(A = c(-2,2)))
27 | plot(NULL, xlim = c(-2,2), ylim = c(-2,2))
28 | for (i in 1:50)
29 | lines(c(-2,2), mu[i, ], col = col.alpha("black", 0.4))
30 |
31 | # compute percentile interval of mean
32 | A_seq <- seq(from = -3, to = 3.2, length.out = 30)
33 | mu <- link(m5.1, data = list(A = A_seq))
34 | mu.mean <- apply(mu, 2, mean)
35 | mu.PI <- apply(mu, 2, PI)
36 |
37 | # plot the posterior predictions
38 | plot(D ~ A, data = d, col = rangi2)
39 | lines(A_seq, mu.mean, lwd = 2)
40 | shade(mu.PI, A_seq)
41 |
42 | precis(m5.1)
43 |
44 | # 2) builds the model divorce rate D - marriage rate M
45 | # standardize variable marriage rate
46 | d$M <- scale(d$Marriage)
47 | m5.2 <- quap(
48 | alist(
49 | D ~ dnorm( mu , sigma ) ,
50 | mu <- a + bM * M ,
51 | a ~ dnorm( 0 , 0.2 ) ,
52 | bM ~ dnorm( 0 , 0.5 ) ,
53 | sigma ~ dexp(1)
54 | ) ,
55 | data = d
56 | )
57 |
58 | # compute percentile interval of mean
59 | M_seq <- seq(from = -3, to = 3.2, length.out = 30)
60 | mu <- link(m5.2, data = list(M = M_seq))
61 | mu.mean <- apply(mu, 2, mean)
62 | mu.PI <- apply(mu, 2, PI)
63 |
64 | # plot the posterior predictions
65 | plot(D ~ M, data = d, col = rangi2)
66 | lines(M_seq, mu.mean, lwd = 2)
67 | shade(mu.PI, M_seq)
68 |
69 | # draw a directed acyclic graph (DAG) that represents
70 | # a causal relationship between the variables
71 | #install.packages('dagitty')
72 | library(dagitty)
73 | dag5.1 <- dagitty("dag {
74 | A -> D
75 | A -> M
76 | M -> D
77 | }"
78 | )
79 | coordinates(dag5.1) <- list(x = c(A = 0, D = 1, M = 2), y = c(A = 0, D = 1, M = 0))
80 | plot(dag5.1)
81 |
82 | # 3) multiple regression model
83 | m5.3 <- quap(
84 | alist(
85 | D ~ dnorm( mu , sigma ) ,
86 | mu <- a + bM * M + bA * A,
87 | a ~ dnorm( 0 , 0.2 ) ,
88 | bA ~ dnorm( 0 , 0.5 ) ,
89 | bM ~ dnorm( 0 , 0.5 ) ,
90 | sigma ~ dexp(1)
91 | ) ,
92 | data = d
93 | )
94 | precis(m5.3)
95 | # plot the posterior distributions of the two weights (parameters)
96 | # for age of marriage (bA) and marriage rate (bM) to see the changes
97 | # from bivariate models (m5.1, m5.2) to a multivariate model (m5.3)
98 | # compute percentile interval of mean
99 | #plot(coeftab(m5.1, m5.2, m5.3), par = c("bA", "bM")) # this line of code fails.
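# A workaround sketch (not from the book), assuming a visual comparison of the
# slopes is all that is needed: overlay the posterior densities of bA and bM from
# the multivariate model m5.3 using base R instead of the coeftab plot.
post5.3 <- extract.samples(m5.3)
plot(density(post5.3$bA), lwd = 2, col = rangi2, xlim = c(-1.5, 1),
     main = "m5.3 posterior", xlab = "parameter value")
lines(density(post5.3$bM), lwd = 2, lty = 2)
legend("topleft", legend = c("bA", "bM"), lwd = 2, lty = c(1, 2), col = c(rangi2, "black"))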
100 |
101 | # Predictor residual plots.
102 | # We check the relationship between age of
103 | # marriage (A) and marriage rate (M), that is A -> M
104 | m5.4 <- quap(
105 | alist(
106 | M ~ dnorm( mu , sigma ) ,
107 | mu <- a + bAM * A,
108 | a ~ dnorm( 0 , 0.2 ) ,
109 | bAM ~ dnorm( 0 , 0.5 ) ,
110 | sigma ~ dexp(1)
111 | ) ,
112 | data = d
113 | )
114 | mu <- link(m5.4)
115 | mu.mean <- apply(mu, 2, mean)
116 | mu_resid <- d$M - mu.mean
117 | # plot the residuals
118 | A_seq <- seq(from = -3, to = 3, length.out = 50)
119 | plot(M ~ A, data = d, col = rangi2)
120 | lines(d$A, mu.mean, lwd = 2)
121 | #for (i in 1:50)
122 | # lines(d$A[i], mu_resid[i, ], col = col.alpha("black", 0.4))
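# A sketch of the commented-out idea above (an assumption about what was meant):
# draw one vertical residual segment per state, from the fitted marriage rate
# to the observed one.
for (i in 1:nrow(d))
  lines(rep(d$A[i], 2), c(mu.mean[i], d$M[i]), col = col.alpha("black", 0.4))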
123 |
124 | # Counterfactual plots
125 | # prepare new counterfactual data
126 | M_seq <- seq(from = -2, to = 3, length.out = 30)
127 | pred_data <- data.frame(M = M_seq, A = 0)
128 | # compute counterfactual mean divorce (mu)
129 | mu <- link(m5.3, data = pred_data)
130 | mu_mean <- apply(mu, 2, mean)
131 | mu_PI <- apply(mu, 2, PI)
132 | # simulate counterfactual divorce outcomes
133 | D_sim <- sim(m5.3, data = pred_data, n = 1e4)
134 | D_PI <- apply(D_sim, 2, PI)
135 | # display predictions, hiding raw data with type = "n"
136 | plot(D ~ M, data = d, type = "n")
137 | mtext("Median age marriage (std) = 0")
138 | lines(M_seq, mu_mean)
139 | shade(mu_PI, M_seq)
140 | shade(D_PI, M_seq)
141 |
142 |
143 | # Posterior prediction plots
144 | # It plots the predictions against the observations of the dependent variable (divorce).
145 | # call link without specifying new data so it uses original data
146 | mu <- link(m5.3)
147 | # summarize samples across cases
148 | mu_mean <- apply(mu, 2, mean)
149 | mu_PI <- apply(mu, 2, PI)
150 | # simulate observations, again no new data, uses original data
151 | D_sim <- sim(m5.3, n = 1e4)
152 | D_PI <- apply(D_sim, 2, PI)
153 | # plot the predictions against the observations
154 | plot(mu_mean ~ d$D, col = rangi2, ylim = range(mu_PI),
155 | xlab = "Observed divorce", ylab = "Predictive divorce")
156 | abline(a = 0, b = 1, lty = 2)
157 | for (i in 1:nrow(d))
158 | lines(rep(d$D[i], 2), mu_PI[, i], col = rangi2)
159 | # show some selected points (select by clicking on the points in the window)
160 | identify(x = d$D, y = mu_mean, labels = d$Loc)
161 |
162 | # Simulating spurious association
163 | N <- 100
164 | x_real <- rnorm(N)
165 | x_spur <- rnorm(N, x_real)
166 | y <- rnorm(N, x_real)
167 | d <- data.frame(y, x_real, x_spur)
168 | pairs(d)
169 |
170 |
--------------------------------------------------------------------------------
/python/atn/capacitors.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Capacitors"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 2,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "data": {
17 | "text/plain": [
18 | "'1.4.3'"
19 | ]
20 | },
21 | "execution_count": 2,
22 | "metadata": {},
23 | "output_type": "execute_result"
24 | }
25 | ],
26 | "source": [
27 | "import pandas as pd\n",
28 | "import numpy as np\n",
29 | "from datetime import datetime\n",
30 | "from pandas import Series, DataFrame\n",
31 | "%matplotlib inline\n",
32 | "import matplotlib.pyplot as plt\n",
33 | "plt.style.use('seaborn-whitegrid')\n",
34 | "pd.__version__"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "metadata": {},
40 | "source": [
41 | "### Read the data"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 4,
47 | "metadata": {},
48 | "outputs": [
49 | {
50 | "name": "stderr",
51 | "output_type": "stream",
52 | "text": [
53 | "C:\\Users\\Luigi\\AppData\\Local\\Temp\\ipykernel_18948\\1313487762.py:1: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.\n",
54 | " sales = pd.read_csv('datasets/PurchaseData_20180319.csv')\n"
55 | ]
56 | }
57 | ],
58 | "source": [
59 | "sales = pd.read_csv('datasets/PurchaseData_20180319.csv')"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "# Capacitors Data Analysis\n",
67 |     "In order to predict the price (POP_UnitPrice) and the delivery time (PO_Date) of a capacitor, we have received from ATN, along with the data, a list of features that should be relevant for the task at hand and a set of rules. The relevant features are a subset of the fields in the data set.\n",
68 | "\n",
69 | "1. Component number (ComponentNumber_MAT_Flight)\n",
70 | "2. Specification name (SpecificationName)\n",
71 | "3. Family path (FamilyPath_Flight)\n",
72 | "4. Style (Style_Flight)\n",
73 | "5. Quality level (QLevel_Flight)\n",
74 | "6. Package class (PACKAGECLASS)\n",
75 | "7. Package (PACKAGE)\n",
76 | "8. Capacitance (CAPACITANCE_N)\n",
77 | "9. Capacitance case (CAPE_CASE)\n",
78 | "10. Tolerance (TOLERANCE_N)\n",
79 | "11. DC rated voltage (DC_RATED_VOLTAGE_N)\n",
80 | "12. Quality Value Name (QualityValueName)\n",
81 | "13. Manufacturer (MnfrDoeeetName)\n",
82 | "14. Quantity (POP_Qty)\n",
83 | "15. Date of purchase (PO_Date)\n",
84 | "16. Unit price (POP_UnitPrice)\n",
85 | "17. Date of delivery (POP_DeliveryDate)"
86 | ]
87 | },
88 | {
89 | "cell_type": "markdown",
90 | "metadata": {},
91 | "source": [
92 | "## Data preparation for capacitors\n",
93 |     "Before implementing the algorithm to predict unit price and delivery time for a capacitor, we have to extract the records from the sales orders data set and apply the following transformations\n",
94 | "\n",
95 | "1. Select the records about capacitors (family root -> capacitors)\n",
96 | "2. Extract the most specific family of the component from the hierarchy (family path)\n",
97 | "3. Filter out the records that are about services (remove price label -> material unit price or pop_quantity_unit -> ST)\n",
98 |     "4. Convert all the prices to euros\n",
99 |     "5. Update all the unit prices by applying an increase of 5 % per year (using the compound interest formula) \n",
100 | "\n",
101 | "After the data is prepared we can implement the algorithms \n",
102 | "\n",
103 | "1. Price prediction\n",
104 | "\n",
105 | "2. Delivery time prediction"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
112 | "#### Use only records without charges"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 5,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "data": {
122 | "text/plain": [
123 | "18508"
124 | ]
125 | },
126 | "execution_count": 5,
127 | "metadata": {},
128 | "output_type": "execute_result"
129 | }
130 | ],
131 | "source": [
132 | "sales = sales[sales['PRICE LABEL'] == 'MATERIAL UNIT PRICE']\n",
133 | "sales.index.size"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "#### Select the records about capacitors"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 6,
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "name": "stdout",
150 | "output_type": "stream",
151 | "text": [
152 | "Number of records for resistor: 8027\n"
153 | ]
154 | }
155 | ],
156 | "source": [
157 | "resistor_records = sales[sales['FamilyRoot'] == 'Resistors']\n",
158 | "num_resistor_records = resistor_records.index.size\n",
159 | "print(\"Number of records for resistor: \" + str(num_resistor_records))"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": null,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": []
168 | }
169 | ],
170 | "metadata": {
171 | "kernelspec": {
172 | "display_name": "Python 3 (ipykernel)",
173 | "language": "python",
174 | "name": "python3"
175 | },
176 | "language_info": {
177 | "codemirror_mode": {
178 | "name": "ipython",
179 | "version": 3
180 | },
181 | "file_extension": ".py",
182 | "mimetype": "text/x-python",
183 | "name": "python",
184 | "nbconvert_exporter": "python",
185 | "pygments_lexer": "ipython3",
186 | "version": "3.9.13"
187 | }
188 | },
189 | "nbformat": 4,
190 | "nbformat_minor": 2
191 | }
192 |
--------------------------------------------------------------------------------
/r/rethinking/interaction_model.R:
--------------------------------------------------------------------------------
1 | # R code 8.1 rethinking book
2 | # We want to model the relationship between a country's GDP and
3 | # the ruggedness of its territory.
4 | library(rethinking)
5 | data(rugged)
6 | d <- rugged
7 |
8 | # make log version of the outcome
9 | d$log_gdp <- log(d$rgdppc_2000)
10 |
11 | # extract countries with GDP data
12 | dd <- d[complete.cases(d$rgdppc_2000), ]
13 |
14 | # rescale variables
15 | dd$log_gdp_std <- dd$log_gdp / mean(dd$log_gdp)
16 | dd$rugged_std <- dd$rugged / max(dd$rugged)
17 |
18 | # split countries into Africa and not-Africa
19 | d.A1 <- dd[dd$cont_africa == 1, ] # Africa
20 | d.A0 <- dd[dd$cont_africa == 0, ] # not Africa
21 |
22 | # we model the relationship between GDP and ruggedness
23 | # for Africa
24 | m8.1 <- quap(
25 | alist(
26 | log_gdp_std ~ dnorm( mu , sigma ) ,
27 | mu <- a + b * (rugged_std - 0.215),
28 | a ~ dnorm(1, 0.1) ,
29 | b ~ dnorm(0, 0.3),
30 | sigma ~ dexp(1)
31 | ) ,
32 | data = d.A1
33 | )
34 |
35 | # Let's look at the priors to see if they are plausible.
36 | # If not we change their standard deviation.
37 | set.seed(7)
38 | prior <- extract.prior(m8.1)
39 | # set up the plot dimensions
40 | plot(NULL, xlim = c(0,1), ylim = c(0.5, 1.5),
41 | xlab = "ruggedness", ylab = "log GDP")
42 | abline(h = min(dd$log_gdp_std), lty = 2)
43 | abline(h = max(dd$log_gdp_std), lty = 2)
44 | # draw 50 lines from the prior
45 | rugged_seq <- seq(from = -0.1, to = 1.1, length.out = 30)
46 | mu <- link(m8.1, post = prior, data = data.frame(rugged_std = rugged_seq))
47 | for (i in 1:50)
48 | lines(rugged_seq, mu[i, ], col = col.alpha("black", 0.3))
49 |
50 | # we model the relationship between GDP and ruggedness
51 | # for not-Africa
52 | m8.2 <- quap(
53 | alist(
54 | log_gdp_std ~ dnorm( mu , sigma ) ,
55 | mu <- a + b * (rugged_std - 0.215),
56 | a ~ dnorm(1, 0.1) ,
57 | b ~ dnorm(0, 0.3),
58 | sigma ~ dexp(1)
59 | ) ,
60 | data = d.A0
61 | )
62 |
63 | # we look at the posteriors for the parameters in both models
64 | # and we see that the relationship (parameter b) is different
65 | # positive for Africa and negative for not-Africa
66 | precis(m8.1) # Africa
67 | precis(m8.2) # not-Africa
68 |
69 | # Now we want to see how to reach the same result using only
70 | # one model and one single data set without splitting it.
71 | # We start building a model like the previous ones but with
72 | # the full dataset. Then we build another model that will use
73 | # different intercets for African and not-African countries within
74 | # the same model. We will then compare these two models.
75 | m8.3 <- quap(
76 | alist(
77 | log_gdp_std ~ dnorm( mu , sigma ) ,
78 | mu <- a + b * (rugged_std - 0.215),
79 | a ~ dnorm(1, 0.1) ,
80 | b ~ dnorm(0, 0.3),
81 | sigma ~ dexp(1)
82 | ) ,
83 | data = dd
84 | )
85 | # we use an indexed intercept instead of a dummy variable to distinguish
86 | # between African and not-African countries
87 | # create a variable to index Africa (1) or not (2)
88 | dd$cid <- ifelse(dd$cont_africa == 1, 1, 2)
89 | m8.4 <- quap(
90 | alist(
91 | log_gdp_std ~ dnorm( mu , sigma ) ,
92 | mu <- a[cid] + b * (rugged_std - 0.215),
93 | a[cid] ~ dnorm(1, 0.1) ,
94 | b ~ dnorm(0, 0.3),
95 | sigma ~ dexp(1)
96 | ) ,
97 | data = dd
98 | )
99 | # We compare the two models that use the full dataset
100 | compare(m8.3, m8.4)
101 | # we look at the posterior parameters
102 | precis(m8.4, depth = 2)
103 | # let's plot the posterior predictions
104 | rugged_seq <- seq(from = -0.1, to = 1.1, length.out = 30)
105 | # compute mu over samples, fixing cid = 2 (not-Africa)
106 | mu.NotAfrica <- link(m8.4, data = data.frame(cid = 2, rugged_std = rugged_seq))
107 | # compute mu over samples, fixing cid = 1 (Africa)
108 | mu.Africa <- link(m8.4, data = data.frame(cid = 1, rugged_std = rugged_seq))
109 | # summarize to means and intervals
110 | mu.NotAfrica_mu <- apply(mu.NotAfrica, 2, mean)
111 | mu.NotAfrica_ci <- apply(mu.NotAfrica, 2, PI, prob = 0.97)
112 | mu.Africa_mu <- apply(mu.Africa, 2, mean)
113 | mu.Africa_ci <- apply(mu.Africa, 2, PI, prob = 0.97)
114 | # we can see that the model with the indexed intercept (a[cid]) is not a better model because
115 | # the interaction used for the intercept doesn't help explaining the different
116 | # role of ruggedness on GDP we saw at the beginning for African and not-African countries.
117 | # We have to make the parameter b also dependent on being Africa or not-Africa
118 | m8.5 <- quap(
119 | alist(
120 | log_gdp_std ~ dnorm( mu , sigma ) ,
121 | mu <- a[cid] + b[cid] * (rugged_std - 0.215),
122 | a[cid] ~ dnorm(1, 0.1) ,
123 | b[cid] ~ dnorm(0, 0.3),
124 | sigma ~ dexp(1)
125 | ) ,
126 | data = dd
127 | )
128 | precis(m8.5, depth = 2)
129 | # Let's see how much adding the interaction at the slope improves the model
130 | compare(m8.3, m8.4, m8.5)
131 | # plot Africa data points, cid = 1
132 | plot(d.A1$rugged_std, d.A1$log_gdp_std, pch = 16, col = rangi2,
133 | xlab = "ruggedness (standardized)", ylab = "log GDP (as proportion of mean)",
134 | xlim = c(0,1))
135 | mu <- link(m8.5, data = data.frame(cid = 1, rugged_std = rugged_seq))
136 | mu_mean <- apply(mu, 2, mean)
137 | mu_ci <- apply(mu, 2, PI, prob = 0.97)
138 | lines(rugged_seq, mu_mean, lwd = 2)
139 | shade(mu_ci, rugged_seq, col = col.alpha(rangi2, 0.3))
140 | mtext("African nations")
141 |
142 | # plot not-Africa data points, cid = 2
143 | plot(d.A0$rugged_std, d.A0$log_gdp_std, pch = 16, col = rangi2,
144 | xlab = "ruggedness (standardized)", ylab = "log GDP (as proportion of mean)",
145 | xlim = c(0,1))
146 | mu <- link(m8.5, data = data.frame(cid = 2, rugged_std = rugged_seq))
147 | mu_mean <- apply(mu, 2, mean)
148 | mu_ci <- apply(mu, 2, PI, prob = 0.97)
149 | lines(rugged_seq, mu_mean, lwd = 2)
150 | shade(mu_ci, rugged_seq, col = col.alpha(rangi2, 0.3))
151 | mtext("Non-African nations")
--------------------------------------------------------------------------------
/r/rethinking/masked_relationship.R:
--------------------------------------------------------------------------------
1 | # R code 5.18 rethinking book
2 | # Masked relationship
3 | # We investigate the relationships between the milk kilocalories (K) and two
4 | # other variables: neocortex percentage (N) and log body mass (M). We first build
5 | # two bivariate models (K ~ N) and (K ~ M) from which we see that taken separately
6 | # the two variables N and M have a weak relationship with K. Then we build a 3rd
7 | # multivariate model taking into account both N and M together to show that they
8 | # have a strong relationship with K.
9 | library(rethinking)
10 | data(milk)
11 | d <- milk
12 | str(d)
13 | d$K <- scale(d$kcal.per.g)
14 | d$N <- scale(d$neocortex.perc) # contains NA values
15 | d$M <- scale(log(d$mass))
16 |
17 | # remove NA values
18 | dcc <- d[complete.cases(d$K, d$N, d$M), ]
19 | # ---------------------------------------------------------------------
20 | # 1) builds the first model kilocalories (K) - neocortex percentage (N)
21 | # ---------------------------------------------------------------------
22 | m5.5_draft <- quap(
23 | alist(
24 | K ~ dnorm( mu , sigma ) ,
25 | mu <- a + bN * N ,
26 | a ~ dnorm( 0 , 1 ) ,
27 | bN ~ dnorm( 0 , 1 ) ,
28 | sigma ~ dexp(1)
29 | ) ,
30 | data = dcc
31 | )
32 |
33 | # builds a 2nd model kilocalories (K) - neocortex percentage (N)
34 | # with narrower priors for the parameters a and bN.
35 | # This model should produce more reasonable priors and, as a
36 | # consequence, a more reasonable posterior.
37 | m5.5 <- quap(
38 | alist(
39 | K ~ dnorm( mu , sigma ) ,
40 | mu <- a + bN * N ,
41 | a ~ dnorm( 0 , 0.2 ) ,
42 | bN ~ dnorm( 0 , 0.5 ) ,
43 | sigma ~ dexp(1)
44 | ) ,
45 | data = dcc
46 | )
47 |
48 |
49 | #prior <- extract.prior(m5.5_draft)
50 | prior <- extract.prior(m5.5)
51 |
52 | # plot the prior regression lines
53 | xseq <- c(-2,2)
54 | mu <- link(m5.5, post = prior, data = list(N = xseq))
55 | plot(NULL, xlim = xseq, ylim = xseq) # plot the frame
56 | for (i in 1:50)
57 | lines(xseq, mu[i, ], col = col.alpha("black", 0.3)) # plausible regression lines
58 |
59 | # Let's look at the posterior
60 | # It shows that the relationship between milk kilocalories and neocortex is weak:
61 | # small value of the bN parameter and standard deviation almost twice the mean.
62 | precis(m5.5)
63 |
64 | # Let's plot the posterior
65 | xseq <- seq(from = min(dcc$N) - 0.15, to = max(dcc$N) + 0.15, length.out = 30)
66 | mu <- link(m5.5, data = list(N = xseq))
67 | mu_mean <- apply(mu, 2, mean)
68 | mu_PI <- apply(mu, 2, PI)
69 | plot(K ~ N, data = dcc)
70 | lines(xseq, mu_mean, lwd = 2)
71 | shade(mu_PI, xseq)
72 |
73 |
74 | # --------------------------------------------------------
75 | # 2) builds the 2nd model kilocalories (K) - body mass (M)
76 | # --------------------------------------------------------
77 | m5.6 <- quap(
78 | alist(
79 | K ~ dnorm( mu , sigma ) ,
80 | mu <- a + bM * M ,
81 | a ~ dnorm( 0 , 0.2 ) ,
82 | bM ~ dnorm( 0 , 0.5 ) ,
83 | sigma ~ dexp(1)
84 | ) ,
85 | data = dcc
86 | )
87 |
88 | prior <- extract.prior(m5.6)
89 |
90 | # plot the prior regression lines
91 | xseq <- c(-2,2)
92 | mu <- link(m5.6, post = prior, data = list(M = xseq))
93 | plot(NULL, xlim = xseq, ylim = xseq) # plot the frame
94 | for (i in 1:50)
95 | lines(xseq, mu[i, ], col = col.alpha("black", 0.3)) # plausible regression lines
96 |
97 | # Let's look at the posterior
98 | # It shows that also the relationship between milk kilocalories and body mass is weak:
99 | # small (negative) value of the bM parameter and comparable standard deviation.
100 | precis(m5.6)
101 |
102 | # Let's plot the posterior
103 | xseq <- seq(from = min(dcc$M) - 0.15, to = max(dcc$M) + 0.15, length.out = 30)
104 | mu <- link(m5.6, data = list(M = xseq))
105 | mu_mean <- apply(mu, 2, mean)
106 | mu_PI <- apply(mu, 2, PI)
107 | plot(K ~ M, data = dcc) # plot kilocalories against body mass
108 | lines(xseq, mu_mean, lwd = 2)
109 | shade(mu_PI, xseq)
110 |
111 | # ------------------------------------------------------------------------------------------------
112 | # 3) builds the 3rd, multivariate model: kilocalories (K) depends on neocortex (N) and body mass (M)
113 | # ------------------------------------------------------------------------------------------------
114 | # We build a multivariate linear model in which milk kilocalories (K) depends linearly on both the
115 | # neocortex percentage (N) and the body mass (M).
116 | m5.7 <- quap(
117 | alist(
118 | K ~ dnorm( mu , sigma ) ,
119 | mu <- a + bN * N + bM * M ,
120 | a ~ dnorm( 0 , 0.2 ) ,
121 | bN ~ dnorm( 0 , 0.5 ) ,
122 | bM ~ dnorm( 0 , 0.5 ) ,
123 | sigma ~ dexp(1)
124 | ) ,
125 | data = dcc
126 | )
127 | # We can see, from the posterior means and standard deviations, that K depends strongly on both N and M (a quick check of the N-M correlation, which explains this masking, is appended at the end of this script).
128 | precis(m5.7)
129 | #plot(coeftab(m5.5, m5.6, m5.7), pars = c("bM", "bN")) # this doesn't work
130 | pairs(~K + M + N, dcc)
131 |
132 | # Let's now draw the counterfactual plot of the multivariate model.
133 | # Here we keep the neocortex percentage (N) constant at 0 (N = 0) so that we can see
134 | # the relation between K and the body mass M. We can see that the relation is stronger
135 | # in the multivariate model.
136 | xseq <- seq(from = min(dcc$M) - 0.15, to = max(dcc$M) + 0.15, length.out = 30)
137 | mu <- link(m5.7, data = data.frame(M = xseq, N = 0))
138 | mu_mean <- apply(mu, 2, mean)
139 | mu_PI <- apply(mu, 2, PI)
140 | plot(NULL, xlim = range(dcc$M), ylim = range(dcc$K))
141 | lines(xseq, mu_mean, lwd = 2)
142 | shade(mu_PI, xseq)
143 |
144 | # Here we keep the body mass (M) constant at 0 (M = 0) so that we can see
145 | # the relation between K and the neocortex percentage (N). We can see also in this case
146 | # that the relation is stronger in the multivariate model.
147 | xseq <- seq(from = min(dcc$N) - 0.15, to = max(dcc$N) + 0.15, length.out = 30)
148 | mu <- link(m5.7, data = data.frame(N = xseq, M = 0))
149 | mu_mean <- apply(mu, 2, mean)
150 | mu_PI <- apply(mu, 2, PI)
151 | plot(NULL, xlim = range(dcc$N), ylim = range(dcc$K)) # here the x axis is N, not M
152 | lines(xseq, mu_mean, lwd = 2)
153 | shade(mu_PI, xseq)
154 |
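155 | # -----------------------------------------------------------------------------------
156 | # Why the bivariate relationships are masked (illustrative check, not in the book code):
157 | # N and M are positively correlated with each other, as the pairs plot above suggests,
158 | # but they are associated with K in opposite directions, so each one hides the effect
159 | # of the other when used alone. Correlation of the two standardized predictors:
160 | cor(dcc$N, dcc$M)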
--------------------------------------------------------------------------------
/python/atn/resistors.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Resistors"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "scrolled": true
15 | },
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "C:\\Users\\lselmi\\cygwin64\\home\\lselmi\\anaconda\\altertech\\datasets\n"
22 | ]
23 | }
24 | ],
25 | "source": [
26 | "cd C:\\Users\\lselmi\\cygwin64\\home\\lselmi\\anaconda\\altertech\\datasets"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [
34 | {
35 | "data": {
36 | "text/plain": [
37 | "'0.22.0'"
38 | ]
39 | },
40 | "execution_count": 2,
41 | "metadata": {},
42 | "output_type": "execute_result"
43 | }
44 | ],
45 | "source": [
46 | "import pandas as pd\n",
47 | "import numpy as np\n",
48 | "from datetime import datetime\n",
49 | "from pandas import Series, DataFrame\n",
50 | "%matplotlib inline\n",
51 | "import matplotlib.pyplot as plt\n",
52 | "plt.style.use('seaborn-whitegrid')\n",
53 | "pd.__version__"
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "### Read the data"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 3,
66 | "metadata": {},
67 | "outputs": [
68 | {
69 | "name": "stderr",
70 | "output_type": "stream",
71 | "text": [
72 | "C:\\Users\\lselmi\\Downloads\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2728: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.\n",
73 | " interactivity=interactivity, compiler=compiler, result=result)\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "sales = pd.read_csv('PurchaseData_20180319.csv')"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "# Resistors Data Analysis\n",
86 | "In order to predict the price (POP_UnitPrice) and the delivery time (PO_Date) of resistor we have received from ATN, with the data, a list of features that should be relevant for the task at hand and a set of rules. The relevant features are a subset of the fields in the data set.\n",
87 | "\n",
88 | "1. Component number (ComponentNumber_MAT_Flight)\n",
89 | "2. Specification name (SpecificationName)\n",
90 | "3. Family path (FamilyPath_Flight)\n",
91 | "4. Style (Style_Flight)\n",
92 | "5. Quality level (QLevel_Flight)\n",
93 | "6. Package class (PACKAGECLASS)\n",
94 | "7. Package (PACKAGE)\n",
95 | "8. Resistance (RESISTANCE_N)\n",
96 | "9. Resistance case size (RES_CASESIZE)\n",
97 | "10. Tolerance (TOLERANCE_N)\n",
98 | "11. Temperature coefficient (TEMPCOEFFICIENT_N)\n",
99 | "12. Quality Value Name (QualityValueName)\n",
100 | "13. Manufacturer (MnfrDoeeetName)\n",
101 | "14. Quantity (POP_Qty)\n",
102 | "15. Date of purchase (PO_Date)\n",
103 | "16. Unit price (POP_UnitPrice)\n",
104 | "17. Date of delivery (POP_DeliveryDate)"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "metadata": {},
110 | "source": [
111 | "## Data preparation for resistors\n",
112 | "Before implementing the algorithm to predict unit price and delivery time for a resistor, we have to extract the records from the sale orders data set and apply the following transformation\n",
113 | "\n",
114 | "1. Select the records about resistors (family root -> resistors)\n",
115 | "2. Extract the most specific family of the component from the hierarchy (family path)\n",
116 | "3. Filter out the records that are about services (remove price label -> material unit price or pop_quantity_unit -> ST)\n",
117 | "4. Transform all the prices in euro\n",
118 | "5. Update the all the unit prices applying an increase of 5 % per year (using the compund interest formula) \n",
119 | "\n",
120 | "After the data is prepared we can implement the algorithms \n",
121 | "\n",
122 | "1. Price prediction\n",
123 | "2. Delivery time prediction"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "#### Use only records without charges"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 7,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "data": {
140 | "text/plain": [
141 | "18508"
142 | ]
143 | },
144 | "execution_count": 7,
145 | "metadata": {},
146 | "output_type": "execute_result"
147 | }
148 | ],
149 | "source": [
150 | "sales = sales[sales['PRICE LABEL'] == 'MATERIAL UNIT PRICE']\n",
151 | "sales.index.size"
152 | ]
153 | },
154 | {
155 | "cell_type": "markdown",
156 | "metadata": {},
157 | "source": [
158 | "#### Select the records about resistors"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 8,
164 | "metadata": {},
165 | "outputs": [
166 | {
167 | "name": "stdout",
168 | "output_type": "stream",
169 | "text": [
170 | "Number of records for resistor: 8027\n"
171 | ]
172 | }
173 | ],
174 | "source": [
175 | "resistor_records = sales[sales['FamilyRoot'] == 'Resistors']\n",
176 | "num_resistor_records = resistor_records.index.size\n",
177 | "print(\"Number of records for resistor: \" + str(num_resistor_records))"
178 | ]
179 | },
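  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch of preparation steps 4 and 5 applied to the resistor records.\n",
    "# The column names POP_UnitPrice_CU, PO_Change, PO_Currency and PO_Date are taken from\n",
    "# the companion microcircuits notebook and are assumed to be present here as well.\n",
    "resistor_records = resistor_records.copy()\n",
    "# 4. convert the US dollar prices to euros using the exchange rate in PO_Change\n",
    "resistor_records['price_euros'] = resistor_records['POP_UnitPrice_CU'] * resistor_records['PO_Change'] * (resistor_records['PO_Currency'] == 'USD')\n",
    "resistor_records['price_euros'] += resistor_records['POP_UnitPrice_CU'] * (resistor_records['PO_Currency'] == 'EUR')\n",
    "# 5. apply a 5 % increase per year (compound interest) from the year of purchase up to 2018\n",
    "resistor_records['years'] = [2018 - int(str(date)[0:4]) for date in resistor_records['PO_Date']]\n",
    "resistor_records['adjusted_price'] = resistor_records['price_euros'] * np.power(1 + 0.05, resistor_records['years'])"
   ]
  },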
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": []
186 | }
187 | ],
188 | "metadata": {
189 | "kernelspec": {
190 | "display_name": "Python 3",
191 | "language": "python",
192 | "name": "python3"
193 | },
194 | "language_info": {
195 | "codemirror_mode": {
196 | "name": "ipython",
197 | "version": 3
198 | },
199 | "file_extension": ".py",
200 | "mimetype": "text/x-python",
201 | "name": "python",
202 | "nbconvert_exporter": "python",
203 | "pygments_lexer": "ipython3",
204 | "version": "3.6.4"
205 | }
206 | },
207 | "nbformat": 4,
208 | "nbformat_minor": 2
209 | }
210 |
--------------------------------------------------------------------------------
/datasets.md:
--------------------------------------------------------------------------------
1 | Open Data Sets
2 | ==============
3 | A list of websites that provide data sets about science, economics and finance
4 | from Italy, Europe, the US, and intergovernmental organizations such as WHO, WMO, OECD, IMF and others.
5 |
6 | * Energy
7 | * European Union
8 | * [Gas Infrastructure Europe - Aggregated Gas Storage Inventory](https://agsi.gie.eu/)
9 | * [European Network of Transmission Operators for Gas](https://www.entsog.eu/)
10 | * Italy
11 | * [Gestore Mercati Energetici](https://www.mercatoelettrico.org/It/default.aspx)
12 | * [Gestore Servizi Energetici](https://gse.it/)
13 | * [Agenzia nazionale per le nuove tecnologie, l'energia e lo sviluppo economico sostenibile (ENEA)](https://www.enea.it/)
14 |
15 | * United States
16 | * [Energy Information Administration (US)](https://www.eia.gov/)
17 |
18 | * International Organizations
19 | * [International Energy Agency](https://www.iea.org/)
20 | * [International Renewable Energy Agency](https://www.irena.org/)
21 |
22 | * Economy
23 | * European Union
24 | * [European Central Bank Data Portal](https://data.ecb.europa.eu)
25 | * [European Securities and Market Authority](https://www.esma.europa.eu/)
26 |
27 | * Italy
28 | * [Banca d'Italia - Bank of Italy](https://www.bancaditalia.it/)
29 | * [Commissione Nazionale per le Societa' e la Borsa (CONSOB)](https://www.consob.it/)
30 | * [Agenzia delle Entrate](https://www.agenziaentrate.gov.it/portale/)
31 |
32 | * United States
33 | * [Federal Reserve Bank of St. Louis](https://fred.stlouisfed.org/)
34 | * [The Federal Reserve](https://www.federalreserve.gov/)
35 | * [U.S. Securities and Exchange Commission](https://www.sec.gov/)
36 | * [Bureau of Economic Analysis](https://www.bea.gov/)
37 | * [U.S. Department of the Treasury](https://home.treasury.gov/)
38 | * [U.S. Securities and Exchange Commission - EDGAR Database](https://www.sec.gov/os/accessing-edgar-data)
39 | * [U.S. Federal Deposit Insurance Corporation (FDIC)](https://www.fdic.gov/)
40 | * [U.S. Department of Commerce](https://www.commerce.gov/)
41 | * [National Association of Realtors](https://www.nar.realtor/)
43 | * [U.S. Congressional Budget Office](https://www.cbo.gov/)
44 | * [FRED - Import Price Index](https://fred.stlouisfed.org/series/IREXPET)
45 | * [FRED - Consumer Sentiment](https://fred.stlouisfed.org/series/UMCSENT)
46 | * [Wikipedia - List of largest daily changes in the S&P 500 Index](https://en.wikipedia.org/wiki/List_of_largest_daily_changes_in_the_S%26P_500_Index)
47 | * [Wikipedia - List of largest daily changes in the Dow Jones Industrial Average](https://en.wikipedia.org/wiki/List_of_largest_daily_changes_in_the_Dow_Jones_Industrial_Average)
49 |
50 | * International Organizations
51 | * [International Monetary Fund](https://www.imf.org/en/Home)
52 | * [The World Bank](https://www.worldbank.org/en/home)
53 | * [The Organisation for Economic Co-operation and Development (OECD)](https://www.oecd.org/)
54 | * [The Bank for International Settlements](https://www.bis.org/)
55 | * [Food and Agriculture Organization of the United Nations](https://www.fao.org/)
56 | * [World Trade Organization](https://www.wto.org/)
57 | * [The Conference Board](https://www.conference-board.org/eu/)
58 | * [The World Intellectual Property Organization (WIPO)](https://www.wipo.int/portal/en/index.html)
59 |
60 | * Environment
61 | * European Union
62 | * [The European Environment Agency](https://www.eea.europa.eu/)
63 | * [EEA Industrial Reporting Database 2022](https://www.eea.europa.eu/data-and-maps/data/industrial-reporting-under-the-industrial-6)
64 | * [Extreme Wind Storms Catalogue](http://www.europeanwindstorms.org/)
65 | * [European Severe Storms Laboratory](https://www.essl.org/)
66 | * [GHSL - Global Human Settlement Layer (Copernicus Emergency Management Service - Exposure Mapping Component)](https://human-settlement.emergency.copernicus.eu/index.php)
67 | * [Copernicus Emergency Management Service](https://emergency.copernicus.eu/)
68 | * [Integrated Carbon Observation System](https://www.icos-cp.eu/)
69 |
70 | * Italy
71 | * [The Italian Institute for Environmental Protection and Research - ISPRA](https://www.isprambiente.gov.it/en)
72 | * [IdroGEO Italian Web Platform on Landslides and Floods](https://idrogeo.isprambiente.it/app/)
73 | * [INGV Terremoti - Earthquakes](https://ingvterremoti.com/)
74 | * [Istituto Nazionale di Geofisica e Vulcanologia](https://www.ingv.it/en/)
75 | * [Osservatorio Vesuviano](https://www.ov.ingv.it/index.php)
76 | * [INGV TINITALY - Digital Elevation Model of Italy](https://data.ingv.it/dataset/185#additional-metadata)
77 | * United States
78 | * [Federal Emergency Management Agency (FEMA)](https://www.fema.gov/)
79 | * [US Geological Survey](https://www.usgs.gov/)
80 | * [NOAA Climate Prediction Center](https://www.cpc.ncep.noaa.gov/)
81 | * [USGS Water Data for the Nation](https://waterdata.usgs.gov/nwis)
82 | * [SHELDUS - US Center for Emergency Management and Homeland Security](https://cemhs.asu.edu/sheldus)
83 | * International Organizations
84 | * [World Glacier Monitoring Service](https://wgms.ch/)
85 |
86 | * Health
87 | * [World Health Organization - The Global Health Observatory](https://www.who.int/data/gho)
88 | * [Stockholm Convention on Persistent Organic Pollutants](https://www.pops-gmp.org/index.html)
89 |
90 | * Social Sciences
91 | * European Union
92 | * [Eurostat - The Statistical Office of the European Union](https://ec.europa.eu/eurostat)
93 |
94 | * Italy
95 | * [Istituto Nazionale di Statistica - Italian National Institute of Statistics (IT)](https://www.istat.it/en/)
96 |
97 | * United States
98 | * [Bureau of Labor Statistics](https://www.bls.gov/)
99 | * [Census Bureau](https://data.census.gov/cedsci/)
100 |
101 | * Water
102 | * European Union
103 | * [Water Information System for Europe](https://water.europa.eu/)
104 |
105 | * Geospatial Data
106 | * European Union
107 | * [European Centre for Medium-Range Weather Forecasts Data Store](https://data.ecmwf.int/)
108 | * [The Geographic Information System of the Commission (GISCO)](https://ec.europa.eu/eurostat/web/gisco)
109 |
--------------------------------------------------------------------------------
/python/parsing-data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Parsing data"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## Parsing HTML data"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 54,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "from urllib.request import urlopen\n",
24 | "from lxml.html import parse\n",
25 | "parsed = parse(urlopen('https://raw.githubusercontent.com/luigiselmi/datascience/master/python/finance/data/world_companies.html'))\n",
26 | "doc = parsed.getroot()\n",
27 | "\n",
28 | "tables = doc.findall('.//table')\n",
29 | "\n",
30 | "table = tables[0]\n",
31 | "\n",
32 | "rows = table.findall('.//tr')\n",
33 | "\n",
34 | "def _unpack(row, kind):\n",
35 | " elts = row.findall('.//%s' % kind)\n",
36 | " return [val.text_content() for val in elts]"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 55,
42 | "metadata": {},
43 | "outputs": [
44 | {
45 | "data": {
46 | "text/plain": [
47 | "['Company', 'Contact', 'Country']"
48 | ]
49 | },
50 | "execution_count": 55,
51 | "metadata": {},
52 | "output_type": "execute_result"
53 | }
54 | ],
55 | "source": [
56 | "_unpack(rows[0], kind='th') # header"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 56,
62 | "metadata": {},
63 | "outputs": [
64 | {
65 | "data": {
66 | "text/plain": [
67 | "[['Alfreds Futterkiste', 'Maria Anders', 'Germany'],\n",
68 | " ['Centro comercial Moctezuma', 'Francisco Chang', 'Mexico']]"
69 | ]
70 | },
71 | "execution_count": 56,
72 | "metadata": {},
73 | "output_type": "execute_result"
74 | }
75 | ],
76 | "source": [
77 | "[_unpack(rows[i], kind='td') for i in range(1, 3)]"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 57,
83 | "metadata": {},
84 | "outputs": [
85 | {
86 | "data": {
87 | "text/html": [
88 | "\n",
89 | "\n",
102 | "
\n",
103 | " \n",
104 | " \n",
105 | " \n",
106 | " Company \n",
107 | " Contact \n",
108 | " Country \n",
109 | " \n",
110 | " \n",
111 | " \n",
112 | " \n",
113 | " 0 \n",
114 | " Alfreds Futterkiste \n",
115 | " Maria Anders \n",
116 | " Germany \n",
117 | " \n",
118 | " \n",
119 | " 1 \n",
120 | " Centro comercial Moctezuma \n",
121 | " Francisco Chang \n",
122 | " Mexico \n",
123 | " \n",
124 | " \n",
125 | "
\n",
126 | "
"
127 | ],
128 | "text/plain": [
129 | " Company Contact Country\n",
130 | "0 Alfreds Futterkiste Maria Anders Germany\n",
131 | "1 Centro comercial Moctezuma Francisco Chang Mexico"
132 | ]
133 | },
134 | "execution_count": 57,
135 | "metadata": {},
136 | "output_type": "execute_result"
137 | }
138 | ],
139 | "source": [
140 | "from pandas.io.parsers import TextParser\n",
141 | "\n",
142 | "def parse_options_data(table):\n",
143 | " rows = table.findall('.//tr')\n",
144 | " header = _unpack(rows[0], kind='th')\n",
145 | " data = [_unpack(r,'td') for r in rows[1:]]\n",
146 | " return TextParser(data, names=header).get_chunk()\n",
147 | "\n",
148 | "df = parse_options_data(table) # returns a dataframe\n",
149 | "df"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 58,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "data": {
159 | "text/plain": [
160 | "0 Alfreds Futterkiste\n",
161 | "1 Centro comercial Moctezuma\n",
162 | "Name: Company, dtype: object"
163 | ]
164 | },
165 | "execution_count": 58,
166 | "metadata": {},
167 | "output_type": "execute_result"
168 | }
169 | ],
170 | "source": [
171 | "df['Company']"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 59,
177 | "metadata": {},
178 | "outputs": [
179 | {
180 | "data": {
181 | "text/plain": [
182 | "0 Maria Anders\n",
183 | "1 Francisco Chang\n",
184 | "Name: Contact, dtype: object"
185 | ]
186 | },
187 | "execution_count": 59,
188 | "metadata": {},
189 | "output_type": "execute_result"
190 | }
191 | ],
192 | "source": [
193 | "df['Contact']"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {},
199 | "source": [
200 | "## Parsing XML data"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 60,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "from lxml import objectify\n",
210 | "parsed_xml = objectify.parse(urlopen('https://raw.githubusercontent.com/luigiselmi/datascience/master/python/finance/data/persons.xml'))\n",
211 | "xml_root = parsed_xml.getroot()"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 61,
217 | "metadata": {},
218 | "outputs": [],
219 | "source": [
220 | "person = []\n",
221 | "for element in xml_root.person:\n",
222 | " element_data = {}\n",
223 | " for child in element.getchildren():\n",
224 | " element_data[child.tag] = child.pyval\n",
225 | " person.append(element_data)"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 64,
231 | "metadata": {},
232 | "outputs": [
233 | {
234 | "data": {
235 | "text/plain": [
236 | "['Pippo', 'Mickey']"
237 | ]
238 | },
239 | "execution_count": 64,
240 | "metadata": {},
241 | "output_type": "execute_result"
242 | }
243 | ],
244 | "source": [
245 | "[person[i]['firstname'] for i in range(0, len(person))]"
246 | ]
247 | }
248 | ],
249 | "metadata": {
250 | "kernelspec": {
251 | "display_name": "Python 3 (ipykernel)",
252 | "language": "python",
253 | "name": "python3"
254 | },
255 | "language_info": {
256 | "codemirror_mode": {
257 | "name": "ipython",
258 | "version": 3
259 | },
260 | "file_extension": ".py",
261 | "mimetype": "text/x-python",
262 | "name": "python",
263 | "nbconvert_exporter": "python",
264 | "pygments_lexer": "ipython3",
265 | "version": "3.9.13"
266 | }
267 | },
268 | "nbformat": 4,
269 | "nbformat_minor": 2
270 | }
271 |
--------------------------------------------------------------------------------
/python/atn/logfiles/Users_Navigation_Data.doc:
--------------------------------------------------------------------------------
1 | Date: Mon, 25 Nov 2019 16:22:59 +0100 (CET)
2 | Message-ID: <1508649183.873.1574695379478@confluence-new.iais.fraunhofer.de>
3 | Subject: Exported From Confluence
4 | MIME-Version: 1.0
5 | Content-Type: multipart/related;
6 | boundary="----=_Part_872_1440509474.1574695379478"
7 |
8 | ------=_Part_872_1440509474.1574695379478
9 | Content-Type: text/html; charset=UTF-8
10 | Content-Transfer-Encoding: quoted-printable
11 | Content-Location: file:///C:/exported.html
12 |
13 |
17 |
18 |
20 | Users Navigation Data
21 |
35 |
231 |
232 |
233 | Users Navigation Data
234 |
235 | ATN provides log files for different user-generated events collected during a session
236 | on their portal (e.g. detailed description visualization, comparison, download).
237 | The log files contain the time-stamp, the user identifier and the item identifier.
238 | The event type is available from the name of the log file. These events represent
239 | an implicit feedback about the interest of a user in an item. The feedback is
240 | collected by parsing the log files into records with the following structure:
241 |
242 | userID, itemID, feedback_value, feedback_type, time-stamp, query
243 |
244 | The feedback type is the event type (i.e. view, comparison, download). We can assume
245 | a default feedback_value = 1 for a view event. We can also assume that a comparison
246 | or a download have the same value as feedback to estimate the relevance of an item,
247 | or that they represent stronger evidence of relevance and assign a feedback_value = 2.
248 | The query is the query used in the search whose result contains the item. Examples
249 | of the log files are attached.
250 |
251 |
252 |
253 |
254 | ------=_Part_872_1440509474.1574695379478--
255 |
--------------------------------------------------------------------------------
/python/python_oop.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "2d5cf880",
6 | "metadata": {},
7 | "source": [
8 | "# Python OOP\n",
9 | "We will see how Object Oriented Programming is implemented in Python. the special method __init__ is called each time the class is istantiated."
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "id": "d9fed405",
15 | "metadata": {},
16 | "source": [
17 | "### Class definition"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 42,
23 | "id": "3485cab8",
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "class HumanBeing(object):\n",
28 | " def __init__(self, first_name, eye_color):\n",
29 | " self.first_name = first_name\n",
30 | " self.eye_color = eye_color\n",
31 | " self.position = 0\n",
32 | " def walk_steps(self, steps):\n",
33 | " self.position += steps"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 43,
39 | "id": "68a35edb",
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "luigi = HumanBeing('Luigi', 'brown')"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 44,
49 | "id": "410ba0f9",
50 | "metadata": {},
51 | "outputs": [
52 | {
53 | "data": {
54 | "text/plain": [
55 | "'Luigi'"
56 | ]
57 | },
58 | "execution_count": 44,
59 | "metadata": {},
60 | "output_type": "execute_result"
61 | }
62 | ],
63 | "source": [
64 | "luigi.first_name"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 45,
70 | "id": "3c63be48",
71 | "metadata": {},
72 | "outputs": [
73 | {
74 | "data": {
75 | "text/plain": [
76 | "3"
77 | ]
78 | },
79 | "execution_count": 45,
80 | "metadata": {},
81 | "output_type": "execute_result"
82 | }
83 | ],
84 | "source": [
85 | "luigi.walk_steps(3)\n",
86 | "luigi.position"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 46,
92 | "id": "78c99245",
93 | "metadata": {},
94 | "outputs": [
95 | {
96 | "data": {
97 | "text/plain": [
98 | "32"
99 | ]
100 | },
101 | "execution_count": 46,
102 | "metadata": {},
103 | "output_type": "execute_result"
104 | }
105 | ],
106 | "source": [
107 | "luigi.__sizeof__()"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 51,
113 | "id": "dd9852ab",
114 | "metadata": {},
115 | "outputs": [],
116 | "source": [
117 | "class FinancialInstrument(object):\n",
118 | " def __init__(self, symbol, price):\n",
119 | " self.symbol = symbol\n",
120 | " self.__price = price # private attribute\n",
121 | " pass"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": 52,
127 | "id": "ecb28d2f",
128 | "metadata": {},
129 | "outputs": [
130 | {
131 | "data": {
132 | "text/plain": [
133 | "<__main__.FinancialInstrument at 0x162e2c42160>"
134 | ]
135 | },
136 | "execution_count": 52,
137 | "metadata": {},
138 | "output_type": "execute_result"
139 | }
140 | ],
141 | "source": [
142 | "eni_mi = FinancialInstrument('ENI.MI', 11.70)\n",
143 | "eni_mi"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "id": "04190d29",
149 | "metadata": {},
150 | "source": [
151 | "Data attributes can be defined on the fly"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 53,
157 | "id": "2a86ed30",
158 | "metadata": {},
159 | "outputs": [
160 | {
161 | "data": {
162 | "text/plain": [
163 | "1000"
164 | ]
165 | },
166 | "execution_count": 53,
167 | "metadata": {},
168 | "output_type": "execute_result"
169 | }
170 | ],
171 | "source": [
172 | "eni_mi.num_shares = 1000\n",
173 | "eni_mi.num_shares"
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "id": "770c0241",
179 | "metadata": {},
180 | "source": [
181 | "### Inheritance"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 56,
187 | "id": "7bf80411",
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "class FinancialInstrument(FinancialInstrument):\n",
192 | " def get_price(self):\n",
193 | " return self.__price # private attribute\n",
194 | " def set_price(self, price):\n",
195 | " self.__price = price"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 60,
201 | "id": "a5abe686",
202 | "metadata": {},
203 | "outputs": [
204 | {
205 | "data": {
206 | "text/plain": [
207 | "12.0"
208 | ]
209 | },
210 | "execution_count": 60,
211 | "metadata": {},
212 | "output_type": "execute_result"
213 | }
214 | ],
215 | "source": [
216 | "eni_stock = FinancialInstrument('ENI.MI', 12.0)\n",
217 | "eni_stock.get_price()"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": 65,
223 | "id": "2489e483",
224 | "metadata": {},
225 | "outputs": [
226 | {
227 | "data": {
228 | "text/plain": [
229 | "2.4"
230 | ]
231 | },
232 | "execution_count": 65,
233 | "metadata": {},
234 | "output_type": "execute_result"
235 | }
236 | ],
237 | "source": [
238 | "eni_stock.__price"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 66,
244 | "id": "fea81913",
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "data": {
249 | "text/plain": [
250 | "12.0"
251 | ]
252 | },
253 | "execution_count": 66,
254 | "metadata": {},
255 | "output_type": "execute_result"
256 | }
257 | ],
258 | "source": [
259 | "eni_stock.get_price()"
260 | ]
261 | },
262 | {
263 | "cell_type": "code",
264 | "execution_count": 67,
265 | "id": "eae576d3",
266 | "metadata": {},
267 | "outputs": [
268 | {
269 | "data": {
270 | "text/plain": [
271 | "2.4"
272 | ]
273 | },
274 | "execution_count": 67,
275 | "metadata": {},
276 | "output_type": "execute_result"
277 | }
278 | ],
279 | "source": [
280 | "eni_stock.__price = 2.4\n",
281 | "eni_stock.__price"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 68,
287 | "id": "e19273cb",
288 | "metadata": {},
289 | "outputs": [],
290 | "source": [
291 | "class PortfolioPosition(object):\n",
292 | " def __init__(self, financial_instrument, position_size):\n",
293 | " self.position = financial_instrument\n",
294 | " self.__position_size = position_size\n",
295 | " def get_position_size(self):\n",
296 | " return self.__position_size\n",
297 | " def update_position_size(self, position_size):\n",
298 | " self.__position_size = position_size\n",
299 | " def get_position_value(self):\n",
300 | " return self.__position_size * self.position.get_price()"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 69,
306 | "id": "197a8c6f",
307 | "metadata": {},
308 | "outputs": [
309 | {
310 | "data": {
311 | "text/plain": [
312 | "10"
313 | ]
314 | },
315 | "execution_count": 69,
316 | "metadata": {},
317 | "output_type": "execute_result"
318 | }
319 | ],
320 | "source": [
321 | "pp = PortfolioPosition(eni_stock, 10)\n",
322 | "pp.get_position_size()"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 70,
328 | "id": "3c703890",
329 | "metadata": {},
330 | "outputs": [
331 | {
332 | "data": {
333 | "text/plain": [
334 | "120.0"
335 | ]
336 | },
337 | "execution_count": 70,
338 | "metadata": {},
339 | "output_type": "execute_result"
340 | }
341 | ],
342 | "source": [
343 | "pp.get_position_value()"
344 | ]
345 | },
346 | {
347 | "cell_type": "markdown",
348 | "id": "4fa8f29c",
349 | "metadata": {},
350 | "source": [
351 | "## Yahoo! Finance"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 74,
357 | "id": "2a77beb4",
358 | "metadata": {},
359 | "outputs": [
360 | {
361 | "data": {
362 | "text/html": [
363 | "\n",
364 | "\n",
377 | "
\n",
378 | " \n",
379 | " \n",
380 | " \n",
381 | " Open \n",
382 | " High \n",
383 | " Low \n",
384 | " Close \n",
385 | " Volume \n",
386 | " Dividends \n",
387 | " Stock Splits \n",
388 | " \n",
389 | " \n",
390 | " Date \n",
391 | " \n",
392 | " \n",
393 | " \n",
394 | " \n",
395 | " \n",
396 | " \n",
397 | " \n",
398 | " \n",
399 | " \n",
400 | " \n",
401 | " \n",
402 | " 2022-09-05 \n",
403 | " 11.978 \n",
404 | " 12.432 \n",
405 | " 11.910 \n",
406 | " 12.290 \n",
407 | " 12793436 \n",
408 | " 0.0 \n",
409 | " 0.0 \n",
410 | " \n",
411 | " \n",
412 | " 2022-09-06 \n",
413 | " 12.258 \n",
414 | " 12.282 \n",
415 | " 11.786 \n",
416 | " 11.952 \n",
417 | " 17528169 \n",
418 | " 0.0 \n",
419 | " 0.0 \n",
420 | " \n",
421 | " \n",
422 | " 2022-09-07 \n",
423 | " 11.790 \n",
424 | " 11.970 \n",
425 | " 11.540 \n",
426 | " 11.618 \n",
427 | " 17903241 \n",
428 | " 0.0 \n",
429 | " 0.0 \n",
430 | " \n",
431 | " \n",
432 | " 2022-09-08 \n",
433 | " 11.638 \n",
434 | " 11.756 \n",
435 | " 11.410 \n",
436 | " 11.582 \n",
437 | " 16749834 \n",
438 | " 0.0 \n",
439 | " 0.0 \n",
440 | " \n",
441 | " \n",
442 | " 2022-09-09 \n",
443 | " 11.622 \n",
444 | " 11.818 \n",
445 | " 11.586 \n",
446 | " 11.682 \n",
447 | " 10706661 \n",
448 | " 0.0 \n",
449 | " 0.0 \n",
450 | " \n",
451 | " \n",
452 | "
\n",
453 | "
"
454 | ],
455 | "text/plain": [
456 | " Open High Low Close Volume Dividends Stock Splits\n",
457 | "Date \n",
458 | "2022-09-05 11.978 12.432 11.910 12.290 12793436 0.0 0.0\n",
459 | "2022-09-06 12.258 12.282 11.786 11.952 17528169 0.0 0.0\n",
460 | "2022-09-07 11.790 11.970 11.540 11.618 17903241 0.0 0.0\n",
461 | "2022-09-08 11.638 11.756 11.410 11.582 16749834 0.0 0.0\n",
462 | "2022-09-09 11.622 11.818 11.586 11.682 10706661 0.0 0.0"
463 | ]
464 | },
465 | "execution_count": 74,
466 | "metadata": {},
467 | "output_type": "execute_result"
468 | }
469 | ],
470 | "source": [
471 | "import yfinance as yf\n",
472 | "eni_mi = yf.Ticker(\"ENI.MI\")\n",
473 | "hist = eni_mi.history(period=\"max\")\n",
474 | "hist.tail()"
475 | ]
476 | },
477 | {
478 | "cell_type": "code",
479 | "execution_count": null,
480 | "id": "4f84984c",
481 | "metadata": {},
482 | "outputs": [],
483 | "source": []
484 | }
485 | ],
486 | "metadata": {
487 | "kernelspec": {
488 | "display_name": "Python 3 (ipykernel)",
489 | "language": "python",
490 | "name": "python3"
491 | },
492 | "language_info": {
493 | "codemirror_mode": {
494 | "name": "ipython",
495 | "version": 3
496 | },
497 | "file_extension": ".py",
498 | "mimetype": "text/x-python",
499 | "name": "python",
500 | "nbconvert_exporter": "python",
501 | "pygments_lexer": "ipython3",
502 | "version": "3.9.13"
503 | }
504 | },
505 | "nbformat": 4,
506 | "nbformat_minor": 5
507 | }
508 |
--------------------------------------------------------------------------------
/python/atn/microcircuites_and_descretes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Microcircuits and Descretes"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stdout",
17 | "output_type": "stream",
18 | "text": [
19 | "C:\\Users\\lselmi\\cygwin64\\home\\lselmi\\anaconda\\altertech\\datasets\n"
20 | ]
21 | }
22 | ],
23 | "source": [
24 | "cd C:\\Users\\lselmi\\cygwin64\\home\\lselmi\\anaconda\\altertech\\datasets"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 48,
30 | "metadata": {},
31 | "outputs": [
32 | {
33 | "data": {
34 | "text/plain": [
35 | "'0.22.0'"
36 | ]
37 | },
38 | "execution_count": 48,
39 | "metadata": {},
40 | "output_type": "execute_result"
41 | }
42 | ],
43 | "source": [
44 | "import pandas as pd\n",
45 | "import numpy as np\n",
46 | "from datetime import datetime\n",
47 | "from pandas import Series, DataFrame\n",
48 | "%matplotlib inline\n",
49 | "import matplotlib.pyplot as plt\n",
50 | "import warnings; warnings.simplefilter('ignore')\n",
51 | "plt.style.use('seaborn-whitegrid')\n",
52 | "pd.__version__"
53 | ]
54 | },
55 | {
56 | "cell_type": "markdown",
57 | "metadata": {},
58 | "source": [
59 | "### Read the data"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 85,
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "sales = pd.read_csv('PurchaseData_20180319.csv')"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "metadata": {},
74 | "source": [
75 | "# Microcircuits Data Analysis\n",
76 | "In order to predict the price (POP_UnitPrice) and the delivery time (PO_Date) of microcircuits we have received from ATN, with the data, a list of features that should be relevant for the task at hand and a set of rules. The relevant features are a subset of the fields in the data set.\n",
77 | "\n",
78 | "1. Component number (ComponentNumber_MAT_Flight)\n",
79 | "2. Specification name (SpecificationName)\n",
80 | "3. Family path (FamilyPath_Flight)\n",
81 | "4. Style (Style_Flight)\n",
82 | "5. Quality level (QLevel_Flight)\n",
83 | "6. Package class (PACKAGECLASS)\n",
84 | "7. Package (PACKAGE)\n",
85 | "8. Finish (FINISH)\n",
86 | "9. Radiation level (TID_HDR_N)\n",
87 | "10. Quality Value Name (QualityValueName)\n",
88 | "11. Manufacturer (MnfrDoeeetName)\n",
89 | "12. Quantity (POP_Qty)\n",
90 | "13. Date of purchase (PO_Date)\n",
91 | "14. Unit price (POP_UnitPrice)\n",
92 | "15. Date of delivery (POP_DeliveryDate)\n",
93 | "\n",
94 | "The assumption is that the data and the rules should allow us to predict the price and delivery time of a microcircuit whether there are records about that specific microcircuit in the sample data set or not. The first 12 paramenters are called features or predictors while the unit price and the date of delivery are called targets. A client, requesting a prediction about the price of a component, will send in the request the predictors that will allow the server to \n",
95 | "\n",
96 | "1. Identify the component (component number, specification name, family path)\n",
97 | "2. Determine the quality characteristics of the component (style, package, package class, finish, radiation level, quality value)\n",
98 | "3. Use other information that might impact the price (manufacturer, quantity, date of purchase) \n",
99 | "\n",
100 | "In order to make a prediction of the price of a component, the algorithm looks into the data to find records about that same component and return the unit price or an average value. In case no records are available about that component number, it looks for records with the same specification name and applies some rules to make a prediction for the price. \n",
101 | "\n",
102 | "The component number and the specification name encode, among other information, the specific family of the component, e.g. whether it is an operational amplifier or an analog to digital converter. When there are no records with the same component number or specification name, the algorithm looks for records about similar components and apply some rules to make a prediction. So the next step in this case is to look for records with the same family path or the same most specific name in the family path. \n"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {},
108 | "source": [
109 | "## Data preparation for microcircuits\n",
110 | "Before implementing the algorithm to predict unit price and delivery time for a microcircuit, we have to extract the records from the sale orders data set and apply the following transformation\n",
111 | "\n",
112 | "1. Filter out the records about services (aka \"charges\") \n",
113 | "2. Select the records about microcircuits (family root -> microcircuits)\n",
114 | "3. Extract the most specific family of the component from the hierarchy (family path)\n",
115 | "4. Transform all the prices in euro\n",
116 | "5. Update the all the unit prices applying an increase of 5 % per year (using the compund interest formula) \n",
117 | "\n",
118 | "After the data is prepared we can implement the algorithms for microcircuits \n",
119 | "\n",
120 | "1. Price prediction\n",
121 | "2. Delivery time prediction\n"
122 | ]
123 | },
124 | {
125 | "cell_type": "markdown",
126 | "metadata": {},
127 | "source": [
128 | "#### 1. Filter out records about services"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 86,
134 | "metadata": {},
135 | "outputs": [
136 | {
137 | "data": {
138 | "text/plain": [
139 | "18508"
140 | ]
141 | },
142 | "execution_count": 86,
143 | "metadata": {},
144 | "output_type": "execute_result"
145 | }
146 | ],
147 | "source": [
148 | "sales = sales[sales['PRICE LABEL'] == 'MATERIAL UNIT PRICE']\n",
149 | "sales.index.size"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "#### 2. Select the records about microcircuits"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 87,
162 | "metadata": {},
163 | "outputs": [
164 | {
165 | "name": "stdout",
166 | "output_type": "stream",
167 | "text": [
168 | "Number of records for microcircuits: 3041\n"
169 | ]
170 | }
171 | ],
172 | "source": [
173 | "microcircuits_records = sales[sales['FamilyRoot'] == 'Microcircuits']\n",
174 | "num_microcircuits_records = microcircuits_records.index.size\n",
175 | "print(\"Number of records for microcircuits: \" + str(num_microcircuits_records))"
176 | ]
177 | },
178 | {
179 | "cell_type": "markdown",
180 | "metadata": {},
181 | "source": [
182 | "#### 3. Extract the family root and leaf "
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 88,
188 | "metadata": {},
189 | "outputs": [
190 | {
191 | "name": "stdout",
192 | "output_type": "stream",
193 | "text": [
194 | "Family root: Microcircuits, Family leaf: Operational Amplifier\n"
195 | ]
196 | }
197 | ],
198 | "source": [
199 | "microcircuits_records['family_leaf'] = [family.split(\"/\")[len(family.split(\"/\")) - 1] for family in microcircuits_records['FamilyPath_Flight'] ]\n",
200 | "microcircuits_records['family_root'] = [family.split(\"/\")[0] for family in microcircuits_records['FamilyPath_Flight'] ]\n",
201 | "print(\"Family root: \" + microcircuits_records['family_root'][0] + \", Family leaf: \" + microcircuits_records['family_leaf'][0])"
202 | ]
203 | },
204 | {
205 | "cell_type": "markdown",
206 | "metadata": {},
207 | "source": [
208 | "#### 4. Transform all the unit prices in US dollars to euros"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 89,
214 | "metadata": {},
215 | "outputs": [],
216 | "source": [
217 | "microcircuits_records['price_euros'] = microcircuits_records['POP_UnitPrice_CU'] * microcircuits_records['PO_Change'] * (microcircuits_records['PO_Currency'] == 'USD')\n",
218 | "microcircuits_records['price_euros'] += microcircuits_records['POP_UnitPrice_CU'] * (microcircuits_records['PO_Currency'] == 'EUR')"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 90,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "#microcircuits_records['price_euros_simple'] = [price * microcircuits_records['PO_Change'] for price in microcircuits_records['POP_UnitPrice_CU']]"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 | "#### 5. Update the unit prices\n",
235 | "The date of purchase is used to compute the adjusted price (AP) from the unit price (P) in each record using the formula \n",
236 | "\n",
237 | "> AP = P*(1 + %)^Y\n",
238 | "\n",
239 | "where % is the increase in price per year, e.g. 5 %, Y is the number of years since the date of purchase in the record."
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 117,
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "name": "stdout",
249 | "output_type": "stream",
250 | "text": [
251 | "Price in 2013: 9.85. Adjusted price: 12.571373390625004\n"
252 | ]
253 | }
254 | ],
255 | "source": [
256 | "# Change the type of PO_Date from int64 to str\n",
257 | "years_str = pd.Series(microcircuits_records['PO_Date']).astype('str')\n",
258 | "# Extract the 1st 4 digits\n",
259 | "years_str = [year_str[0:4] for year_str in years_str]\n",
260 | "# Change back from str to int and compute the number of years from the purchase date to 2018\n",
261 | "years = [2018 - int(year_str) for year_str in years_str]\n",
262 | "microcircuits_records['years'] = years\n",
263 | "microcircuits_records['adjusted_price'] = microcircuits_records['price_euros'] * np.power(1 + 0.05, microcircuits_records['years'])\n",
264 | "print(\"Price in \" + str(2018 - microcircuits_records['years'][0]) + \": \" + str(microcircuits_records['price_euros'][0]) + \". Adjusted price: \" + str(microcircuits_records['adjusted_price'][0]))"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "metadata": {},
270 | "source": [
271 | "## Price prediction for microcircuits\n",
272 | "The client will send all the 12 predictors, each mapped to a field in the dataset\n",
273 | "\n",
274 | "1. Component number\n",
275 | "2. Specification name\n",
276 | "3. Family path\n",
277 | "4. Style \n",
278 | "5. Quality level\n",
279 | "6. Package class\n",
280 | "7. Package\n",
281 | "8. Finish \n",
282 | "9. Radiation level\n",
283 | "10. Quality Value Name\n",
284 | "11. Manufacturer\n",
285 | "12. Quantity\n",
286 | "\n",
287 | "In order to make a prediction the algorithm must find some records about the same component or a similar one in the sale orders. The following three scenarios might happen\n",
288 | "\n",
289 | "1. The component number in the request matches with a component number in the dataset\n",
290 | "2. The component number in the request does not match with any in the dataset but the specification name matches\n",
291 | "3. Neither the component number nor the specification name in the request matches with a record in the sale orders"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 | "### Scenario 1\n",
299 | "The component number in the request matches with a component number in the datase.\n",
300 | "\n",
301 | "In this scenario the only parameters to use to predict the price are\n",
302 | "\n",
303 | "- Manufacturer\n",
304 | "- Date of purchase\n",
305 | "- Quantity\n",
306 | "\n",
307 | "The manufacturer is used to select the records with the same manufacturer. If the manufacturer is different the records with the different manufacturer will be used. \n",
308 | "\n",
309 | "Compute the average adjusted price for the same quantity in the sale orders and the standard deviation. This step can be performed in the data preparation phase.\n",
310 | "\n",
311 | "If more than one records are availabe with different quantities use a linear interpolation average adjusted prices to find the average adjusted price for the quantity requested. If only one record is available, returns the adjusted price.\n"
312 | ]
313 | },
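  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch of Scenario 1 (not part of the original analysis).\n",
    "# It assumes the prepared microcircuits_records dataframe built above; the arguments\n",
    "# component_number, manufacturer and quantity are hypothetical request parameters.\n",
    "def predict_price_scenario1(records, component_number, manufacturer, quantity):\n",
    "    same = records[records['ComponentNumber_MAT_Flight'] == component_number]\n",
    "    same_mnfr = same[same['MnfrDoeeetName'] == manufacturer]\n",
    "    if same_mnfr.index.size > 0:  # prefer records from the same manufacturer\n",
    "        same = same_mnfr\n",
    "    # average adjusted price for each quantity found in the sale orders\n",
    "    by_qty = same.groupby('POP_Qty')['adjusted_price'].mean().sort_index()\n",
    "    if by_qty.size == 0:\n",
    "        return None  # no match: fall through to Scenario 2\n",
    "    if by_qty.size == 1:\n",
    "        return by_qty.iloc[0]\n",
    "    # linear interpolation of the average adjusted prices over the quantities\n",
    "    return np.interp(quantity, by_qty.index.values, by_qty.values)"
   ]
  },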
314 | {
315 | "cell_type": "markdown",
316 | "metadata": {},
317 | "source": [
318 | "### Scenario 2\n",
319 | "The component number in the request does not match with any in the dataset but the specification name matches.\n",
320 | "\n",
321 | "In this scenario the algorithm must select the records that are about a similar component using the specification name and the family path.\n",
322 | "\n",
323 | "It must also filter the records, about the same specification name and family path"
324 | ]
325 | },
326 | {
327 | "cell_type": "markdown",
328 | "metadata": {},
329 | "source": [
330 | "### Scenario 3\n",
331 | "Neither the component number nor the specification name in the request matches with a record in the sale orders"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {},
338 | "outputs": [],
339 | "source": []
340 | }
341 | ],
342 | "metadata": {
343 | "kernelspec": {
344 | "display_name": "Python 3",
345 | "language": "python",
346 | "name": "python3"
347 | },
348 | "language_info": {
349 | "codemirror_mode": {
350 | "name": "ipython",
351 | "version": 3
352 | },
353 | "file_extension": ".py",
354 | "mimetype": "text/x-python",
355 | "name": "python",
356 | "nbconvert_exporter": "python",
357 | "pygments_lexer": "ipython3",
358 | "version": "3.6.4"
359 | }
360 | },
361 | "nbformat": 4,
362 | "nbformat_minor": 2
363 | }
364 |
--------------------------------------------------------------------------------
/python/linalgebra/linalgebra_ch1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "9105420b",
6 | "metadata": {},
7 | "source": [
8 | "# Linear Algebra\n",
9 | "This notebook contains examples of linear algebra in Python. The examples are based on Gilbert Strang's book [Introduction to Linear Algebra, 5th Edition](https://math.mit.edu/~gs/linearalgebra/) and on Robert Johansson's book [Numerical Python](https://jrjohansson.github.io/numericalpython.html)."
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 12,
15 | "id": "02642e2c",
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "import numpy as np\n",
20 | "import math\n",
21 | "import matplotlib.pyplot as plt"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "id": "38696b67",
27 | "metadata": {},
28 | "source": [
29 | "## Chapter 1"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "id": "14367992",
35 | "metadata": {},
36 | "source": [
37 | "### 1.1 Vectors and Linear Combinations"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 6,
43 | "id": "7321c736",
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "(array([1, 1]), array([2, 3]))"
50 | ]
51 | },
52 | "execution_count": 6,
53 | "metadata": {},
54 | "output_type": "execute_result"
55 | }
56 | ],
57 | "source": [
58 | "v = np.array([1,1])\n",
59 | "w = np.array([2,3])"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "id": "3dd71d29",
65 | "metadata": {},
66 | "source": [
67 | "we can add two vectors"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 8,
73 | "id": "6ea79ae8",
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "data": {
78 | "text/plain": [
79 | "array([3, 4])"
80 | ]
81 | },
82 | "execution_count": 8,
83 | "metadata": {},
84 | "output_type": "execute_result"
85 | }
86 | ],
87 | "source": [
88 | "v + w"
89 | ]
90 | },
91 | {
92 | "cell_type": "markdown",
93 | "id": "1b664a55",
94 | "metadata": {},
95 | "source": [
96 | "we can multiply a vector by a scalar"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 9,
102 | "id": "f1a51219",
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "data": {
107 | "text/plain": [
108 | "array([2, 2])"
109 | ]
110 | },
111 | "execution_count": 9,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "2 * v"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "id": "92622bf0",
123 | "metadata": {},
124 | "source": [
125 | "we can compute a linear combination of two vectors"
126 | ]
127 | },
128 | {
129 | "cell_type": "code",
130 | "execution_count": 43,
131 | "id": "90f7d9b9",
132 | "metadata": {},
133 | "outputs": [
134 | {
135 | "data": {
136 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZYAAAEKCAYAAAAxXHOuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8/fFQqAAAACXBIWXMAAAsTAAALEwEAmpwYAAAZaElEQVR4nO3df5RcdZnn8fdDhyAJMPwKvxKCwEZy4hAYbAMrKAMsDIGRiIMLQYybBXOiZpXdZWdyDqt4dhQHx2XPsiDYODmDuwPIDCA5KxJcxnN0BDQdJYHwyxiDdAImQRBBSNLm2T+qklQq1Z3q9L1d1d3v1zl9uuve+6083lz85D5PVXVkJpIkFWWvVhcgSRpZDBZJUqEMFklSoQwWSVKhDBZJUqEMFklSoUoNlog4PyKei4hVEbGwwf5ZEbEiIp6IiO6IOKPZtZKk9hRlvY8lIjqA54FzgR5gKTA7M5+uOWY/4M3MzIiYDtyTmVObWStJak9l3rHMAFZl5urM3AzcDcyqPSAz38gdyTYeyGbXSpLa05gSn3si8GLN4x7g1PqDIuJi4MvAYcCFA1lbXT8PmAcwfvz490ydOnXQhUvSaLFs2bKNmTmhyOcsM1iiwbZd+m6ZeT9wf0R8APhr4N80u7a6vgvoAujs7Mzu7u49LliSRpuIeKHo5yyzFdYDHF3zeBKwrq+DM/MHwPERcehA10qS2keZwbIUmBIRx0bEWOAyYHHtARHxryIiqj+fAowFXmlmrSSpPZXWCsvM3ohYACwBOoBFmbkyIuZX998G/AUwJyK2AG8Bl1aH+Q3XllWrJKk4pb3cuBWcsUjSwETEsszsLPI5fee9JKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUKUGS0ScHxHPRcSqiFjYYP9HI2JF9evRiDipZt+aiHgyIp6IiO4y65QkFWdMWU8cER3ALcC5QA+wNCIWZ+bTNYf9EjgzM1+NiJlAF3Bqzf6zMnNjWTVKkopX5h3LDGBVZq7OzM3A3cCs2gMy89HMfLX68HFgUon1SJKGQJnBMhF4seZxT3VbX64EvlvzOIGHI2JZRMwroT5JUglKa4UB0WBbNjww4iwqwXJGzebTM3NdRBwGfC8ins3MHzRYOw+YBzB58uTBVy1JGpQy71h6gKNrHk8C1tUfFBHTgW8AszLzlW3bM3Nd9ft64H4qrbVdZGZXZnZmZueECRMKLF+StCfKDJalwJSIODYixgKXAYtrD4iIycB9wMcy8/ma7eMjYv9tPwPnAU+VWKskqSCltcIyszciFgBLgA5gUWaujIj51f23AZ8HDgG+FhEAvZnZCRwO3F/dNga4MzMfKqtWSVJxIrPh2GNY6uzszO5u3/IiSc2KiGXVf9AXxnfeS5IKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKVWqwRMT5EfFcRKyKiIUN9n80IlZUvx6NiJOaXStJak+lBUtEdAC3ADOBacDsiJhWd9gvgTMzczrw10DXANZKktpQmXcsM4BVmbk6MzcDdwOzag/IzEcz89Xqw8eBSc2ulSS1pzKDZSLwYs3jnuq2vlwJfHegayNiXkR0R0T3hg0bBlGuJKkIZQZLNNiWDQ+MOItKsPzVQNdmZldmdmZm54QJE/aoUElSccaU+Nw9wNE1jycB6+oPiojpwDeAmZn5ykDWSpLaT5l3LEuBKRFxbESMBS4DFtceEBGTgfuAj2Xm8wNZK0lqT6XdsWRmb0QsAJYAHcCizFwZEfOr+28DPg8cAnwtIgB6q22thmvLqlWSVJzIbDi6GJY6Ozuzu7u71WVI0rAREcsys7PI5/Sd95KkQhkskqRCGSySpEIZLJKkQhksKsXW3MpIemGIpOYZLCrcS797ic/98+eovoRc0ihjsKhQS1Yt4eSvn8xR+x/V6lIktYjBokJs+cMWFv6/hZz/D+fz6luvcukfX9rqkiS1SJmfFaZR4oXXXmD2vbN5rOcxAC5814UcOu7QFlclqVUMFg3Kfc/cx5WLr+S1t1/bvm3O9DmtK0hSyxks2iNv977NNQ9fwy1Lb9lp+8H7HswFUy5oUVWS2oHBogF7/pXnufSfLuWJl5/YZd9l776MfcbsM/RFSWobDu81IKtfXc3se2ez/OXlDffPOck2mDTaGSwakOMOOo5l85bx7IJn2XfMvjvte9ch72LGxBktqkxSuzBYNGC9W3u5avFVvNX7FgAHvuNAoDK0902RkgwWDdh137+OH/7qhwDMPXkuX7vgawBcMf2KVpYlqU04vNeALFm1hOv/5XoA3j3h3dx8wc3s07EPP/zVDznmwGNaXJ2kdmCwqGlrX1/LFfdX7krG7T2Oez5yD+P2HgfATTNvamVpktqIrTA1pXdrL7Pvnc3G328E4NYLb2XahGnb94/Zy3+jSKowWNSU+rmKLyuW1BeDRbvVaK4iSX0xWNSv/uYqktSIwaI+7W6uIkmNGCzqk3MVSXvCYFFDzlUk7SmDRbtwriJpMAwW7cS5iqTBKjVYIuL8iHguIlZFxMIG+6dGxGMRsSkirqnbtyYinoyIJyKiu8w6tYNzFUmDVdrbpSOiA7gFOBfoAZZGxOLMfLrmsN8AnwE+1MfTnJWZG8uqUTtzriKpCLu9Y4mIBRFx0B489wxgVWauzszNwN3ArNoDMnN9Zi4FtuzB86tAzlUkFaWZVtgRVO427qm2tpr9hRsTgRdrHvdUtzUrgYcjYllEzOvroIiYFxHdEdG9YcOGATy9tnGuIqlIuw2WzPyvwBTg74B/B/w8Iq6PiON3s7RRAOUAajs9M08BZgKfjogP9FFfV2Z2ZmbnhAkTBvD02sa5iqQiNTW8z8wEXq5+9QIHAf8UEV/pZ1kPcHTN40nAumYLy8x11e/rgfuptNZUMOcqkorWzIzlMxGxDPgK8CPgxMz8JPAe4C/6WboUmBIRx0bEWOAyYHEzRUXE+IjYf9vPwHnAU82sVfOcq0gqQzOvCjsU+HBmvlC7MTO3RsSf97UoM3sjYgGwBOgAFmXmyoiYX91/W0QcAXQDBwBbI+JqYFr1z7y/Os4ZA9yZmQ8N+H+d+uRcRVJZdhssmfn5fvY9s5u1DwIP1m27rebnl6m0yOq9Dpy0u9q055yrSCqL77wfhZyrSCqTwTLKOFeRVDaDZYTZtAmefrrxPucqkoaCwTKCrF8P55wDW/r4HAPnKpKGQmmfFaahtWIFfPCDlZ+nT991v3MVSUPFO5YR4Nvfhve9D37
1K7joIqj/0B3nKpKGksEyjGXC9dfDxRfDm29Wtl100c7HOFeRNNRshQ1Tb70FV14Jd921Y9sBB8CZZ+58nHMVSUPNYBmG1q2DD30Ili7defvMmTB27I7HzlUktYLBMsx0d8OsWZVwqVfbBnOuIqlVnLEMI889V2l/bWzwOzU7Oip3LOBcRVJrGSzDyAknwPLl8OyzsO++O+8780w4qPp7Pp2rSGolg2WYyYRPfKIyvAc48cTK921tMOcqklrNYBlmurrgkUcqP8+dCw88APvsU3lzpHMVSe3A4f0wsmYNXHNN5eeJE+HGG+HAA2HRIpj8zl7OvsO5iqTWM1iGiUy46ip4443K49tvr4QKwOWXw7WPOFeR1B5shQ0T9S2wba8AA+cqktqLwTIMNGqBbeNcRVK7MVjaXH8tMN+vIqkdGSxtrr8WmO9XkdSODJY21l8LzLmKpHZlsLSp/lpgzlUktTODpU311QJzriKp3Rksbai/FphzFUntzmBpM/21wJyrSBoODJY201cLzLmKpOHCYGkjfbXAnKtIGk5KDZaIOD8inouIVRGxsMH+qRHxWERsiohrBrJ2pOmvBeZcRdJwUlqwREQHcAswE5gGzI6I+n9m/wb4DPDVPVg7ovTVAnOuImm4KfOOZQawKjNXZ+Zm4G5gVu0Bmbk+M5cCWwa6diTpqwXmXEXScFRmsEwEXqx53FPdVujaiJgXEd0R0b1hw4Y9KrSV+mqBOVeRNFyVGSzRYFsWvTYzuzKzMzM7J0yY0HRx7aKvFphzFUnDVZnB0gMcXfN4ErBuCNYOG321wJyrSBrOygyWpcCUiDg2IsYClwGLh2DtsNBXC8y5iqThrrRfTZyZvRGxAFgCdACLMnNlRMyv7r8tIo4AuoEDgK0RcTUwLTNfb7S2rFpboVELzLmKpJEgMpsde7S/zs7O7O7ubnUZu7VmDZx4YuVuZeJEeOqpyt3KtY9cu70FNvfkuSyataildUoa+SJiWWZ2FvmcvvN+iPXVAnOuImmkMFiGWKMWmHMVSSOJwTKEGr0KzLmKpJGmtOG9dtZXC+zaR3y/iqSRxTuWIdKoBeZcRdJIZLAMgUYtMOcqkkYqg6VkjVpg+x3gXEXSyOWMpWSNWmDOVSSNZN6xlKhRC8y5iqSRzmApSaMW2Jt7OVeRNPLZCitJfQvs3D/r5ew7nKtIGvm8YylBoxaYv19F0mhhsBSsUQvsxxudq0gaPWyFFay+BTb99LWc/HXnKpJGD4OlQPUtsK98tZcP+34VSaOMrbCCNGqB/Y+fOVeRNPoYLAWpb4HtNcW5iqTRyVZYAepbYNf8t7WceZdzFUmjk3csg1TfArutq5f533OuImn0MlgGqb4F9tg+zlUkjW62wgahvgV24WeXcMm3natIGt28Y9lD9S2wL9+ylvkPO1eRJINlD9W2wD4+t5fbf+NcRZLAYNkj9S2wQy5xriJJ2xgsA1TfApv/t0u4calzFUnaxmAZoNoW2L+9ai3/s8e5iiTVMlgGoLYFdtSkXl6c4VxFkuqVGiwRcX5EPBcRqyJiYYP9ERE3VfeviIhTavatiYgnI+KJiOgus85m1LfA3v+563hsnXMVSapX2vtYIqIDuAU4F+gBlkbE4sx8uuawmcCU6tepwK3V79uclZkby6pxIGpbYOd9cgnfesm5iiQ1UuYdywxgVWauzszNwN3ArLpjZgHfzIrHgQMj4sgSa9ojtS2wI6asZdk7natIUl/KDJaJwIs1j3uq25o9JoGHI2JZRMwrrcrd2KkFtlcvh8yfzStvOVeRpL6U+ZEu0WBbDuCY0zNzXUQcBnwvIp7NzB/s8odUQmcewOTJkwdTb0O1LbDpn72OFb9zriJJ/SnzjqUHOLrm8SRgXbPHZOa27+uB+6m01naRmV2Z2ZmZnRMmTCio9IraFtghpy5hxR85V5Gk3SkzWJYCUyLi2IgYC1wGLK47ZjEwp/rqsNOA32bmSxExPiL2B4iI8cB5wFMl1rqLnVpg+69lywd3M1fp7YXvfKeyUJJGsdKCJTN7gQXAEuAZ4J7MXBkR8yNifvWwB4HVwCrgduBT1e2HA/8SEcuBnwDfycyHyqq1ke0tsL16OfzTs3m9t4+5SiY88ABMn165xYlG3T1JGj0iR9C/sDs7O7O7e/BveVmzBk48sXK3st+sa3njTyotsLknz2XRrEU7DvzRj+Av/xIefRSOPx6efhrGjh30ny9JQyUilmVmZ5HP6Tvv6+zUAjt+yfZQ2WmusnIlzJoFZ5xRCRWAL33JUJEk/EVfu9jeAtt/LftcfgWbqJmrvPwKfOE/wN//PWzdumPRe94DH/lIiyqWpPZisNTY/iqwvXoZe/lsNnVU5yp/+lWmffUOuOkmePvtXRfecAPs5c2fJIHBst1OLbCzr2PzkdX3q7zrUuZ8dhH0Nbs591w455yhK1SS2pz/zK7a3gI7fgl8oGaucski+MlP4KGHYEyDHP6bvxnaQiWpzRks1LTA9l/LXpc0eL/Kxo2VA3p7d144ezaccsouzydJo9moD5btLbDf98Ils9m6b937VTZsgLPPhqeq78+cPRsOOgj23hu++MUWVi5J7WnUz1i2t8DOvg6OqfscsPpQufxy+OY34ROfgP32g+OOa13hktSmRnWwbG+B1c9VLri571Dp6IC5c+GEE1pWtyS1s1EbLNtbYLEWPlw3V3ntzb5DBeD9729R1ZLU/kbtjKWrCx75fmWuwviauQoT+g8VSVK/RmWwbG+B/WndXOWomYaKJA3SqAuW7S2ww+vmKu+9zlCRpAKMumDp6oJHflI3VznnNsb92Z8bKpJUgFEVLGvWwH/+L3VzlfffwLRLPmmoSFJBRk2wbGuBvfnemrnK1NnM+dTXDRVJKtCoCZauLnhkTc1c5eCp3Pzl5YaKJBVsVATLmjXwn75QM1fpGMc9d/UybvnTlQMMFUkqzIgPlkz491f18vuZNXOVxw5i2mOrKgcYKpJUqBEfLF1d8P2smav88kDmPLi2stNQkaTCjehgWbMGrr55x1xl6m/35eY7X6vsNFQkqRQjNlgy4YpPreXtmZW5yj5bxnDv/3mLcVswVCSpRCM2WG79ei8/OmLHXKXr//YybQOGiiSVbEQGy5o1cPUDO+YqH/3Z3sxZjqEiSUNgxAVLJlx8zRK2nFaZqxyzfj+6HtxiqEjSEBlxwXLDrWt54rjKXGXM5rE8+I9vMO4jhookDZURFSybNiXX/nTHXOV/faeDaecaKpI0lEbUb5B8/qV1bD36ZQDO/dlE5r/7TENFkoZYqXcsEXF+RDwXEasiYmGD/RERN1X3r4iIU5pd28jmsZVQOXD9UXx7v9MNFUlqgdKCJSI6gFuAmcA0YHZETKs7bCYwpfo1D7h1AGsb/7mb9+W7v/3XjLvjTkNFklqgzDuWGcCqzFydmZuBu4FZdcfMAr6ZFY8DB0bEkU2ubeg//uKDnHbHtwwVSWqRMmcsE4EXax73AKc2cczEJtcCEBHzqNztAGy68Vv3PHXjt+4ZRNmlOxTY2OoimmCdxbLOYllncU
4o+gnLDJZosC2bPKaZtZWNmV1AF0BEdGdm50CKHGrDoUawzqJZZ7GsszgR0V30c5YZLD3A0TWPJwHrmjxmbBNrJUltqMwZy1JgSkQcGxFjgcuAxXXHLAbmVF8ddhrw28x8qcm1kqQ2VNodS2b2RsQCYAnQASzKzJURMb+6/zbgQeACYBXwe2Buf2ub+GO7iv9fUrjhUCNYZ9Gss1jWWZzCa4zMhqMLSZL2yIj6SBdJUusZLJKkQrVtsJTxcTARcXBEfC8ifl79flCr6oyIoyPi+xHxTESsjIjP1qz5QkSsjYgnql8XtKrO6r41EfFktZbumu2Fns9BnMsTas7VExHxekRcXd3XinM5NSIei4hNEXFNM2tbdG02rLMNr83+zueQXJuDqbMNr8+PVv/7WRERj0bESbtbO+DzmZlt90VlYP8L4DgqLz1eDkyrO+YC4LtU3vNyGvDj3a0FvgIsrP68ELihhXUeCZxS/Xl/4PmaOr8AXNMO57O6bw1waIPnLex8DrbGuud5GTimhefyMOC9wJdq/+w2vDb7qrPdrs2GdQ7VtVlEnW12fb4POKj680xK+P/Odr1jKevjYGYBd1R/vgP4UKvqzMyXMvOnAJn5O+AZKp84UIbBnM/+FHk+i6rxHOAXmfnCIGoZVJ2ZuT4zlwJbBrB2yK/Nvupst2uzn/PZn7Y5n3Xa4fp8NDNfrT58nMr7BHe3dkDns12Dpa+PemnmmP7WHp6V98lQ/X5YC+vcLiLeCfwJ8OOazQuqt6qLCriNH2ydCTwcEcui8hE62xR5Pgs5l1Te83RX3bahPpd7srYV1+Zutcm12Z+huDaLqHObdrs+r6TSBdjd2gGdz3YNliH5OJgCDKbOys6I/YB7gasz8/Xq5luB44GTgZeA/97iOk/PzFOo3DZ/OiI+MMh6GiniXI4FLgL+sWZ/K85lGWsHatB/Vhtdm/0ZimsTijmfbXV9RsRZVILlrwa6dnfaNVgG83Ew/a399bbWSfX7+hbWSUTsTeU/3H/IzPu2HZCZv87MP2TmVuB2KreoLaszM7d9Xw/cX1NPkedzUDVWzQR+mpm/3rahRedyT9a24trsU5tdm30aomtz0HVWtc31GRHTgW8AszLzlSbWDuh8tmuwlPVxMIuBj1d//jjwQKvqjIgA/g54JjNvrF1QNze4GHiqhXWOj4j9q3WNB86rqafI8zmYv/NtZlPXZmjRudyTta24Nhtqw2uzrzqH6tocVJ012uL6jIjJwH3AxzLz+SbXDux8NvNKg1Z8UXkF0PNUXqVwbXXbfGB+9eeg8svAfgE8CXT2t7a6/RDgEeDn1e8Ht6pO4Awqt5krgCeqXxdU9/3v6rErqn+hR7awzuOovDpkObCyzPM5yL/zccArwB/VPWcrzuURVP719zrwWvXnA9rw2mxYZxtem33VOWTXZgF/7+10fX4DeLXm77a7v7V7cj79SBdJUqHatRUmSRqmDBZJUqEMFklSoQwWSVKhDBZJUqEMFklSoQwWSVKhDBapJBHx3uqHC76j+i7xlRHxx62uSyqbb5CUShQRXwTeAewL9GTml1tcklQ6g0UqUfUzl5YCbwPvy8w/tLgkqXS2wqRyHQzsR+U3Mb6jxbVIQ8I7FqlEEbGYym/iO5bKBwwuaHFJUunGtLoAaaSKiDlAb2beGREdwKMRcXZm/nOra5PK5B2LJKlQzlgkSYUyWCRJhTJYJEmFMlgkSYUyWCRJhTJYJEmFMlgkSYX6/7IN1OLRibb1AAAAAElFTkSuQmCC\n",
137 | "text/plain": [
138 | ""
139 | ]
140 | },
141 | "metadata": {
142 | "needs_background": "light"
143 | },
144 | "output_type": "display_data"
145 | }
146 | ],
147 | "source": [
148 | "V = np.array([v, w, v + w])\n",
149 | "origin = np.array([[0, 0, 0],[0, 0, 0]]) # origin point\n",
150 | "fig, ax = plt.subplots()\n",
151 | "ax.quiver(*origin, V[:,0], V[:,1], color=['r','b','g'], scale=10)\n",
152 | "ax.set(xlim=(0, 0.2), ylim=(0, 0.3))\n",
153 | "plt.xlabel('x')\n",
154 | "plt.ylabel('y')\n",
155 | "plt.show()\n"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "id": "8cfe903f",
161 | "metadata": {},
162 | "source": [
163 | "### 1.2 Lengths and Dot products\n",
164 | "The inner product of two vectors is implemented in NumPy by the dot() function."
165 | ]
166 | },
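  {
   "cell_type": "markdown",
   "id": "added-dot-note",
   "metadata": {},
   "source": [
    "As a small sketch (the vectors here are only illustrative), `np.dot(a, b)` returns the sum of the element-wise products $a_1 b_1 + a_2 b_2 + \\dots + a_n b_n$:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "added-dot-example",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# illustrative vectors, not from the original notebook\n",
    "a = np.array([1, 2, 3])\n",
    "b = np.array([4, 5, 6])\n",
    "\n",
    "# np.dot computes 1*4 + 2*5 + 3*6 = 32, the same as the explicit element-wise sum\n",
    "np.dot(a, b), np.sum(a * b)"
   ]
  },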
167 | {
168 | "cell_type": "code",
169 | "execution_count": 20,
170 | "id": "59305e51",
171 | "metadata": {},
172 | "outputs": [
173 | {
174 | "data": {
175 | "text/plain": [
176 | "(3, 2)"
177 | ]
178 | },
179 | "execution_count": 20,
180 | "metadata": {},
181 | "output_type": "execute_result"
182 | }
183 | ],
184 | "source": [
185 | "data = np.array([[1, 2], [3, 4], [5, 6]])\n",
186 | "rows, cols = data.shape\n",
187 | "rows, cols"
188 | ]
189 | },
190 | {
191 | "cell_type": "markdown",
192 | "id": "200ad0db",
193 | "metadata": {},
194 | "source": [
195 | "Two vectors whose internal product is zero are orthogonal"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 9,
201 | "id": "20ff3e58",
202 | "metadata": {},
203 | "outputs": [
204 | {
205 | "data": {
206 | "text/plain": [
207 | "0"
208 | ]
209 | },
210 | "execution_count": 9,
211 | "metadata": {},
212 | "output_type": "execute_result"
213 | }
214 | ],
215 | "source": [
216 | "v = np.array([1, 3, 2])\n",
217 | "w = np.array([4, -4, 4])\n",
218 | "np.dot(v, w)"
219 | ]
220 | },
221 | {
222 | "cell_type": "markdown",
223 | "id": "b338209e",
224 | "metadata": {},
225 | "source": [
226 | "The length of a vector is defined as $||\\vec{v}|| = \\sqrt{\\vec{v} \\cdot \\vec{v}} $"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 13,
232 | "id": "d0540e9f",
233 | "metadata": {},
234 | "outputs": [
235 | {
236 | "data": {
237 | "text/plain": [
238 | "3.7416573867739413"
239 | ]
240 | },
241 | "execution_count": 13,
242 | "metadata": {},
243 | "output_type": "execute_result"
244 | }
245 | ],
246 | "source": [
247 | "math.sqrt(np.dot(v,v))"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "id": "79216113",
253 | "metadata": {},
254 | "source": [
255 | "so that the unit vector\n",
256 | "\n",
257 | "$$ \\frac{\\vec{v}}{||\\vec{v}||} $$\n",
258 | "\n",
259 | "has length 1"
260 | ]
261 | },
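  {
   "cell_type": "markdown",
   "id": "added-unit-vector-note",
   "metadata": {},
   "source": [
    "A quick check, reusing the vector $\\vec{v} = [1, 3, 2]$ from above: dividing by its length gives a vector whose length is 1."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "added-unit-vector-example",
   "metadata": {},
   "outputs": [],
   "source": [
    "import math\n",
    "import numpy as np\n",
    "\n",
    "v = np.array([1, 3, 2])                      # restated here so the cell runs on its own\n",
    "v_len = math.sqrt(np.dot(v, v))              # ||v|| = sqrt(14)\n",
    "v_unit = v / v_len                           # the unit vector v / ||v||\n",
    "v_unit, math.sqrt(np.dot(v_unit, v_unit))    # its length is 1 (up to rounding)"
   ]
  },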
262 | {
263 | "cell_type": "markdown",
264 | "id": "17eeb99d",
265 | "metadata": {},
266 | "source": [
267 | "The angle $\\theta$ between two vectors $\\vec{v}$ and $\\vec{w}$ is defined as\n",
268 | "\n",
269 | "$$ \\cos{\\theta} = \\frac{\\vec{v} \\cdot \\vec{w}}{||\\vec{v}|| ||\\vec{w}||} $$"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 16,
275 | "id": "550c437f",
276 | "metadata": {},
277 | "outputs": [
278 | {
279 | "data": {
280 | "text/plain": [
281 | "0.7071067811865475"
282 | ]
283 | },
284 | "execution_count": 16,
285 | "metadata": {},
286 | "output_type": "execute_result"
287 | }
288 | ],
289 | "source": [
290 | "v = np.array([1, 0])\n",
291 | "w = np.array([1, 1])\n",
292 | "len_v = math.sqrt(np.dot(v, v))\n",
293 | "len_w = math.sqrt(np.dot(w, w))\n",
294 | "np.dot(v, w) / (len_v * len_w)"
295 | ]
296 | },
297 | {
298 | "cell_type": "markdown",
299 | "id": "a095e2a2",
300 | "metadata": {},
301 | "source": [
302 | "### 1.3 Matrices\n",
303 | "A matrix is a list of vectors\n",
304 | "\n",
305 | "$$ A = \\begin{bmatrix} 1 & 2 \\\\ 3 & 4 \\\\ 5 & 6 \\end{bmatrix}$$\n",
306 | "\n",
307 | "We can combine the matrix vectors by computig the inner product between the matrix and a vector, e.g. $\\vec{x} = [7, 8]$ that represent how we want to combine the matrix vectors\n",
308 | "\n",
309 | "$$ \\begin{bmatrix} 1 & 2 \\\\ 3 & 4 \\\\ 5 & 6 \\end{bmatrix} \\begin{bmatrix} 7 \\\\ 8 \\end{bmatrix} = \\begin{bmatrix} 23 \\\\ 53 \\\\ 83 \\end{bmatrix}$$"
310 | ]
311 | },
312 | {
313 | "cell_type": "code",
314 | "execution_count": 22,
315 | "id": "9cf3b36d",
316 | "metadata": {},
317 | "outputs": [
318 | {
319 | "data": {
320 | "text/plain": [
321 | "(3, 2)"
322 | ]
323 | },
324 | "execution_count": 22,
325 | "metadata": {},
326 | "output_type": "execute_result"
327 | }
328 | ],
329 | "source": [
330 | "A = np.array([[1, 2], [3, 4], [5, 6]])\n",
331 | "A.shape # rows, columns"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": 25,
337 | "id": "4a5edfb1",
338 | "metadata": {},
339 | "outputs": [
340 | {
341 | "data": {
342 | "text/plain": [
343 | "array([23, 53, 83])"
344 | ]
345 | },
346 | "execution_count": 25,
347 | "metadata": {},
348 | "output_type": "execute_result"
349 | }
350 | ],
351 | "source": [
352 | "x = np.array([7, 8])\n",
353 | "np.dot(A, x)"
354 | ]
355 | },
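  {
   "cell_type": "markdown",
   "id": "added-column-combination-note",
   "metadata": {},
   "source": [
    "The same result can be read as a weighted sum of the columns of $A$: $7 \\cdot \\text{col}_1 + 8 \\cdot \\text{col}_2$. A quick check that this matches `np.dot(A, x)`, with $A$ and $\\vec{x}$ restated so the cell runs on its own:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "added-column-combination-example",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "A = np.array([[1, 2], [3, 4], [5, 6]])\n",
    "x = np.array([7, 8])\n",
    "\n",
    "# A x as a combination of the columns of A: x[0]*A[:,0] + x[1]*A[:,1]\n",
    "x[0] * A[:, 0] + x[1] * A[:, 1]    # array([23, 53, 83]), the same as np.dot(A, x)"
   ]
  },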
356 | {
357 | "cell_type": "code",
358 | "execution_count": null,
359 | "id": "2856f488",
360 | "metadata": {},
361 | "outputs": [],
362 | "source": []
363 | }
364 | ],
365 | "metadata": {
366 | "kernelspec": {
367 | "display_name": "Python 3 (ipykernel)",
368 | "language": "python",
369 | "name": "python3"
370 | },
371 | "language_info": {
372 | "codemirror_mode": {
373 | "name": "ipython",
374 | "version": 3
375 | },
376 | "file_extension": ".py",
377 | "mimetype": "text/x-python",
378 | "name": "python",
379 | "nbconvert_exporter": "python",
380 | "pygments_lexer": "ipython3",
381 | "version": "3.7.12"
382 | }
383 | },
384 | "nbformat": 4,
385 | "nbformat_minor": 5
386 | }
387 |
--------------------------------------------------------------------------------
/python/finance/data/ENI.MI.csv:
--------------------------------------------------------------------------------
1 | Date,Open,High,Low,Close,Adj Close,Volume
2 | 2021-08-23,10.132000,10.234000,10.092000,10.234000,9.527454,15205357
3 | 2021-08-24,10.288000,10.358000,10.248000,10.318000,9.605656,13473152
4 | 2021-08-25,10.304000,10.340000,10.274000,10.330000,9.616826,8469631
5 | 2021-08-26,10.290000,10.368000,10.252000,10.306000,9.594483,10300966
6 | 2021-08-27,10.338000,10.450000,10.318000,10.428000,9.708061,14166843
7 | 2021-08-30,10.448000,10.492000,10.410000,10.432000,9.711784,8640464
8 | 2021-08-31,10.450000,10.472000,10.380000,10.454000,9.732265,18640305
9 | 2021-09-01,10.492000,10.616000,10.480000,10.496000,9.771366,19475031
10 | 2021-09-02,10.536000,10.680000,10.502000,10.632000,9.897977,17048493
11 | 2021-09-03,10.630000,10.660000,10.512000,10.530000,9.803019,14713759
12 | 2021-09-06,10.572000,10.678000,10.540000,10.586000,9.855153,10952359
13 | 2021-09-07,10.602000,10.664000,10.550000,10.582000,9.851429,11697761
14 | 2021-09-08,10.570000,10.694000,10.462000,10.590000,9.858876,18936416
15 | 2021-09-09,10.580000,10.654000,10.528000,10.586000,9.855153,13919894
16 | 2021-09-10,10.620000,10.652000,10.532000,10.544000,9.816052,12568028
17 | 2021-09-13,10.608000,10.848000,10.602000,10.806000,10.059963,28029686
18 | 2021-09-14,10.824000,11.008000,10.824000,10.904000,10.151198,21974204
19 | 2021-09-15,10.940000,11.068000,10.926000,10.990000,10.231260,20646537
20 | 2021-09-16,11.032000,11.198000,11.004000,11.024000,10.262913,30759090
21 | 2021-09-17,11.110000,11.164000,10.924000,10.942000,10.186575,36040734
22 | 2021-09-20,10.550000,10.584000,10.330000,10.414000,10.091608,26213320
23 | 2021-09-21,10.458000,10.708000,10.452000,10.606000,10.277664,19744092
24 | 2021-09-22,10.738000,10.928000,10.700000,10.886000,10.548996,22755868
25 | 2021-09-23,10.980000,10.998000,10.832000,10.902000,10.564502,15922933
26 | 2021-09-24,10.920000,10.978000,10.860000,10.932000,10.593573,12280924
27 | 2021-09-27,11.000000,11.208000,10.992000,11.186000,10.839709,22930665
28 | 2021-09-28,11.282000,11.438000,11.260000,11.264000,10.915295,28629086
29 | 2021-09-29,11.260000,11.404000,11.150000,11.388000,11.035456,17341462
30 | 2021-09-30,11.412000,11.578000,11.402000,11.546000,11.188564,27298469
31 | 2021-10-01,11.416000,11.544000,11.302000,11.482000,11.126546,19563954
32 | 2021-10-04,11.492000,11.758000,11.462000,11.642000,11.281592,25481124
33 | 2021-10-05,11.700000,11.914000,11.632000,11.848000,11.481215,28265202
34 | 2021-10-06,11.848000,11.848000,11.596000,11.666000,11.304850,22966517
35 | 2021-10-07,11.750000,11.798000,11.386000,11.588000,11.229264,25591317
36 | 2021-10-08,11.696000,11.900000,11.652000,11.858000,11.490906,25139901
37 | 2021-10-11,11.940000,12.072000,11.846000,12.006000,11.634323,22362197
38 | 2021-10-12,11.900000,12.012000,11.828000,11.974000,11.603314,16097752
39 | 2021-10-13,11.974000,11.992000,11.690000,11.822000,11.456019,19824484
40 | 2021-10-14,11.944000,12.046000,11.932000,11.980000,11.609128,19042947
41 | 2021-10-15,12.078000,12.226000,12.060000,12.208000,11.830070,21210629
42 | 2021-10-18,12.216000,12.304000,12.186000,12.236000,11.857203,17152230
43 | 2021-10-19,12.258000,12.294000,12.156000,12.172000,11.795185,16104721
44 | 2021-10-20,12.170000,12.350000,12.156000,12.318000,11.936666,13570949
45 | 2021-10-21,12.268000,12.318000,12.110000,12.134000,11.758361,15135627
46 | 2021-10-22,12.138000,12.218000,12.052000,12.078000,11.704095,13108483
47 | 2021-10-25,12.166000,12.336000,12.112000,12.278000,11.897903,14628741
48 | 2021-10-26,12.290000,12.500000,12.146000,12.358000,11.975427,19923852
49 | 2021-10-27,12.338000,12.442000,12.192000,12.268000,11.888213,19233474
50 | 2021-10-28,12.174000,12.224000,12.020000,12.164000,11.787433,16985428
51 | 2021-10-29,12.194000,12.484000,12.180000,12.404000,12.020003,26402029
52 | 2021-11-01,12.478000,12.796000,12.450000,12.746000,12.351416,24242601
53 | 2021-11-02,12.746000,12.832000,12.452000,12.540000,12.151793,21518514
54 | 2021-11-03,12.490000,12.496000,12.292000,12.380000,11.996746,16475518
55 | 2021-11-04,12.412000,12.640000,12.412000,12.560000,12.171174,16348057
56 | 2021-11-05,12.454000,12.740000,12.454000,12.672000,12.279706,14126171
57 | 2021-11-08,12.720000,12.834000,12.616000,12.682000,12.289397,13584199
58 | 2021-11-09,12.646000,12.754000,12.568000,12.620000,12.229316,10066288
59 | 2021-11-10,12.674000,12.826000,12.660000,12.664000,12.271954,16042075
60 | 2021-11-11,12.600000,12.698000,12.536000,12.646000,12.254511,10370345
61 | 2021-11-12,12.600000,12.620000,12.446000,12.512000,12.124660,13719125
62 | 2021-11-15,12.440000,12.586000,12.412000,12.546000,12.157606,10901389
63 | 2021-11-16,12.602000,12.698000,12.560000,12.598000,12.207996,12105355
64 | 2021-11-17,12.574000,12.666000,12.542000,12.592000,12.202183,10063683
65 | 2021-11-18,12.450000,12.488000,12.310000,12.430000,12.045198,18750020
66 | 2021-11-19,12.620000,12.762000,12.136000,12.168000,11.791309,27790201
67 | 2021-11-22,12.200000,12.360000,12.072000,12.328000,11.946356,16066193
68 | 2021-11-23,12.208000,12.472000,12.152000,12.380000,11.996746,14566945
69 | 2021-11-24,12.450000,12.552000,12.258000,12.370000,11.987056,14142661
70 | 2021-11-25,12.390000,12.398000,12.244000,12.296000,11.915346,10793759
71 | 2021-11-26,11.800000,11.828000,11.492000,11.530000,11.173059,36240991
72 | 2021-11-29,11.786000,12.016000,11.668000,11.800000,11.434702,22433835
73 | 2021-11-30,11.546000,11.700000,11.448000,11.642000,11.281592,26761672
74 | 2021-12-01,11.830000,12.060000,11.766000,11.904000,11.535482,17637484
75 | 2021-12-02,11.790000,11.986000,11.706000,11.940000,11.570367,18137069
76 | 2021-12-03,12.090000,12.214000,12.008000,12.010000,11.638201,18439109
77 | 2021-12-06,12.120000,12.360000,12.120000,12.292000,11.911470,13925642
78 | 2021-12-07,12.430000,12.616000,12.402000,12.568000,12.178926,15369236
79 | 2021-12-08,12.480000,12.550000,12.360000,12.390000,12.006436,11415930
80 | 2021-12-09,12.410000,12.422000,12.168000,12.266000,11.886275,12442861
81 | 2021-12-10,12.184000,12.366000,12.176000,12.206000,11.828133,8227832
82 | 2021-12-13,12.260000,12.278000,11.978000,12.020000,11.647891,12390937
83 | 2021-12-14,12.044000,12.186000,12.000000,12.148000,11.771928,11456932
84 | 2021-12-15,12.076000,12.120000,11.914000,12.000000,11.628510,14308412
85 | 2021-12-16,12.158000,12.266000,12.060000,12.198000,11.820380,17590204
86 | 2021-12-17,12.104000,12.176000,11.922000,12.052000,11.678900,19831352
87 | 2021-12-20,11.700000,11.834000,11.522000,11.834000,11.467649,16531786
88 | 2021-12-21,11.990000,12.198000,11.918000,12.180000,11.802938,11864621
89 | 2021-12-22,12.182000,12.196000,11.992000,12.174000,11.797123,9485605
90 | 2021-12-23,12.194000,12.338000,12.156000,12.240000,11.861080,9547017
91 | 2021-12-27,12.120000,12.310000,12.100000,12.294000,11.913408,6158602
92 | 2021-12-28,12.304000,12.470000,12.302000,12.404000,12.020003,9148633
93 | 2021-12-29,12.400000,12.476000,12.224000,12.280000,11.899841,9731333
94 | 2021-12-30,12.260000,12.300000,12.190000,12.220000,11.841700,8216497
95 | 2022-01-03,12.300000,12.478000,12.274000,12.408000,12.023879,9327410
96 | 2022-01-04,12.454000,12.708000,12.436000,12.610000,12.219625,15884351
97 | 2022-01-05,12.630000,12.778000,12.568000,12.756000,12.361106,16047955
98 | 2022-01-06,12.614000,12.794000,12.550000,12.650000,12.258387,15643210
99 | 2022-01-07,12.690000,12.800000,12.652000,12.790000,12.394053,12857361
100 | 2022-01-10,12.800000,12.882000,12.628000,12.684000,12.291335,13906778
101 | 2022-01-11,12.726000,12.820000,12.622000,12.810000,12.413435,10082884
102 | 2022-01-12,12.896000,13.086000,12.892000,13.052000,12.647943,22009160
103 | 2022-01-13,13.002000,13.054000,12.946000,13.024000,12.620810,13225765
104 | 2022-01-14,12.996000,13.184000,12.972000,13.160000,12.752599,15720891
105 | 2022-01-17,13.210000,13.270000,13.104000,13.212000,12.802989,12338054
106 | 2022-01-18,13.284000,13.334000,13.158000,13.270000,12.859194,18979901
107 | 2022-01-19,13.300000,13.474000,13.238000,13.420000,13.004550,19794212
108 | 2022-01-20,13.370000,13.404000,13.220000,13.290000,12.878574,19821766
109 | 2022-01-21,13.140000,13.206000,12.966000,13.106000,12.700271,17881760
110 | 2022-01-24,13.114000,13.170000,12.578000,12.686000,12.293273,20679442
111 | 2022-01-25,12.832000,13.142000,12.670000,13.094000,12.688643,18607939
112 | 2022-01-26,13.214000,13.488000,13.200000,13.484000,13.066569,24678628
113 | 2022-01-27,13.300000,13.840000,13.280000,13.812000,13.384415,25616509
114 | 2022-01-28,13.800000,13.802000,13.400000,13.574000,13.153783,19845858
115 | 2022-01-31,13.590000,13.640000,13.234000,13.308000,12.896017,20919460
116 | 2022-02-01,13.324000,13.456000,13.180000,13.456000,13.039436,16676711
117 | 2022-02-02,13.382000,13.498000,13.330000,13.356000,12.942532,13493904
118 | 2022-02-03,13.356000,13.420000,13.204000,13.288000,12.876637,13518177
119 | 2022-02-04,13.384000,13.562000,13.346000,13.480000,13.062693,17295995
120 | 2022-02-07,13.380000,13.418000,13.092000,13.186000,12.777794,16200649
121 | 2022-02-08,13.120000,13.318000,12.970000,13.036000,12.632438,12746726
122 | 2022-02-09,13.066000,13.330000,12.978000,13.252000,12.841751,12584136
123 | 2022-02-10,13.152000,13.366000,13.152000,13.298000,12.886327,10720904
124 | 2022-02-11,13.180000,13.556000,13.142000,13.528000,13.109206,14123872
125 | 2022-02-14,13.500000,13.592000,13.200000,13.324000,12.911522,20387459
126 | 2022-02-15,13.282000,13.432000,13.074000,13.172000,12.764228,15663413
127 | 2022-02-16,13.170000,13.378000,13.086000,13.308000,12.896017,13926631
128 | 2022-02-17,13.202000,13.380000,13.132000,13.334000,12.921212,14169615
129 | 2022-02-18,13.520000,13.600000,13.378000,13.470000,13.053002,19761436
130 | 2022-02-21,13.492000,13.536000,13.104000,13.314000,12.901832,15944636
131 | 2022-02-22,13.062000,13.548000,13.062000,13.492000,13.074321,18850071
132 | 2022-02-23,13.450000,13.618000,13.354000,13.426000,13.010364,14234323
133 | 2022-02-24,13.250000,13.718000,13.066000,13.362000,12.948346,35581182
134 | 2022-02-25,13.422000,13.830000,13.394000,13.780000,13.353405,28060530
135 | 2022-02-28,13.670000,13.906000,13.358000,13.832000,13.403795,27714174
136 | 2022-03-01,13.920000,14.398000,13.884000,14.252000,13.810793,34839560
137 | 2022-03-02,14.402000,14.572000,14.200000,14.530000,14.080187,29727128
138 | 2022-03-03,14.600000,14.852000,13.804000,13.866000,13.436743,27291045
139 | 2022-03-04,13.746000,13.782000,12.810000,12.854000,12.456072,32874311
140 | 2022-03-07,12.782000,13.668000,12.310000,13.406000,12.990984,35013621
141 | 2022-03-08,13.292000,13.770000,13.274000,13.670000,13.246811,24350724
142 | 2022-03-09,13.750000,13.900000,13.308000,13.602000,13.180916,23835591
143 | 2022-03-10,13.498000,13.608000,13.060000,13.104000,12.698333,21272480
144 | 2022-03-11,13.114000,13.480000,13.034000,13.036000,12.632438,15443864
145 | 2022-03-14,13.034000,13.118000,12.822000,12.978000,12.576233,14375697
146 | 2022-03-15,12.774000,12.924000,12.540000,12.924000,12.523905,19719778
147 | 2022-03-16,13.040000,13.124000,12.688000,12.770000,12.374673,20561605
148 | 2022-03-17,12.730000,13.174000,12.674000,13.110000,12.704146,23535647
149 | 2022-03-18,13.190000,13.200000,12.532000,12.728000,12.333972,29960266
150 | 2022-03-21,12.770000,13.238000,12.656000,13.094000,12.688643,18673723
151 | 2022-03-22,13.190000,13.434000,13.086000,13.104000,12.698333,15058029
152 | 2022-03-23,13.028000,13.398000,13.006000,13.222000,12.812680,15153328
153 | 2022-03-24,13.282000,13.486000,13.194000,13.314000,12.901832,11395491
154 | 2022-03-25,13.250000,13.544000,13.088000,13.464000,13.047188,11646774
155 | 2022-03-28,13.380000,13.726000,13.246000,13.272000,12.861132,13570473
156 | 2022-03-29,13.346000,13.558000,12.922000,13.096000,12.690580,17382862
157 | 2022-03-30,13.196000,13.382000,13.158000,13.382000,12.967727,12866099
158 | 2022-03-31,13.280000,13.466000,13.208000,13.294000,12.882450,10324704
159 | 2022-04-01,13.220000,13.458000,13.154000,13.414000,12.998735,9768288
160 | 2022-04-04,13.376000,13.506000,13.332000,13.442000,13.025869,8561573
161 | 2022-04-05,13.502000,13.566000,13.318000,13.508000,13.089827,8963221
162 | 2022-04-06,13.424000,13.582000,13.310000,13.388000,12.973540,11734380
163 | 2022-04-07,13.358000,13.518000,13.196000,13.262000,12.851441,10932585
164 | 2022-04-08,13.424000,13.818000,13.424000,13.818000,13.390229,15918230
165 | 2022-04-11,13.778000,14.084000,13.728000,13.796000,13.368910,10182489
166 | 2022-04-12,13.794000,13.950000,13.706000,13.866000,13.436743,9330978
167 | 2022-04-13,13.866000,14.280000,13.866000,14.174000,13.735208,15748581
168 | 2022-04-14,14.158000,14.268000,14.004000,14.200000,13.760403,12333670
169 | 2022-04-19,14.210000,14.430000,14.138000,14.150000,13.711950,11319929
170 | 2022-04-20,14.254000,14.278000,14.066000,14.228000,13.787536,10261390
171 | 2022-04-21,14.294000,14.298000,14.000000,14.000000,13.566595,11387400
172 | 2022-04-22,13.730000,13.770000,13.464000,13.580000,13.159596,15981425
173 | 2022-04-25,13.302000,13.330000,12.930000,12.930000,12.529719,19172922
174 | 2022-04-26,13.184000,13.224000,12.780000,12.958000,12.556852,14584407
175 | 2022-04-27,12.964000,13.020000,12.806000,12.930000,12.529719,10301442
176 | 2022-04-28,13.008000,13.310000,12.962000,13.158000,12.750661,9925222
177 | 2022-04-29,13.450000,13.464000,13.170000,13.390000,12.975479,12047757
178 | 2022-05-02,13.278000,13.488000,13.110000,13.200000,12.791361,10101913
179 | 2022-05-03,13.306000,13.550000,13.066000,13.528000,13.109206,11352948
180 | 2022-05-04,13.524000,13.732000,13.524000,13.600000,13.178978,10937079
181 | 2022-05-05,13.740000,13.794000,13.428000,13.510000,13.091764,9047418
182 | 2022-05-06,13.550000,13.840000,13.514000,13.604000,13.182854,11964854
183 | 2022-05-09,13.638000,13.736000,13.026000,13.078000,12.673138,12271233
184 | 2022-05-10,13.160000,13.418000,13.046000,13.258000,12.847566,11465475
185 | 2022-05-11,13.300000,13.620000,13.212000,13.620000,13.198359,10849388
186 | 2022-05-12,13.378000,13.530000,13.270000,13.356000,12.942532,10580131
187 | 2022-05-13,13.530000,13.640000,13.344000,13.640000,13.217740,9604893
188 | 2022-05-16,13.586000,13.886000,13.564000,13.826000,13.397982,9820136
189 | 2022-05-17,13.950000,14.168000,13.892000,13.932000,13.500700,12180448
190 | 2022-05-18,13.812000,14.144000,13.810000,13.832000,13.403795,11314162
191 | 2022-05-19,13.788000,13.950000,13.624000,13.786000,13.359220,15046537
192 | 2022-05-20,13.928000,14.120000,13.820000,13.890000,13.460000,17198097
193 | 2022-05-23,13.624000,13.822000,13.612000,13.714000,13.714000,13271908
194 | 2022-05-24,13.620000,13.686000,13.502000,13.610000,13.610000,9516370
195 | 2022-05-25,13.740000,14.120000,13.724000,14.120000,14.120000,17831449
196 | 2022-05-26,14.120000,14.246000,14.054000,14.246000,14.246000,10961160
197 | 2022-05-27,14.260000,14.336000,14.078000,14.192000,14.192000,11271087
198 | 2022-05-30,14.250000,14.274000,14.026000,14.200000,14.200000,8656883
199 | 2022-05-31,14.300000,14.478000,14.198000,14.198000,14.198000,20843817
200 | 2022-06-01,14.198000,14.318000,14.050000,14.098000,14.098000,12269529
201 | 2022-06-02,14.150000,14.150000,13.966000,14.072000,14.072000,7555112
202 | 2022-06-03,14.130000,14.258000,14.054000,14.258000,14.258000,8852539
203 | 2022-06-06,14.326000,14.598000,14.304000,14.432000,14.432000,13025445
204 | 2022-06-07,14.428000,14.508000,14.284000,14.428000,14.428000,11093002
205 | 2022-06-08,14.530000,14.556000,14.302000,14.392000,14.392000,9910104
206 | 2022-06-09,14.380000,14.450000,14.026000,14.026000,14.026000,14315519
207 | 2022-06-10,14.018000,14.020000,13.240000,13.240000,13.240000,22774413
208 | 2022-06-13,13.062000,13.138000,12.792000,12.978000,12.978000,16703779
209 | 2022-06-14,13.140000,13.352000,12.762000,13.170000,13.170000,12124965
210 | 2022-06-15,13.296000,13.392000,12.936000,13.254000,13.254000,14258080
211 | 2022-06-16,13.210000,13.302000,12.486000,12.606000,12.606000,25435857
212 | 2022-06-17,12.574000,12.722000,12.006000,12.010000,12.010000,27990234
213 | 2022-06-20,12.090000,12.270000,12.032000,12.044000,12.044000,11731385
214 | 2022-06-21,12.154000,12.236000,12.008000,12.080000,12.080000,10985501
215 | 2022-06-22,11.766000,11.814000,11.562000,11.660000,11.660000,17707302
216 | 2022-06-23,11.510000,11.842000,11.338000,11.430000,11.430000,15465466
217 | 2022-06-24,11.280000,11.708000,11.210000,11.666000,11.666000,14695680
218 | 2022-06-27,11.588000,11.754000,11.406000,11.430000,11.430000,16186512
219 | 2022-06-28,11.516000,11.720000,11.492000,11.500000,11.500000,11782048
220 | 2022-06-29,11.456000,11.940000,11.430000,11.602000,11.602000,14245962
221 | 2022-06-30,11.500000,11.560000,11.224000,11.328000,11.328000,13465225
222 | 2022-07-01,11.300000,11.450000,11.134000,11.228000,11.228000,10140782
223 | 2022-07-04,11.414000,11.628000,11.414000,11.498000,11.498000,11895933
224 | 2022-07-05,11.512000,11.524000,10.754000,10.832000,10.832000,21932084
225 | 2022-07-06,10.950000,11.080000,10.632000,10.756000,10.756000,16034945
226 | 2022-07-07,10.898000,11.180000,10.882000,11.046000,11.046000,17104234
227 | 2022-07-08,11.046000,11.352000,10.972000,11.226000,11.226000,12923833
228 | 2022-07-11,11.100000,11.316000,11.018000,11.200000,11.200000,10096639
229 | 2022-07-12,11.180000,11.330000,11.006000,11.138000,11.138000,10193478
230 | 2022-07-13,11.144000,11.254000,10.946000,11.116000,11.116000,10229951
231 | 2022-07-14,11.020000,11.156000,10.500000,10.644000,10.644000,18198463
232 | 2022-07-15,10.654000,10.996000,10.578000,10.838000,10.838000,14967921
233 | 2022-07-18,10.890000,11.186000,10.870000,11.014000,11.014000,10679802
234 | 2022-07-19,10.978000,11.360000,10.956000,11.304000,11.304000,12975328
235 | 2022-07-20,11.400000,11.410000,11.100000,11.174000,11.174000,9352632
236 | 2022-07-21,10.960000,11.148000,10.740000,11.000000,11.000000,11031394
237 | 2022-07-22,10.998000,11.142000,10.926000,10.970000,10.970000,9676893
238 | 2022-07-25,10.920000,11.088000,10.798000,11.062000,11.062000,8858839
239 | 2022-07-26,11.142000,11.228000,11.014000,11.020000,11.020000,10479759
240 | 2022-07-27,11.130000,11.288000,11.068000,11.200000,11.200000,8704917
241 | 2022-07-28,11.330000,11.354000,11.028000,11.092000,11.092000,11750707
242 | 2022-07-29,11.316000,11.844000,11.184000,11.716000,11.716000,21159221
243 | 2022-08-01,11.730000,12.118000,11.702000,11.702000,11.702000,15527324
244 | 2022-08-02,11.720000,11.776000,11.382000,11.444000,11.444000,15408263
245 | 2022-08-03,11.490000,11.544000,11.304000,11.400000,11.400000,14708791
246 | 2022-08-04,11.302000,11.570000,11.272000,11.390000,11.390000,13633577
247 | 2022-08-05,11.330000,11.556000,11.288000,11.456000,11.456000,12122650
248 | 2022-08-08,11.568000,11.658000,11.372000,11.446000,11.446000,8746013
249 | 2022-08-09,11.470000,11.604000,11.364000,11.492000,11.492000,8793567
250 | 2022-08-10,11.482000,11.560000,11.286000,11.424000,11.424000,8666139
251 | 2022-08-11,11.490000,11.620000,11.460000,11.590000,11.590000,8666810
252 | 2022-08-12,11.680000,11.788000,11.576000,11.630000,11.630000,7423167
253 | 2022-08-16,11.608000,11.720000,11.512000,11.578000,11.578000,8205962
254 | 2022-08-17,11.600000,11.744000,11.570000,11.692000,11.692000,8577984
255 | 2022-08-18,11.738000,11.934000,11.696000,11.934000,11.934000,8843209
256 | 2022-08-19,11.902000,12.006000,11.722000,11.770000,11.770000,8503102
257 | 2022-08-22,11.750000,11.962000,11.544000,11.838000,11.838000,9397201
--------------------------------------------------------------------------------
/r/stat_learning/data/Auto.csv:
--------------------------------------------------------------------------------
1 | mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
2 | 18,8,307,130,3504,12,70,1,chevrolet chevelle malibu
3 | 15,8,350,165,3693,11.5,70,1,buick skylark 320
4 | 18,8,318,150,3436,11,70,1,plymouth satellite
5 | 16,8,304,150,3433,12,70,1,amc rebel sst
6 | 17,8,302,140,3449,10.5,70,1,ford torino
7 | 15,8,429,198,4341,10,70,1,ford galaxie 500
8 | 14,8,454,220,4354,9,70,1,chevrolet impala
9 | 14,8,440,215,4312,8.5,70,1,plymouth fury iii
10 | 14,8,455,225,4425,10,70,1,pontiac catalina
11 | 15,8,390,190,3850,8.5,70,1,amc ambassador dpl
12 | 15,8,383,170,3563,10,70,1,dodge challenger se
13 | 14,8,340,160,3609,8,70,1,plymouth 'cuda 340
14 | 15,8,400,150,3761,9.5,70,1,chevrolet monte carlo
15 | 14,8,455,225,3086,10,70,1,buick estate wagon (sw)
16 | 24,4,113,95,2372,15,70,3,toyota corona mark ii
17 | 22,6,198,95,2833,15.5,70,1,plymouth duster
18 | 18,6,199,97,2774,15.5,70,1,amc hornet
19 | 21,6,200,85,2587,16,70,1,ford maverick
20 | 27,4,97,88,2130,14.5,70,3,datsun pl510
21 | 26,4,97,46,1835,20.5,70,2,volkswagen 1131 deluxe sedan
22 | 25,4,110,87,2672,17.5,70,2,peugeot 504
23 | 24,4,107,90,2430,14.5,70,2,audi 100 ls
24 | 25,4,104,95,2375,17.5,70,2,saab 99e
25 | 26,4,121,113,2234,12.5,70,2,bmw 2002
26 | 21,6,199,90,2648,15,70,1,amc gremlin
27 | 10,8,360,215,4615,14,70,1,ford f250
28 | 10,8,307,200,4376,15,70,1,chevy c20
29 | 11,8,318,210,4382,13.5,70,1,dodge d200
30 | 9,8,304,193,4732,18.5,70,1,hi 1200d
31 | 27,4,97,88,2130,14.5,71,3,datsun pl510
32 | 28,4,140,90,2264,15.5,71,1,chevrolet vega 2300
33 | 25,4,113,95,2228,14,71,3,toyota corona
34 | 19,6,232,100,2634,13,71,1,amc gremlin
35 | 16,6,225,105,3439,15.5,71,1,plymouth satellite custom
36 | 17,6,250,100,3329,15.5,71,1,chevrolet chevelle malibu
37 | 19,6,250,88,3302,15.5,71,1,ford torino 500
38 | 18,6,232,100,3288,15.5,71,1,amc matador
39 | 14,8,350,165,4209,12,71,1,chevrolet impala
40 | 14,8,400,175,4464,11.5,71,1,pontiac catalina brougham
41 | 14,8,351,153,4154,13.5,71,1,ford galaxie 500
42 | 14,8,318,150,4096,13,71,1,plymouth fury iii
43 | 12,8,383,180,4955,11.5,71,1,dodge monaco (sw)
44 | 13,8,400,170,4746,12,71,1,ford country squire (sw)
45 | 13,8,400,175,5140,12,71,1,pontiac safari (sw)
46 | 18,6,258,110,2962,13.5,71,1,amc hornet sportabout (sw)
47 | 22,4,140,72,2408,19,71,1,chevrolet vega (sw)
48 | 19,6,250,100,3282,15,71,1,pontiac firebird
49 | 18,6,250,88,3139,14.5,71,1,ford mustang
50 | 23,4,122,86,2220,14,71,1,mercury capri 2000
51 | 28,4,116,90,2123,14,71,2,opel 1900
52 | 30,4,79,70,2074,19.5,71,2,peugeot 304
53 | 30,4,88,76,2065,14.5,71,2,fiat 124b
54 | 31,4,71,65,1773,19,71,3,toyota corolla 1200
55 | 35,4,72,69,1613,18,71,3,datsun 1200
56 | 27,4,97,60,1834,19,71,2,volkswagen model 111
57 | 26,4,91,70,1955,20.5,71,1,plymouth cricket
58 | 24,4,113,95,2278,15.5,72,3,toyota corona hardtop
59 | 25,4,97.5,80,2126,17,72,1,dodge colt hardtop
60 | 23,4,97,54,2254,23.5,72,2,volkswagen type 3
61 | 20,4,140,90,2408,19.5,72,1,chevrolet vega
62 | 21,4,122,86,2226,16.5,72,1,ford pinto runabout
63 | 13,8,350,165,4274,12,72,1,chevrolet impala
64 | 14,8,400,175,4385,12,72,1,pontiac catalina
65 | 15,8,318,150,4135,13.5,72,1,plymouth fury iii
66 | 14,8,351,153,4129,13,72,1,ford galaxie 500
67 | 17,8,304,150,3672,11.5,72,1,amc ambassador sst
68 | 11,8,429,208,4633,11,72,1,mercury marquis
69 | 13,8,350,155,4502,13.5,72,1,buick lesabre custom
70 | 12,8,350,160,4456,13.5,72,1,oldsmobile delta 88 royale
71 | 13,8,400,190,4422,12.5,72,1,chrysler newport royal
72 | 19,3,70,97,2330,13.5,72,3,mazda rx2 coupe
73 | 15,8,304,150,3892,12.5,72,1,amc matador (sw)
74 | 13,8,307,130,4098,14,72,1,chevrolet chevelle concours (sw)
75 | 13,8,302,140,4294,16,72,1,ford gran torino (sw)
76 | 14,8,318,150,4077,14,72,1,plymouth satellite custom (sw)
77 | 18,4,121,112,2933,14.5,72,2,volvo 145e (sw)
78 | 22,4,121,76,2511,18,72,2,volkswagen 411 (sw)
79 | 21,4,120,87,2979,19.5,72,2,peugeot 504 (sw)
80 | 26,4,96,69,2189,18,72,2,renault 12 (sw)
81 | 22,4,122,86,2395,16,72,1,ford pinto (sw)
82 | 28,4,97,92,2288,17,72,3,datsun 510 (sw)
83 | 23,4,120,97,2506,14.5,72,3,toyouta corona mark ii (sw)
84 | 28,4,98,80,2164,15,72,1,dodge colt (sw)
85 | 27,4,97,88,2100,16.5,72,3,toyota corolla 1600 (sw)
86 | 13,8,350,175,4100,13,73,1,buick century 350
87 | 14,8,304,150,3672,11.5,73,1,amc matador
88 | 13,8,350,145,3988,13,73,1,chevrolet malibu
89 | 14,8,302,137,4042,14.5,73,1,ford gran torino
90 | 15,8,318,150,3777,12.5,73,1,dodge coronet custom
91 | 12,8,429,198,4952,11.5,73,1,mercury marquis brougham
92 | 13,8,400,150,4464,12,73,1,chevrolet caprice classic
93 | 13,8,351,158,4363,13,73,1,ford ltd
94 | 14,8,318,150,4237,14.5,73,1,plymouth fury gran sedan
95 | 13,8,440,215,4735,11,73,1,chrysler new yorker brougham
96 | 12,8,455,225,4951,11,73,1,buick electra 225 custom
97 | 13,8,360,175,3821,11,73,1,amc ambassador brougham
98 | 18,6,225,105,3121,16.5,73,1,plymouth valiant
99 | 16,6,250,100,3278,18,73,1,chevrolet nova custom
100 | 18,6,232,100,2945,16,73,1,amc hornet
101 | 18,6,250,88,3021,16.5,73,1,ford maverick
102 | 23,6,198,95,2904,16,73,1,plymouth duster
103 | 26,4,97,46,1950,21,73,2,volkswagen super beetle
104 | 11,8,400,150,4997,14,73,1,chevrolet impala
105 | 12,8,400,167,4906,12.5,73,1,ford country
106 | 13,8,360,170,4654,13,73,1,plymouth custom suburb
107 | 12,8,350,180,4499,12.5,73,1,oldsmobile vista cruiser
108 | 18,6,232,100,2789,15,73,1,amc gremlin
109 | 20,4,97,88,2279,19,73,3,toyota carina
110 | 21,4,140,72,2401,19.5,73,1,chevrolet vega
111 | 22,4,108,94,2379,16.5,73,3,datsun 610
112 | 18,3,70,90,2124,13.5,73,3,maxda rx3
113 | 19,4,122,85,2310,18.5,73,1,ford pinto
114 | 21,6,155,107,2472,14,73,1,mercury capri v6
115 | 26,4,98,90,2265,15.5,73,2,fiat 124 sport coupe
116 | 15,8,350,145,4082,13,73,1,chevrolet monte carlo s
117 | 16,8,400,230,4278,9.5,73,1,pontiac grand prix
118 | 29,4,68,49,1867,19.5,73,2,fiat 128
119 | 24,4,116,75,2158,15.5,73,2,opel manta
120 | 20,4,114,91,2582,14,73,2,audi 100ls
121 | 19,4,121,112,2868,15.5,73,2,volvo 144ea
122 | 15,8,318,150,3399,11,73,1,dodge dart custom
123 | 24,4,121,110,2660,14,73,2,saab 99le
124 | 20,6,156,122,2807,13.5,73,3,toyota mark ii
125 | 11,8,350,180,3664,11,73,1,oldsmobile omega
126 | 20,6,198,95,3102,16.5,74,1,plymouth duster
127 | 19,6,232,100,2901,16,74,1,amc hornet
128 | 15,6,250,100,3336,17,74,1,chevrolet nova
129 | 31,4,79,67,1950,19,74,3,datsun b210
130 | 26,4,122,80,2451,16.5,74,1,ford pinto
131 | 32,4,71,65,1836,21,74,3,toyota corolla 1200
132 | 25,4,140,75,2542,17,74,1,chevrolet vega
133 | 16,6,250,100,3781,17,74,1,chevrolet chevelle malibu classic
134 | 16,6,258,110,3632,18,74,1,amc matador
135 | 18,6,225,105,3613,16.5,74,1,plymouth satellite sebring
136 | 16,8,302,140,4141,14,74,1,ford gran torino
137 | 13,8,350,150,4699,14.5,74,1,buick century luxus (sw)
138 | 14,8,318,150,4457,13.5,74,1,dodge coronet custom (sw)
139 | 14,8,302,140,4638,16,74,1,ford gran torino (sw)
140 | 14,8,304,150,4257,15.5,74,1,amc matador (sw)
141 | 29,4,98,83,2219,16.5,74,2,audi fox
142 | 26,4,79,67,1963,15.5,74,2,volkswagen dasher
143 | 26,4,97,78,2300,14.5,74,2,opel manta
144 | 31,4,76,52,1649,16.5,74,3,toyota corona
145 | 32,4,83,61,2003,19,74,3,datsun 710
146 | 28,4,90,75,2125,14.5,74,1,dodge colt
147 | 24,4,90,75,2108,15.5,74,2,fiat 128
148 | 26,4,116,75,2246,14,74,2,fiat 124 tc
149 | 24,4,120,97,2489,15,74,3,honda civic
150 | 26,4,108,93,2391,15.5,74,3,subaru
151 | 31,4,79,67,2000,16,74,2,fiat x1.9
152 | 19,6,225,95,3264,16,75,1,plymouth valiant custom
153 | 18,6,250,105,3459,16,75,1,chevrolet nova
154 | 15,6,250,72,3432,21,75,1,mercury monarch
155 | 15,6,250,72,3158,19.5,75,1,ford maverick
156 | 16,8,400,170,4668,11.5,75,1,pontiac catalina
157 | 15,8,350,145,4440,14,75,1,chevrolet bel air
158 | 16,8,318,150,4498,14.5,75,1,plymouth grand fury
159 | 14,8,351,148,4657,13.5,75,1,ford ltd
160 | 17,6,231,110,3907,21,75,1,buick century
161 | 16,6,250,105,3897,18.5,75,1,chevroelt chevelle malibu
162 | 15,6,258,110,3730,19,75,1,amc matador
163 | 18,6,225,95,3785,19,75,1,plymouth fury
164 | 21,6,231,110,3039,15,75,1,buick skyhawk
165 | 20,8,262,110,3221,13.5,75,1,chevrolet monza 2+2
166 | 13,8,302,129,3169,12,75,1,ford mustang ii
167 | 29,4,97,75,2171,16,75,3,toyota corolla
168 | 23,4,140,83,2639,17,75,1,ford pinto
169 | 20,6,232,100,2914,16,75,1,amc gremlin
170 | 23,4,140,78,2592,18.5,75,1,pontiac astro
171 | 24,4,134,96,2702,13.5,75,3,toyota corona
172 | 25,4,90,71,2223,16.5,75,2,volkswagen dasher
173 | 24,4,119,97,2545,17,75,3,datsun 710
174 | 18,6,171,97,2984,14.5,75,1,ford pinto
175 | 29,4,90,70,1937,14,75,2,volkswagen rabbit
176 | 19,6,232,90,3211,17,75,1,amc pacer
177 | 23,4,115,95,2694,15,75,2,audi 100ls
178 | 23,4,120,88,2957,17,75,2,peugeot 504
179 | 22,4,121,98,2945,14.5,75,2,volvo 244dl
180 | 25,4,121,115,2671,13.5,75,2,saab 99le
181 | 33,4,91,53,1795,17.5,75,3,honda civic cvcc
182 | 28,4,107,86,2464,15.5,76,2,fiat 131
183 | 25,4,116,81,2220,16.9,76,2,opel 1900
184 | 25,4,140,92,2572,14.9,76,1,capri ii
185 | 26,4,98,79,2255,17.7,76,1,dodge colt
186 | 27,4,101,83,2202,15.3,76,2,renault 12tl
187 | 17.5,8,305,140,4215,13,76,1,chevrolet chevelle malibu classic
188 | 16,8,318,150,4190,13,76,1,dodge coronet brougham
189 | 15.5,8,304,120,3962,13.9,76,1,amc matador
190 | 14.5,8,351,152,4215,12.8,76,1,ford gran torino
191 | 22,6,225,100,3233,15.4,76,1,plymouth valiant
192 | 22,6,250,105,3353,14.5,76,1,chevrolet nova
193 | 24,6,200,81,3012,17.6,76,1,ford maverick
194 | 22.5,6,232,90,3085,17.6,76,1,amc hornet
195 | 29,4,85,52,2035,22.2,76,1,chevrolet chevette
196 | 24.5,4,98,60,2164,22.1,76,1,chevrolet woody
197 | 29,4,90,70,1937,14.2,76,2,vw rabbit
198 | 33,4,91,53,1795,17.4,76,3,honda civic
199 | 20,6,225,100,3651,17.7,76,1,dodge aspen se
200 | 18,6,250,78,3574,21,76,1,ford granada ghia
201 | 18.5,6,250,110,3645,16.2,76,1,pontiac ventura sj
202 | 17.5,6,258,95,3193,17.8,76,1,amc pacer d/l
203 | 29.5,4,97,71,1825,12.2,76,2,volkswagen rabbit
204 | 32,4,85,70,1990,17,76,3,datsun b-210
205 | 28,4,97,75,2155,16.4,76,3,toyota corolla
206 | 26.5,4,140,72,2565,13.6,76,1,ford pinto
207 | 20,4,130,102,3150,15.7,76,2,volvo 245
208 | 13,8,318,150,3940,13.2,76,1,plymouth volare premier v8
209 | 19,4,120,88,3270,21.9,76,2,peugeot 504
210 | 19,6,156,108,2930,15.5,76,3,toyota mark ii
211 | 16.5,6,168,120,3820,16.7,76,2,mercedes-benz 280s
212 | 16.5,8,350,180,4380,12.1,76,1,cadillac seville
213 | 13,8,350,145,4055,12,76,1,chevy c10
214 | 13,8,302,130,3870,15,76,1,ford f108
215 | 13,8,318,150,3755,14,76,1,dodge d100
216 | 31.5,4,98,68,2045,18.5,77,3,honda accord cvcc
217 | 30,4,111,80,2155,14.8,77,1,buick opel isuzu deluxe
218 | 36,4,79,58,1825,18.6,77,2,renault 5 gtl
219 | 25.5,4,122,96,2300,15.5,77,1,plymouth arrow gs
220 | 33.5,4,85,70,1945,16.8,77,3,datsun f-10 hatchback
221 | 17.5,8,305,145,3880,12.5,77,1,chevrolet caprice classic
222 | 17,8,260,110,4060,19,77,1,oldsmobile cutlass supreme
223 | 15.5,8,318,145,4140,13.7,77,1,dodge monaco brougham
224 | 15,8,302,130,4295,14.9,77,1,mercury cougar brougham
225 | 17.5,6,250,110,3520,16.4,77,1,chevrolet concours
226 | 20.5,6,231,105,3425,16.9,77,1,buick skylark
227 | 19,6,225,100,3630,17.7,77,1,plymouth volare custom
228 | 18.5,6,250,98,3525,19,77,1,ford granada
229 | 16,8,400,180,4220,11.1,77,1,pontiac grand prix lj
230 | 15.5,8,350,170,4165,11.4,77,1,chevrolet monte carlo landau
231 | 15.5,8,400,190,4325,12.2,77,1,chrysler cordoba
232 | 16,8,351,149,4335,14.5,77,1,ford thunderbird
233 | 29,4,97,78,1940,14.5,77,2,volkswagen rabbit custom
234 | 24.5,4,151,88,2740,16,77,1,pontiac sunbird coupe
235 | 26,4,97,75,2265,18.2,77,3,toyota corolla liftback
236 | 25.5,4,140,89,2755,15.8,77,1,ford mustang ii 2+2
237 | 30.5,4,98,63,2051,17,77,1,chevrolet chevette
238 | 33.5,4,98,83,2075,15.9,77,1,dodge colt m/m
239 | 30,4,97,67,1985,16.4,77,3,subaru dl
240 | 30.5,4,97,78,2190,14.1,77,2,volkswagen dasher
241 | 22,6,146,97,2815,14.5,77,3,datsun 810
242 | 21.5,4,121,110,2600,12.8,77,2,bmw 320i
243 | 21.5,3,80,110,2720,13.5,77,3,mazda rx-4
244 | 43.1,4,90,48,1985,21.5,78,2,volkswagen rabbit custom diesel
245 | 36.1,4,98,66,1800,14.4,78,1,ford fiesta
246 | 32.8,4,78,52,1985,19.4,78,3,mazda glc deluxe
247 | 39.4,4,85,70,2070,18.6,78,3,datsun b210 gx
248 | 36.1,4,91,60,1800,16.4,78,3,honda civic cvcc
249 | 19.9,8,260,110,3365,15.5,78,1,oldsmobile cutlass salon brougham
250 | 19.4,8,318,140,3735,13.2,78,1,dodge diplomat
251 | 20.2,8,302,139,3570,12.8,78,1,mercury monarch ghia
252 | 19.2,6,231,105,3535,19.2,78,1,pontiac phoenix lj
253 | 20.5,6,200,95,3155,18.2,78,1,chevrolet malibu
254 | 20.2,6,200,85,2965,15.8,78,1,ford fairmont (auto)
255 | 25.1,4,140,88,2720,15.4,78,1,ford fairmont (man)
256 | 20.5,6,225,100,3430,17.2,78,1,plymouth volare
257 | 19.4,6,232,90,3210,17.2,78,1,amc concord
258 | 20.6,6,231,105,3380,15.8,78,1,buick century special
259 | 20.8,6,200,85,3070,16.7,78,1,mercury zephyr
260 | 18.6,6,225,110,3620,18.7,78,1,dodge aspen
261 | 18.1,6,258,120,3410,15.1,78,1,amc concord d/l
262 | 19.2,8,305,145,3425,13.2,78,1,chevrolet monte carlo landau
263 | 17.7,6,231,165,3445,13.4,78,1,buick regal sport coupe (turbo)
264 | 18.1,8,302,139,3205,11.2,78,1,ford futura
265 | 17.5,8,318,140,4080,13.7,78,1,dodge magnum xe
266 | 30,4,98,68,2155,16.5,78,1,chevrolet chevette
267 | 27.5,4,134,95,2560,14.2,78,3,toyota corona
268 | 27.2,4,119,97,2300,14.7,78,3,datsun 510
269 | 30.9,4,105,75,2230,14.5,78,1,dodge omni
270 | 21.1,4,134,95,2515,14.8,78,3,toyota celica gt liftback
271 | 23.2,4,156,105,2745,16.7,78,1,plymouth sapporo
272 | 23.8,4,151,85,2855,17.6,78,1,oldsmobile starfire sx
273 | 23.9,4,119,97,2405,14.9,78,3,datsun 200-sx
274 | 20.3,5,131,103,2830,15.9,78,2,audi 5000
275 | 17,6,163,125,3140,13.6,78,2,volvo 264gl
276 | 21.6,4,121,115,2795,15.7,78,2,saab 99gle
277 | 16.2,6,163,133,3410,15.8,78,2,peugeot 604sl
278 | 31.5,4,89,71,1990,14.9,78,2,volkswagen scirocco
279 | 29.5,4,98,68,2135,16.6,78,3,honda accord lx
280 | 21.5,6,231,115,3245,15.4,79,1,pontiac lemans v6
281 | 19.8,6,200,85,2990,18.2,79,1,mercury zephyr 6
282 | 22.3,4,140,88,2890,17.3,79,1,ford fairmont 4
283 | 20.2,6,232,90,3265,18.2,79,1,amc concord dl 6
284 | 20.6,6,225,110,3360,16.6,79,1,dodge aspen 6
285 | 17,8,305,130,3840,15.4,79,1,chevrolet caprice classic
286 | 17.6,8,302,129,3725,13.4,79,1,ford ltd landau
287 | 16.5,8,351,138,3955,13.2,79,1,mercury grand marquis
288 | 18.2,8,318,135,3830,15.2,79,1,dodge st. regis
289 | 16.9,8,350,155,4360,14.9,79,1,buick estate wagon (sw)
290 | 15.5,8,351,142,4054,14.3,79,1,ford country squire (sw)
291 | 19.2,8,267,125,3605,15,79,1,chevrolet malibu classic (sw)
292 | 18.5,8,360,150,3940,13,79,1,chrysler lebaron town @ country (sw)
293 | 31.9,4,89,71,1925,14,79,2,vw rabbit custom
294 | 34.1,4,86,65,1975,15.2,79,3,maxda glc deluxe
295 | 35.7,4,98,80,1915,14.4,79,1,dodge colt hatchback custom
296 | 27.4,4,121,80,2670,15,79,1,amc spirit dl
297 | 25.4,5,183,77,3530,20.1,79,2,mercedes benz 300d
298 | 23,8,350,125,3900,17.4,79,1,cadillac eldorado
299 | 27.2,4,141,71,3190,24.8,79,2,peugeot 504
300 | 23.9,8,260,90,3420,22.2,79,1,oldsmobile cutlass salon brougham
301 | 34.2,4,105,70,2200,13.2,79,1,plymouth horizon
302 | 34.5,4,105,70,2150,14.9,79,1,plymouth horizon tc3
303 | 31.8,4,85,65,2020,19.2,79,3,datsun 210
304 | 37.3,4,91,69,2130,14.7,79,2,fiat strada custom
305 | 28.4,4,151,90,2670,16,79,1,buick skylark limited
306 | 28.8,6,173,115,2595,11.3,79,1,chevrolet citation
307 | 26.8,6,173,115,2700,12.9,79,1,oldsmobile omega brougham
308 | 33.5,4,151,90,2556,13.2,79,1,pontiac phoenix
309 | 41.5,4,98,76,2144,14.7,80,2,vw rabbit
310 | 38.1,4,89,60,1968,18.8,80,3,toyota corolla tercel
311 | 32.1,4,98,70,2120,15.5,80,1,chevrolet chevette
312 | 37.2,4,86,65,2019,16.4,80,3,datsun 310
313 | 28,4,151,90,2678,16.5,80,1,chevrolet citation
314 | 26.4,4,140,88,2870,18.1,80,1,ford fairmont
315 | 24.3,4,151,90,3003,20.1,80,1,amc concord
316 | 19.1,6,225,90,3381,18.7,80,1,dodge aspen
317 | 34.3,4,97,78,2188,15.8,80,2,audi 4000
318 | 29.8,4,134,90,2711,15.5,80,3,toyota corona liftback
319 | 31.3,4,120,75,2542,17.5,80,3,mazda 626
320 | 37,4,119,92,2434,15,80,3,datsun 510 hatchback
321 | 32.2,4,108,75,2265,15.2,80,3,toyota corolla
322 | 46.6,4,86,65,2110,17.9,80,3,mazda glc
323 | 27.9,4,156,105,2800,14.4,80,1,dodge colt
324 | 40.8,4,85,65,2110,19.2,80,3,datsun 210
325 | 44.3,4,90,48,2085,21.7,80,2,vw rabbit c (diesel)
326 | 43.4,4,90,48,2335,23.7,80,2,vw dasher (diesel)
327 | 36.4,5,121,67,2950,19.9,80,2,audi 5000s (diesel)
328 | 30,4,146,67,3250,21.8,80,2,mercedes-benz 240d
329 | 44.6,4,91,67,1850,13.8,80,3,honda civic 1500 gl
330 | 33.8,4,97,67,2145,18,80,3,subaru dl
331 | 29.8,4,89,62,1845,15.3,80,2,vokswagen rabbit
332 | 32.7,6,168,132,2910,11.4,80,3,datsun 280-zx
333 | 23.7,3,70,100,2420,12.5,80,3,mazda rx-7 gs
334 | 35,4,122,88,2500,15.1,80,2,triumph tr7 coupe
335 | 32.4,4,107,72,2290,17,80,3,honda accord
336 | 27.2,4,135,84,2490,15.7,81,1,plymouth reliant
337 | 26.6,4,151,84,2635,16.4,81,1,buick skylark
338 | 25.8,4,156,92,2620,14.4,81,1,dodge aries wagon (sw)
339 | 23.5,6,173,110,2725,12.6,81,1,chevrolet citation
340 | 30,4,135,84,2385,12.9,81,1,plymouth reliant
341 | 39.1,4,79,58,1755,16.9,81,3,toyota starlet
342 | 39,4,86,64,1875,16.4,81,1,plymouth champ
343 | 35.1,4,81,60,1760,16.1,81,3,honda civic 1300
344 | 32.3,4,97,67,2065,17.8,81,3,subaru
345 | 37,4,85,65,1975,19.4,81,3,datsun 210 mpg
346 | 37.7,4,89,62,2050,17.3,81,3,toyota tercel
347 | 34.1,4,91,68,1985,16,81,3,mazda glc 4
348 | 34.7,4,105,63,2215,14.9,81,1,plymouth horizon 4
349 | 34.4,4,98,65,2045,16.2,81,1,ford escort 4w
350 | 29.9,4,98,65,2380,20.7,81,1,ford escort 2h
351 | 33,4,105,74,2190,14.2,81,2,volkswagen jetta
352 | 33.7,4,107,75,2210,14.4,81,3,honda prelude
353 | 32.4,4,108,75,2350,16.8,81,3,toyota corolla
354 | 32.9,4,119,100,2615,14.8,81,3,datsun 200sx
355 | 31.6,4,120,74,2635,18.3,81,3,mazda 626
356 | 28.1,4,141,80,3230,20.4,81,2,peugeot 505s turbo diesel
357 | 30.7,6,145,76,3160,19.6,81,2,volvo diesel
358 | 25.4,6,168,116,2900,12.6,81,3,toyota cressida
359 | 24.2,6,146,120,2930,13.8,81,3,datsun 810 maxima
360 | 22.4,6,231,110,3415,15.8,81,1,buick century
361 | 26.6,8,350,105,3725,19,81,1,oldsmobile cutlass ls
362 | 20.2,6,200,88,3060,17.1,81,1,ford granada gl
363 | 17.6,6,225,85,3465,16.6,81,1,chrysler lebaron salon
364 | 28,4,112,88,2605,19.6,82,1,chevrolet cavalier
365 | 27,4,112,88,2640,18.6,82,1,chevrolet cavalier wagon
366 | 34,4,112,88,2395,18,82,1,chevrolet cavalier 2-door
367 | 31,4,112,85,2575,16.2,82,1,pontiac j2000 se hatchback
368 | 29,4,135,84,2525,16,82,1,dodge aries se
369 | 27,4,151,90,2735,18,82,1,pontiac phoenix
370 | 24,4,140,92,2865,16.4,82,1,ford fairmont futura
371 | 36,4,105,74,1980,15.3,82,2,volkswagen rabbit l
372 | 37,4,91,68,2025,18.2,82,3,mazda glc custom l
373 | 31,4,91,68,1970,17.6,82,3,mazda glc custom
374 | 38,4,105,63,2125,14.7,82,1,plymouth horizon miser
375 | 36,4,98,70,2125,17.3,82,1,mercury lynx l
376 | 36,4,120,88,2160,14.5,82,3,nissan stanza xe
377 | 36,4,107,75,2205,14.5,82,3,honda accord
378 | 34,4,108,70,2245,16.9,82,3,toyota corolla
379 | 38,4,91,67,1965,15,82,3,honda civic
380 | 32,4,91,67,1965,15.7,82,3,honda civic (auto)
381 | 38,4,91,67,1995,16.2,82,3,datsun 310 gx
382 | 25,6,181,110,2945,16.4,82,1,buick century limited
383 | 38,6,262,85,3015,17,82,1,oldsmobile cutlass ciera (diesel)
384 | 26,4,156,92,2585,14.5,82,1,chrysler lebaron medallion
385 | 22,6,232,112,2835,14.7,82,1,ford granada l
386 | 32,4,144,96,2665,13.9,82,3,toyota celica gt
387 | 36,4,135,84,2370,13,82,1,dodge charger 2.2
388 | 27,4,151,90,2950,17.3,82,1,chevrolet camaro
389 | 27,4,140,86,2790,15.6,82,1,ford mustang gl
390 | 44,4,97,52,2130,24.6,82,2,vw pickup
391 | 32,4,135,84,2295,11.6,82,1,dodge rampage
392 | 28,4,120,79,2625,18.6,82,1,ford ranger
393 | 31,4,119,82,2720,19.4,82,1,chevy s-10
394 |
--------------------------------------------------------------------------------