├── docs ├── _config.yml └── index.md ├── python ├── images │ ├── Building1.tif │ ├── Building2.tif │ ├── Building3.tif │ ├── Building4.tif │ ├── Building5.tif │ ├── Building6.tif │ ├── Building7.tif │ └── Building8.tif ├── covid19 │ ├── data │ │ ├── eurostat_hlth_rs_bds.csv │ │ ├── eurostat-population.csv │ │ ├── eurostat_population_by_age_2019.csv │ │ └── movie-critics.json │ └── images │ │ └── datiaperti_small.png ├── atn │ ├── taxonomy │ │ └── signal_chain_charts.xlsx │ ├── capacitors.ipynb │ ├── resistors.ipynb │ ├── logfiles │ │ └── Users_Navigation_Data.doc │ └── microcircuites_and_descretes.ipynb ├── physics │ └── images │ │ └── pollastrini_flooding.png ├── finance │ └── data │ │ ├── persons.xml │ │ ├── world_companies.html │ │ └── ENI.MI.csv ├── README.md ├── stats │ └── data │ │ └── italy │ │ ├── unemployment_rate_istat_province.csv │ │ └── unemployment_rate_istat_province_cod_den_uts.csv ├── recommendations.ipynb ├── parsing-data.ipynb ├── python_oop.ipynb └── linalgebra │ └── linalgebra_ch1.ipynb ├── .gitignore ├── r ├── stat_learning │ ├── data │ │ ├── 5.R.RData │ │ ├── 7.R.RData │ │ ├── 10.R.RData │ │ └── Auto.csv │ ├── test_roc.R │ └── chapter1.ipynb ├── rethinking │ ├── simulated_science_distortion.R │ ├── quadratic_approximation.R │ ├── monte_carlo_globe_tossing.R │ ├── quadratic_approximation_height.R │ ├── ch10_maximum_entropy.R │ ├── simulations.R │ ├── dbinom_grid.R │ ├── collider_bias.R │ ├── ch9_easy_hmc.R │ ├── ch9_king_markov_decision_procedure.R │ ├── kruschke.R │ ├── categorical_variables.R │ ├── posterior_predictive_distribution.R │ ├── b_spline.R │ ├── normal_distribution.R │ ├── ch11_binomial_regression.R │ ├── sampling_from_grid.R │ ├── waic_information_criteria.R │ ├── ch9_hamiltonian_monte_carlo.R │ ├── percentile_intervals.R │ ├── post_treatment_bias.R │ ├── polynomial_regression.R │ ├── gaussian_model_of_height.R │ ├── multicollineariry.R │ ├── binomial_distribution.R │ ├── ch8_continuous_interactions.R │ ├── linear_prediction.R │ ├── overfitting.R │ ├── spurious_association.R │ ├── interaction_model.R │ └── masked_relationship.R └── migration_policy │ └── data │ └── policy_database_switzerland.csv ├── README.md ├── julia └── learn_julia.ipynb └── datasets.md /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal -------------------------------------------------------------------------------- /python/images/Building1.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building1.tif -------------------------------------------------------------------------------- /python/images/Building2.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building2.tif -------------------------------------------------------------------------------- /python/images/Building3.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building3.tif -------------------------------------------------------------------------------- /python/images/Building4.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building4.tif 
-------------------------------------------------------------------------------- /python/images/Building5.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building5.tif -------------------------------------------------------------------------------- /python/images/Building6.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building6.tif -------------------------------------------------------------------------------- /python/images/Building7.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building7.tif -------------------------------------------------------------------------------- /python/images/Building8.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/images/Building8.tif -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | __pycache__ 3 | .Rproj.user 4 | r.Rproj 5 | .Rhistory 6 | .RData 7 | .Ruserdata 8 | -------------------------------------------------------------------------------- /r/stat_learning/data/5.R.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/r/stat_learning/data/5.R.RData -------------------------------------------------------------------------------- /r/stat_learning/data/7.R.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/r/stat_learning/data/7.R.RData -------------------------------------------------------------------------------- /r/stat_learning/data/10.R.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/r/stat_learning/data/10.R.RData -------------------------------------------------------------------------------- /python/covid19/data/eurostat_hlth_rs_bds.csv: -------------------------------------------------------------------------------- 1 | country_code,2017,2018 2 | IT,192548,N/A 3 | DE,661448,N/A 4 | FR,399865,N/A 5 | ES,138511,N/A 6 | -------------------------------------------------------------------------------- /python/covid19/images/datiaperti_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/covid19/images/datiaperti_small.png -------------------------------------------------------------------------------- /python/atn/taxonomy/signal_chain_charts.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/atn/taxonomy/signal_chain_charts.xlsx -------------------------------------------------------------------------------- /python/physics/images/pollastrini_flooding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/luigiselmi/datascience/HEAD/python/physics/images/pollastrini_flooding.png 
-------------------------------------------------------------------------------- /python/finance/data/persons.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Pippo 5 | Pippo 6 | United States 7 | 8 | 9 | Mickey 10 | Mouse 11 | Unites States 12 | 13 | 14 | -------------------------------------------------------------------------------- /r/rethinking/simulated_science_distortion.R: -------------------------------------------------------------------------------- 1 | # R code 6.1 rethinking book 2 | set.seed(1914) 3 | N <- 200 # num grant proposals 4 | P <- 0.1 # proportion to select 5 | # uncorrelated newsworthiness and trustworthiness 6 | nw <- rnorm(N) 7 | tw <- rnorm(N) 8 | # select top 10 % of combined scores 9 | s <- nw + tw # total score 10 | q <- quantile(s, 1 - p) # top 10 % threshold 11 | selected <- ifelse(s >= q, TRUE, FALSE) 12 | cor(tw[selected], nw[selected]) 13 | plot(tw[selected], nw[selected]) 14 | -------------------------------------------------------------------------------- /r/rethinking/quadratic_approximation.R: -------------------------------------------------------------------------------- 1 | # R code 2.6, 2.7 rethinking book 2 | 3 | library(rethinking) 4 | 5 | globe.qa <- quap( 6 | alist( 7 | W ~ dbinom(W + L, p), # binomial likelihood 8 | p ~ dunif(0,1) # uniform prior 9 | ), 10 | data = list(W = 6, L = 3) 11 | ) 12 | 13 | precis(globe.qa) 14 | 15 | # analytical calculation 16 | W <- 12 17 | L <- 6 18 | 19 | curve(dbeta(x, W + 1, L + 1), from = 0, to = 1) 20 | #quadratic approximation 21 | curve(dnorm(x, 0.67, 0.16), lty = 2, add = TRUE) 22 | -------------------------------------------------------------------------------- /r/rethinking/monte_carlo_globe_tossing.R: -------------------------------------------------------------------------------- 1 | # R code 2.8 rethinking book 2 | n_samples <- 1000 3 | p <- rep( NA , n_samples ) 4 | p[1] <- 0.5 5 | W <- 6 6 | L <- 3 7 | 8 | for ( i in 2:n_samples ) { 9 | p_new <- rnorm( 1 , p[i-1] , 0.1 ) 10 | if ( p_new < 0 ) p_new <- abs( p_new ) 11 | if ( p_new > 1 ) p_new <- 2 - p_new 12 | q0 <- dbinom( W , W+L , p[i-1] ) 13 | q1 <- dbinom( W , W+L , p_new ) 14 | p[i] <- ifelse( runif(1) < q1/q0 , p_new , p[i-1] ) 15 | } 16 | 17 | dens( p , xlim=c(0,1) ) 18 | curve( dbeta( x , W+1 , L+1 ) , lty=2 , add=TRUE ) -------------------------------------------------------------------------------- /r/rethinking/quadratic_approximation_height.R: -------------------------------------------------------------------------------- 1 | # R code 4.26 rethinking book 2 | library(rethinking) 3 | data("Howell1") 4 | d <- Howell1 5 | d2 <- d[d$age >= 18,] 6 | 7 | # model definition 8 | flist <- alist( 9 | height ~ dnorm(mu, sigma), 10 | mu ~ dnorm(156, 10), 11 | sigma ~ dunif(0, 50) 12 | ) 13 | 14 | m4.1 <- quap(flist, data = d2) 15 | precis(m4.1) 16 | 17 | # variance-covariance matrix 18 | vcov(m4.1) 19 | # variances 20 | diag(vcov(m4.1)) 21 | cov2cor(vcov(m4.1)) 22 | 23 | # samples from the multi-dimensional posterior 24 | post <- extract.samples(m4.1, n = 1e4) 25 | head(post) 26 | precis(post) -------------------------------------------------------------------------------- /r/rethinking/ch10_maximum_entropy.R: -------------------------------------------------------------------------------- 1 | # R code 10.1 rethinking book 2 | # Example 5 buckets, 10 pebbles. 
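# Note: the quantity computed at the end of this script is the information entropy
# of a discrete distribution, H(p) = -sum_i p_i * log(p_i), with the convention
# 0 * log(0) = 0. The flattest distribution (E) can be realized in the largest
# number of ways, which is why it also has the largest entropy.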
3 | p <- list() 4 | # we define 5 different distributions 5 | p$A <- c(0, 0, 10, 0, 0) 6 | p$B <- c(0, 1, 8, 1, 0) 7 | p$C <- c(0, 2, 6, 2, 0) 8 | p$D <- c(1, 2, 4, 2, 1) 9 | p$E <- c(2, 2, 2, 2, 2) 10 | 11 | # we define the probability distribution by normalizing them 12 | p_norm <- lapply(p, function(q) q / sum(q)) 13 | 14 | # We can compute the information entropy. We can see from it that distribution E 15 | # is the most likely and has the biggest entropy. 16 | (H <- sapply(p_norm, function(q) - sum(ifelse(q == 0, 0, q * log(q))))) -------------------------------------------------------------------------------- /python/finance/data/world_companies.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | World's Companies 7 | 8 | 9 |

<h1>World's Companies</h1>
<table>
  <tr>
    <th>Company</th>
    <th>Contact</th>
    <th>Country</th>
  </tr>
  <tr>
    <td>Alfreds Futterkiste</td>
    <td>Maria Anders</td>
    <td>Germany</td>
  </tr>
  <tr>
    <td>Centro comercial Moctezuma</td>
    <td>Francisco Chang</td>
    <td>Mexico</td>
  </tr>
</table>
28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /python/covid19/data/eurostat-population.csv: -------------------------------------------------------------------------------- 1 | country_code,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019 2 | CH,7593494,7701856,7785806,7870134,7954662,8039060,8139631,8237666,8327126,8419550,8484130,8544527 3 | DE,82217837,82002356,81802257,80222065,80327900,80523746,80767463,81197537,82175684,82521653,82792351,83019213 4 | ES,45668939,46239273,46486619,46667174,46818219,46727890,46512199,46449565,46440099,46528024,46658447,46937060 5 | FR,64007193,64350226,64658856,64978721,65276983,65600350,66165980,66458153,66638391,66809816,66918941,67012883 6 | IT,58652875,59000586,59190143,59364690,59394207,59685227,60782668,60795612,60665551,60589445,60483973,60359546 7 | UK,61571647,62042343,62510197,63022532,63495088,63905342,64351203,64853393,65379044,65844142,66273576,66647112 8 | -------------------------------------------------------------------------------- /r/rethinking/simulations.R: -------------------------------------------------------------------------------- 1 | # R code from snippet 3.23 (rethinking book). 2 | # We use the binomial distribution with, e.g. parameter p = 0.7, 3 | # to create simulations. 4 | 5 | obs <- 1e5 6 | size <- 9 7 | # Generates observations from a sample of size 8 | # Each observation tells, e.g. how many waters we had from each sample of size 9 | dummy_w <- rbinom(obs, size = size, prob = 0.7) 10 | 11 | # cumputes how many times we had one of the possible 12 | # values from each sample, e.g. from samples of two tosses (size = 2) we can have 13 | # water 0, 1 or 2 times. The values are normalized. 14 | table(dummy_w) / obs 15 | 16 | # plot the histogram of the distribution in the simulation data 17 | simplehist(dummy_w, xlab = 'dummy water count') 18 | 19 | 20 | -------------------------------------------------------------------------------- /python/covid19/data/eurostat_population_by_age_2019.csv: -------------------------------------------------------------------------------- 1 | country_code,total,0_5,5_9,10_14,15_19,20_24,25_29,30_34,35_39,40_44,45_49,50_54,55_59,60_64,65_69,70_74,75_79,70_84,85_over 2 | DE,83019213,3926397,3662238,3702180,4003477,4607272,5193335,5409029,5237416,4841738,5584519,6875948,6598218,5493914,4808497,3596545,4089384,3111597,2277509 3 | IT,60359546,2367686,2722796,2871733,2897141,2990245,3211025,3369346,3704872,4418357,4824297,4934336,4417895,3846237,3490973,3233852,2728681,2176582,2153492 4 | FR,67012883,3741707,4122551,4183612,4136255,3736133,3790316,4062485,4236160,4107779,4555756,4460465,4327805,4081786,3917445,3266695,2179268,1874151,2232514 5 | ES,46937060,2067503,2356886,2505728,2332161,2288322,2525120,2801658,3436405,3984581,3812140,3640087,3273516,2807378,2406253,2187986,1630452,1363069,1517815 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /r/migration_policy/data/policy_database_switzerland.csv: -------------------------------------------------------------------------------- 1 | Year,Group 2 | 2000,3 3 | 2000,1 4 | 2000,1 5 | 2000,3 6 | 2000,2 7 | 2001,0 8 | 2001,1 9 | 2001,0 10 | 2002,1 11 | 2002,3 12 | 2002,2 13 | 2002,1 14 | 2003,1 15 | 2004,1 16 | 2004,3 17 | 2004,2 18 | 2005,3 19 | 2005,3 20 | 2005,2 21 | 2005,3 22 | 2005,0 23 | 2005,1 24 | 2007,3 25 | 2007,2 26 | 2007,2 27 | 2007,2 28 | 2007,2 29 | 2007,2 30 | 2007,3 31 | 2008,0 32 | 2008,1 33 | 
2008,1 34 | 2008,3 35 | 2008,3 36 | 2008,2 37 | 2008,3 38 | 2008,0 39 | 2008,1 40 | 2008,3 41 | 2008,3 42 | 2008,2 43 | 2009,3 44 | 2009,1 45 | 2009,1 46 | 2010,3 47 | 2010,0 48 | 2010,1 49 | 2010,0 50 | 2010,3 51 | 2010,2 52 | 2011,1 53 | 2011,2 54 | 2011,3 55 | 2011,3 56 | 2011,1 57 | 2011,0 58 | 2011,1 59 | 2011,3 60 | 2012,1 61 | 2012,2 62 | 2012,2 63 | 2012,2 64 | 2012,1 65 | 2013,3 -------------------------------------------------------------------------------- /r/rethinking/dbinom_grid.R: -------------------------------------------------------------------------------- 1 | # R code 2.3 rethinking book. The purpose of 2 | # this script is to build a model and evaluate the probability 3 | # distribution (posterior) of the model parameter 4 | # define grid 5 | p_grid <- seq(from = 0, to = 1, length.out = 20) 6 | 7 | # define prior 8 | #prior <- rep(1, 20) 9 | #prior <- ifelse(p_grid < 0.5, 0, 1) 10 | prior <- exp(-5*abs(p_grid - 0.5)) 11 | 12 | # compute likelihood at each value in grid 13 | likelihood <-dbinom(6, size = 9, prob = p_grid) 14 | 15 | # compute product of likelihood and prior 16 | unstd.posterior <- likelihood * prior 17 | 18 | # standardize the posterior, so it sums to 1 19 | posterior <- unstd.posterior / sum(unstd.posterior) 20 | 21 | # plot the posterior distribution 22 | plot(p_grid, posterior, type = 'b', 23 | xlab = 'probability of water', ylab = 'posterior probability') 24 | mtext('20 points') 25 | -------------------------------------------------------------------------------- /r/rethinking/collider_bias.R: -------------------------------------------------------------------------------- 1 | # R code 6.22 rethinking book 2 | library(rethinking) 3 | d <- sim_happiness(seed = 1977, N_years = 1000) 4 | precis(d) 5 | d2 <- d[d$age > 17, ] # only adults 6 | d2$A <- (d2$age - 18) / (65 - 18) # rescale age interval from 65-18 to 0-1 7 | d2$mid <- d2$married + 1 # married = 2, not married = 1 8 | m6.9 <- quap( 9 | alist( 10 | happiness ~ dnorm( mu , sigma ) , 11 | mu <- a[mid] + bA * A , # here we consider marriage status 12 | a[mid] ~ dnorm(0, 1) , 13 | bA ~ dnorm(0, 2), 14 | sigma ~ dexp(1) 15 | ) , 16 | data = d2 17 | ) 18 | 19 | precis(m6.9, depth = 2) 20 | 21 | m6.10 <- quap( 22 | alist( 23 | happiness ~ dnorm( mu , sigma ) , 24 | mu <- a + bA * A , # here we do not consider marriage status 25 | a ~ dnorm(0, 1) , 26 | bA ~ dnorm(0, 2), 27 | sigma ~ dexp(1) 28 | ) , 29 | data = d2 30 | ) 31 | 32 | precis(m6.10) 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /r/rethinking/ch9_easy_hmc.R: -------------------------------------------------------------------------------- 1 | # R code 9.9 rethinking book 2 | # Example using Hamiltonian Monte Carlo 3 | library(rethinking) 4 | data("rugged") 5 | d <- rugged 6 | d$log_gdp <- log(d$rgdppc_2000) 7 | dd <- d[complete.cases(d$rgdppc_2000), ] 8 | dd$log_gdp_std <- dd$log_gdp / mean(dd$log_gdp) 9 | dd$rugged_std <- dd$rugged / max(dd$rugged) 10 | dd$cid <- ifelse(dd$cont_africa == 1, 1, 2) 11 | 12 | dat_slim <- list( 13 | log_gdp_std = dd$log_gdp_std, 14 | rugged_std = dd$rugged_std, 15 | cid = as.integer(dd$cid) 16 | ) 17 | 18 | str(dat_slim) 19 | 20 | m9.1 <- ulam( 21 | alist( 22 | log_gdp_std ~ dnorm( mu , sigma ) , 23 | mu <- a[cid] + b[cid] * (rugged_std - 0.215), 24 | a[cid] ~ dnorm(1, 0.1) , 25 | b[cid] ~ dnorm(0, 0.3), 26 | sigma ~ dexp(1) 27 | ) , 28 | data = dat_slim, 29 | chains = 4, 30 | cores = 4, 31 | iter = 1000 32 | ) 33 | 34 | precis(m9.1, depth = 2) 35 | 36 | 
pairs(m9.1) 37 | traceplot(m9.1, chains = 1) 38 | 39 | -------------------------------------------------------------------------------- /python/covid19/data/movie-critics.json: -------------------------------------------------------------------------------- 1 | {"Lisa Rose": {"Lady in the Water": 2.5, "Snakes on a Plane": 3.5, 2 | "Just My Luck": 3.0, "Superman Returns": 3.5, "You, Me and Dupree": 2.5, 3 | "The Night Listener": 3.0}, 4 | "Gene Seymour": {"Lady in the Water": 3.0, "Snakes on a Plane": 3.5, 5 | "Just My Luck": 1.5, "Superman Returns": 5.0, "The Night Listener": 3.0, 6 | "You, Me and Dupree": 3.5}, 7 | "Michael Phillips": {"Lady in the Water": 2.5, "Snakes on a Plane": 3.0, 8 | "Superman Returns": 3.5, "The Night Listener": 4.0}, 9 | "Claudia Puig": {"Snakes on a Plane": 3.5, "Just My Luck": 3.0, 10 | "The Night Listener": 4.5, "Superman Returns": 4.0, 11 | "You, Me and Dupree": 2.5}, 12 | "Mick LaSalle": {"Lady in the Water": 3.0, "Snakes on a Plane": 4.0, 13 | "Just My Luck": 2.0, "Superman Returns": 3.0, "The Night Listener": 3.0, 14 | "You, Me and Dupree": 2.0}, 15 | "Jack Matthews": {"Lady in the Water": 3.0, "Snakes on a Plane": 4.0, 16 | "The Night Listener": 3.0, "Superman Returns": 5.0, "You, Me and Dupree": 3.5}, 17 | "Toby": {"Snakes on a Plane":4.5,"You, Me and Dupree":1.0,"Superman Returns":4.0}} 18 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | Python Cookbook 2 | =============== 3 | A collection of notebooks with examples about the main Python packages: NumPy, Pandas, Matplotlib, SciPy. 4 | 5 | * [NumPy and SciPy](scipy-numpy-cheat-sheet.ipynb) 6 | * [Jupyter widgets](jupyter_widgets.ipynb) 7 | * [Pandas](intro_to_pandas.ipynb) 8 | * [Object-Oriented Python](python_oop.ipynb) 9 | 10 | Good references for Python programming are: 11 | * Jake VanderPlas' [Python Data Science Handbook](https://jakevdp.github.io/PythonDataScienceHandbook/) 12 | * Wes McKinney's [Python for Data Analysis, 3rd Edition](https://wesmckinney.com/book/) 13 | * Scopatz and Huff's [Effective Computation in Physics](https://www.amazon.com/Effective-Computation-Physics-Research-Python-ebook/dp/B010ORQ8DG) 14 | * [Gorelick - High Performance Python, 2nd Edition](https://www.amazon.com/High-Performance-Python-Performant-Programming/dp/1492055026) 15 | 16 | ## Python packages 17 | [pytest](https://docs.pytest.org/en/stable/), framework for tests 18 | [Python Packaging](https://packaging.python.org/en/latest/), Python code distribution 19 | [Read the Docs](https://about.readthedocs.com/), open source software documentation 20 | -------------------------------------------------------------------------------- /r/rethinking/ch9_king_markov_decision_procedure.R: -------------------------------------------------------------------------------- 1 | # R code 9.1 rethinking book 2 | # King Markov decision procedure 3 | num_weeks <- 1e5 4 | positions <- rep(0, num_weeks) 5 | current <- 10 6 | for (i in 1:num_weeks) { 7 | # record current position 8 | positions[i] <- current 9 | 10 | # flip coin to generate proposal 11 | proposal <- current + sample(c(-1,1), size = 1) 12 | # make sure he loops around the archipelago 13 | if (proposal < 1) proposal <- 10 14 | if (proposal > 10) proposal <- 1 15 | 16 | # move ? 
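# In this parable island k has population proportional to k, so the Metropolis
# rule "accept with probability min(1, pop_proposal / pop_current)" reduces to
# the ratio proposal / current used below (runif(1) < prob_move).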
17 | prob_move <- proposal / current 18 | current <- ifelse(runif(1) < prob_move, proposal, current) 19 | } 20 | plot( 1:1000 , positions[1:1000] ) 21 | plot( table( positions ) ) 22 | 23 | # Many dimensions problem. A probability density function 24 | # in a hyperspace most of the points are close the its surface 25 | # far from the mean. So sampling close to it won't be efficient. 26 | library(rethinking) 27 | D <- 1000 28 | T <- 1e4 29 | Y <- rmvnorm(T, rep(0, D), diag(D)) # transforms univariate normal distribution, with 0 mean, to a multivariate distribution 30 | rad_dist <- function(Y) sqrt(sum(Y^2)) 31 | Rd <- sapply(1:T, function(i) rad_dist(Y[i, ])) 32 | dens(Rd) -------------------------------------------------------------------------------- /r/rethinking/kruschke.R: -------------------------------------------------------------------------------- 1 | N = 500 # Specify the total number of flips, denoted N. 2 | pHeads = 0.5 # Specify underlying probability of heads. 3 | # Generate a random sample of N flips (heads=1, tails=0): 4 | flipSequence = sample( x=c(0,1), prob=c(1-pHeads,pHeads), size=N, replace=TRUE) 5 | # Compute the running proportion of heads: 6 | r = cumsum( flipSequence ) # Cumulative sum: Number of heads at each step. 7 | n = 1:N # Number of flips at each step. 8 | runProp = r / n # Component by component division. 9 | # Graph the running proportion: 10 | plot( n , runProp , type="o" , log="x" , col="skyblue" , 11 | xlim=c(1,N) , ylim=c(0.0,1.0) , cex.axis=1.5 , 12 | xlab="Flip Number" , ylab="Proportion Heads" , cex.lab=1.5 , 13 | main="Running Proportion of Heads" , cex.main=1.5 ) 14 | # Plot a dotted horizontal reference line: 15 | abline( h=pHeads , lty="dotted" ) 16 | # Display the beginning of the flip sequence: 17 | flipLetters = paste( c("T","H")[flipSequence[1:10]+1] , collapse="" ) 18 | displayString = paste0( "Flip Sequence = " , flipLetters , "..." ) 19 | text( N , .9 , displayString , adj=c(1,0.5) , cex=1.3 ) 20 | # Display the relative frequency at the end of the sequence. 21 | text( N , .8 , paste("End Proportion =",runProp[N]) , adj=c(1,0.5) , cex=1.3 ) -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | ## Welcome to the Data Science repository ! 2 | 3 | This repository contains code, mostly Jupyter notebooks in Python and R, about data science and statistical inference. The main projects are 4 | 5 | - [covid19 monitoring notebook](https://github.com/luigiselmi/datascience/blob/master/python/covid19/covid19-monitoring-notebook.ipynb) is a single notebook in python that monitors the > 6 | virus in some European countries. 7 | - [Statistical Learning](https://github.com/luigiselmi/datascience/blob/master/r/stat_learning/chapter1.ipynb) is a collection of notebooks in R with my notes about a course on 8 | statistical inference and worked out examples taken from the book used in the course. 9 | - [Bayesian Inference](https://github.com/luigiselmi/datascience/blob/master/r/rethinking/probability.ipynb) is a collection 10 | of notes about probability and bayesian inference and R scripts taken 11 | from the books I am using for this project. 12 | - [copernicus](https://github.com/luigiselmi/datascience/blob/master/python/copernicus/copernicus_services.ipynb) notebooks about climate change using the Copernicus 13 | data. 
14 | - [digital image processing](https://github.com/luigiselmi/datascience/blob/master/python/imaging/digital_image_processing.ipynb) notebooks about digital image processing 15 | -------------------------------------------------------------------------------- /r/rethinking/categorical_variables.R: -------------------------------------------------------------------------------- 1 | # R code 5.34 rethinking book 2 | library(rethinking) 3 | data("Howell1") 4 | d <- Howell1 5 | str(d) 6 | 7 | # we create an index variable to represent the sex 8 | d$sex <- ifelse(d$male == 1, 2, 1) 9 | 10 | # We build a model with an index variable as prior 11 | m5.8 <- quap( 12 | alist( 13 | height ~ dnorm( mu , sigma ) , 14 | mu <- a[sex] , 15 | a[sex] ~ dnorm( 178 , 20 ) , 16 | sigma ~ dunif(0, 50) 17 | ) , 18 | data = d 19 | ) 20 | precis(m5.8, depth = 2) 21 | 22 | # Let's extract a sample from the posterior distribution 23 | post <- extract.samples(m5.8) 24 | post$diff_fm <- post$a[, 1] - post$a[, 2] 25 | precis(post, depth = 2) 26 | 27 | # Let#s use again the primate milk dataset 28 | data("milk") 29 | d <- milk 30 | unique((d$clade)) 31 | d$clade_id <- as.integer(d$clade) 32 | # we build a model to measure the average energy in each clade (cialda) 33 | d$K <- scale(d$kcal.per.g) 34 | m5.9 <- quap( 35 | alist( 36 | K ~ dnorm( mu , sigma ) , 37 | mu <- a[clade_id] , 38 | a[clade_id] ~ dnorm( 0 , 0.5 ) , 39 | sigma ~ dexp(1) 40 | ) , 41 | data = d 42 | ) 43 | labels <- paste("a[", 1:4, "]:", levels(d$clade), sep = "") 44 | plot(precis(m5.9, depth = 2, pars = "a"), labels = labels) 45 | -------------------------------------------------------------------------------- /r/rethinking/posterior_predictive_distribution.R: -------------------------------------------------------------------------------- 1 | # R code 3.26 rethinking book 2 | # Once we have a model we want to use it to make predictions. 
3 | # In this script we create a model then we collect samples from it 4 | data_size <- 1000 # size of the dataset 5 | # creates a sequence of values between 0 and 1, 6 | # that represent possible values of a parameter 7 | p_grid <- seq(from = 0, to = 1, length.out = data_size) 8 | 9 | # creates a sequence of ones that represent the 10 | # prior probability of each possible value of the parameter 11 | prior <- rep(1, data_size) 12 | 13 | # computes the likelihood of each value of the paramenter 14 | # under the binomial distribution and the observations 15 | likelihood <- dbinom(6, size = 9, prob = p_grid) 16 | #plot(likelihood) 17 | 18 | # computes the posterior distribution 19 | posterior <- likelihood * prior 20 | 21 | # normalization of the posterior distribution 22 | posterior <- posterior / sum(posterior) 23 | 24 | # creates samples of from the posterior distribution 25 | samples_size = 1e4 26 | samples <- sample(p_grid, prob = posterior, size = samples_size, replace = TRUE) 27 | 28 | # there is uncertainty over the parameter so we propagate through all 29 | # the possible values to create predictions 30 | w <- rbinom(1e4, size = 9, prob = samples) 31 | 32 | # plot the predictions 33 | simplehist(w) 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /r/rethinking/b_spline.R: -------------------------------------------------------------------------------- 1 | # R code 4.72 rethinking book 2 | library(rethinking) 3 | data("cherry_blossoms") 4 | d <- cherry_blossoms 5 | precis(d) 6 | # plot temperature against year 7 | plot(d$temp ~ d$year) 8 | 9 | d2 <- d[complete.cases(d$temp), ] # complete cases on temp 10 | num_knots <- 15 11 | knot_list <- quantile(d2$year, probs = seq(0, 1, length.out = num_knots)) 12 | 13 | # constructs degree 3, cubic, spline 14 | library(splines) 15 | B <- bs(d2$year, 16 | knots = knot_list[-c(1, num_knots)], 17 | degree = 3, intercept = TRUE) 18 | 19 | # plot the basis functions 20 | plot( NULL , xlim=range(d2$year) , ylim=c(0,1) , xlab="year" , ylab="basis value") 21 | for ( i in 1:ncol(B) ) lines( d2$year , B[,i] ) 22 | 23 | # defines the model 24 | m4.7 <- quap( 25 | alist( 26 | T ~ dnorm( mu , sigma ) , 27 | mu <- a + B %*% w , 28 | a ~ dnorm(6,10), 29 | w ~ dnorm(0,1), 30 | sigma ~ dexp(1) 31 | ), 32 | data=list( T=d2$temp , B=B ) , 33 | start=list( w=rep( 0 , ncol(B) ) ) ) 34 | 35 | # plot the weighted basis functions 36 | post <- extract.samples(m4.7) 37 | w <- apply( post$w , 2 , mean ) 38 | plot( NULL , xlim=range(d2$year) , ylim=c(-2,2) , 39 | xlab="year" , ylab="basis * weight" ) 40 | for ( i in 1:ncol(B) ) lines( d2$year , w[i]*B[,i] ) 41 | 42 | # plot mu with 97 % posterior interval 43 | mu <- link( m4.7 ) 44 | mu_PI <- apply(mu,2,PI,0.97) 45 | plot( d2$year , d2$temp , col=col.alpha(rangi2,0.3) , pch=16 ) 46 | shade( mu_PI , d2$year , col=col.alpha("black",0.5) ) -------------------------------------------------------------------------------- /r/rethinking/normal_distribution.R: -------------------------------------------------------------------------------- 1 | # R code from snippet 4.1 (rethinking book). 2 | # We look at different processes that end up in a normal distribution. 3 | # 1) Normal by addition 4 | # We imagine an individual that takes a certain number of steps, 5 | # whose length is taken from the uniform distribution within an 6 | # interval between -1 and 1, and finally, we look at its final 7 | # position that means we sum up all the steps. 
8 | # We repeat this experiment 1000 times. All this can be 9 | # done in R with one single line of code. 10 | pos <- replicate(1000, sum(runif(16, -1, 1))) 11 | 12 | # The important result is that "adding together random values 13 | # from the same distribution, uniform or others, converges to 14 | # a normal distribution. 15 | #hist(pos) # uncomment to plot 16 | 17 | # 2) Normal by multiplication 18 | # Other processes can be described by a multiplication of values 19 | # that fluctuate randomly by a small amount, e.g. 0.1, about a 20 | # central point 21 | deviation <- 0.1 22 | growth <- replicate(10000, prod(1 + runif(12, 0, 0.1))) 23 | 24 | # We still get a normal distribution for the process. 25 | library(rethinking) 26 | #dens(growth, norm.comp = TRUE) # uncomment to plot 27 | 28 | # 3) Normal by log-multiplication 29 | # Multiplying large random deviations are not normal distributed 30 | # but they logarithm are since they are transformed in additions. 31 | deviation <- 0.5 32 | log.big <- replicate(10000, log(prod(1 + runif(12, 0, 0.5)))) 33 | #dens(log.big, norm.comp = TRUE) # uncomment to plot 34 | 35 | 36 | curve(exp(-x^2), from = -3, to = 3) # Gaussian distribution -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Data Science 2 | ============ 3 | I use this repository for my projects about data science and statistical inference. The main projects are 4 | 5 | - [Python Cookbook](python/README.md) Python data structures, and code examples using the Python core packages for data science: NumPy, Pandas, Matplotlib, SciPy. 6 | - [covid19 monitoring notebook](python/covid19/covid19-monitoring-notebook.ipynb) is a single notebook in python that monitors the spread of the 7 | virus in some European countries. 8 | - [Statistical Learning](r/stat_learning/chapter1.ipynb) is a collection of notebooks in R with my notes about a course on 9 | statistical inference and worked out examples taken from the book used in the course. 10 | - [Bayesian Inference](r/rethinking/probability.ipynb) is a collection 11 | of notes about probability and bayesian inference and R scripts taken 12 | from the books I am using for this project. 13 | 14 | The Digital Image Processing folder has been moved to a new repository [dip](https://github.com/luigiselmi/dip). The copernicus folder has been moved to the new repository [copernicus](https://github.com/luigiselmi/copernicus). The iia folder has been moved to the new [climate]() repository. 15 | 16 | ## Open data 17 | A list of open data sets about science, economics, finance, and health is available [here](datasets.md). 
18 | 19 | ## Python packages for data science 20 | [PyMC](https://www.pymc.io/welcome.html), Bayesian probabilistic programming in Python 21 | [KDEpy](https://kdepy.readthedocs.io/en/latest/#), Kernel density estimation for Python 22 | [Pyro](https://pyro.ai/), deep universal probabilistic programming 23 | [pyFFTW](https://pyfftw.readthedocs.io/en/latest/index.html#), fast fourier transform 24 | -------------------------------------------------------------------------------- /r/stat_learning/test_roc.R: -------------------------------------------------------------------------------- 1 | # Create the data set and plot 2 | set.seed(1) 3 | 4 | # Create the data set and Split in training and test set 5 | X <- matrix(rnorm(200 * 2), ncol = 2) # random sample matrix from a normal distribution with mean = 0 and standard deviation = 1 6 | y <- c(rep(1, 100), rep(2, 100)) # two classes y = 1 and y = 2 7 | X[y == 1, ] <- X[y == 1, ] + 1 # move apart the class with y = 1 8 | train <- sample(200, 100) # random integers for index of train data 9 | dat <- data.frame(x = X, y = as.factor(y)) 10 | plot(X, col = y, xlab = "X1", ylab = "X2") 11 | 12 | library(e1071) 13 | # Fit the training data using svm with radial kernel 14 | #svmfit <- svm(y ~ ., data = dat[train, ], kernel = "radial", gamma = 2, cost = 1) 15 | # Fit the training data using svm with linear kernel 16 | svmfit <- svm(y ~ ., data = dat[train, ], kernel = "linear", cost = 6, scale = TRUE) 17 | 18 | plot(svmfit, dat[train, ]) 19 | 20 | # Function definition to Plot the ROC curves 21 | library(ROCR) 22 | rocplot <- function(pred, truth, ...) { 23 | predob <- prediction(pred, truth) 24 | perf <- performance(predob, measure = "tpr", x.measure = "fpr") 25 | plot(perf, col = "blue", colorize = TRUE, ...) 26 | auc <- performance(predob, measure = "auc") 27 | abline(a=0, b= 1) 28 | return(auc) 29 | } 30 | 31 | par(mfrow = c(1,2)) 32 | # Predict the training data and plot the ROC curve 33 | fitted <- attributes(predict(svmfit, dat[train, ], decision.values = TRUE))$decision.values 34 | auc <- rocplot(fitted, dat[train, "y"], main = "Training Data") 35 | auc@y.values 36 | 37 | # Predict the test data and plot the ROC curve 38 | fitted <- attributes(predict(svmfit, dat[-train, ], decision.values = TRUE))$decision.values 39 | auc <-rocplot(fitted, dat[-train,"y"], main = "Test Data") 40 | auc@y.values 41 | 42 | -------------------------------------------------------------------------------- /r/rethinking/ch11_binomial_regression.R: -------------------------------------------------------------------------------- 1 | # R code 11.1 rethinking book 2 | # Binomial regression - Prosocial chimpanzees experiment. 3 | # We want to see whether a chimpazee shares food with others or not. 
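# The models below are binomial (logistic) regressions: the linear model sits on
# the log-odds scale, logit(p) = log(p / (1 - p)), and p is recovered with
# inv_logit(). The treatment index built below combines prosoc_left and
# condition into four categories (R/N, L/N, R/P, L/P).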
4 | library(rethinking) 5 | data("chimpanzees") 6 | d <- chimpanzees 7 | 8 | d$treatment <- 1 + d$prosoc_left + 2 * d$condition 9 | 10 | xtabs(~ treatment + prosoc_left + condition, d) 11 | 12 | m11.1 <- quap( 13 | alist( 14 | pulled_left ~ dbinom( 1 , p ) , 15 | logit(p) <- a, 16 | a ~ dnorm(0, 1.5) 17 | ) , 18 | data = d 19 | ) 20 | 21 | m11.2 <- quap( 22 | alist( 23 | pulled_left ~ dbinom( 1 , p ) , 24 | logit(p) <- a + b[treatment] , 25 | a ~ dnorm(0, 1.5) , 26 | b[treatment] ~ dnorm(0, 10) 27 | ) , 28 | data = d 29 | ) 30 | 31 | m11.3 <- quap( 32 | alist( 33 | pulled_left ~ dbinom( 1 , p ) , 34 | logit(p) <- a + b[treatment] , 35 | a ~ dnorm(0, 1.5) , 36 | b[treatment] ~ dnorm(0, 0.5) 37 | ) , 38 | data = d 39 | ) 40 | 41 | set.seed(1999) 42 | prior <- extract.prior(m11.3, n = 1e4) 43 | p <- sapply(1:4, function(k) inv_logit(prior$a + prior$b[, k])) 44 | dens(abs(p[, 1] - p[, 2]), adj = 0.1) 45 | 46 | # prior trimmed data list 47 | dat_list <- list( 48 | pulled_left = d$pulled_left , 49 | actor = d$actor , 50 | treatment = as.integer(d$treatment) 51 | ) 52 | 53 | # particles in 11-dimensional space 54 | m11.4 <- ulam( 55 | alist( 56 | pulled_left ~ dbinom(1, p) , 57 | logit(p) <- a[actor] + b[treatment] , 58 | a[actor] ~ dnorm(0, 1.5) , 59 | b[treatment] ~ dnorm(0, 0.5) 60 | ) , 61 | data = dat_list, chains = 4 62 | ) 63 | 64 | precis(m11.4, depth = 2) 65 | 66 | post <- extract.samples(m11.4) 67 | p_left <- inv_logit(post$a) 68 | labs <- c("R/N", "L/N", "R/P", "L/P") 69 | plot(precis(m11.4, depth = 2, pars = "b"), labels = labs) # not what expected 70 | #plot(precis(as.data.frame(p_left)), xlim = c(0,1)) # error 71 | -------------------------------------------------------------------------------- /r/rethinking/sampling_from_grid.R: -------------------------------------------------------------------------------- 1 | # R code from snippet 3.2 (rethinking book) 2 | # Usually a model has many parameters and cannot be handled analytically 3 | # so the parameters are computed numerically using samples taken from 4 | # posterior distribution.This script shows how to build a sample from a 5 | # statistical model with only one parameter (like in the Globe example) 6 | # and how to summarize the posterior distribution computing for example 7 | # the value with highest probability or the probability for the parameter 8 | # to have any value below(above) a defined one. 
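# Note: with a flat prior and 6 waters out of 9 tosses the posterior is known
# analytically, Beta(6 + 1, 3 + 1), so the grid and sample summaries below can
# be checked against, e.g., pbeta(0.5, 7, 4).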
9 | data_size <- 1000 # size of the dataset 10 | 11 | # creates a sequence of values between 0 and 1, 12 | # that represent possible values of a parameter 13 | p_grid <- seq(from = 0, to = 1, length.out = data_size) 14 | 15 | # creates a sequence of ones that represent the 16 | # prior probability of each possible value of the parameter 17 | prob_p <- rep(1, data_size) 18 | 19 | # computes the likelihood of each value of the paramenter 20 | # under the binomial distribution and the observations 21 | prob_data <- dbinom(6, size = 9, prob = p_grid) 22 | 23 | # computes the posterior distribution 24 | posterior <- prob_data * prob_p 25 | 26 | # normalization of the posterior distribution 27 | posterior <- posterior / sum(posterior) 28 | #plot(posterior) 29 | 30 | # creates a sample of values from the distribution 31 | samples_size = 1e4 32 | samples <- sample(p_grid, prob = posterior, size = samples_size, replace = TRUE) 33 | #plot(samples) # uncomment to plot the data 34 | 35 | library(rethinking) 36 | # plot the (density) posterior distribution 37 | #dens(samples) # uncomment to plot the data 38 | 39 | # Sampling to summarize 40 | # posterior probability for the parameter to be below 0.5 (without sampling) 41 | sum(posterior[p_grid < 0.5]) 42 | # same using the samples 43 | sum(samples < 0.5) / samples_size 44 | 45 | # posterior probability for the parameter between 0.5 and 0.75, using ther samples 46 | sum(samples > 0.5 & samples < 0.75) / samples_size 47 | 48 | 49 | -------------------------------------------------------------------------------- /r/rethinking/waic_information_criteria.R: -------------------------------------------------------------------------------- 1 | # R code 7.33 rethinking book 2 | library(rethinking) 3 | # We want to investigate the influence of body mass (M) and 4 | # brain volume (B) on longevity (L) 5 | data("Primates301") 6 | d <- Primates301 7 | # we standardize the three variables we are going to use 8 | d$log_L <- scale(log(d$longevity)) 9 | d$log_B <- scale(log(d$brain)) 10 | d$log_M <- scale(log(d$body)) 11 | # we look for missing values first 12 | sapply(d[, c("log_L", "log_B", "log_M")], function(x) sum(is.na(x))) 13 | d2 <- d[complete.cases(d$log_L, d$log_M, d$log_B), ] # removed rows with missing values 14 | # Let's define the model to infer the direct influence of brain volume (B) on longevity (L). 15 | # According to the causal graph we have to control the body mass variable (M) to close a 16 | # (pipe) backdoor M->B->L. Controlling a variable means adding it to the model. 17 | m7.8 <- quap( 18 | alist( 19 | log_L ~ dnorm( mu , sigma ) , 20 | mu <- a + bM * log_M + bB * log_B, 21 | a ~ dnorm(0, 0.1) , 22 | bM ~ dnorm(0, 0.5), 23 | bB ~ dnorm(0, 0.5), 24 | sigma ~ dexp(1) 25 | ) , 26 | data = d2 27 | ) 28 | # We also define two simpler models to evaluate the accuracy of each one. 29 | m7.9 <- quap( 30 | alist( 31 | log_L ~ dnorm( mu , sigma ) , 32 | mu <- a + bB * log_B, 33 | a ~ dnorm(0, 0.1) , 34 | bB ~ dnorm(0, 0.5), 35 | sigma ~ dexp(1) 36 | ) , 37 | data = d2 38 | ) 39 | m7.10 <- quap( 40 | alist( 41 | log_L ~ dnorm( mu , sigma ) , 42 | mu <- a + bM * log_M , 43 | a ~ dnorm(0, 0.1) , 44 | bM ~ dnorm(0, 0.5), 45 | sigma ~ dexp(1) 46 | ) , 47 | data = d2 48 | ) 49 | # We compare the Widely Applicable Information Criterion (WAIC) of all the 50 | # models. The WAIC provides an approximation of the out-of-sample deviance 51 | # of a model. The smaller WAIC value the better because it means the model 52 | # is closer to the target one. 
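# For reference, WAIC = -2 * (lppd - pWAIC), where lppd is the log pointwise
# predictive density and the penalty pWAIC is the sum, over observations, of
# the variance of the log-likelihood across posterior samples.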
53 | set.seed(301) 54 | compare(m7.8, m7.9, m7.10) 55 | # Let's compare the posterior distributions of the models' parameters 56 | coeftab(m7.8, m7.9, m7.10) 57 | # Let's see how body mass and brain volume correlate 58 | cor(d2$log_B, d2$log_M) 59 | -------------------------------------------------------------------------------- /python/stats/data/italy/unemployment_rate_istat_province.csv: -------------------------------------------------------------------------------- 1 | DEN_UTS,unemployment 2 | Imperia,11.13 3 | Taranto,14.959 4 | Cuneo,4.62 5 | Rovigo,9.092 6 | Savona,7.572 7 | Oristano,14.832 8 | Piacenza,6.065 9 | Lecco,5.493 10 | Potenza,8.442 11 | Trieste,5.648 12 | Sondrio,6.484 13 | Brescia,4.891 14 | Matera,7.922 15 | Verbano-Cusio-Ossola,5.808 16 | Grosseto,8.041 17 | Ragusa,15.011 18 | Firenze,6.184 19 | Mantova,4.583 20 | Caltanissetta,16.32 21 | Sassari,13.022 22 | Cremona,4.975 23 | Trento,4.807 24 | Forli-Cesena,5.487 25 | Messina,23.909 26 | Barletta-Andria-Trani,15.602 27 | Como,7.501 28 | Palermo,19.449 29 | Rimini,7.409 30 | Perugia,6.458 31 | La Spezia,9.782 32 | Cagliari,16.409 33 | Siena,5.878 34 | Sud Sardegna,13.095 35 | Bari,9.968 36 | Siracusa,21.373 37 | Padova,5.558 38 | Salerno,15.123 39 | Campobasso,10.46 40 | Alessandria,6.867 41 | Belluno,4.239 42 | Aosta,7.26 43 | Venezia,6.026 44 | Ancona,8.406 45 | Cosenza,18.775 46 | Brindisi,15.8 47 | Genova,7.543 48 | Varese,6.556 49 | Reggio di Calabria,16.597 50 | Fermo,4.822 51 | Latina,10.941 52 | Bologna,4.574 53 | Macerata,7.449 54 | Torino,8.256 55 | Bolzano,3.837 56 | Caserta,15.349 57 | Pescara,11.203 58 | Asti,7.447 59 | Vibo Valentia,19.223 60 | L'Aquila,9.493 61 | Monza Brianza,6.586 62 | Lodi,5.277 63 | Enna,18.169 64 | Chieti,9.651 65 | Roma,9.763 66 | Pavia,6.953 67 | Ferrara,7.299 68 | Catania,15.452 69 | Biella,6.016 70 | Pistoia,9.776 71 | Trapani,15.749 72 | Nuoro,7.269 73 | Vercelli,8.231 74 | Novara,7.676 75 | Benevento,12.774 76 | Arezzo,7.051 77 | Napoli,23.672 78 | Verona,4.67 79 | Udine,6.692 80 | Treviso,4.903 81 | Pesaro e Urbino,5.532 82 | Foggia,21.767 83 | Terni,7.189 84 | Gorizia,7.67 85 | Crotone,20.33 86 | Lecce,15.351 87 | Pordenone,3.253 88 | Parma,5.752 89 | Viterbo,10.851 90 | Isernia,12.119 91 | Avellino,14.471 92 | Vicenza,4.628 93 | Pisa,6.755 94 | Massa-Carrara,11.271 95 | Ravenna,6.204 96 | Bergamo,3.545 97 | Milano,6.467 98 | Catanzaro,16.795 99 | Frosinone,10.876 100 | Ascoli Piceno,8.474 101 | Reggio nell'Emilia,5.146 102 | Rieti,10.378 103 | Prato,7.061 104 | Modena,4.393 105 | Lucca,11.927 106 | Agrigento,21.777 107 | Livorno,5.453 108 | Teramo,6.929 109 | -------------------------------------------------------------------------------- /r/rethinking/ch9_hamiltonian_monte_carlo.R: -------------------------------------------------------------------------------- 1 | # R code 9.3 rethinking book 2 | # Hamiltonian Monte Carlo example. 3 | # In this example we'll use a two variable model. 4 | # We need: 5 | # 1) a function of the log-probability 6 | # 2) the gradient 7 | # 3) the step size 8 | # 4) the number of leapfrog steps 9 | 10 | # function U of the log-probability of the data. 
11 | # Returns the neg-log-probability 12 | U <- function(q, a = 0, b = 1, k = 0, d = 1) { 13 | muy <- q[1] 14 | mux <- q[2] 15 | U <- sum(dnorm(y, muy, 1, log = TRUE)) + sum(dnorm(x, mux, 1, log = TRUE)) + 16 | dnorm(muy, a, b, log = TRUE) + dnorm(mux, k, d, log = TRUE) 17 | } 18 | 19 | # U gradient function 20 | # Sum of partial derivatives with respect to parameters mux and muy 21 | U_gradient <- function(q, a = 0, b = 1, k = 0, d = 1) { 22 | muy <- q[1] 23 | mux <- q[2] 24 | G1 <- sum(y - muy) + (a - muy) / b^2 # dU/d(muy) 25 | G2 <- sum(x - mux) + (k - mux) / d^2 # dU/d(mux) 26 | return(c(-G1, -G2)) # negative because energy is neg-log-prob 27 | } 28 | # test data 29 | y <- rnorm(50) 30 | x <- rnorm(50) 31 | x <- as.numeric(scale(x)) 32 | y <- as.numeric(scale(y)) 33 | 34 | library(shape) # for fancy arrows 35 | Q <- list() 36 | Q$q <- c(-0.1,0.2) 37 | pr <- 0.3 38 | plot( NULL , ylab="muy" , xlab="mux" , xlim=c(-pr,pr) , ylim=c(-pr,pr) ) 39 | step <- 0.03 40 | 41 | L <- 11 # 0.03/28 for U-turns --- 11 for working example 42 | n_samples <- 4 43 | path_col <- col.alpha("black",0.5) 44 | points( Q$q[1] , Q$q[2] , pch=4 , col="black" ) 45 | for ( i in 1:n_samples ) { 46 | Q <- HMC2( U , U_gradient , step , L , Q$q ) 47 | if ( n_samples < 10 ) { 48 | for ( j in 1:L ) { 49 | K0 <- sum(Q$ptraj[j,]^2)/2 # kinetic energy 50 | lines( Q$traj[j:(j+1),1] , Q$traj[j:(j+1),2] , col=path_col , lwd=1+2*K0 ) 51 | } 52 | points( Q$traj[1:L+1,] , pch=16 , col="white" , cex=0.35 ) 53 | Arrows( Q$traj[L,1] , Q$traj[L,2] , Q$traj[L+1,1] , Q$traj[L+1,2] , 54 | arr.length=0.35 , arr.adj = 0.7 ) 55 | text( Q$traj[L+1,1] , Q$traj[L+1,2] , i , cex=0.8 , pos=4 , offset=0.4 ) 56 | } 57 | points( Q$traj[L+1,1] , Q$traj[L+1,2] , pch=ifelse( Q$accept==1 , 16 , 1 ) , 58 | col=ifelse( abs(Q$dH)>0.1 , "red" , "black" ) ) 59 | } -------------------------------------------------------------------------------- /r/rethinking/percentile_intervals.R: -------------------------------------------------------------------------------- 1 | # R code 3.11 rethinking book 2 | # This script shows how to summarize a posterior distributions 3 | # by computing point estimates (mode, mean, median) and 4 | # intervals 5 | 6 | data_size <- 1000 # size of the dataset 7 | # creates a sequence of values between 0 and 1, 8 | # that represent possible values of a parameter 9 | p_grid <- seq(from = 0, to = 1, length.out = data_size) 10 | 11 | # creates a sequence of ones that represent the 12 | # prior probability of each possible value of the parameter 13 | prior <- rep(1, data_size) 14 | 15 | # computes the likelihood of each value of the paramenter 16 | # under the binomial distribution and the observations 17 | likelihood <- dbinom(3, size = 3, prob = p_grid) 18 | #plot(likelihood) 19 | 20 | # computes the posterior distribution 21 | posterior <- likelihood * prior 22 | 23 | # normalization of the posterior distribution 24 | posterior <- posterior / sum(posterior) 25 | 26 | # creates a sample of values from the distribution 27 | samples_size = 1e4 28 | samples <- sample(p_grid, prob = posterior, size = samples_size, replace = TRUE) 29 | #plot(samples) 30 | library(rethinking) 31 | dens(samples) 32 | 33 | # computes the probabilities between 25 % and 75 % of the interval 34 | PI(samples, prob = 0.5) 35 | 36 | # computes the 50 % highest density interval, the narrowest interval 37 | # containing the specified probability mass, e.g. 
50 % 38 | HPDI(samples, prob = 0.5) 39 | 40 | # computes the parameter value with highest posterior probability (grid approximation) 41 | # it is called mode or maximum a posteriori (MAP) 42 | p_grid[which.max(posterior)] 43 | 44 | # same but from samples taken from the posterior distribution 45 | chainmode(samples, adj = 0.01) 46 | 47 | # computes the mean 48 | mean(samples) 49 | 50 | # computes the median 51 | median(samples) 52 | 53 | # defines a loss function that computes the weighted distance between 54 | # a parameter value x and any other value in the distribution 55 | loss <- function(x) sum(posterior*abs(x - p_grid)) 56 | 57 | # computes the loss in assuming 0.5 as the value of the parameter 58 | loss(0.5) 59 | 60 | # computes the total loss for any possible value of the parameter 61 | tot_loss <- sapply(p_grid, loss) 62 | 63 | # finds the parameter value with minimum loss 64 | p_grid[which.min(tot_loss)] 65 | 66 | 67 | -------------------------------------------------------------------------------- /r/rethinking/post_treatment_bias.R: -------------------------------------------------------------------------------- 1 | # R code 6.16 rethinking book 2 | # We first build a set of simulated data then we build the model 3 | # that will use the data. 4 | # 1) data simulation 5 | set.seed(71) 6 | # number of plants 7 | N <- 100 8 | 9 | # simulates initial heights 10 | h0 <- rnorm(N, 10, 2) 11 | 12 | # assign treatments and simulate fungus and growth 13 | treatment <- rep(0:1, each = N / 2) 14 | fungus <- rbinom(N, size = 1, prob = 0.5 - treatment * 0.4) 15 | h1 <- h0 + rnorm(N, 5 - 3 * fungus) 16 | 17 | # compose a dataframe 18 | d <- data.frame(h0 = h0, h1 = h1, treatment = treatment, fungus = fungus) 19 | 20 | precis(d) 21 | 22 | # 2) build the model that contains our hipothetical rule of plants' height 23 | # p represents the proportion of growth. In this model it doesn't depends on 24 | # other predictors. 25 | m6.6 <- quap( 26 | alist( 27 | h1 ~ dnorm( mu , sigma ) , 28 | mu <- p * h0 , 29 | p ~ dlnorm( 0 , 0.25 ) , 30 | sigma ~ dexp(1) 31 | ) , 32 | data = d 33 | ) 34 | 35 | precis(m6.6) 36 | 37 | # now we add the two predictors: treatment and fungus 38 | m6.7 <- quap( 39 | alist( 40 | h1 ~ dnorm( mu , sigma ) , 41 | mu <- p * h0 , 42 | p ~ a + bt * treatment + bf * fungus , 43 | a ~ dlnorm( 0 , 0.2 ) , 44 | bt ~ dnorm(0, 0.5) , 45 | bf ~ dnorm(0, 0.5) , 46 | sigma ~ dexp(1) 47 | ) , 48 | data = d 49 | ) 50 | # from this model it looks like treatment doesn't have any effect on the plant's growth 51 | precis(m6.7) 52 | # If we want to know the impact of treatment on growth we must remove fungus as 53 | # a predictor 54 | m6.8 <- quap( 55 | alist( 56 | h1 ~ dnorm( mu , sigma ) , 57 | mu <- p * h0 , 58 | p ~ a + bt * treatment , 59 | a ~ dlnorm( 0 , 0.2 ) , 60 | bt ~ dnorm(0, 0.5) , 61 | sigma ~ dexp(1) 62 | ) , 63 | data = d 64 | ) 65 | 66 | precis(m6.8) 67 | # Let#s plot the directed acyclic graph (DAG) of the model 68 | library(dagitty) 69 | plant_dag <- dagitty("dag { 70 | H0 -> H1 71 | F -> H1 72 | T -> F 73 | }" 74 | ) 75 | coordinates(plant_dag) <- list(x = c(H0 = 0, T = 2, F = 1.5, H1 = 1), 76 | y = c(H0 = 0, T = 0, F = 1, H1 = 2)) 77 | plot(plant_dag) 78 | dseparated(plant_dag, "T", "H1") 79 | dseparated(plant_dag, "T", "H1", "F") 80 | 81 | # We evaluate the model using the WAIC framework (ch.7) 82 | # WAIC provides an estimation of the deviance of the model 83 | # from the target (best) model. The lower it is, the better. 
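# Comparing WAIC(m6.7) and WAIC(m6.8) would show that the model that includes
# fungus predicts better even though it answers the causal question about the
# treatment incorrectly: good prediction and correct causal inference are
# different goals.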
84 | set.seed(11) 85 | WAIC(m6.7) 86 | -------------------------------------------------------------------------------- /python/stats/data/italy/unemployment_rate_istat_province_cod_den_uts.csv: -------------------------------------------------------------------------------- 1 | DEN_UTS,unemployment,COD_UTS 2 | Agrigento,21.777,84 3 | Alessandria,6.867,6 4 | Ancona,8.406,42 5 | Aosta,7.26,7 6 | Arezzo,7.051,51 7 | Ascoli Piceno,8.474,44 8 | Asti,7.447,5 9 | Avellino,14.471,64 10 | Bari,9.968,72 11 | Barletta-Andria-Trani,15.602,110 12 | Belluno,4.239,25 13 | Benevento,12.774,62 14 | Bergamo,3.545,16 15 | Biella,6.016,96 16 | Bologna,4.574,37 17 | Bolzano,3.837,21 18 | Brescia,4.891,17 19 | Brindisi,15.8,74 20 | Cagliari,16.409,92 21 | Caltanissetta,16.32,85 22 | Campobasso,10.46,70 23 | Caserta,15.349,61 24 | Catania,15.452,87 25 | Catanzaro,16.795,79 26 | Chieti,9.651,69 27 | Como,7.501,13 28 | Cosenza,18.775,78 29 | Cremona,4.975,19 30 | Crotone,20.33,101 31 | Cuneo,4.62,4 32 | Enna,18.169,86 33 | Fermo,4.822,109 34 | Ferrara,7.299,38 35 | Firenze,6.184,48 36 | Foggia,21.767,71 37 | Forli-Cesena,5.487,40 38 | Frosinone,10.876,60 39 | Genova,7.543,10 40 | Gorizia,7.67,31 41 | Grosseto,8.041,53 42 | Imperia,11.13,8 43 | Isernia,12.119,94 44 | L'Aquila,9.493,66 45 | La Spezia,9.782,11 46 | Latina,10.941,59 47 | Lecce,15.351,75 48 | Lecco,5.493,97 49 | Livorno,5.453,49 50 | Lodi,5.277,98 51 | Lucca,11.927,46 52 | Macerata,7.449,43 53 | Mantova,4.583,20 54 | Massa-Carrara,11.271,45 55 | Matera,7.922,77 56 | Messina,23.909,83 57 | Milano,6.467,15 58 | Modena,4.393,36 59 | Monza Brianza,6.586,108 60 | Napoli,23.672,63 61 | Novara,7.676,3 62 | Nuoro,7.269,91 63 | Oristano,14.832,95 64 | Padova,5.558,28 65 | Palermo,19.449,82 66 | Parma,5.752,34 67 | Pavia,6.953,18 68 | Perugia,6.458,54 69 | Pesaro e Urbino,5.532,41 70 | Pescara,11.203,68 71 | Piacenza,6.065,33 72 | Pisa,6.755,50 73 | Pistoia,9.776,47 74 | Pordenone,3.253,93 75 | Potenza,8.442,76 76 | Prato,7.061,100 77 | Ragusa,15.011,88 78 | Ravenna,6.204,39 79 | Reggio Calabria,16.597,80 80 | Reggio nell'Emilia,5.146,35 81 | Rieti,10.378,57 82 | Rimini,7.409,99 83 | Roma,9.763,58 84 | Rovigo,9.092,29 85 | Salerno,15.123,65 86 | Sassari,13.022,90 87 | Savona,7.572,9 88 | Siena,5.878,52 89 | Siracusa,21.373,89 90 | Sondrio,6.484,14 91 | Sud Sardegna,13.095,111 92 | Taranto,14.959,73 93 | Teramo,6.929,67 94 | Terni,7.189,55 95 | Torino,8.256,1 96 | Trapani,15.749,81 97 | Trento,4.807,22 98 | Treviso,4.903,26 99 | Trieste,5.648,32 100 | Udine,6.692,30 101 | Varese,6.556,12 102 | Venezia,6.026,27 103 | Verbano-Cusio-Ossola,5.808,103 104 | Vercelli,8.231,2 105 | Verona,4.67,23 106 | Vibo Valentia,19.223,102 107 | Vicenza,4.628,24 108 | Viterbo,10.851,56 109 | -------------------------------------------------------------------------------- /r/rethinking/polynomial_regression.R: -------------------------------------------------------------------------------- 1 | # R code 4.64 rethinking book 2 | # The same approach used to compute the posterior distribution 3 | # of a variable that depends linearly on another can be used when 4 | # the dependency is not linear. As in the linear case when the model 5 | # learnt from the data the values of the two parameters, slope and 6 | # intercept, of the linear relationship, it can learn the parameters 7 | # of a basis of polynomials that represents the non-linear dependency. 8 | # e.g. 
mu = a + b1x + b2x^2 9 | library(rethinking) 10 | data("Howell1") 11 | d <- Howell1 12 | str(d) 13 | # plot height against weight 14 | plot(d$height ~ d$weight) 15 | 16 | # defines the model 17 | d$weight_s <- (d$weight - mean(d$weight)) / sd(d$weight) # standardize the predictor variable 18 | d$weight_s2 <- d$weight_s^2 19 | # 2nd order polynomial 20 | m4.5 <- quap( 21 | alist( 22 | height ~ dnorm( mu , sigma ) , 23 | mu <- a + b1*weight_s + b2*weight_s2 , 24 | a ~ dnorm( 178 , 20 ) , 25 | b1 ~ dlnorm( 0 , 1 ) , 26 | b2 ~ dnorm(0, 1) , 27 | sigma ~ dunif( 0 , 50 ) 28 | ) , 29 | data=d ) 30 | 31 | # 3rd order polynomial 32 | d$weight_s3 = d$weight_s^3 33 | m4.6 <- quap( 34 | alist( 35 | height ~ dnorm( mu , sigma ) , 36 | mu <- a + b1*weight_s + b2*weight_s2 + b3*weight_s3 , 37 | a ~ dnorm( 178 , 20 ) , 38 | b1 ~ dlnorm( 0 , 1 ) , 39 | b2 ~ dnorm(0, 1) , 40 | b3 ~ dnorm(0, 1) , 41 | sigma ~ dunif( 0 , 50 ) 42 | ) , 43 | data=d ) 44 | 45 | precis(m4.5) 46 | 47 | # plot the 2nd order polynomial model 48 | weight.seq <- seq(from = -2.2, to = 2, length.out = 30) 49 | pred_dat <- list(weight_s = weight.seq, weight_s2 = weight.seq^2) 50 | mu <- link(m4.5, data = pred_dat) 51 | mu.mean <- apply(mu, 2, mean) 52 | mu.PI <- apply(mu, 2, PI, prob = 0.89) 53 | sim.height <- sim(m4.5, data = pred_dat) 54 | height.PI <- apply(sim.height, 2, PI, prob = 0.89) 55 | 56 | plot(height ~ weight_s, d, col = col.alpha(rangi2, 0.5)) 57 | lines(weight.seq, mu.mean) 58 | shade(mu.PI, weight.seq) 59 | shade(height.PI, weight.seq) 60 | 61 | 62 | # plot the 3rd order polynomial model 63 | weight.seq <- seq(from = -2.2, to = 2, length.out = 30) 64 | pred_dat <- list(weight_s = weight.seq, weight_s2 = weight.seq^2, weight_s3 = weight.seq^3) 65 | mu <- link(m4.6, data = pred_dat) 66 | mu.mean <- apply(mu, 2, mean) 67 | mu.PI <- apply(mu, 2, PI, prob = 0.89) 68 | sim.height <- sim(m4.6, data = pred_dat) 69 | height.PI <- apply(sim.height, 2, PI, prob = 0.89) 70 | 71 | plot(height ~ weight_s, d, col = col.alpha(rangi2, 0.5)) 72 | lines(weight.seq, mu.mean) 73 | shade(mu.PI, weight.seq) 74 | shade(height.PI, weight.seq) -------------------------------------------------------------------------------- /r/rethinking/gaussian_model_of_height.R: -------------------------------------------------------------------------------- 1 | # R code 4.7 rethinking book 2 | library(rethinking) 3 | data("Howell1") 4 | d <- Howell1 # dataframe 5 | precis(d) 6 | 7 | d2 <- d[d$age >= 18, ] 8 | # plot the height distribution 9 | #dens(d2$height) 10 | 11 | #curve(dnorm(x, 178, 20), from = 100, to = 250) 12 | 13 | #curve(dunif(x, 0, 50), from = -10, to = 60) 14 | 15 | # prior probability distribution for the mean 16 | sample_mu <- rnorm(1e4, 178, 20) 17 | # prior probability distribution for the standard deviation 18 | sample_sigma <- runif(1e4, 0, 50) 19 | # joint prior probability distribution of height, before seeing the data 20 | prior_h <- rnorm(1e4, sample_mu, sample_sigma) 21 | # plot the joint prior probability distribution 22 | #dens(prior_h) 23 | 24 | # grid approximation of the posterior distribution 25 | mu.list <- seq(from = 150, to = 160, length.out = 100) 26 | sigma.list <- seq(from = 7, to = 9, length.out = 100) 27 | post <- expand.grid(mu = mu.list, sigma = sigma.list) 28 | post$LL <- sapply(1:nrow(post), function(i) sum( 29 | dnorm(d2$height, post$mu[i], post$sigma[i], log = TRUE))) 30 | post$prod <- post$LL + dnorm(post$mu, 178, 20, TRUE) + dunif(post$sigma, 0, 50, TRUE) 31 | post$prob <- exp(post$prod - max(post$prod)) 32 | 33 | 
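# Subtracting max(post$prod) before exponentiating avoids numerical underflow
# of the very small joint probabilities; post$prob is therefore only
# proportional to the posterior, which is enough for plotting and sampling.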
# plot the posterior distribution 34 | #contour_xyz(post$mu, post$sigma, post$prob) 35 | #image_xyz(post$mu, post$sigma, post$prob) 36 | 37 | sample.rows <- sample(1:nrow(post), size = 1e4, replace = TRUE, prob = post$prob) 38 | sample.mu <- post$mu[sample.rows] 39 | sample.sigma <- post$sigma[sample.rows] 40 | #plot(sample.mu, sample.sigma, cex = 0.5, pch = 16, col = col.alpha(rangi2, 0.5)) 41 | 42 | #dens(sample.mu) 43 | #dens(sample.sigma) 44 | HPDI(sample.mu) 45 | 46 | # test the standard deviation with a small sample size 47 | # to see that the sigma posterior distribution is not Gaussian 48 | d3 <- sample(d2$height, size = 20) 49 | # grid approximation of the posterior distribution 50 | mu.list <- seq(from = 150, to = 160, length.out = 100) 51 | sigma.list <- seq(from = 7, to = 9, length.out = 100) 52 | post2 <- expand.grid(mu = mu.list, sigma = sigma.list) 53 | post2$LL <- sapply(1:nrow(post2), function(i) sum( 54 | dnorm(d3, post2$mu[i], post2$sigma[i], log = TRUE))) 55 | post2$prod <- post2$LL + dnorm(post2$mu, 178, 20, TRUE) + dunif(post2$sigma, 0, 50, TRUE) 56 | post2$prob <- exp(post2$prod - max(post2$prod)) 57 | 58 | # plot the posterior distribution 59 | #contour_xyz(post$mu, post$sigma, post$prob) 60 | image_xyz(post2$mu, post2$sigma, post$prob) 61 | 62 | sample2.rows <- sample(1:nrow(post2), size = 1e4, replace = TRUE, prob = post2$prob) 63 | sample2.mu <- post2$mu[sample2.rows] 64 | sample2.sigma <- post2$sigma[sample2.rows] 65 | plot(sample2.mu, sample2.sigma, cex = 0.5, pch = 16, col = col.alpha(rangi2, 0.5), 66 | xlab = 'mu', ylab = 'sigma') 67 | dens(sample2.sigma, norm.comp = TRUE) 68 | -------------------------------------------------------------------------------- /python/recommendations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Movie Recommendations\n", 8 | "An example from the book by Toby Segaran \"Programming Collective Intelligence\"" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 2, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import json\n", 18 | "critics = json.load(open('covid19/data/movie-critics.json'))" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/plain": [ 29 | "4.5" 30 | ] 31 | }, 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "output_type": "execute_result" 35 | } 36 | ], 37 | "source": [ 38 | "critics['Toby']['Snakes on a Plane']" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from math import sqrt\n", 48 | "# Returns a distance-based similarity score for person1 and person2\n", 49 | "def sim_distance(prefs,person1,person2):\n", 50 | " # Get the list of shared_items\n", 51 | " si={}\n", 52 | " for item in prefs[person1]:\n", 53 | " if item in prefs[person2]:\n", 54 | " si[item]=1\n", 55 | " # if they have no ratings in common, return 0\n", 56 | " if len(si)==0: return 0\n", 57 | " # Add up the squares of all the differences\n", 58 | " sum_of_squares=sum([pow(prefs[person1][item]-prefs[person2][item],2)\n", 59 | " for item in prefs[person1] if item in prefs[person2]])\n", 60 | " return 1/(1+sum_of_squares)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 5, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "data": { 70 | "text/plain": [ 71 | 
"0.14814814814814814" 72 | ] 73 | }, 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "output_type": "execute_result" 77 | } 78 | ], 79 | "source": [ 80 | "sim_distance(critics,'Lisa Rose','Gene Seymour')" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [] 91 | } 92 | ], 93 | "metadata": { 94 | "kernelspec": { 95 | "display_name": "Python 3 (ipykernel)", 96 | "language": "python", 97 | "name": "python3" 98 | }, 99 | "language_info": { 100 | "codemirror_mode": { 101 | "name": "ipython", 102 | "version": 3 103 | }, 104 | "file_extension": ".py", 105 | "mimetype": "text/x-python", 106 | "name": "python", 107 | "nbconvert_exporter": "python", 108 | "pygments_lexer": "ipython3", 109 | "version": "3.9.13" 110 | } 111 | }, 112 | "nbformat": 4, 113 | "nbformat_minor": 2 114 | } 115 | -------------------------------------------------------------------------------- /r/rethinking/multicollineariry.R: -------------------------------------------------------------------------------- 1 | # R code 6.2 rethinking book 2 | # We set up a simulation to showcase the problem when using predictors 3 | # that are strongly correlated. 4 | N <- 100 5 | set.seed(909) 6 | height <- rnorm(N, 10, 2) # total height 7 | leg_prop <- runif(N, 0.4, 0.5) # leg as proportion of height 8 | leg_left <- leg_prop * height + rnorm(N, 0, 0.02) # left leg as proportion + error 9 | leg_right <- leg_prop * height + rnorm(N, 0, 0.02) # right leg as proportion + error 10 | d <- data.frame(height, leg_left, leg_right) 11 | 12 | m6.1 <- quap( 13 | alist( 14 | height ~ dnorm( mu , sigma ) , 15 | mu <- a + bl * leg_left + br * leg_right , 16 | a ~ dnorm( 10 , 100 ) , 17 | bl ~ dnorm( 2 , 10 ) , 18 | br ~ dnorm( 2 , 10 ) , 19 | sigma ~ dexp(1) 20 | ) , 21 | data = d 22 | ) 23 | 24 | #plot(precis(m6.1)) # doesn't show what it should 25 | post <- extract.samples(m6.1) 26 | plot(bl ~ br, post, col = col.alpha(rangi2, 0.1), pch = 16) 27 | 28 | sum_blbr <- post$bl + post$br 29 | dens(sum_blbr, col = rangi2, lwd = 2, xlab = "sum of bl and br") 30 | 31 | # Now we show the problem using real data, when one might not know in 32 | # advance that two predictor are in fact strongly correlated. 33 | # We will model the dependency of the milk total energy content (K) 34 | # from fat (F) and lactose (L). 35 | library(rethinking) 36 | data(milk) 37 | d <- milk 38 | d$K <- scale(d$kcal.per.g) # Kilocalories (energy content) 39 | d$F <- scale(d$perc.fat) # Fat 40 | d$L <- scale(log(d$perc.lactose)) # lactose (a carbohidrate) 41 | # we start by creating two bivariate models. The first investigates 42 | # the dependency of kilocalories (K) from fat (F) 43 | m6.3 <- quap( 44 | alist( 45 | K ~ dnorm( mu , sigma ) , 46 | mu <- a + bF * F , 47 | a ~ dnorm( 0 , 0.2 ) , 48 | bF ~ dnorm( 0 , 0.5 ) , 49 | sigma ~ dexp(1) 50 | ) , 51 | data = d 52 | ) 53 | # The 2nd model investigates the dependency of kilocalories (K) from lactose (L) 54 | m6.4 <- quap( 55 | alist( 56 | K ~ dnorm( mu , sigma ) , 57 | mu <- a + bL * L, 58 | a ~ dnorm( 0 , 0.2 ) , 59 | bL ~ dnorm( 0 , 0.5 ) , 60 | sigma ~ dexp(1) 61 | ) , 62 | data = d 63 | ) 64 | # mean values show strong correlation of fat (positive) and lactose (negative) 65 | # with kilocalories. 66 | precis(m6.3) 67 | precis(m6.4) 68 | # Now we build a multivariate regression model for kilocalories using fat and lactose together. 
69 | m6.5 <- quap( 70 | alist( 71 | K ~ dnorm( mu , sigma ) , 72 | mu <- a + bF * F + bL * L, 73 | a ~ dnorm( 0 , 0.2 ) , 74 | bF ~ dnorm( 0 , 0.5 ) , 75 | bL ~ dnorm( 0 , 0.5 ) , 76 | sigma ~ dexp(1) 77 | ) , 78 | data = d 79 | ) 80 | # mean values of the two parameter are now smaller that would imply a weaker contribution to energy 81 | # and also less precise as the standard deviations are bigger 82 | precis(m6.5) 83 | pairs(~ kcal.per.g + perc.fat + perc.lactose, data = d, col = rangi2) # plot the correlations between each pair of variables 84 | cor(d$perc.fat, d$perc.lactose) # computes the correlation between fat and lactose 85 | 86 | 87 | -------------------------------------------------------------------------------- /julia/learn_julia.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "9b24ffbf-c78d-4d10-b2c6-78738edd946d", 6 | "metadata": {}, 7 | "source": [ 8 | "# Learning Julia\n", 9 | "A short introduction to Julia following the book [Think Julia: How to Think Like a Computer Scientist](https://benlauwens.github.io/ThinkJulia.jl/latest/book.html#chap06), by Ben Lauwens and Allen Downey.\r\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "id": "ab83fa29-1392-4e7f-b788-2bd44c50b96b", 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "using Pkg" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "id": "429a410f-195e-48dd-9cda-329b966add5a", 25 | "metadata": {}, 26 | "source": [ 27 | "## Conditionals" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 17, 33 | "id": "80961d42-be0d-409c-a114-aac2eba71315", 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "x is positive" 41 | ] 42 | } 43 | ], 44 | "source": [ 45 | "x = 9\n", 46 | "if x > 0\n", 47 | " print(\"x is positive\")\n", 48 | "end" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 33, 54 | "id": "8768cea0-01a4-4608-a36e-bc57ffece643", 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "a" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "choice = 'a' # this is a Char\n", 67 | "if choice == 'a'\n", 68 | " print(\"a\")\n", 69 | "elseif choice == 'b'\n", 70 | " print(\"b\")\n", 71 | "elseif choice == 'c'\n", 72 | " print(\"c\")\n", 73 | "end" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "id": "0757b71f-3774-4ee2-b61b-8e7ea11eff66", 79 | "metadata": {}, 80 | "source": [ 81 | "## Functions" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 19, 87 | "id": "48b12e47-ed05-451e-a359-b4706fb5dd25", 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "square (generic function with 1 method)" 94 | ] 95 | }, 96 | "execution_count": 19, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "function square(a)\n", 103 | " # Computes the square of the argument\n", 104 | " b = a * a\n", 105 | " return a, b\n", 106 | "end " 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 23, 112 | "id": "f822833d-f14b-4da7-a8cb-d2ae8cfc4e27", 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/plain": [ 118 | "(4, 16)" 119 | ] 120 | }, 121 | "execution_count": 23, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | 
"square(4)" 128 | ] 129 | } 130 | ], 131 | "metadata": { 132 | "kernelspec": { 133 | "display_name": "Julia 1.5.1", 134 | "language": "julia", 135 | "name": "julia-1.5" 136 | }, 137 | "language_info": { 138 | "file_extension": ".jl", 139 | "mimetype": "application/julia", 140 | "name": "julia", 141 | "version": "1.5.1" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 5 146 | } 147 | -------------------------------------------------------------------------------- /r/rethinking/binomial_distribution.R: -------------------------------------------------------------------------------- 1 | # Binomial distribution 2 | # Estimation of the bias of a coin. 3 | # We assume that a coin can be biased so that the probability 4 | # theta that after a toss it shows head or tail 5 | # may not be 0.5. We assume at the beginning that theta can have any 6 | # value between 0 and 1 but with different probabilities. We have some 7 | # sound even if not complete information about the plausible values of 8 | # theta. In our example we assume a triangular prior p(theta). We use 9 | # the binomial distribution as the likelihood. Then we set the sample 10 | # size n and we set the number of successes k. Finally, we compute the 11 | # posterior distribution as the product of the prior and the likelihood 12 | # distributions. 13 | 14 | # Example 1 from Kruschke ch.5 par. 5.3 15 | # range of values of the parameter 16 | theta <- seq(from = 0, to = 1, by = 0.1) 17 | 18 | # define the prior distribution for each value of theta according to our 19 | # knowledge before seeing the data. 20 | p1 <- 0.4 * theta[1:6] 21 | p2 <- 0.4 - 0.4 * theta[7:11] 22 | prior <- c(p1,p2) 23 | 24 | plot(theta, prior, type = "h", col = "skyblue") 25 | 26 | # sample the likelihood at each value of the parameter theta 27 | # for one toss. The binomial distribution used as the likelihood 28 | # is also called Bernoulli distribution when the sample size n = 1. 29 | # The way in which we extract the sample from the likelihood distribution 30 | # is called grid approximation because the elements of the sample are taken 31 | # from one data point and a set of equally spaced values of theta. This 32 | # approximation works because we are dealing with only one parameter and 33 | # values in a limited interval. Other approximation are quadratic and MCMC. 34 | n = 1 # sample size 35 | k = 1 # number success events out the sample 36 | likelihood <-dbinom(k, size = n, prob = theta) 37 | plot(theta, likelihood, ylab = "likelihood p(x | theta)", type = "h", col = "skyblue") 38 | 39 | # compute the marginal likelihood p(D) 40 | marginal <- sum(likelihood * prior) 41 | 42 | # compute the posterior distribution for theta using the Bayes rule 43 | posterior <- likelihood * prior / marginal 44 | 45 | # compute the posterior mode (value with most occurrences) 46 | mode_posterior <- theta[which.max(posterior)] 47 | 48 | plot(theta, posterior, ylab = "posterior p(theta | x)", type = "h", col = "skyblue") 49 | 50 | # Example 2 from Kruschke ch.5 par. 5.3.1 51 | # Influence of sample size. 
52 | theta <- s <- seq(from = 0, to = 1, by = 0.001) 53 | # define the prior distribution for each value of theta 54 | p1 <- 0.4 * theta[1:500] 55 | p2 <- 0.4 - 0.4 * theta[501:1001] 56 | prior <- c(p1,p2) 57 | plot(theta, prior, ylab = "prior_1000", type = "h", col = "skyblue") 58 | 59 | # compute likelihood at each value of the parameter theta 60 | n = 40 # sample size 61 | k = 10 # number success events out the sample 62 | likelihood <-dbinom(k, size = n, prob = theta) 63 | 64 | # compute the likelihood mode (value with most occurrences) 65 | mode_likelihood <- theta[which.max(likelihood)] 66 | 67 | plot(theta, likelihood, ylab = "likelihood_1000 p(x | theta)", type = "h", col = "skyblue") 68 | text( .5 , 0.1 , paste("mode =", mode_likelihood)) 69 | 70 | # compute the marginal likelihood p(D) 71 | marginal <- sum(likelihood * prior) 72 | 73 | # compute the posterior distribution for theta 74 | posterior <- likelihood * prior / marginal 75 | 76 | # compute the posterior mode (value with most occurrences) 77 | mode_posterior <- theta[which.max(posterior)] 78 | 79 | plot(theta, posterior, ylab = "posterior_1000 p(theta | x)", type = "h", col = "skyblue") 80 | text( .7 , 0.0020 , paste("mode =", mode_posterior)) 81 | 82 | -------------------------------------------------------------------------------- /r/rethinking/ch8_continuous_interactions.R: -------------------------------------------------------------------------------- 1 | # R code 8.19 rethinking book 2 | # We create two models of the bloom of flowers that depends 3 | # on water and shade. The first model is a linear combination 4 | # of the two, the second model contains an interaction term 5 | # between water and shade. 6 | library(rethinking) 7 | data("tulips") 8 | d <- tulips 9 | str(d) 10 | # standardize the variables 11 | d$blooms_std <- d$blooms / max(d$blooms) 12 | d$water_cent <- d$water - mean(d$water) 13 | d$shade_cent <- d$shade - mean(d$shade) 14 | 15 | # after some reasoning about the plausible values of the parameters' 16 | # prior distribution, we have the 1st model (without interaction) 17 | m8.6 <- quap( 18 | alist( 19 | blooms_std ~ dnorm( mu , sigma ) , 20 | mu <- a + bw * water_cent + bs * shade_cent, 21 | a ~ dnorm(0.5, 0.25) , 22 | bw ~ dnorm(0, 0.25), 23 | bs ~ dnorm(0, 0.25), 24 | sigma ~ dexp(1) 25 | ) , 26 | data = d 27 | ) 28 | 29 | # after some additional thinking about the parameter of the interaction 30 | # term we have the 2nd model 31 | m8.7 <- quap( 32 | alist( 33 | blooms_std ~ dnorm( mu , sigma ) , 34 | mu <- a + bw * water_cent + bs * shade_cent + bws * water_cent * shade_cent, 35 | a ~ dnorm(0.5, 0.25) , 36 | bw ~ dnorm(0, 0.25), 37 | bs ~ dnorm(0, 0.25), 38 | bws ~ dnorm(0, 0.25), 39 | sigma ~ dexp(1) 40 | ) , 41 | data = d 42 | ) 43 | 44 | # Let's plot a triptych ("trittico" in Italian) of the posterior prediction for blooms for the 45 | # 1st model (without interaction), The plot will show the relation between water and blooms for 46 | # three different values of shade. 
47 | par(mfrow = c(1,3)) # 3 plots in a row 48 | for (s in -1:1) { 49 | idx <- which(d$shade_cent == s) 50 | plot(d$water_cent[idx], d$blooms_std[idx], xlim = c(-1,1), ylim = c(0,1), 51 | xlab = "water", ylab = "blooms", pch = 16, col = rangi2) 52 | title(paste("shade ", s)) 53 | mu <- link(m8.6, data = data.frame(shade_cent = s, water_cent = -1:1)) 54 | for (i in 1:20) 55 | lines(-1:1, mu[i, ], col = col.alpha("black", 0.3)) 56 | } 57 | 58 | # Now we plot a triptych of the posterior prediction for blooms for the 2nd model (with interaction). 59 | # Again, the plot will show the relation between water and blooms for three different values of shade. 60 | par(mfrow = c(1,3)) # 3 plots in a row 61 | for (s in -1:1) { 62 | idx <- which(d$shade_cent == s) 63 | plot(d$water_cent[idx], d$blooms_std[idx], xlim = c(-1,1), ylim = c(0,1), 64 | xlab = "water", ylab = "blooms", pch = 16, col = rangi2) 65 | title(paste("shade ", s)) 66 | mu <- link(m8.7, data = data.frame(shade_cent = s, water_cent = -1:1)) 67 | for (i in 1:20) 68 | lines(-1:1, mu[i, ], col = col.alpha("black", 0.3)) 69 | } 70 | 71 | # Now we plot the prior predictive simulations 72 | set.seed(7) 73 | prior <- extract.prior(m8.6) 74 | par(mfrow = c(1,3)) # 3 plots in a row 75 | for (s in -1:1) { 76 | idx <- which(d$shade_cent == s) 77 | plot(d$water_cent[idx], d$blooms_std[idx], xlim = c(-1,1), ylim = c(0,1), 78 | xlab = "water", ylab = "blooms", pch = 16, col = rangi2) 79 | title(paste("shade ", s)) 80 | mu <- link(m8.6, data = data.frame(shade_cent = s, water_cent = -1:1), post = prior) 81 | for (i in 1:20) 82 | lines(-1:1, mu[i, ], col = col.alpha("black", 0.3)) 83 | } 84 | 85 | 86 | prior <- extract.prior(m8.7) 87 | par(mfrow = c(1,3)) # 3 plots in a row 88 | for (s in -1:1) { 89 | idx <- which(d$shade_cent == s) 90 | plot(d$water_cent[idx], d$blooms_std[idx], xlim = c(-1,1), ylim = c(0,1), 91 | xlab = "water", ylab = "blooms", pch = 16, col = rangi2) 92 | title(paste("shade ", s)) 93 | mu <- link(m8.7, data = data.frame(shade_cent = s, water_cent = -1:1), post = prior) 94 | for (i in 1:20) 95 | lines(-1:1, mu[i, ], col = col.alpha("black", 0.3)) 96 | } -------------------------------------------------------------------------------- /r/rethinking/linear_prediction.R: -------------------------------------------------------------------------------- 1 | # R code 4.37 rethinking book 2 | library(rethinking) 3 | data("Howell1") 4 | d <- Howell1 5 | d2 <- d[d$age >= 18,] 6 | #plot(d2$height ~ d2$weight) # uncomment to plot 7 | 8 | set.seed(2971) 9 | N <- 100 10 | a <- rnorm(N, 178, 20) 11 | #b <- rnorm(N, 0, 10) 12 | 13 | b <- rlnorm(1e4, 0, 1) 14 | dens(b, xlim = c(0,5), adj = 0.1) # plot the log-normal distribution 15 | 16 | # plot N lines with random slopes and intercepts 17 | b <- rlnorm(N, 0, 1) # limits the slope to positive values (log-normal distribution) 18 | plot( NULL , xlim=range(d2$weight) , ylim=c(-100,400) , xlab="weight" , ylab="height" ) 19 | abline( h=0 , lty=2) # horizontal line 20 | abline( h=272 , lty=1 , lwd=0.5 ) # horizontal line, height of the tallest known person 21 | mtext( "b ~ dnorm(0,10)" ) 22 | xbar <- mean(d2$weight) 23 | for ( i in 1:N ) curve( a[i] + b[i]*(x - xbar) , 24 | from=min(d2$weight) , to=max(d2$weight) , add=TRUE , 25 | col=col.alpha("black",0.2) ) 26 | 27 | # load data again, since it's a long way back 28 | library(rethinking) 29 | data(Howell1) 30 | d <- Howell1 31 | d2 <- d[ d$age >= 18 , ] 32 | # define the average weight, x-bar 33 | xbar <- mean(d2$weight) 34 | # fit model 35 | m4.3 <- 
quap( 36 | alist( 37 | height ~ dnorm( mu , sigma ) , 38 | mu <- a + b*( weight - xbar ) , 39 | a ~ dnorm( 178 , 20 ) , 40 | b ~ dlnorm( 0 , 1 ) , 41 | sigma ~ dunif( 0 , 50 ) 42 | ) , 43 | data=d2 ) 44 | 45 | # plots the line using mean values for alpha (a) and beta (b) 46 | plot(height ~ weight, data = d2, col = rangi2) 47 | post <- extract.samples(m4.3) 48 | a_map <- mean(post$a) 49 | b_map <- mean(post$b) 50 | curve(a_map + b_map*(x - xbar), add = TRUE) 51 | 52 | # we look at the marginal posterior distributions of the parameters 53 | precis(m4.3) # mean and standard deviations 54 | round(vcov(m4.3), 3) # covariances 55 | #pairs(m4.3) # 56 | # samples from the posterior distribution (height) 57 | post <- extract.samples(m4.3) 58 | 59 | N <- 352 60 | dN <- d2[1:N, ] 61 | mN <- quap( 62 | alist( 63 | height ~ dnorm(mu, sigma), 64 | mu <- a + b*(weight - mean(weight)), 65 | a ~ dnorm(178, 20), 66 | b ~ dlnorm(0, 1), 67 | sigma ~ dunif(0, 50) 68 | ), data = dN 69 | ) 70 | # extract 20 samples from posterior 71 | post <- extract.samples(mN, n = 20) 72 | # display raw data and sample size 73 | plot(dN$weight, dN$height, 74 | xlim = range(d2$weight), ylim = range(d2$height), 75 | col = rangi2, xlab = "weight", ylab = "height") 76 | mtext(concat("N = ", N)) 77 | 78 | # plot the lines from the sample, with transparency 79 | for (i in 1:20) 80 | curve(post$a[i] + post$b[i]*(x - mean(dN$weight)), 81 | col = col.alpha("black", 0.3), add = TRUE) 82 | 83 | # plotting regression intervals and contours 84 | # first for weight = 50 85 | post <- extract.samples(m4.3) 86 | mu_at_50 <- post$a + post$b * (50 - xbar) 87 | dens(mu_at_50, col = rangi2, lwd = 2, xlab = "mu|weight=50") 88 | PI(mu_at_50, prob = 0.89) 89 | 90 | mu <- link(m4.3) 91 | str(mu) 92 | 93 | # we want to compute the distribution of the height (mu) for each 94 | # value of the weight 95 | # define sequence of weights to compute predictions for 96 | # these values will be on the horizontal axis 97 | weight.seq <- seq(from = 25, to = 70, by = 1) # 46 weight values 98 | # use link to compute mu 99 | # for each sample from posterior and for each weight in weight.seq 100 | mu <- link(m4.3, data = data.frame(weight = weight.seq)) 101 | #str(mu) 102 | plot(height ~ weight, d2, type = "n") 103 | for (i in 1:100) 104 | points(weight.seq, mu[i,], pch = 16, col = col.alpha(rangi2, 0.1)) 105 | 106 | # summarize the distribution for each wight 107 | mu.mean <- apply(mu, 2, mean) 108 | mu.PI <- apply(mu, 2, PI, prob = 0.89) 109 | mu.HPDI <- apply(mu, 2, HPDI, prob = 0.89) 110 | 111 | # plot the summaries on top of the data 112 | # fading out points to make line and interval more visible 113 | plot(height ~ weight, data = d2, col = col.alpha(rangi2, 0.5)) 114 | # plot the MAP line, aka the mean mu for each weight 115 | lines(weight.seq, mu.mean) 116 | # plot a shaded region for 89 % PI 117 | shade(mu.PI, weight.seq) 118 | 119 | # Prediction intervals R code 4.60 120 | sim.height <- sim(m4.3, data = list(weight = weight.seq)) 121 | str(sim.height) 122 | height.PI <- apply(sim.height, 2, PI, prob = 0.89) 123 | # plot raw data 124 | plot(height ~ weight, d2, col = col.alpha(rangi2, 0.5)) 125 | # draw MAP line 126 | lines(weight.seq, mu.mean) 127 | # draw HPDI region for line 128 | shade(mu.HPDI, weight.seq) 129 | # draw PI region for simulated heights 130 | shade(height.PI, weight.seq) 131 | 132 | -------------------------------------------------------------------------------- /r/stat_learning/chapter1.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# An Introduction to Statistical Learning\n", 8 | "This is a collection of Jupyter notebooks from the course [Statistical Learning](https://courses.edx.org/courses/course-v1:StanfordOnline+STATSX0001+1T2020/course/) taught by Prof. Trevor Hastie and Prof. Robert Tibshirani, offered by StanfordOnline on the edX platform. The course is based on the book [An Introduction to Statistical Learning with Applications in R](http://faculty.marshall.usc.edu/gareth-james/ISL/code.html) by Gareth James et al. The notebooks are based on the Lab exercises at the end of each chapter in the book. The programming language is R. Each notebook begins with a short summary of the topics discussed in the online course and in the related chapters of the book, and then proceeds with the exercises. I always need more than one book to learn a subject; in this case my main additional textbooks have been \n", 9 | " \n", 10 | " - [McElreath - Statistical Rethinking, 2nd edition](https://github.com/rmcelreath/statrethinking_winter2019)\n", 11 | " - [Gelman et al. - Bayesian Data Analysis, 3rd edition](http://www.stat.columbia.edu/~gelman/book/)\n", 12 | " - [DeGroot et al. - Probability and Statistics, 4th edition](https://www.amazon.com/Probability-Statistics-4th-Morris-DeGroot/dp/0321500466)\n", 13 | "\n", 14 | "I wrote these notebooks because I think that writing helps in clarifying the subject discussed. I hope that my summaries of the chapters still make sense and will help me quickly recall the material learned. \n", 15 | "\n", 16 | "---\n", 17 | "\n", 18 | "The course is mostly about **supervised learning**, in which we have a data set of observations $x_i$ with labels $y_i$ that we can use to fit our model \n", 19 | "\n", 20 | "$$y = f(x)$$\n", 21 | "\n", 22 | "in order to figure out what the label of a new observation might be. The label, or response, can be a numerical value, as in **regression problems**, or a category, as in **classification problems**. The last chapter of the book provides an introduction to two **unsupervised learning** methods, in which we have observations but no labels and our goal is to see whether there is some structure or pattern in the data. As can be seen from the chapters, half of the book is devoted to **linear models**, in which we represent the relationship between the $p$ predictors of the $i$th observation and the response by a linear function\n", 23 | "\n", 24 | "$$y_i = \\beta_0 + \\beta_1 x_{i1} + \\beta_2 x_{i2} + ... + \\beta_p x_{ip}$$\n", 25 | "\n", 26 | "and our goal is to use the available observations to learn the values of the parameters $\\beta$. The 2nd part of the book presents different ways to overcome the limits of linear models by adding higher order terms to the linear functions (polynomials and splines). The goal with models that include higher order terms will be to learn their parameters in addition to the parameters of the linear terms \n", 27 | "\n", 28 | "$$y_i = \\beta_0 + \\beta_1 x_i + \\beta_2 x_i^2 + \\beta_3 x_i^3 + ... + \\beta_d x_i^d$$\n", 29 | "\n", 30 | "A different approach is to use non-parametric methods that find rules or similarities in the data without using explicit mathematical models, such as **decision trees**, **splines** or **K-Means**. Support vector machines offer a geometrical approach mostly used in classification tasks. 
A theme that cuts across all the techniques discussed in the course and in the book is **overfitting**, when our model performs well on the training data but much worse on new observations, and how to overcome it.\n", 31 | "\n", 32 | "---\n", 33 | "\n", 34 | "1. Introduction\n", 35 | "2. [Statistical Learning](chapter2.ipynb)\n", 36 | "3. [Linear Regression](chapter3.ipynb)\n", 37 | "4. [Classification](chapter4.ipynb)\n", 38 | "5. [Resampling Methods](chapter5.ipynb)\n", 39 | "6. [Linear Model Selection and Regularization](chapter6.ipynb)\n", 40 | "7. [Moving Beyond Linearity](chapter7.ipynb)\n", 41 | "8. [Tree-Based Methods](chapter8.ipynb)\n", 42 | "9. [Support Vector Machines](chapter9.ipynb)\n", 43 | "10. [Unsupervised Learning](chapter10.ipynb)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [] 52 | } 53 | ], 54 | "metadata": { 55 | "kernelspec": { 56 | "display_name": "R", 57 | "language": "R", 58 | "name": "ir" 59 | }, 60 | "language_info": { 61 | "codemirror_mode": "r", 62 | "file_extension": ".r", 63 | "mimetype": "text/x-r-source", 64 | "name": "R", 65 | "pygments_lexer": "r", 66 | "version": "3.6.1" 67 | } 68 | }, 69 | "nbformat": 4, 70 | "nbformat_minor": 4 71 | } 72 | -------------------------------------------------------------------------------- /r/rethinking/overfitting.R: -------------------------------------------------------------------------------- 1 | # R code 7.1 rethinking book 2 | sppnames <- c("afarensis", "africanus", "habilis", "boisei", 3 | "rudolfensis", "ergaster", "sapiens") 4 | brainvolcc <- c(438, 452, 612, 521, 752, 871, 1350) 5 | masskg <- c(37.0, 35.5, 34.5, 41.5, 55.5, 61.0, 53.5) 6 | d <- data.frame(species = sppnames, brain = brainvolcc, mass = masskg) 7 | # We start modeling the relationship between brain volumes and body mass 8 | # as a linear function. 9 | # We standardize the variables 10 | library(rethinking) 11 | d$mass_std <- (d$mass - mean(d$mass)) / sd(d$mass) 12 | d$brain_std <- d$brain / max(d$brain) 13 | m7.1 <- quap( 14 | alist( 15 | brain_std ~ dnorm( mu , exp(log_sigma) ) , 16 | mu <- a + b * mass_std , 17 | a ~ dnorm(0.5, 1) , 18 | b ~ dnorm(0, 10), 19 | log_sigma ~ dnorm(0, 1) 20 | ) , 21 | data = d 22 | ) 23 | 24 | # We check how close the model gets to the data. The frequentist idea 25 | # is that the model is good insofar as the variance of the residuals is 26 | # close to or smaller than the variance of the data itself. A residual is the 27 | # difference between an observation and the value predicted by the model. 28 | set.seed(12) 29 | s <- sim(m7.1) 30 | r <- apply(s, 2, mean) - d$brain_std 31 | resid_var <- var2(r) 32 | outcome_var <- var2(d$brain_std) 33 | 1 - resid_var / outcome_var # computes R^2 34 | 35 | R2_is_bad <- function(quap_fit) { 36 | s <- sim(quap_fit, refresh = 0) 37 | r <- apply(s, 2, mean) - d$brain_std # residuals 38 | 1 - var2(r) / var2(d$brain_std) # R^2: residual variance relative to the outcome (brain) variance 39 | } 40 | 41 | # We want to compare how different models fit the data. 
We build 42 | # them using polynomials 43 | m7.2 <- quap( 44 | alist( 45 | brain_std ~ dnorm( mu , exp(log_sigma) ) , 46 | mu <- a + b[1] * mass_std + b[2] * mass_std^2 , 47 | a ~ dnorm(0.5, 1) , 48 | b ~ dnorm(0, 10), 49 | log_sigma ~ dnorm(0, 1) 50 | ) , 51 | data = d, start = list(b = rep(0, 2)) 52 | ) 53 | m7.3 <- quap( 54 | alist( 55 | brain_std ~ dnorm( mu , exp(log_sigma) ) , 56 | mu <- a + b[1] * mass_std + b[2] * mass_std^2 + b[3] * mass_std^3, 57 | a ~ dnorm(0.5, 1) , 58 | b ~ dnorm(0, 10), 59 | log_sigma ~ dnorm(0, 1) 60 | ) , 61 | data = d, start = list(b = rep(0, 3)) 62 | ) 63 | m7.4 <- quap( 64 | alist( 65 | brain_std ~ dnorm( mu , exp(log_sigma) ) , 66 | mu <- a + b[1] * mass_std + b[2] * mass_std^2 + b[3] * mass_std^3 + b[4] * mass_std^4, 67 | a ~ dnorm(0.5, 1) , 68 | b ~ dnorm(0, 10), 69 | log_sigma ~ dnorm(0, 1) 70 | ) , 71 | data = d, start = list(b = rep(0, 4)) 72 | ) 73 | m7.5 <- quap( 74 | alist( 75 | brain_std ~ dnorm( mu , exp(log_sigma) ) , 76 | mu <- a + b[1] * mass_std + b[2] * mass_std^2 + b[3] * mass_std^3 + b[4] * mass_std^4 + b[5] * mass_std^5 , 77 | a ~ dnorm(0.5, 1) , 78 | b ~ dnorm(0, 10), 79 | log_sigma ~ dnorm(0, 1) 80 | ) , 81 | data = d, start = list(b = rep(0, 5)) 82 | ) 83 | m7.6 <- quap( 84 | alist( 85 | brain_std ~ dnorm( mu , 0.001 ) , 86 | mu <- a + b[1] * mass_std + b[2] * mass_std^2 + 87 | b[3] * mass_std^3 + b[4] * mass_std^4 + 88 | b[5] * mass_std^5 + b[6] * mass_std^6 , 89 | a ~ dnorm(0.5, 1) , 90 | b ~ dnorm(0, 10), 91 | log_sigma ~ dnorm(0, 1) 92 | ) , 93 | data = d, start = list(b = rep(0, 6)) 94 | ) 95 | # plot the models 96 | # linear model 97 | post <- extract.samples(m7.1) 98 | mass_seq <- seq(from = min(d$mass_std), to = max(d$mass_std), length.out = 100) 99 | l <- link(m7.1, data = list(mass_std = mass_seq)) 100 | mu <- apply(l, 2, mean) 101 | ci <- apply(l, 2, PI) 102 | plot(brain_std ~ mass_std, data = d) 103 | lines(mass_seq, mu) 104 | shade(ci, mass_seq) 105 | R2_is_bad(m7.1) 106 | # order 2 polynomial model 107 | post <- extract.samples(m7.2) 108 | l <- link(m7.2, data = list(mass_std = mass_seq)) 109 | mu <- apply(l, 2, mean) 110 | ci <- apply(l, 2, PI) 111 | plot(brain_std ~ mass_std, data = d) 112 | lines(mass_seq, mu) 113 | shade(ci, mass_seq) 114 | R2_is_bad(m7.2) 115 | # order 5 polynomial model 116 | post <- extract.samples(m7.5) 117 | l <- link(m7.5, data = list(mass_std = mass_seq)) 118 | mu <- apply(l, 2, mean) 119 | ci <- apply(l, 2, PI) 120 | plot(brain_std ~ mass_std, data = d) 121 | lines(mass_seq, mu) 122 | shade(ci, mass_seq) 123 | R2_is_bad(m7.5) 124 | # order 6 polynomial model 125 | post <- extract.samples(m7.6) 126 | l <- link(m7.6, data = list(mass_std = mass_seq)) 127 | mu <- apply(l, 2, mean) 128 | ci <- apply(l, 2, PI) 129 | plot(brain_std ~ mass_std, data = d) 130 | lines(mass_seq, mu) 131 | shade(ci, mass_seq) 132 | R2_is_bad(m7.6) 133 | # ordinary least squares (uses the R lm() function) 134 | m7.1_OLS <- lm(brain_std ~ mass_std, data = d) 135 | post <- extract.samples(m7.1_OLS) # how to plot this ? 
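# One possible answer to the question above (a sketch using only base R):
# for an ordinary least squares fit the extracted samples are not needed to
# draw the fitted line, because lm() already returns the intercept and slope.
plot(brain_std ~ mass_std, data = d)
abline(a = coef(m7.1_OLS)[1], b = coef(m7.1_OLS)[2], lwd = 2)
# summary(m7.1_OLS) also reports R^2, which can be compared with R2_is_bad(m7.1).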
136 | 137 | # Measure distance from target model 138 | set.seed(1) 139 | sapply(list(m7.1, m7.2, m7.3, m7.4, m7.5, m7.6), function(m) sum(lppd(m))) -------------------------------------------------------------------------------- /r/rethinking/spurious_association.R: -------------------------------------------------------------------------------- 1 | # R code 5.1 rethinking book 2 | # load data 3 | library(rethinking) 4 | data(WaffleDivorce) 5 | d <- WaffleDivorce 6 | 7 | # standardize variables 8 | d$A <- scale(d$MedianAgeMarriage) 9 | d$D <- scale(d$Divorce) 10 | 11 | # 1) builds the model divorce rate D - age of marriage A 12 | m5.1 <- quap( 13 | alist( 14 | D ~ dnorm( mu , sigma ) , 15 | mu <- a + bA * A , 16 | a ~ dnorm( 0 , 0.2 ) , 17 | bA ~ dnorm( 0 , 0.5 ) , 18 | sigma ~ dexp(1) 19 | ) , 20 | data = d 21 | ) 22 | 23 | # plot the priors 24 | set.seed(10) 25 | prior <- extract.prior(m5.1) 26 | mu <- link(m5.1, post = prior, data = list(A = c(-2,2))) 27 | plot(NULL, xlim = c(-2,2), ylim = c(-2,2)) 28 | for (i in 1:50) 29 | lines(c(-2,2), mu[i, ], col = col.alpha("black", 0.4)) 30 | 31 | # compute percentile interval of mean 32 | A_seq <- seq(from = -3, to = 3.2, length.out = 30) 33 | mu <- link(m5.1, data = list(A = A_seq)) 34 | mu.mean <- apply(mu, 2, mean) 35 | mu.PI <- apply(mu, 2, PI) 36 | 37 | # plot the posterior predictions 38 | plot(D ~ A, data = d, col = rangi2) 39 | lines(A_seq, mu.mean, lwd = 2) 40 | shade(mu.PI, A_seq) 41 | 42 | precis(m5.1) 43 | 44 | # 2) builds the model divorce rate D - marriage rate M 45 | # standardize variable marriage rate 46 | d$M <- scale(d$Marriage) 47 | m5.2 <- quap( 48 | alist( 49 | D ~ dnorm( mu , sigma ) , 50 | mu <- a + bM * M , 51 | a ~ dnorm( 0 , 0.2 ) , 52 | bM ~ dnorm( 0 , 0.5 ) , 53 | sigma ~ dexp(1) 54 | ) , 55 | data = d 56 | ) 57 | 58 | # compute percentile interval of mean 59 | M_seq <- seq(from = -3, to = 3.2, length.out = 30) 60 | mu <- link(m5.2, data = list(M = M_seq)) 61 | mu.mean <- apply(mu, 2, mean) 62 | mu.PI <- apply(mu, 2, PI) 63 | 64 | # plot the posterior predictions 65 | plot(D ~ M, data = d, col = rangi2) 66 | lines(M_seq, mu.mean, lwd = 2) 67 | shade(mu.PI, M_seq) 68 | 69 | # draw a directed acyclic graph (DAG) that represents 70 | # a causal relationship between the variables 71 | #install.packages('dagitty') 72 | library(dagitty) 73 | dag5.1 <- dagitty("dag { 74 | A -> D 75 | A -> M 76 | M -> D 77 | }" 78 | ) 79 | coordinates(dag5.1) <- list(x = c(A = 0, D = 1, M = 2), y = c(A = 0, D = 1, M = 0)) 80 | plot(dag5.1) 81 | 82 | # 3) multiple regression model 83 | m5.3 <- quap( 84 | alist( 85 | D ~ dnorm( mu , sigma ) , 86 | mu <- a + bM * M + bA * A, 87 | a ~ dnorm( 0 , 0.2 ) , 88 | bA ~ dnorm( 0 , 0.5 ) , 89 | bM ~ dnorm( 0 , 0.5 ) , 90 | sigma ~ dexp(1) 91 | ) , 92 | data = d 93 | ) 94 | precis(m5.3) 95 | # plot the posterior distributions of the two weights (parameters) 96 | # for age of marriage (bA) and marriage rate (bM) to see the changes 97 | # from bivariate models (m5.1, m5.2) to a multivariate model (m5.3) 98 | # compute percentile interval of mean 99 | #plot(coeftab(m5.1, m5.2, m5.3), par = c("bA", "bM")) # this line of code fails. 100 | 101 | # Predictor residual plots. 
102 | # We check the relationship between age of 103 | # marriage (A) and marriage rate (M), that is A -> M 104 | m5.4 <- quap( 105 | alist( 106 | M ~ dnorm( mu , sigma ) , 107 | mu <- a + bAM * A, 108 | a ~ dnorm( 0 , 0.2 ) , 109 | bAM ~ dnorm( 0 , 0.5 ) , 110 | sigma ~ dexp(1) 111 | ) , 112 | data = d 113 | ) 114 | mu <- link(m5.4) 115 | mu.mean <- apply(mu, 2, mean) 116 | mu_resid <- d$M - mu.mean 117 | # plot the residuals 118 | A_seq <- seq(from = -3, to = 3, length.out = 50) 119 | plot(M ~ A, data = d, col = rangi2) 120 | lines(d$A, mu.mean, lwd = 2) 121 | #for (i in 1:50) 122 | # lines(d$A[i], mu_resid[i, ], col = col.alpha("black", 0.4)) 123 | 124 | # Counterfactual plots 125 | # prepare new counterfactual data 126 | M_seq <- seq(from = -2, to = 3, length.out = 30) 127 | pred_data <- data.frame(M = M_seq, A = 0) 128 | # compute counterfactual mean divorce (mu) 129 | mu <- link(m5.3, data = pred_data) 130 | mu_mean <- apply(mu, 2, mean) 131 | mu_PI <- apply(mu, 2, PI) 132 | # simulate counterfactual divorce outcomes 133 | D_sim <- sim(m5.3, data = pred_data, n = 1e4) 134 | D_PI <- apply(D_sim, 2, PI) 135 | # display predictions, hiding raw data with type = "n" 136 | plot(D ~ M, data = d, type = "n") 137 | mtext("Median age marriage (std) = 0") 138 | lines(M_seq, mu_mean) 139 | shade(mu_PI, M_seq) 140 | shade(D_PI, M_seq) 141 | 142 | 143 | # Posterior prediction plots 144 | # It plots the predictions against the observations of the dependent variable (divorce. 145 | # call link without specifying new data so it uses original data 146 | mu <- link(m5.3) 147 | # summarize samples across cases 148 | mu_mean <- apply(mu, 2, mean) 149 | mu_PI <- apply(mu, 2, PI) 150 | # simulate observations, again no new data, uses original data 151 | D_sim <- sim(m5.3, n = 1e4) 152 | D_PI <- apply(D_sim, 2, PI) 153 | # plot the predictions against the observations 154 | plot(mu_mean ~ d$D, col = rangi2, ylim = range(mu_PI), 155 | xlab = "Observed divorce", ylab = "Predictive divorce") 156 | abline(a = 0, b = 1, lty = 2) 157 | for (i in 1:nrow(d)) 158 | lines(rep(d$D[i], 2), mu_PI[, i], col = rangi2) 159 | # show some selected points (select by clicking on the points in the window) 160 | identify(x = d$D, y = mu_mean, labels = d$Loc) 161 | 162 | # Simulating spurious association 163 | N <- 100 164 | x_real <- rnorm(N) 165 | x_spur <- rnorm(N, x_real) 166 | y <- rnorm(N, x_real) 167 | d <- data.frame(y, x_real, x_spur) 168 | pairs(d) 169 | 170 | -------------------------------------------------------------------------------- /python/atn/capacitors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Capacitors" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "data": { 17 | "text/plain": [ 18 | "'1.4.3'" 19 | ] 20 | }, 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "output_type": "execute_result" 24 | } 25 | ], 26 | "source": [ 27 | "import pandas as pd\n", 28 | "import numpy as np\n", 29 | "from datetime import datetime\n", 30 | "from pandas import Series, DataFrame\n", 31 | "%matplotlib inline\n", 32 | "import matplotlib.pyplot as plt\n", 33 | "plt.style.use('seaborn-whitegrid')\n", 34 | "pd.__version__" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "### Read the data" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 4, 
47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "name": "stderr", 51 | "output_type": "stream", 52 | "text": [ 53 | "C:\\Users\\Luigi\\AppData\\Local\\Temp\\ipykernel_18948\\1313487762.py:1: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.\n", 54 | " sales = pd.read_csv('datasets/PurchaseData_20180319.csv')\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "sales = pd.read_csv('datasets/PurchaseData_20180319.csv')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "# Capacitors Data Analysis\n", 67 | "In order to predict the price (POP_UnitPrice) and the delivery time (PO_Date) of a capacitor we have received from ATN, with the data, a list of features that should be relevant for the task at hand and a set of rules. The relevant features are a subset of the fields in the data set.\n", 68 | "\n", 69 | "1. Component number (ComponentNumber_MAT_Flight)\n", 70 | "2. Specification name (SpecificationName)\n", 71 | "3. Family path (FamilyPath_Flight)\n", 72 | "4. Style (Style_Flight)\n", 73 | "5. Quality level (QLevel_Flight)\n", 74 | "6. Package class (PACKAGECLASS)\n", 75 | "7. Package (PACKAGE)\n", 76 | "8. Capacitance (CAPACITANCE_N)\n", 77 | "9. Capacitance case (CAPE_CASE)\n", 78 | "10. Tolerance (TOLERANCE_N)\n", 79 | "11. DC rated voltage (DC_RATED_VOLTAGE_N)\n", 80 | "12. Quality Value Name (QualityValueName)\n", 81 | "13. Manufacturer (MnfrDoeeetName)\n", 82 | "14. Quantity (POP_Qty)\n", 83 | "15. Date of purchase (PO_Date)\n", 84 | "16. Unit price (POP_UnitPrice)\n", 85 | "17. Date of delivery (POP_DeliveryDate)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## Data preparation for capacitors\n", 93 | "Before implementing the algorithm to predict unit price and delivery time for a capacitor, we have to extract the records from the sale orders data set and apply the following transformation\n", 94 | "\n", 95 | "1. Select the records about capacitors (family root -> capacitors)\n", 96 | "2. Extract the most specific family of the component from the hierarchy (family path)\n", 97 | "3. Filter out the records that are about services (remove price label -> material unit price or pop_quantity_unit -> ST)\n", 98 | "4. Transform all the prices in euro\n", 99 | "5. Update the all the unit prices applying an increase of 5 % per year (using the compund interest formula) \n", 100 | "\n", 101 | "After the data is prepared we can implement the algorithms \n", 102 | "\n", 103 | "1. Price prediction\n", 104 | "\n", 105 | "2. 
Delivery time prediction" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "#### Use only records without charges" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 5, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "18508" 124 | ] 125 | }, 126 | "execution_count": 5, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "sales = sales[sales['PRICE LABEL'] == 'MATERIAL UNIT PRICE']\n", 133 | "sales.index.size" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "#### Select the records about capacitors" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "Number of records for resistor: 8027\n" 153 | ] 154 | } 155 | ], 156 | "source": [ 157 | "resistor_records = sales[sales['FamilyRoot'] == 'Resistors']\n", 158 | "num_resistor_records = resistor_records.index.size\n", 159 | "print(\"Number of records for resistor: \" + str(num_resistor_records))" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3 (ipykernel)", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.9.13" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 2 191 | } 192 | -------------------------------------------------------------------------------- /r/rethinking/interaction_model.R: -------------------------------------------------------------------------------- 1 | # R code 8.1 rethinking book 2 | # We want to model the relationship between a country's GDP and 3 | # the ruggedness of its territory. 4 | library(rethinking) 5 | data(rugged) 6 | d <- rugged 7 | 8 | # make log version of the outcome 9 | d$log_gdp <-- log(d$rgdppc_2000) 10 | 11 | # extract countries with GDP data 12 | dd <- d[complete.cases(d$rgdppc_2000), ] 13 | 14 | # rescale variables 15 | dd$log_gdp_std <- dd$log_gdp / mean(dd$log_gdp) 16 | dd$rugged_std <- dd$rugged / max(dd$rugged) 17 | 18 | # split countries into Africa and not-Africa 19 | d.A1 <- dd[dd$cont_africa == 1, ] # Africa 20 | d.A0 <- dd[dd$cont_africa == 0, ] # not Africa 21 | 22 | # we model the relationship between GDP and ruggedness 23 | # for Africa 24 | m8.1 <- quap( 25 | alist( 26 | log_gdp_std ~ dnorm( mu , sigma ) , 27 | mu <- a + b * (rugged_std - 0.215), 28 | a ~ dnorm(1, 0.1) , 29 | b ~ dnorm(0, 0.3), 30 | sigma ~ dexp(1) 31 | ) , 32 | data = d.A1 33 | ) 34 | 35 | # Let#s look at the priors to see if they are plausible. 36 | # If not we change their standard deviation. 
37 | set.seed(7) 38 | prior <- extract.prior(m8.1) 39 | # set up the plot dimensions 40 | plot(NULL, xlim = c(0,1), ylim = c(0.5, 1.5), 41 | xlab = "ruggedness", ylab = "log GDP") 42 | abline(h = min(dd$log_gdp_std), lty = 2) 43 | abline(h = max(dd$log_gdp_std), lty = 2) 44 | # draw 50 lines from the prior 45 | rugged_seq <- seq(from = -0.1, to = 1.1, length.out = 30) 46 | mu <- link(m8.1, post = prior, data = data.frame(rugged_std = rugged_seq)) 47 | for (i in 1:50) 48 | lines(rugged_seq, mu[i, ], col = col.alpha("black", 0.3)) 49 | 50 | # we model the relationship between GDP and ruggedness 51 | # for not-Africa 52 | m8.2 <- quap( 53 | alist( 54 | log_gdp_std ~ dnorm( mu , sigma ) , 55 | mu <- a + b * (rugged_std - 0.215), 56 | a ~ dnorm(1, 0.1) , 57 | b ~ dnorm(0, 0.3), 58 | sigma ~ dexp(1) 59 | ) , 60 | data = d.A0 61 | ) 62 | 63 | # we look at the posteriors for the parameters in both models 64 | # and we see that the relationship (parameter b) is different 65 | # positive for Africa and negative for not-Africa 66 | precis(m8.1) # Africa 67 | precis(m8.2) # not-Africa 68 | 69 | # Now we want to see how to reach the same result using only 70 | # one model and one single data set without splitting it. 71 | # We start building a model like the previous ones but with 72 | # the full dataset. Then we build another model that will use 73 | # different intercets for African and not-African countries within 74 | # the same model. We will then compare these two models. 75 | m8.3 <- quap( 76 | alist( 77 | log_gdp_std ~ dnorm( mu , sigma ) , 78 | mu <- a + b * (rugged_std - 0.215), 79 | a ~ dnorm(1, 0.1) , 80 | b ~ dnorm(0, 0.3), 81 | sigma ~ dexp(1) 82 | ) , 83 | data = dd 84 | ) 85 | # we use an indexed intercpt instead of a dummy variable to distinguish 86 | # between African and not-African countries 87 | # create a variable to index Africa (1) or not (2) 88 | dd$cid <- ifelse(dd$cont_africa == 1, 1, 2) 89 | m8.4 <- quap( 90 | alist( 91 | log_gdp_std ~ dnorm( mu , sigma ) , 92 | mu <- a[cid] + b * (rugged_std - 0.215), 93 | a[cid] ~ dnorm(1, 0.1) , 94 | b ~ dnorm(0, 0.3), 95 | sigma ~ dexp(1) 96 | ) , 97 | data = dd 98 | ) 99 | # We compare the two models that use the full dataset 100 | compare(m8.3, m8.4) 101 | # we look at the posterior parameters 102 | precis(m8.4, depth = 2) 103 | # let's plot the posterior predictions 104 | rugged_seq <- seq(from = -0.1, to = 1.1, length.out = 30) 105 | # compute mu over samples, fixing cid = 2 (not-Africa) 106 | mu.NotAfrica <- link(m8.4, data = data.frame(cid = 2, rugged_std = rugged_seq)) 107 | # compute mu over samples, fixing cid = 1 (Africa) 108 | mu.Africa <- link(m8.4, data = data.frame(cid = 1, rugged_std = rugged_seq)) 109 | # summarize to means and intervals 110 | mu.NotAfrica_mu <- apply(mu.NotAfrica, 2, mean) 111 | mu.NotAfrica_ci <- apply(mu.NotAfrica, 2, PI, prob = 0.97) 112 | mu.Africa_mu <- apply(mu.Africa, 2, mean) 113 | mu.Africa_ci <- apply(mu.Africa, 2, PI, prob = 0.97) 114 | # we can see that the model with the indexed intercept (a[cid]) is not a better model because 115 | # the interaction used for the intercept doesn't help explaining the different 116 | # role of ruggedness on GDP we saw at the beginning for African and not-African countries. 
117 | # We have to make the parameter b also dependent on being Africa or not-Africa 118 | m8.5 <- quap( 119 | alist( 120 | log_gdp_std ~ dnorm( mu , sigma ) , 121 | mu <- a[cid] + b[cid] * (rugged_std - 0.215), 122 | a[cid] ~ dnorm(1, 0.1) , 123 | b[cid] ~ dnorm(0, 0.3), 124 | sigma ~ dexp(1) 125 | ) , 126 | data = dd 127 | ) 128 | precis(m8.5, depth = 2) 129 | # Let's see how much adding the interaction at the slope improves the model 130 | compare(m8.3, m8.4, m8.5) 131 | # plot Africa data points, cid = 1 132 | plot(d.A1$rugged_std, d.A1$log_gdp_std, pch = 16, col = rangi2, 133 | xlab = "ruggedness (standardized)", ylab = "log GDP (as proportion of mean)", 134 | xlim = c(0,1)) 135 | mu <- link(m8.5, data = data.frame(cid = 1, rugged_std = rugged_seq)) 136 | mu_mean <- apply(mu, 2, mean) 137 | mu_ci <- apply(mu, 2, PI, prob = 0.97) 138 | lines(rugged_seq, mu_mean, lwd = 2) 139 | shade(mu_ci, rugged_seq, col = col.alpha(rangi2, 0.3)) 140 | mtext("African nations") 141 | 142 | # plot not-Africa data points, cid = 2 143 | plot(d.A1$rugged_std, d.A1$log_gdp_std, pch = 16, col = rangi2, 144 | xlab = "ruggedness (standardized)", ylab = "log GDP (as proportion of mean)", 145 | xlim = c(0,1)) 146 | mu <- link(m8.5, data = data.frame(cid = 2, rugged_std = rugged_seq)) 147 | mu_mean <- apply(mu, 2, mean) 148 | mu_ci <- apply(mu, 2, PI, prob = 0.97) 149 | lines(rugged_seq, mu_mean, lwd = 2) 150 | shade(mu_ci, rugged_seq, col = col.alpha(rangi2, 0.3)) 151 | mtext("Non-African nations") -------------------------------------------------------------------------------- /r/rethinking/masked_relationship.R: -------------------------------------------------------------------------------- 1 | # R code 5.18 rethinking book 2 | # Masked relationship 3 | # We investigate the relationships between the milk kilocalories (K) and two 4 | # other variables: neocortex percentage (N) and log body mass (M). We first build 5 | # two bivariate models (K ~ N) and (K ~ N) from which we see that taken separately 6 | # the two variables N and M have a weak relationship with K. Then we build a 3rd 7 | # multivariate model taking into account both N and M together to show that they 8 | # have a strong relationship with K. 9 | library(rethinking) 10 | data(milk) 11 | d <- milk 12 | str(d) 13 | d$K <- scale(d$kcal.per.g) 14 | d$N <- scale(d$neocortex.perc) # contains NA values 15 | d$M <- scale(log(d$mass)) 16 | 17 | # remove NA values 18 | dcc <- d[complete.cases(d$K, d$N, d$M), ] 19 | # --------------------------------------------------------------------- 20 | # 1) builds the first model kilocalories (K) - neocortex percentage (N) 21 | # --------------------------------------------------------------------- 22 | m5.5_draft <- quap( 23 | alist( 24 | K ~ dnorm( mu , sigma ) , 25 | mu <- a + bN * N , 26 | a ~ dnorm( 0 , 1 ) , 27 | bN ~ dnorm( 0 , 1 ) , 28 | sigma ~ dexp(1) 29 | ) , 30 | data = dcc 31 | ) 32 | 33 | # builds a 2nd model kilocalories (K) - neocortex percentage (N) 34 | # with smaller interval values for the predictors a and bN 35 | # This model should produce more reasonable priors and as a 36 | # consequence also posterior. 
37 | m5.5 <- quap( 38 | alist( 39 | K ~ dnorm( mu , sigma ) , 40 | mu <- a + bN * N , 41 | a ~ dnorm( 0 , 0.2 ) , 42 | bN ~ dnorm( 0 , 0.5 ) , 43 | sigma ~ dexp(1) 44 | ) , 45 | data = dcc 46 | ) 47 | 48 | 49 | #prior <- extract.prior(m5.5_draft) 50 | prior <- extract.prior(m5.5) 51 | 52 | # plot the prior regression lines 53 | xseq <- c(-2,2) 54 | mu <- link(m5.5, post = prior, data = list(N = xseq)) 55 | plot(NULL, xlim = xseq, ylim = xseq) # plot the frame 56 | for (i in 1:50) 57 | lines(xseq, mu[i, ], col = col.alpha("black", 0.3)) # plausible regression lines 58 | 59 | # Let's look at the posterior 60 | # It shows that the relationship between milk kilocalories and neocortex is weak: 61 | # small value of the bN parameter and standard deviation almost twice the mean. 62 | precis(m5.5) 63 | 64 | # Let's plot the posterior 65 | xseq <- seq(from = min(dcc$N) - 0.15, to = max(dcc$N) + 0.15, length.out = 30) 66 | mu <- link(m5.5, data = list(N = xseq)) 67 | mu_mean <- apply(mu, 2, mean) 68 | mu_PI <- apply(mu, 2, PI) 69 | plot(K ~ N, data = dcc) 70 | lines(xseq, mu_mean, lwd = 2) 71 | shade(mu_PI, xseq) 72 | 73 | 74 | # -------------------------------------------------------- 75 | # 2) builds the 2nd model kilocalories (K) - body mass (M) 76 | # -------------------------------------------------------- 77 | m5.6 <- quap( 78 | alist( 79 | K ~ dnorm( mu , sigma ) , 80 | mu <- a + bM * M , 81 | a ~ dnorm( 0 , 0.2 ) , 82 | bM ~ dnorm( 0 , 0.5 ) , 83 | sigma ~ dexp(1) 84 | ) , 85 | data = dcc 86 | ) 87 | 88 | prior <- extract.prior(m5.6) 89 | 90 | # plot the prior regression lines 91 | xseq <- c(-2,2) 92 | mu <- link(m5.6, post = prior, data = list(M = xseq)) 93 | plot(NULL, xlim = xseq, ylim = xseq) # plot the frame 94 | for (i in 1:50) 95 | lines(xseq, mu[i, ], col = col.alpha("black", 0.3)) # plausible regression lines 96 | 97 | # Let's look at the posterior 98 | # It shows that also the relationship between milk kilocalories and body mass is weak: 99 | # small (negative) value of the bM parameter and comparable standard deviation. 100 | precis(m5.6) 101 | 102 | # Let's plot the posterior 103 | xseq <- seq(from = min(dcc$M) - 0.15, to = max(dcc$M) + 0.15, length.out = 30) 104 | mu <- link(m5.6, data = list(M = xseq)) 105 | mu_mean <- apply(mu, 2, mean) 106 | mu_PI <- apply(mu, 2, PI) 107 | plot(K ~ N, data = dcc) 108 | lines(xseq, mu_mean, lwd = 2) 109 | shade(mu_PI, xseq) 110 | 111 | # ------------------------------------------------------------------------------------------------ 112 | # 3) builds the 3nd multivariate model kilocalories (K) depends on neocortex (N) and body mass (M) 113 | # ------------------------------------------------------------------------------------------------ 114 | # We build a multivariate linear model with milk kilocalories (K) that depends linearly by both the 115 | # neocortex percentage (N) and the body mass (M) 116 | m5.7 <- quap( 117 | alist( 118 | K ~ dnorm( mu , sigma ) , 119 | mu <- a + bN * N + bM * M , 120 | a ~ dnorm( 0 , 0.2 ) , 121 | bN ~ dnorm( 0 , 0.5 ) , 122 | bM ~ dnorm( 0 , 0.5 ) , 123 | sigma ~ dexp(1) 124 | ) , 125 | data = dcc 126 | ) 127 | # We can see, from the posterior mean and standard deviation, that K depends strongly on both N and M 128 | precis(m5.7) 129 | #plot(coeftab(m5.5, m5.6, m5.7), pars = c("bM", "bN")) # this doesn't work 130 | pairs(~K + M + N, dcc) 131 | 132 | # Let's now draw the counterfactual plot of the multivariate model. 
133 | # Here we keep the neocortex percentage (N) constant at 0 (N = 0) so that we can see 134 | # the relation between K and the body mass M. We can see that the relation is stronger 135 | # in the multivariate model. 136 | xseq <- seq(from = min(dcc$M) - 0.15, to = max(dcc$M) + 0.15, length.out = 30) 137 | mu <- link(m5.7, data = data.frame(M = xseq, N = 0)) 138 | mu_mean <- apply(mu, 2, mean) 139 | mu_PI <- apply(mu, 2, PI) 140 | plot(NULL, xlim = range(dcc$M), ylim = range(dcc$K)) 141 | lines(xseq, mu_mean, lwd = 2) 142 | shade(mu_PI, xseq) 143 | 144 | # Here we keep the body mass (M) constant at 0 (M = 0) so that we can see 145 | # the relation between K and the neocortex percentage (N). We can see also in this case 146 | # that the relation is stronger in the multivariate model. 147 | xseq <- seq(from = min(dcc$N) - 0.15, to = max(dcc$N) + 0.15, length.out = 30) 148 | mu <- link(m5.7, data = data.frame(N = xseq, M = 0)) 149 | mu_mean <- apply(mu, 2, mean) 150 | mu_PI <- apply(mu, 2, PI) 151 | plot(NULL, xlim = range(dcc$M), ylim = range(dcc$K)) 152 | lines(xseq, mu_mean, lwd = 2) 153 | shade(mu_PI, xseq) 154 | -------------------------------------------------------------------------------- /python/atn/resistors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Resistors" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "scrolled": true 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "C:\\Users\\lselmi\\cygwin64\\home\\lselmi\\anaconda\\altertech\\datasets\n" 22 | ] 23 | } 24 | ], 25 | "source": [ 26 | "cd C:\\Users\\lselmi\\cygwin64\\home\\lselmi\\anaconda\\altertech\\datasets" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "data": { 36 | "text/plain": [ 37 | "'0.22.0'" 38 | ] 39 | }, 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "output_type": "execute_result" 43 | } 44 | ], 45 | "source": [ 46 | "import pandas as pd\n", 47 | "import numpy as np\n", 48 | "from datetime import datetime\n", 49 | "from pandas import Series, DataFrame\n", 50 | "%matplotlib inline\n", 51 | "import matplotlib.pyplot as plt\n", 52 | "plt.style.use('seaborn-whitegrid')\n", 53 | "pd.__version__" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "### Read the data" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 3, 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stderr", 70 | "output_type": "stream", 71 | "text": [ 72 | "C:\\Users\\lselmi\\Downloads\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:2728: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.\n", 73 | " interactivity=interactivity, compiler=compiler, result=result)\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "sales = pd.read_csv('PurchaseData_20180319.csv')" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "# Resistors Data Analysis\n", 86 | "In order to predict the price (POP_UnitPrice) and the delivery time (PO_Date) of resistor we have received from ATN, with the data, a list of features that should be relevant for the task at hand and a set of rules. 
The relevant features are a subset of the fields in the data set.\n", 87 | "\n", 88 | "1. Component number (ComponentNumber_MAT_Flight)\n", 89 | "2. Specification name (SpecificationName)\n", 90 | "3. Family path (FamilyPath_Flight)\n", 91 | "4. Style (Style_Flight)\n", 92 | "5. Quality level (QLevel_Flight)\n", 93 | "6. Package class (PACKAGECLASS)\n", 94 | "7. Package (PACKAGE)\n", 95 | "8. Resistance (RESISTANCE_N)\n", 96 | "9. Resistance case size (RES_CASESIZE)\n", 97 | "10. Tolerance (TOLERANCE_N)\n", 98 | "11. Temperature coefficient (TEMPCOEFFICIENT_N)\n", 99 | "12. Quality Value Name (QualityValueName)\n", 100 | "13. Manufacturer (MnfrDoeeetName)\n", 101 | "14. Quantity (POP_Qty)\n", 102 | "15. Date of purchase (PO_Date)\n", 103 | "16. Unit price (POP_UnitPrice)\n", 104 | "17. Date of delivery (POP_DeliveryDate)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Data preparation for resistors\n", 112 | "Before implementing the algorithm to predict unit price and delivery time for a resistor, we have to extract the records from the sale orders data set and apply the following transformation\n", 113 | "\n", 114 | "1. Select the records about resistors (family root -> resistors)\n", 115 | "2. Extract the most specific family of the component from the hierarchy (family path)\n", 116 | "3. Filter out the records that are about services (remove price label -> material unit price or pop_quantity_unit -> ST)\n", 117 | "4. Transform all the prices in euro\n", 118 | "5. Update the all the unit prices applying an increase of 5 % per year (using the compund interest formula) \n", 119 | "\n", 120 | "After the data is prepared we can implement the algorithms \n", 121 | "\n", 122 | "1. Price prediction\n", 123 | "2. 
Delivery time prediction" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "#### Use only records without charges" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 7, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "18508" 142 | ] 143 | }, 144 | "execution_count": 7, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "sales = sales[sales['PRICE LABEL'] == 'MATERIAL UNIT PRICE']\n", 151 | "sales.index.size" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "#### Select the records about resistors" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 8, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "Number of records for resistor: 8027\n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "resistor_records = sales[sales['FamilyRoot'] == 'Resistors']\n", 176 | "num_resistor_records = resistor_records.index.size\n", 177 | "print(\"Number of records for resistor: \" + str(num_resistor_records))" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "Python 3", 191 | "language": "python", 192 | "name": "python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.6.4" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 2 209 | } 210 | -------------------------------------------------------------------------------- /datasets.md: -------------------------------------------------------------------------------- 1 | Open Data Sets 2 | ============== 3 | A list of websites that provide data sets about science, economics and finance 4 | from Italy, Europe, the US, and intergovernmental organizations such as WHO, WMO, OECD, IMF and others. 
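Most of the portals listed below publish their data as downloadable files (typically CSV) or through a REST API. As a purely illustrative sketch (the URL is a placeholder, not a real endpoint), such a file can be loaded directly into a pandas DataFrame:

```python
# Illustrative only: the URL below is a placeholder for a CSV download link
# taken from one of the portals listed in this page.
import pandas as pd

url = "https://example.org/open_data.csv"  # placeholder URL
df = pd.read_csv(url)
print(df.head())
```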
5 | 6 | * Energy 7 | * European Union 8 | * [Gas Infrastructure Europe - Aggregated Gas Storage Inventory](https://agsi.gie.eu/) 9 | * [European Network of Transmission Operators for Gas](https://www.entsog.eu/) 10 | * Italy 11 | * [Gestore Mercati Energetici](https://www.mercatoelettrico.org/It/default.aspx) 12 | * [Gestore Servizi Energetici](https://gse.it/) 13 | * [Agenzia nazionale per le nuove tecnologie, l'energia e lo sviluppo economico sostenibile (ENEA)](https://www.enea.it/) 14 | 15 | * United States 16 | * [Energy Information Administration (US)](https://www.eia.gov/) 17 | 18 | * International Organizations 19 | * [International Energy Agency](https://www.iea.org/) 20 | * [International Renewable Energy Agency](https://www.irena.org/) 21 | 22 | * Economy 23 | * European Union 24 | * [European Central Bank Data Portal](https://data.ecb.europa.eu) 25 | * [European Securities and Market Authority](https://www.esma.europa.eu/) 26 | 27 | * Italy 28 | * [Banca d'Italia - Bank of Italy](https://www.bancaditalia.it/) 29 | * [Commissione Nazionale per le Societa' e la Borsa (CONSOB)](https://www.consob.it/) 30 | * [Agenzia delle Entrate](https://www.agenziaentrate.gov.it/portale/) 31 | 32 | * United States 33 | * [Federal Reserve Bank of St. Louis](https://fred.stlouisfed.org/) 34 | * [The Federal Reserve](https://www.federalreserve.gov/) 35 | * [U.S. Security and Exchange Commission](https://www.sec.gov/) 36 | * [Bureau of Economic Analysis](https://www.bea.gov/) 37 | * [U.S. Department of the Treasury](https://home.treasury.gov/) 38 | * [U.S. Securities and Exchange Commission - EDGAR Database](https://www.sec.gov/os/accessing-edgar-data) 39 | * [U.S. The Federal Deposit Insurance Corporation](https://www.fdic.gov/) 40 | * [U.S. Department of Commerce](https://www.commerce.gov/) 41 | * [National Association of Realtors](https://www.nar.realtor/) 42 | * [U.S. Department of the Treasury](https://home.treasury.gov/) 43 | * [U.S. 
Congressional Budget Office](https://www.cbo.gov/) 44 | * [FRED - Import Price Index](https://fred.stlouisfed.org/series/IREXPET) 45 | * [FRED - Consumer Sentiment](https://fred.stlouisfed.org/series/UMCSENT) 46 | * [Wikipedia - List of largest daily changes in the S&P 500 Index](https://en.wikipedia.org/wiki/List_of_largest_daily_changes_in_the_S%26P_500_Index) 47 | * [Wikipedia - List of largest daily changes in the Dow Jones Industrial Average](https://en.wikipedia.org/wiki/List_of_largest_daily_changes_in_the_Dow_Jones_Industrial_Average) 48 | * [Congressional Budget Office](https://www.cbo.gov/) 49 | 50 | * International Organizations 51 | * [International Monetary Fund](https://www.imf.org/en/Home) 52 | * [The World Bank](https://www.worldbank.org/en/home) 53 | * [The Organisation for Economic Co-operation and Development (OECD)](https://www.oecd.org/) 54 | * [The Bank for International Settlements](https://www.bis.org/) 55 | * [Food and Agriculture Organization of the United Nations](https://www.fao.org/) 56 | * [World Trade Organization](https://www.wto.org/) 57 | * [The Conference Board](https://www.conference-board.org/eu/) 58 | * [The World Intellectual Property Organization (WIPO)](https://www.wipo.int/portal/en/index.html) 59 | 60 | * Environment 61 | * European Union 62 | * [The European Environment Agency](https://www.eea.europa.eu/) 63 | * [EEA Industrial Reporting Database 2022](https://www.eea.europa.eu/data-and-maps/data/industrial-reporting-under-the-industrial-6) 64 | * [Extreme Wind Storms Catalogue](http://www.europeanwindstorms.org/) 65 | * [European Severe Storms Laboratory](https://www.essl.org/) 66 | * [GHSL - Global Human Settlement Layer (Copernicus Emergency Management Service - Exposure Mapping Component)](https://human-settlement.emergency.copernicus.eu/index.php) 67 | * [Copernicus Emergency Management Service](https://emergency.copernicus.eu/) 68 | * [Integrated Carbon Observation System](https://www.icos-cp.eu/) 69 | 70 | * Italy 71 | * [The Italian Institute for Environmental Protection and Research - ISPRA](https://www.isprambiente.gov.it/en) 72 | * [IdroGEO Italian Web Platform on Landslides and Floods](https://idrogeo.isprambiente.it/app/) 73 | * [INGV Terremoti - Earthquakes](https://ingvterremoti.com/) 74 | * [Istituto Nazionale di Geofisica e Vulcanologia](https://www.ingv.it/en/) 75 | * [Osservatorio Vesuviano](https://www.ov.ingv.it/index.php) 76 | * [INGV TINITALY - Digital Elevation Model of Italy](https://data.ingv.it/dataset/185#additional-metadata) 77 | * United States 78 | * [Federal Emergency Management Agency (FEMA)](https://www.fema.gov/) 79 | * [US Geological Survey](https://www.usgs.gov/) 80 | * [NOAA Climate Prediction Center](https://www.cpc.ncep.noaa.gov/) 81 | * [USGS Water Data for the Nation](https://waterdata.usgs.gov/nwis) 82 | * [SHELDUS - US Center for Emergency Management and Homeland Security](https://cemhs.asu.edu/sheldus) 83 | * International Organizations 84 | * [World Glacier Monitoring Service](https://wgms.ch/) 85 | 86 | * Health 87 | * [World Health Organization - The Global Health Observatory](https://www.who.int/data/gho) 88 | * [Stockholm Convention on Persistent Organic Pollutants](https://www.pops-gmp.org/index.html) 89 | 90 | * Social Sciences 91 | * European Union 92 | * [Eurostat - The Statistical Office of the European Union](https://ec.europa.eu/eurostat) 93 | 94 | * Italy 95 | * [Istituto Nazionale di Statistica - Italian National Institute of Statistics (IT)](https://www.istat.it/en/) 96 | 97 | * United 
States 98 | * [Bureau of Labor Statistics](https://www.bls.gov/) 99 | * [Census Bureau](https://data.census.gov/cedsci/) 100 | 101 | * Water 102 | * European Union 103 | [Water Information System for Europe](https://water.europa.eu/) 104 | 105 | * Geospatial Data 106 | * European Union 107 | [European Centre for Medium-Range Weather Forecasts Data Store](https://data.ecmwf.int/) 108 | [The Geographic Information System of the Commission (GISCO)](https://ec.europa.eu/eurostat/web/gisco) 109 | -------------------------------------------------------------------------------- /python/parsing-data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Parsing data" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Parsing HTML data" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 54, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from urllib.request import urlopen\n", 24 | "from lxml.html import parse\n", 25 | "parsed = parse(urlopen('https://raw.githubusercontent.com/luigiselmi/datascience/master/python/finance/data/world_companies.html'))\n", 26 | "doc = parsed.getroot()\n", 27 | "\n", 28 | "tables = doc.findall('.//table')\n", 29 | "\n", 30 | "table = tables[0]\n", 31 | "\n", 32 | "rows = table.findall('.//tr')\n", 33 | "\n", 34 | "def _unpack(row, kind):\n", 35 | " elts = row.findall('.//%s' % kind)\n", 36 | " return [val.text_content() for val in elts]" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 55, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "['Company', 'Contact', 'Country']" 48 | ] 49 | }, 50 | "execution_count": 55, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "_unpack(rows[0], kind='th') # header" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 56, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "[['Alfreds Futterkiste', 'Maria Anders', 'Germany'],\n", 68 | " ['Centro comercial Moctezuma', 'Francisco Chang', 'Mexico']]" 69 | ] 70 | }, 71 | "execution_count": 56, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "[_unpack(rows[i], kind='td') for i in range(1, 3)]" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 57, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/html": [ 88 | "
\n", 89 | "\n", 102 | "\n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | "
CompanyContactCountry
0Alfreds FutterkisteMaria AndersGermany
1Centro comercial MoctezumaFrancisco ChangMexico
\n", 126 | "
" 127 | ], 128 | "text/plain": [ 129 | " Company Contact Country\n", 130 | "0 Alfreds Futterkiste Maria Anders Germany\n", 131 | "1 Centro comercial Moctezuma Francisco Chang Mexico" 132 | ] 133 | }, 134 | "execution_count": 57, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "from pandas.io.parsers import TextParser\n", 141 | "\n", 142 | "def parse_options_data(table):\n", 143 | " rows = table.findall('.//tr')\n", 144 | " header = _unpack(rows[0], kind='th')\n", 145 | " data = [_unpack(r,'td') for r in rows[1:]]\n", 146 | " return TextParser(data, names=header).get_chunk()\n", 147 | "\n", 148 | "df = parse_options_data(table) # returns a dataframe\n", 149 | "df" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 58, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "0 Alfreds Futterkiste\n", 161 | "1 Centro comercial Moctezuma\n", 162 | "Name: Company, dtype: object" 163 | ] 164 | }, 165 | "execution_count": 58, 166 | "metadata": {}, 167 | "output_type": "execute_result" 168 | } 169 | ], 170 | "source": [ 171 | "df['Company']" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 59, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "0 Maria Anders\n", 183 | "1 Francisco Chang\n", 184 | "Name: Contact, dtype: object" 185 | ] 186 | }, 187 | "execution_count": 59, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "df['Contact']" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "## Parsing XML data" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 60, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "from lxml import objectify\n", 210 | "parsed_xml = objectify.parse(urlopen('https://raw.githubusercontent.com/luigiselmi/datascience/master/python/finance/data/persons.xml'))\n", 211 | "xml_root = parsed_xml.getroot()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 61, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "person = []\n", 221 | "for element in xml_root.person:\n", 222 | " element_data = {}\n", 223 | " for child in element.getchildren():\n", 224 | " element_data[child.tag] = child.pyval\n", 225 | " person.append(element_data)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 64, 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "data": { 235 | "text/plain": [ 236 | "['Pippo', 'Mickey']" 237 | ] 238 | }, 239 | "execution_count": 64, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | } 243 | ], 244 | "source": [ 245 | "[person[i]['firstname'] for i in range(0, len(person))]" 246 | ] 247 | } 248 | ], 249 | "metadata": { 250 | "kernelspec": { 251 | "display_name": "Python 3 (ipykernel)", 252 | "language": "python", 253 | "name": "python3" 254 | }, 255 | "language_info": { 256 | "codemirror_mode": { 257 | "name": "ipython", 258 | "version": 3 259 | }, 260 | "file_extension": ".py", 261 | "mimetype": "text/x-python", 262 | "name": "python", 263 | "nbconvert_exporter": "python", 264 | "pygments_lexer": "ipython3", 265 | "version": "3.9.13" 266 | } 267 | }, 268 | "nbformat": 4, 269 | "nbformat_minor": 2 270 | } 271 | -------------------------------------------------------------------------------- /python/atn/logfiles/Users_Navigation_Data.doc: 
-------------------------------------------------------------------------------- 1 | Date: Mon, 25 Nov 2019 16:22:59 +0100 (CET) 2 | Message-ID: <1508649183.873.1574695379478@confluence-new.iais.fraunhofer.de> 3 | Subject: Exported From Confluence 4 | MIME-Version: 1.0 5 | Content-Type: multipart/related; 6 | boundary="----=_Part_872_1440509474.1574695379478" 7 | 8 | ------=_Part_872_1440509474.1574695379478 9 | Content-Type: text/html; charset=UTF-8 10 | Content-Transfer-Encoding: quoted-printable 11 | Content-Location: file:///C:/exported.html 12 | 13 | 17 | 18 | 20 | Users Navigation Data 21 | 35 | 231 | 232 | 233 |

Users Navigation Data

234 |
235 |

ATN provides log files for different user-generated events collected during a session on their portal (e.g. detailed description visualization, comparison, download). The log files contain the time-stamp, the user identifier and the item identifier. The event type is available from the name of the log file. These events represent an implicit feedback about the interest of a user in an item. The feedback is collected by parsing the log files into records with the following structure:

242 |

userID, itemID, feedback_value, feedback_type, time-stamp, query

243 |

The feedback type is the event type (i.e. view, comparison, download). We can assume a default feedback_value = 1 for a view event. We can also assume that a comparison or a download has the same value as a view when estimating the relevance of an item, or that they represent stronger evidence of relevance and assign them a feedback_value = 2. The query is the query used in the search whose result contains the item. Examples of the log files are attached.
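As a rough sketch of how such a record could be assembled (the exact layout of the ATN log files is not shown here, so the input fields below are assumptions):

    from datetime import datetime

    # Feedback values as described above: view = 1, comparison and download
    # treated as stronger evidence of relevance = 2 (assumed mapping).
    FEEDBACK_VALUES = {"view": 1, "comparison": 2, "download": 2}

    def make_feedback_record(user_id, item_id, event_type, timestamp, query=""):
        # Returns a (userID, itemID, feedback_value, feedback_type, time-stamp, query) tuple;
        # unknown event types fall back to the default feedback_value = 1.
        value = FEEDBACK_VALUES.get(event_type, 1)
        return (user_id, item_id, value, event_type, timestamp, query)

    # Example usage with made-up values
    record = make_feedback_record("u42", "item-007", "download",
                                  datetime(2019, 11, 25, 16, 22), "rad-hard op-amp")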

250 |


251 |
252 | 253 | 254 | ------=_Part_872_1440509474.1574695379478-- 255 | -------------------------------------------------------------------------------- /python/python_oop.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2d5cf880", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python OOP\n", 9 | "We will see how Object Oriented Programming is implemented in Python. the special method __init__ is called each time the class is istantiated." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "id": "d9fed405", 15 | "metadata": {}, 16 | "source": [ 17 | "### Class definition" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 42, 23 | "id": "3485cab8", 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "class HumanBeing(object):\n", 28 | " def __init__(self, first_name, eye_color):\n", 29 | " self.first_name = first_name\n", 30 | " self.eye_color = eye_color\n", 31 | " self.position = 0\n", 32 | " def walk_steps(self, steps):\n", 33 | " self.position += steps" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 43, 39 | "id": "68a35edb", 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "luigi = HumanBeing('Luigi', 'brown')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 44, 49 | "id": "410ba0f9", 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/plain": [ 55 | "'Luigi'" 56 | ] 57 | }, 58 | "execution_count": 44, 59 | "metadata": {}, 60 | "output_type": "execute_result" 61 | } 62 | ], 63 | "source": [ 64 | "luigi.first_name" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 45, 70 | "id": "3c63be48", 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "3" 77 | ] 78 | }, 79 | "execution_count": 45, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "luigi.walk_steps(3)\n", 86 | "luigi.position" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 46, 92 | "id": "78c99245", 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "text/plain": [ 98 | "32" 99 | ] 100 | }, 101 | "execution_count": 46, 102 | "metadata": {}, 103 | "output_type": "execute_result" 104 | } 105 | ], 106 | "source": [ 107 | "luigi.__sizeof__()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 51, 113 | "id": "dd9852ab", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "class FinancialInstrument(object):\n", 118 | " def __init__(self, symbol, price):\n", 119 | " self.symbol = symbol\n", 120 | " self.__price = price # private attribute\n", 121 | " pass" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 52, 127 | "id": "ecb28d2f", 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "<__main__.FinancialInstrument at 0x162e2c42160>" 134 | ] 135 | }, 136 | "execution_count": 52, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "eni_mi = FinancialInstrument('ENI.MI', 11.70)\n", 143 | "eni_mi" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "id": "04190d29", 149 | "metadata": {}, 150 | "source": [ 151 | "Data attributes can be defined on the fly" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 53, 157 | "id": "2a86ed30", 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 
162 | "text/plain": [ 163 | "1000" 164 | ] 165 | }, 166 | "execution_count": 53, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "eni_mi.num_shares = 1000\n", 173 | "eni_mi.num_shares" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "id": "770c0241", 179 | "metadata": {}, 180 | "source": [ 181 | "### Inheritance" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 56, 187 | "id": "7bf80411", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "class FinancialInstrument(FinancialInstrument):\n", 192 | " def get_price(self):\n", 193 | " return self.__price # private attribute\n", 194 | " def set_price(self, price):\n", 195 | " self.__price = price" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 60, 201 | "id": "a5abe686", 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "12.0" 208 | ] 209 | }, 210 | "execution_count": 60, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "eni_stock = FinancialInstrument('ENI.MI', 12.0)\n", 217 | "eni_stock.get_price()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 65, 223 | "id": "2489e483", 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "data": { 228 | "text/plain": [ 229 | "2.4" 230 | ] 231 | }, 232 | "execution_count": 65, 233 | "metadata": {}, 234 | "output_type": "execute_result" 235 | } 236 | ], 237 | "source": [ 238 | "eni_stock.__price" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 66, 244 | "id": "fea81913", 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "12.0" 251 | ] 252 | }, 253 | "execution_count": 66, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "eni_stock.get_price()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 67, 265 | "id": "eae576d3", 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "data": { 270 | "text/plain": [ 271 | "2.4" 272 | ] 273 | }, 274 | "execution_count": 67, 275 | "metadata": {}, 276 | "output_type": "execute_result" 277 | } 278 | ], 279 | "source": [ 280 | "eni_stock.__price = 2.4\n", 281 | "eni_stock.__price" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 68, 287 | "id": "e19273cb", 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "class PortfolioPosition(object):\n", 292 | " def __init__(self, financial_instrument, position_size):\n", 293 | " self.position = financial_instrument\n", 294 | " self.__position_size = position_size\n", 295 | " def get_position_size(self):\n", 296 | " return self.__position_size\n", 297 | " def update_position_size(self, position_size):\n", 298 | " self.__position_size = position_size\n", 299 | " def get_position_value(self):\n", 300 | " return self.__position_size * self.position.get_price()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 69, 306 | "id": "197a8c6f", 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "10" 313 | ] 314 | }, 315 | "execution_count": 69, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "pp = PortfolioPosition(eni_stock, 10)\n", 322 | "pp.get_position_size()" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 70, 328 | "id": "3c703890", 
329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "data": { 333 | "text/plain": [ 334 | "120.0" 335 | ] 336 | }, 337 | "execution_count": 70, 338 | "metadata": {}, 339 | "output_type": "execute_result" 340 | } 341 | ], 342 | "source": [ 343 | "pp.get_position_value()" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "id": "4fa8f29c", 349 | "metadata": {}, 350 | "source": [ 351 | "## Yahoo! Finance" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 74, 357 | "id": "2a77beb4", 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "data": { 362 | "text/html": [ 363 | "
\n", 364 | "\n", 377 | "\n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | "
OpenHighLowCloseVolumeDividendsStock Splits
Date
2022-09-0511.97812.43211.91012.290127934360.00.0
2022-09-0612.25812.28211.78611.952175281690.00.0
2022-09-0711.79011.97011.54011.618179032410.00.0
2022-09-0811.63811.75611.41011.582167498340.00.0
2022-09-0911.62211.81811.58611.682107066610.00.0
\n", 453 | "
" 454 | ], 455 | "text/plain": [ 456 | " Open High Low Close Volume Dividends Stock Splits\n", 457 | "Date \n", 458 | "2022-09-05 11.978 12.432 11.910 12.290 12793436 0.0 0.0\n", 459 | "2022-09-06 12.258 12.282 11.786 11.952 17528169 0.0 0.0\n", 460 | "2022-09-07 11.790 11.970 11.540 11.618 17903241 0.0 0.0\n", 461 | "2022-09-08 11.638 11.756 11.410 11.582 16749834 0.0 0.0\n", 462 | "2022-09-09 11.622 11.818 11.586 11.682 10706661 0.0 0.0" 463 | ] 464 | }, 465 | "execution_count": 74, 466 | "metadata": {}, 467 | "output_type": "execute_result" 468 | } 469 | ], 470 | "source": [ 471 | "import yfinance as yf\n", 472 | "eni_mi = yf.Ticker(\"ENI.MI\")\n", 473 | "hist = eni_mi.history(period=\"max\")\n", 474 | "hist.tail()" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "id": "4f84984c", 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [] 484 | } 485 | ], 486 | "metadata": { 487 | "kernelspec": { 488 | "display_name": "Python 3 (ipykernel)", 489 | "language": "python", 490 | "name": "python3" 491 | }, 492 | "language_info": { 493 | "codemirror_mode": { 494 | "name": "ipython", 495 | "version": 3 496 | }, 497 | "file_extension": ".py", 498 | "mimetype": "text/x-python", 499 | "name": "python", 500 | "nbconvert_exporter": "python", 501 | "pygments_lexer": "ipython3", 502 | "version": "3.9.13" 503 | } 504 | }, 505 | "nbformat": 4, 506 | "nbformat_minor": 5 507 | } 508 | -------------------------------------------------------------------------------- /python/atn/microcircuites_and_descretes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Microcircuits and Descretes" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "C:\\Users\\lselmi\\cygwin64\\home\\lselmi\\anaconda\\altertech\\datasets\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "cd C:\\Users\\lselmi\\cygwin64\\home\\lselmi\\anaconda\\altertech\\datasets" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 48, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "'0.22.0'" 36 | ] 37 | }, 38 | "execution_count": 48, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "import pandas as pd\n", 45 | "import numpy as np\n", 46 | "from datetime import datetime\n", 47 | "from pandas import Series, DataFrame\n", 48 | "%matplotlib inline\n", 49 | "import matplotlib.pyplot as plt\n", 50 | "import warnings; warnings.simplefilter('ignore')\n", 51 | "plt.style.use('seaborn-whitegrid')\n", 52 | "pd.__version__" 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": {}, 58 | "source": [ 59 | "### Read the data" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 85, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "sales = pd.read_csv('PurchaseData_20180319.csv')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "# Microcircuits Data Analysis\n", 76 | "In order to predict the price (POP_UnitPrice) and the delivery time (PO_Date) of microcircuits we have received from ATN, with the data, a list of features that should be relevant for the task at hand and a set of rules. 
The relevant features are a subset of the fields in the data set.\n", 77 | "\n", 78 | "1. Component number (ComponentNumber_MAT_Flight)\n", 79 | "2. Specification name (SpecificationName)\n", 80 | "3. Family path (FamilyPath_Flight)\n", 81 | "4. Style (Style_Flight)\n", 82 | "5. Quality level (QLevel_Flight)\n", 83 | "6. Package class (PACKAGECLASS)\n", 84 | "7. Package (PACKAGE)\n", 85 | "8. Finish (FINISH)\n", 86 | "9. Radiation level (TID_HDR_N)\n", 87 | "10. Quality Value Name (QualityValueName)\n", 88 | "11. Manufacturer (MnfrDoeeetName)\n", 89 | "12. Quantity (POP_Qty)\n", 90 | "13. Date of purchase (PO_Date)\n", 91 | "14. Unit price (POP_UnitPrice)\n", 92 | "15. Date of delivery (POP_DeliveryDate)\n", 93 | "\n", 94 | "The assumption is that the data and the rules should allow us to predict the price and delivery time of a microcircuit whether there are records about that specific microcircuit in the sample data set or not. The first 12 paramenters are called features or predictors while the unit price and the date of delivery are called targets. A client, requesting a prediction about the price of a component, will send in the request the predictors that will allow the server to \n", 95 | "\n", 96 | "1. Identify the component (component number, specification name, family path)\n", 97 | "2. Determine the quality characteristics of the component (style, package, package class, finish, radiation level, quality value)\n", 98 | "3. Use other information that might impact the price (manufacturer, quantity, date of purchase) \n", 99 | "\n", 100 | "In order to make a prediction of the price of a component, the algorithm looks into the data to find records about that same component and return the unit price or an average value. In case no records are available about that component number, it looks for records with the same specification name and applies some rules to make a prediction for the price. \n", 101 | "\n", 102 | "The component number and the specification name encode, among other information, the specific family of the component, e.g. whether it is an operational amplifier or an analog to digital converter. When there are no records with the same component number or specification name, the algorithm looks for records about similar components and apply some rules to make a prediction. So the next step in this case is to look for records with the same family path or the same most specific name in the family path. \n" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Data preparation for microcircuits\n", 110 | "Before implementing the algorithm to predict unit price and delivery time for a microcircuit, we have to extract the records from the sale orders data set and apply the following transformation\n", 111 | "\n", 112 | "1. Filter out the records about services (aka \"charges\") \n", 113 | "2. Select the records about microcircuits (family root -> microcircuits)\n", 114 | "3. Extract the most specific family of the component from the hierarchy (family path)\n", 115 | "4. Transform all the prices in euro\n", 116 | "5. Update the all the unit prices applying an increase of 5 % per year (using the compund interest formula) \n", 117 | "\n", 118 | "After the data is prepared we can implement the algorithms for microcircuits \n", 119 | "\n", 120 | "1. Price prediction\n", 121 | "2. Delivery time prediction\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": {}, 127 | "source": [ 128 | "#### 1. 
Filter out records about services" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 86, 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/plain": [ 139 | "18508" 140 | ] 141 | }, 142 | "execution_count": 86, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "sales = sales[sales['PRICE LABEL'] == 'MATERIAL UNIT PRICE']\n", 149 | "sales.index.size" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "#### 2. Select the records about microcircuits" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 87, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "Number of records for microcircuits: 3041\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "microcircuits_records = sales[sales['FamilyRoot'] == 'Microcircuits']\n", 174 | "num_microcircuits_records = microcircuits_records.index.size\n", 175 | "print(\"Number of records for microcircuits: \" + str(num_microcircuits_records))" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "#### 3. Extract the family root and leaf " 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 88, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "Family root: Microcircuits, Family leaf: Operational Amplifier\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "microcircuits_records['family_leaf'] = [family.split(\"/\")[len(family.split(\"/\")) - 1] for family in microcircuits_records['FamilyPath_Flight'] ]\n", 200 | "microcircuits_records['family_root'] = [family.split(\"/\")[0] for family in microcircuits_records['FamilyPath_Flight'] ]\n", 201 | "print(\"Family root: \" + microcircuits_records['family_root'][0] + \", Family leaf: \" + microcircuits_records['family_leaf'][0])" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "#### 4. Transform all the unit prices in US dollars to euros" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 89, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "microcircuits_records['price_euros'] = microcircuits_records['POP_UnitPrice_CU'] * microcircuits_records['PO_Change'] * (microcircuits_records['PO_Currency'] == 'USD')\n", 218 | "microcircuits_records['price_euros'] += microcircuits_records['POP_UnitPrice_CU'] * (microcircuits_records['PO_Currency'] == 'EUR')" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 90, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "#microcircuits_records['price_euros_simple'] = [price * microcircuits_records['PO_Change'] for price in microcircuits_records['POP_UnitPrice_CU']]" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "#### 5. Update the unit prices\n", 235 | "The date of purchase is used to compute the adjusted price (AP) from the unit price (P) in each record using the formula \n", 236 | "\n", 237 | "> AP = P*(1 + %)^Y\n", 238 | "\n", 239 | "where % is the increase in price per year, e.g. 5 %, Y is the number of years since the date of purchase in the record." 
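For example, a unit price P = 9.85 from a purchase made in 2013 (Y = 2018 - 2013 = 5) with a 5 % yearly increase gives AP = 9.85 * 1.05^5 ≈ 12.57, which matches the adjusted price printed by the next cell.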
240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 117, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "Price in 2013: 9.85. Adjusted price: 12.571373390625004\n" 252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "# Change the type of PO_Date from int64 to str\n", 257 | "years_str = pd.Series(microcircuits_records['PO_Date']).astype('str')\n", 258 | "# Extract the 1st 4 digits\n", 259 | "years_str = [year_str[0:4] for year_str in years_str]\n", 260 | "# Change back from str to int and compute the number of years from the purchase date to 2018\n", 261 | "years = [2018 - int(year_str) for year_str in years_str]\n", 262 | "microcircuits_records['years'] = years\n", 263 | "microcircuits_records['adjusted_price'] = microcircuits_records['price_euros'] * np.power(1 + 0.05, microcircuits_records['years'])\n", 264 | "print(\"Price in \" + str(2018 - microcircuits_records['years'][0]) + \": \" + str(microcircuits_records['price_euros'][0]) + \". Adjusted price: \" + str(microcircuits_records['adjusted_price'][0]))" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "## Price prediction for microcircuits\n", 272 | "The client will send all the 12 predictors, each mapped to a field in the dataset\n", 273 | "\n", 274 | "1. Component number\n", 275 | "2. Specification name\n", 276 | "3. Family path\n", 277 | "4. Style \n", 278 | "5. Quality level\n", 279 | "6. Package class\n", 280 | "7. Package\n", 281 | "8. Finish \n", 282 | "9. Radiation level\n", 283 | "10. Quality Value Name\n", 284 | "11. Manufacturer\n", 285 | "12. Quantity\n", 286 | "\n", 287 | "In order to make a prediction the algorithm must find some records about the same component or a similar one in the sale orders. The following three scenarios might happen\n", 288 | "\n", 289 | "1. The component number in the request matches with a component number in the dataset\n", 290 | "2. The component number in the request does not match with any in the dataset but the specification name matches\n", 291 | "3. Neither the component number nor the specification name in the request matches with a record in the sale orders" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "### Scenario 1\n", 299 | "The component number in the request matches with a component number in the datase.\n", 300 | "\n", 301 | "In this scenario the only parameters to use to predict the price are\n", 302 | "\n", 303 | "- Manufacturer\n", 304 | "- Date of purchase\n", 305 | "- Quantity\n", 306 | "\n", 307 | "The manufacturer is used to select the records with the same manufacturer. If the manufacturer is different the records with the different manufacturer will be used. \n", 308 | "\n", 309 | "Compute the average adjusted price for the same quantity in the sale orders and the standard deviation. This step can be performed in the data preparation phase.\n", 310 | "\n", 311 | "If more than one records are availabe with different quantities use a linear interpolation average adjusted prices to find the average adjusted price for the quantity requested. 
If only one record is available, returns the adjusted price.\n" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "### Scenario 2\n", 319 | "The component number in the request does not match with any in the dataset but the specification name matches.\n", 320 | "\n", 321 | "In this scenario the algorithm must select the records that are about a similar component using the specification name and the family path.\n", 322 | "\n", 323 | "It must also filter the records, about the same specification name and family path" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "### Scenario 3\n", 331 | "Neither the component number nor the specification name in the request matches with a record in the sale orders" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [] 340 | } 341 | ], 342 | "metadata": { 343 | "kernelspec": { 344 | "display_name": "Python 3", 345 | "language": "python", 346 | "name": "python3" 347 | }, 348 | "language_info": { 349 | "codemirror_mode": { 350 | "name": "ipython", 351 | "version": 3 352 | }, 353 | "file_extension": ".py", 354 | "mimetype": "text/x-python", 355 | "name": "python", 356 | "nbconvert_exporter": "python", 357 | "pygments_lexer": "ipython3", 358 | "version": "3.6.4" 359 | } 360 | }, 361 | "nbformat": 4, 362 | "nbformat_minor": 2 363 | } 364 | -------------------------------------------------------------------------------- /python/linalgebra/linalgebra_ch1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "9105420b", 6 | "metadata": {}, 7 | "source": [ 8 | "# Linear Algebra\n", 9 | "This notebook contains examples of linear algebra in Python. The examples are based on Gilbert Strang's book [Introduction to Linear Algebra, 5th Edition](https://math.mit.edu/~gs/linearalgebra/) and on Robert Johansson's book [Numerical Python](https://jrjohansson.github.io/numericalpython.html)." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 12, 15 | "id": "02642e2c", 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "import numpy as np\n", 20 | "import math\n", 21 | "import matplotlib.pyplot as plt" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "id": "38696b67", 27 | "metadata": {}, 28 | "source": [ 29 | "## Chapter 1" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "id": "14367992", 35 | "metadata": {}, 36 | "source": [ 37 | "### 1.1 Vectors and Linear Combinations" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 6, 43 | "id": "7321c736", 44 | "metadata": {}, 45 | "outputs": [ 46 | { 47 | "data": { 48 | "text/plain": [ 49 | "(array([1, 1]), array([2, 3]))" 50 | ] 51 | }, 52 | "execution_count": 6, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "v = np.array([1,1])\n", 59 | "w = np.array([2,3])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "id": "3dd71d29", 65 | "metadata": {}, 66 | "source": [ 67 | "we can add two vectors" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 8, 73 | "id": "6ea79ae8", 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "data": { 78 | "text/plain": [ 79 | "array([3, 4])" 80 | ] 81 | }, 82 | "execution_count": 8, 83 | "metadata": {}, 84 | "output_type": "execute_result" 85 | } 86 | ], 87 | "source": [ 88 | "v + w" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "id": "1b664a55", 94 | "metadata": {}, 95 | "source": [ 96 | "we can multiply a vector by a scalar" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 9, 102 | "id": "f1a51219", 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "array([2, 2])" 109 | ] 110 | }, 111 | "execution_count": 9, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "2 * v" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "id": "92622bf0", 123 | "metadata": {}, 124 | "source": [ 125 | "we can compute a linear combination of two vectors" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 43, 131 | "id": "90f7d9b9", 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAZYAAAEKCAYAAAAxXHOuAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8/fFQqAAAACXBIWXMAAAsTAAALEwEAmpwYAAAZaElEQVR4nO3df5RcdZnn8fdDhyAJMPwKvxKCwEZy4hAYbAMrKAMsDIGRiIMLQYybBXOiZpXdZWdyDqt4dhQHx2XPsiDYODmDuwPIDCA5KxJcxnN0BDQdJYHwyxiDdAImQRBBSNLm2T+qklQq1Z3q9L1d1d3v1zl9uuve+6083lz85D5PVXVkJpIkFWWvVhcgSRpZDBZJUqEMFklSoQwWSVKhDBZJUqEMFklSoUoNlog4PyKei4hVEbGwwf5ZEbEiIp6IiO6IOKPZtZKk9hRlvY8lIjqA54FzgR5gKTA7M5+uOWY/4M3MzIiYDtyTmVObWStJak9l3rHMAFZl5urM3AzcDcyqPSAz38gdyTYeyGbXSpLa05gSn3si8GLN4x7g1PqDIuJi4MvAYcCFA1lbXT8PmAcwfvz490ydOnXQhUvSaLFs2bKNmTmhyOcsM1iiwbZd+m6ZeT9wf0R8APhr4N80u7a6vgvoAujs7Mzu7u49LliSRpuIeKHo5yyzFdYDHF3zeBKwrq+DM/MHwPERcehA10qS2keZwbIUmBIRx0bEWOAyYHHtARHxryIiqj+fAowFXmlmrSSpPZXWCsvM3ohYACwBOoBFmbkyIuZX998G/AUwJyK2AG8Bl1aH+Q3XllWrJKk4pb3cuBWcsUjSwETEsszsLPI5fee9JKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUAaLJKlQBoskqVAGiySpUKUGS0ScHxHPRcSqiFjYYP9HI2JF9evRiDipZt+aiHgyIp6IiO4y65QkFWdMWU8cER3ALcC5QA+wNCIWZ+bTNYf9EjgzM1+NiJlAF3Bqzf6zMnNjWTVKkopX5h3LDGBVZq7OzM3A3cCs2gMy89HMfLX68HFgUon1SJKGQJnBMhF4seZxT3VbX64EvlvzOIGHI2JZRMwroT5JUglKa4UB0WBbNjww4iwqwXJGzebTM3NdRBwGfC8ins3MHzRYOw+YBzB58uTBVy1JGpQy71h6gKNrHk8C1tUfFBHTgW8AszLzlW3bM3Nd9ft64H4qrbVdZGZXZnZmZueECRMKLF+StCfKDJalwJSIODYixgKXAYtrD4iIycB9wMcy8/ma7eMjYv9tPwPnAU+VWKskqSCltcIyszciFgBLgA5gUWaujIj51f23AZ8HDgG+FhEAvZnZCRwO3F/dNga4MzMfKqtWSVJxIrPh2GNY6uzszO5u3/IiSc2KiGXVf9AXxnfeS5IKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKZbBIkgplsEiSCmWwSJIKVWqwRMT5EfFcRKyKiIUN9n80IlZUvx6NiJOaXStJak+lBUtEdAC3ADOBacDsiJhWd9gvgTMzczrw10DXANZKktpQmXcsM4BVmbk6MzcDdwOzag/IzEcz89Xqw8eBSc2ulSS1pzKDZSLwYs3jnuq2vlwJfHegayNiXkR0R0T3hg0bBlGuJKkIZQZLNNiWDQ+MOItKsPzVQNdmZldmdmZm54QJE/aoUElSccaU+Nw9wNE1jycB6+oPiojpwDeAmZn5ykDWSpLaT5l3LEuBKRFxbESMBS4DFtceEBGTgfuAj2Xm8wNZK0lqT6XdsWRmb0QsAJYAHcCizFwZEfOr+28DPg8cAnwtIgB6q22thmvLqlWSVJzIbDi6GJY6Ozuzu7u71WVI0rAREcsys7PI5/Sd95KkQhkskqRCGSySpEIZLJKkQhksKsXW3MpIemGIpOYZLCrcS797ic/98+eovoRc0ihjsKhQS1Yt4eSvn8xR+x/V6lIktYjBokJs+cMWFv6/hZz/D+fz6luvcukfX9rqkiS1SJmfFaZR4oXXXmD2vbN5rOcxAC5814UcOu7QFlclqVUMFg3Kfc/cx5WLr+S1t1/bvm3O9DmtK0hSyxks2iNv977NNQ9fwy1Lb9lp+8H7HswFUy5oUVWS2oHBogF7/pXnufSfLuWJl5/YZd9l776MfcbsM/RFSWobDu81IKtfXc3se2ez/OXlDffPOck2mDTaGSwakOMOOo5l85bx7IJn2XfMvjvte9ch72LGxBktqkxSuzBYNGC9W3u5avFVvNX7FgAHvuNAoDK0902RkgwWDdh137+OH/7qhwDMPXkuX7vgawBcMf2KVpYlqU04vNeALFm1hOv/5XoA3j3h3dx8wc3s07EPP/zVDznmwGNaXJ2kdmCwqGlrX1/LFfdX7krG7T2Oez5yD+P2HgfATTNvamVpktqIrTA1pXdrL7Pvnc3G328E4NYLb2XahGnb94/Zy3+jSKowWNSU+rmKLyuW1BeDRbvVaK4iSX0xWNSv/uYqktSIwaI+7W6uIkmNGCzqk3MVSXvCYFFDzlUk7SmDRbtwriJpMAwW7cS5iqTBKjVYIuL8iHguIlZFxMIG+6dGxGMRsSkirqnbtyYinoyIJyKiu8w6tYNzFUmDVdrbpSOiA7gFOBfoAZZGxOLMfLrmsN8AnwE+1MfTnJWZG8uqUTtzriKpCLu9Y4mIBRFx0B489wxgVWauzszNwN3ArNoDMnN9Zi4FtuzB86tAzlUkFaWZVtgRVO427qm2tpr9hRsTgRdrHvdUtzUrgYcjYllEzOvroIiYFxHdEdG9YcOGATy9tnGuIqlIuw2WzPyvwBTg74B/B/w8Iq6PiON3s7RRAOUAajs9M08BZgKfjogP9FFfV2Z2ZmbnhAkTBvD02sa5iqQiNTW8z8wEXq5+9QIHAf8UEV/pZ1kPcHTN40nAumYLy8x11e/rgfuptNZUMOcqkorWzIzlMxGxDPgK8CPgxMz8JPAe4C/6WboUmBIRx0bEWOAyYHEzRUXE+IjYf9vPwHnAU82sVfOcq0gqQzOvCjsU+HBmvlC7MTO3RsSf97UoM3sjYgGwBOgAFmXmyoiYX91/W0QcAXQDBwBbI+JqYFr1z7y/Os4ZA9yZmQ8N+H+d+uRcRVJZdhssmfn5fvY9s5u1DwIP1m27rebnl6m0yOq9Dpy0u9q055yrSCqL77wfhZyrSCqTwTLKOFeRVDaDZYTZtAmefrrxPucqkoaCwTKCrF8P55wDW/r4HAPnKpKGQmmfFaahtWIFfPCDlZ+nT991v3MVSUPFO5YR4Nvfhve9D371K7joIqj/0B3nKpKGks
EyjGXC9dfDxRfDm29Wtl100c7HOFeRNNRshQ1Tb70FV14Jd921Y9sBB8CZZ+58nHMVSUPNYBmG1q2DD30Ili7defvMmTB27I7HzlUktYLBMsx0d8OsWZVwqVfbBnOuIqlVnLEMI889V2l/bWzwOzU7Oip3LOBcRVJrGSzDyAknwPLl8OyzsO++O+8780w4qPp7Pp2rSGolg2WYyYRPfKIyvAc48cTK921tMOcqklrNYBlmurrgkUcqP8+dCw88APvsU3lzpHMVSe3A4f0wsmYNXHNN5eeJE+HGG+HAA2HRIpj8zl7OvsO5iqTWM1iGiUy46ip4443K49tvr4QKwOWXw7WPOFeR1B5shQ0T9S2wba8AA+cqktqLwTIMNGqBbeNcRVK7MVjaXH8tMN+vIqkdGSxtrr8WmO9XkdSODJY21l8LzLmKpHZlsLSp/lpgzlUktTODpU311QJzriKp3Rksbai/FphzFUntzmBpM/21wJyrSBoODJY201cLzLmKpOHCYGkjfbXAnKtIGk5KDZaIOD8inouIVRGxsMH+qRHxWERsiohrBrJ2pOmvBeZcRdJwUlqwREQHcAswE5gGzI6I+n9m/wb4DPDVPVg7ovTVAnOuImm4KfOOZQawKjNXZ+Zm4G5gVu0Bmbk+M5cCWwa6diTpqwXmXEXScFRmsEwEXqx53FPdVujaiJgXEd0R0b1hw4Y9KrSV+mqBOVeRNFyVGSzRYFsWvTYzuzKzMzM7J0yY0HRx7aKvFphzFUnDVZnB0gMcXfN4ErBuCNYOG321wJyrSBrOygyWpcCUiDg2IsYClwGLh2DtsNBXC8y5iqThrrRfTZyZvRGxAFgCdACLMnNlRMyv7r8tIo4AuoEDgK0RcTUwLTNfb7S2rFpboVELzLmKpJEgMpsde7S/zs7O7O7ubnUZu7VmDZx4YuVuZeJEeOqpyt3KtY9cu70FNvfkuSyataildUoa+SJiWWZ2FvmcvvN+iPXVAnOuImmkMFiGWKMWmHMVSSOJwTKEGr0KzLmKpJGmtOG9dtZXC+zaR3y/iqSRxTuWIdKoBeZcRdJIZLAMgUYtMOcqkkYqg6VkjVpg+x3gXEXSyOWMpWSNWmDOVSSNZN6xlKhRC8y5iqSRzmApSaMW2Jt7OVeRNPLZCitJfQvs3D/r5ew7nKtIGvm8YylBoxaYv19F0mhhsBSsUQvsxxudq0gaPWyFFay+BTb99LWc/HXnKpJGD4OlQPUtsK98tZcP+34VSaOMrbCCNGqB/Y+fOVeRNPoYLAWpb4HtNcW5iqTRyVZYAepbYNf8t7WceZdzFUmjk3csg1TfArutq5f533OuImn0MlgGqb4F9tg+zlUkjW62wgahvgV24WeXcMm3natIGt28Y9lD9S2wL9+ylvkPO1eRJINlD9W2wD4+t5fbf+NcRZLAYNkj9S2wQy5xriJJ2xgsA1TfApv/t0u4calzFUnaxmAZoNoW2L+9ai3/s8e5iiTVMlgGoLYFdtSkXl6c4VxFkuqVGiwRcX5EPBcRqyJiYYP9ERE3VfeviIhTavatiYgnI+KJiOgus85m1LfA3v+563hsnXMVSapX2vtYIqIDuAU4F+gBlkbE4sx8uuawmcCU6tepwK3V79uclZkby6pxIGpbYOd9cgnfesm5iiQ1UuYdywxgVWauzszNwN3ArLpjZgHfzIrHgQMj4sgSa9ojtS2wI6asZdk7natIUl/KDJaJwIs1j3uq25o9JoGHI2JZRMwrrcrd2KkFtlcvh8yfzStvOVeRpL6U+ZEu0WBbDuCY0zNzXUQcBnwvIp7NzB/s8odUQmcewOTJkwdTb0O1LbDpn72OFb9zriJJ/SnzjqUHOLrm8SRgXbPHZOa27+uB+6m01naRmV2Z2ZmZnRMmTCio9IraFtghpy5hxR85V5Gk3SkzWJYCUyLi2IgYC1wGLK47ZjEwp/rqsNOA32bmSxExPiL2B4iI8cB5wFMl1rqLnVpg+69lywd3M1fp7YXvfKeyUJJGsdKCJTN7gQXAEuAZ4J7MXBkR8yNifvWwB4HVwCrgduBT1e2HA/8SEcuBnwDfycyHyqq1ke0tsL16OfzTs3m9t4+5SiY88ABMn165xYlG3T1JGj0iR9C/sDs7O7O7e/BveVmzBk48sXK3st+sa3njTyotsLknz2XRrEU7DvzRj+Av/xIefRSOPx6efhrGjh30ny9JQyUilmVmZ5HP6Tvv6+zUAjt+yfZQ2WmusnIlzJoFZ5xRCRWAL33JUJEk/EVfu9jeAtt/LftcfgWbqJmrvPwKfOE/wN//PWzdumPRe94DH/lIiyqWpPZisNTY/iqwvXoZe/lsNnVU5yp/+lWmffUOuOkmePvtXRfecAPs5c2fJIHBst1OLbCzr2PzkdX3q7zrUuZ8dhH0Nbs591w455yhK1SS2pz/zK7a3gI7fgl8oGaucski+MlP4KGHYEyDHP6bvxnaQiWpzRks1LTA9l/LXpc0eL/Kxo2VA3p7d144ezaccsouzydJo9moD5btLbDf98Ils9m6b937VTZsgLPPhqeq78+cPRsOOgj23hu++MUWVi5J7WnUz1i2t8DOvg6OqfscsPpQufxy+OY34ROfgP32g+OOa13hktSmRnWwbG+B1c9VLri571Dp6IC5c+GEE1pWtyS1s1EbLNtbYLEWPlw3V3ntzb5DBeD9729R1ZLU/kbtjKWrCx75fmWuwviauQoT+g8VSVK/RmWwbG+B/WndXOWomYaKJA3SqAuW7S2ww+vmKu+9zlCRpAKMumDp6oJHflI3VznnNsb92Z8bKpJUgFEVLGvWwH/+L3VzlfffwLRLPmmoSFJBRk2wbGuBvfnemrnK1NnM+dTXDRVJKtCoCZauLnhkTc1c5eCp3Pzl5YaKJBVsVATLmjXwn75QM1fpGMc9d/UybvnTlQMMFUkqzIgPlkz491f18vuZNXOVxw5i2mOrKgcYKpJUqBEfLF1d8P2smav88kDmPLi2stNQkaTCjehgWbMGrr55x1xl6m/35eY7X6vsNFQkqRQjNlgy4YpPreXtmZW5yj5bxnDv/3mLcVswVCSpRCM2WG79ei8/OmLHXKXr//YybQOGiiSVbEQGy5o1cPUDO+YqH/3Z3sxZjqEiSUNgxAVLJlx8zRK2nFaZqxyzfj+6HtxiqEjSEBlxwXLDrWt54rjKXGXM5rE8+I9vMO4jhookDZURFSybNiXX/nTHXOV/faeDaecaKpI0lEbUb5B8/qV1bD36ZQDO/dlE5r/7TENFkoZYqXcsEXF+RDwXEasiYmGD/RERN1X3r4iIU5pd28jmsZVQOXD9UXx7v9MNFUlqgdKCJSI6gFuAmcA0YHZETKs7bCYwpfo1D7h1AGsb/7mb9+W7v/3XjLvjTkNFklqgzDuWGcCqzFydmZuBu4FZdcfMAr6ZFY8DB0bEkU2ubeg//uKDnHbHtwwVSWqRMmcsE4EXax73AKc2cczEJtcCEBHzqNztAGy68Vv3PHXjt+4ZRNmlOxTY2OoimmCdxbLOYllncU4o+gnLDJZosC2bPKaZt
ZWNmV1AF0BEdGdm50CKHGrDoUawzqJZZ7GsszgR0V30c5YZLD3A0TWPJwHrmjxmbBNrJUltqMwZy1JgSkQcGxFjgcuAxXXHLAbmVF8ddhrw28x8qcm1kqQ2VNodS2b2RsQCYAnQASzKzJURMb+6/zbgQeACYBXwe2Buf2ub+GO7iv9fUrjhUCNYZ9Gss1jWWZzCa4zMhqMLSZL2yIj6SBdJUusZLJKkQrVtsJTxcTARcXBEfC8ifl79flCr6oyIoyPi+xHxTESsjIjP1qz5QkSsjYgnql8XtKrO6r41EfFktZbumu2Fns9BnMsTas7VExHxekRcXd3XinM5NSIei4hNEXFNM2tbdG02rLMNr83+zueQXJuDqbMNr8+PVv/7WRERj0bESbtbO+DzmZlt90VlYP8L4DgqLz1eDkyrO+YC4LtU3vNyGvDj3a0FvgIsrP68ELihhXUeCZxS/Xl/4PmaOr8AXNMO57O6bw1waIPnLex8DrbGuud5GTimhefyMOC9wJdq/+w2vDb7qrPdrs2GdQ7VtVlEnW12fb4POKj680xK+P/Odr1jKevjYGYBd1R/vgP4UKvqzMyXMvOnAJn5O+AZKp84UIbBnM/+FHk+i6rxHOAXmfnCIGoZVJ2ZuT4zlwJbBrB2yK/Nvupst2uzn/PZn7Y5n3Xa4fp8NDNfrT58nMr7BHe3dkDns12Dpa+PemnmmP7WHp6V98lQ/X5YC+vcLiLeCfwJ8OOazQuqt6qLCriNH2ydCTwcEcui8hE62xR5Pgs5l1Te83RX3bahPpd7srYV1+Zutcm12Z+huDaLqHObdrs+r6TSBdjd2gGdz3YNliH5OJgCDKbOys6I/YB7gasz8/Xq5luB44GTgZeA/97iOk/PzFOo3DZ/OiI+MMh6GiniXI4FLgL+sWZ/K85lGWsHatB/Vhtdm/0ZimsTijmfbXV9RsRZVILlrwa6dnfaNVgG83Ew/a399bbWSfX7+hbWSUTsTeU/3H/IzPu2HZCZv87MP2TmVuB2KreoLaszM7d9Xw/cX1NPkedzUDVWzQR+mpm/3rahRedyT9a24trsU5tdm30aomtz0HVWtc31GRHTgW8AszLzlSbWDuh8tmuwlPVxMIuBj1d//jjwQKvqjIgA/g54JjNvrF1QNze4GHiqhXWOj4j9q3WNB86rqafI8zmYv/NtZlPXZmjRudyTta24Nhtqw2uzrzqH6tocVJ012uL6jIjJwH3AxzLz+SbXDux8NvNKg1Z8UXkF0PNUXqVwbXXbfGB+9eeg8svAfgE8CXT2t7a6/RDgEeDn1e8Ht6pO4Awqt5krgCeqXxdU9/3v6rErqn+hR7awzuOovDpkObCyzPM5yL/zccArwB/VPWcrzuURVP719zrwWvXnA9rw2mxYZxtem33VOWTXZgF/7+10fX4DeLXm77a7v7V7cj79SBdJUqHatRUmSRqmDBZJUqEMFklSoQwWSVKhDBZJUqEMFklSoQwWSVKhDBapJBHx3uqHC76j+i7xlRHxx62uSyqbb5CUShQRXwTeAewL9GTml1tcklQ6g0UqUfUzl5YCbwPvy8w/tLgkqXS2wqRyHQzsR+U3Mb6jxbVIQ8I7FqlEEbGYym/iO5bKBwwuaHFJUunGtLoAaaSKiDlAb2beGREdwKMRcXZm/nOra5PK5B2LJKlQzlgkSYUyWCRJhTJYJEmFMlgkSYUyWCRJhTJYJEmFMlgkSYX6/7IN1OLRibb1AAAAAElFTkSuQmCC\n", 137 | "text/plain": [ 138 | "
" 139 | ] 140 | }, 141 | "metadata": { 142 | "needs_background": "light" 143 | }, 144 | "output_type": "display_data" 145 | } 146 | ], 147 | "source": [ 148 | "V = np.array([v, w, v + w])\n", 149 | "origin = np.array([[0, 0, 0],[0, 0, 0]]) # origin point\n", 150 | "fig, ax = plt.subplots()\n", 151 | "ax.quiver(*origin, V[:,0], V[:,1], color=['r','b','g'], scale=10)\n", 152 | "ax.set(xlim=(0, 0.2), ylim=(0, 0.3))\n", 153 | "plt.xlabel('x')\n", 154 | "plt.ylabel('y')\n", 155 | "plt.show()\n" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "id": "8cfe903f", 161 | "metadata": {}, 162 | "source": [ 163 | "### 1.2 Lengths and Dot products\n", 164 | "The inner product of two vectors is implemented in NumPy by the dot() function." 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 20, 170 | "id": "59305e51", 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "(3, 2)" 177 | ] 178 | }, 179 | "execution_count": 20, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "data = np.array([[1, 2], [3, 4], [5, 6]])\n", 186 | "rows, cols = data.shape\n", 187 | "rows, cols" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "id": "200ad0db", 193 | "metadata": {}, 194 | "source": [ 195 | "Two vectors whose internal product is zero are orthogonal" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 9, 201 | "id": "20ff3e58", 202 | "metadata": {}, 203 | "outputs": [ 204 | { 205 | "data": { 206 | "text/plain": [ 207 | "0" 208 | ] 209 | }, 210 | "execution_count": 9, 211 | "metadata": {}, 212 | "output_type": "execute_result" 213 | } 214 | ], 215 | "source": [ 216 | "v = np.array([1, 3, 2])\n", 217 | "w = np.array([4, -4, 4])\n", 218 | "np.dot(v, w)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "id": "b338209e", 224 | "metadata": {}, 225 | "source": [ 226 | "The length of a vector is defined as $||\\vec{v}|| = \\sqrt{\\vec{v} \\cdot \\vec{v}} $" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 13, 232 | "id": "d0540e9f", 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "3.7416573867739413" 239 | ] 240 | }, 241 | "execution_count": 13, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "math.sqrt(np.dot(v,v))" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "id": "79216113", 253 | "metadata": {}, 254 | "source": [ 255 | "so that the unit vector\n", 256 | "\n", 257 | "$$ \\frac{\\vec{v}}{||\\vec{v}||} $$\n", 258 | "\n", 259 | "has length 1" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "id": "17eeb99d", 265 | "metadata": {}, 266 | "source": [ 267 | "The angle $\\theta$ between two vectors $\\vec{v}$ and $\\vec{w}$ is defined as\n", 268 | "\n", 269 | "$$ \\cos{\\theta} = \\frac{\\vec{v} \\cdot \\vec{w}}{||\\vec{v}|| ||\\vec{w}||} $$" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 16, 275 | "id": "550c437f", 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/plain": [ 281 | "0.7071067811865475" 282 | ] 283 | }, 284 | "execution_count": 16, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "v = np.array([1, 0])\n", 291 | "w = np.array([1, 1])\n", 292 | "len_v = math.sqrt(np.dot(v, v))\n", 293 | "len_w = math.sqrt(np.dot(w, w))\n", 294 | "np.dot(v, w) / (len_v * len_w)" 295 | ] 296 | 
}, 297 | { 298 | "cell_type": "markdown", 299 | "id": "a095e2a2", 300 | "metadata": {}, 301 | "source": [ 302 | "### 1.3 Matrices\n", 303 | "A matrix is a list of vectors\n", 304 | "\n", 305 | "$$ A = \\begin{bmatrix} 1 & 2 \\\\ 3 & 4 \\\\ 5 & 6 \\end{bmatrix}$$\n", 306 | "\n", 307 | "We can combine the matrix vectors by computig the inner product between the matrix and a vector, e.g. $\\vec{x} = [7, 8]$ that represent how we want to combine the matrix vectors\n", 308 | "\n", 309 | "$$ \\begin{bmatrix} 1 & 2 \\\\ 3 & 4 \\\\ 5 & 6 \\end{bmatrix} \\begin{bmatrix} 7 \\\\ 8 \\end{bmatrix} = \\begin{bmatrix} 23 \\\\ 53 \\\\ 83 \\end{bmatrix}$$" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 22, 315 | "id": "9cf3b36d", 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "(3, 2)" 322 | ] 323 | }, 324 | "execution_count": 22, 325 | "metadata": {}, 326 | "output_type": "execute_result" 327 | } 328 | ], 329 | "source": [ 330 | "A = np.array([[1, 2], [3, 4], [5, 6]])\n", 331 | "A.shape # rows, columns" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 25, 337 | "id": "4a5edfb1", 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "data": { 342 | "text/plain": [ 343 | "array([23, 53, 83])" 344 | ] 345 | }, 346 | "execution_count": 25, 347 | "metadata": {}, 348 | "output_type": "execute_result" 349 | } 350 | ], 351 | "source": [ 352 | "x = np.array([7, 8])\n", 353 | "np.dot(A, x)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "id": "2856f488", 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [] 363 | } 364 | ], 365 | "metadata": { 366 | "kernelspec": { 367 | "display_name": "Python 3 (ipykernel)", 368 | "language": "python", 369 | "name": "python3" 370 | }, 371 | "language_info": { 372 | "codemirror_mode": { 373 | "name": "ipython", 374 | "version": 3 375 | }, 376 | "file_extension": ".py", 377 | "mimetype": "text/x-python", 378 | "name": "python", 379 | "nbconvert_exporter": "python", 380 | "pygments_lexer": "ipython3", 381 | "version": "3.7.12" 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 5 386 | } 387 | -------------------------------------------------------------------------------- /python/finance/data/ENI.MI.csv: -------------------------------------------------------------------------------- 1 | Date,Open,High,Low,Close,Adj Close,Volume 2 | 2021-08-23,10.132000,10.234000,10.092000,10.234000,9.527454,15205357 3 | 2021-08-24,10.288000,10.358000,10.248000,10.318000,9.605656,13473152 4 | 2021-08-25,10.304000,10.340000,10.274000,10.330000,9.616826,8469631 5 | 2021-08-26,10.290000,10.368000,10.252000,10.306000,9.594483,10300966 6 | 2021-08-27,10.338000,10.450000,10.318000,10.428000,9.708061,14166843 7 | 2021-08-30,10.448000,10.492000,10.410000,10.432000,9.711784,8640464 8 | 2021-08-31,10.450000,10.472000,10.380000,10.454000,9.732265,18640305 9 | 2021-09-01,10.492000,10.616000,10.480000,10.496000,9.771366,19475031 10 | 2021-09-02,10.536000,10.680000,10.502000,10.632000,9.897977,17048493 11 | 2021-09-03,10.630000,10.660000,10.512000,10.530000,9.803019,14713759 12 | 2021-09-06,10.572000,10.678000,10.540000,10.586000,9.855153,10952359 13 | 2021-09-07,10.602000,10.664000,10.550000,10.582000,9.851429,11697761 14 | 2021-09-08,10.570000,10.694000,10.462000,10.590000,9.858876,18936416 15 | 2021-09-09,10.580000,10.654000,10.528000,10.586000,9.855153,13919894 16 | 2021-09-10,10.620000,10.652000,10.532000,10.544000,9.816052,12568028 17 | 
2021-09-13,10.608000,10.848000,10.602000,10.806000,10.059963,28029686 18 | 2021-09-14,10.824000,11.008000,10.824000,10.904000,10.151198,21974204 19 | 2021-09-15,10.940000,11.068000,10.926000,10.990000,10.231260,20646537 20 | 2021-09-16,11.032000,11.198000,11.004000,11.024000,10.262913,30759090 21 | 2021-09-17,11.110000,11.164000,10.924000,10.942000,10.186575,36040734 22 | 2021-09-20,10.550000,10.584000,10.330000,10.414000,10.091608,26213320 23 | 2021-09-21,10.458000,10.708000,10.452000,10.606000,10.277664,19744092 24 | 2021-09-22,10.738000,10.928000,10.700000,10.886000,10.548996,22755868 25 | 2021-09-23,10.980000,10.998000,10.832000,10.902000,10.564502,15922933 26 | 2021-09-24,10.920000,10.978000,10.860000,10.932000,10.593573,12280924 27 | 2021-09-27,11.000000,11.208000,10.992000,11.186000,10.839709,22930665 28 | 2021-09-28,11.282000,11.438000,11.260000,11.264000,10.915295,28629086 29 | 2021-09-29,11.260000,11.404000,11.150000,11.388000,11.035456,17341462 30 | 2021-09-30,11.412000,11.578000,11.402000,11.546000,11.188564,27298469 31 | 2021-10-01,11.416000,11.544000,11.302000,11.482000,11.126546,19563954 32 | 2021-10-04,11.492000,11.758000,11.462000,11.642000,11.281592,25481124 33 | 2021-10-05,11.700000,11.914000,11.632000,11.848000,11.481215,28265202 34 | 2021-10-06,11.848000,11.848000,11.596000,11.666000,11.304850,22966517 35 | 2021-10-07,11.750000,11.798000,11.386000,11.588000,11.229264,25591317 36 | 2021-10-08,11.696000,11.900000,11.652000,11.858000,11.490906,25139901 37 | 2021-10-11,11.940000,12.072000,11.846000,12.006000,11.634323,22362197 38 | 2021-10-12,11.900000,12.012000,11.828000,11.974000,11.603314,16097752 39 | 2021-10-13,11.974000,11.992000,11.690000,11.822000,11.456019,19824484 40 | 2021-10-14,11.944000,12.046000,11.932000,11.980000,11.609128,19042947 41 | 2021-10-15,12.078000,12.226000,12.060000,12.208000,11.830070,21210629 42 | 2021-10-18,12.216000,12.304000,12.186000,12.236000,11.857203,17152230 43 | 2021-10-19,12.258000,12.294000,12.156000,12.172000,11.795185,16104721 44 | 2021-10-20,12.170000,12.350000,12.156000,12.318000,11.936666,13570949 45 | 2021-10-21,12.268000,12.318000,12.110000,12.134000,11.758361,15135627 46 | 2021-10-22,12.138000,12.218000,12.052000,12.078000,11.704095,13108483 47 | 2021-10-25,12.166000,12.336000,12.112000,12.278000,11.897903,14628741 48 | 2021-10-26,12.290000,12.500000,12.146000,12.358000,11.975427,19923852 49 | 2021-10-27,12.338000,12.442000,12.192000,12.268000,11.888213,19233474 50 | 2021-10-28,12.174000,12.224000,12.020000,12.164000,11.787433,16985428 51 | 2021-10-29,12.194000,12.484000,12.180000,12.404000,12.020003,26402029 52 | 2021-11-01,12.478000,12.796000,12.450000,12.746000,12.351416,24242601 53 | 2021-11-02,12.746000,12.832000,12.452000,12.540000,12.151793,21518514 54 | 2021-11-03,12.490000,12.496000,12.292000,12.380000,11.996746,16475518 55 | 2021-11-04,12.412000,12.640000,12.412000,12.560000,12.171174,16348057 56 | 2021-11-05,12.454000,12.740000,12.454000,12.672000,12.279706,14126171 57 | 2021-11-08,12.720000,12.834000,12.616000,12.682000,12.289397,13584199 58 | 2021-11-09,12.646000,12.754000,12.568000,12.620000,12.229316,10066288 59 | 2021-11-10,12.674000,12.826000,12.660000,12.664000,12.271954,16042075 60 | 2021-11-11,12.600000,12.698000,12.536000,12.646000,12.254511,10370345 61 | 2021-11-12,12.600000,12.620000,12.446000,12.512000,12.124660,13719125 62 | 2021-11-15,12.440000,12.586000,12.412000,12.546000,12.157606,10901389 63 | 2021-11-16,12.602000,12.698000,12.560000,12.598000,12.207996,12105355 64 | 
2021-11-17,12.574000,12.666000,12.542000,12.592000,12.202183,10063683 65 | 2021-11-18,12.450000,12.488000,12.310000,12.430000,12.045198,18750020 66 | 2021-11-19,12.620000,12.762000,12.136000,12.168000,11.791309,27790201 67 | 2021-11-22,12.200000,12.360000,12.072000,12.328000,11.946356,16066193 68 | 2021-11-23,12.208000,12.472000,12.152000,12.380000,11.996746,14566945 69 | 2021-11-24,12.450000,12.552000,12.258000,12.370000,11.987056,14142661 70 | 2021-11-25,12.390000,12.398000,12.244000,12.296000,11.915346,10793759 71 | 2021-11-26,11.800000,11.828000,11.492000,11.530000,11.173059,36240991 72 | 2021-11-29,11.786000,12.016000,11.668000,11.800000,11.434702,22433835 73 | 2021-11-30,11.546000,11.700000,11.448000,11.642000,11.281592,26761672 74 | 2021-12-01,11.830000,12.060000,11.766000,11.904000,11.535482,17637484 75 | 2021-12-02,11.790000,11.986000,11.706000,11.940000,11.570367,18137069 76 | 2021-12-03,12.090000,12.214000,12.008000,12.010000,11.638201,18439109 77 | 2021-12-06,12.120000,12.360000,12.120000,12.292000,11.911470,13925642 78 | 2021-12-07,12.430000,12.616000,12.402000,12.568000,12.178926,15369236 79 | 2021-12-08,12.480000,12.550000,12.360000,12.390000,12.006436,11415930 80 | 2021-12-09,12.410000,12.422000,12.168000,12.266000,11.886275,12442861 81 | 2021-12-10,12.184000,12.366000,12.176000,12.206000,11.828133,8227832 82 | 2021-12-13,12.260000,12.278000,11.978000,12.020000,11.647891,12390937 83 | 2021-12-14,12.044000,12.186000,12.000000,12.148000,11.771928,11456932 84 | 2021-12-15,12.076000,12.120000,11.914000,12.000000,11.628510,14308412 85 | 2021-12-16,12.158000,12.266000,12.060000,12.198000,11.820380,17590204 86 | 2021-12-17,12.104000,12.176000,11.922000,12.052000,11.678900,19831352 87 | 2021-12-20,11.700000,11.834000,11.522000,11.834000,11.467649,16531786 88 | 2021-12-21,11.990000,12.198000,11.918000,12.180000,11.802938,11864621 89 | 2021-12-22,12.182000,12.196000,11.992000,12.174000,11.797123,9485605 90 | 2021-12-23,12.194000,12.338000,12.156000,12.240000,11.861080,9547017 91 | 2021-12-27,12.120000,12.310000,12.100000,12.294000,11.913408,6158602 92 | 2021-12-28,12.304000,12.470000,12.302000,12.404000,12.020003,9148633 93 | 2021-12-29,12.400000,12.476000,12.224000,12.280000,11.899841,9731333 94 | 2021-12-30,12.260000,12.300000,12.190000,12.220000,11.841700,8216497 95 | 2022-01-03,12.300000,12.478000,12.274000,12.408000,12.023879,9327410 96 | 2022-01-04,12.454000,12.708000,12.436000,12.610000,12.219625,15884351 97 | 2022-01-05,12.630000,12.778000,12.568000,12.756000,12.361106,16047955 98 | 2022-01-06,12.614000,12.794000,12.550000,12.650000,12.258387,15643210 99 | 2022-01-07,12.690000,12.800000,12.652000,12.790000,12.394053,12857361 100 | 2022-01-10,12.800000,12.882000,12.628000,12.684000,12.291335,13906778 101 | 2022-01-11,12.726000,12.820000,12.622000,12.810000,12.413435,10082884 102 | 2022-01-12,12.896000,13.086000,12.892000,13.052000,12.647943,22009160 103 | 2022-01-13,13.002000,13.054000,12.946000,13.024000,12.620810,13225765 104 | 2022-01-14,12.996000,13.184000,12.972000,13.160000,12.752599,15720891 105 | 2022-01-17,13.210000,13.270000,13.104000,13.212000,12.802989,12338054 106 | 2022-01-18,13.284000,13.334000,13.158000,13.270000,12.859194,18979901 107 | 2022-01-19,13.300000,13.474000,13.238000,13.420000,13.004550,19794212 108 | 2022-01-20,13.370000,13.404000,13.220000,13.290000,12.878574,19821766 109 | 2022-01-21,13.140000,13.206000,12.966000,13.106000,12.700271,17881760 110 | 2022-01-24,13.114000,13.170000,12.578000,12.686000,12.293273,20679442 111 | 
2022-01-25,12.832000,13.142000,12.670000,13.094000,12.688643,18607939 112 | 2022-01-26,13.214000,13.488000,13.200000,13.484000,13.066569,24678628 113 | 2022-01-27,13.300000,13.840000,13.280000,13.812000,13.384415,25616509 114 | 2022-01-28,13.800000,13.802000,13.400000,13.574000,13.153783,19845858 115 | 2022-01-31,13.590000,13.640000,13.234000,13.308000,12.896017,20919460 116 | 2022-02-01,13.324000,13.456000,13.180000,13.456000,13.039436,16676711 117 | 2022-02-02,13.382000,13.498000,13.330000,13.356000,12.942532,13493904 118 | 2022-02-03,13.356000,13.420000,13.204000,13.288000,12.876637,13518177 119 | 2022-02-04,13.384000,13.562000,13.346000,13.480000,13.062693,17295995 120 | 2022-02-07,13.380000,13.418000,13.092000,13.186000,12.777794,16200649 121 | 2022-02-08,13.120000,13.318000,12.970000,13.036000,12.632438,12746726 122 | 2022-02-09,13.066000,13.330000,12.978000,13.252000,12.841751,12584136 123 | 2022-02-10,13.152000,13.366000,13.152000,13.298000,12.886327,10720904 124 | 2022-02-11,13.180000,13.556000,13.142000,13.528000,13.109206,14123872 125 | 2022-02-14,13.500000,13.592000,13.200000,13.324000,12.911522,20387459 126 | 2022-02-15,13.282000,13.432000,13.074000,13.172000,12.764228,15663413 127 | 2022-02-16,13.170000,13.378000,13.086000,13.308000,12.896017,13926631 128 | 2022-02-17,13.202000,13.380000,13.132000,13.334000,12.921212,14169615 129 | 2022-02-18,13.520000,13.600000,13.378000,13.470000,13.053002,19761436 130 | 2022-02-21,13.492000,13.536000,13.104000,13.314000,12.901832,15944636 131 | 2022-02-22,13.062000,13.548000,13.062000,13.492000,13.074321,18850071 132 | 2022-02-23,13.450000,13.618000,13.354000,13.426000,13.010364,14234323 133 | 2022-02-24,13.250000,13.718000,13.066000,13.362000,12.948346,35581182 134 | 2022-02-25,13.422000,13.830000,13.394000,13.780000,13.353405,28060530 135 | 2022-02-28,13.670000,13.906000,13.358000,13.832000,13.403795,27714174 136 | 2022-03-01,13.920000,14.398000,13.884000,14.252000,13.810793,34839560 137 | 2022-03-02,14.402000,14.572000,14.200000,14.530000,14.080187,29727128 138 | 2022-03-03,14.600000,14.852000,13.804000,13.866000,13.436743,27291045 139 | 2022-03-04,13.746000,13.782000,12.810000,12.854000,12.456072,32874311 140 | 2022-03-07,12.782000,13.668000,12.310000,13.406000,12.990984,35013621 141 | 2022-03-08,13.292000,13.770000,13.274000,13.670000,13.246811,24350724 142 | 2022-03-09,13.750000,13.900000,13.308000,13.602000,13.180916,23835591 143 | 2022-03-10,13.498000,13.608000,13.060000,13.104000,12.698333,21272480 144 | 2022-03-11,13.114000,13.480000,13.034000,13.036000,12.632438,15443864 145 | 2022-03-14,13.034000,13.118000,12.822000,12.978000,12.576233,14375697 146 | 2022-03-15,12.774000,12.924000,12.540000,12.924000,12.523905,19719778 147 | 2022-03-16,13.040000,13.124000,12.688000,12.770000,12.374673,20561605 148 | 2022-03-17,12.730000,13.174000,12.674000,13.110000,12.704146,23535647 149 | 2022-03-18,13.190000,13.200000,12.532000,12.728000,12.333972,29960266 150 | 2022-03-21,12.770000,13.238000,12.656000,13.094000,12.688643,18673723 151 | 2022-03-22,13.190000,13.434000,13.086000,13.104000,12.698333,15058029 152 | 2022-03-23,13.028000,13.398000,13.006000,13.222000,12.812680,15153328 153 | 2022-03-24,13.282000,13.486000,13.194000,13.314000,12.901832,11395491 154 | 2022-03-25,13.250000,13.544000,13.088000,13.464000,13.047188,11646774 155 | 2022-03-28,13.380000,13.726000,13.246000,13.272000,12.861132,13570473 156 | 2022-03-29,13.346000,13.558000,12.922000,13.096000,12.690580,17382862 157 | 
2022-03-30,13.196000,13.382000,13.158000,13.382000,12.967727,12866099 158 | 2022-03-31,13.280000,13.466000,13.208000,13.294000,12.882450,10324704 159 | 2022-04-01,13.220000,13.458000,13.154000,13.414000,12.998735,9768288 160 | 2022-04-04,13.376000,13.506000,13.332000,13.442000,13.025869,8561573 161 | 2022-04-05,13.502000,13.566000,13.318000,13.508000,13.089827,8963221 162 | 2022-04-06,13.424000,13.582000,13.310000,13.388000,12.973540,11734380 163 | 2022-04-07,13.358000,13.518000,13.196000,13.262000,12.851441,10932585 164 | 2022-04-08,13.424000,13.818000,13.424000,13.818000,13.390229,15918230 165 | 2022-04-11,13.778000,14.084000,13.728000,13.796000,13.368910,10182489 166 | 2022-04-12,13.794000,13.950000,13.706000,13.866000,13.436743,9330978 167 | 2022-04-13,13.866000,14.280000,13.866000,14.174000,13.735208,15748581 168 | 2022-04-14,14.158000,14.268000,14.004000,14.200000,13.760403,12333670 169 | 2022-04-19,14.210000,14.430000,14.138000,14.150000,13.711950,11319929 170 | 2022-04-20,14.254000,14.278000,14.066000,14.228000,13.787536,10261390 171 | 2022-04-21,14.294000,14.298000,14.000000,14.000000,13.566595,11387400 172 | 2022-04-22,13.730000,13.770000,13.464000,13.580000,13.159596,15981425 173 | 2022-04-25,13.302000,13.330000,12.930000,12.930000,12.529719,19172922 174 | 2022-04-26,13.184000,13.224000,12.780000,12.958000,12.556852,14584407 175 | 2022-04-27,12.964000,13.020000,12.806000,12.930000,12.529719,10301442 176 | 2022-04-28,13.008000,13.310000,12.962000,13.158000,12.750661,9925222 177 | 2022-04-29,13.450000,13.464000,13.170000,13.390000,12.975479,12047757 178 | 2022-05-02,13.278000,13.488000,13.110000,13.200000,12.791361,10101913 179 | 2022-05-03,13.306000,13.550000,13.066000,13.528000,13.109206,11352948 180 | 2022-05-04,13.524000,13.732000,13.524000,13.600000,13.178978,10937079 181 | 2022-05-05,13.740000,13.794000,13.428000,13.510000,13.091764,9047418 182 | 2022-05-06,13.550000,13.840000,13.514000,13.604000,13.182854,11964854 183 | 2022-05-09,13.638000,13.736000,13.026000,13.078000,12.673138,12271233 184 | 2022-05-10,13.160000,13.418000,13.046000,13.258000,12.847566,11465475 185 | 2022-05-11,13.300000,13.620000,13.212000,13.620000,13.198359,10849388 186 | 2022-05-12,13.378000,13.530000,13.270000,13.356000,12.942532,10580131 187 | 2022-05-13,13.530000,13.640000,13.344000,13.640000,13.217740,9604893 188 | 2022-05-16,13.586000,13.886000,13.564000,13.826000,13.397982,9820136 189 | 2022-05-17,13.950000,14.168000,13.892000,13.932000,13.500700,12180448 190 | 2022-05-18,13.812000,14.144000,13.810000,13.832000,13.403795,11314162 191 | 2022-05-19,13.788000,13.950000,13.624000,13.786000,13.359220,15046537 192 | 2022-05-20,13.928000,14.120000,13.820000,13.890000,13.460000,17198097 193 | 2022-05-23,13.624000,13.822000,13.612000,13.714000,13.714000,13271908 194 | 2022-05-24,13.620000,13.686000,13.502000,13.610000,13.610000,9516370 195 | 2022-05-25,13.740000,14.120000,13.724000,14.120000,14.120000,17831449 196 | 2022-05-26,14.120000,14.246000,14.054000,14.246000,14.246000,10961160 197 | 2022-05-27,14.260000,14.336000,14.078000,14.192000,14.192000,11271087 198 | 2022-05-30,14.250000,14.274000,14.026000,14.200000,14.200000,8656883 199 | 2022-05-31,14.300000,14.478000,14.198000,14.198000,14.198000,20843817 200 | 2022-06-01,14.198000,14.318000,14.050000,14.098000,14.098000,12269529 201 | 2022-06-02,14.150000,14.150000,13.966000,14.072000,14.072000,7555112 202 | 2022-06-03,14.130000,14.258000,14.054000,14.258000,14.258000,8852539 203 | 2022-06-06,14.326000,14.598000,14.304000,14.432000,14.432000,13025445 
204 | 2022-06-07,14.428000,14.508000,14.284000,14.428000,14.428000,11093002 205 | 2022-06-08,14.530000,14.556000,14.302000,14.392000,14.392000,9910104 206 | 2022-06-09,14.380000,14.450000,14.026000,14.026000,14.026000,14315519 207 | 2022-06-10,14.018000,14.020000,13.240000,13.240000,13.240000,22774413 208 | 2022-06-13,13.062000,13.138000,12.792000,12.978000,12.978000,16703779 209 | 2022-06-14,13.140000,13.352000,12.762000,13.170000,13.170000,12124965 210 | 2022-06-15,13.296000,13.392000,12.936000,13.254000,13.254000,14258080 211 | 2022-06-16,13.210000,13.302000,12.486000,12.606000,12.606000,25435857 212 | 2022-06-17,12.574000,12.722000,12.006000,12.010000,12.010000,27990234 213 | 2022-06-20,12.090000,12.270000,12.032000,12.044000,12.044000,11731385 214 | 2022-06-21,12.154000,12.236000,12.008000,12.080000,12.080000,10985501 215 | 2022-06-22,11.766000,11.814000,11.562000,11.660000,11.660000,17707302 216 | 2022-06-23,11.510000,11.842000,11.338000,11.430000,11.430000,15465466 217 | 2022-06-24,11.280000,11.708000,11.210000,11.666000,11.666000,14695680 218 | 2022-06-27,11.588000,11.754000,11.406000,11.430000,11.430000,16186512 219 | 2022-06-28,11.516000,11.720000,11.492000,11.500000,11.500000,11782048 220 | 2022-06-29,11.456000,11.940000,11.430000,11.602000,11.602000,14245962 221 | 2022-06-30,11.500000,11.560000,11.224000,11.328000,11.328000,13465225 222 | 2022-07-01,11.300000,11.450000,11.134000,11.228000,11.228000,10140782 223 | 2022-07-04,11.414000,11.628000,11.414000,11.498000,11.498000,11895933 224 | 2022-07-05,11.512000,11.524000,10.754000,10.832000,10.832000,21932084 225 | 2022-07-06,10.950000,11.080000,10.632000,10.756000,10.756000,16034945 226 | 2022-07-07,10.898000,11.180000,10.882000,11.046000,11.046000,17104234 227 | 2022-07-08,11.046000,11.352000,10.972000,11.226000,11.226000,12923833 228 | 2022-07-11,11.100000,11.316000,11.018000,11.200000,11.200000,10096639 229 | 2022-07-12,11.180000,11.330000,11.006000,11.138000,11.138000,10193478 230 | 2022-07-13,11.144000,11.254000,10.946000,11.116000,11.116000,10229951 231 | 2022-07-14,11.020000,11.156000,10.500000,10.644000,10.644000,18198463 232 | 2022-07-15,10.654000,10.996000,10.578000,10.838000,10.838000,14967921 233 | 2022-07-18,10.890000,11.186000,10.870000,11.014000,11.014000,10679802 234 | 2022-07-19,10.978000,11.360000,10.956000,11.304000,11.304000,12975328 235 | 2022-07-20,11.400000,11.410000,11.100000,11.174000,11.174000,9352632 236 | 2022-07-21,10.960000,11.148000,10.740000,11.000000,11.000000,11031394 237 | 2022-07-22,10.998000,11.142000,10.926000,10.970000,10.970000,9676893 238 | 2022-07-25,10.920000,11.088000,10.798000,11.062000,11.062000,8858839 239 | 2022-07-26,11.142000,11.228000,11.014000,11.020000,11.020000,10479759 240 | 2022-07-27,11.130000,11.288000,11.068000,11.200000,11.200000,8704917 241 | 2022-07-28,11.330000,11.354000,11.028000,11.092000,11.092000,11750707 242 | 2022-07-29,11.316000,11.844000,11.184000,11.716000,11.716000,21159221 243 | 2022-08-01,11.730000,12.118000,11.702000,11.702000,11.702000,15527324 244 | 2022-08-02,11.720000,11.776000,11.382000,11.444000,11.444000,15408263 245 | 2022-08-03,11.490000,11.544000,11.304000,11.400000,11.400000,14708791 246 | 2022-08-04,11.302000,11.570000,11.272000,11.390000,11.390000,13633577 247 | 2022-08-05,11.330000,11.556000,11.288000,11.456000,11.456000,12122650 248 | 2022-08-08,11.568000,11.658000,11.372000,11.446000,11.446000,8746013 249 | 2022-08-09,11.470000,11.604000,11.364000,11.492000,11.492000,8793567 250 | 
2022-08-10,11.482000,11.560000,11.286000,11.424000,11.424000,8666139 251 | 2022-08-11,11.490000,11.620000,11.460000,11.590000,11.590000,8666810 252 | 2022-08-12,11.680000,11.788000,11.576000,11.630000,11.630000,7423167 253 | 2022-08-16,11.608000,11.720000,11.512000,11.578000,11.578000,8205962 254 | 2022-08-17,11.600000,11.744000,11.570000,11.692000,11.692000,8577984 255 | 2022-08-18,11.738000,11.934000,11.696000,11.934000,11.934000,8843209 256 | 2022-08-19,11.902000,12.006000,11.722000,11.770000,11.770000,8503102 257 | 2022-08-22,11.750000,11.962000,11.544000,11.838000,11.838000,9397201 -------------------------------------------------------------------------------- /r/stat_learning/data/Auto.csv: -------------------------------------------------------------------------------- 1 | mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name 2 | 18,8,307,130,3504,12,70,1,chevrolet chevelle malibu 3 | 15,8,350,165,3693,11.5,70,1,buick skylark 320 4 | 18,8,318,150,3436,11,70,1,plymouth satellite 5 | 16,8,304,150,3433,12,70,1,amc rebel sst 6 | 17,8,302,140,3449,10.5,70,1,ford torino 7 | 15,8,429,198,4341,10,70,1,ford galaxie 500 8 | 14,8,454,220,4354,9,70,1,chevrolet impala 9 | 14,8,440,215,4312,8.5,70,1,plymouth fury iii 10 | 14,8,455,225,4425,10,70,1,pontiac catalina 11 | 15,8,390,190,3850,8.5,70,1,amc ambassador dpl 12 | 15,8,383,170,3563,10,70,1,dodge challenger se 13 | 14,8,340,160,3609,8,70,1,plymouth 'cuda 340 14 | 15,8,400,150,3761,9.5,70,1,chevrolet monte carlo 15 | 14,8,455,225,3086,10,70,1,buick estate wagon (sw) 16 | 24,4,113,95,2372,15,70,3,toyota corona mark ii 17 | 22,6,198,95,2833,15.5,70,1,plymouth duster 18 | 18,6,199,97,2774,15.5,70,1,amc hornet 19 | 21,6,200,85,2587,16,70,1,ford maverick 20 | 27,4,97,88,2130,14.5,70,3,datsun pl510 21 | 26,4,97,46,1835,20.5,70,2,volkswagen 1131 deluxe sedan 22 | 25,4,110,87,2672,17.5,70,2,peugeot 504 23 | 24,4,107,90,2430,14.5,70,2,audi 100 ls 24 | 25,4,104,95,2375,17.5,70,2,saab 99e 25 | 26,4,121,113,2234,12.5,70,2,bmw 2002 26 | 21,6,199,90,2648,15,70,1,amc gremlin 27 | 10,8,360,215,4615,14,70,1,ford f250 28 | 10,8,307,200,4376,15,70,1,chevy c20 29 | 11,8,318,210,4382,13.5,70,1,dodge d200 30 | 9,8,304,193,4732,18.5,70,1,hi 1200d 31 | 27,4,97,88,2130,14.5,71,3,datsun pl510 32 | 28,4,140,90,2264,15.5,71,1,chevrolet vega 2300 33 | 25,4,113,95,2228,14,71,3,toyota corona 34 | 19,6,232,100,2634,13,71,1,amc gremlin 35 | 16,6,225,105,3439,15.5,71,1,plymouth satellite custom 36 | 17,6,250,100,3329,15.5,71,1,chevrolet chevelle malibu 37 | 19,6,250,88,3302,15.5,71,1,ford torino 500 38 | 18,6,232,100,3288,15.5,71,1,amc matador 39 | 14,8,350,165,4209,12,71,1,chevrolet impala 40 | 14,8,400,175,4464,11.5,71,1,pontiac catalina brougham 41 | 14,8,351,153,4154,13.5,71,1,ford galaxie 500 42 | 14,8,318,150,4096,13,71,1,plymouth fury iii 43 | 12,8,383,180,4955,11.5,71,1,dodge monaco (sw) 44 | 13,8,400,170,4746,12,71,1,ford country squire (sw) 45 | 13,8,400,175,5140,12,71,1,pontiac safari (sw) 46 | 18,6,258,110,2962,13.5,71,1,amc hornet sportabout (sw) 47 | 22,4,140,72,2408,19,71,1,chevrolet vega (sw) 48 | 19,6,250,100,3282,15,71,1,pontiac firebird 49 | 18,6,250,88,3139,14.5,71,1,ford mustang 50 | 23,4,122,86,2220,14,71,1,mercury capri 2000 51 | 28,4,116,90,2123,14,71,2,opel 1900 52 | 30,4,79,70,2074,19.5,71,2,peugeot 304 53 | 30,4,88,76,2065,14.5,71,2,fiat 124b 54 | 31,4,71,65,1773,19,71,3,toyota corolla 1200 55 | 35,4,72,69,1613,18,71,3,datsun 1200 56 | 27,4,97,60,1834,19,71,2,volkswagen model 111 57 | 26,4,91,70,1955,20.5,71,1,plymouth cricket 
58 | 24,4,113,95,2278,15.5,72,3,toyota corona hardtop 59 | 25,4,97.5,80,2126,17,72,1,dodge colt hardtop 60 | 23,4,97,54,2254,23.5,72,2,volkswagen type 3 61 | 20,4,140,90,2408,19.5,72,1,chevrolet vega 62 | 21,4,122,86,2226,16.5,72,1,ford pinto runabout 63 | 13,8,350,165,4274,12,72,1,chevrolet impala 64 | 14,8,400,175,4385,12,72,1,pontiac catalina 65 | 15,8,318,150,4135,13.5,72,1,plymouth fury iii 66 | 14,8,351,153,4129,13,72,1,ford galaxie 500 67 | 17,8,304,150,3672,11.5,72,1,amc ambassador sst 68 | 11,8,429,208,4633,11,72,1,mercury marquis 69 | 13,8,350,155,4502,13.5,72,1,buick lesabre custom 70 | 12,8,350,160,4456,13.5,72,1,oldsmobile delta 88 royale 71 | 13,8,400,190,4422,12.5,72,1,chrysler newport royal 72 | 19,3,70,97,2330,13.5,72,3,mazda rx2 coupe 73 | 15,8,304,150,3892,12.5,72,1,amc matador (sw) 74 | 13,8,307,130,4098,14,72,1,chevrolet chevelle concours (sw) 75 | 13,8,302,140,4294,16,72,1,ford gran torino (sw) 76 | 14,8,318,150,4077,14,72,1,plymouth satellite custom (sw) 77 | 18,4,121,112,2933,14.5,72,2,volvo 145e (sw) 78 | 22,4,121,76,2511,18,72,2,volkswagen 411 (sw) 79 | 21,4,120,87,2979,19.5,72,2,peugeot 504 (sw) 80 | 26,4,96,69,2189,18,72,2,renault 12 (sw) 81 | 22,4,122,86,2395,16,72,1,ford pinto (sw) 82 | 28,4,97,92,2288,17,72,3,datsun 510 (sw) 83 | 23,4,120,97,2506,14.5,72,3,toyouta corona mark ii (sw) 84 | 28,4,98,80,2164,15,72,1,dodge colt (sw) 85 | 27,4,97,88,2100,16.5,72,3,toyota corolla 1600 (sw) 86 | 13,8,350,175,4100,13,73,1,buick century 350 87 | 14,8,304,150,3672,11.5,73,1,amc matador 88 | 13,8,350,145,3988,13,73,1,chevrolet malibu 89 | 14,8,302,137,4042,14.5,73,1,ford gran torino 90 | 15,8,318,150,3777,12.5,73,1,dodge coronet custom 91 | 12,8,429,198,4952,11.5,73,1,mercury marquis brougham 92 | 13,8,400,150,4464,12,73,1,chevrolet caprice classic 93 | 13,8,351,158,4363,13,73,1,ford ltd 94 | 14,8,318,150,4237,14.5,73,1,plymouth fury gran sedan 95 | 13,8,440,215,4735,11,73,1,chrysler new yorker brougham 96 | 12,8,455,225,4951,11,73,1,buick electra 225 custom 97 | 13,8,360,175,3821,11,73,1,amc ambassador brougham 98 | 18,6,225,105,3121,16.5,73,1,plymouth valiant 99 | 16,6,250,100,3278,18,73,1,chevrolet nova custom 100 | 18,6,232,100,2945,16,73,1,amc hornet 101 | 18,6,250,88,3021,16.5,73,1,ford maverick 102 | 23,6,198,95,2904,16,73,1,plymouth duster 103 | 26,4,97,46,1950,21,73,2,volkswagen super beetle 104 | 11,8,400,150,4997,14,73,1,chevrolet impala 105 | 12,8,400,167,4906,12.5,73,1,ford country 106 | 13,8,360,170,4654,13,73,1,plymouth custom suburb 107 | 12,8,350,180,4499,12.5,73,1,oldsmobile vista cruiser 108 | 18,6,232,100,2789,15,73,1,amc gremlin 109 | 20,4,97,88,2279,19,73,3,toyota carina 110 | 21,4,140,72,2401,19.5,73,1,chevrolet vega 111 | 22,4,108,94,2379,16.5,73,3,datsun 610 112 | 18,3,70,90,2124,13.5,73,3,maxda rx3 113 | 19,4,122,85,2310,18.5,73,1,ford pinto 114 | 21,6,155,107,2472,14,73,1,mercury capri v6 115 | 26,4,98,90,2265,15.5,73,2,fiat 124 sport coupe 116 | 15,8,350,145,4082,13,73,1,chevrolet monte carlo s 117 | 16,8,400,230,4278,9.5,73,1,pontiac grand prix 118 | 29,4,68,49,1867,19.5,73,2,fiat 128 119 | 24,4,116,75,2158,15.5,73,2,opel manta 120 | 20,4,114,91,2582,14,73,2,audi 100ls 121 | 19,4,121,112,2868,15.5,73,2,volvo 144ea 122 | 15,8,318,150,3399,11,73,1,dodge dart custom 123 | 24,4,121,110,2660,14,73,2,saab 99le 124 | 20,6,156,122,2807,13.5,73,3,toyota mark ii 125 | 11,8,350,180,3664,11,73,1,oldsmobile omega 126 | 20,6,198,95,3102,16.5,74,1,plymouth duster 127 | 19,6,232,100,2901,16,74,1,amc hornet 128 | 15,6,250,100,3336,17,74,1,chevrolet nova 129 | 
31,4,79,67,1950,19,74,3,datsun b210 130 | 26,4,122,80,2451,16.5,74,1,ford pinto 131 | 32,4,71,65,1836,21,74,3,toyota corolla 1200 132 | 25,4,140,75,2542,17,74,1,chevrolet vega 133 | 16,6,250,100,3781,17,74,1,chevrolet chevelle malibu classic 134 | 16,6,258,110,3632,18,74,1,amc matador 135 | 18,6,225,105,3613,16.5,74,1,plymouth satellite sebring 136 | 16,8,302,140,4141,14,74,1,ford gran torino 137 | 13,8,350,150,4699,14.5,74,1,buick century luxus (sw) 138 | 14,8,318,150,4457,13.5,74,1,dodge coronet custom (sw) 139 | 14,8,302,140,4638,16,74,1,ford gran torino (sw) 140 | 14,8,304,150,4257,15.5,74,1,amc matador (sw) 141 | 29,4,98,83,2219,16.5,74,2,audi fox 142 | 26,4,79,67,1963,15.5,74,2,volkswagen dasher 143 | 26,4,97,78,2300,14.5,74,2,opel manta 144 | 31,4,76,52,1649,16.5,74,3,toyota corona 145 | 32,4,83,61,2003,19,74,3,datsun 710 146 | 28,4,90,75,2125,14.5,74,1,dodge colt 147 | 24,4,90,75,2108,15.5,74,2,fiat 128 148 | 26,4,116,75,2246,14,74,2,fiat 124 tc 149 | 24,4,120,97,2489,15,74,3,honda civic 150 | 26,4,108,93,2391,15.5,74,3,subaru 151 | 31,4,79,67,2000,16,74,2,fiat x1.9 152 | 19,6,225,95,3264,16,75,1,plymouth valiant custom 153 | 18,6,250,105,3459,16,75,1,chevrolet nova 154 | 15,6,250,72,3432,21,75,1,mercury monarch 155 | 15,6,250,72,3158,19.5,75,1,ford maverick 156 | 16,8,400,170,4668,11.5,75,1,pontiac catalina 157 | 15,8,350,145,4440,14,75,1,chevrolet bel air 158 | 16,8,318,150,4498,14.5,75,1,plymouth grand fury 159 | 14,8,351,148,4657,13.5,75,1,ford ltd 160 | 17,6,231,110,3907,21,75,1,buick century 161 | 16,6,250,105,3897,18.5,75,1,chevroelt chevelle malibu 162 | 15,6,258,110,3730,19,75,1,amc matador 163 | 18,6,225,95,3785,19,75,1,plymouth fury 164 | 21,6,231,110,3039,15,75,1,buick skyhawk 165 | 20,8,262,110,3221,13.5,75,1,chevrolet monza 2+2 166 | 13,8,302,129,3169,12,75,1,ford mustang ii 167 | 29,4,97,75,2171,16,75,3,toyota corolla 168 | 23,4,140,83,2639,17,75,1,ford pinto 169 | 20,6,232,100,2914,16,75,1,amc gremlin 170 | 23,4,140,78,2592,18.5,75,1,pontiac astro 171 | 24,4,134,96,2702,13.5,75,3,toyota corona 172 | 25,4,90,71,2223,16.5,75,2,volkswagen dasher 173 | 24,4,119,97,2545,17,75,3,datsun 710 174 | 18,6,171,97,2984,14.5,75,1,ford pinto 175 | 29,4,90,70,1937,14,75,2,volkswagen rabbit 176 | 19,6,232,90,3211,17,75,1,amc pacer 177 | 23,4,115,95,2694,15,75,2,audi 100ls 178 | 23,4,120,88,2957,17,75,2,peugeot 504 179 | 22,4,121,98,2945,14.5,75,2,volvo 244dl 180 | 25,4,121,115,2671,13.5,75,2,saab 99le 181 | 33,4,91,53,1795,17.5,75,3,honda civic cvcc 182 | 28,4,107,86,2464,15.5,76,2,fiat 131 183 | 25,4,116,81,2220,16.9,76,2,opel 1900 184 | 25,4,140,92,2572,14.9,76,1,capri ii 185 | 26,4,98,79,2255,17.7,76,1,dodge colt 186 | 27,4,101,83,2202,15.3,76,2,renault 12tl 187 | 17.5,8,305,140,4215,13,76,1,chevrolet chevelle malibu classic 188 | 16,8,318,150,4190,13,76,1,dodge coronet brougham 189 | 15.5,8,304,120,3962,13.9,76,1,amc matador 190 | 14.5,8,351,152,4215,12.8,76,1,ford gran torino 191 | 22,6,225,100,3233,15.4,76,1,plymouth valiant 192 | 22,6,250,105,3353,14.5,76,1,chevrolet nova 193 | 24,6,200,81,3012,17.6,76,1,ford maverick 194 | 22.5,6,232,90,3085,17.6,76,1,amc hornet 195 | 29,4,85,52,2035,22.2,76,1,chevrolet chevette 196 | 24.5,4,98,60,2164,22.1,76,1,chevrolet woody 197 | 29,4,90,70,1937,14.2,76,2,vw rabbit 198 | 33,4,91,53,1795,17.4,76,3,honda civic 199 | 20,6,225,100,3651,17.7,76,1,dodge aspen se 200 | 18,6,250,78,3574,21,76,1,ford granada ghia 201 | 18.5,6,250,110,3645,16.2,76,1,pontiac ventura sj 202 | 17.5,6,258,95,3193,17.8,76,1,amc pacer d/l 203 | 
29.5,4,97,71,1825,12.2,76,2,volkswagen rabbit 204 | 32,4,85,70,1990,17,76,3,datsun b-210 205 | 28,4,97,75,2155,16.4,76,3,toyota corolla 206 | 26.5,4,140,72,2565,13.6,76,1,ford pinto 207 | 20,4,130,102,3150,15.7,76,2,volvo 245 208 | 13,8,318,150,3940,13.2,76,1,plymouth volare premier v8 209 | 19,4,120,88,3270,21.9,76,2,peugeot 504 210 | 19,6,156,108,2930,15.5,76,3,toyota mark ii 211 | 16.5,6,168,120,3820,16.7,76,2,mercedes-benz 280s 212 | 16.5,8,350,180,4380,12.1,76,1,cadillac seville 213 | 13,8,350,145,4055,12,76,1,chevy c10 214 | 13,8,302,130,3870,15,76,1,ford f108 215 | 13,8,318,150,3755,14,76,1,dodge d100 216 | 31.5,4,98,68,2045,18.5,77,3,honda accord cvcc 217 | 30,4,111,80,2155,14.8,77,1,buick opel isuzu deluxe 218 | 36,4,79,58,1825,18.6,77,2,renault 5 gtl 219 | 25.5,4,122,96,2300,15.5,77,1,plymouth arrow gs 220 | 33.5,4,85,70,1945,16.8,77,3,datsun f-10 hatchback 221 | 17.5,8,305,145,3880,12.5,77,1,chevrolet caprice classic 222 | 17,8,260,110,4060,19,77,1,oldsmobile cutlass supreme 223 | 15.5,8,318,145,4140,13.7,77,1,dodge monaco brougham 224 | 15,8,302,130,4295,14.9,77,1,mercury cougar brougham 225 | 17.5,6,250,110,3520,16.4,77,1,chevrolet concours 226 | 20.5,6,231,105,3425,16.9,77,1,buick skylark 227 | 19,6,225,100,3630,17.7,77,1,plymouth volare custom 228 | 18.5,6,250,98,3525,19,77,1,ford granada 229 | 16,8,400,180,4220,11.1,77,1,pontiac grand prix lj 230 | 15.5,8,350,170,4165,11.4,77,1,chevrolet monte carlo landau 231 | 15.5,8,400,190,4325,12.2,77,1,chrysler cordoba 232 | 16,8,351,149,4335,14.5,77,1,ford thunderbird 233 | 29,4,97,78,1940,14.5,77,2,volkswagen rabbit custom 234 | 24.5,4,151,88,2740,16,77,1,pontiac sunbird coupe 235 | 26,4,97,75,2265,18.2,77,3,toyota corolla liftback 236 | 25.5,4,140,89,2755,15.8,77,1,ford mustang ii 2+2 237 | 30.5,4,98,63,2051,17,77,1,chevrolet chevette 238 | 33.5,4,98,83,2075,15.9,77,1,dodge colt m/m 239 | 30,4,97,67,1985,16.4,77,3,subaru dl 240 | 30.5,4,97,78,2190,14.1,77,2,volkswagen dasher 241 | 22,6,146,97,2815,14.5,77,3,datsun 810 242 | 21.5,4,121,110,2600,12.8,77,2,bmw 320i 243 | 21.5,3,80,110,2720,13.5,77,3,mazda rx-4 244 | 43.1,4,90,48,1985,21.5,78,2,volkswagen rabbit custom diesel 245 | 36.1,4,98,66,1800,14.4,78,1,ford fiesta 246 | 32.8,4,78,52,1985,19.4,78,3,mazda glc deluxe 247 | 39.4,4,85,70,2070,18.6,78,3,datsun b210 gx 248 | 36.1,4,91,60,1800,16.4,78,3,honda civic cvcc 249 | 19.9,8,260,110,3365,15.5,78,1,oldsmobile cutlass salon brougham 250 | 19.4,8,318,140,3735,13.2,78,1,dodge diplomat 251 | 20.2,8,302,139,3570,12.8,78,1,mercury monarch ghia 252 | 19.2,6,231,105,3535,19.2,78,1,pontiac phoenix lj 253 | 20.5,6,200,95,3155,18.2,78,1,chevrolet malibu 254 | 20.2,6,200,85,2965,15.8,78,1,ford fairmont (auto) 255 | 25.1,4,140,88,2720,15.4,78,1,ford fairmont (man) 256 | 20.5,6,225,100,3430,17.2,78,1,plymouth volare 257 | 19.4,6,232,90,3210,17.2,78,1,amc concord 258 | 20.6,6,231,105,3380,15.8,78,1,buick century special 259 | 20.8,6,200,85,3070,16.7,78,1,mercury zephyr 260 | 18.6,6,225,110,3620,18.7,78,1,dodge aspen 261 | 18.1,6,258,120,3410,15.1,78,1,amc concord d/l 262 | 19.2,8,305,145,3425,13.2,78,1,chevrolet monte carlo landau 263 | 17.7,6,231,165,3445,13.4,78,1,buick regal sport coupe (turbo) 264 | 18.1,8,302,139,3205,11.2,78,1,ford futura 265 | 17.5,8,318,140,4080,13.7,78,1,dodge magnum xe 266 | 30,4,98,68,2155,16.5,78,1,chevrolet chevette 267 | 27.5,4,134,95,2560,14.2,78,3,toyota corona 268 | 27.2,4,119,97,2300,14.7,78,3,datsun 510 269 | 30.9,4,105,75,2230,14.5,78,1,dodge omni 270 | 21.1,4,134,95,2515,14.8,78,3,toyota celica gt liftback 
271 | 23.2,4,156,105,2745,16.7,78,1,plymouth sapporo 272 | 23.8,4,151,85,2855,17.6,78,1,oldsmobile starfire sx 273 | 23.9,4,119,97,2405,14.9,78,3,datsun 200-sx 274 | 20.3,5,131,103,2830,15.9,78,2,audi 5000 275 | 17,6,163,125,3140,13.6,78,2,volvo 264gl 276 | 21.6,4,121,115,2795,15.7,78,2,saab 99gle 277 | 16.2,6,163,133,3410,15.8,78,2,peugeot 604sl 278 | 31.5,4,89,71,1990,14.9,78,2,volkswagen scirocco 279 | 29.5,4,98,68,2135,16.6,78,3,honda accord lx 280 | 21.5,6,231,115,3245,15.4,79,1,pontiac lemans v6 281 | 19.8,6,200,85,2990,18.2,79,1,mercury zephyr 6 282 | 22.3,4,140,88,2890,17.3,79,1,ford fairmont 4 283 | 20.2,6,232,90,3265,18.2,79,1,amc concord dl 6 284 | 20.6,6,225,110,3360,16.6,79,1,dodge aspen 6 285 | 17,8,305,130,3840,15.4,79,1,chevrolet caprice classic 286 | 17.6,8,302,129,3725,13.4,79,1,ford ltd landau 287 | 16.5,8,351,138,3955,13.2,79,1,mercury grand marquis 288 | 18.2,8,318,135,3830,15.2,79,1,dodge st. regis 289 | 16.9,8,350,155,4360,14.9,79,1,buick estate wagon (sw) 290 | 15.5,8,351,142,4054,14.3,79,1,ford country squire (sw) 291 | 19.2,8,267,125,3605,15,79,1,chevrolet malibu classic (sw) 292 | 18.5,8,360,150,3940,13,79,1,chrysler lebaron town @ country (sw) 293 | 31.9,4,89,71,1925,14,79,2,vw rabbit custom 294 | 34.1,4,86,65,1975,15.2,79,3,maxda glc deluxe 295 | 35.7,4,98,80,1915,14.4,79,1,dodge colt hatchback custom 296 | 27.4,4,121,80,2670,15,79,1,amc spirit dl 297 | 25.4,5,183,77,3530,20.1,79,2,mercedes benz 300d 298 | 23,8,350,125,3900,17.4,79,1,cadillac eldorado 299 | 27.2,4,141,71,3190,24.8,79,2,peugeot 504 300 | 23.9,8,260,90,3420,22.2,79,1,oldsmobile cutlass salon brougham 301 | 34.2,4,105,70,2200,13.2,79,1,plymouth horizon 302 | 34.5,4,105,70,2150,14.9,79,1,plymouth horizon tc3 303 | 31.8,4,85,65,2020,19.2,79,3,datsun 210 304 | 37.3,4,91,69,2130,14.7,79,2,fiat strada custom 305 | 28.4,4,151,90,2670,16,79,1,buick skylark limited 306 | 28.8,6,173,115,2595,11.3,79,1,chevrolet citation 307 | 26.8,6,173,115,2700,12.9,79,1,oldsmobile omega brougham 308 | 33.5,4,151,90,2556,13.2,79,1,pontiac phoenix 309 | 41.5,4,98,76,2144,14.7,80,2,vw rabbit 310 | 38.1,4,89,60,1968,18.8,80,3,toyota corolla tercel 311 | 32.1,4,98,70,2120,15.5,80,1,chevrolet chevette 312 | 37.2,4,86,65,2019,16.4,80,3,datsun 310 313 | 28,4,151,90,2678,16.5,80,1,chevrolet citation 314 | 26.4,4,140,88,2870,18.1,80,1,ford fairmont 315 | 24.3,4,151,90,3003,20.1,80,1,amc concord 316 | 19.1,6,225,90,3381,18.7,80,1,dodge aspen 317 | 34.3,4,97,78,2188,15.8,80,2,audi 4000 318 | 29.8,4,134,90,2711,15.5,80,3,toyota corona liftback 319 | 31.3,4,120,75,2542,17.5,80,3,mazda 626 320 | 37,4,119,92,2434,15,80,3,datsun 510 hatchback 321 | 32.2,4,108,75,2265,15.2,80,3,toyota corolla 322 | 46.6,4,86,65,2110,17.9,80,3,mazda glc 323 | 27.9,4,156,105,2800,14.4,80,1,dodge colt 324 | 40.8,4,85,65,2110,19.2,80,3,datsun 210 325 | 44.3,4,90,48,2085,21.7,80,2,vw rabbit c (diesel) 326 | 43.4,4,90,48,2335,23.7,80,2,vw dasher (diesel) 327 | 36.4,5,121,67,2950,19.9,80,2,audi 5000s (diesel) 328 | 30,4,146,67,3250,21.8,80,2,mercedes-benz 240d 329 | 44.6,4,91,67,1850,13.8,80,3,honda civic 1500 gl 330 | 33.8,4,97,67,2145,18,80,3,subaru dl 331 | 29.8,4,89,62,1845,15.3,80,2,vokswagen rabbit 332 | 32.7,6,168,132,2910,11.4,80,3,datsun 280-zx 333 | 23.7,3,70,100,2420,12.5,80,3,mazda rx-7 gs 334 | 35,4,122,88,2500,15.1,80,2,triumph tr7 coupe 335 | 32.4,4,107,72,2290,17,80,3,honda accord 336 | 27.2,4,135,84,2490,15.7,81,1,plymouth reliant 337 | 26.6,4,151,84,2635,16.4,81,1,buick skylark 338 | 25.8,4,156,92,2620,14.4,81,1,dodge aries wagon (sw) 339 | 
23.5,6,173,110,2725,12.6,81,1,chevrolet citation 340 | 30,4,135,84,2385,12.9,81,1,plymouth reliant 341 | 39.1,4,79,58,1755,16.9,81,3,toyota starlet 342 | 39,4,86,64,1875,16.4,81,1,plymouth champ 343 | 35.1,4,81,60,1760,16.1,81,3,honda civic 1300 344 | 32.3,4,97,67,2065,17.8,81,3,subaru 345 | 37,4,85,65,1975,19.4,81,3,datsun 210 mpg 346 | 37.7,4,89,62,2050,17.3,81,3,toyota tercel 347 | 34.1,4,91,68,1985,16,81,3,mazda glc 4 348 | 34.7,4,105,63,2215,14.9,81,1,plymouth horizon 4 349 | 34.4,4,98,65,2045,16.2,81,1,ford escort 4w 350 | 29.9,4,98,65,2380,20.7,81,1,ford escort 2h 351 | 33,4,105,74,2190,14.2,81,2,volkswagen jetta 352 | 33.7,4,107,75,2210,14.4,81,3,honda prelude 353 | 32.4,4,108,75,2350,16.8,81,3,toyota corolla 354 | 32.9,4,119,100,2615,14.8,81,3,datsun 200sx 355 | 31.6,4,120,74,2635,18.3,81,3,mazda 626 356 | 28.1,4,141,80,3230,20.4,81,2,peugeot 505s turbo diesel 357 | 30.7,6,145,76,3160,19.6,81,2,volvo diesel 358 | 25.4,6,168,116,2900,12.6,81,3,toyota cressida 359 | 24.2,6,146,120,2930,13.8,81,3,datsun 810 maxima 360 | 22.4,6,231,110,3415,15.8,81,1,buick century 361 | 26.6,8,350,105,3725,19,81,1,oldsmobile cutlass ls 362 | 20.2,6,200,88,3060,17.1,81,1,ford granada gl 363 | 17.6,6,225,85,3465,16.6,81,1,chrysler lebaron salon 364 | 28,4,112,88,2605,19.6,82,1,chevrolet cavalier 365 | 27,4,112,88,2640,18.6,82,1,chevrolet cavalier wagon 366 | 34,4,112,88,2395,18,82,1,chevrolet cavalier 2-door 367 | 31,4,112,85,2575,16.2,82,1,pontiac j2000 se hatchback 368 | 29,4,135,84,2525,16,82,1,dodge aries se 369 | 27,4,151,90,2735,18,82,1,pontiac phoenix 370 | 24,4,140,92,2865,16.4,82,1,ford fairmont futura 371 | 36,4,105,74,1980,15.3,82,2,volkswagen rabbit l 372 | 37,4,91,68,2025,18.2,82,3,mazda glc custom l 373 | 31,4,91,68,1970,17.6,82,3,mazda glc custom 374 | 38,4,105,63,2125,14.7,82,1,plymouth horizon miser 375 | 36,4,98,70,2125,17.3,82,1,mercury lynx l 376 | 36,4,120,88,2160,14.5,82,3,nissan stanza xe 377 | 36,4,107,75,2205,14.5,82,3,honda accord 378 | 34,4,108,70,2245,16.9,82,3,toyota corolla 379 | 38,4,91,67,1965,15,82,3,honda civic 380 | 32,4,91,67,1965,15.7,82,3,honda civic (auto) 381 | 38,4,91,67,1995,16.2,82,3,datsun 310 gx 382 | 25,6,181,110,2945,16.4,82,1,buick century limited 383 | 38,6,262,85,3015,17,82,1,oldsmobile cutlass ciera (diesel) 384 | 26,4,156,92,2585,14.5,82,1,chrysler lebaron medallion 385 | 22,6,232,112,2835,14.7,82,1,ford granada l 386 | 32,4,144,96,2665,13.9,82,3,toyota celica gt 387 | 36,4,135,84,2370,13,82,1,dodge charger 2.2 388 | 27,4,151,90,2950,17.3,82,1,chevrolet camaro 389 | 27,4,140,86,2790,15.6,82,1,ford mustang gl 390 | 44,4,97,52,2130,24.6,82,2,vw pickup 391 | 32,4,135,84,2295,11.6,82,1,dodge rampage 392 | 28,4,120,79,2625,18.6,82,1,ford ranger 393 | 31,4,119,82,2720,19.4,82,1,chevy s-10 394 | --------------------------------------------------------------------------------