% select(-species)
60 | dim(orig_coords)
61 | dist_mat <- orig_coords %>% dist(method="euclidean")
62 | dim(as.matrix(dist_mat))
63 | mds_coords <- cmdscale(dist_mat, k=2)
64 | dim(mds_coords)
65 | ```
66 |
67 | 4차원 공간의 150개의 관측치에서 150*150 거리행렬을 생성한 후
68 | 이를 사용해 2차원 공간으로 차원을 축소했다.
69 | 축소된 2차원에서 각 종들의 분포는 다음과 같다.
70 | ```{r}
71 | df <- tibble(x1=mds_coords[,1], x2=mds_coords[,2],
72 | species=iris$species)
73 | df %>% ggplot(aes(x1, x2, col=species)) + geom_point()
74 | ```
75 |
76 |
--------------------------------------------------------------------------------
/addendum/rsconnect/documents/dimension-reduction.Rmd/rpubs.com/rpubs/Document.dcf:
--------------------------------------------------------------------------------
1 | name: Document
2 | title:
3 | username:
4 | account: rpubs
5 | server: rpubs.com
6 | hostUrl: rpubs.com
7 | appId: https://api.rpubs.com/api/v1/document/318229/dcfa2ef3f268496da3ba5f5fb06e1bf8
8 | bundleId: https://api.rpubs.com/api/v1/document/318229/dcfa2ef3f268496da3ba5f5fb06e1bf8
9 | url: http://rpubs.com/publish/claim/318229/10e0ac576ae7454dbc98563704813ee6
10 | when: 1507945656.17115
11 |
--------------------------------------------------------------------------------
/addendum/rsconnect/documents/dimension-reduction.Rmd/rpubs.com/rpubs/Publish Document.dcf:
--------------------------------------------------------------------------------
1 | name: Publish Document
2 | title:
3 | username:
4 | account: rpubs
5 | server: rpubs.com
6 | hostUrl: rpubs.com
7 | appId: https://api.rpubs.com/api/v1/document/318230/0b8a3440f5aa4d2fa50bda4b129fd9a3
8 | bundleId: https://api.rpubs.com/api/v1/document/318230/0b8a3440f5aa4d2fa50bda4b129fd9a3
9 | url: http://rpubs.com/publish/claim/318230/ad037587d9e545b6b0e21fd6ccb0a62e
10 | when: 1507945830.04961
11 |
--------------------------------------------------------------------------------
/ch03-data-processing/ch03-data-processing.R:
--------------------------------------------------------------------------------
1 |
2 |
3 | # datasets 패키지에서 제공하는 다양한 자료들과 도움말
4 | help(package='datasets')
5 | # ggplot2 패키지에서 제공되는 데이터
6 | data(package='ggplot2')
7 | # 현재 실행환경에서 로드되어서 사용가능한 모든 데이터를 살펴보려면 옵션 없이
8 | data()
9 |
10 |
11 | # 자료 다운로드:
12 | # curl https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data > housing.data
13 | # curl https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names > housing.names
14 |
15 | boston <- read.table("housing.data")
16 | library(dplyr)
17 | glimpse(boston)
18 |
19 | names(boston) <- c('crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv')
20 | glimpse(boston)
21 |
22 | plot(boston)
23 | summary(boston)
24 |
25 | # 큰 자료 읽어 들이기
26 | library(data.table)
27 | DT <- fread("very_big.csv")
28 | DT <- fread("very_big.csv", data.table=FALSE)
29 |
30 |
31 | # R에서 SQL연습
32 | # install.packages("sqldf")
33 | library(sqldf)
34 | sqldf("select * from iris")
35 | sqldf("select count(*) from iris")
36 | sqldf("select Species, count(*), avg(`Sepal.Length`)
37 | from iris
38 | group by `Species`")
39 | sqldf("select Species, `Sepal.Length`, `Sepal.Width`
40 | from iris
41 | where `Sepal.Length` < 4.5
42 | order by `Sepal.Width`")
43 |
44 |
45 |
46 | library(dplyr)
47 | (df1 <- tibble(x = c(1, 2), y = 2:1))
48 | (df2 <- tibble(x = c(1, 3), a = 10, b = "a"))
49 | sqldf("select *
50 | from df1 inner join df2
51 | on df1.x = df2.x")
52 | sqldf("select *
53 | from df1 left join df2
54 | on df1.x = df2.x")
55 |
56 |
57 | # install.packages("foreign")
58 | library(foreign)
59 | x <- read.dbf(system.file("files/sids.dbf", package="foreign")[1])
60 | dplyr::glimpse(x)
61 | summary(x)
62 |
63 |
64 | #-------------------------------
65 | # 기본적인 gapminder 자료 처리
66 |
67 | # 자료를 로드한다
68 | library(gapminder)
69 |
70 | # 행과 열 선택
71 | gapminder[gapminder$country=='Korea, Rep.', c('pop', 'gdpPercap')]
72 |
73 | # 행 선택
74 | gapminder[gapminder$country=='Korea, Rep.', ]
75 | gapminder[gapminder$year==2007, ]
76 | gapminder[gapminder$country=='Korea, Rep.' & gapminder$year==2007, ]
77 | gapminder[1:10,]
78 | head(gapminder, 10)
79 |
80 | # 정렬
81 | gapminder[order(gapminder$year, gapminder$country),]
82 |
83 | # 변수 선택:
84 | gapminder[, c('pop', 'gdpPercap')]
85 | gapminder[, 1:3]
86 |
87 | # 변수 이름 바꾸기: gdpPercap 를 gdp_per_cap 으로 변경
88 | f2 = gapminder
89 | names(f2)
90 | names(f2)[6] = 'gdp_per_cap'
91 |
92 | # 변수변환과 변수 생성
93 | f2 = gapminder
94 | f2$total_gdp = f2$pop * f2$gdpPercap
95 |
96 | # 요약통계량 계산
97 | median(gapminder$gdpPercap)
98 | apply(gapminder[,4:6], 2, mean)
99 | summary(gapminder)
100 |
101 |
102 | #----------------------------
103 | library(dplyr)
104 |
105 | # tbl_df() 와 glimpse()
106 | i2 <- tbl_df(iris)
107 | class(i2)
108 | i2
109 | glimpse(i2)
110 |
111 | iris %>% head
112 | iris %>% head(10)
113 |
114 |
115 |
116 |
117 | filter(gapminder, country=='Korea, Rep.')
118 | filter(gapminder, year==2007)
119 | filter(gapminder, country=='Korea, Rep.' & year==2007)
120 |
121 | gapminder %>% filter(country=='Korea, Rep.')
122 | gapminder %>% filter(year==2007)
123 | gapminder %>% filter(country=='Korea, Rep.' & year==2007)
124 |
125 |
126 | arrange(gapminder, year, country)
127 | gapminder %>% arrange(year, country)
128 |
129 |
130 |
131 | select(gapminder, pop, gdpPercap)
132 | gapminder %>% select(pop, gdpPercap)
133 |
134 |
135 |
136 | gapminder %>%
137 | mutate(total_gdp = pop * gdpPercap,
138 | le_gdp_ratio = lifeExp / gdpPercap,
139 | lgrk = le_gdp_ratio * 100)
140 |
141 |
142 | gapminder %>%
143 | summarize(n_obs = n(),
144 | n_countries = n_distinct(country),
145 | n_years = n_distinct(year),
146 | med_gdpc = median(gdpPercap),
147 | max_gdppc = max(gdpPercap))
148 |
149 |
150 | sample_n(gapminder, 10)
151 | sample_frac(gapminder, 0.01)
152 |
153 |
154 | distinct(select(gapminder, country))
155 | distinct(select(gapminder, year))
156 |
157 |
158 | gapminder %>% select(country) %>% distinct()
159 | gapminder %>% select(year) %>% distinct()
160 |
161 |
162 | gapminder %>%
163 | filter(year == 2007) %>%
164 | group_by(continent) %>%
165 | summarize(median(lifeExp))
166 |
167 |
168 |
169 | # 함수형 프로그래밍의 장점 예시
170 | d1 = filter(gapminder, year == 2007)
171 | d2 = group_by(d1, continent)
172 | d3 = summarize(d2, lifeExp = median(lifeExp))
173 | arrange(d3, -lifeExp)
174 |
175 | arrange(
176 | summarize(
177 | group_by(
178 | filter(gapminder, year==2007), continent
179 | ), lifeExp=median(lifeExp)
180 | ), -lifeExp
181 | )
182 |
183 |
184 | gapminder %>%
185 | filter(year == 2007) %>%
186 | group_by(continent) %>%
187 | summarize(lifeExp = median(lifeExp)) %>%
188 | arrange(-lifeExp)
189 |
190 |
191 | # 조인 연산자; inner, left, right, full(outer) join
192 | (df1 <- tibble(x = c(1, 2), y = 2:1))
193 | (df2 <- tibble(x = c(1, 3), a = 10, b = "a"))
194 | df1 %>% inner_join(df2)
195 | df1 %>% left_join(df2)
196 | df1 %>% right_join(df2)
197 | df1 %>% full_join(df2)
198 |
199 |
--------------------------------------------------------------------------------
/ch03-data-processing/ch03-data-processing.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
--------------------------------------------------------------------------------
/ch03-data-processing/ch03-data-processing.sh:
--------------------------------------------------------------------------------
1 |
2 | # 자료를 다운받는다
3 | curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data > adult.data
4 |
5 | # 첫 열줄을 보여준다.
6 | head adult.data
7 |
8 | # 마지막 10줄을 보여준다.
9 | tail adult.data
10 |
11 | # 첫 5줄을 다른 파일에 저장한다.
12 | head -5 adult.data > adult.data.small
13 | cat adult.data.small
14 |
15 | # 콤마 열 분리문자를 탭으로 바꾼후 다른 파일에 저장한다.
16 | tr "," "\t" < adult.data.small > adult.data.small.tab
17 | cat adult.data.small.tab
18 |
19 | # 자료 길이가 몇줄인지 보여준다. (32562)
20 | wc -l adult.data
21 |
22 | # 간단히 줄이기
23 | head -5 adult.data | tr "," "\t" > adult.data.small.tab
24 |
25 | # 직업군(work class)의 도수분포
26 | cut -d ',' -f 2 < adult.data | sort | uniq -c | sort -nr
27 |
--------------------------------------------------------------------------------
/ch03-data-processing/ch03-data-processing.sql:
--------------------------------------------------------------------------------
1 | -- SQL 연습 문제 해답
2 |
3 | select EmployeeID, count(*) n
4 | from Orders
5 | group by EmployeeID
6 | order by n desc;
7 |
8 |
9 | select a.EmployeeID,
10 | b.FirstName,
11 | b.LastName,
12 | count(*) n
13 | from Orders a
14 | inner join Employees b
15 | on a.EmployeeID = b.EmployeeID
16 | group by a.EmployeeID
17 | order by n desc;
18 |
19 |
20 | select a.OrderID,
21 | a.OrderDate,
22 | sum(Quantity) as n_items,
23 | sum(Quantity*c.Price) as total_price
24 | from Orders a
25 | inner join OrderDetails b
26 | on a.OrderID = b.OrderID
27 | inner join Products c
28 | on b.ProductID = c.ProductID
29 | group by a.OrderID;
30 |
31 |
32 |
--------------------------------------------------------------------------------
/ch04-data-visualization/ch04-data-visualization.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | library(gridExtra)
3 | library(gapminder)
4 |
5 | # install.packages("gapminder")
6 | help(package = "gapminder")
7 | library(gapminder)
8 | ?gapminder
9 | gapminder
10 |
11 | head(gapminder)
12 |
13 | tail(gapminder)
14 |
15 | library(dplyr)
16 | glimpse(gapminder)
17 |
18 |
19 | gapminder$lifeExp
20 | gapminder$gdpPercap
21 | gapminder[, c('lifeExp', 'gdpPercap')]
22 | gapminder %>% select(gdpPercap, lifeExp)
23 |
24 | # 요약통계량과 상관관계
25 | summary(gapminder$lifeExp)
26 | summary(gapminder$gdpPercap)
27 | cor(gapminder$lifeExp, gapminder$gdpPercap)
28 |
29 |
30 | # 베이스 패키지 시각화
31 | #@ 4.1
32 | png("../plots/4-1.png", 5.5, 4, units='in', pointsize=9, res=600)
33 | opar = par(mfrow=c(2,2))
34 | hist(gapminder$lifeExp)
35 | hist(gapminder$gdpPercap, nclass=50)
36 | # hist(sqrt(gapminder$gdpPercap), nclass=50)
37 | hist(log10(gapminder$gdpPercap), nclass=50)
38 | plot(log10(gapminder$gdpPercap), gapminder$lifeExp, cex=.5)
39 | par(opar)
40 | dev.off()
41 |
42 |
43 | cor(gapminder$lifeExp, log10(gapminder$gdpPercap))
44 |
45 | # 앤스콤의 사인방(Anscombe's quartet)
46 | # https://en.wikipedia.org/wiki/Anscombe%27s_quartet
47 | # https://commons.wikimedia.org/wiki/File:Anscombe%27s_quartet_3.svg
48 | svg("Anscombe's quartet 3.svg", width=11, height=8)
49 | op <- par(las=1, mfrow=c(2,2), mar=1.5+c(4,4,1,1), oma=c(0,0,0,0),
50 | lab=c(6,6,7), cex.lab=2.0, cex.axis=1.3, mgp=c(3,1,0))
51 | ff <- y ~ x
52 | for(i in 1:4) {
53 | ff[[2]] <- as.name(paste("y", i, sep=""))
54 | ff[[3]] <- as.name(paste("x", i, sep=""))
55 | lmi <- lm(ff, data= anscombe)
56 | xl <- substitute(expression(x[i]), list(i=i))
57 | yl <- substitute(expression(y[i]), list(i=i))
58 | plot(ff, data=anscombe, col="red", pch=21, cex=2.4, bg = "orange",
59 | xlim=c(3,19), ylim=c(3,13)
60 | , xlab=eval(xl), ylab=yl # for version 3
61 | )
62 | abline(lmi, col="blue")
63 | }
64 | par(op)
65 | dev.off()
66 |
67 | # gapminder 예제의 시각화를 ggplot2로 해보자
68 | library(ggplot2)
69 | library(dplyr)
70 | gapminder %>% ggplot(aes(x=lifeExp)) + geom_histogram()
71 | gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram()
72 | gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram() +
73 | scale_x_log10()
74 | gapminder %>% ggplot(aes(x=gdpPercap, y=lifeExp)) + geom_point() +
75 | scale_x_log10() + geom_smooth()
76 |
77 | library(gridExtra)
78 | p1 <- gapminder %>% ggplot(aes(x=lifeExp)) + geom_histogram()
79 | p2 <- gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram()
80 | p3 <- gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram() +
81 | scale_x_log10()
82 | p4 <- gapminder %>% ggplot(aes(x=gdpPercap, y=lifeExp)) + geom_point() +
83 | scale_x_log10() + geom_smooth()
84 | g <- arrangeGrob(p1, p2, p3, p4, ncol=2)
85 | ggsave("../plots/4-3.png", g, width=5.5, height=4, units='in', dpi=600)
86 |
87 |
88 |
89 |
90 |
91 | library(ggplot2)
92 | ?ggplot
93 | example(ggplot)
94 |
95 | df <- data.frame(gp = factor(rep(letters[1:3], each = 10)),
96 | y = rnorm(30))
97 | glimpse(df)
98 |
99 | ds <- df %>% group_by(gp) %>% summarize(mean = mean(y), sd = sd(y))
100 | ds
101 |
102 |
103 | ggplot(df, aes(x = gp, y = y)) +
104 | geom_point() +
105 | geom_point(data = ds, aes(y = mean),
106 | colour = 'red', size = 3)
107 |
108 |
109 | ggplot(df) +
110 | geom_point(aes(x = gp, y = y)) +
111 | geom_point(data = ds, aes(x = gp, y = mean),
112 | colour = 'red', size = 3)
113 |
114 |
115 | ggplot() +
116 | geom_point(data = df, aes(x = gp, y = y)) +
117 | geom_point(data = ds, aes(x = gp, y = mean),
118 | colour = 'red', size = 3) +
119 | geom_errorbar(data = ds, aes(x = gp,
120 | ymin = mean - sd, ymax = mean + sd),
121 | colour = 'red', width = 0.4)
122 |
123 |
124 | ggplot(gapminder, aes(lifeExp)) + geom_histogram()
125 | gapminder %>% ggplot(aes(lifeExp)) + geom_histogram()
126 |
127 |
128 | ?diamonds
129 | ?mpg
130 | glimpse(diamonds)
131 | glimpse(mpg)
132 |
133 | # 1. 한 수량형 변수
134 |
135 | library(gapminder)
136 | library(ggplot2)
137 | library(dplyr)
138 | gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram()
139 | gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram() +
140 | scale_x_log10()
141 | gapminder %>% ggplot(aes(x=gdpPercap)) + geom_freqpoly() +
142 | scale_x_log10()
143 | gapminder %>% ggplot(aes(x=gdpPercap)) + geom_density() +
144 | scale_x_log10()
145 |
146 |
147 | #@ 4.4
148 | p1 <- gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram()
149 | p2 <- gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram() +
150 | scale_x_log10()
151 | p3 <- gapminder %>% ggplot(aes(x=gdpPercap)) + geom_freqpoly() +
152 | scale_x_log10()
153 | p4 <- gapminder %>% ggplot(aes(x=gdpPercap)) + geom_density() +
154 | scale_x_log10()
155 | g <- arrangeGrob(p1, p2, p3, p4, ncol=2)
156 | ggsave("../plots/4-4.png", g, width=6, height=4, units='in', dpi=600)
157 |
158 | summary(gapminder)
159 |
160 |
161 | # 2. 한 범주형 변수
162 |
163 | #@ 4.5
164 | diamonds %>% ggplot(aes(cut)) + geom_bar()
165 | ggsave("../plots/4-5.png", width=5.5, height=4, units='in', dpi=600)
166 |
167 | table(diamonds$cut)
168 |
169 | prop.table(table(diamonds$cut))
170 |
171 | round(prop.table(table(diamonds$cut))*100, 1)
172 |
173 | diamonds %>%
174 | group_by(cut) %>%
175 | tally() %>%
176 | mutate(pct = round(n / sum(n) * 100, 1))
177 |
178 |
179 | # 3. 두 수량형 변수
180 |
181 | diamonds %>% ggplot(aes(carat, price)) + geom_point()
182 | diamonds %>% ggplot(aes(carat, price)) + geom_point(alpha=.01)
183 | mpg %>% ggplot(aes(cyl, hwy)) + geom_point()
184 | mpg %>% ggplot(aes(cyl, hwy)) + geom_jitter()
185 |
186 |
187 | set.seed(1704)
188 | p1 <- diamonds %>% ggplot(aes(carat, price)) + geom_point()
189 | p2 <- diamonds %>% ggplot(aes(carat, price)) + geom_point(alpha=.01)
190 | p3 <- mpg %>% ggplot(aes(cyl, hwy)) + geom_point()
191 | p4 <- mpg %>% ggplot(aes(cyl, hwy)) + geom_jitter()
192 | ggsave("../plots/4-6.png", arrangeGrob(p1, p2, p3, p4, ncol=2),
193 | width=5.5, height=4, units='in', dpi=600)
194 |
195 |
196 | pairs(diamonds %>% sample_n(1000))
197 |
198 | png("../plots/4-7.png", 5.5*1.2, 4*1.2, units='in', pointsize=9, res=400)
199 | set.seed(1704)
200 | pairs(diamonds %>% sample_n(1000))
201 | dev.off()
202 |
203 | # 4. 수량형 변수와 범주형 변수
204 |
205 | mpg %>% ggplot(aes(class, hwy)) + geom_boxplot()
206 | ggsave("../plots/4-8.png", width=5.5, height=4, units='in', dpi=600)
207 |
208 |
209 | mpg %>% ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
210 | geom_boxplot(alpha=.5)
211 |
212 | mpg %>% mutate(class=reorder(class, hwy, median)) %>%
213 | ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
214 | geom_boxplot(alpha=.5)
215 |
216 | mpg %>%
217 | mutate(class=factor(class, levels=
218 | c("2seater", "subcompact", "compact", "midsize",
219 | "minivan", "suv", "pickup"))) %>%
220 | ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
221 | geom_boxplot(alpha=.5)
222 |
223 | mpg %>%
224 | mutate(class=factor(class, levels=
225 | c("2seater", "subcompact", "compact", "midsize",
226 | "minivan", "suv", "pickup"))) %>%
227 | ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
228 | geom_boxplot(alpha=.5) + coord_flip()
229 |
230 |
231 | set.seed(1704)
232 | p1 <- mpg %>% ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
233 | geom_boxplot(alpha=.5)
234 | p2 <- mpg %>% mutate(class=reorder(class, hwy, median)) %>%
235 | ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
236 | geom_boxplot(alpha=.5)
237 | p3 <- mpg %>%
238 | mutate(class=factor(class, levels=
239 | c("2seater", "subcompact", "compact", "midsize",
240 | "minivan", "suv", "pickup"))) %>%
241 | ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
242 | geom_boxplot(alpha=.5)
243 | p4 <- mpg %>%
244 | mutate(class=factor(class, levels=
245 | c("2seater", "subcompact", "compact", "midsize",
246 | "minivan", "suv", "pickup"))) %>%
247 | ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
248 | geom_boxplot(alpha=.5) + coord_flip()
249 | ggsave("../plots/4-9.png", arrangeGrob(p1, p2, p3, p4, ncol=2),
250 | width=5.5*2, height=4*1.5, units='in', dpi=400)
251 |
252 |
253 |
254 | # 5. 두 범주형 변수
255 |
256 | glimpse(data.frame(Titanic))
257 |
258 | xtabs(Freq ~ Class + Sex + Age + Survived, data.frame(Titanic))
259 |
260 |
261 | ?Titanic
262 | Titanic
263 |
264 |
265 | mosaicplot(Titanic, main = "Survival on the Titanic")
266 |
267 | mosaicplot(Titanic, main = "Survival on the Titanic", color=TRUE)
268 |
269 | png("../plots/4-10.png", 5.5, 4, units='in', pointsize=9, res=600)
270 | mosaicplot(Titanic, main = "Survival on the Titanic", color=TRUE)
271 | dev.off()
272 |
273 | # 아이들 사이에 생존률이 더 높을까?
274 | apply(Titanic, c(3, 4), sum)
275 |
276 | round(prop.table(apply(Titanic, c(3, 4), sum), margin = 1),3)
277 |
278 | # 남-녀 생존률의 비교
279 | apply(Titanic, c(2, 4), sum)
280 |
281 | round(prop.table(apply(Titanic, c(2, 4), sum), margin = 1),3)
282 |
283 |
284 | t2 = data.frame(Titanic)
285 |
286 | t2 %>% group_by(Sex) %>%
287 | summarize(n = sum(Freq),
288 | survivors=sum(ifelse(Survived=="Yes", Freq, 0))) %>%
289 | mutate(rate_survival=survivors/n)
290 |
291 |
292 | # 6. 더 많은 변수를 보여주는 기술 (1): 각 geom 의 다른 속성들을 사용한다.
293 |
294 | gapminder %>% filter(year==2007) %>%
295 | ggplot(aes(gdpPercap, lifeExp)) +
296 | geom_point() + scale_x_log10() +
297 | ggtitle("Gapminder data for 2007")
298 |
299 |
300 | gapminder %>% filter(year==2002) %>%
301 | ggplot(aes(gdpPercap, lifeExp)) +
302 | geom_point(aes(size=pop, col=continent)) + scale_x_log10() +
303 |   ggtitle("Gapminder data for 2002")
304 |
305 | p1 <- gapminder %>% filter(year==2007) %>%
306 | ggplot(aes(gdpPercap, lifeExp)) +
307 | geom_point() + scale_x_log10() +
308 | ggtitle("Gapminder data for 2007")
309 | p2 <- gapminder %>% filter(year==2002) %>%
310 | ggplot(aes(gdpPercap, lifeExp)) +
311 | geom_point(aes(size=pop, col=continent)) + scale_x_log10() +
312 |   ggtitle("Gapminder data for 2002")
313 | ggsave("../plots/4-11.png", arrangeGrob(p1, p2, ncol=2),
314 | width=5.5*1.7, height=4, units='in', dpi=600)
315 |
316 | # 7. 더 많은 변수를 보여주는 기술 (2). facet_* 함수를 사용한다.
317 |
318 | gapminder %>%
319 | ggplot(aes(year, lifeExp, group=country)) +
320 | geom_line()
321 |
322 |
323 | gapminder %>%
324 | ggplot(aes(year, lifeExp, group=country, col=continent)) +
325 | geom_line()
326 |
327 |
328 | gapminder %>%
329 | ggplot(aes(year, lifeExp, group=country)) +
330 | geom_line() +
331 | facet_wrap(~ continent)
332 |
333 | p1 <- gapminder %>%
334 | ggplot(aes(year, lifeExp, group=country)) +
335 | geom_line()
336 | p2 <- gapminder %>%
337 | ggplot(aes(year, lifeExp, group=country, col=continent)) +
338 | geom_line()
339 | p3 <- gapminder %>%
340 | ggplot(aes(year, lifeExp, group=country)) +
341 | geom_line() +
342 | facet_wrap(~ continent)
343 | ggsave("../plots/4-12.png", arrangeGrob(p1, p2, p3, ncol=2),
344 | width=5.5*2, height=4*2, units='in', dpi=150)
345 |
346 |
347 |
--------------------------------------------------------------------------------
/ch04-data-visualization/ch04-data-visualization.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
--------------------------------------------------------------------------------
/ch05-coding-style.R:
--------------------------------------------------------------------------------
1 | # 지저분한 코드 예
2 | sc<-function(x,y,verbose=TRUE) {
3 | n<-length(x)
4 | if(n<=1||n!=length(y)) stop("Arguments x and y have different lengths: ",length(x)," and ",length(y),".")
5 | if(TRUE%in%is.na(x)||TRUE%in%is.na(y)) stop(" Arguments x and y must not have missing values.")
6 | cv<-var(x,y)
7 | if(verbose) cat("Covariance = ",round(cv,4),".\n",sep= "")
8 | return(cv)
9 | }
10 |
11 |
12 |
13 | # 깨끗한 코드 예
14 | CalculateSampleCovariance <- function(x, y, verbose = TRUE) {
15 | # Computes the sample covariance between two vectors.
16 | #
17 | # Args:
18 | # x: One of two vectors whose sample covariance is to be calculated.
19 | # y: The other vector. x and y must have the same length, greater than one,
20 | # with no missing values.
21 | # verbose: If TRUE, prints sample covariance; if not, not. Default is TRUE.
22 | #
23 | # Returns:
24 | # The sample covariance between x and y.
25 | n <- length(x)
26 | # Error handling
27 | if (n <= 1 || n != length(y)) {
28 | stop("Arguments x and y have different lengths: ",
29 | length(x), " and ", length(y), ".")
30 | }
31 | if (TRUE %in% is.na(x) || TRUE %in% is.na(y)) {
32 | stop(" Arguments x and y must not have missing values.")
33 | }
34 | covariance <- var(x, y)
35 | if (verbose)
36 | cat("Covariance = ", round(covariance, 4), ".\n", sep = "")
37 | return(covariance)
38 | }
39 |
--------------------------------------------------------------------------------
/ch06-statistics-concepts/ch06-statistics-concepts.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | library(gridExtra)
3 |
4 |
5 | # 약제 1을 복용했을 때 수면시간의 증가 (단위는 시간이다)
6 |
7 | y <- sleep$extra[sleep$group == 1]
8 | y
9 |
10 | summary(y)
11 | sd(y)
12 |
13 | par(mfrow=c(2,2))
14 | hist(y)
15 | boxplot(y)
16 | qqnorm(y); qqline(y)
17 | hist(y, prob=TRUE)
18 | lines(density(y), lty=2)
19 |
20 |
21 | png("../plots/6-1.png", 5.5, 4, units='in', pointsize=9, res=600)
22 | y <- sleep$extra[sleep$group == 1]
23 | opar <- par(mfrow=c(2,2))
24 | hist(y)
25 | boxplot(y)
26 | qqnorm(y); qqline(y)
27 | hist(y, prob=TRUE)
28 | lines(density(y), lty=2)
29 | par(opar)
30 | dev.off()
31 |
32 | # '일변량 t-검정(one-sample t-test)'
33 | t.test(y)
34 |
35 | t.test(y, alternative="greater")
36 |
37 |
38 | # 개개인의 수면시간증가값 모형
39 | # 평균이 0이고, 표준편차가 1.8(시간)인 종 모양의 분포(bell shaped distribution)
40 | # N(0, 1.8^2)
41 | curve(dnorm(x, 0, 1.8), -4, 4)
42 |
43 | png("../plots/6-2.png", 5.5, 4, units='in', pointsize=9, res=600)
44 | curve(dnorm(x, 0, 1.8), -4, 4)
45 | dev.off()
46 |
47 |
48 | # 크기가 10개인 새로운 표본
49 | options(digits = 3)
50 | set.seed(1606)
51 | (y_star <- rnorm(10, 0, 1.8))
52 | mean(y_star-0); sd(y_star)
53 | (t_star <- mean(y_star-0) / (sd(y_star)/sqrt(length(y_star))))
54 |
55 |
56 | (y_star <- rnorm(10, 0, 1.8))
57 | mean(y_star-0); sd(y_star)
58 | (t_star <- mean(y_star-0) / (sd(y_star)/sqrt(length(y_star))))
59 |
60 |
61 | (y_star <- rnorm(10, 0, 1.8))
62 | mean(y_star-0); sd(y_star)
63 | (t_star <- mean(y_star-0) / (sd(y_star)/sqrt(length(y_star))))
64 |
65 |
66 | # 10,000개의 평행우주의 표본 (각 표본은 10개의 관측치를 포함한다)
67 | # , 그리고 각 표본의 평균값, 표본표준편차, 그리고 t-통계량 값을 계산할 수 있다:
68 |
69 | set.seed(1606)
70 | B <- 1e4
71 | n <- 10
72 | xbars_star <- rep(NA, B)
73 | sds_star <- rep(NA, B)
74 | ts_star <- rep(NA, B)
75 | for(b in 1:B){
76 | y_star <- rnorm(n, 0, 1.789)
77 | m <- mean(y_star)
78 | s <- sd(y_star)
79 | xbars_star[b] <- m
80 | sds_star[b] <- s
81 | ts_star[b] <- m / (s/sqrt(n))
82 | }
83 |
84 |
85 | opar <- par(mfrow=c(2,2))
86 | hist(xbars_star, nclass=100)
87 | abline(v = 0.75, col='red')
88 | hist(sds_star, nclass=100)
89 | abline(v = 1.789, col='red')
90 | hist(ts_star, nclass=100)
91 | abline(v = 1.3257, col='red')
92 | qqnorm(ts_star); qqline(ts_star)
93 | par(opar)
94 |
95 | png("../plots/6-3.png", 5.5*.8, 4, units='in', pointsize=9, res=600)
96 | opar <- par(mfrow=c(2,2))
97 | hist(xbars_star, nclass=100)
98 | abline(v = 0.75, col='red')
99 | hist(sds_star, nclass=100)
100 | abline(v = 1.789, col='red')
101 | hist(ts_star, nclass=100)
102 | abline(v = 1.3257, col='red')
103 | qqnorm(ts_star); qqline(ts_star)
104 | par(opar)
105 | dev.off()
106 |
107 |
108 | # 우리가 관측한 t-통계량 값 1.3257은 시뮬레이션 분포에서 어디에 있는가?
109 |
110 | length(which(ts_star > 1.3257)) / B
111 |
112 | # 스튜던트 t 분포
113 | # 다양한 자유도 값에 따른 t 밀도함수
114 | # https://en.wikipedia.org/wiki/Student%27s_t-distribution
115 | # Credit: 권용찬
116 | nlist=c(1,2,5)
117 | x <- seq(-5, 5, 0.05)
118 | y <- matrix(0, nrow=length(x), ncol=length(nlist))
119 | plot(x, type="n", xlab="x", ylab="P(x)",
120 | xlim=c(-5,5), ylim=c(0, 0.45))
121 | for( i in 1:length(nlist)){
122 | y[,i] <- dt(x, df=nlist[i])
123 | lines(x, y[,i], col=i, lwd=2)
124 | }
125 | lines(x, dnorm(x), col=4, lwd=2)
126 | legend_text <- c(expression(paste(nu,"=1 ")),
127 | expression(paste(nu,"=2 ")),
128 | expression(paste(nu,"=5 ")),
129 | expression(paste(nu,"=",infinity)))
130 | legend("topright", legend=legend_text, lty=1, lwd=2, col=c(1:3,4),
131 | inset=.05)
132 |
133 |
134 | png("../plots/6-4.png", 5.5, 4, units='in', pointsize=9, res=600)
135 | nlist=c(1,2,5)
136 | x <- seq(-5, 5, 0.05)
137 | y <- matrix(0, nrow=length(x), ncol=length(nlist))
138 | plot(x, type="n", xlab="x", ylab="P(x)",
139 | xlim=c(-5,5), ylim=c(0, 0.45))
140 | for( i in 1:length(nlist)){
141 | y[,i] <- dt(x, df=nlist[i])
142 | lines(x, y[,i], col=i, lwd=2)
143 | }
144 | lines(x, dnorm(x), col=4, lwd=2)
145 | legend_text <- c(expression(paste(nu,"=1 ")),
146 | expression(paste(nu,"=2 ")),
147 | expression(paste(nu,"=5 ")),
148 | expression(paste(nu,"=",infinity)))
149 | legend("topright", legend=legend_text, lty=1, lwd=2, col=c(1:3,4), inset=.05)
150 | dev.off()
151 |
152 |
153 |
154 |
155 | # 8. 신뢰구간의 의미
156 | set.seed(1606)
157 | (y_star <- rnorm(10, 1, 1.8))
158 | t.test(y_star)$conf.int
159 | (y_star <- rnorm(10, 1, 1.8))
160 | t.test(y_star)$conf.int
161 | (y_star <- rnorm(10, 1, 1.8))
162 | t.test(y_star)$conf.int
163 |
164 |
165 | library(tidyverse)
166 | set.seed(1606)
167 | B = 1e2
168 | conf_intervals <-
169 | data.frame(b=rep(NA, B),
170 | lower=rep(NA, B),
171 | xbar=rep(NA, B),
172 | upper=rep(NA, B))
173 | true_mu <- 1.0
174 | for(b in 1:B){
175 | (y_star <- rnorm(10, true_mu, 1.8))
176 | conf_intervals[b, ] = c(b=b,
177 | lower=t.test(y_star)$conf.int[1],
178 | xbar=mean(y_star),
179 | upper=t.test(y_star)$conf.int[2])
180 | }
181 | conf_intervals <- conf_intervals %>%
182 | mutate(lucky = (lower <= true_mu & true_mu <= upper))
183 |
184 | glimpse(conf_intervals)
185 | table(conf_intervals$lucky)
186 | conf_intervals %>% ggplot(aes(b, xbar, col=lucky)) +
187 | geom_point() +
188 | geom_errorbar(aes(ymin=lower, ymax=upper)) +
189 | geom_hline(yintercept=true_mu, col='red')
190 | ggsave("../plots/6-6.png", width=5.5, height=4, units='in', dpi=600)
191 |
192 |
193 |
194 | # 6.10.2. 중심극한정리
195 |
196 | hist(c(0, 1), nclass=100, prob=TRUE, main='Individual sleep time increase')
197 | set.seed(1606)
198 | B <- 1e4
199 | n <- 10
200 | xbars_star= rep(NA, B)
201 | for(b in 1:B){
202 | xbars_star[b] <- mean(sample(c(0,1), size=n, replace=TRUE))
203 | }
204 | hist(xbars_star, nclass=100, main='Sample mean of 10 obs')
205 |
206 | png("../plots/6-8.png", 5.5, 4*.8, units='in', pointsize=9, res=600)
207 | opar = par(mfrow=c(1,2))
208 | hist(c(0, 1), nclass=100, prob=TRUE, main='Individual sleep time increase')
209 | hist(xbars_star, nclass=100, main='Sample mean of 10 obs')
210 | par(opar)
211 | dev.off()
212 |
213 | # 6.11. 모수추정의 정확도는 sqrt(n)에 비례한다.
214 | diff(t.test(y)$conf.int)
215 | mean(y)
216 | diff(t.test(y)$conf.int)/2
217 |
218 | # 자료의 incremental 가치
219 | png("../plots/6-9.png", 5.5, 4*.8, units='in', pointsize=9, res=600)
220 | opar = par(mfrow=c(1,2))
221 | curve(1/sqrt(x), 1, 1000, log='x', main='s.e. vs sample size')
222 | curve((1/sqrt(x) - 1/sqrt(x+10)) / (1/sqrt(x)), 1, 1000, log='x',
223 | main='% decrease in s.e. \nwhen adding 10 obs')
224 | par(opar)
225 | dev.off()
226 |
--------------------------------------------------------------------------------
/ch06-statistics-concepts/ch06-statistics-concepts.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
--------------------------------------------------------------------------------
/ch06-statistics-concepts/ch06-statistics-concepts.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#!/usr/bin/env python\n",
10 | "import pandas as pd\n",
11 | "import numpy as np\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import matplotlib.mlab as mlab\n",
14 | "import scipy.stats as stats\n",
15 | "import seaborn as sns\n",
16 | "\n",
17 | "%matplotlib inline"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": null,
23 | "metadata": {},
24 | "outputs": [],
25 | "source": [
26 | "# sleep 데이터 [Scheffe (1959)]\n",
27 | "data = {\"no.\": range(1, 21),\n",
28 | " \"extra\": [0.7, -1.6, -0.2, -1.2, -0.1,\n",
29 | " 3.4, 3.7, 0.8, 0.0, 2.0,\n",
30 | " 1.9, 0.8, 1.1, 0.1, -0.1,\n",
31 | " 4.4, 5.5, 1.6, 4.6, 3.4],\n",
32 | " \"group\": [(x // 10) + 1 for x in range(20)],\n",
33 | " \"ID\": list(range(1, 11)) * 2}\n",
34 | " \n",
35 | "sleep = pd.DataFrame(data=data,\n",
36 | " columns=[\"no.\", \"extra\", \"group\", \"ID\"])\n",
37 | "sleep.set_index(\"no.\")\n",
38 | "sleep"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "# 약제 1을 복용했을 때 수면시간의 증가 (단위는 시간이다)\n",
48 | "y = sleep.loc[sleep[\"group\"]==1][\"extra\"]\n",
49 | "print(y)\n",
50 | "\n",
51 | "y.describe()"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": null,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "# histogram parameters\n",
61 | "# note that the bins are inclusive of their lower bounds,\n",
62 | "# and exclusive of their upper bounds\n",
63 | "bin_width = 1\n",
64 | "y_min = np.rint(y.min())\n",
65 | "y_max = np.rint(y.max()) + 1\n",
66 | "\n",
67 | "# set space between subplots\n",
68 | "fig = plt.figure(figsize=(10, 10))\n",
69 | "fig.subplots_adjust(hspace=.5, wspace=.5)\n",
70 | "\n",
71 | "# histogram\n",
72 | "ax1 = fig.add_subplot(221)\n",
73 | "ax1.set_title(\"Histogram of y\")\n",
74 | "ax1.set_xlabel(\"y\")\n",
75 | "ax1.set_ylabel(\"Frequency\")\n",
76 | "sns.distplot(y, ax=ax1, bins=np.arange(y_min, y_max, bin_width),\n",
77 | " kde=False)\n",
78 | "ax1.set_xticks(np.arange(y_min, y_max))\n",
79 | "ax1.set_xlim(y_min, y_max-1)\n",
80 | "\n",
81 | "# boxplot\n",
82 | "ax2 = fig.add_subplot(222)\n",
83 | "ax2.set_title(\"Boxplot of y\")\n",
84 | "ax2.set_xlabel(\"group\")\n",
85 | "ax2.set_ylabel(\"extra\")\n",
86 | "sns.boxplot(data=y, ax=ax2)\n",
87 | "ax2.set_yticks(np.arange(y_min, y_max))\n",
88 | "\n",
89 | "# normal q-q plot\n",
90 | "ax3 = fig.add_subplot(223)\n",
91 | "#plt.title(\"Normal Q-Q plot\")\n",
92 | "z = (y-np.mean(y)) / np.std(y)\n",
93 | "res = stats.probplot(z, dist=\"norm\", plot=plt)\n",
94 | "\n",
95 | "# probability density \n",
96 | "ax4 = fig.add_subplot(224)\n",
97 | "ax4.set_title(\"Histogram of y\")\n",
98 | "ax4.set_xlabel(\"y\")\n",
99 | "ax4.set_ylabel(\"Density\")\n",
100 | "sns.distplot(y, ax=ax4, bins=np.arange(y_min, y_max),\n",
101 | " hist=True)\n",
102 | "ax4.set_xticks(np.arange(y_min, y_max))\n",
103 | "ax4.set_xlim(y_min, y_max-1)\n",
104 | "\n",
105 | "plt.show()"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": null,
111 | "metadata": {},
112 | "outputs": [],
113 | "source": [
114 | "# perform t-test on y\n",
115 | "t, pvalue = stats.ttest_1samp(y, .0)\n",
116 | "# calculate confidence interval\n",
117 | "confidence_level = 0.95\n",
118 | "low, high = stats.t.interval(confidence_level, len(y)-1, \n",
119 | " loc=np.mean(y), scale=stats.sem(y))\n",
120 | "\n",
121 | "print(\"data: y\")\n",
122 | "print(\"t = % .4f, p-value = %.4f\" % (t, pvalue))\n",
123 | "print(\"alternative hypothesis: true mean is not equal to 0\")\n",
124 | "print(\"%d percent confidence interval:\\n%.7f\\t%.7f\" % (confidence_level*100, low, high))"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": null,
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "# 개개인의 수면시간증가값 모형\n",
134 | "# 평균이 0이고, 표준편차가 1.8(시간)인 종 모양의 분포(bell shaped distribution)\n",
135 | "# N(0, 1.8^2)\n",
136 | "mu = 0\n",
137 | "variance = 1.8**2\n",
138 | "sigma = 1.8\n",
139 | "np.random.seed(1708)\n",
140 | "\n",
141 |     "x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)\n",
142 | "\n",
143 | "fig, ax = plt.subplots()\n",
144 |     "ax.plot(x, stats.norm.pdf(x, mu, sigma))\n",
145 | "ax.set_xlim(-4, 4)\n",
146 | "\n",
147 | "plt.show()"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {},
154 | "outputs": [],
155 | "source": [
156 | "# 10,000개의 평행우주의 표본, (각 표본은 10개의 관측치를 포함한다)\n",
157 | "# 그리고 각 표본의 평균값, 표본표준편차, 그리고 t-통계량 값을 계산할 수 있다\n",
158 | "B = 10000\n",
159 | "n = 10\n",
160 | "\n",
161 | "mu = 0\n",
162 |     "variance = 1.789**2\n",
163 | "sigma = 1.789\n",
164 | "np.random.seed(1708)\n",
165 | "\n",
166 | "xbars_star = np.zeros(B, dtype=float)\n",
167 | "sds_star = np.zeros(B, dtype=float)\n",
168 | "ts_star = np.zeros(B, dtype=float)\n",
169 | "for b in range(B):\n",
170 | " y_star = np.random.normal(mu, sigma, n)\n",
171 | " m = y_star.mean()\n",
172 |     "    s = y_star.std(ddof=1)\n",
173 | " xbars_star[b] = m\n",
174 | " sds_star[b] = s\n",
175 | " ts_star[b] = (m / (s / np.sqrt(n)))"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "# set space between subplots\n",
185 | "plt.figure(figsize=(10, 10))\n",
186 | "plt.subplots_adjust(hspace=.5, wspace=.5)\n",
187 | "\n",
188 | "# histogram\n",
189 | "plt.subplot(221)\n",
190 | "plt.title(\"Histogram of xbars_star\")\n",
191 | "plt.xlabel(\"xbars_star\")\n",
192 | "plt.ylabel(\"Frequency\")\n",
193 | "sns.distplot(xbars_star.T, kde=False)\n",
194 | "plt.axvline(x=0.75)\n",
195 | "\n",
196 | "plt.subplot(222)\n",
197 | "plt.title(\"Histogram of sds_star\")\n",
198 | "plt.xlabel(\"sds_star\")\n",
199 | "plt.ylabel(\"Frequency\")\n",
200 | "sns.distplot(sds_star, kde=False)\n",
201 | "plt.axvline(x=1.789)\n",
202 | "\n",
203 | "plt.subplot(223)\n",
204 | "plt.title(\"Histogram of ts_star\")\n",
205 | "plt.xlabel(\"ts_star\")\n",
206 | "plt.ylabel(\"Frequency\")\n",
207 | "sns.distplot(ts_star, kde=False)\n",
208 | "plt.axvline(x=1.3257)\n",
209 | "\n",
210 | "# normal q-q plot\n",
211 | "plt.subplot(224)\n",
212 | "z = (ts_star-np.mean(ts_star)) / np.std(ts_star)\n",
213 | "stats.probplot(z, dist=\"norm\", plot=plt)\n",
214 | "\n",
215 | "plt.show()\n",
216 | "\n",
217 | "# Calculate p-value manually\n",
218 | "pvalue = np.argwhere(ts_star > 1.3257).size / B\n",
219 | "print(pvalue)"
220 | ]
221 | },
222 | {
223 | "cell_type": "code",
224 | "execution_count": null,
225 | "metadata": {},
226 | "outputs": [],
227 | "source": [
228 | "# 스튜던트 t 분포\n",
229 | "# 다양한 자유도 값에 따른 t 밀도함수\n",
230 | "INF = 999\n",
231 | "nlist = np.array([1, 2, 5, INF])\n",
232 | "x = np.arange(-5.0, 5.05, 0.05)\n",
233 | "\n",
234 | "fig, ax = plt.subplots()\n",
235 | "ax.set_xlim(-5, 5)\n",
236 | "ax.set_ylim(0, 0.45)\n",
237 | "\n",
238 | "for df in nlist:\n",
239 | " ax.plot(x, stats.t.pdf(x, df), label=\"df={}\".format(df))\n",
240 | "\n",
241 | "ax.legend()\n",
242 | "plt.show()"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": null,
248 | "metadata": {},
249 | "outputs": [],
250 | "source": [
251 | "# 8. 신뢰구간의 의미\n",
252 | "\n",
253 | "n = 10\n",
254 | "mu = 1\n",
255 | "sigma = 1.8\n",
256 | "confidence_level = 0.95\n",
257 | "\n",
258 | "# calculate confidence interval on different set of values\n",
259 | "# from the same probability distribution\n",
260 | "np.random.seed(1708)\n",
261 | "y_star = np.random.normal(mu, sigma, n)\n",
262 | "print(stats.t.interval(confidence_level, len(y_star)-1, \n",
263 | " loc=np.mean(y_star), scale=stats.sem(y_star)))\n",
264 | "\n",
265 | "y_star = np.random.normal(mu, sigma, n)\n",
266 | "print(stats.t.interval(confidence_level, len(y_star)-1, \n",
267 | " loc=np.mean(y_star), scale=stats.sem(y_star)))\n",
268 | "\n",
269 | "y_star = np.random.normal(mu, sigma, n)\n",
270 | "print(stats.t.interval(confidence_level, len(y_star)-1, \n",
271 | " loc=np.mean(y_star), scale=stats.sem(y_star)))"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": null,
277 | "metadata": {},
278 | "outputs": [],
279 | "source": [
280 | "# run simulation \n",
281 | "B = 100\n",
282 | "n = 10\n",
283 | "\n",
284 | "conf_intervals = pd.DataFrame(\n",
285 | " data={\n",
286 | " \"b\": np.zeros(B, dtype=int),\n",
287 | " \"lower\": np.zeros(B, dtype=float),\n",
288 | " \"xbar\": np.zeros(B, dtype=float),\n",
289 | " \"upper\": np.zeros(B, dtype=float)\n",
290 | " }\n",
291 | ")\n",
292 | "\n",
293 | "true_mu = 1.0\n",
294 | "sigma = 1.8\n",
295 | "confidence_level = 0.95\n",
296 | "np.random.seed(1708)\n",
297 | "\n",
298 | "plt.figure(figsize=(10, 10))\n",
299 | "\n",
300 | "for b in range(B):\n",
301 | " y_star = np.random.normal(true_mu, sigma, n)\n",
302 | " lower, upper = stats.t.interval(confidence_level, len(y_star)-1, \n",
303 | " loc=np.mean(y_star), scale=stats.sem(y_star))\n",
304 | " conf_intervals.loc[b, \"b\"] = b\n",
305 | " conf_intervals.loc[b, \"lower\"] = lower\n",
306 | " conf_intervals.loc[b, \"xbar\"] = y_star.mean()\n",
307 | " conf_intervals.loc[b, \"upper\"] = upper\n",
308 | " \n",
309 | "for index, row in conf_intervals.iterrows():\n",
310 | " if row[\"lower\"] <= true_mu and true_mu <= row[\"upper\"]: \n",
311 | " conf_intervals.loc[index, \"lucky\"] = True\n",
312 | " else:\n",
313 | " conf_intervals.loc[index, \"lucky\"] = False\n",
314 | "\n",
315 | "lucky = conf_intervals.loc[conf_intervals.loc[:,\"lucky\"]==True]\n",
316 | "unlucky = conf_intervals.loc[conf_intervals.loc[:, \"lucky\"]==False]\n",
317 | "\n",
318 | "plt.scatter(lucky[\"b\"], lucky[\"xbar\"], color=\"b\", label=\"lucky\")\n",
319 | "plt.scatter(unlucky[\"b\"], unlucky[\"xbar\"], color=\"r\", label=\"unlucky\")\n",
320 | "plt.hlines(y=true_mu, xmin=-10, xmax=B+10, lw=1, color=\"r\")\n",
321 | "for index, row in lucky.iterrows():\n",
322 | " x, ymax, ymin = row[\"b\"], row[\"upper\"], row[\"lower\"]\n",
323 | " plt.vlines(x=x, ymin=ymin, ymax=ymax, color=\"b\", lw=1)\n",
324 | "for index, row in unlucky.iterrows():\n",
325 | " x, ymax, ymin = row[\"b\"], row[\"upper\"], row[\"lower\"]\n",
326 | " plt.vlines(x=x, ymin=ymin, ymax=ymax, color=\"r\", lw=1)\n",
327 | " \n",
328 | "plt.xlabel(\"b\")\n",
329 | "plt.ylabel(\"xbar\")\n",
330 | "plt.legend()\n",
331 | "plt.show()"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {},
338 | "outputs": [],
339 | "source": [
340 | "# 6.10.2. 중심극한정리\n",
341 | "np.random.seed(1708)\n",
342 | "B = 10000\n",
343 | "n = 10\n",
344 | "xbars_star = np.zeros(B//n, dtype=float)\n",
345 | "extra = np.random.randint(2, size=B)\n",
346 | "\n",
347 | "fig = plt.figure(figsize=(15, 5))\n",
348 | "fig.subplots_adjust(hspace=.5, wspace=.5)\n",
349 | "\n",
350 | "for b in range(B//n):\n",
351 |     "    xbars_star[b] = np.mean(extra[b*n:(b+1)*n])\n",
352 | " \n",
353 | "ax1 = fig.add_subplot(121)\n",
354 | "ax1.set_title(\"Individual sleep time increase\")\n",
355 | "ax1.set_xlabel(\"c(0, 1)\")\n",
356 | "ax1.set_ylabel(\"Density\")\n",
357 | "sns.distplot(extra, ax=ax1, kde=False)\n",
358 | "\n",
359 | "ax2 = fig.add_subplot(122)\n",
360 | "ax2.set_title(\"Sample mean of 10 obs\")\n",
361 | "ax2.set_xlabel(\"xbars_star\")\n",
362 | "ax2.set_ylabel(\"Frequency\")\n",
363 | "sns.distplot(xbars_star, ax=ax2, kde=False)\n",
364 | "\n",
365 | "plt.show()"
366 | ]
367 | }
368 | ],
369 | "metadata": {
370 | "kernelspec": {
371 | "display_name": "Python 3",
372 | "language": "python",
373 | "name": "python3"
374 | },
375 | "language_info": {
376 | "codemirror_mode": {
377 | "name": "ipython",
378 | "version": 3
379 | },
380 | "file_extension": ".py",
381 | "mimetype": "text/x-python",
382 | "name": "python",
383 | "nbconvert_exporter": "python",
384 | "pygments_lexer": "ipython3",
385 | "version": "3.6.1"
386 | }
387 | },
388 | "nbformat": 4,
389 | "nbformat_minor": 2
390 | }
391 |
--------------------------------------------------------------------------------
/ch07-basic-analysis/ch07-basic-analysis.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | library(gridExtra)
3 |
  4 | mpg <- as_tibble(mpg)
5 | mpg
6 |
7 | # 7.2. 모든 자료에 행해야 할 분석
8 | library(dplyr)
9 | library(ggplot2)
10 | glimpse(mpg)
11 |
12 | head(mpg)
13 |
14 | summary(mpg)
15 |
16 | # 7.3. 수량형 변수의 분석
17 | summary(mpg$hwy)
18 | mean(mpg$hwy)
19 | median(mpg$hwy)
20 | range(mpg$hwy)
21 | quantile(mpg$hwy)
22 |
23 |
24 | png("../plots/7-1.png", 5.5*.8, 4, units='in', pointsize=9, res=600)
25 | opar <- par(mfrow=c(2,2))
26 | hist(mpg$hwy)
27 | boxplot(mpg$hwy)
28 | qqnorm(mpg$hwy)
29 | qqline(mpg$hwy)
30 | par(opar)
31 | dev.off()
32 |
33 |
34 | # 7.3.1. 일변량 t-검정
35 | hwy <- mpg$hwy
36 | n <- length(hwy)
37 | mu0 <- 22.9
38 | t.test(hwy, mu=mu0, alternative = "greater")
39 |
40 |
41 | t.test(hwy)
42 |
43 | # 7.3.2. 이상점과 로버스트 통계방법
44 | c(mean(hwy), sd(hwy))
45 | c(median(hwy), mad(hwy))
46 |
47 |
48 | # 7.4. 성공-실패값 범주형 변수의 분석
49 | set.seed(1606)
50 | n <- 100
51 | p <- 0.5
52 | x <- rbinom(n, 1, p)
53 | x <- factor(x, levels = c(0,1), labels = c("no", "yes"))
54 | x
55 |
56 | table(x)
57 |
58 | prop.table(table(x))
59 |
60 | barplot(table(x))
61 |
62 | binom.test(x=length(x[x=='yes']), n = length(x), p = 0.5, alternative = "two.sided")
63 |
64 |
65 |
66 | binom.test(x=5400, n = 10000)
67 |
68 |
69 | n <- c(100, 1000, 2000, 10000, 1e6)
70 | data.frame(n=n, moe=round(1.96 * sqrt(1/(4 * n)),4))
71 | curve(1.96 * sqrt(1/(4 * x)), 10, 10000, log='x')
72 | grid()
73 |
74 | png("../plots/7-2.png", 5.5, 4, units='in', pointsize=9, res=600)
75 | n <- c(100, 1000, 2000, 10000, 1e6)
76 | data.frame(n=n, moe=round(1.96 * sqrt(1/(4 * n)),4))
77 | curve(1.96 * sqrt(1/(4 * x)), 10, 10000, log='x')
78 | grid()
79 | dev.off()
80 |
81 |
82 | # 7.6. 수량형 X, 수량형 Y의 분석
83 |
84 | ggplot(mpg, aes(cty, hwy)) + geom_jitter() + geom_smooth(method="lm")
85 | ggsave("../plots/7-4.png", width=5.5, height=4, units='in', dpi=600)
86 |
87 | cor(mpg$cty, mpg$hwy)
88 | with(mpg, cor(cty, hwy))
89 | with(mpg, cor(cty, hwy, method = "kendall"))
90 | with(mpg, cor(cty, hwy, method = "spearman"))
91 |
92 |
93 | # 7.6.3. 선형회귀모형 적합
94 |
95 | (hwy_lm <- lm(hwy ~ cty, data=mpg))
96 | summary(hwy_lm)
97 |
98 | predict(hwy_lm)
99 | resid(hwy_lm)
100 | predict(hwy_lm, newdata = data.frame(cty=c(10, 20, 30)))
101 |
102 |
103 |
104 | opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
105 | plot(hwy_lm, las = 1) # Residuals, Fitted, ...
106 | par(opar)
107 |
108 | png("../plots/7-6.png", 5.5, 4*1.2, units='in', pointsize=9, res=600)
109 | opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
110 | plot(hwy_lm, las = 1) # Residuals, Fitted, ...
111 | par(opar)
112 | dev.off()
113 |
114 | # 7.6.6. 로버스트 선형 회귀분석
115 |
116 | library(MASS)
117 | set.seed(123) # make reproducible
118 | lqs(stack.loss ~ ., data = stackloss) # 로버스트
119 | lm(stack.loss ~ ., data = stackloss) # 보통 선형모형
120 |
121 |
122 | # 7.6.7. 비선형/비모수적 방법, 평활법과 LOESS
123 |
124 | plot(hwy ~ displ, data=mpg)
125 | mpg_lo <- loess(hwy ~ displ, data=mpg)
126 | mpg_lo
127 | summary(mpg_lo)
128 | xs <- seq(2,7,length.out = 100)
129 | mpg_pre <- predict(mpg_lo, newdata=data.frame(displ=xs), se=TRUE)
130 | lines(xs, mpg_pre$fit)
131 | lines(xs, mpg_pre$fit - 1.96*mpg_pre$se.fit, lty=2)
132 | lines(xs, mpg_pre$fit + 1.96*mpg_pre$se.fit, lty=2)
133 |
134 | ggplot(mpg, aes(displ, hwy)) +
135 | geom_point() +
136 | geom_smooth()
137 |
138 |
139 | png("../plots/7-8-left.png", 5.5*.8, 4, units='in', pointsize=9, res=600)
140 | plot(hwy ~ displ, data=mpg)
141 | mpg_lo <- loess(hwy ~ displ, data=mpg)
142 | xs <- seq(2,7,length.out = 100)
143 | mpg_pre <- predict(mpg_lo, newdata=data.frame(displ=xs), se=TRUE)
144 | lines(xs, mpg_pre$fit)
145 | lines(xs, mpg_pre$fit - 1.96*mpg_pre$se.fit, lty=2)
146 | lines(xs, mpg_pre$fit + 1.96*mpg_pre$se.fit, lty=2)
147 | dev.off()
148 |
149 | ggplot(mpg, aes(displ, hwy)) +
150 | geom_point() +
151 | geom_smooth()
152 | ggsave("../plots/7-8-right.png", width=5.5*.8, height=4, units='in', dpi=600)
153 |
154 |
155 | # 7.7. 범주형 x, 수량형 y
156 |
157 | mpg %>% ggplot(aes(class, hwy)) + geom_boxplot()
158 | ggsave("../plots/7-9.png", width=5.5, height=4, units='in', dpi=600)
159 |
160 |
161 | (hwy_lm2 <- lm(hwy ~ class, data=mpg))
162 | summary(hwy_lm2)
163 |
164 |
165 | predict(hwy_lm2, newdata=data.frame(class="pickup"))
166 |
167 | opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
168 | plot(hwy_lm2, las = 1) # Residuals, Fitted, ...
169 | par(opar)
170 |
171 | png("../plots/7-10.png", 5.5*.8, 4, units='in', pointsize=9, res=600)
172 | opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
173 | plot(hwy_lm2, las = 1) # Residuals, Fitted, ...
174 | par(opar)
175 | dev.off()
176 |
177 |
178 | # 7.8. 수량형 x, 범주형 y (성공-실패)
179 |
180 | library(gridExtra)
181 | p1 <- ggplot(data.frame(x=c(0, 1)), aes(x)) +
182 | stat_function(fun=function(x) log(x/(1-x))) + ylab('logit(x)') +
183 | ggtitle("Logit function")
184 | p2 <- ggplot(data.frame(y=c(-6, 6)), aes(y)) +
185 | stat_function(fun=function(y) 1/(1+exp(-y))) + ylab('logistic(y)') +
186 | ggtitle("Logistic function")
187 | g <- arrangeGrob(p1, p2, ncol=2)
188 | ggsave("../plots/7-11.png", g, width=5.5*1.5, height=4, units='in', dpi=600)
189 |
190 |
191 | chall <- read.csv('https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/challenger.csv')
192 | chall <- as_tibble(chall)
193 | glimpse(chall)
194 |
195 |
196 | chall %>% ggplot(aes(temperature, distress_ct)) +
197 | geom_point()
198 |
199 | chall %>% ggplot(aes(factor(distress_ct), temperature)) +
200 | geom_boxplot()
201 |
202 |
203 | p1 <- chall %>% ggplot(aes(temperature, distress_ct)) +
204 | geom_point()
205 | p2 <- chall %>% ggplot(aes(factor(distress_ct), temperature)) +
206 | geom_boxplot()
207 | g <- arrangeGrob(p1, p2, ncol=2)
208 | ggsave("../plots/7-12.png", g, width=5.5*1.5, height=4, units='in', dpi=600)
209 |
210 |
211 | (chall_glm <-
212 | glm(cbind(distress_ct, o_ring_ct - distress_ct) ~
213 | temperature, data=chall, family='binomial'))
214 |
215 | summary(chall_glm)
216 |
217 | predict(chall_glm, data.frame(temperature=30))
218 |
219 | exp(3.45) / (exp(3.45) +1)
220 | predict(chall_glm, data.frame(temperature=30), type='response')
221 |
222 |
223 | logistic <- function(x){exp(x)/(exp(x)+1)}
224 |
225 | plot(c(20,85), c(0,1), type = "n", xlab = "temperature",
226 | ylab = "prob")
227 | tp <- seq(20, 85, 1)
228 | chall_glm_pred <-
229 | predict(chall_glm,
230 | data.frame(temperature = tp),
231 | se.fit = TRUE)
232 | lines(tp, logistic(chall_glm_pred$fit))
233 | lines(tp, logistic(chall_glm_pred$fit - 1.96 * chall_glm_pred$se.fit), lty=2)
234 | lines(tp, logistic(chall_glm_pred$fit + 1.96 * chall_glm_pred$se.fit), lty=2)
235 | abline(v=30, lty=2, col='blue')
236 |
237 |
238 | logistic <- function(x){exp(x)/(exp(x)+1)}
239 |
240 | png("../plots/7-13.png", 5.5*.8, 4, units='in', pointsize=9, res=600)
241 | plot(c(20,85), c(0,1), type = "n", xlab = "temperature", ylab = "prob")
242 | tp <- seq(20, 85, 1)
243 | chall_glm_pred <- predict(chall_glm, data.frame(temperature = tp), se.fit = TRUE)
244 | lines(tp, logistic(chall_glm_pred$fit))
245 | lines(tp, logistic(chall_glm_pred$fit - 1.96 * chall_glm_pred$se.fit), lty=2)
246 | lines(tp, logistic(chall_glm_pred$fit + 1.96 * chall_glm_pred$se.fit), lty=2)
247 | abline(v=30, lty=2, col='blue')
248 | dev.off()
249 |
250 |
--------------------------------------------------------------------------------
/ch07-basic-analysis/ch07-basic-analysis.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
--------------------------------------------------------------------------------
/ch07-basic-analysis/correlation-example.R:
--------------------------------------------------------------------------------
1 | # Source: https://commons.wikimedia.org/wiki/File:Correlation_examples2.svg
2 |
3 |
4 | #Title: An example of the correlation of x and y for various distributions of (x,y) pairs
5 | #Tags: Mathematics; Statistics; Correlation
6 | #Author: Denis Boigelot
7 | #Packets needed : mvtnorm (rmvnorm), RSVGTipsDevice (devSVGTips)
8 | #How to use: output()
9 | #
 10 | #This is a translated version in R of a Mathematica 6 code by Imagecreator.
11 |
12 | library(mvtnorm)
13 | library(RSVGTipsDevice)
14 |
15 | MyPlot <- function(xy, xlim = c(-4, 4), ylim = c(-4, 4), eps = 1e-15) {
16 | title = round(cor(xy[,1], xy[,2]), 1)
17 | if (sd(xy[,2]) < eps) title = "" # corr. coeff. is undefined
18 | plot(xy, main = title, xlab = "", ylab = "",
19 | col = "darkblue", pch = 16, cex = 0.2,
20 | xaxt = "n", yaxt = "n", bty = "n",
21 | xlim = xlim, ylim = ylim)
22 | }
23 |
24 | MvNormal <- function(n = 1000, cor = 0.8) {
25 | for (i in cor) {
26 | sd = matrix(c(1, i, i, 1), ncol = 2)
27 | x = rmvnorm(n, c(0, 0), sd)
28 | MyPlot(x)
29 | }
30 | }
31 |
32 | rotation <- function(t, X) return(X %*% matrix(c(cos(t), sin(t), -sin(t), cos(t)), ncol = 2))
33 |
34 | RotNormal <- function(n = 1000, t = pi/2) {
35 | sd = matrix(c(1, 1, 1, 1), ncol = 2)
36 | x = rmvnorm(n, c(0, 0), sd)
37 | for (i in t)
38 | MyPlot(rotation(i, x))
39 | }
40 |
41 | Others <- function(n = 1000) {
42 | x = runif(n, -1, 1)
43 | y = 4 * (x^2 - 1/2)^2 + runif(n, -1, 1)/3
44 | MyPlot(cbind(x,y), xlim = c(-1, 1), ylim = c(-1/3, 1+1/3))
45 |
46 | y = runif(n, -1, 1)
47 | xy = rotation(-pi/8, cbind(x,y))
48 | lim = sqrt(2+sqrt(2)) / sqrt(2)
49 | MyPlot(xy, xlim = c(-lim, lim), ylim = c(-lim, lim))
50 |
51 | xy = rotation(-pi/8, xy)
52 | MyPlot(xy, xlim = c(-sqrt(2), sqrt(2)), ylim = c(-sqrt(2), sqrt(2)))
53 |
54 | y = 2*x^2 + runif(n, -1, 1)
55 | MyPlot(cbind(x,y), xlim = c(-1, 1), ylim = c(-1, 3))
56 |
57 | y = (x^2 + runif(n, 0, 1/2)) * sample(seq(-1, 1, 2), n, replace = TRUE)
58 | MyPlot(cbind(x,y), xlim = c(-1.5, 1.5), ylim = c(-1.5, 1.5))
59 |
60 | y = cos(x*pi) + rnorm(n, 0, 1/8)
61 | x = sin(x*pi) + rnorm(n, 0, 1/8)
62 | MyPlot(cbind(x,y), xlim = c(-1.5, 1.5), ylim = c(-1.5, 1.5))
63 |
64 | xy1 = rmvnorm(n/4, c( 3, 3))
65 | xy2 = rmvnorm(n/4, c(-3, 3))
66 | xy3 = rmvnorm(n/4, c(-3, -3))
67 | xy4 = rmvnorm(n/4, c( 3, -3))
68 | MyPlot(rbind(xy1, xy2, xy3, xy4), xlim = c(-3-4, 3+4), ylim = c(-3-4, 3+4))
69 | }
70 |
71 | output <- function() {
72 | devSVGTips(width = 7, height = 3.2) # remove first and last line for no svg exporting
73 | par(mfrow = c(3, 7), oma = c(0,0,0,0), mar=c(2,2,2,0))
74 | MvNormal(800, c(1.0, 0.8, 0.4, 0.0, -0.4, -0.8, -1.0));
75 | RotNormal(200, c(0, pi/12, pi/6, pi/4, pi/2-pi/6, pi/2-pi/12, pi/2));
76 | Others(800)
77 | dev.off() # remove first and last line for no svg exporting
78 | }
79 |
80 | output() # Produce
81 |
82 | {
83 | library(mvtnorm)
84 | png('../plots/7-5.png', width = 7, height = 3.2, units = 'in',
85 | res=600, pointsize = 12) # remove first and last line for no svg exporting
86 | par(mfrow = c(3, 7), oma = c(0,0,0,0), mar=c(2,2,2,0))
87 | MvNormal(800, c(1.0, 0.8, 0.4, 0.0, -0.4, -0.8, -1.0));
88 | RotNormal(200, c(0, pi/12, pi/6, pi/4, pi/2-pi/6, pi/2-pi/12, pi/2));
89 | Others(800)
90 | dev.off() # remove first and last line for no svg exporting
91 | }
92 |
--------------------------------------------------------------------------------
/ch08-classification/adult/adult.R:
--------------------------------------------------------------------------------
1 | # 8. 빅데이터 분류분석 I: 기본개념과 로지스틱모형
2 |
3 | install.packages(c("dplyr", "ggplot2", "ISLR", "MASS", "glmnet",
4 | "randomForest", "gbm", "rpart", "boot"))
5 |
6 | library(tidyverse)
7 | library(gridExtra)
8 | library(ROCR)
9 |
10 | library(ISLR)
11 | library(MASS)
12 | library(glmnet)
13 | library(randomForest)
14 | library(gbm)
15 | library(rpart)
16 | library(boot)
17 |
18 |
19 |
20 |
21 | binomial_deviance <- function(y_obs, yhat){
22 | epsilon = 0.0001
23 | yhat = ifelse(yhat < epsilon, epsilon, yhat)
24 | yhat = ifelse(yhat > 1-epsilon, 1-epsilon, yhat)
25 | a = ifelse(y_obs==0, 0, y_obs * log(y_obs/yhat))
26 | b = ifelse(y_obs==1, 0, (1-y_obs) * log((1-y_obs)/(1-yhat)))
27 | return(2*sum(a + b))
28 | }
29 |
30 |
31 |
32 |
33 | # curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data > adult.data
34 | # curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names > adult.names
35 |
36 | adult <- read.csv("adult.data", header = FALSE, strip.white = TRUE)
37 | names(adult) <- c('age', 'workclass', 'fnlwgt', 'education',
38 | 'education_num', 'marital_status', 'occupation',
39 | 'relationship', 'race', 'sex',
40 | 'capital_gain', 'capital_loss',
41 | 'hours_per_week', 'native_country',
42 | 'wage')
43 |
44 |
45 | glimpse(adult)
46 |
47 | summary(adult)
48 |
49 | levels(adult$wage)
50 |
51 | # 8.3.3. 범주형 설명변수에서 문제의 복잡도
52 |
53 | levels(adult$race)
54 | adult$race[1:5]
55 | levels(adult$sex)
56 | adult$sex[1:5]
57 |
58 | x <- model.matrix( ~ race + sex + age, adult)
59 | glimpse(x)
60 | colnames(x)
61 |
62 |
63 | x_orig <- adult %>% dplyr::select(sex, race, age)
64 | View(x_orig)
65 |
66 | x_mod <- model.matrix( ~ sex + race + age, adult)
67 | View(x_mod)
68 |
69 |
70 | x <- model.matrix( ~ . - wage, adult)
71 | dim(x)
72 |
73 | # 8.4. 훈련, 검증, 테스트셋의 구분
74 |
75 | set.seed(1601)
76 | n <- nrow(adult)
77 | idx <- 1:n
78 | training_idx <- sample(idx, n * .60)
79 | idx <- setdiff(idx, training_idx)
80 | validate_idx = sample(idx, n * .20)
81 | test_idx <- setdiff(idx, validate_idx)
82 | length(training_idx)
83 | length(validate_idx)
84 | length(test_idx)
85 | training <- adult[training_idx,]
86 | validation <- adult[validate_idx,]
87 | test <- adult[test_idx,]
88 |
89 |
90 | # 8.5. 시각화
91 |
92 | training %>%
93 | ggplot(aes(age, fill=wage)) +
94 | geom_density(alpha=.5)
95 | ggsave("../../plots/8-3.png", width=5.5, height=4, units='in', dpi=600)
96 |
97 |
98 |
99 | training %>%
100 | filter(race %in% c('Black', 'White')) %>%
101 | ggplot(aes(age, fill=wage)) +
102 | geom_density(alpha=.5) +
103 | ylim(0, 0.1) +
104 | facet_grid(race ~ sex, scales = 'free_y')
105 | ggsave("../../plots/8-4.png", width=5.5, height=4, units='in', dpi=600)
106 |
107 |
108 |
109 | training %>%
110 | ggplot(aes(`education_num`, fill=wage)) +
111 | geom_bar()
112 | ggsave("../../plots/8-5.png", width=5.5, height=4, units='in', dpi=600)
113 |
114 |
115 | # 8.6. 로지스틱 회귀분석
116 | ad_glm_full <- glm(wage ~ ., data=training, family=binomial)
117 |
118 | summary(ad_glm_full)
119 |
120 |
121 | alias(ad_glm_full)
122 |
123 |
124 | predict(ad_glm_full, newdata = adult[1:5,], type="response")
125 |
126 |
127 | # 8.6.4. 예측 정확도 지표
128 | y_obs <- ifelse(validation$wage == ">50K", 1, 0)
129 | yhat_lm <- predict(ad_glm_full, newdata=validation, type='response')
130 |
131 | library(gridExtra)
132 |
133 | p1 <- ggplot(data.frame(y_obs, yhat_lm),
134 | aes(y_obs, yhat_lm, group=y_obs,
135 | fill=factor(y_obs))) +
136 | geom_boxplot()
137 | p2 <- ggplot(data.frame(y_obs, yhat_lm),
138 | aes(yhat_lm, fill=factor(y_obs))) +
139 | geom_density(alpha=.5)
140 | grid.arrange(p1, p2, ncol=2)
141 |
142 | g <- arrangeGrob(p1, p2, ncol=2)
143 | ggsave("../../plots/8-6.png", g, width=5.5*1.5, height=4, units='in', dpi=600)
144 |
145 |
146 |
147 | binomial_deviance(y_obs, yhat_lm)
148 |
149 | library(ROCR)
150 | pred_lm <- prediction(yhat_lm, y_obs)
151 | perf_lm <- performance(pred_lm, measure = "tpr", x.measure = "fpr")
152 | plot(perf_lm, col='black', main="ROC Curve for GLM")
153 | abline(0,1)
154 | performance(pred_lm, "auc")@y.values[[1]]
155 |
156 |
157 | png("../../plots/8-7.png", 5.5, 4, units='in', pointsize=9, res=600)
158 | pred_lm <- prediction(yhat_lm, y_obs)
159 | perf_lm <- performance(pred_lm, measure = "tpr", x.measure = "fpr")
160 | plot(perf_lm, col='black', main="ROC Curve for GLM")
161 | abline(0,1)
162 | dev.off()
163 |
164 |
165 | # 9. 빅데이터 분류분석 II: 라쏘와 랜덤포레스트
166 |
167 | # 9.1. glmnet 함수를 통한 라쏘 모형, 능형회귀, 변수선택
168 | xx <- model.matrix(wage ~ .-1, adult)
169 | x <- xx[training_idx, ]
170 | y <- ifelse(training$wage == ">50K", 1, 0)
171 | dim(x)
172 |
173 | ad_glmnet_fit <- glmnet(x, y)
174 |
175 | plot(ad_glmnet_fit)
176 |
177 | png("../../plots/9-1.png", 5.5, 4, units='in', pointsize=9, res=600)
178 | plot(ad_glmnet_fit)
179 | dev.off()
180 |
181 | ad_glmnet_fit
182 |
183 | coef(ad_glmnet_fit, s = c(.1713, .1295))
184 |
185 |
186 |
187 | ad_cvfit <- cv.glmnet(x, y, family = "binomial")
188 |
189 | plot(ad_cvfit)
190 |
191 | png("../../plots/9-2.png", 5.5, 4, units='in', pointsize=9, res=600)
192 | plot(ad_cvfit)
193 | dev.off()
194 |
195 | log(ad_cvfit$lambda.min)
196 | log(ad_cvfit$lambda.1se)
197 |
198 | coef(ad_cvfit, s=ad_cvfit$lambda.1se)
199 | coef(ad_cvfit, s="lambda.1se")
200 |
201 | length(which(coef(ad_cvfit, s="lambda.min")>0))
202 | length(which(coef(ad_cvfit, s="lambda.1se")>0))
203 |
204 | # 9.1.4. 값의 선택
205 |
206 | set.seed(1607)
207 | foldid <- sample(1:10, size=length(y), replace=TRUE)
208 | cv1 <- cv.glmnet(x, y, foldid=foldid, alpha=1, family='binomial')
209 | cv.5 <- cv.glmnet(x, y, foldid=foldid, alpha=.5, family='binomial')
210 | cv0 <- cv.glmnet(x, y, foldid=foldid, alpha=0, family='binomial')
211 |
212 | png("../../plots/9-3.png", 5.5, 4, units='in', pointsize=7, res=600)
213 | par(mfrow=c(2,2))
214 | plot(cv1, main="Alpha=1.0")
215 | plot(cv.5, main="Alpha=0.5")
216 | plot(cv0, main="Alpha=0.0")
217 | plot(log(cv1$lambda), cv1$cvm, pch=19, col="red",
218 | xlab="log(Lambda)", ylab=cv1$name, main="alpha=1.0")
219 | points(log(cv.5$lambda), cv.5$cvm, pch=19, col="grey")
220 | points(log(cv0$lambda), cv0$cvm, pch=19, col="blue")
221 | legend("topleft", legend=c("alpha= 1", "alpha= .5", "alpha= 0"),
222 | pch=19, col=c("red","grey","blue"))
223 | dev.off()
224 |
225 |
226 | predict(ad_cvfit, s="lambda.1se", newx = x[1:5,], type='response')
227 |
228 | y_obs <- ifelse(validation$wage == ">50K", 1, 0)
229 | yhat_glmnet <- predict(ad_cvfit, s="lambda.1se", newx=xx[validate_idx,], type='response')
230 | yhat_glmnet <- yhat_glmnet[,1] # change to a vector from [n*1] matrix
231 | binomial_deviance(y_obs, yhat_glmnet)
232 | # [1] 4257.118
233 | pred_glmnet <- prediction(yhat_glmnet, y_obs)
234 | perf_glmnet <- performance(pred_glmnet, measure="tpr", x.measure="fpr")
235 |
236 | performance(pred_glmnet, "auc")@y.values[[1]]
237 |
238 | png("../../plots/9-4.png", 5.5, 4, units='in', pointsize=9, res=600)
239 | plot(perf_lm, col='black', main="ROC Curve")
240 | plot(perf_glmnet, col='blue', add=TRUE)
241 | abline(0,1, col='gray')
242 | legend('bottomright', inset=.1,
243 | legend=c("GLM", "glmnet"),
244 | col=c('black', 'blue'), lty=1, lwd=2)
245 | dev.off()
246 |
247 |
248 | # 9.2. 나무모형
249 | library(rpart)
250 | cvr_tr <- rpart(wage ~ ., data = training)
251 | cvr_tr
252 |
253 |
254 | printcp(cvr_tr)
255 | summary(cvr_tr)
256 |
257 |
258 |
259 | png("../../plots/9-6.png", 5.5, 4, units='in', pointsize=9, res=600)
260 | opar <- par(mfrow = c(1,1), xpd = NA)
261 | plot(cvr_tr)
262 | text(cvr_tr, use.n = TRUE)
263 | par(opar)
264 | dev.off()
265 |
266 |
267 | yhat_tr <- predict(cvr_tr, validation)
268 | yhat_tr <- yhat_tr[,">50K"]
269 | binomial_deviance(y_obs, yhat_tr)
270 | pred_tr <- prediction(yhat_tr, y_obs)
271 | perf_tr <- performance(pred_tr, measure = "tpr", x.measure = "fpr")
272 | performance(pred_tr, "auc")@y.values[[1]]
273 |
274 | png("../../plots/9-7.png", 5.5, 4, units='in', pointsize=9, res=600)
275 | plot(perf_lm, col='black', main="ROC Curve")
276 | plot(perf_tr, col='blue', add=TRUE)
277 | abline(0,1, col='gray')
278 | legend('bottomright', inset=.1,
279 | legend = c("GLM", "Tree"),
280 | col=c('black', 'blue'), lty=1, lwd=2)
281 | dev.off()
282 |
283 |
284 | # 9.3. 랜덤 포레스트 -----------
285 |
286 | set.seed(1607)
287 | ad_rf <- randomForest(wage ~ ., training)
288 | ad_rf
289 |
290 | png("../../plots/9-8.png", 5.5, 4, units='in', pointsize=9, res=600)
291 | plot(ad_rf)
292 | dev.off()
293 |
294 | tmp <- importance(ad_rf)
295 | head(round(tmp[order(-tmp[,1]), 1, drop=FALSE], 2), n=10)
296 |
297 | png("../../plots/9-9.png", 5.5, 4, units='in', pointsize=9, res=600)
298 | varImpPlot(ad_rf)
299 | dev.off()
300 |
301 | predict(ad_rf, newdata = adult[1:5,])
302 |
303 | predict(ad_rf, newdata = adult[1:5,], type="prob")
304 |
305 |
306 | yhat_rf <- predict(ad_rf, newdata=validation, type='prob')[,'>50K']
307 | binomial_deviance(y_obs, yhat_rf)
308 | pred_rf <- prediction(yhat_rf, y_obs)
309 | perf_rf <- performance(pred_rf, measure="tpr", x.measure="fpr")
310 | performance(pred_rf, "auc")@y.values[[1]]
311 |
312 | png("../../plots/9-10.png", 5.5, 4, units='in', pointsize=9, res=600)
313 | plot(perf_lm, col='black', main="ROC Curve")
314 | plot(perf_glmnet, add=TRUE, col='blue')
315 | plot(perf_rf, add=TRUE, col='red')
316 | abline(0,1, col='gray')
317 | legend('bottomright', inset=.1,
318 | legend = c("GLM", "glmnet", "RF"),
319 | col=c('black', 'blue', 'red'), lty=1, lwd=2)
320 | dev.off()
321 |
322 |
323 | # 9.3.5. 예측확률값 자체의 비교
324 | p1 <- data.frame(yhat_glmnet, yhat_rf) %>%
325 | ggplot(aes(yhat_glmnet, yhat_rf)) +
326 | geom_point(alpha=.5) +
327 | geom_abline() +
328 | geom_smooth()
329 | p2 <- reshape2::melt(data.frame(yhat_glmnet, yhat_rf)) %>%
330 | ggplot(aes(value, fill=variable)) +
331 | geom_density(alpha=.5)
332 | grid.arrange(p1, p2, ncol=2)
333 | g <- arrangeGrob(p1, p2, ncol=2)
334 | ggsave("../../plots/9-11.png", g, width=5.5*1.2, height=4*.8, units='in', dpi=600)
335 |
336 |
337 | # 9.4. 부스팅 ----------
338 |
339 | set.seed(1607)
340 | adult_gbm <- training %>% mutate(wage=ifelse(wage == ">50K", 1, 0))
341 | ad_gbm <- gbm(wage ~ ., data=adult_gbm,
342 | distribution="bernoulli",
343 | n.trees=50000, cv.folds=3, verbose=TRUE)
344 | (best_iter <- gbm.perf(ad_gbm, method="cv"))
345 |
346 | ad_gbm2 <- gbm.more(ad_gbm, n.new.trees=10000)
347 | (best_iter <- gbm.perf(ad_gbm2, method="cv"))
348 |
349 |
350 | png("../../plots/9-12.png", 5.5, 4, units='in', pointsize=9, res=600)
351 | (best_iter <- gbm.perf(ad_gbm2, method="cv"))
352 | dev.off()
353 |
354 |
355 | predict(ad_gbm2, n.trees=best_iter, newdata=adult_gbm[1:5,], type='response')  # ad_gbm2: best_iter comes from the 60k-tree fit and may exceed ad_gbm's 50k trees
356 |
357 | yhat_gbm <- predict(ad_gbm2, n.trees=best_iter, newdata=validation, type='response')  # ad_gbm2 matches the model best_iter was computed from
358 | binomial_deviance(y_obs, yhat_gbm)
359 | pred_gbm <- prediction(yhat_gbm, y_obs)
360 | perf_gbm <- performance(pred_gbm, measure="tpr", x.measure="fpr")
361 | performance(pred_gbm, "auc")@y.values[[1]]
362 |
363 |
364 | png("../../plots/9-13.png", 5.5, 4, units='in', pointsize=9, res=600)
365 | plot(perf_lm, col='black', main="ROC Curve")
366 | plot(perf_glmnet, add=TRUE, col='blue')
367 | plot(perf_rf, add=TRUE, col='red')
368 | plot(perf_gbm, add=TRUE, col='cyan')
369 | abline(0,1, col='gray')
370 | legend('bottomright', inset=.1,
371 | legend=c("GLM", "glmnet", "RF", "GBM"),
372 | col=c('black', 'blue', 'red', 'cyan'), lty=1, lwd=2)
373 | dev.off()
374 |
375 |
376 |
377 | # 9.5. 모형 비교, 최종 모형 선택, 일반화 성능 평가 ----
378 |
379 |
380 | # 9.5.2. 모형의 예측확률값의 분포 비교
381 | # example(pairs) 에서 따옴
382 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
383 | usr <- par("usr"); on.exit(par(usr))
384 | par(usr = c(0, 1, 0, 1))
385 | r <- abs(cor(x, y))
386 | txt <- format(c(r, 0.123456789), digits = digits)[1]
387 | txt <- paste0(prefix, txt)
388 | if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
389 | text(0.5, 0.5, txt, cex = cex.cor * r)
390 | }
391 |
392 | png("../../plots/9-14.png", 5.5, 4, units='in', pointsize=9, res=600)
393 | pairs(data.frame(y_obs=y_obs,
394 | yhat_lm=yhat_lm,
395 | yhat_glmnet=c(yhat_glmnet),
396 | yhat_rf=yhat_rf,
397 | yhat_gbm=yhat_gbm),
398 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
399 | upper.panel = panel.cor)
400 | dev.off()
401 |
402 |
403 | # 9.5.3. 테스트셋을 이용한 일반화능력 계산
404 | y_obs_test <- ifelse(test$wage == ">50K", 1, 0)
405 | yhat_gbm_test <- predict(ad_gbm2, n.trees=best_iter, newdata=test, type='response')  # ad_gbm2: best_iter was computed from the extended fit and may exceed ad_gbm's tree count
406 | binomial_deviance(y_obs_test, yhat_gbm_test)
407 | pred_gbm_test <- prediction(yhat_gbm_test, y_obs_test)
408 | performance(pred_gbm_test, "auc")@y.values[[1]]
409 |
410 | # 9.6.5. 캐럿 (caret) 패키지
411 | install.packages("caret", dependencies = c("Depends", "Suggests"))
412 |
413 |
414 |
415 | # This is for the earlier ROC curve example. ---
416 | {
417 | png("../../plots/8-1.png", 5.5*1.2, 4*.8, units='in', pointsize=9, res=600)
418 | opar <- par(mfrow=c(1,2))
419 | plot(perf_lm, col='black', main="ROC Curve")
420 | plot(perf_tr, col='blue', add=TRUE)
421 | abline(0,1, col='gray')
422 | legend('bottomright', inset=.1,
423 | legend = c("GLM", "Tree"),
424 | col=c('black', 'blue'), lty=1, lwd=2)
425 | plot(perf_lm, col='black', main="ROC Curve")
426 | plot(perf_glmnet, add=TRUE, col='blue')
427 | plot(perf_rf, add=TRUE, col='red')
428 | plot(perf_gbm, add=TRUE, col='cyan')
429 | abline(0,1, col='gray')
430 | legend('bottomright', inset=.1,
431 | legend=c("GLM", "glmnet", "RF", "GBM"),
432 | col=c('black', 'blue', 'red', 'cyan'), lty=1, lwd=2)
433 | par(opar)
434 | dev.off()
435 | }
436 |
--------------------------------------------------------------------------------
/ch08-classification/adult/adult.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
--------------------------------------------------------------------------------
/ch08-classification/adult/adult.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#!/usr/bin/env python\n",
10 | "import pandas as pd\n",
11 | "import numpy as np\n",
12 | "import matplotlib.pyplot as plt\n",
13 | "import seaborn as sns\n",
14 | "import statsmodels.api as sm\n",
15 | "import ggplot\n",
16 | "\n",
17 | "from sklearn import preprocessing\n",
18 | "from sklearn.metrics import roc_curve, auc\n",
19 | "from IPython.display import display \n",
20 | "\n",
21 | "%matplotlib inline"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": null,
27 | "metadata": {
28 | "scrolled": false
29 | },
30 | "outputs": [],
31 | "source": [
32 | "# 1. You can choose to load the data into pandas DataFrame straight from the URL\n",
33 | "URL = \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\"\n",
34 | "\n",
35 | "# 2. or you can also download the dataset manually. In this case change the URL accordingly. \n",
36 | "# curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data > adult.data\n",
37 | "# curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names > adult.names\n",
38 | "\n",
39 | "adult = pd.read_table(\n",
40 | " URL,\n",
41 | " \n",
42 | " # specify the file encoding\n",
43 | " encoding=\"utf-8\",\n",
44 | " \n",
45 | " # specify the separator in the data\n",
46 | " sep=\",\", # comma separated values\n",
47 | " \n",
48 | " # ignore spaces after the separator\n",
49 | " skipinitialspace=True,\n",
50 | " index_col=None,\n",
51 | " \n",
52 | " # use manual headers\n",
53 | " header=None,\n",
54 | " names=[\n",
55 | " \"age\", \"workclass\", \"fnlwgt\", \"education\", \n",
56 | " \"education-num\", \"marital-status\", \"occupation\",\n",
57 | " \"relationship\", \"race\", \"sex\", \"capital-gain\", \n",
58 | " \"capital-loss\", \"hours-per-week\", \"native-country\",\n",
59 | " \"wage\"\n",
60 | " ]\n",
61 | ")\n",
62 | "\n",
63 | "pd.set_option(\"display.max_rows\", 10)\n",
64 | "display(adult)"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "adult.info()\n",
74 | "\n",
75 | "groupby_wage = adult.groupby(\"wage\").describe()\n",
76 | "for i in groupby_wage.columns.levels[0]:\n",
77 | " groupby_wage[i].index.name= \"wage / \" + str(i)\n",
78 | " display(groupby_wage[i])\n",
79 | "\n",
80 | "print(adult[\"wage\"].unique()) "
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": null,
86 | "metadata": {},
87 | "outputs": [],
88 | "source": [
89 | "# 8.3.3. 범주형 설명변수에서 문제의 복잡도\n",
90 | "print(adult[\"race\"].unique())\n",
91 | "print(adult[\"race\"][0:5])\n",
92 | "print(adult[\"sex\"].unique())\n",
93 | "print(adult[\"sex\"][0:5])\n",
94 | "\n",
95 | "design_matrix_race = pd.get_dummies(adult[\"race\"], drop_first=True)\n",
96 | "design_matrix_sex = pd.get_dummies(adult[\"sex\"], drop_first=True)\n",
97 | "intercept = pd.DataFrame(1, index=np.arange(adult.shape[0]), columns=[\"(Intercept)\"])\n",
98 | "example_design_matrix = pd.concat([intercept, design_matrix_race, design_matrix_sex, adult[\"age\"]], axis=1)\n",
99 | "\n",
100 | "# original matrix\n",
101 | "display(adult[[\"race\", \"sex\", \"age\"]])\n",
102 | "# example design matrix\n",
103 | "display(example_design_matrix)\n",
104 | "\n",
105 | "# convert each categorical feature using one-hot encoding\n",
106 | "obj_df = adult.select_dtypes(include=[\"object\"]).drop(\"wage\", axis=1)\n",
107 | "int_df = adult.select_dtypes(include=[\"int64\"])\n",
108 | "\n",
109 | "design_matrix_objs = pd.get_dummies(obj_df, drop_first=True)\n",
110 | "design_matrix_adult = pd.concat([intercept, design_matrix_objs, int_df], axis=1)\n",
111 | "display(design_matrix_adult)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": [
120 | "# 8.4. 훈련, 검증, 테스트셋의 구분\n",
121 | "np.random.seed(1709)\n",
122 | "\n",
123 | "''' 1. You can do it the easy way\n",
124 | "\n",
125 | "from sklearn.model_selection import train_test_split\n",
126 | "training, test = train_test_split(adult, test_size=0.2)\n",
127 | "training, validation = train_test_split(training, test_size=0.25)\n",
128 | "print(training.shape)\n",
129 | "print(validation.shape)\n",
130 | "print(test.shape)\n",
131 | "'''\n",
132 | "\n",
133 | "# 2. Or you can split the dataset manually\n",
134 | "n = adult.shape[0]\n",
135 | "idx = np.arange(n)\n",
136 | "np.random.shuffle(idx)\n",
137 | "\n",
138 | "training_size = int(n*0.6)\n",
139 | "validate_size = int(n*0.2)\n",
140 | "\n",
141 | "training_idx = idx[:training_size]\n",
142 | "validate_idx = idx[training_size:training_size+validate_size]\n",
143 | "test_idx = idx[training_size+validate_size:]\n",
144 | "\n",
145 | "training = adult.loc[training_idx]\n",
146 | "validation = adult.loc[validate_idx]\n",
147 | "test = adult.loc[test_idx]\n",
148 | "print(training.shape)\n",
149 | "print(validation.shape)\n",
150 | "print(test.shape)"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "# 8.5. 시각화\n",
160 | "fig1 = plt.figure(figsize=(5, 5))\n",
161 | "sns.set_style(\"dark\", {'axes.grid' : True})\n",
162 | "\n",
163 | "ax1 = fig1.add_subplot(111)\n",
164 | "ax1.set_xlabel(\"age\")\n",
165 | "ax1.set_ylabel(\"density\")\n",
166 | "ax1.set_ylim(0, 0.04)\n",
167 | "\n",
168 | "df1 = training[training[\"wage\"] == \"<=50K\"]\n",
169 | "df2 = training[training[\"wage\"] == \">50K\"]\n",
170 | "\n",
171 | "sns.distplot(df1[\"age\"], ax=ax1, hist=False,\n",
172 | " kde_kws={\"alpha\": .3, \"color\": \"g\",\n",
173 | " \"shade\": True, \"label\": \"<=50K\"})\n",
174 | "sns.distplot(df2[\"age\"], ax=ax1, hist=False,\n",
175 | " kde_kws={\"alpha\": .3, \"color\": \"b\",\n",
176 | " \"shade\": True, \"label\": \">50K\"})\n",
177 | "\n",
178 | "df3 = training.loc[(training[\"race\"] == \"White\") |\n",
179 | " (training[\"race\"] == \"Black\")]\n",
180 | "\n",
181 | "g = sns.FacetGrid(df3, row=\"race\", col=\"sex\", hue=\"wage\",\n",
182 | " hue_kws={\"color\": [\"b\", \"g\"]})\n",
183 | "g.set(ylim=(0, .05))\n",
184 | "g.map(sns.distplot, \"age\", \"wage\", hist=False,\n",
185 | " kde_kws={\"alpha\": .3, \"shade\": True})\n",
186 | "\n",
187 | "fig2 = plt.figure(figsize=(10, 10))\n",
188 | "ax2 = fig2.add_subplot(211)\n",
189 | "sns.countplot(x=\"education-num\", hue=\"wage\", data=training, ax=ax2)\n",
190 | "\n",
191 | "plt.show()"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null,
197 | "metadata": {},
198 | "outputs": [],
199 | "source": [
200 | "# 8.6. 로지스틱 회귀분석\n",
201 | "\n",
202 | "# glms cannot interpret strings.\n",
203 | "# We have to assign each label a numeric id. \n",
204 | "le = preprocessing.LabelEncoder()\n",
205 | "le.fit([\"<=50K\", \">50K\"])\n",
206 | "y = le.transform(training[\"wage\"])\n",
207 | "X = design_matrix_adult.loc[training_idx]\n",
208 | "\n",
209 | "lm = sm.GLM(y, sm.add_constant(X), family=sm.families.Binomial())\n",
210 | "res = lm.fit()\n",
211 | "\n",
212 | "resid_deviance = res.resid_deviance\n",
213 | "\n",
214 | "print(\"Deviance Residuals:\", \"\\nMin: {} \\nMedian: {} \\nMax: {}\".format(\n",
215 | "    resid_deviance.min(), resid_deviance.median(), resid_deviance.max()))\n",
216 | "\n",
217 | "display(res.summary())\n",
218 | "\n",
219 | "test_X = design_matrix_adult[1:6]\n",
220 | "res.predict(test_X)"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": null,
226 | "metadata": {},
227 | "outputs": [],
228 | "source": [
229 | "# 8.6.4. 예측 정확도 지표\n",
230 | "fig = plt.figure(figsize=(10, 10))\n",
231 | "ax1 = fig.add_subplot(121)\n",
232 | "\n",
233 | "y_obs = le.transform(validation[\"wage\"])\n",
234 | "yhat_lm = res.predict(design_matrix_adult.loc[validate_idx])\n",
235 | "df = pd.DataFrame({\"x\": y_obs, \"y\": yhat_lm})\n",
236 | "sns.boxplot(x=\"x\", y=\"y\", data=df, ax=ax1)\n",
237 | "\n",
238 | "ax1.set_xlabel(\"y_obs\")\n",
239 | "ax1.set_ylabel(\"yhat_lim\")\n",
240 | "\n",
241 | "ax2 = fig.add_subplot(122)\n",
242 | "ax2.set_ylim(0, 8)\n",
243 | "\n",
244 | "sns.distplot(df[df[\"x\"] == 1][\"y\"], ax=ax2, hist=False,\n",
245 | " kde_kws={\"alpha\": .3, \"color\": \"b\",\n",
246 | " \"shade\": True, \"label\": \"1\"})\n",
247 | "sns.distplot(df[df[\"x\"] == 0][\"y\"], ax=ax2, hist=False,\n",
248 | " kde_kws={\"alpha\": .3, \"color\": \"g\",\n",
249 | " \"shade\": True, \"label\": \"0\"})\n",
250 | "\n",
251 | "print(res.deviance)\n",
252 | "\n",
253 | "plt.show()"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {},
260 | "outputs": [],
261 | "source": [
262 | "fig = plt.figure(figsize=(10, 5))\n",
263 | "ax = fig.add_subplot(111)\n",
264 | "ax.set_title(\"ROC curve for GLM\")\n",
265 | "ax.set_xlabel(\"False positive rate\")\n",
266 | "ax.set_ylabel(\"True positive rate\")\n",
267 | "\n",
268 | "fpr, tpr, _ = roc_curve(y_obs, yhat_lm)\n",
269 | "plt.plot(fpr, tpr)\n",
270 | "plt.plot([0, 1], [0, 1], color='navy', linestyle='--')\n",
271 | "\n",
272 | "plt.show()\n",
273 | "\n",
274 | "print(auc(fpr, tpr))"
275 | ]
276 | }
277 | ],
278 | "metadata": {
279 | "kernelspec": {
280 | "display_name": "Python 3",
281 | "language": "python",
282 | "name": "python3"
283 | },
284 | "language_info": {
285 | "codemirror_mode": {
286 | "name": "ipython",
287 | "version": 3
288 | },
289 | "file_extension": ".py",
290 | "mimetype": "text/x-python",
291 | "name": "python",
292 | "nbconvert_exporter": "python",
293 | "pygments_lexer": "ipython3",
294 | "version": "3.6.1"
295 | }
296 | },
297 | "nbformat": 4,
298 | "nbformat_minor": 2
299 | }
300 |
--------------------------------------------------------------------------------
/ch08-classification/breast-cancer/breast-cancer-wisconsin.R:
--------------------------------------------------------------------------------
1 | # 빅데이터 분별분석. 암 예측.
2 | #
3 | if (!file.exists("breast-cancer-wisconsin.data")){
4 | system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data > breast-cancer-wisconsin.data')
5 | system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names > breast-cancer-wisconsin.names')
6 | }
7 |
8 | rmse <- function(yi, yhat_i){
9 | sqrt(mean((yi - yhat_i)^2))
10 | }
11 |
12 | binomial_deviance <- function(y_obs, yhat){
13 | epsilon = 0.0001
14 | yhat = ifelse(yhat < epsilon, epsilon, yhat)
15 | yhat = ifelse(yhat > 1-epsilon, 1-epsilon, yhat)
16 | a = ifelse(y_obs==0, 0, y_obs * log(y_obs/yhat))
17 | b = ifelse(y_obs==1, 0, (1-y_obs) * log((1-y_obs)/(1-yhat)))
18 | return(2*sum(a + b))
19 | }
20 |
21 |
22 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
23 | usr <- par("usr"); on.exit(par(usr))
24 | par(usr = c(0, 1, 0, 1))
25 | r <- abs(cor(x, y))
26 | txt <- format(c(r, 0.123456789), digits = digits)[1]
27 | txt <- paste0(prefix, txt)
28 | if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
29 | text(0.5, 0.5, txt, cex = cex.cor * r)
30 | }
31 |
32 |
33 |
34 | library(dplyr)
35 | library(ggplot2)
36 | library(MASS)
37 | library(glmnet)
38 | library(randomForest)
39 | library(gbm)
40 | library(rpart)
41 | library(boot)
42 | library(data.table)
43 | library(ROCR)
44 | library(gridExtra)
45 |
46 | data <- tbl_df(read.table("breast-cancer-wisconsin.data", strip.white = TRUE,
47 | sep=",", header = FALSE, na.strings = '?'))
48 | names(data) <- c('id', 'thickness', 'unif_cell_size', 'unif_cell_shape',
49 | 'marginal_adhesion', 'cell_size', 'bare_nuclei',
50 | 'bland_cromatin', 'normal_nucleoli', 'mitoses', 'class')
51 |
52 | glimpse(data)
53 |
54 | # 1. 결측치 처리
55 | data$bare_nuclei[is.na(data$bare_nuclei)] <- median(data$bare_nuclei, na.rm = TRUE)
56 | # 2. id 변수 제거
57 | data <- data %>% dplyr::select(-id)
58 | # 3. class 변수를 인자 변수로 변환
59 | data$class <- factor(ifelse(data$class == 2, 0, 1))
60 |
61 | glimpse(data)
62 |
63 |
64 | summary(data)
65 |
66 | pairs(data %>% sample_n(min(1000, nrow(data))),
67 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
68 | upper.panel = panel.cor)
69 |
70 | library(ggplot2)
71 | library(dplyr)
72 | library(gridExtra)
73 | p1 <- data %>% ggplot(aes(class)) + geom_bar()
74 | p2 <- data %>% ggplot(aes(class, unif_cell_size)) +
75 | geom_jitter(col='gray') +
76 | geom_boxplot(alpha=.5)
77 | p3 <- data %>% ggplot(aes(class, bare_nuclei)) +
78 | geom_jitter(col='gray') +
79 | geom_boxplot(alpha=.5)
80 | p4 <- data %>% ggplot(aes(unif_cell_size, bare_nuclei)) +
81 | geom_jitter(col='gray') + geom_smooth()
82 | grid.arrange(p1, p2, p3, p4, ncol=2)
83 |
84 |
85 | # 트래인셋과 테스트셋의 구분
86 | set.seed(1606)
87 | n <- nrow(data)
88 | idx <- 1:n
89 | training_idx <- sample(idx, n * .60)
90 | idx <- setdiff(idx, training_idx)
91 | validate_idx <- sample(idx, n * .20)
92 | test_idx <- setdiff(idx, validate_idx)
93 | training <- data[training_idx,]
94 | validation <- data[validate_idx,]
95 | test <- data[test_idx,]
96 |
97 |
98 | #-----------------
99 | # 로지스틱 회귀모형
100 | data_lm_full <- glm(class ~ ., data=training, family=binomial)
101 | summary(data_lm_full)
102 |
103 | predict(data_lm_full, newdata = data[1:5,])
104 |
105 | # 선형회귀모형에서 변수선택
106 | data_lm_full_2 <- lm(class ~ .^2, data=training)
107 | summary(data_lm_full_2)
108 |
109 | length(coef(data_lm_full_2))
110 |
111 | library(MASS)
112 | data_step <- stepAIC(data_lm_full,
113 | scope = list(upper = ~ .^2, lower = ~1))
114 |
115 | data_step
116 | anova(data_step)
117 | summary(data_step)
118 | length(coef(data_step))
119 |
120 |
121 | # 모형평가
122 | y_obs <- validation$class
123 | yhat_lm <- predict(data_lm_full, newdata=validation)
124 | yhat_lm_2 <- predict(data_lm_full_2, newdata=validation)
125 | yhat_step <- predict(data_step, newdata=validation)
126 | rmse(y_obs, yhat_lm)
127 | rmse(y_obs, yhat_lm_2)
128 | rmse(y_obs, yhat_step)
129 |
130 | library(ROCR)
131 | pred_lm <- prediction(yhat_lm, y_obs)
132 | performance(pred_lm, "auc")@y.values[[1]]
133 | binomial_deviance(y_obs, yhat_lm)  # bug fix: was yhat_glmnet, which is not defined until the lasso section below
134 |
135 | #-----------------
136 | # 라쏘 모형 적합
137 | # xx <- model.matrix(class ~ .^2-1, data)
138 | xx <- model.matrix(class ~ .-1, data)
139 | x <- xx[training_idx, ]
140 | y <- as.numeric(as.character(training$class))  # bug fix: as.numeric(factor) yields 1/2, not the intended 0/1 coding
141 | glimpse(x)
142 |
143 | data_cvfit <- cv.glmnet(x, y, family = "binomial")
144 | plot(data_cvfit)
145 |
146 |
147 | coef(data_cvfit, s = c("lambda.1se"))
148 | coef(data_cvfit, s = c("lambda.min"))
149 |
150 |
151 | predict.cv.glmnet(data_cvfit, s="lambda.min", newx = x[1:5,])
152 |
153 |
154 |
155 |
156 | predict(data_cvfit, s="lambda.1se", newx = x[1:5,], type='response')  # bug fix: ad_cvfit belongs to a different script; this file fits data_cvfit
157 |
158 | y_obs <- as.numeric(as.character(validation$class))  # bug fix: as.numeric(factor) yields 1/2; binomial_deviance expects 0/1
159 | yhat_glmnet <- predict(data_cvfit, s="lambda.1se", newx=xx[validate_idx,], type='response')  # bug fix: was ad_cvfit
160 | yhat_glmnet <- yhat_glmnet[,1] # change to a vector from [n*1] matrix
161 | binomial_deviance(y_obs, yhat_glmnet)
162 |
163 |
164 | pred_glmnet <- prediction(yhat_glmnet, y_obs)
165 | performance(pred_glmnet, "auc")@y.values[[1]]
166 |
167 |
168 | perf_lm <- performance(pred_lm, measure = "tpr", x.measure = "fpr")
169 | perf_glmnet <- performance(pred_glmnet, measure="tpr", x.measure="fpr")
170 | plot(perf_lm, col='black', main="ROC Curve for GLM")
171 | abline(0,1)
172 |
173 |
174 | plot(perf_lm, col='black', main="ROC Curve")
175 | plot(perf_glmnet, col='blue', add=TRUE)
176 | abline(0,1)
177 | legend('bottomright', inset=.1,
178 | legend=c("GLM", "glmnet"),
179 | col=c('black', 'blue'), lty=1, lwd=2)
180 |
181 |
182 |
183 | yhat_glmnet <- predict(data_cvfit, s="lambda.min", newx=xx[validate_idx,])
184 | yhat_glmnet <- yhat_glmnet[,1] # change to a vector from [n*1] matrix
185 | rmse(y_obs, yhat_glmnet)
186 |
187 | #-----------------
188 | # 나무모형
189 | data_tr <- rpart(class ~ ., data = training)
190 | data_tr
191 |
192 | printcp(data_tr)
193 | summary(data_tr)
194 |
195 | opar <- par(mfrow = c(1,1), xpd = NA)
196 | plot(data_tr)
197 | text(data_tr, use.n = TRUE)
198 | par(opar)
199 |
200 |
201 | yhat_tr <- predict(data_tr, validation)
202 | rmse(y_obs, yhat_tr)
203 |
204 |
205 | #-----------------
206 | # 랜덤포레스트
207 | set.seed(1607)
208 | data_rf <- randomForest(class ~ ., training)
209 | data_rf
210 |
211 | opar <- par(mfrow=c(1,2))
212 | plot(data_rf)
213 | varImpPlot(data_rf)
214 | par(opar)
215 |
216 |
217 | yhat_rf <- predict(data_rf, newdata=validation)
218 | rmse(y_obs, yhat_rf)
219 |
220 |
221 | #-----------------
222 | # 부스팅
223 | set.seed(1607)
224 | data_gbm <- gbm(class ~ ., data=training,
225 | n.trees=40000, cv.folds=3, verbose = TRUE)
226 | (best_iter = gbm.perf(data_gbm, method="cv"))
227 |
228 | yhat_gbm <- predict(data_gbm, n.trees=best_iter, newdata=validation)
229 | rmse(y_obs, yhat_gbm)
230 |
231 |
232 | # 최종 모형선택과 테스트셋 오차계산
233 | data.frame(lm = rmse(y_obs, yhat_step),
234 | glmnet = rmse(y_obs, yhat_glmnet),
235 | rf = rmse(y_obs, yhat_rf),
236 | gbm = rmse(y_obs, yhat_gbm)) %>%
237 | reshape2::melt(value.name = 'rmse', variable.name = 'method')
238 |
239 | rmse(test$class, predict(data_rf, newdata = test))
240 |
241 |
242 | # 회귀분석의 오차의 시각화
243 | boxplot(list(lm = y_obs-yhat_step,
244 | glmnet = y_obs-yhat_glmnet,
245 | rf = y_obs-yhat_rf,
246 | gbm = y_obs-yhat_gbm), ylab="Error in Validation Set")
247 | abline(h=0, lty=2, col='blue')
248 |
249 |
250 | pairs(data.frame(y_obs=y_obs,
251 | yhat_lm=yhat_step,
252 | yhat_glmnet=c(yhat_glmnet),
253 | yhat_rf=yhat_rf,
254 | yhat_gbm=yhat_gbm),
255 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
256 | upper.panel = panel.cor)
257 |
258 |
--------------------------------------------------------------------------------
/ch08-classification/breast-cancer/breast-cancer.R:
--------------------------------------------------------------------------------
1 | # 빅데이터 분별분석. 암 예측.
2 | #
3 | if (!file.exists("wdbc.data")){
4 | system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data > wdbc.data')
5 | system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.names > wdbc.names')
6 | }
7 |
8 | rmse <- function(yi, yhat_i){
9 | sqrt(mean((yi - yhat_i)^2))
10 | }
11 |
12 | binomial_deviance <- function(y_obs, yhat){
13 | epsilon = 0.0001
14 | yhat = ifelse(yhat < epsilon, epsilon, yhat)
15 | yhat = ifelse(yhat > 1-epsilon, 1-epsilon, yhat)
16 | a = ifelse(y_obs==0, 0, y_obs * log(y_obs/yhat))
17 | b = ifelse(y_obs==1, 0, (1-y_obs) * log((1-y_obs)/(1-yhat)))
18 | return(2*sum(a + b))
19 | }
20 |
21 |
22 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
23 | usr <- par("usr"); on.exit(par(usr))
24 | par(usr = c(0, 1, 0, 1))
25 | r <- abs(cor(x, y))
26 | txt <- format(c(r, 0.123456789), digits = digits)[1]
27 | txt <- paste0(prefix, txt)
28 | if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
29 | text(0.5, 0.5, txt, cex = cex.cor * r)
30 | }
31 |
32 |
33 |
34 | library(tidyverse)
35 | library(gridExtra)
36 | library(MASS)
37 | library(glmnet)
38 | library(randomForest)
39 | library(gbm)
40 | library(rpart)
41 | library(boot)
42 | library(data.table)
43 | library(ROCR)
44 |
45 | data <- tbl_df(read.table("wdbc.data", strip.white = TRUE,
46 | sep=",", header = FALSE))
47 | feature_names <- c('radius', 'texture', 'perimeter', 'area', 'smoothness',
48 | 'compactness', 'concavity', 'concave_points', 'symmetry', 'fractal_dim')
49 | names(data) <-
50 | c('id', 'class',
51 | paste0('mean_', feature_names),
52 | paste0('se_', feature_names),
53 | paste0('worst_', feature_names))
54 |
55 | glimpse(data)
56 |
57 |
58 | # 1. id 변수 제거
59 | data <- data %>% dplyr::select(-id)
60 | # 2. class 변수를 인자 변수로 변환
61 | data$class <- factor(ifelse(data$class == 'B', 0, 1))
62 |
63 | glimpse(data)
64 |
65 | summary(data)
66 |
67 | png("../../plots/10-1.png", 5.5*1.2, 4*1.2, units='in', pointsize=9, res=600)
68 | pairs(data %>% dplyr::select(class, starts_with('mean_')) %>%
69 | sample_n(min(1000, nrow(data))),
70 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
71 | upper.panel = panel.cor)
72 | dev.off()
73 |
74 | pairs(data %>% dplyr::select(class, starts_with('se_')) %>%
75 | sample_n(min(1000, nrow(data))),
76 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
77 | upper.panel = panel.cor)
78 |
79 | pairs(data %>% dplyr::select(class, starts_with('worst_')) %>%
80 | sample_n(min(1000, nrow(data))),
81 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
82 | upper.panel = panel.cor)
83 |
84 | library(ggplot2)
85 | library(dplyr)
86 | library(gridExtra)
87 | p1 <- data %>% ggplot(aes(class)) + geom_bar()
88 | p2 <- data %>% ggplot(aes(class, mean_concave_points)) +
89 | geom_jitter(col='gray') +
90 | geom_boxplot(alpha=.5)
91 | p3 <- data %>% ggplot(aes(class, mean_radius)) +
92 | geom_jitter(col='gray') +
93 | geom_boxplot(alpha=.5)
94 | p4 <- data %>% ggplot(aes(mean_concave_points, mean_radius)) +
95 | geom_jitter(col='gray') + geom_smooth()
96 | grid.arrange(p1, p2, p3, p4, ncol=2)
97 |
98 | g <- arrangeGrob(p1, p2, p3, p4, ncol=2)
99 | ggsave("../../plots/10-2.png", g, width=5.5*1.2, height=4*1.2, units='in', dpi=600)
100 |
101 |
102 | # 트래인셋과 테스트셋의 구분
103 | set.seed(1606)
104 | n <- nrow(data)
105 | idx <- 1:n
106 | training_idx <- sample(idx, n * .60)
107 | idx <- setdiff(idx, training_idx)
108 | validate_idx <- sample(idx, n * .20)
109 | test_idx <- setdiff(idx, validate_idx)
110 | training <- data[training_idx,]
111 | validation <- data[validate_idx,]
112 | test <- data[test_idx,]
113 |
114 |
115 | #-----------------
116 | # 로지스틱 회귀모형
117 | data_lm_full <- glm(class ~ ., data=training, family=binomial)
118 | summary(data_lm_full)
119 | anova(data_lm_full)
120 |
121 | predict(data_lm_full, newdata = data[1:5,], type='response')
122 |
123 | # 모형평가
124 | y_obs <- as.numeric(as.character(validation$class))
125 | yhat_lm <- predict(data_lm_full, newdata = validation, type='response')
126 | table(y_obs, yhat_lm)  # moved below the definitions: y_obs and yhat_lm were referenced before they existed
127 | pred_lm <- prediction(yhat_lm, y_obs)
128 | performance(pred_lm, "auc")@y.values[[1]]
129 | binomial_deviance(y_obs, yhat_lm)
131 |
132 | #-----------------
133 | # 라쏘 모형 적합
134 | # xx <- model.matrix(class ~ .^2-1, data)
135 | xx <- model.matrix(class ~ .-1, data)
136 | x <- xx[training_idx, ]
137 | y <- as.numeric(as.character(training$class))
138 | glimpse(x)
139 |
140 | data_cvfit <- cv.glmnet(x, y, family = "binomial")
141 | plot(data_cvfit)
142 |
143 | png("../../plots/10-3.png", 5.5, 4, units='in', pointsize=9, res=600)
144 | plot(data_cvfit)
145 | dev.off()
146 |
147 | coef(data_cvfit, s = c("lambda.1se"))
148 | coef(data_cvfit, s = c("lambda.min"))
149 |
150 |
151 | predict.cv.glmnet(data_cvfit, s="lambda.min", newx = x[1:5,], type='response')
152 |
153 | yhat_glmnet <- predict(data_cvfit, s="lambda.min", newx=xx[validate_idx,], type='response')
154 | yhat_glmnet <- yhat_glmnet[,1] # change to a vector from [n*1] matrix
155 | pred_glmnet <- prediction(yhat_glmnet, y_obs)
156 | performance(pred_glmnet, "auc")@y.values[[1]]
157 | binomial_deviance(y_obs, yhat_glmnet)
158 |
159 |
160 | #-----------------
161 | # 나무모형
162 | data_tr <- rpart(class ~ ., data = training)
163 | data_tr
164 |
165 | printcp(data_tr)
166 | summary(data_tr)
167 |
168 | png("../../plots/10-4.png", 5.5, 4, units='in', pointsize=9, res=600)
169 | opar <- par(mfrow = c(1,1), xpd = NA)
170 | plot(data_tr)
171 | text(data_tr, use.n = TRUE)
172 | par(opar)
173 | dev.off()
174 |
175 |
176 | yhat_tr <- predict(data_tr, validation)
177 | yhat_tr <- yhat_tr[,"1"]
178 | pred_tr <- prediction(yhat_tr, y_obs)
179 | performance(pred_tr, "auc")@y.values[[1]]
180 | binomial_deviance(y_obs, yhat_tr)
181 |
182 |
183 | #-----------------
184 | # 랜덤포레스트
185 | set.seed(1607)
186 | data_rf <- randomForest(class ~ ., training)
187 | data_rf
188 |
189 | png("../../plots/10-5.png", 5.5*1.5, 4*1.2, units='in', pointsize=9, res=600)
190 | opar <- par(mfrow=c(1,2))
191 | plot(data_rf)
192 | varImpPlot(data_rf)
193 | par(opar)
194 | dev.off()
195 |
196 |
197 | yhat_rf <- predict(data_rf, newdata=validation, type='prob')[,'1']
198 | pred_rf <- prediction(yhat_rf, y_obs)
199 | performance(pred_rf, "auc")@y.values[[1]]
200 | binomial_deviance(y_obs, yhat_rf)
201 |
202 |
203 | #-----------------
204 | # 부스팅
205 | set.seed(1607)
206 | data_for_gbm <-
207 | training %>%
208 | mutate(class=as.numeric(as.character(class)))
209 | data_gbm <- gbm(class ~ ., data=data_for_gbm, distribution="bernoulli",
210 | n.trees=50000, cv.folds=3, verbose=TRUE)
211 | (best_iter = gbm.perf(data_gbm, method="cv"))
212 |
213 | png("../../plots/10-6.png", 5.5, 4, units='in', pointsize=9, res=600)
214 | (best_iter = gbm.perf(data_gbm, method="cv"))
215 | dev.off()
216 |
217 | yhat_gbm <- predict(data_gbm, n.trees=best_iter, newdata=validation, type='response')
218 | pred_gbm <- prediction(yhat_gbm, y_obs)
219 | performance(pred_gbm, "auc")@y.values[[1]]
220 | binomial_deviance(y_obs, yhat_gbm)
221 |
222 | #------------------
223 | # 최종 모형선택과 테스트셋 오차계산
224 | data.frame(method=c('lm', 'glmnet', 'rf', 'gbm'),
225 | auc = c(performance(pred_lm, "auc")@y.values[[1]],
226 | performance(pred_glmnet, "auc")@y.values[[1]],
227 | performance(pred_rf, "auc")@y.values[[1]],
228 | performance(pred_gbm, "auc")@y.values[[1]]),
229 | bin_dev = c(binomial_deviance(y_obs, yhat_lm),
230 | binomial_deviance(y_obs, yhat_glmnet),
231 | binomial_deviance(y_obs, yhat_rf),
232 | binomial_deviance(y_obs, yhat_gbm)))
233 |
234 | # glmnet이 최종 승리자:
235 | y_obs_test <- as.numeric(as.character(test$class))
236 | yhat_glmnet_test <- predict(data_cvfit, s="lambda.min", newx=xx[test_idx,], type='response')
237 | yhat_glmnet_test <- yhat_glmnet_test[,1]
238 | pred_glmnet_test <- prediction(yhat_glmnet_test, y_obs_test)
239 | performance(pred_glmnet_test, "auc")@y.values[[1]]
240 | binomial_deviance(y_obs_test, yhat_glmnet_test)
241 |
242 | # 예측값들의 상관관계
243 |
244 |
245 | #-----------
246 | # ROC 커브
247 | perf_lm <- performance(pred_lm, measure = "tpr", x.measure = "fpr")
248 | perf_glmnet <- performance(pred_glmnet, measure="tpr", x.measure="fpr")
249 | perf_rf <- performance(pred_rf, measure="tpr", x.measure="fpr")
250 | perf_gbm <- performance(pred_gbm, measure="tpr", x.measure="fpr")
251 |
252 |
253 | png("../../plots/10-7.png", 5.5, 4, units='in', pointsize=9, res=600)
254 | plot(perf_lm, col='black', main="ROC Curve")
255 | plot(perf_glmnet, add=TRUE, col='blue')
256 | plot(perf_rf, add=TRUE, col='red')
257 | plot(perf_gbm, add=TRUE, col='cyan')
258 | abline(0,1)
259 | legend('bottomright', inset=.1,
260 | legend=c("GLM", "glmnet", "RF", "GBM"),
261 | col=c('black', 'blue', 'red', 'cyan'), lty=1, lwd=2)
262 | dev.off()
263 |
264 |
265 | png("../../plots/10-8.png", 5.5, 4, units='in', pointsize=9, res=600)
266 | pairs(data.frame(y_obs=y_obs,
267 | yhat_lm=yhat_lm,
268 | yhat_glmnet=c(yhat_glmnet),
269 | yhat_rf=yhat_rf,
270 | yhat_gbm=yhat_gbm),
271 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
272 | upper.panel = panel.cor)
273 | dev.off()
274 |
--------------------------------------------------------------------------------
/ch08-classification/breast-cancer/breast-cancer.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
--------------------------------------------------------------------------------
/ch08-classification/spam-detection/spam-detection.R:
--------------------------------------------------------------------------------
1 | # 빅데이터 분별분석. 스팸 메일 예측
2 | #
3 | if (!file.exists("spambase.data")){
4 | system('curl https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data > spambase.data')
5 | system('curl https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.names > spambase.names')
6 | }
7 |
8 | binomial_deviance <- function(y_obs, yhat){
9 | epsilon = 0.0001
10 | yhat = ifelse(yhat < epsilon, epsilon, yhat)
11 | yhat = ifelse(yhat > 1-epsilon, 1-epsilon, yhat)
12 | a = ifelse(y_obs==0, 0, y_obs * log(y_obs/yhat))
13 | b = ifelse(y_obs==1, 0, (1-y_obs) * log((1-y_obs)/(1-yhat)))
14 | return(2*sum(a + b))
15 | }
16 |
17 |
18 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
19 | usr <- par("usr"); on.exit(par(usr))
20 | par(usr = c(0, 1, 0, 1))
21 | r <- abs(cor(x, y))
22 | txt <- format(c(r, 0.123456789), digits = digits)[1]
23 | txt <- paste0(prefix, txt)
24 | if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
25 | text(0.5, 0.5, txt, cex = cex.cor * r)
26 | }
27 |
28 |
29 |
30 | library(dplyr)
31 | library(ggplot2)
32 | library(MASS)
33 | library(glmnet)
34 | library(randomForest)
35 | library(gbm)
36 | library(rpart)
37 | library(boot)
38 | library(data.table)
39 | library(ROCR)
40 | library(gridExtra)
41 |
42 | data <- tbl_df(read.table("spambase.data", strip.white = TRUE,
43 | sep=",", header = FALSE))
44 | names(data) <-
45 | c('word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our',
46 | 'word_freq_over', 'word_freq_remove', 'word_freq_internet', 'word_freq_order', 'word_freq_mail',
47 | 'word_freq_receive', 'word_freq_will', 'word_freq_people', 'word_freq_report', 'word_freq_addresses',
48 | 'word_freq_free', 'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
49 | 'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp',
50 | 'word_freq_hpl', 'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs',
51 | 'word_freq_telnet', 'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85',
52 | 'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
53 | 'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project', 'word_freq_re',
54 | 'word_freq_edu', 'word_freq_table', 'word_freq_conference', 'char_freq_;', 'char_freq_(',
55 | 'char_freq_[', 'char_freq_!', 'char_freq_$', 'char_freq_#', 'capital_run_length_average',
56 | 'capital_run_length_longest', 'capital_run_length_total',
57 | # 'spam'
58 | 'class'
59 | )
60 | names(data)[58] <- 'class'
61 | data$class <- factor(data$class)
62 |
63 | glimpse(data)
64 |
65 | summary(data)
66 |
67 | png("../../plots/11-1.png", 5.5*1.2, 4*1.2, units='in', pointsize=10, res=600)
68 | set.seed(1610)
69 | pairs(data %>% dplyr::select(1:10, 58) %>%
70 | sample_n(min(1000, nrow(data))),
71 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
72 | upper.panel = panel.cor)
73 | dev.off()
74 |
75 |
76 | png("../../plots/11-2.png", 5.5*1.2, 4*1.2, units='in', pointsize=10, res=600)
77 | set.seed(1610)
78 | pairs(data %>% dplyr::select(48:57, 58) %>%
79 | sample_n(min(1000, nrow(data))),
80 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
81 | upper.panel = panel.cor)
82 | dev.off()
83 |
84 | #
85 | tmp <- as.data.frame(cor(data[,-58], as.numeric(data$class)))
86 | tmp <- tmp %>% rename(cor=V1)
87 | tmp$var <- rownames(tmp)
88 | tmp %>%
89 | ggplot(aes(reorder(var, cor), cor)) +
90 | geom_point() +
91 | coord_flip()
92 | ggsave("../../plots/11-3.png", width=5.5*1.8, height=4*1.8, units='in', dpi=400)
93 |
94 |
95 | library(ggplot2)
96 | library(dplyr)
97 | library(gridExtra)
98 | p1 <- data %>% ggplot(aes(class)) + geom_bar()
99 | p2 <- data %>% ggplot(aes(class, `char_freq_$`)) +
100 | geom_jitter(col='gray') +
101 | geom_boxplot(alpha=.5) +
102 | scale_y_sqrt()
103 | p3 <- data %>% ggplot(aes(`char_freq_$`, group=class, fill=class)) +
104 | geom_density(alpha=.5) +
105 | scale_x_sqrt() + scale_y_sqrt()
106 | p4 <- data %>% ggplot(aes(class, capital_run_length_longest)) +
107 | geom_jitter(col='gray') +
108 | geom_boxplot(alpha=.5) +
109 | scale_y_log10()
110 | grid.arrange(p1, p2, p3, p4, ncol=2)
111 |
112 | g <- arrangeGrob(p1, p2, p3, p4, ncol=2)
113 | ggsave("../../plots/11-4.png", g, width=5.5, height=4, units='in', dpi=600)
114 |
115 | ?'`'
116 |
117 |
118 | # 변수명의 특수문자 처리
119 |
120 | old_names <- names(data)
121 | new_names <- make.names(names(data), unique = TRUE)
122 | cbind(old_names, new_names) [old_names!=new_names, ]
123 |
124 | names(data) <- new_names
125 |
126 | # 트레인셋(60%), 검증셋(20%), 테스트셋(20%)의 구분
127 | set.seed(1606)
128 | n <- nrow(data)
129 | idx <- 1:n
130 | training_idx <- sample(idx, n * .60)
131 | idx <- setdiff(idx, training_idx)
132 | validate_idx <- sample(idx, n * .20)
133 | test_idx <- setdiff(idx, validate_idx)
134 | training <- data[training_idx,]
135 | validation <- data[validate_idx,]
136 | test <- data[test_idx,]
137 |
138 |
139 | #-----------------
140 | # 로지스틱 회귀모형
141 | data_lm_full <- glm(class ~ ., data=training, family=binomial)
142 | summary(data_lm_full)
143 |
144 | predict(data_lm_full, newdata = data[1:5,], type='response')
145 |
146 | # 모형평가
147 | y_obs <- as.numeric(as.character(validation$class))
148 | yhat_lm <- predict(data_lm_full, newdata = validation, type='response')
149 | pred_lm <- prediction(yhat_lm, y_obs)
150 | performance(pred_lm, "auc")@y.values[[1]]
151 | binomial_deviance(y_obs, yhat_lm)
152 |
153 | #-----------------
154 | # 라쏘 모형 적합
155 | # xx <- model.matrix(class ~ .^2-1, data)
156 | xx <- model.matrix(class ~ .-1, data)
157 | x <- xx[training_idx, ]
158 | y <- as.numeric(as.character(training$class))
159 | glimpse(x)
160 |
161 | data_cvfit <- cv.glmnet(x, y, family = "binomial")
162 | plot(data_cvfit)
163 |
164 | png("../../plots/11-5.png", 5.5, 4, units='in', pointsize=9, res=600)
165 | plot(data_cvfit)
166 | dev.off()
167 |
168 | coef(data_cvfit, s = c("lambda.1se"))
169 | coef(data_cvfit, s = c("lambda.min"))
170 |
171 |
172 | predict.cv.glmnet(data_cvfit, s="lambda.min", newx = x[1:5,], type='response')
173 |
174 | yhat_glmnet <- predict(data_cvfit, s="lambda.min", newx=xx[validate_idx,], type='response')
175 | yhat_glmnet <- yhat_glmnet[,1] # change to a vector from [n*1] matrix
176 | pred_glmnet <- prediction(yhat_glmnet, y_obs)
177 | performance(pred_glmnet, "auc")@y.values[[1]]
178 | binomial_deviance(y_obs, yhat_glmnet)
179 |
180 |
181 | #-----------------
182 | # 나무모형
183 | data_tr <- rpart(class ~ ., data = training)
184 | data_tr
185 |
186 | printcp(data_tr)
187 | summary(data_tr)
188 |
189 | png("../../plots/11-6.png", 5.5, 4, units='in', pointsize=9, res=600)
190 | opar <- par(mfrow = c(1,1), xpd = NA)
191 | plot(data_tr)
192 | text(data_tr, use.n = TRUE)
193 | par(opar)
194 | dev.off()
195 |
196 |
197 | yhat_tr <- predict(data_tr, validation)
198 | yhat_tr <- yhat_tr[,"1"]
199 | pred_tr <- prediction(yhat_tr, y_obs)
200 | performance(pred_tr, "auc")@y.values[[1]]
201 | binomial_deviance(y_obs, yhat_tr)
202 |
203 |
204 | #-----------------
205 | # 랜덤포레스트
206 | set.seed(1607)
207 | data_rf <- randomForest(class ~ ., data=training)
208 | data_rf
209 |
210 | png("../../plots/11-7.png", 5.5*1.5, 4*1.2, units='in', pointsize=8, res=600)
211 | opar <- par(mfrow=c(1,2))
212 | plot(data_rf)
213 | varImpPlot(data_rf)
214 | par(opar)
215 | dev.off()
216 |
217 |
218 | yhat_rf <- predict(data_rf, newdata=validation, type='prob')[,'1']
219 | pred_rf <- prediction(yhat_rf, y_obs)
220 | performance(pred_rf, "auc")@y.values[[1]]
221 | binomial_deviance(y_obs, yhat_rf)
222 |
223 |
224 | #-----------------
225 | # 부스팅
226 | set.seed(1607)
227 | data_for_gbm <-
228 | training %>%
229 | mutate(class=as.numeric(as.character(class)))
230 | data_gbm <- gbm(class ~ ., data=data_for_gbm, distribution="bernoulli",
231 | n.trees=100000, cv.folds=3, verbose=TRUE)
232 |
233 | png("../../plots/11-8.png", 5.5, 4, units='in', pointsize=9, res=600)
234 | (best_iter = gbm.perf(data_gbm, method="cv"))
235 | dev.off()
236 |
237 | yhat_gbm <- predict(data_gbm, n.trees=best_iter, newdata=validation, type='response')
238 | pred_gbm <- prediction(yhat_gbm, y_obs)
239 | performance(pred_gbm, "auc")@y.values[[1]]
240 | binomial_deviance(y_obs, yhat_gbm)
241 |
242 | #------------------
243 | # 최종 모형선택과 테스트셋 오차계산
244 | data.frame(method=c('lm', 'glmnet', 'rf', 'gbm'),
245 | auc = c(performance(pred_lm, "auc")@y.values[[1]],
246 | performance(pred_glmnet, "auc")@y.values[[1]],
247 | performance(pred_rf, "auc")@y.values[[1]],
248 | performance(pred_gbm, "auc")@y.values[[1]]),
249 | bin_dev = c(binomial_deviance(y_obs, yhat_lm),
250 | binomial_deviance(y_obs, yhat_glmnet),
251 | binomial_deviance(y_obs, yhat_rf),
252 | binomial_deviance(y_obs, yhat_gbm)))
253 |
254 | # glmnet이 최종 승리자인 경우:
255 | y_obs_test <- as.numeric(as.character(test$class))
256 | yhat_glmnet_test <- predict(data_cvfit, s="lambda.min", newx=xx[test_idx,], type='response')
257 | yhat_glmnet_test <- yhat_glmnet_test[,1]
258 | pred_glmnet_test <- prediction(yhat_glmnet_test, y_obs_test)
259 | performance(pred_glmnet_test, "auc")@y.values[[1]]
260 | binomial_deviance(y_obs_test, yhat_glmnet_test)
261 |
262 | # 랜덤포레스트가 최종 승리자인 경우:
263 | y_obs_test <- as.numeric(as.character(test$class))
264 | yhat_rf_test <- predict(data_rf, newdata=test, type='prob')[,'1']
265 | pred_rf_test <- prediction(yhat_rf_test, y_obs_test)
266 | performance(pred_rf_test, "auc")@y.values[[1]]
267 | binomial_deviance(y_obs_test, yhat_rf_test)
268 |
269 |
270 | # 예측값들의 상관관계
271 | pairs(data.frame(y_obs=y_obs,
272 | yhat_lm=yhat_lm,
273 | yhat_glmnet=c(yhat_glmnet),
274 | yhat_rf=yhat_rf,
275 | yhat_gbm=yhat_gbm),
276 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
277 | upper.panel = panel.cor)
278 |
279 |
280 | #-----------
281 | # ROC 커브
282 | perf_lm <- performance(pred_lm, measure = "tpr", x.measure = "fpr")
283 | perf_glmnet <- performance(pred_glmnet, measure="tpr", x.measure="fpr")
284 | perf_rf <- performance(pred_rf, measure="tpr", x.measure="fpr")
285 | perf_gbm <- performance(pred_gbm, measure="tpr", x.measure="fpr")
286 |
287 | png("../../plots/11-9.png", 5.5, 4, units='in', pointsize=9, res=600)
288 | plot(perf_lm, col='black', main="ROC Curve")
289 | plot(perf_glmnet, add=TRUE, col='blue')
290 | plot(perf_rf, add=TRUE, col='red')
291 | plot(perf_gbm, add=TRUE, col='cyan')
292 | abline(0,1)
293 | legend('bottomright', inset=.1,
294 | legend=c("GLM", "glmnet", "RF", "GBM"),
295 | col=c('black', 'blue', 'red', 'cyan'), lty=1, lwd=2)
296 | dev.off()
297 |
298 |
299 | png("../../plots/11-10.png", 5.5, 4, units='in', pointsize=9, res=600)
300 | pairs(data.frame(y_obs=y_obs,
301 | yhat_lm=yhat_lm,
302 | yhat_glmnet=c(yhat_glmnet),
303 | yhat_rf=yhat_rf,
304 | yhat_gbm=yhat_gbm),
305 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
306 | upper.panel = panel.cor)
307 | dev.off()
308 |
--------------------------------------------------------------------------------
/ch08-classification/spam-detection/spam-detection.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
--------------------------------------------------------------------------------
/ch08-classification/spam-detection/spam-detection.md:
--------------------------------------------------------------------------------
1 | # spam-detection
2 | 스팸 메일 예측 문제
3 |
4 | 변수들:
5 |
6 | spam
7 | word_freq_make
8 | word_freq_address
9 | word_freq_all
10 | word_freq_3d
11 | word_freq_our
12 | word_freq_over
13 | word_freq_remove
14 | word_freq_internet
15 | word_freq_order
16 | word_freq_mail
17 | word_freq_receive
18 | word_freq_will
19 | word_freq_people
20 | word_freq_report
21 | word_freq_addresses
22 | word_freq_free
23 | word_freq_business
24 | word_freq_email
25 | word_freq_you
26 | word_freq_credit
27 | word_freq_your
28 | word_freq_font
29 | word_freq_000
30 | word_freq_money
31 | word_freq_hp
32 | word_freq_hpl
33 | word_freq_george
34 | word_freq_650
35 | word_freq_lab
36 | word_freq_labs
37 | word_freq_telnet
38 | word_freq_857
39 | word_freq_data
40 | word_freq_415
41 | word_freq_85
42 | word_freq_technology
43 | word_freq_1999
44 | word_freq_parts
45 | word_freq_pm
46 | word_freq_direct
47 | word_freq_cs
48 | word_freq_meeting
49 | word_freq_original
50 | word_freq_project
51 | word_freq_re
52 | word_freq_edu
53 | word_freq_table
54 | word_freq_conference
55 | char_freq_;
56 | char_freq_(
57 | char_freq_[
58 | char_freq_!
59 | char_freq_$
60 | char_freq_#
61 | capital_run_length_average
62 | capital_run_length_longest
63 | capital_run_length_total
64 |
--------------------------------------------------------------------------------
/ch12-r-markdown/ch10-r-markdown.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "R Markdown 예제"
3 | author: "권재명"
4 | output: html_document
5 | ---
6 |
7 | ```{r setup, include=FALSE}
8 | knitr::opts_chunk$set(echo = TRUE)
9 | ```
10 |
11 | R 마크다운을 사용하여 R 코드, 코드실행 결과 텍스트, 도표를 포함한 문서를 쉽게 작성할 수 있습니다.
12 |
13 | ```{r cars, fig.width=4, fig.height=3, message=FALSE}
14 | library(ggplot2)
15 | qplot(speed, dist, data=cars) +
16 | geom_smooth()
17 | summary(cars)
18 | ```
19 |
--------------------------------------------------------------------------------
/ch12-r-markdown/ch10-r-markdown.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
--------------------------------------------------------------------------------
/ch13-regression/housing/housing.R:
--------------------------------------------------------------------------------
1 | # 빅데이터 회귀분석. 부동산 가격 예측
2 | #
3 | if (!file.exists("housing.data")){
4 | system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data > housing.data')
5 | system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names > housing.names')
6 | }
7 |
8 | rmse <- function(yi, yhat_i){
9 | sqrt(mean((yi - yhat_i)^2))
10 | }
11 |
12 | mae <- function(yi, yhat_i){
13 | mean(abs(yi - yhat_i))
14 | }
15 |
16 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
17 | usr <- par("usr"); on.exit(par(usr))
18 | par(usr = c(0, 1, 0, 1))
19 | r <- abs(cor(x, y))
20 | txt <- format(c(r, 0.123456789), digits = digits)[1]
21 | txt <- paste0(prefix, txt)
22 | if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
23 | text(0.5, 0.5, txt, cex = cex.cor * r)
24 | }
25 |
26 |
27 |
28 | library(dplyr)
29 | library(ggplot2)
30 | library(MASS)
31 | library(glmnet)
32 | library(randomForest)
33 | library(gbm)
34 | library(rpart)
35 | library(boot)
36 | library(data.table)
37 | library(ROCR)
38 | library(gridExtra)
39 |
40 | data <- tbl_df(read.table("housing.data", strip.white = TRUE))
41 | names(data) <- c('crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
42 | 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat', 'medv')
43 | glimpse(data)
44 |
45 | summary(data)
46 |
47 | pairs(data %>% sample_n(min(1000, nrow(data))))
48 |
49 | png("../../plots/13-1.png", 5.5*1.2, 4*1.2, units='in', pointsize=9, res=600)
50 | pairs(data %>% sample_n(min(1000, nrow(data))),
51 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
52 | upper.panel = panel.cor)
53 | dev.off()
54 |
55 |
56 | # 트레인셋(60%), 검증셋(20%), 테스트셋(20%)의 구분
57 | set.seed(1606)
58 | n <- nrow(data)
59 | idx <- 1:n
60 | training_idx <- sample(idx, n * .60)
61 | idx <- setdiff(idx, training_idx)
62 | validate_idx <- sample(idx, n * .20)
63 | test_idx <- setdiff(idx, validate_idx)
64 | training <- data[training_idx,]
65 | validation <- data[validate_idx,]
66 | test <- data[test_idx,]
67 |
68 |
69 | # 선형회귀모형 (linear regression model)
70 | data_lm_full <- lm(medv ~ ., data=training)
71 | summary(data_lm_full)
72 |
73 | predict(data_lm_full, newdata = data[1:5,])
74 |
75 | # 선형회귀모형에서 변수선택
76 | data_lm_full_2 <- lm(medv ~ .^2, data=training)
77 | summary(data_lm_full_2)
78 |
79 | length(coef(data_lm_full_2))
80 |
81 | library(MASS)
82 | data_step <- stepAIC(data_lm_full,
83 | scope = list(upper = ~ .^2, lower = ~1))
84 |
85 | data_step
86 | anova(data_step)
87 | summary(data_step)
88 | length(coef(data_step))
89 |
90 |
91 | # 모형평가
92 | y_obs <- validation$medv
93 | yhat_lm <- predict(data_lm_full, newdata=validation)
94 | yhat_lm_2 <- predict(data_lm_full_2, newdata=validation)
95 | yhat_step <- predict(data_step, newdata=validation)
96 | rmse(y_obs, yhat_lm)
97 | rmse(y_obs, yhat_lm_2)
98 | rmse(y_obs, yhat_step)
99 |
100 |
101 | # 라쏘 모형 적합
102 | xx <- model.matrix(medv ~ .^2-1, data)
103 | x <- xx[training_idx, ]
104 | y <- training$medv
105 | glimpse(x)
106 |
107 | data_cvfit <- cv.glmnet(x, y)
108 | plot(data_cvfit)
109 |
110 |
111 | png("../../plots/13-2.png", 5.5, 4, units='in', pointsize=9, res=600)
112 | plot(data_cvfit)
113 | dev.off()
114 |
115 | coef(data_cvfit, s = c("lambda.1se"))
116 | coef(data_cvfit, s = c("lambda.min"))
117 |
118 | (tmp <- coef(data_cvfit, s = c("lambda.1se")))
119 | tmp <- tmp[,1]
120 | length(tmp[abs(tmp)>0])
121 | (tmp <- coef(data_cvfit, s = c("lambda.min")))
122 | length(tmp[abs(tmp)>0])
123 |
124 | predict.cv.glmnet(data_cvfit, s="lambda.min", newx = x[1:5,])
125 |
126 | y_obs <- validation$medv
127 | yhat_glmnet <- predict(data_cvfit, s="lambda.min", newx=xx[validate_idx,])
128 | yhat_glmnet <- yhat_glmnet[,1] # change to a vector from [n*1] matrix
129 | rmse(y_obs, yhat_glmnet)
130 |
131 | # 나무모형
132 | data_tr <- rpart(medv ~ ., data = training)
133 | data_tr
134 |
135 | printcp(data_tr)
136 | summary(data_tr)
137 |
138 | png("../../plots/13-3.png", 5.5, 4, units='in', pointsize=9, res=600)
139 | opar <- par(mfrow = c(1,1), xpd = NA)
140 | plot(data_tr)
141 | text(data_tr, use.n = TRUE)
142 | par(opar)
143 | dev.off()
144 |
145 |
146 | yhat_tr <- predict(data_tr, validation)
147 | rmse(y_obs, yhat_tr)
148 |
149 |
150 | # 랜덤포레스트
151 | set.seed(1607)
152 | data_rf <- randomForest(medv ~ ., training)
153 | data_rf
154 |
155 | png("../../plots/13-4.png", 5.5, 4*.8, units='in', pointsize=9, res=600)
156 | par(mfrow=c(1,2))
157 | plot(data_rf)
158 | varImpPlot(data_rf)
159 | dev.off()
160 |
161 |
162 | yhat_rf <- predict(data_rf, newdata=validation)
163 | rmse(y_obs, yhat_rf)
164 |
165 |
166 | # 부스팅
167 | set.seed(1607)
168 | data_gbm <- gbm(medv ~ ., data=training,
169 | n.trees=40000, cv.folds=3, verbose = TRUE)
170 |
171 | png("../../plots/13-5.png", 5.5, 4, units='in', pointsize=9, res=600)
172 | (best_iter = gbm.perf(data_gbm, method="cv"))
173 | dev.off()
174 |
175 | yhat_gbm <- predict(data_gbm, n.trees=best_iter, newdata=validation)
176 | rmse(y_obs, yhat_gbm)
177 |
178 |
179 | # 최종 모형선택과 테스트셋 오차계산
180 | data.frame(lm = rmse(y_obs, yhat_step),
181 | glmnet = rmse(y_obs, yhat_glmnet),
182 | rf = rmse(y_obs, yhat_rf),
183 | gbm = rmse(y_obs, yhat_gbm)) %>%
184 | reshape2::melt(value.name = 'rmse', variable.name = 'method')
185 |
186 | rmse(test$medv, predict(data_rf, newdata = test))
187 |
188 |
189 | # 회귀분석의 오차의 시각화
190 | png("../../plots/13-6.png", 5.5, 4, units='in', pointsize=9, res=600)
191 | boxplot(list(lm = y_obs-yhat_step,
192 | glmnet = y_obs-yhat_glmnet,
193 | rf = y_obs-yhat_rf,
194 | gbm = y_obs-yhat_gbm), ylab="Error in Validation Set")
195 | abline(h=0, lty=2, col='blue')
196 | dev.off()
197 |
198 |
199 | png("../../plots/13-7.png", 5.5, 4, units='in', pointsize=9, res=600)
200 | pairs(data.frame(y_obs=y_obs,
201 | yhat_lm=yhat_step,
202 | yhat_glmnet=c(yhat_glmnet),
203 | yhat_rf=yhat_rf,
204 | yhat_gbm=yhat_gbm),
205 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
206 | upper.panel = panel.cor)
207 | dev.off()
208 |
--------------------------------------------------------------------------------
/ch13-regression/housing/housing.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
--------------------------------------------------------------------------------
/ch13-regression/wine-quality/wine-quality.R:
--------------------------------------------------------------------------------
1 | # 빅데이터 회귀분석. 와인 품질 예측
2 | #
3 | if (!file.exists("winequality-white.csv")){
4 | system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv > winequality-red.csv')
5 | system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv > winequality-white.csv')
6 | system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names > winequality.names')
7 | }
8 |
9 | rmse <- function(yi, yhat_i){
10 | sqrt(mean((yi - yhat_i)^2))
11 | }
12 |
13 | mae <- function(yi, yhat_i){
14 | mean(abs(yi - yhat_i))
15 | }
16 |
17 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
18 | usr <- par("usr"); on.exit(par(usr))
19 | par(usr = c(0, 1, 0, 1))
20 | r <- abs(cor(x, y))
21 | txt <- format(c(r, 0.123456789), digits = digits)[1]
22 | txt <- paste0(prefix, txt)
23 | if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
24 | text(0.5, 0.5, txt, cex = cex.cor * r)
25 | }
26 |
27 |
28 |
29 | library(dplyr)
30 | library(ggplot2)
31 | library(MASS)
32 | library(glmnet)
33 | library(randomForest)
34 | library(gbm)
35 | library(rpart)
36 | library(boot)
37 | library(data.table)
38 | library(ROCR)
39 | library(gridExtra)
40 |
41 | data <- tbl_df(read.table("winequality-white.csv", strip.white = TRUE,
42 | sep=";", header = TRUE))
43 | glimpse(data)
44 |
45 | summary(data)
46 |
47 | pairs(data %>% sample_n(min(1000, nrow(data))))
48 |
49 | png("../../plots/14-1.png", 5.5*1.2, 4*1.2, units='in', pointsize=10, res=600)
50 | set.seed(1704)
51 | pairs(data %>% sample_n(min(1000, nrow(data))),
52 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
53 | upper.panel = panel.cor)
54 | dev.off()
55 |
56 |
57 | library(ggplot2)
58 | library(dplyr)
59 | library(gridExtra)
60 | p1 <- data %>% ggplot(aes(quality)) + geom_bar()
61 | p2 <- data %>% ggplot(aes(factor(quality), alcohol)) + geom_boxplot()
62 | p3 <- data %>% ggplot(aes(factor(quality), density)) + geom_boxplot()
63 | p4 <- data %>% ggplot(aes(alcohol, density)) + geom_point(alpha=.1) + geom_smooth()
64 | grid.arrange(p1, p2, p3, p4, ncol=2)
65 | g <- arrangeGrob(p1, p2, p3, p4, ncol=2)
66 | ggsave("../../plots/14-2.png", g, width=5.5, height=4, units='in', dpi=600)
67 |
68 |
69 | # 트레인셋(60%), 검증셋(20%), 테스트셋(20%)의 구분
70 | set.seed(1606)
71 | n <- nrow(data)
72 | idx <- 1:n
73 | training_idx <- sample(idx, n * .60)
74 | idx <- setdiff(idx, training_idx)
75 | validate_idx <- sample(idx, n * .20)
76 | test_idx <- setdiff(idx, validate_idx)
77 | training <- data[training_idx,]
78 | validation <- data[validate_idx,]
79 | test <- data[test_idx,]
80 |
81 |
82 | # 선형회귀모형 (linear regression model)
83 | data_lm_full <- lm(quality ~ ., data=training)
84 | summary(data_lm_full)
85 |
86 | predict(data_lm_full, newdata = data[1:5,])
87 |
88 | # 선형회귀모형에서 변수선택
89 | data_lm_full_2 <- lm(quality ~ .^2, data=training)
90 | summary(data_lm_full_2)
91 |
92 | length(coef(data_lm_full_2))
93 |
94 | library(MASS)
95 | data_step <- stepAIC(data_lm_full,
96 | scope = list(upper = ~ .^2, lower = ~1))
97 |
98 | data_step
99 | anova(data_step)
100 | summary(data_step)
101 | length(coef(data_step))
102 |
103 |
104 | # 모형평가
105 | y_obs <- validation$quality
106 | yhat_lm <- predict(data_lm_full, newdata=validation)
107 | yhat_lm_2 <- predict(data_lm_full_2, newdata=validation)
108 | yhat_step <- predict(data_step, newdata=validation)
109 | rmse(y_obs, yhat_lm)
110 | rmse(y_obs, yhat_lm_2)
111 | rmse(y_obs, yhat_step)
112 |
113 |
114 | # 라쏘 모형 적합
115 | xx <- model.matrix(quality ~ .^2-1, data)
116 | # xx <- model.matrix(quality ~ .-1, data)
117 | x <- xx[training_idx, ]
118 | y <- training$quality
119 | glimpse(x)
120 |
121 | data_cvfit <- cv.glmnet(x, y)
122 |
123 | png("../../plots/14-3.png", 5.5, 4, units='in', pointsize=10, res=600)
124 | plot(data_cvfit)
125 | dev.off()
126 |
127 |
128 | coef(data_cvfit, s = c("lambda.1se"))
129 | coef(data_cvfit, s = c("lambda.min"))
130 |
131 | (tmp <- coef(data_cvfit, s = c("lambda.1se")))
132 | length(tmp[abs(tmp)>0])
133 | (tmp <- coef(data_cvfit, s = c("lambda.min")))
134 | length(tmp[abs(tmp)>0])
135 |
136 | predict.cv.glmnet(data_cvfit, s="lambda.min", newx = x[1:5,])
137 |
138 | y_obs <- validation$quality
139 | yhat_glmnet <- predict(data_cvfit, s="lambda.min", newx=xx[validate_idx,])
140 | yhat_glmnet <- yhat_glmnet[,1] # change to a vector from [n*1] matrix
141 | rmse(y_obs, yhat_glmnet)
142 |
143 | # 나무모형
144 | data_tr <- rpart(quality ~ ., data = training)
145 | data_tr
146 |
147 | printcp(data_tr)
148 | summary(data_tr)
149 |
150 | png("../../plots/14-4.png", 5.5, 4, units='in', pointsize=10, res=600)
151 | opar <- par(mfrow = c(1,1), xpd = NA)
152 | plot(data_tr)
153 | text(data_tr, use.n = TRUE)
154 | par(opar)
155 | dev.off()
156 |
157 | yhat_tr <- predict(data_tr, validation)
158 | rmse(y_obs, yhat_tr)
159 |
160 |
161 | # 랜덤포레스트
162 | set.seed(1607)
163 | data_rf <- randomForest(quality ~ ., training)
164 | data_rf
165 |
166 | png("../../plots/14-5.png", 5.5*1.5, 4, units='in', pointsize=9, res=600)
167 | opar <- par(mfrow=c(1,2))
168 | plot(data_rf)
169 | varImpPlot(data_rf)
170 | par(opar)
171 | dev.off()
172 |
173 | yhat_rf <- predict(data_rf, newdata=validation)
174 | rmse(y_obs, yhat_rf)
175 |
176 |
177 | # 부스팅
178 | set.seed(1607)
179 | data_gbm <- gbm(quality ~ ., data=training,
180 | n.trees=40000, cv.folds=3, verbose = TRUE)
181 |
182 | png("../../plots/14-6.png", 5.5, 4, units='in', pointsize=9, res=600)
183 | (best_iter = gbm.perf(data_gbm, method="cv"))
184 | dev.off()
185 |
186 | yhat_gbm <- predict(data_gbm, n.trees=best_iter, newdata=validation)
187 | rmse(y_obs, yhat_gbm)
188 |
189 |
190 | # 최종 모형선택과 테스트셋 오차계산
191 | data.frame(lm = rmse(y_obs, yhat_step),
192 | glmnet = rmse(y_obs, yhat_glmnet),
193 | rf = rmse(y_obs, yhat_rf),
194 | gbm = rmse(y_obs, yhat_gbm)) %>%
195 | reshape2::melt(value.name = 'rmse', variable.name = 'method')
196 |
197 | rmse(test$quality, predict(data_rf, newdata = test))
198 |
199 |
200 | # 회귀분석의 오차의 시각화
201 | boxplot(list(lm = y_obs-yhat_step,
202 | glmnet = y_obs-yhat_glmnet,
203 | rf = y_obs-yhat_rf,
204 | gbm = y_obs-yhat_gbm), ylab="Error in Validation Set")
205 | abline(h=0, lty=2, col='blue')
206 |
207 |
208 | png("../../plots/14-7.png", 5.5, 4, units='in', pointsize=9, res=600)
209 | pairs(data.frame(y_obs=y_obs,
210 | yhat_lm=yhat_step,
211 | yhat_glmnet=c(yhat_glmnet),
212 | yhat_rf=yhat_rf,
213 | yhat_gbm=yhat_gbm),
214 | lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
215 | upper.panel = panel.cor)
216 | dev.off()
217 |
--------------------------------------------------------------------------------
/ch13-regression/wine-quality/wine-quality.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
--------------------------------------------------------------------------------
/ch15-word-cloud/ch13-word-cloud.R:
--------------------------------------------------------------------------------
1 | # install.packages(c("tm", "SnowballC", "wordcloud"))
2 |
3 | library(tm)
4 | library(SnowballC)
5 | library(wordcloud)
6 | library(dplyr)
7 |
8 | data <- read.csv('JEOPARDY_CSV.csv', stringsAsFactors = FALSE,
9 | nrows = 10000)
10 | dplyr::glimpse(data)
11 |
12 | data_corpus <- Corpus(VectorSource(data$Question))
13 | data_corpus
14 | # ?Corpus
15 | # data_corpus <- tm_map(data_corpus, PlainTextDocument)
16 | # stopwords('english')
17 |
18 | data_corpus <- tm_map(data_corpus, content_transformer(tolower))
19 | as.character(data_corpus[[1]])
20 | data_corpus <- tm_map(data_corpus, removePunctuation)
21 | as.character(data_corpus[[1]])
22 | data_corpus <- tm_map(data_corpus, removeWords, stopwords('english'))
23 | as.character(data_corpus[[1]])
24 |
25 | data_corpus <- tm_map(data_corpus, stemDocument)
26 | as.character(data_corpus[[1]])
27 |
28 | citation(package='wordcloud')
29 | wordcloud(data_corpus, max.words=100, random.order=FALSE,
30 | colors=brewer.pal(8, "Dark2"))
31 |
32 | data$Question[1]
33 | as.character(data_corpus[[1]])
34 |
35 | png("../plots/15-1.png", 5.5, 4, units='in', pointsize=9, res=600)
36 | wordcloud(data_corpus, max.words=100, random.order=FALSE,
37 | colors=brewer.pal(8, "Dark2"))
38 | dev.off()
39 |
--------------------------------------------------------------------------------
/ch15-word-cloud/ch13-word-cloud.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
--------------------------------------------------------------------------------
/figure-export-boilerplate.R:
--------------------------------------------------------------------------------
1 | # A few useful lines to export plots
2 | # 1. base R graph
3 | png("../../plots/.png", 5.5, 4, units='in', pointsize=9, res=600)
4 | dev.off()
5 |
6 | # 2. single ggplot
7 | ggsave("../../plots/.png", width=5.5, height=4, units='in', dpi=600)
8 |
9 | # 3. plot matrix from library(gridExtra)
10 | g <- arrangeGrob(p1, p2, p3, p4, ncol=2)
11 | ggsave("../../plots/.png", g, width=5.5, height=4, units='in', dpi=600)
12 |
--------------------------------------------------------------------------------
/notebooks/download-gapminder-tsv.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "본서에서는 R의 gapminder 패키지를 사용하였다.\n",
8 | "\n",
52 | " \n",
53 | "
\n",
111 | "\n",
54 | " \n",
62 | " \n",
63 | " \n",
64 | " \n",
55 | " country \n",
56 | " continent \n",
57 | " year \n",
58 | " lifeExp \n",
59 | " pop \n",
60 | " gdpPercap \n",
61 | " \n",
65 | " \n",
73 | " 0 \n",
66 | " Afghanistan \n",
67 | " Asia \n",
68 | " 1952 \n",
69 | " 28.801 \n",
70 | " 8425333 \n",
71 | " 779.445314 \n",
72 | " \n",
74 | " \n",
82 | " 1 \n",
75 | " Afghanistan \n",
76 | " Asia \n",
77 | " 1957 \n",
78 | " 30.332 \n",
79 | " 9240934 \n",
80 | " 820.853030 \n",
81 | " \n",
83 | " \n",
91 | " 2 \n",
84 | " Afghanistan \n",
85 | " Asia \n",
86 | " 1962 \n",
87 | " 31.997 \n",
88 | " 10267083 \n",
89 | " 853.100710 \n",
90 | " \n",
92 | " \n",
100 | " 3 \n",
93 | " Afghanistan \n",
94 | " Asia \n",
95 | " 1967 \n",
96 | " 34.020 \n",
97 | " 11537966 \n",
98 | " 836.197138 \n",
99 | " \n",
101 | " \n",
109 | " \n",
110 | "4 \n",
102 | " Afghanistan \n",
103 | " Asia \n",
104 | " 1972 \n",
105 | " 36.088 \n",
106 | " 13079460 \n",
107 | " 739.981106 \n",
108 | "