├── .gitignore
├── README.md
├── addendum
    ├── addendum.Rproj
    ├── dimension-reduction.Rmd
    └── rsconnect
    │   └── documents
    │       └── dimension-reduction.Rmd
    │           └── rpubs.com
    │               └── rpubs
    │                   ├── Document.dcf
    │                   └── Publish Document.dcf
├── ch03-data-processing
    ├── ch03-data-processing.R
    ├── ch03-data-processing.Rproj
    ├── ch03-data-processing.sh
    └── ch03-data-processing.sql
├── ch04-data-visualization
    ├── ch04-data-visualization.R
    └── ch04-data-visualization.Rproj
├── ch05-coding-style.R
├── ch06-statistics-concepts
    ├── ch06-statistics-concepts.R
    ├── ch06-statistics-concepts.Rproj
    └── ch06-statistics-concepts.ipynb
├── ch07-basic-analysis
    ├── ch07-basic-analysis.R
    ├── ch07-basic-analysis.Rproj
    └── correlation-example.R
├── ch08-classification
    ├── adult
    │   ├── adult.R
    │   ├── adult.Rproj
    │   └── adult.ipynb
    ├── breast-cancer
    │   ├── breast-cancer-wisconsin.R
    │   ├── breast-cancer.R
    │   └── breast-cancer.Rproj
    └── spam-detection
    │   ├── spam-detection.R
    │   ├── spam-detection.Rproj
    │   └── spam-detection.md
├── ch12-r-markdown
    ├── ch10-r-markdown.Rmd
    └── ch10-r-markdown.Rproj
├── ch13-regression
    ├── housing
    │   ├── housing.R
    │   └── housing.Rproj
    └── wine-quality
    │   ├── wine-quality.R
    │   └── wine-quality.Rproj
├── ch15-word-cloud
    ├── ch13-word-cloud.R
    └── ch13-word-cloud.Rproj
├── figure-export-boilerplate.R
├── notebooks
    ├── data
    │   └── gapminder.tsv
    ├── download-gapminder-tsv.ipynb
    ├── python-data-processing.ipynb
    ├── python-data-visualization.ipynb
    └── session-01-official.ipynb
└── solutions
    ├── README.md
    ├── ch03-tidyverse-solutions.Rmd
    ├── ch04-ggplot-solutions.Rmd
    ├── ch08-classification-solutions.Rmd
    ├── ch10-classification-solutions.Rmd
    ├── ch13-regression-solutions.Rmd
    ├── ch15-nlp-solutions.Rmd
    ├── ipds-kr-solutions-toc.Rmd
    ├── rsconnect
        └── documents
        │   ├── ch03-tidyverse-solutions.Rmd
        │       └── rpubs.com
        │       │   └── rpubs
        │       │       ├── Document.dcf
        │       │       └── Publish Document.dcf
        │   ├── ch04-ggplot-solutions.Rmd
        │       └── rpubs.com
        │       │   └── rpubs
        │       │       ├── Document.dcf
        │       │       └── Publish Document.dcf
        │   ├── ch08-classification-solutions.Rmd
        │       └── rpubs.com
        │       │   └── rpubs
        │       │       └── Document.dcf
        │   ├── ch10-classification-solutions.Rmd
        │       └── rpubs.com
        │       │   └── rpubs
        │       │       └── Document.dcf
        │   ├── ch13-regression-solutions.Rmd
        │       └── rpubs.com
        │       │   └── rpubs
        │       │       ├── Document.dcf
        │       │       └── Publish Document.dcf
        │   ├── ch15-nlp-solutions.Rmd
        │       └── rpubs.com
        │       │   └── rpubs
        │       │       ├── Document.dcf
        │       │       └── Publish Document.dcf
        │   └── ipds-kr-solutions-toc.Rmd
        │       └── rpubs.com
        │           └── rpubs
        │               ├── Document.dcf
        │               └── Publish Document.dcf
    └── solutions.Rproj


/.gitignore:
--------------------------------------------------------------------------------
 1 | .Rproj.user
 2 | .Rhistory
 3 | .RData
 4 | .Ruserdata
 5 | 
 6 | _junk/
 7 | *.data
 8 | plots/
 9 | *.names
10 | *.csv
11 | *.zip
12 | 
13 | **/html/
14 | **/*_files/
15 | *.html
16 | 
17 | .ipynb_checkpoints
18 | *cache


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # <따라하며 배우는 데이터 과학> (2017) 소스코드
 2 | "Introduction to Practical Data Science (IPDS)" (2017)
 3 | 
 4 | 책 웹사이트: https://dataninja.me/ipds-kr/
 5 | 
 6 | ## 목차:
 7 | 
 8 | 1. 데이터 과학이란?
 9 | 2. 데이터 분석 환경 구성하기
10 | 3. 데이터 취득과 데이터 가공: SQL과 dplyr
11 | 4. 데이터 시각화 I: ggplot2
12 | 5. 코딩 스타일
13 | 6. 통계의 기본 개념 복습
14 | 7. 데이터 종류에 따른 분석 기법
15 | 8. 빅데이터 분류분석 I: 기본개념과 로지스틱 모형
16 | 9. 빅데이터 분류분석 II: 라쏘와 랜덤 포레스트
17 | 10. 빅데이터 분류분석 III: 암 예측
18 | 11. 빅데이터 분류분석 IV: 스팸 메일 예측
19 | 12. 분석결과 정리와 공유, R 마크다운
20 | 13. 빅데이터 회귀분석 I. 부동산 가격 예측
21 | 14. 빅데이터 회귀분석 II. 와인 품질 예측
22 | 15. 데이터 시각화 II: 단어 구름을 사용한 텍스트 데이터의 시각화
23 | 16. 실리콘 밸리에서 데이터 과학자 되기
24 | 
25 | ## 사용 전에 깔아야 할 소프트웨어
26 | 
27 | - R <https://www.r-project.org/>
28 | - R스튜디오 (RStudio) <https://www.rstudio.com/>
29 | 
30 | 
31 | ## 코드 사용법: R
32 | 
33 | - 컴퓨터에 git clone 클론해서 사용하세요.
34 | - 각 서브디렉토리는 RStudio 프로젝트입니다. 
35 | - 각 서브디렉토리 안에 있는 *.Rproject 파일을 열면 R스튜디오 프로젝트가 열립니다. 
36 | - *.R 파일들을 열어서 책을 따라 실행해 보세요.
37 | 
38 | ## 코드 사용법: 파이썬
39 | 
40 | 주피터 노트북을 `notebooks` 폴더에서 시작하면 됩니다.
41 | 자세한 내용은 <https://dataninja.me/ipds-kr/python-setup> 을 참조.
42 | 
43 | ## Credits
44 | The gapminder data file
45 | `notebooks/data/gapminder.tsv` is copied from
46 | <https://github.com/jennybc/gapminder/blob/master/data-raw/07_gap-merged-with-continent.tsv>
47 | by Jennifer Bryan (NA). gapminder: Data from Gapminder.
48 | https://github.com/jennybc/gapminder,
49 | http://www.gapminder.org/data/,
50 | https://doi.org/10.5281/zenodo.594018.
51 | 


--------------------------------------------------------------------------------
/addendum/addendum.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 


--------------------------------------------------------------------------------
/addendum/dimension-reduction.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "<따라 하며 배우는 데이터 과학> 보충내용 I: 비지도학습과 차원축소"
 3 | author: "권재명"
 4 | date: "10/13/2017"
 5 | output:
 6 |   html_document:
 7 |     toc: true
 8 |     toc_depth: 3
 9 | ---
10 | ```{r setup, include=FALSE}
11 | knitr::opts_chunk$set(echo = TRUE, cache=TRUE)
12 | ```
13 | 이번 장에서는 비지도 학습 (unsupervised learning)과 차원축소(dimensionality reduction)을 위한 방법을 알아보자.
14 | 우선 필수패키지를 로드한다:
15 | ```{r}
16 | suppressPackageStartupMessages(library(tidyverse))
17 | ```
18 | 
19 | 예제 자료로 유명한 피셔의 `iris`자료를 사용하자.
20 | R 내장 자료에서
21 | 변수 이름을 좀 더 다루기 쉬운 
22 | 형태(소문자; 마침표 대신 밑줄)로 바꾼다.
23 | 그러고 나면 대략 다음과 같은 모양이다.
24 | ```{r}
25 | names(iris) <- gsub("\\.", "_", tolower(names(iris)))
26 | iris %>% glimpse()
27 | ```
28 | 
29 | 그리는데 시간이 좀 걸리지만 `GGally` 패키지를 이용하여
30 | 다섯 변수들간의 관계를 산점도행렬로 그려볼 수 있다:
31 | ```{r}
32 | GGally::ggpairs(iris, mapping=aes(color=species))
33 | ```
34 | 
35 | 이 중에 처음 4변수들간에는 큰 상관관계가 있다:
36 | ```{r}
37 | iris %>% select(-species) %>% cor()
38 | ```
39 | 
40 | 차원축소는 이처럼 차원이 높은 변수 $X_1, ..., X_p$ 를
41 | 낮은 차원의 변수 $X'_1, ..., X'_k$ ($k<p$) 로 줄이는 작업이다.
42 | 
43 | 차원축소를 위한 고전적인 방법중 하나는
44 | MDS (Multidimensional Scaling)이다.
45 | $n$개의 관측치간의
46 | 거리 (distance) 혹은 비유사성(dissimilarity)이
47 | 행렬 $\{d_{ij}, i,j=1, ...,n\}$ 
48 | (보통 원래 $p$ 차원에서의 유클리드 거리)
49 | 로 주어진다고 하자.
50 | MDS 는 $k (<p)$ 차원 안에서
51 | $n$ 관측치의 좌표간의 유클리드 거리 
52 | $\{d'_{ij}, i,j=1,...,n\}$ 가
53 | 원래 $d_{ij}$ 와 가능한한 가깝도록
54 | $X_{ij}, i=1,...,n, j=1,...,k$ 를 생성한다.
55 | 시각화를 위해 보통 $k=2$ 가 많이 사용된다.
56 | 
57 | R에서는 다음처럼 실행한다:
58 | ```{r}
59 | orig_coords <- iris %>% select(-species) 
60 | dim(orig_coords)
61 | dist_mat <- orig_coords %>% dist(method="euclidean")
62 | dim(as.matrix(dist_mat))
63 | mds_coords <- cmdscale(dist_mat, k=2)
64 | dim(mds_coords)
65 | ```
66 | 
67 | 4차원 공간의 150개의 관측치에서 150*150 거리행렬을 생성한 후
68 | 이를 사용해 2차원 공간으로 차원을 축소했다.
69 | 축소된 2차원에서 각 종들의 분포는 다음과 같다.
70 | ```{r}
71 | df <- tibble(x1=mds_coords[,1], x2=mds_coords[,2],
72 |              species=iris$species)
73 | df %>% ggplot(aes(x1, x2, col=species)) + geom_point()
74 | ```
75 | 
76 | 


--------------------------------------------------------------------------------
/addendum/rsconnect/documents/dimension-reduction.Rmd/rpubs.com/rpubs/Document.dcf:
--------------------------------------------------------------------------------
 1 | name: Document
 2 | title:
 3 | username:
 4 | account: rpubs
 5 | server: rpubs.com
 6 | hostUrl: rpubs.com
 7 | appId: https://api.rpubs.com/api/v1/document/318229/dcfa2ef3f268496da3ba5f5fb06e1bf8
 8 | bundleId: https://api.rpubs.com/api/v1/document/318229/dcfa2ef3f268496da3ba5f5fb06e1bf8
 9 | url: http://rpubs.com/publish/claim/318229/10e0ac576ae7454dbc98563704813ee6
10 | when: 1507945656.17115
11 | 


--------------------------------------------------------------------------------
/addendum/rsconnect/documents/dimension-reduction.Rmd/rpubs.com/rpubs/Publish Document.dcf:
--------------------------------------------------------------------------------
 1 | name: Publish Document
 2 | title:
 3 | username:
 4 | account: rpubs
 5 | server: rpubs.com
 6 | hostUrl: rpubs.com
 7 | appId: https://api.rpubs.com/api/v1/document/318230/0b8a3440f5aa4d2fa50bda4b129fd9a3
 8 | bundleId: https://api.rpubs.com/api/v1/document/318230/0b8a3440f5aa4d2fa50bda4b129fd9a3
 9 | url: http://rpubs.com/publish/claim/318230/ad037587d9e545b6b0e21fd6ccb0a62e
10 | when: 1507945830.04961
11 | 


--------------------------------------------------------------------------------
/ch03-data-processing/ch03-data-processing.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | # datasets 패키지에서 제공하는 다양한 자료들과 도움말
  4 | help(package='datasets')
  5 | # ggplot2 패키지에서 제공되는 데이터
  6 | data(package='ggplot2')
  7 | # 현재 실행환경에서 로드되어서 사용가능한 모든 데이터를 살펴보려면 옵션 없이
  8 | data()
  9 | 
 10 | 
 11 | # 자료 다운로드:
 12 | # curl https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data > housing.data
 13 | # curl https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names > housing.names
 14 | 
 15 | boston <- read.table("housing.data")
 16 | library(dplyr)
 17 | glimpse(boston)
 18 | 
 19 | names(boston) <- c('crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'black', 'lstat', 'medv')
 20 | glimpse(boston)
 21 | 
 22 | plot(boston)
 23 | summary(boston)
 24 | 
 25 | # 큰 자료 읽어 들이기
 26 | library(data.table)
 27 | DT <- fread("very_big.csv")
 28 | DT <- fread("very_big.csv", data.table=FALSE)
 29 | 
 30 | 
 31 | # R에서 SQL연습
 32 | # install.packages("sqldf")
 33 | library(sqldf)
 34 | sqldf("select * from iris")
 35 | sqldf("select count(*) from iris")
 36 | sqldf("select Species, count(*), avg(`Sepal.Length`)
 37 |       from iris
 38 |       group by `Species`")
 39 | sqldf("select Species, `Sepal.Length`, `Sepal.Width`
 40 |       from iris
 41 |       where `Sepal.Length` < 4.5
 42 |       order by `Sepal.Width`")
 43 | 
 44 | 
 45 | 
 46 | library(dplyr)
 47 | (df1 <- data_frame(x = c(1, 2), y = 2:1))
 48 | (df2 <- data_frame(x = c(1, 3), a = 10, b = "a"))
 49 | sqldf("select *
 50 |     from df1 inner join df2
 51 |       on df1.x = df2.x")
 52 | sqldf("select *
 53 |     from df1 left join df2
 54 |       on df1.x = df2.x")
 55 | 
 56 | 
 57 | # install.packages("foreign")
 58 | library(foreign)
 59 | x <- read.dbf(system.file("files/sids.dbf", package="foreign")[1])
 60 | dplyr::glimpse(x)
 61 | summary(x)
 62 | 
 63 | 
 64 | #-------------------------------
 65 | # 기본적인 gapminder 자료 처리
 66 | 
 67 | # 자료를 로드한다
 68 | library(gapminder)
 69 | 
 70 | # 행과 열 선택
 71 | gapminder[gapminder$country=='Korea, Rep.', c('pop', 'gdpPercap')]
 72 | 
 73 | # 행 선택
 74 | gapminder[gapminder$country=='Korea, Rep.', ]
 75 | gapminder[gapminder$year==2007, ]
 76 | gapminder[gapminder$country=='Korea, Rep.' & gapminder$year==2007, ]
 77 | gapminder[1:10,]
 78 | head(gapminder, 10)
 79 | 
 80 | # 정렬
 81 | gapminder[order(gapminder$year, gapminder$country),]
 82 | 
 83 | # 변수 선택:
 84 | gapminder[, c('pop', 'gdpPercap')]
 85 | gapminder[, 1:3]
 86 | 
 87 | # 변수 이름 바꾸기: gdpPercap 를 gdp_per_cap 으로 변경
 88 | f2 = gapminder
 89 | names(f2)
 90 | names(f2)[6] = 'gdp_per_cap'
 91 | 
 92 | # 변수변환과 변수 생성
 93 | f2 = gapminder
 94 | f2$total_gdp = f2$pop * f2$gdpPercap
 95 | 
 96 | # 요약통계량 계산
 97 | median(gapminder$gdpPercap)
 98 | apply(gapminder[,4:6], 2, mean)
 99 | summary(gapminder)
100 | 
101 | 
102 | #----------------------------
103 | library(dplyr)
104 | 
105 | # tbl_df() 와 glimpse()
106 | i2 <- tbl_df(iris)
107 | class(i2)
108 | i2
109 | glimpse(i2)
110 | 
111 | iris %>% head
112 | iris %>% head(10)
113 | 
114 | 
115 | 
116 | 
117 | filter(gapminder, country=='Korea, Rep.')
118 | filter(gapminder, year==2007)
119 | filter(gapminder, country=='Korea, Rep.' & year==2007)
120 | 
121 | gapminder %>% filter(country=='Korea, Rep.')
122 | gapminder %>% filter(year==2007)
123 | gapminder %>% filter(country=='Korea, Rep.' & year==2007)
124 | 
125 | 
126 | arrange(gapminder, year, country)
127 | gapminder %>% arrange(year, country)
128 | 
129 | 
130 | 
131 | select(gapminder, pop, gdpPercap)
132 | gapminder %>% select(pop, gdpPercap)
133 | 
134 | 
135 | 
136 | gapminder %>%
137 |   mutate(total_gdp = pop * gdpPercap,
138 |          le_gdp_ratio = lifeExp / gdpPercap,
139 |          lgrk = le_gdp_ratio * 100)
140 | 
141 | 
142 | gapminder %>%
143 |   summarize(n_obs = n(),
144 |             n_countries = n_distinct(country),
145 |             n_years = n_distinct(year),
146 |             med_gdpc = median(gdpPercap),
147 |             max_gdppc = max(gdpPercap))
148 | 
149 | 
150 | sample_n(gapminder, 10)
151 | sample_frac(gapminder, 0.01)
152 | 
153 | 
154 | distinct(select(gapminder, country))
155 | distinct(select(gapminder, year))
156 | 
157 | 
158 | gapminder %>% select(country) %>% distinct()
159 | gapminder %>% select(year) %>% distinct()
160 | 
161 | 
162 | gapminder %>%
163 |   filter(year == 2007) %>%
164 |   group_by(continent) %>%
165 |   summarize(median(lifeExp))
166 | 
167 | 
168 | 
169 | # 함수형 프로그래밍의 장점 예시
170 | d1 = filter(gapminder, year == 2007)
171 | d2 = group_by(d1, continent)
172 | d3 = summarize(d2, lifeExp = median(lifeExp))
173 | arrange(d3, -lifeExp)
174 | 
175 | arrange(
176 |   summarize(
177 |     group_by(
178 |       filter(gapminder, year==2007), continent
179 |     ), lifeExp=median(lifeExp)
180 |   ), -lifeExp
181 | )
182 | 
183 | 
184 | gapminder %>%
185 |   filter(year == 2007) %>%
186 |   group_by(continent) %>%
187 |   summarize(lifeExp = median(lifeExp)) %>%
188 |   arrange(-lifeExp)
189 | 
190 | 
191 | # 조인 연산자;  inner, left, right, full(outer) join
192 | (df1 <- data_frame(x = c(1, 2), y = 2:1))
193 | (df2 <- data_frame(x = c(1, 3), a = 10, b = "a"))
194 | df1 %>% inner_join(df2)
195 | df1 %>% left_join(df2)
196 | df1 %>% right_join(df2)
197 | df1 %>% full_join(df2)
198 | 
199 | 


--------------------------------------------------------------------------------
/ch03-data-processing/ch03-data-processing.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 


--------------------------------------------------------------------------------
/ch03-data-processing/ch03-data-processing.sh:
--------------------------------------------------------------------------------
 1 | 
 2 | # 자료를 다운받는다
 3 | curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data > adult.data
 4 | 
 5 | # 첫 열줄을 보여준다.
 6 | head adult.data 
 7 | 
 8 | # 마지막 10줄을 보여준다.
 9 | tail adult.data
10 | 
11 | # 첫 5줄을 다른 파일에 저장한다.
12 | head -5 adult.data > adult.data.small
13 | cat adult.data.small
14 | 
15 | # 콤마 열 분리문자를 탭으로 바꾼후 다른 파일에 저장한다.
16 | tr "," "\t" < adult.data.small > adult.data.small.tab
17 | cat adult.data.small.tab
18 | 
19 | # 자료 길이가 몇줄인지 보여준다. (32562)
20 | wc -l adult.data
21 | 
22 | # 간단히 줄이기
23 | head -5 adult.data | tr "," "\t" > adult.data.small.tab
24 | 
25 | # 직업군(work class)의 도수분포 
26 | $ cut -d ',' -f 2 < adult.data | sort | uniq -c | sort -nr
27 | 


--------------------------------------------------------------------------------
/ch03-data-processing/ch03-data-processing.sql:
--------------------------------------------------------------------------------
 1 | -- SQL 연습 문제 해답
 2 | 
 3 | select EmployeeID, count(*) n
 4 | from Orders
 5 | group by EmployeeID
 6 | order by n desc;
 7 | 
 8 | 
 9 | select a.EmployeeID, 
10 |     b.FirstName, 
11 |     b.LastName, 
12 |     count(*) n
13 | from Orders a
14 |     inner join Employees b
15 |         on a.EmployeeID = b.EmployeeID
16 | group by a.EmployeeID
17 | order by n desc;
18 | 
19 | 
20 | select a.OrderID, 
21 |     a.OrderDate, 
22 |     sum(Quantity) as n_items, 
23 |     sum(Quantity*c.Price) as total_price
24 | from Orders a
25 |     inner join OrderDetails b
26 |         on a.OrderID = b.OrderID
27 |     inner join Products c
28 |         on b.ProductID = c.ProductID
29 | group by a.OrderID;
30 | 
31 | 
32 | 


--------------------------------------------------------------------------------
/ch04-data-visualization/ch04-data-visualization.R:
--------------------------------------------------------------------------------
  1 | library(tidyverse)
  2 | library(gridExtra)
  3 | library(gapminder)
  4 | 
  5 | # install.packages("gapminder")
  6 | help(package = "gapminder")
  7 | library(gapminder)
  8 | ?gapminder
  9 | gapminder
 10 | 
 11 | head(gapminder)
 12 | 
 13 | tail(gapminder)
 14 | 
 15 | library(dplyr)
 16 | glimpse(gapminder)
 17 | 
 18 | 
 19 | gapminder$lifeExp
 20 | gapminder$gdpPercap
 21 | gapminder[, c('lifeExp', 'gdpPercap')]
 22 | gapminder %>% select(gdpPercap, lifeExp)
 23 | 
 24 | # 요약통계량과 상관관계
 25 | summary(gapminder$lifeExp)
 26 | summary(gapminder$gdpPercap)
 27 | cor(gapminder$lifeExp, gapminder$gdpPercap)
 28 | 
 29 | 
 30 | # 베이스 패키지 시각화
 31 | #@ 4.1
 32 | png("../plots/4-1.png", 5.5, 4, units='in', pointsize=9, res=600)
 33 | opar = par(mfrow=c(2,2))
 34 | hist(gapminder$lifeExp)
 35 | hist(gapminder$gdpPercap, nclass=50)
 36 | # hist(sqrt(gapminder$gdpPercap), nclass=50)
 37 | hist(log10(gapminder$gdpPercap), nclass=50)
 38 | plot(log10(gapminder$gdpPercap), gapminder$lifeExp, cex=.5)
 39 | par(opar)
 40 | dev.off()
 41 | 
 42 | 
 43 | cor(gapminder$lifeExp, log10(gapminder$gdpPercap))
 44 | 
 45 | # 앤스콤의 사인방(Anscombe's quartet)
 46 | # https://en.wikipedia.org/wiki/Anscombe%27s_quartet
 47 | # https://commons.wikimedia.org/wiki/File:Anscombe%27s_quartet_3.svg
 48 | svg("Anscombe's quartet 3.svg", width=11, height=8)
 49 | op <- par(las=1, mfrow=c(2,2), mar=1.5+c(4,4,1,1), oma=c(0,0,0,0),
 50 |           lab=c(6,6,7), cex.lab=2.0, cex.axis=1.3, mgp=c(3,1,0))
 51 | ff <- y ~ x
 52 | for(i in 1:4) {
 53 |   ff[[2]] <- as.name(paste("y", i, sep=""))
 54 |   ff[[3]] <- as.name(paste("x", i, sep=""))
 55 |   lmi <- lm(ff, data= anscombe)
 56 |   xl <- substitute(expression(x[i]), list(i=i))
 57 |   yl <- substitute(expression(y[i]), list(i=i))
 58 |   plot(ff, data=anscombe, col="red", pch=21, cex=2.4, bg = "orange",
 59 |        xlim=c(3,19), ylim=c(3,13)
 60 |        , xlab=eval(xl), ylab=yl  # for version 3
 61 |   )
 62 |   abline(lmi, col="blue")
 63 | }
 64 | par(op)
 65 | dev.off()
 66 | 
 67 | # gapminder 예제의 시각화를 ggplot2로 해보자
 68 | library(ggplot2)
 69 | library(dplyr)
 70 | gapminder %>% ggplot(aes(x=lifeExp)) + geom_histogram()
 71 | gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram()
 72 | gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram() +
 73 |   scale_x_log10()
 74 | gapminder %>% ggplot(aes(x=gdpPercap, y=lifeExp)) + geom_point() +
 75 |   scale_x_log10() + geom_smooth()
 76 | 
 77 | library(gridExtra)
 78 | p1 <- gapminder %>% ggplot(aes(x=lifeExp)) + geom_histogram()
 79 | p2 <- gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram()
 80 | p3 <- gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram() +
 81 |   scale_x_log10()
 82 | p4 <- gapminder %>% ggplot(aes(x=gdpPercap, y=lifeExp)) + geom_point() +
 83 |   scale_x_log10() + geom_smooth()
 84 | g <- arrangeGrob(p1, p2, p3, p4, ncol=2)
 85 | ggsave("../plots/4-3.png", g, width=5.5, height=4, units='in', dpi=600)
 86 | 
 87 | 
 88 | 
 89 | 
 90 | 
 91 | library(ggplot2)
 92 | ?ggplot
 93 | example(ggplot)
 94 | 
 95 | df <- data.frame(gp = factor(rep(letters[1:3], each = 10)),
 96 |                  y = rnorm(30))
 97 | glimpse(df)
 98 | 
 99 | ds <- df %>% group_by(gp) %>% summarize(mean = mean(y), sd = sd(y))
100 | ds
101 | 
102 | 
103 | ggplot(df, aes(x = gp, y = y)) +
104 |    geom_point() +
105 |    geom_point(data = ds, aes(y = mean),
106 |               colour = 'red', size = 3)
107 | 
108 | 
109 | ggplot(df) +
110 |    geom_point(aes(x = gp, y = y)) +
111 |    geom_point(data = ds, aes(x = gp, y = mean),
112 |                  colour = 'red', size = 3)
113 | 
114 | 
115 | ggplot() +
116 |   geom_point(data = df, aes(x = gp, y = y)) +
117 |   geom_point(data = ds, aes(x = gp, y = mean),
118 |                         colour = 'red', size = 3) +
119 |   geom_errorbar(data = ds, aes(x = gp,
120 |                     ymin = mean - sd, ymax = mean + sd),
121 |                     colour = 'red', width = 0.4)
122 | 
123 | 
124 | ggplot(gapminder, aes(lifeExp)) + geom_histogram()
125 | gapminder %>% ggplot(aes(lifeExp)) + geom_histogram()
126 | 
127 | 
128 | ?diamonds
129 | ?mpg
130 | glimpse(diamonds)
131 | glimpse(mpg)
132 | 
133 | # 1. 한 수량형 변수
134 | 
135 | library(gapminder)
136 | library(ggplot2)
137 | library(dplyr)
138 | gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram()
139 | gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram() +
140 |   scale_x_log10()
141 | gapminder %>% ggplot(aes(x=gdpPercap)) + geom_freqpoly() +
142 |   scale_x_log10()
143 | gapminder %>% ggplot(aes(x=gdpPercap)) + geom_density() +
144 |   scale_x_log10()
145 | 
146 | 
147 | #@ 4.4
148 | p1 <- gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram()
149 | p2 <- gapminder %>% ggplot(aes(x=gdpPercap)) + geom_histogram() +
150 |   scale_x_log10()
151 | p3 <- gapminder %>% ggplot(aes(x=gdpPercap)) + geom_freqpoly() +
152 |   scale_x_log10()
153 | p4 <- gapminder %>% ggplot(aes(x=gdpPercap)) + geom_density() +
154 |   scale_x_log10()
155 | g <- arrangeGrob(p1, p2, p3, p4, ncol=2)
156 | ggsave("../plots/4-4.png", g, width=6, height=4, units='in', dpi=600)
157 | 
158 | summary(gapminder)
159 | 
160 | 
161 | # 2. 한 범주형 변수
162 | 
163 | #@ 4.5
164 | diamonds %>% ggplot(aes(cut)) + geom_bar()
165 | ggsave("../plots/4-5.png", width=5.5, height=4, units='in', dpi=600)
166 | 
167 | table(diamonds$cut)
168 | 
169 | prop.table(table(diamonds$cut))
170 | 
171 | round(prop.table(table(diamonds$cut))*100, 1)
172 | 
173 | diamonds %>%
174 |   group_by(cut) %>%
175 |   tally() %>%
176 |   mutate(pct = round(n / sum(n) * 100, 1))
177 | 
178 | 
179 | # 3. 두 수량형 변수
180 | 
181 | diamonds %>% ggplot(aes(carat, price)) + geom_point()
182 | diamonds %>% ggplot(aes(carat, price)) + geom_point(alpha=.01)
183 | mpg %>% ggplot(aes(cyl, hwy)) + geom_point()
184 | mpg %>% ggplot(aes(cyl, hwy)) + geom_jitter()
185 | 
186 | 
187 | set.seed(1704)
188 | p1 <- diamonds %>% ggplot(aes(carat, price)) + geom_point()
189 | p2 <- diamonds %>% ggplot(aes(carat, price)) + geom_point(alpha=.01)
190 | p3 <- mpg %>% ggplot(aes(cyl, hwy)) + geom_point()
191 | p4 <- mpg %>% ggplot(aes(cyl, hwy)) + geom_jitter()
192 | ggsave("../plots/4-6.png", arrangeGrob(p1, p2, p3, p4, ncol=2),
193 |        width=5.5, height=4, units='in', dpi=600)
194 | 
195 | 
196 | pairs(diamonds %>% sample_n(1000))
197 | 
198 | png("../plots/4-7.png", 5.5*1.2, 4*1.2, units='in', pointsize=9, res=400)
199 | set.seed(1704)
200 | pairs(diamonds %>% sample_n(1000))
201 | dev.off()
202 | 
203 | # 4. 수량형 변수와 범주형 변수
204 | 
205 | mpg %>% ggplot(aes(class, hwy)) + geom_boxplot()
206 | ggsave("../plots/4-8.png", width=5.5, height=4, units='in', dpi=600)
207 | 
208 | 
209 | mpg %>% ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
210 |   geom_boxplot(alpha=.5)
211 | 
212 | mpg %>% mutate(class=reorder(class, hwy, median)) %>%
213 |   ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
214 |   geom_boxplot(alpha=.5)
215 | 
216 | mpg %>%
217 |   mutate(class=factor(class, levels=
218 |                     c("2seater", "subcompact", "compact", "midsize",
219 |                           "minivan", "suv", "pickup"))) %>%
220 |   ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
221 |   geom_boxplot(alpha=.5)
222 | 
223 | mpg %>%
224 |   mutate(class=factor(class, levels=
225 |                     c("2seater", "subcompact", "compact", "midsize",
226 |                           "minivan", "suv", "pickup"))) %>%
227 |   ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
228 |   geom_boxplot(alpha=.5) + coord_flip()
229 | 
230 | 
231 | set.seed(1704)
232 | p1 <- mpg %>% ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
233 |   geom_boxplot(alpha=.5)
234 | p2 <- mpg %>% mutate(class=reorder(class, hwy, median)) %>%
235 |   ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
236 |   geom_boxplot(alpha=.5)
237 | p3 <- mpg %>%
238 |   mutate(class=factor(class, levels=
239 |                         c("2seater", "subcompact", "compact", "midsize",
240 |                           "minivan", "suv", "pickup"))) %>%
241 |   ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
242 |   geom_boxplot(alpha=.5)
243 | p4 <- mpg %>%
244 |   mutate(class=factor(class, levels=
245 |                         c("2seater", "subcompact", "compact", "midsize",
246 |                           "minivan", "suv", "pickup"))) %>%
247 |   ggplot(aes(class, hwy)) + geom_jitter(col='gray') +
248 |   geom_boxplot(alpha=.5) + coord_flip()
249 | ggsave("../plots/4-9.png", arrangeGrob(p1, p2, p3, p4, ncol=2),
250 |        width=5.5*2, height=4*1.5, units='in', dpi=400)
251 | 
252 | 
253 | 
254 | # 5. 두 범주형 변수
255 | 
256 | glimpse(data.frame(Titanic))
257 | 
258 | xtabs(Freq ~ Class + Sex + Age + Survived, data.frame(Titanic))
259 | 
260 | 
261 | ?Titanic
262 | Titanic
263 | 
264 | 
265 | mosaicplot(Titanic, main = "Survival on the Titanic")
266 | 
267 | mosaicplot(Titanic, main = "Survival on the Titanic", color=TRUE)
268 | 
269 | png("../plots/4-10.png", 5.5, 4, units='in', pointsize=9, res=600)
270 | mosaicplot(Titanic, main = "Survival on the Titanic", color=TRUE)
271 | dev.off()
272 | 
273 | # 아이들 사이에 생존률이 더 높을까?
274 | apply(Titanic, c(3, 4), sum)
275 | 
276 | round(prop.table(apply(Titanic, c(3, 4), sum), margin = 1),3)
277 | 
278 | # 남-녀 생존률의 비교
279 | apply(Titanic, c(2, 4), sum)
280 | 
281 | round(prop.table(apply(Titanic, c(2, 4), sum), margin = 1),3)
282 | 
283 | 
284 | t2 = data.frame(Titanic)
285 | 
286 | t2 %>% group_by(Sex) %>%
287 |     summarize(n = sum(Freq),
288 |               survivors=sum(ifelse(Survived=="Yes", Freq, 0))) %>%
289 |     mutate(rate_survival=survivors/n)
290 | 
291 | 
292 | # 6. 더 많은 변수를 보여주는 기술 (1): 각 geom 의 다른 속성들을 사용한다.
293 | 
294 | gapminder %>% filter(year==2007) %>%
295 |   ggplot(aes(gdpPercap, lifeExp)) +
296 |   geom_point() + scale_x_log10() +
297 |   ggtitle("Gapminder data for 2007")
298 | 
299 | 
300 | gapminder %>% filter(year==2002) %>%
301 |   ggplot(aes(gdpPercap, lifeExp)) +
302 |   geom_point(aes(size=pop, col=continent)) + scale_x_log10() +
303 |   ggtitle("Gapminder data for 2007")
304 | 
305 | p1 <- gapminder %>% filter(year==2007) %>%
306 |   ggplot(aes(gdpPercap, lifeExp)) +
307 |   geom_point() + scale_x_log10() +
308 |   ggtitle("Gapminder data for 2007")
309 | p2 <- gapminder %>% filter(year==2002) %>%
310 |   ggplot(aes(gdpPercap, lifeExp)) +
311 |   geom_point(aes(size=pop, col=continent)) + scale_x_log10() +
312 |   ggtitle("Gapminder data for 2007")
313 | ggsave("../plots/4-11.png", arrangeGrob(p1, p2, ncol=2),
314 |        width=5.5*1.7, height=4, units='in', dpi=600)
315 | 
316 | # 7. 더 많은 변수를 보여주는 기술 (2). facet_* 함수를 사용한다.
317 | 
318 | gapminder %>%
319 |   ggplot(aes(year, lifeExp, group=country)) +
320 |   geom_line()
321 | 
322 | 
323 | gapminder %>%
324 |   ggplot(aes(year, lifeExp, group=country, col=continent)) +
325 |   geom_line()
326 | 
327 | 
328 | gapminder %>%
329 |   ggplot(aes(year, lifeExp, group=country)) +
330 |   geom_line() +
331 |   facet_wrap(~ continent)
332 | 
333 | p1 <- gapminder %>%
334 |   ggplot(aes(year, lifeExp, group=country)) +
335 |   geom_line()
336 | p2 <- gapminder %>%
337 |   ggplot(aes(year, lifeExp, group=country, col=continent)) +
338 |   geom_line()
339 | p3 <- gapminder %>%
340 |   ggplot(aes(year, lifeExp, group=country)) +
341 |   geom_line() +
342 |   facet_wrap(~ continent)
343 | ggsave("../plots/4-12.png", arrangeGrob(p1, p2, p3, ncol=2),
344 |        width=5.5*2, height=4*2, units='in', dpi=150)
345 | 
346 | 
347 | 


--------------------------------------------------------------------------------
/ch04-data-visualization/ch04-data-visualization.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 


--------------------------------------------------------------------------------
/ch05-coding-style.R:
--------------------------------------------------------------------------------
 1 | # 지저분한 코드 예
 2 | sc<-function(x,y,verbose=TRUE) {
 3 | n<-length(x)
 4 | if(n<=1||n!=length(y)) stop("Arguments x and y have different lengths: ",length(x)," and ",length(y),".")
 5 | if(TRUE%in%is.na(x)||TRUE%in%is.na(y)) stop(" Arguments x and y must not have missing values.")
 6 | cv<-var(x,y)
 7 | if(verbose) cat("Covariance = ",round(cv,4),".\n",sep= "")
 8 | return(cv)
 9 | }
10 | 
11 | 
12 | 
13 | # 깨끗한 코드 예
14 | CalculateSampleCovariance <- function(x, y, verbose = TRUE) {
15 |   # Computes the sample covariance between two vectors.
16 |   #
17 |   # Args:
18 |   #   x: One of two vectors whose sample covariance is to be calculated.
19 |   #   y: The other vector. x and y must have the same length, greater than one,
20 |   #      with no missing values.
21 |   #   verbose: If TRUE, prints sample covariance; if not, not. Default is TRUE.
22 |   #
23 |   # Returns:
24 |   #   The sample covariance between x and y.
25 |   n <- length(x)
26 |   # Error handling
27 |   if (n <= 1 || n != length(y)) {
28 |     stop("Arguments x and y have different lengths: ",
29 |          length(x), " and ", length(y), ".")
30 |   }
31 |   if (TRUE %in% is.na(x) || TRUE %in% is.na(y)) {
32 |     stop(" Arguments x and y must not have missing values.")
33 |   }
34 |   covariance <- var(x, y)
35 |   if (verbose)
36 |     cat("Covariance = ", round(covariance, 4), ".\n", sep = "")
37 |   return(covariance)
38 | }
39 | 


--------------------------------------------------------------------------------
/ch06-statistics-concepts/ch06-statistics-concepts.R:
--------------------------------------------------------------------------------
  1 | library(tidyverse)
  2 | library(gridExtra)
  3 | 
  4 | 
  5 | # 약제 1을 복용했을 때 수면시간의 증가 (단위는 시간이다)
  6 | 
  7 | y <- sleep$extra[sleep$group == 1]
  8 | y
  9 | 
 10 | summary(y)
 11 | sd(y)
 12 | 
 13 | par(mfrow=c(2,2))
 14 | hist(y)
 15 | boxplot(y)
 16 | qqnorm(y); qqline(y)
 17 | hist(y, prob=TRUE)
 18 | lines(density(y), lty=2)
 19 | 
 20 | 
 21 | png("../plots/6-1.png", 5.5, 4, units='in', pointsize=9, res=600)
 22 | y <- sleep$extra[sleep$group == 1]
 23 | opar <- par(mfrow=c(2,2))
 24 | hist(y)
 25 | boxplot(y)
 26 | qqnorm(y); qqline(y)
 27 | hist(y, prob=TRUE)
 28 | lines(density(y), lty=2)
 29 | par(opar)
 30 | dev.off()
 31 | 
 32 | # '일변량 t-검정(one-sample t-test)'
 33 | t.test(y)
 34 | 
 35 | t.test(y, alternative="greater")
 36 | 
 37 | 
 38 | # 개개인의 수면시간증가값 모형
 39 | #  평균이 0이고, 표준편차가 1.8(시간)인 종 모양의 분포(bell shaped distribution)
 40 | # N(0, 1.8^2)
 41 | curve(dnorm(x, 0, 1.8), -4, 4)
 42 | 
 43 | png("../plots/6-2.png", 5.5, 4, units='in', pointsize=9, res=600)
 44 | curve(dnorm(x, 0, 1.8), -4, 4)
 45 | dev.off()
 46 | 
 47 | 
 48 | # 크기가 10개인 새로운 표본
 49 | options(digits = 3)
 50 | set.seed(1606)
 51 | (y_star <- rnorm(10, 0, 1.8))
 52 | mean(y_star-0); sd(y_star)
 53 | (t_star <- mean(y_star-0) / (sd(y_star)/sqrt(length(y_star))))
 54 | 
 55 | 
 56 | (y_star <- rnorm(10, 0, 1.8))
 57 | mean(y_star-0); sd(y_star)
 58 | (t_star <- mean(y_star-0) / (sd(y_star)/sqrt(length(y_star))))
 59 | 
 60 | 
 61 | (y_star <- rnorm(10, 0, 1.8))
 62 | mean(y_star-0); sd(y_star)
 63 | (t_star <- mean(y_star-0) / (sd(y_star)/sqrt(length(y_star))))
 64 | 
 65 | 
 66 | # 10,000개의 평행우주의 표본 (각 표본은 10개의 관측치를 포함한다)
 67 | # , 그리고 각 표본의 평균값, 표본표준편차, 그리고 t-통계량 값을  계산할 수 있다:
 68 | 
 69 | set.seed(1606)
 70 | B <- 1e4
 71 | n <- 10
 72 | xbars_star <- rep(NA, B)
 73 | sds_star <- rep(NA, B)
 74 | ts_star <- rep(NA, B)
 75 | for(b in 1:B){
 76 |   y_star <- rnorm(n, 0, 1.789)
 77 |   m <- mean(y_star)
 78 |   s <- sd(y_star)
 79 |   xbars_star[b] <- m
 80 |   sds_star[b] <- s
 81 |   ts_star[b] <- m / (s/sqrt(n))
 82 | }
 83 | 
 84 | 
 85 | opar <- par(mfrow=c(2,2))
 86 | hist(xbars_star, nclass=100)
 87 | abline(v = 0.75, col='red')
 88 | hist(sds_star, nclass=100)
 89 | abline(v = 1.789, col='red')
 90 | hist(ts_star, nclass=100)
 91 | abline(v = 1.3257, col='red')
 92 | qqnorm(ts_star); qqline(ts_star)
 93 | par(opar)
 94 | 
 95 | png("../plots/6-3.png", 5.5*.8, 4, units='in', pointsize=9, res=600)
 96 | opar <- par(mfrow=c(2,2))
 97 | hist(xbars_star, nclass=100)
 98 | abline(v = 0.75, col='red')
 99 | hist(sds_star, nclass=100)
100 | abline(v = 1.789, col='red')
101 | hist(ts_star, nclass=100)
102 | abline(v = 1.3257, col='red')
103 | qqnorm(ts_star); qqline(ts_star)
104 | par(opar)
105 | dev.off()
106 | 
107 | 
108 | # 우리가 관측한 t-통계량 값 1.3257은 시뮬레이션 분포에서 어디에 있는가?
109 | 
110 | length(which(ts_star > 1.3257)) / B
111 | 
112 | # 스튜던트 t 분포
113 | # 다양한 자유도 값에 따른 t 밀도함수
114 | # https://en.wikipedia.org/wiki/Student%27s_t-distribution
115 | # Credit: 권용찬
116 | nlist=c(1,2,5)
117 | x <- seq(-5, 5, 0.05)
118 | y <- matrix(0, nr=length(x), nc=length(nlist))
119 | plot(x, type="n", xlab="x", ylab="P(x)",
120 |      xlim=c(-5,5), ylim=c(0, 0.45))
121 | for( i in 1:length(nlist)){
122 |   y[,i] <- dt(x, df=nlist[i])
123 |   lines(x, y[,i], col=i, lwd=2)
124 | }
125 | lines(x, dnorm(x), col=4, lwd=2)
126 | legend_text <- c(expression(paste(nu,"=1 ")),
127 |                  expression(paste(nu,"=2 ")),
128 |                  expression(paste(nu,"=5 ")),
129 |                  expression(paste(nu,"=",infinity)))
130 | legend("topright", legend=legend_text, lty=1, lwd=2, col=c(1:3,4),
131 |        inset=.05)
132 | 
133 | 
134 | png("../plots/6-4.png", 5.5, 4, units='in', pointsize=9, res=600)
135 | nlist=c(1,2,5)
136 | x <- seq(-5, 5, 0.05)
137 | y <- matrix(0, nr=length(x), nc=length(nlist))
138 | plot(x, type="n", xlab="x", ylab="P(x)",
139 |      xlim=c(-5,5), ylim=c(0, 0.45))
140 | for( i in 1:length(nlist)){
141 |   y[,i] <- dt(x, df=nlist[i])
142 |   lines(x, y[,i], col=i, lwd=2)
143 | }
144 | lines(x, dnorm(x), col=4, lwd=2)
145 | legend_text <- c(expression(paste(nu,"=1 ")),
146 |                  expression(paste(nu,"=2 ")),
147 |                  expression(paste(nu,"=5 ")),
148 |                  expression(paste(nu,"=",infinity)))
149 | legend("topright", legend=legend_text, lty=1, lwd=2, col=c(1:3,4), inset=.05)
150 | dev.off()
151 | 
152 | 
153 | 
154 | 
155 | # 8. 신뢰구간의 의미
156 | set.seed(1606)
157 | (y_star <- rnorm(10, 1, 1.8))
158 | t.test(y_star)$conf.int
159 | (y_star <- rnorm(10, 1, 1.8))
160 | t.test(y_star)$conf.int
161 | (y_star <- rnorm(10, 1, 1.8))
162 | t.test(y_star)$conf.int
163 | 
164 | 
165 | library(tidyverse)
166 | set.seed(1606)
167 | B = 1e2
168 | conf_intervals <-
169 |   data.frame(b=rep(NA, B),
170 |              lower=rep(NA, B),
171 |              xbar=rep(NA, B),
172 |              upper=rep(NA, B))
173 | true_mu <- 1.0
174 | for(b in 1:B){
175 |   (y_star <- rnorm(10, true_mu, 1.8))
176 |   conf_intervals[b, ] = c(b=b,
177 |                           lower=t.test(y_star)$conf.int[1],
178 |                           xbar=mean(y_star),
179 |                           upper=t.test(y_star)$conf.int[2])
180 | }
181 | conf_intervals <- conf_intervals %>%
182 |   mutate(lucky = (lower <= true_mu & true_mu <= upper))
183 | 
184 | glimpse(conf_intervals)
185 | table(conf_intervals$lucky)
186 | conf_intervals %>% ggplot(aes(b, xbar, col=lucky)) +
187 |   geom_point() +
188 |   geom_errorbar(aes(ymin=lower, ymax=upper)) +
189 |   geom_hline(yintercept=true_mu, col='red')
190 | ggsave("../plots/6-6.png", width=5.5, height=4, units='in', dpi=600)
191 | 
192 | 
193 | 
194 | # 6.10.2. 중심극한정리
195 | 
196 | hist(c(0, 1), nclass=100, prob=TRUE, main='Individual sleep time increase')
197 | set.seed(1606)
198 | B <- 1e4
199 | n <- 10
200 | xbars_star= rep(NA, B)
201 | for(b in 1:B){
202 |   xbars_star[b] <- mean(sample(c(0,1), size=n, replace=TRUE))
203 | }
204 | hist(xbars_star, nclass=100, main='Sample mean of 10 obs')
205 | 
206 | png("../plots/6-8.png", 5.5, 4*.8, units='in', pointsize=9, res=600)
207 | opar = par(mfrow=c(1,2))
208 | hist(c(0, 1), nclass=100, prob=TRUE, main='Individual sleep time increase')
209 | hist(xbars_star, nclass=100, main='Sample mean of 10 obs')
210 | par(opar)
211 | dev.off()
212 | 
213 | # 6.11. 모수추정의 정확도는 sqrt(n)에 비례한다.
214 | diff(t.test(y)$conf.int)
215 | mean(y)
216 | diff(t.test(y)$conf.int)/2
217 | 
218 | # 자료의 incremental 가치
219 | png("../plots/6-9.png", 5.5, 4*.8, units='in', pointsize=9, res=600)
220 | opar = par(mfrow=c(1,2))
221 | curve(1/sqrt(x), 1, 1000, log='x', main='s.e. vs sample size')
222 | curve((1/sqrt(x) - 1/sqrt(x+10)) / (1/sqrt(x)), 1, 1000, log='x',
223 |       main='% decrease in s.e. \nwhen adding 10 obs')
224 | par(opar)
225 | dev.off()
226 | 


--------------------------------------------------------------------------------
/ch06-statistics-concepts/ch06-statistics-concepts.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 


--------------------------------------------------------------------------------
/ch06-statistics-concepts/ch06-statistics-concepts.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "#!/usr/bin/env python\n",
 10 |     "import pandas as pd\n",
 11 |     "import numpy as np\n",
 12 |     "import matplotlib.pyplot as plt\n",
 13 |     "import matplotlib.mlab as mlab\n",
 14 |     "import scipy.stats as stats\n",
 15 |     "import seaborn as sns\n",
 16 |     "\n",
 17 |     "%matplotlib inline"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": null,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "# sleep 데이터 [Scheffe (1959)]\n",
 27 |     "data = {\"no.\": range(1, 21),\n",
 28 |     "        \"extra\": [0.7, -1.6, -0.2, -1.2, -0.1,\n",
 29 |     "                  3.4,  3.7,  0.8,  0.0,  2.0,\n",
 30 |     "                  1.9,  0.8,  1.1,  0.1, -0.1,\n",
 31 |     "                  4.4,  5.5,  1.6,  4.6,  3.4],\n",
 32 |     "        \"group\": [(x // 10) + 1 for x in range(20)],\n",
 33 |     "        \"ID\": list(range(1, 11)) * 2}\n",
 34 |     "       \n",
 35 |     "sleep = pd.DataFrame(data=data,\n",
 36 |     "                     columns=[\"no.\", \"extra\", \"group\", \"ID\"])\n",
 37 |     "sleep.set_index(\"no.\")\n",
 38 |     "sleep"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": null,
 44 |    "metadata": {},
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "# 약제 1을 복용했을 때 수면시간의 증가 (단위는 시간이다)\n",
 48 |     "y = sleep.loc[sleep[\"group\"]==1][\"extra\"]\n",
 49 |     "print(y)\n",
 50 |     "\n",
 51 |     "y.describe()"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "# histogram parameters\n",
 61 |     "# note that the bins are inclusive of their lower bounds,\n",
 62 |     "# and exclusive of their upper bounds\n",
 63 |     "bin_width = 1\n",
 64 |     "y_min = np.rint(y.min())\n",
 65 |     "y_max = np.rint(y.max()) + 1\n",
 66 |     "\n",
 67 |     "# set space between subplots\n",
 68 |     "fig = plt.figure(figsize=(10, 10))\n",
 69 |     "fig.subplots_adjust(hspace=.5, wspace=.5)\n",
 70 |     "\n",
 71 |     "# histogram\n",
 72 |     "ax1 = fig.add_subplot(221)\n",
 73 |     "ax1.set_title(\"Histogram of y\")\n",
 74 |     "ax1.set_xlabel(\"y\")\n",
 75 |     "ax1.set_ylabel(\"Frequency\")\n",
 76 |     "sns.distplot(y, ax=ax1, bins=np.arange(y_min, y_max, bin_width),\n",
 77 |     "             kde=False)\n",
 78 |     "ax1.set_xticks(np.arange(y_min, y_max))\n",
 79 |     "ax1.set_xlim(y_min, y_max-1)\n",
 80 |     "\n",
 81 |     "# boxplot\n",
 82 |     "ax2 = fig.add_subplot(222)\n",
 83 |     "ax2.set_title(\"Boxplot of y\")\n",
 84 |     "ax2.set_xlabel(\"group\")\n",
 85 |     "ax2.set_ylabel(\"extra\")\n",
 86 |     "sns.boxplot(data=y, ax=ax2)\n",
 87 |     "ax2.set_yticks(np.arange(y_min, y_max))\n",
 88 |     "\n",
 89 |     "# normal q-q plot\n",
 90 |     "ax3 = fig.add_subplot(223)\n",
 91 |     "#plt.title(\"Normal Q-Q plot\")\n",
 92 |     "z = (y-np.mean(y)) / np.std(y)\n",
 93 |     "res = stats.probplot(z, dist=\"norm\", plot=plt)\n",
 94 |     "\n",
 95 |     "# probability density \n",
 96 |     "ax4 = fig.add_subplot(224)\n",
 97 |     "ax4.set_title(\"Histogram of y\")\n",
 98 |     "ax4.set_xlabel(\"y\")\n",
 99 |     "ax4.set_ylabel(\"Density\")\n",
100 |     "sns.distplot(y, ax=ax4, bins=np.arange(y_min, y_max),\n",
101 |     "             hist=True)\n",
102 |     "ax4.set_xticks(np.arange(y_min, y_max))\n",
103 |     "ax4.set_xlim(y_min, y_max-1)\n",
104 |     "\n",
105 |     "plt.show()"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": null,
111 |    "metadata": {},
112 |    "outputs": [],
113 |    "source": [
114 |     "# perform t-test on y\n",
115 |     "t, pvalue = stats.ttest_1samp(y, .0)\n",
116 |     "# calculate confidence interval\n",
117 |     "confidence_level = 0.95\n",
118 |     "low, high = stats.t.interval(confidence_level, len(y)-1, \n",
119 |     "                              loc=np.mean(y), scale=stats.sem(y))\n",
120 |     "\n",
121 |     "print(\"data: y\")\n",
122 |     "print(\"t = % .4f, p-value = %.4f\" % (t, pvalue))\n",
123 |     "print(\"alternative hypothesis: true mean is not equal to 0\")\n",
124 |     "print(\"%d percent confidence interval:\\n%.7f\\t%.7f\" % (confidence_level*100, low, high))"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "code",
129 |    "execution_count": null,
130 |    "metadata": {},
131 |    "outputs": [],
132 |    "source": [
133 |     "# 개개인의 수면시간증가값 모형\n",
134 |     "# 평균이 0이고, 표준편차가 1.8(시간)인 종 모양의 분포(bell shaped distribution)\n",
135 |     "# N(0, 1.8^2)\n",
136 |     "mu = 0\n",
137 |     "variance = 1.8**2\n",
138 |     "sigma = 1.8\n",
139 |     "np.random.seed(1708)\n",
140 |     "\n",
141 |     "x = np.linspace(mu - 3*variance, mu + 3*variance, 100)\n",
142 |     "\n",
143 |     "fig, ax = plt.subplots()\n",
144 |     "ax.plot(x, mlab.normpdf(x, mu, sigma))\n",
145 |     "ax.set_xlim(-4, 4)\n",
146 |     "\n",
147 |     "plt.show()"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "# 10,000개의 평행우주의 표본, (각 표본은 10개의 관측치를 포함한다)\n",
157 |     "# 그리고 각 표본의 평균값, 표본표준편차, 그리고 t-통계량 값을  계산할 수 있다\n",
158 |     "B = 10000\n",
159 |     "n = 10\n",
160 |     "\n",
161 |     "mu = 0\n",
162 |     "varaince = 1.789**2\n",
163 |     "sigma = 1.789\n",
164 |     "np.random.seed(1708)\n",
165 |     "\n",
166 |     "xbars_star = np.zeros(B, dtype=float)\n",
167 |     "sds_star = np.zeros(B, dtype=float)\n",
168 |     "ts_star = np.zeros(B, dtype=float)\n",
169 |     "for b in range(B):\n",
170 |     "    y_star = np.random.normal(mu, sigma, n)\n",
171 |     "    m = y_star.mean()\n",
172 |     "    s = y_star.std()\n",
173 |     "    xbars_star[b] = m\n",
174 |     "    sds_star[b] = s\n",
175 |     "    ts_star[b] =  (m / (s / np.sqrt(n)))"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "# set space between subplots\n",
185 |     "plt.figure(figsize=(10, 10))\n",
186 |     "plt.subplots_adjust(hspace=.5, wspace=.5)\n",
187 |     "\n",
188 |     "# histogram\n",
189 |     "plt.subplot(221)\n",
190 |     "plt.title(\"Histogram of xbars_star\")\n",
191 |     "plt.xlabel(\"xbars_star\")\n",
192 |     "plt.ylabel(\"Frequency\")\n",
193 |     "sns.distplot(xbars_star.T, kde=False)\n",
194 |     "plt.axvline(x=0.75)\n",
195 |     "\n",
196 |     "plt.subplot(222)\n",
197 |     "plt.title(\"Histogram of sds_star\")\n",
198 |     "plt.xlabel(\"sds_star\")\n",
199 |     "plt.ylabel(\"Frequency\")\n",
200 |     "sns.distplot(sds_star, kde=False)\n",
201 |     "plt.axvline(x=1.789)\n",
202 |     "\n",
203 |     "plt.subplot(223)\n",
204 |     "plt.title(\"Histogram of ts_star\")\n",
205 |     "plt.xlabel(\"ts_star\")\n",
206 |     "plt.ylabel(\"Frequency\")\n",
207 |     "sns.distplot(ts_star, kde=False)\n",
208 |     "plt.axvline(x=1.3257)\n",
209 |     "\n",
210 |     "# normal q-q plot\n",
211 |     "plt.subplot(224)\n",
212 |     "z = (ts_star-np.mean(ts_star)) / np.std(ts_star)\n",
213 |     "stats.probplot(z, dist=\"norm\", plot=plt)\n",
214 |     "\n",
215 |     "plt.show()\n",
216 |     "\n",
217 |     "# Calculate p-value manually\n",
218 |     "pvalue = np.argwhere(ts_star > 1.3257).size / B\n",
219 |     "print(pvalue)"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": null,
225 |    "metadata": {},
226 |    "outputs": [],
227 |    "source": [
228 |     "# 스튜던트 t 분포\n",
229 |     "# 다양한 자유도 값에 따른 t 밀도함수\n",
230 |     "INF = 999\n",
231 |     "nlist = np.array([1, 2, 5, INF])\n",
232 |     "x = np.arange(-5.0, 5.05, 0.05)\n",
233 |     "\n",
234 |     "fig, ax = plt.subplots()\n",
235 |     "ax.set_xlim(-5, 5)\n",
236 |     "ax.set_ylim(0, 0.45)\n",
237 |     "\n",
238 |     "for df in nlist:\n",
239 |     "    ax.plot(x, stats.t.pdf(x, df), label=\"df={}\".format(df))\n",
240 |     "\n",
241 |     "ax.legend()\n",
242 |     "plt.show()"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": null,
248 |    "metadata": {},
249 |    "outputs": [],
250 |    "source": [
251 |     "# 8. 신뢰구간의 의미\n",
252 |     "\n",
253 |     "n = 10\n",
254 |     "mu = 1\n",
255 |     "sigma = 1.8\n",
256 |     "confidence_level = 0.95\n",
257 |     "\n",
258 |     "# calculate confidence interval on different set of values\n",
259 |     "# from the same probability distribution\n",
260 |     "np.random.seed(1708)\n",
261 |     "y_star = np.random.normal(mu, sigma, n)\n",
262 |     "print(stats.t.interval(confidence_level, len(y_star)-1, \n",
263 |     "                       loc=np.mean(y_star), scale=stats.sem(y_star)))\n",
264 |     "\n",
265 |     "y_star = np.random.normal(mu, sigma, n)\n",
266 |     "print(stats.t.interval(confidence_level, len(y_star)-1, \n",
267 |     "                       loc=np.mean(y_star), scale=stats.sem(y_star)))\n",
268 |     "\n",
269 |     "y_star = np.random.normal(mu, sigma, n)\n",
270 |     "print(stats.t.interval(confidence_level, len(y_star)-1, \n",
271 |     "                       loc=np.mean(y_star), scale=stats.sem(y_star)))"
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": null,
277 |    "metadata": {},
278 |    "outputs": [],
279 |    "source": [
280 |     "# run simulation \n",
281 |     "B = 100\n",
282 |     "n = 10\n",
283 |     "\n",
284 |     "conf_intervals = pd.DataFrame(\n",
285 |     "    data={\n",
286 |     "        \"b\": np.zeros(B, dtype=int),\n",
287 |     "        \"lower\": np.zeros(B, dtype=float),\n",
288 |     "        \"xbar\": np.zeros(B, dtype=float),\n",
289 |     "        \"upper\": np.zeros(B, dtype=float)\n",
290 |     "    }\n",
291 |     ")\n",
292 |     "\n",
293 |     "true_mu = 1.0\n",
294 |     "sigma = 1.8\n",
295 |     "confidence_level = 0.95\n",
296 |     "np.random.seed(1708)\n",
297 |     "\n",
298 |     "plt.figure(figsize=(10, 10))\n",
299 |     "\n",
300 |     "for b in range(B):\n",
301 |     "    y_star = np.random.normal(true_mu, sigma, n)\n",
302 |     "    lower, upper = stats.t.interval(confidence_level, len(y_star)-1, \n",
303 |     "                                    loc=np.mean(y_star), scale=stats.sem(y_star))\n",
304 |     "    conf_intervals.loc[b, \"b\"] = b\n",
305 |     "    conf_intervals.loc[b, \"lower\"] = lower\n",
306 |     "    conf_intervals.loc[b, \"xbar\"] = y_star.mean()\n",
307 |     "    conf_intervals.loc[b, \"upper\"] = upper\n",
308 |     " \n",
309 |     "for index, row in conf_intervals.iterrows():\n",
310 |     "    if row[\"lower\"] <= true_mu and true_mu <= row[\"upper\"]: \n",
311 |     "        conf_intervals.loc[index, \"lucky\"] = True\n",
312 |     "    else:\n",
313 |     "        conf_intervals.loc[index, \"lucky\"] = False\n",
314 |     "\n",
315 |     "lucky = conf_intervals.loc[conf_intervals.loc[:,\"lucky\"]==True]\n",
316 |     "unlucky = conf_intervals.loc[conf_intervals.loc[:, \"lucky\"]==False]\n",
317 |     "\n",
318 |     "plt.scatter(lucky[\"b\"], lucky[\"xbar\"], color=\"b\", label=\"lucky\")\n",
319 |     "plt.scatter(unlucky[\"b\"], unlucky[\"xbar\"], color=\"r\", label=\"unlucky\")\n",
320 |     "plt.hlines(y=true_mu, xmin=-10, xmax=B+10, lw=1, color=\"r\")\n",
321 |     "for index, row in lucky.iterrows():\n",
322 |     "    x, ymax, ymin = row[\"b\"], row[\"upper\"], row[\"lower\"]\n",
323 |     "    plt.vlines(x=x, ymin=ymin, ymax=ymax, color=\"b\", lw=1)\n",
324 |     "for index, row in unlucky.iterrows():\n",
325 |     "    x, ymax, ymin = row[\"b\"], row[\"upper\"], row[\"lower\"]\n",
326 |     "    plt.vlines(x=x, ymin=ymin, ymax=ymax, color=\"r\", lw=1)\n",
327 |     "    \n",
328 |     "plt.xlabel(\"b\")\n",
329 |     "plt.ylabel(\"xbar\")\n",
330 |     "plt.legend()\n",
331 |     "plt.show()"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": null,
337 |    "metadata": {},
338 |    "outputs": [],
339 |    "source": [
340 |     "# 6.10.2. 중심극한정리\n",
341 |     "np.random.seed(1708)\n",
342 |     "B = 10000\n",
343 |     "n = 10\n",
344 |     "xbars_star = np.zeros(B//n, dtype=float)\n",
345 |     "extra = np.random.randint(2, size=B)\n",
346 |     "\n",
347 |     "fig = plt.figure(figsize=(15, 5))\n",
348 |     "fig.subplots_adjust(hspace=.5, wspace=.5)\n",
349 |     "\n",
350 |     "for b in range(B//n):\n",
351 |     "    xbars_star[b] = np.mean(extra[b:b+10])\n",
352 |     "    \n",
353 |     "ax1 = fig.add_subplot(121)\n",
354 |     "ax1.set_title(\"Individual sleep time increase\")\n",
355 |     "ax1.set_xlabel(\"c(0, 1)\")\n",
356 |     "ax1.set_ylabel(\"Density\")\n",
357 |     "sns.distplot(extra, ax=ax1, kde=False)\n",
358 |     "\n",
359 |     "ax2 = fig.add_subplot(122)\n",
360 |     "ax2.set_title(\"Sample mean of 10 obs\")\n",
361 |     "ax2.set_xlabel(\"xbars_star\")\n",
362 |     "ax2.set_ylabel(\"Frequency\")\n",
363 |     "sns.distplot(xbars_star, ax=ax2, kde=False)\n",
364 |     "\n",
365 |     "plt.show()"
366 |    ]
367 |   }
368 |  ],
369 |  "metadata": {
370 |   "kernelspec": {
371 |    "display_name": "Python 3",
372 |    "language": "python",
373 |    "name": "python3"
374 |   },
375 |   "language_info": {
376 |    "codemirror_mode": {
377 |     "name": "ipython",
378 |     "version": 3
379 |    },
380 |    "file_extension": ".py",
381 |    "mimetype": "text/x-python",
382 |    "name": "python",
383 |    "nbconvert_exporter": "python",
384 |    "pygments_lexer": "ipython3",
385 |    "version": "3.6.1"
386 |   }
387 |  },
388 |  "nbformat": 4,
389 |  "nbformat_minor": 2
390 | }
391 | 


--------------------------------------------------------------------------------
/ch07-basic-analysis/ch07-basic-analysis.R:
--------------------------------------------------------------------------------
  1 | library(tidyverse)
  2 | library(gridExtra)
  3 | 
  4 | mpg <- tbl_df(mpg)
  5 | mpg
  6 | 
  7 | # 7.2. 모든 자료에 행해야 할 분석
  8 | library(dplyr)
  9 | library(ggplot2)
 10 | glimpse(mpg)
 11 | 
 12 | head(mpg)
 13 | 
 14 | summary(mpg)
 15 | 
 16 | # 7.3. 수량형 변수의 분석
 17 | summary(mpg$hwy)
 18 | mean(mpg$hwy)
 19 | median(mpg$hwy)
 20 | range(mpg$hwy)
 21 | quantile(mpg$hwy)
 22 | 
 23 | 
 24 | png("../plots/7-1.png", 5.5*.8, 4, units='in', pointsize=9, res=600)
 25 | opar <- par(mfrow=c(2,2))
 26 | hist(mpg$hwy)
 27 | boxplot(mpg$hwy)
 28 | qqnorm(mpg$hwy)
 29 | qqline(mpg$hwy)
 30 | par(opar)
 31 | dev.off()
 32 | 
 33 | 
 34 | # 7.3.1. 일변량 t-검정
 35 | hwy <- mpg$hwy
 36 | n <- length(hwy)
 37 | mu0 <- 22.9
 38 | t.test(hwy, mu=mu0, alternative = "greater")
 39 | 
 40 | 
 41 | t.test(hwy)
 42 | 
 43 | # 7.3.2. 이상점과 로버스트 통계방법
 44 | c(mean(hwy), sd(hwy))
 45 | c(median(hwy), mad(hwy))
 46 | 
 47 | 
 48 | # 7.4. 성공-실패값 범주형 변수의 분석
 49 | set.seed(1606)
 50 | n <- 100
 51 | p <- 0.5
 52 | x <- rbinom(n, 1, p)
 53 | x <- factor(x, levels = c(0,1), labels = c("no", "yes"))
 54 | x
 55 | 
 56 | table(x)
 57 | 
 58 | prop.table(table(x))
 59 | 
 60 | barplot(table(x))
 61 | 
 62 | binom.test(x=length(x[x=='yes']), n = length(x), p = 0.5, alternative = "two.sided")
 63 | 
 64 | 
 65 | 
 66 | binom.test(x=5400, n = 10000)
 67 | 
 68 | 
 69 | n <- c(100, 1000, 2000, 10000, 1e6)
 70 | data.frame(n=n, moe=round(1.96 * sqrt(1/(4 * n)),4))
 71 | curve(1.96 * sqrt(1/(4 * x)), 10, 10000, log='x')
 72 | grid()
 73 | 
 74 | png("../plots/7-2.png", 5.5, 4, units='in', pointsize=9, res=600)
 75 | n <- c(100, 1000, 2000, 10000, 1e6)
 76 | data.frame(n=n, moe=round(1.96 * sqrt(1/(4 * n)),4))
 77 | curve(1.96 * sqrt(1/(4 * x)), 10, 10000, log='x')
 78 | grid()
 79 | dev.off()
 80 | 
 81 | 
 82 | # 7.6. 수량형 X, 수량형 Y의 분석
 83 | 
 84 | ggplot(mpg, aes(cty, hwy)) + geom_jitter() + geom_smooth(method="lm")
 85 | ggsave("../plots/7-4.png", width=5.5, height=4, units='in', dpi=600)
 86 | 
 87 | cor(mpg$cty, mpg$hwy)
 88 | with(mpg, cor(cty, hwy))
 89 | with(mpg, cor(cty, hwy, method = "kendall"))
 90 | with(mpg, cor(cty, hwy, method = "spearman"))
 91 | 
 92 | 
 93 | # 7.6.3. 선형회귀모형 적합
 94 | 
 95 | (hwy_lm <- lm(hwy ~ cty, data=mpg))
 96 | summary(hwy_lm)
 97 | 
 98 | predict(hwy_lm)
 99 | resid(hwy_lm)
100 | predict(hwy_lm, newdata = data.frame(cty=c(10, 20, 30)))
101 | 
102 | 
103 | 
104 | opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
105 | plot(hwy_lm, las = 1)   # Residuals, Fitted, ...
106 | par(opar)
107 | 
108 | png("../plots/7-6.png", 5.5, 4*1.2, units='in', pointsize=9, res=600)
109 | opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
110 | plot(hwy_lm, las = 1)   # Residuals, Fitted, ...
111 | par(opar)
112 | dev.off()
113 | 
114 | # 7.6.6. 로버스트 선형 회귀분석
115 | 
116 | library(MASS)
117 | set.seed(123) # make reproducible
118 | lqs(stack.loss ~ ., data = stackloss) # 로버스트
119 | lm(stack.loss ~ ., data = stackloss) # 보통 선형모형
120 | 
121 | 
122 | # 7.6.7. 비선형/비모수적 방법, 평활법과 LOESS
123 | 
124 | plot(hwy ~ displ, data=mpg)
125 | mpg_lo <- loess(hwy ~ displ, data=mpg)
126 | mpg_lo
127 | summary(mpg_lo)
128 | xs <- seq(2,7,length.out = 100)
129 | mpg_pre <- predict(mpg_lo, newdata=data.frame(displ=xs), se=TRUE)
130 | lines(xs, mpg_pre$fit)
131 | lines(xs, mpg_pre$fit - 1.96*mpg_pre$se.fit, lty=2)
132 | lines(xs, mpg_pre$fit + 1.96*mpg_pre$se.fit, lty=2)
133 | 
134 | ggplot(mpg, aes(displ, hwy)) +
135 |   geom_point() +
136 |   geom_smooth()
137 | 
138 | 
139 | png("../plots/7-8-left.png", 5.5*.8, 4, units='in', pointsize=9, res=600)
140 | plot(hwy ~ displ, data=mpg)
141 | mpg_lo <- loess(hwy ~ displ, data=mpg)
142 | xs <- seq(2,7,length.out = 100)
143 | mpg_pre <- predict(mpg_lo, newdata=data.frame(displ=xs), se=TRUE)
144 | lines(xs, mpg_pre$fit)
145 | lines(xs, mpg_pre$fit - 1.96*mpg_pre$se.fit, lty=2)
146 | lines(xs, mpg_pre$fit + 1.96*mpg_pre$se.fit, lty=2)
147 | dev.off()
148 | 
149 | ggplot(mpg, aes(displ, hwy)) +
150 |   geom_point() +
151 |   geom_smooth()
152 | ggsave("../plots/7-8-right.png", width=5.5*.8, height=4, units='in', dpi=600)
153 | 
154 | 
155 | # 7.7. 범주형 x, 수량형 y
156 | 
157 | mpg %>% ggplot(aes(class, hwy)) + geom_boxplot()
158 | ggsave("../plots/7-9.png", width=5.5, height=4, units='in', dpi=600)
159 | 
160 | 
161 | (hwy_lm2 <- lm(hwy ~ class, data=mpg))
162 | summary(hwy_lm2)
163 | 
164 | 
165 | predict(hwy_lm2, newdata=data.frame(class="pickup"))
166 | 
167 | opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
168 | plot(hwy_lm2, las = 1)    # Residuals, Fitted, ...
169 | par(opar)
170 | 
171 | png("../plots/7-10.png", 5.5*.8, 4, units='in', pointsize=9, res=600)
172 | opar <- par(mfrow = c(2,2), oma = c(0, 0, 1.1, 0))
173 | plot(hwy_lm2, las = 1)    # Residuals, Fitted, ...
174 | par(opar)
175 | dev.off()
176 | 
177 | 
178 | # 7.8. 수량형 x, 범주형 y (성공-실패)
179 | 
180 | library(gridExtra)
181 | p1 <- ggplot(data.frame(x=c(0, 1)), aes(x)) +
182 |   stat_function(fun=function(x) log(x/(1-x))) + ylab('logit(x)') +
183 |   ggtitle("Logit function")
184 | p2 <- ggplot(data.frame(y=c(-6, 6)), aes(y)) +
185 |   stat_function(fun=function(y) 1/(1+exp(-y))) + ylab('logistic(y)') +
186 |   ggtitle("Logistic function")
187 | g <- arrangeGrob(p1, p2, ncol=2)
188 | ggsave("../plots/7-11.png", g, width=5.5*1.5, height=4, units='in', dpi=600)
189 | 
190 | 
191 | chall <- read.csv('https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/challenger.csv')
192 | chall <- tbl_df(chall)
193 | glimpse(chall)
194 | 
195 | 
196 | chall %>% ggplot(aes(temperature, distress_ct)) +
197 |   geom_point()
198 | 
199 | chall %>% ggplot(aes(factor(distress_ct), temperature)) +
200 |   geom_boxplot()
201 | 
202 | 
203 | p1 <- chall %>% ggplot(aes(temperature, distress_ct)) +
204 |   geom_point()
205 | p2 <- chall %>% ggplot(aes(factor(distress_ct), temperature)) +
206 |   geom_boxplot()
207 | g <- arrangeGrob(p1, p2, ncol=2)
208 | ggsave("../plots/7-12.png", g, width=5.5*1.5, height=4, units='in', dpi=600)
209 | 
210 | 
211 | (chall_glm <-
212 |     glm(cbind(distress_ct, o_ring_ct - distress_ct) ~
213 |         temperature, data=chall, family='binomial'))
214 | 
215 | summary(chall_glm)
216 | 
217 | predict(chall_glm, data.frame(temperature=30))
218 | 
219 | exp(3.45) / (exp(3.45) +1)
220 | predict(chall_glm, data.frame(temperature=30), type='response')
221 | 
222 | 
223 | logistic <- function(x){exp(x)/(exp(x)+1)}
224 | 
225 | plot(c(20,85), c(0,1), type = "n", xlab = "temperature",
226 |      ylab = "prob")
227 | tp <- seq(20, 85, 1)
228 | chall_glm_pred <-
229 |   predict(chall_glm,
230 |           data.frame(temperature = tp),
231 |           se.fit = TRUE)
232 | lines(tp, logistic(chall_glm_pred$fit))
233 | lines(tp, logistic(chall_glm_pred$fit - 1.96 * chall_glm_pred$se.fit), lty=2)
234 | lines(tp, logistic(chall_glm_pred$fit + 1.96 * chall_glm_pred$se.fit), lty=2)
235 | abline(v=30, lty=2, col='blue')
236 | 
237 | 
238 | logistic <- function(x){exp(x)/(exp(x)+1)}
239 | 
240 | png("../plots/7-13.png", 5.5*.8, 4, units='in', pointsize=9, res=600)
241 | plot(c(20,85), c(0,1), type = "n", xlab = "temperature", ylab = "prob")
242 | tp <- seq(20, 85, 1)
243 | chall_glm_pred <- predict(chall_glm, data.frame(temperature = tp), se.fit = TRUE)
244 | lines(tp, logistic(chall_glm_pred$fit))
245 | lines(tp, logistic(chall_glm_pred$fit - 1.96 * chall_glm_pred$se.fit), lty=2)
246 | lines(tp, logistic(chall_glm_pred$fit + 1.96 * chall_glm_pred$se.fit), lty=2)
247 | abline(v=30, lty=2, col='blue')
248 | dev.off()
249 | 
250 | 


--------------------------------------------------------------------------------
/ch07-basic-analysis/ch07-basic-analysis.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 


--------------------------------------------------------------------------------
/ch07-basic-analysis/correlation-example.R:
--------------------------------------------------------------------------------
 1 | # Source: https://commons.wikimedia.org/wiki/File:Correlation_examples2.svg
 2 | 
 3 | 
 4 | #Title: An example of the correlation of x and y for various distributions of (x,y) pairs
 5 | #Tags: Mathematics; Statistics; Correlation
 6 | #Author: Denis Boigelot
 7 | #Packets needed : mvtnorm (rmvnorm), RSVGTipsDevice (devSVGTips)
 8 | #How to use: output()
 9 | #
10 | #This is an translated version in R of an Matematica 6 code by Imagecreator.
11 | 
12 | library(mvtnorm)
13 | library(RSVGTipsDevice)
14 | 
15 | MyPlot <- function(xy, xlim = c(-4, 4), ylim = c(-4, 4), eps = 1e-15) {
16 |    title = round(cor(xy[,1], xy[,2]), 1)
17 |    if (sd(xy[,2]) < eps) title = "" # corr. coeff. is undefined
18 |    plot(xy, main = title, xlab = "", ylab = "",
19 |         col = "darkblue", pch = 16, cex = 0.2,
20 |         xaxt = "n", yaxt = "n", bty = "n",
21 |         xlim = xlim, ylim = ylim)
22 | }
23 | 
24 | MvNormal <- function(n = 1000, cor = 0.8) {
25 |    for (i in cor) {
26 |       sd = matrix(c(1, i, i, 1), ncol = 2)
27 |       x = rmvnorm(n, c(0, 0), sd)
28 |       MyPlot(x)
29 |    }
30 | }
31 | 
32 | rotation <- function(t, X) return(X %*% matrix(c(cos(t), sin(t), -sin(t), cos(t)), ncol = 2))
33 | 
34 | RotNormal <- function(n = 1000, t = pi/2) {
35 |    sd = matrix(c(1, 1, 1, 1), ncol = 2)
36 |    x = rmvnorm(n, c(0, 0), sd)
37 |    for (i in t)
38 |       MyPlot(rotation(i, x))
39 | }
40 | 
41 | Others <- function(n = 1000) {
42 |    x = runif(n, -1, 1)
43 |    y = 4 * (x^2 - 1/2)^2 + runif(n, -1, 1)/3
44 |    MyPlot(cbind(x,y), xlim = c(-1, 1), ylim = c(-1/3, 1+1/3))
45 | 
46 |    y = runif(n, -1, 1)
47 |    xy = rotation(-pi/8, cbind(x,y))
48 |    lim = sqrt(2+sqrt(2)) / sqrt(2)
49 |    MyPlot(xy, xlim = c(-lim, lim), ylim = c(-lim, lim))
50 | 
51 |    xy = rotation(-pi/8, xy)
52 |    MyPlot(xy, xlim = c(-sqrt(2), sqrt(2)), ylim = c(-sqrt(2), sqrt(2)))
53 |    
54 |    y = 2*x^2 + runif(n, -1, 1)
55 |    MyPlot(cbind(x,y), xlim = c(-1, 1), ylim = c(-1, 3))
56 | 
57 |    y = (x^2 + runif(n, 0, 1/2)) * sample(seq(-1, 1, 2), n, replace = TRUE)
58 |    MyPlot(cbind(x,y), xlim = c(-1.5, 1.5), ylim = c(-1.5, 1.5))
59 | 
60 |    y = cos(x*pi) + rnorm(n, 0, 1/8)
61 |    x = sin(x*pi) + rnorm(n, 0, 1/8)
62 |    MyPlot(cbind(x,y), xlim = c(-1.5, 1.5), ylim = c(-1.5, 1.5))
63 | 
64 |    xy1 = rmvnorm(n/4, c( 3,  3))
65 |    xy2 = rmvnorm(n/4, c(-3,  3))
66 |    xy3 = rmvnorm(n/4, c(-3, -3))
67 |    xy4 = rmvnorm(n/4, c( 3, -3))
68 |    MyPlot(rbind(xy1, xy2, xy3, xy4), xlim = c(-3-4, 3+4), ylim = c(-3-4, 3+4))
69 | }
70 | 
71 | output <- function() {
72 |    devSVGTips(width = 7, height = 3.2) # remove first and last line for no svg exporting
73 |    par(mfrow = c(3, 7), oma = c(0,0,0,0), mar=c(2,2,2,0))
74 |    MvNormal(800, c(1.0, 0.8, 0.4, 0.0, -0.4, -0.8, -1.0));
75 |    RotNormal(200, c(0, pi/12, pi/6, pi/4, pi/2-pi/6, pi/2-pi/12, pi/2));
76 |    Others(800)
77 |    dev.off() # remove first and last line for no svg exporting
78 | }
79 | 
80 | output() # Produce
81 | 
82 | {
83 |   library(mvtnorm)
84 |   png('../plots/7-5.png', width = 7, height = 3.2, units = 'in',
85 |       res=600, pointsize = 12) # remove first and last line for no svg exporting
86 |   par(mfrow = c(3, 7), oma = c(0,0,0,0), mar=c(2,2,2,0))
87 |   MvNormal(800, c(1.0, 0.8, 0.4, 0.0, -0.4, -0.8, -1.0));
88 |   RotNormal(200, c(0, pi/12, pi/6, pi/4, pi/2-pi/6, pi/2-pi/12, pi/2));
89 |   Others(800)
90 |   dev.off() # remove first and last line for no svg exporting
91 | }
92 | 


--------------------------------------------------------------------------------
/ch08-classification/adult/adult.R:
--------------------------------------------------------------------------------
  1 | # 8. 빅데이터 분류분석 I: 기본개념과 로지스틱모형
  2 | 
  3 | install.packages(c("dplyr", "ggplot2", "ISLR", "MASS", "glmnet",
  4 |                    "randomForest", "gbm", "rpart", "boot"))
  5 | 
  6 | library(tidyverse)
  7 | library(gridExtra)
  8 | library(ROCR)
  9 | 
 10 | library(ISLR)
 11 | library(MASS)
 12 | library(glmnet)
 13 | library(randomForest)
 14 | library(gbm)
 15 | library(rpart)
 16 | library(boot)
 17 | 
 18 | 
 19 | 
 20 | 
 21 | binomial_deviance <- function(y_obs, yhat){
 22 |   epsilon = 0.0001
 23 |   yhat = ifelse(yhat < epsilon, epsilon, yhat)
 24 |   yhat = ifelse(yhat > 1-epsilon, 1-epsilon, yhat)
 25 |   a = ifelse(y_obs==0, 0, y_obs * log(y_obs/yhat))
 26 |   b = ifelse(y_obs==1, 0, (1-y_obs) * log((1-y_obs)/(1-yhat)))
 27 |   return(2*sum(a + b))
 28 | }
 29 | 
 30 | 
 31 | 
 32 | 
 33 | # curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data > adult.data
 34 | # curl  https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names > adult.names
 35 | 
 36 | adult <- read.csv("adult.data", header = FALSE, strip.white = TRUE)
 37 | names(adult) <- c('age', 'workclass', 'fnlwgt', 'education',
 38 |                   'education_num', 'marital_status', 'occupation',
 39 |                   'relationship', 'race', 'sex',
 40 |                   'capital_gain', 'capital_loss',
 41 |                   'hours_per_week', 'native_country',
 42 |                   'wage')
 43 | 
 44 | 
 45 | glimpse(adult)
 46 | 
 47 | summary(adult)
 48 | 
 49 | levels(adult$wage)
 50 | 
 51 | # 8.3.3. 범주형 설명변수에서 문제의 복잡도
 52 | 
 53 | levels(adult$race)
 54 | adult$race[1:5]
 55 | levels(adult$sex)
 56 | adult$sex[1:5]
 57 | 
 58 | x <- model.matrix( ~ race + sex + age, adult)
 59 | glimpse(x)
 60 | colnames(x)
 61 | 
 62 | 
 63 | x_orig <- adult %>% dplyr::select(sex, race, age)
 64 | View(x_orig)
 65 | 
 66 | x_mod <- model.matrix( ~ sex + race + age, adult)
 67 | View(x_mod)
 68 | 
 69 | 
 70 | x <- model.matrix( ~ . - wage, adult)
 71 | dim(x)
 72 | 
 73 | # 8.4. 훈련, 검증, 테스트셋의 구분
 74 | 
 75 | set.seed(1601)
 76 | n <- nrow(adult)
 77 | idx <- 1:n
 78 | training_idx <- sample(idx, n * .60)
 79 | idx <- setdiff(idx, training_idx)
 80 | validate_idx = sample(idx, n * .20)
 81 | test_idx <- setdiff(idx, validate_idx)
 82 | length(training_idx)
 83 | length(validate_idx)
 84 | length(test_idx)
 85 | training <- adult[training_idx,]
 86 | validation <- adult[validate_idx,]
 87 | test <- adult[test_idx,]
 88 | 
 89 | 
 90 | # 8.5. 시각화
 91 | 
 92 | training %>%
 93 |   ggplot(aes(age, fill=wage)) +
 94 |   geom_density(alpha=.5)
 95 | ggsave("../../plots/8-3.png", width=5.5, height=4, units='in', dpi=600)
 96 | 
 97 | 
 98 | 
 99 | training %>%
100 |   filter(race %in% c('Black', 'White')) %>%
101 |   ggplot(aes(age, fill=wage)) +
102 |   geom_density(alpha=.5) +
103 |   ylim(0, 0.1) +
104 |   facet_grid(race ~ sex, scales = 'free_y')
105 | ggsave("../../plots/8-4.png", width=5.5, height=4, units='in', dpi=600)
106 | 
107 | 
108 | 
109 | training %>%
110 |   ggplot(aes(`education_num`, fill=wage)) +
111 |   geom_bar()
112 | ggsave("../../plots/8-5.png", width=5.5, height=4, units='in', dpi=600)
113 | 
114 | 
115 | # 8.6. 로지스틱 회귀분석
116 | ad_glm_full <- glm(wage ~ ., data=training, family=binomial)
117 | 
118 | summary(ad_glm_full)
119 | 
120 | 
121 | alias(ad_glm_full)
122 | 
123 | 
124 | predict(ad_glm_full, newdata = adult[1:5,], type="response")
125 | 
126 | 
127 | # 8.6.4. 예측 정확도 지표
128 | y_obs <- ifelse(validation$wage == ">50K", 1, 0)
129 | yhat_lm <- predict(ad_glm_full, newdata=validation, type='response')
130 | 
131 | library(gridExtra)
132 | 
133 | p1 <- ggplot(data.frame(y_obs, yhat_lm),
134 |              aes(y_obs, yhat_lm, group=y_obs,
135 |                  fill=factor(y_obs))) +
136 |   geom_boxplot()
137 | p2 <- ggplot(data.frame(y_obs, yhat_lm),
138 |              aes(yhat_lm, fill=factor(y_obs))) +
139 |   geom_density(alpha=.5)
140 | grid.arrange(p1, p2, ncol=2)
141 | 
142 | g <- arrangeGrob(p1, p2, ncol=2)
143 | ggsave("../../plots/8-6.png", g, width=5.5*1.5, height=4, units='in', dpi=600)
144 | 
145 | 
146 | 
147 | binomial_deviance(y_obs, yhat_lm)
148 | 
149 | library(ROCR)
150 | pred_lm <- prediction(yhat_lm, y_obs)
151 | perf_lm <- performance(pred_lm, measure = "tpr", x.measure = "fpr")
152 | plot(perf_lm, col='black', main="ROC Curve for GLM")
153 | abline(0,1)
154 | performance(pred_lm, "auc")@y.values[[1]]
155 | 
156 | 
157 | png("../../plots/8-7.png", 5.5, 4, units='in', pointsize=9, res=600)
158 | pred_lm <- prediction(yhat_lm, y_obs)
159 | perf_lm <- performance(pred_lm, measure = "tpr", x.measure = "fpr")
160 | plot(perf_lm, col='black', main="ROC Curve for GLM")
161 | abline(0,1)
162 | dev.off()
163 | 
164 | 
165 | # 9. 빅데이터 분류분석 II: 라쏘와 랜덤포레스트
166 | 
167 | # 9.1. glmnet 함수를 통한 라쏘 모형, 능형회귀, 변수선택
168 | xx <- model.matrix(wage ~ .-1, adult)
169 | x <- xx[training_idx, ]
170 | y <- ifelse(training$wage == ">50K", 1, 0)
171 | dim(x)
172 | 
173 | ad_glmnet_fit <- glmnet(x, y)
174 | 
175 | plot(ad_glmnet_fit)
176 | 
177 | png("../../plots/9-1.png", 5.5, 4, units='in', pointsize=9, res=600)
178 | plot(ad_glmnet_fit)
179 | dev.off()
180 | 
181 | ad_glmnet_fit
182 | 
183 | coef(ad_glmnet_fit, s = c(.1713, .1295))
184 | 
185 | 
186 | 
187 | ad_cvfit <- cv.glmnet(x, y, family = "binomial")
188 | 
189 | plot(ad_cvfit)
190 | 
191 | png("../../plots/9-2.png", 5.5, 4, units='in', pointsize=9, res=600)
192 | plot(ad_cvfit)
193 | dev.off()
194 | 
195 | log(ad_cvfit$lambda.min)
196 | log(ad_cvfit$lambda.1se)
197 | 
198 | coef(ad_cvfit, s=ad_cvfit$lambda.1se)
199 | coef(ad_cvfit, s="lambda.1se")
200 | 
201 | length(which(coef(ad_cvfit, s="lambda.min")>0))
202 | length(which(coef(ad_cvfit, s="lambda.1se")>0))
203 | 
204 | # 9.1.4.  값의 선택
205 | 
206 | set.seed(1607)
207 | foldid <- sample(1:10, size=length(y), replace=TRUE)
208 | cv1 <- cv.glmnet(x, y, foldid=foldid, alpha=1, family='binomial')
209 | cv.5 <- cv.glmnet(x, y, foldid=foldid, alpha=.5, family='binomial')
210 | cv0 <- cv.glmnet(x, y, foldid=foldid, alpha=0, family='binomial')
211 | 
212 | png("../../plots/9-3.png", 5.5, 4, units='in', pointsize=7, res=600)
213 | par(mfrow=c(2,2))
214 | plot(cv1, main="Alpha=1.0")
215 | plot(cv.5, main="Alpha=0.5")
216 | plot(cv0, main="Alpha=0.0")
217 | plot(log(cv1$lambda), cv1$cvm, pch=19, col="red",
218 |      xlab="log(Lambda)", ylab=cv1$name, main="alpha=1.0")
219 | points(log(cv.5$lambda), cv.5$cvm, pch=19, col="grey")
220 | points(log(cv0$lambda), cv0$cvm, pch=19, col="blue")
221 | legend("topleft", legend=c("alpha= 1", "alpha= .5", "alpha 0"),
222 |        pch=19, col=c("red","grey","blue"))
223 | dev.off()
224 | 
225 | 
226 | predict(ad_cvfit, s="lambda.1se", newx = x[1:5,], type='response')
227 | 
228 | y_obs <- ifelse(validation$wage == ">50K", 1, 0)
229 | yhat_glmnet <- predict(ad_cvfit, s="lambda.1se", newx=xx[validate_idx,], type='response')
230 | yhat_glmnet <- yhat_glmnet[,1] # change to a vectro from [n*1] matrix
231 | binomial_deviance(y_obs, yhat_glmnet)
232 | # [1] 4257.118
233 | pred_glmnet <- prediction(yhat_glmnet, y_obs)
234 | perf_glmnet <- performance(pred_glmnet, measure="tpr", x.measure="fpr")
235 | 
236 | performance(pred_glmnet, "auc")@y.values[[1]]
237 | 
238 | png("../../plots/9-4.png", 5.5, 4, units='in', pointsize=9, res=600)
239 | plot(perf_lm, col='black', main="ROC Curve")
240 | plot(perf_glmnet, col='blue', add=TRUE)
241 | abline(0,1, col='gray')
242 | legend('bottomright', inset=.1,
243 |        legend=c("GLM", "glmnet"),
244 |        col=c('black', 'blue'), lty=1, lwd=2)
245 | dev.off()
246 | 
247 | 
248 | # 9.2. 나무모형
249 | library(rpart)
250 | cvr_tr <- rpart(wage ~ ., data = training)
251 | cvr_tr
252 | 
253 | 
254 | printcp(cvr_tr)
255 | summary(cvr_tr)
256 | 
257 | 
258 | 
259 | png("../../plots/9-6.png", 5.5, 4, units='in', pointsize=9, res=600)
260 | opar <- par(mfrow = c(1,1), xpd = NA)
261 | plot(cvr_tr)
262 | text(cvr_tr, use.n = TRUE)
263 | par(opar)
264 | dev.off()
265 | 
266 | 
267 | yhat_tr <- predict(cvr_tr, validation)
268 | yhat_tr <- yhat_tr[,">50K"]
269 | binomial_deviance(y_obs, yhat_tr)
270 | pred_tr <- prediction(yhat_tr, y_obs)
271 | perf_tr <- performance(pred_tr, measure = "tpr", x.measure = "fpr")
272 | performance(pred_tr, "auc")@y.values[[1]]
273 | 
274 | png("../../plots/9-7.png", 5.5, 4, units='in', pointsize=9, res=600)
275 | plot(perf_lm, col='black', main="ROC Curve")
276 | plot(perf_tr, col='blue', add=TRUE)
277 | abline(0,1, col='gray')
278 | legend('bottomright', inset=.1,
279 |     legend = c("GLM", "Tree"),
280 |     col=c('black', 'blue'), lty=1, lwd=2)
281 | dev.off()
282 | 
283 | 
284 | # 9.3. 랜덤 포레스트 -----------
285 | 
286 | set.seed(1607)
287 | ad_rf <- randomForest(wage ~ ., training)
288 | ad_rf
289 | 
290 | png("../../plots/9-8.png", 5.5, 4, units='in', pointsize=9, res=600)
291 | plot(ad_rf)
292 | dev.off()
293 | 
294 | tmp <- importance(ad_rf)
295 | head(round(tmp[order(-tmp[,1]), 1, drop=FALSE], 2), n=10)
296 | 
297 | png("../../plots/9-9.png", 5.5, 4, units='in', pointsize=9, res=600)
298 | varImpPlot(ad_rf)
299 | dev.off()
300 | 
301 | predict(ad_rf, newdata = adult[1:5,])
302 | 
303 | predict(ad_rf, newdata = adult[1:5,], type="prob")
304 | 
305 | 
306 | yhat_rf <- predict(ad_rf, newdata=validation, type='prob')[,'>50K']
307 | binomial_deviance(y_obs, yhat_rf)
308 | pred_rf <- prediction(yhat_rf, y_obs)
309 | perf_rf <- performance(pred_rf, measure="tpr", x.measure="fpr")
310 | performance(pred_tr, "auc")@y.values[[1]]
311 | 
312 | png("../../plots/9-10.png", 5.5, 4, units='in', pointsize=9, res=600)
313 | plot(perf_lm, col='black', main="ROC Curve")
314 | plot(perf_glmnet, add=TRUE, col='blue')
315 | plot(perf_rf, add=TRUE, col='red')
316 | abline(0,1, col='gray')
317 | legend('bottomright', inset=.1,
318 |        legend = c("GLM", "glmnet", "RF"),
319 |        col=c('black', 'blue', 'red'), lty=1, lwd=2)
320 | dev.off()
321 | 
322 | 
323 | # 9.3.5. 예측확률값 자체의 비교
324 | p1 <- data.frame(yhat_glmnet, yhat_rf) %>%
325 |   ggplot(aes(yhat_glmnet, yhat_rf)) +
326 |   geom_point(alpha=.5) +
327 |   geom_abline() +
328 |   geom_smooth()
329 | p2 <- reshape2::melt(data.frame(yhat_glmnet, yhat_rf)) %>%
330 |   ggplot(aes(value, fill=variable)) +
331 |   geom_density(alpha=.5)
332 | grid.arrange(p1, p2, ncol=2)
333 | g <- arrangeGrob(p1, p2, ncol=2)
334 | ggsave("../../plots/9-11.png", g, width=5.5*1.2, height=4*.8, units='in', dpi=600)
335 | 
336 | 
337 | # 9.4. 부스팅 ----------
338 | 
339 | set.seed(1607)
340 | adult_gbm <- training %>% mutate(wage=ifelse(wage == ">50K", 1, 0))
341 | ad_gbm <- gbm(wage ~ ., data=adult_gbm,
342 |              distribution="bernoulli",
343 |              n.trees=50000, cv.folds=3, verbose=TRUE)
344 | (best_iter <- gbm.perf(ad_gbm, method="cv"))
345 | 
346 | ad_gbm2 <- gbm.more(ad_gbm, n.new.trees=10000)
347 | (best_iter <- gbm.perf(ad_gbm2, method="cv"))
348 | 
349 | 
350 | png("../../plots/9-12.png", 5.5, 4, units='in', pointsize=9, res=600)
351 | (best_iter <- gbm.perf(ad_gbm2, method="cv"))
352 | dev.off()
353 | 
354 | 
355 | predict(ad_gbm, n.trees=best_iter, newdata=adult_gbm[1:5,], type='response')
356 | 
357 | yhat_gbm <- predict(ad_gbm, n.trees=best_iter, newdata=validation, type='response')
358 | binomial_deviance(y_obs, yhat_gbm)
359 | pred_gbm <- prediction(yhat_gbm, y_obs)
360 | perf_gbm <- performance(pred_gbm, measure="tpr", x.measure="fpr")
361 | performance(pred_gbm, "auc")@y.values[[1]]
362 | 
363 | 
364 | png("../../plots/9-13.png", 5.5, 4, units='in', pointsize=9, res=600)
365 | plot(perf_lm, col='black', main="ROC Curve")
366 | plot(perf_glmnet, add=TRUE, col='blue')
367 | plot(perf_rf, add=TRUE, col='red')
368 | plot(perf_gbm, add=TRUE, col='cyan')
369 | abline(0,1, col='gray')
370 | legend('bottomright', inset=.1,
371 |     legend=c("GLM", "glmnet", "RF", "GBM"),
372 |     col=c('black', 'blue', 'red', 'cyan'), lty=1, lwd=2)
373 | dev.off()
374 | 
375 | 
376 | 
377 | # 9.5. 모형 비교, 최종 모형 선택, 일반화 성능 평가 ----
378 | 
379 | 
380 | # 9.5.2. 모형의 예측확률값의 분포 비교
381 | # exmaple(pairs) 에서 따옴
382 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
383 |   usr <- par("usr"); on.exit(par(usr))
384 |   par(usr = c(0, 1, 0, 1))
385 |   r <- abs(cor(x, y))
386 |   txt <- format(c(r, 0.123456789), digits = digits)[1]
387 |   txt <- paste0(prefix, txt)
388 |   if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
389 |   text(0.5, 0.5, txt, cex = cex.cor * r)
390 | }
391 | 
392 | png("../../plots/9-14.png", 5.5, 4, units='in', pointsize=9, res=600)
393 | pairs(data.frame(y_obs=y_obs,
394 |                 yhat_lm=yhat_lm,
395 |                 yhat_glmnet=c(yhat_glmnet),
396 |               yhat_rf=yhat_rf,
397 |                 yhat_gbm=yhat_gbm),
398 |     lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
399 |     upper.panel = panel.cor)
400 | dev.off()
401 | 
402 | 
403 | # 9.5.3. 테스트셋을 이용한 일반화능력 계산
404 | y_obs_test <- ifelse(test$wage == ">50K", 1, 0)
405 | yhat_gbm_test <- predict(ad_gbm, n.trees=best_iter, newdata=test, type='response')
406 | binomial_deviance(y_obs_test, yhat_gbm_test)
407 | pred_gbm_test <- prediction(yhat_gbm_test, y_obs_test)
408 | performance(pred_gbm_test, "auc")@y.values[[1]]
409 | 
410 | # 9.6.5. 캐럿 (caret) 패키지
411 | install.packages("caret", dependencies = c("Depends", "Suggests"))
412 | 
413 | 
414 | 
415 | # This is for the earlier ROC curve example. ---
416 | {
417 |   png("../../plots/8-1.png", 5.5*1.2, 4*.8, units='in', pointsize=9, res=600)
418 |   opar <- par(mfrow=c(1,2))
419 |   plot(perf_lm, col='black', main="ROC Curve")
420 |   plot(perf_tr, col='blue', add=TRUE)
421 |   abline(0,1, col='gray')
422 |   legend('bottomright', inset=.1,
423 |       legend = c("GLM", "Tree"),
424 |       col=c('black', 'blue'), lty=1, lwd=2)
425 |   plot(perf_lm, col='black', main="ROC Curve")
426 |   plot(perf_glmnet, add=TRUE, col='blue')
427 |   plot(perf_rf, add=TRUE, col='red')
428 |   plot(perf_gbm, add=TRUE, col='cyan')
429 |   abline(0,1, col='gray')
430 |   legend('bottomright', inset=.1,
431 |       legend=c("GLM", "glmnet", "RF", "GBM"),
432 |       col=c('black', 'blue', 'red', 'cyan'), lty=1, lwd=2)
433 |   par(opar)
434 |   dev.off()
435 | }
436 | 


--------------------------------------------------------------------------------
/ch08-classification/adult/adult.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 


--------------------------------------------------------------------------------
/ch08-classification/adult/adult.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "#!/usr/bin/env python\n",
 10 |     "import pandas as pd\n",
 11 |     "import numpy as np\n",
 12 |     "import matplotlib.pyplot as plt\n",
 13 |     "import seaborn as sns\n",
 14 |     "import statsmodels.api as sm\n",
 15 |     "import ggplot\n",
 16 |     "\n",
 17 |     "from sklearn import preprocessing\n",
 18 |     "from sklearn.metrics import roc_curve, auc\n",
 19 |     "from IPython.display import display   \n",
 20 |     "\n",
 21 |     "%matplotlib inline"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "metadata": {
 28 |     "scrolled": false
 29 |    },
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "# 1. You can choose to load the data into pandas DataFrame straight from the URL\n",
 33 |     "URL = \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\"\n",
 34 |     "\n",
 35 |     "# 2. or you can also download the dataset manually. In this case change the URL accordingly. \n",
 36 |     "# curl https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data > adult.data\n",
 37 |     "# curl  https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names > adult.names\n",
 38 |     "\n",
 39 |     "adult = pd.read_table(\n",
 40 |     "    URL,\n",
 41 |     "    \n",
 42 |     "    # specify the file encoding\n",
 43 |     "    encoding=\"utf-8\",\n",
 44 |     "    \n",
 45 |     "    # specify the separator in the data\n",
 46 |     "    sep=\",\",      # comma separated values\n",
 47 |     "    \n",
 48 |     "    # ignore spaces after the separator\n",
 49 |     "    skipinitialspace=True,\n",
 50 |     "    index_col=None,\n",
 51 |     "    \n",
 52 |     "    # use manual headers\n",
 53 |     "    header=None,\n",
 54 |     "    names=[\n",
 55 |     "        \"age\", \"workclass\", \"fnlwgt\", \"education\", \n",
 56 |     "        \"education-num\", \"marital-status\", \"occupation\",\n",
 57 |     "        \"relationship\", \"race\", \"sex\", \"capital-gain\", \n",
 58 |     "        \"capital-loss\", \"hours-per-week\", \"native-country\",\n",
 59 |     "        \"wage\"\n",
 60 |     "    ]\n",
 61 |     ")\n",
 62 |     "\n",
 63 |     "pd.set_option(\"display.max_rows\", 10)\n",
 64 |     "display(adult)"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": null,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "adult.info()\n",
 74 |     "\n",
 75 |     "groupby_wage = adult.groupby(\"wage\").describe()\n",
 76 |     "for i in groupby_wage.columns.levels[0]:\n",
 77 |     "    groupby_wage[i].index.name= \"wage / \" + str(i)\n",
 78 |     "    display(groupby_wage[i])\n",
 79 |     "\n",
 80 |     "print(adult[\"wage\"].unique())    "
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "metadata": {},
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "# 8.3.3. 범주형 설명변수에서 문제의 복잡도\n",
 90 |     "print(adult[\"race\"].unique())\n",
 91 |     "print(adult[\"race\"][0:5])\n",
 92 |     "print(adult[\"sex\"].unique())\n",
 93 |     "print(adult[\"sex\"][0:5])\n",
 94 |     "\n",
 95 |     "design_matrix_race = pd.get_dummies(adult[\"race\"], drop_first=True)\n",
 96 |     "design_matrix_sex = pd.get_dummies(adult[\"sex\"], drop_first=True)\n",
 97 |     "intercept = pd.DataFrame(1, index=np.arange(adult.shape[0]), columns=[\"(Intercept)\"])\n",
 98 |     "example_design_matrix = pd.concat([intercept, design_matrix_race, design_matrix_sex, adult[\"age\"]], axis=1)\n",
 99 |     "\n",
100 |     "# original matrix\n",
101 |     "display(adult[[\"race\", \"sex\", \"age\"]])\n",
102 |     "# example design matrix\n",
103 |     "display(example_design_matrix)\n",
104 |     "\n",
105 |     "# convert each categorical feature using one-hot encoding\n",
106 |     "obj_df = adult.select_dtypes(include=[\"object\"]).drop(\"wage\", axis=1)\n",
107 |     "int_df = adult.select_dtypes(include=[\"int64\"])\n",
108 |     "\n",
109 |     "design_matrix_objs = pd.get_dummies(obj_df, drop_first=True)\n",
110 |     "design_matrix_adult = pd.concat([intercept, design_matrix_objs, int_df], axis=1)\n",
111 |     "display(design_matrix_adult)"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": [
120 |     "# 8.4. 훈련, 검증, 테스트셋의 구분\n",
121 |     "np.random.seed(1709)\n",
122 |     "\n",
123 |     "''' 1. You can do it the easy way\n",
124 |     "\n",
125 |     "from sklearn.model_selection import train_test_split\n",
126 |     "training, test = train_test_split(adult, test_size=0.2)\n",
127 |     "training, validation = train_test_split(training, test_size=0.25)\n",
128 |     "print(training.shape)\n",
129 |     "print(validation.shape)\n",
130 |     "print(test.shape)\n",
131 |     "'''\n",
132 |     "\n",
133 |     "# 2. Or you can split the dataset manually\n",
134 |     "n = adult.shape[0]\n",
135 |     "idx = np.arange(n)\n",
136 |     "np.random.shuffle(idx)\n",
137 |     "\n",
138 |     "training_size = int(n*0.6)\n",
139 |     "validate_size = int(n*0.2)\n",
140 |     "\n",
141 |     "training_idx = idx[:training_size]\n",
142 |     "validate_idx = idx[training_size:training_size+validate_size]\n",
143 |     "test_idx = idx[training_size+validate_size:]\n",
144 |     "\n",
145 |     "training = adult.loc[training_idx]\n",
146 |     "validation = adult.loc[validate_idx]\n",
147 |     "test = adult.loc[test_idx]\n",
148 |     "print(training.shape)\n",
149 |     "print(validation.shape)\n",
150 |     "print(test.shape)"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "metadata": {},
157 |    "outputs": [],
158 |    "source": [
159 |     "# 8.5. 시각화\n",
160 |     "fig1 = plt.figure(figsize=(5, 5))\n",
161 |     "sns.set_style(\"dark\", {'axes.grid' : True})\n",
162 |     "\n",
163 |     "ax1 = fig1.add_subplot(111)\n",
164 |     "ax1.set_xlabel(\"age\")\n",
165 |     "ax1.set_ylabel(\"density\")\n",
166 |     "ax1.set_ylim(0, 0.04)\n",
167 |     "\n",
168 |     "df1 = training[training[\"wage\"] == \"<=50K\"]\n",
169 |     "df2 = training[training[\"wage\"] == \">50K\"]\n",
170 |     "\n",
171 |     "sns.distplot(df1[\"age\"], ax=ax1, hist=False,\n",
172 |     "             kde_kws={\"alpha\": .3, \"color\": \"g\",\n",
173 |     "                      \"shade\": True, \"label\": \"<=50K\"})\n",
174 |     "sns.distplot(df2[\"age\"], ax=ax1, hist=False,\n",
175 |     "             kde_kws={\"alpha\": .3, \"color\": \"b\",\n",
176 |     "                      \"shade\": True, \"label\": \">50K\"})\n",
177 |     "\n",
178 |     "df3 = training.loc[(training[\"race\"] == \"White\") |\n",
179 |     "                   (training[\"race\"] == \"Black\")]\n",
180 |     "\n",
181 |     "g = sns.FacetGrid(df3, row=\"race\", col=\"sex\", hue=\"wage\",\n",
182 |     "                  hue_kws={\"color\": [\"b\", \"g\"]})\n",
183 |     "g.set(ylim=(0, .05))\n",
184 |     "g.map(sns.distplot, \"age\", \"wage\", hist=False,\n",
185 |     "      kde_kws={\"alpha\": .3, \"shade\": True})\n",
186 |     "\n",
187 |     "fig2 = plt.figure(figsize=(10, 10))\n",
188 |     "ax2 = fig2.add_subplot(211)\n",
189 |     "sns.countplot(x=\"education-num\", hue=\"wage\", data=training, ax=ax2)\n",
190 |     "\n",
191 |     "plt.show()"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "# 8.6. 로지스틱 회귀분석\n",
201 |     "\n",
202 |     "# glms cannot interpret strings.\n",
203 |     "# We have to assign each label a numeric id. \n",
204 |     "le = preprocessing.LabelEncoder()\n",
205 |     "le.fit([\"<=50K\", \">50K\"])\n",
206 |     "y = le.transform(training[\"wage\"])\n",
207 |     "X = design_matrix_adult.loc[training_idx]\n",
208 |     "\n",
209 |     "lm = sm.GLM(y, sm.add_constant(X), family=sm.families.Binomial())\n",
210 |     "res = lm.fit()\n",
211 |     "\n",
212 |     "resid_deviance = res.resid_deviance\n",
213 |     "\n",
214 |     "print(\"Deviance Residuals:\", \"\\nMin: {} \\nMedian: {} \\nMax: {}\".format(\n",
215 |     "       resid_deviance.max(), resid_deviance.min(), resid_deviance.median()))\n",
216 |     "\n",
217 |     "display(res.summary())\n",
218 |     "\n",
219 |     "test_X = design_matrix_adult[1:6]\n",
220 |     "res.predict(test_X)"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": null,
226 |    "metadata": {},
227 |    "outputs": [],
228 |    "source": [
229 |     "# 8.6.4. 예측 정확도 지표\n",
230 |     "fig = plt.figure(figsize=(10, 10))\n",
231 |     "ax1 = fig.add_subplot(121)\n",
232 |     "\n",
233 |     "y_obs = le.transform(validation[\"wage\"])\n",
234 |     "yhat_lm = res.predict(design_matrix_adult.loc[validate_idx])\n",
235 |     "df = pd.DataFrame({\"x\": y_obs, \"y\": yhat_lm})\n",
236 |     "sns.boxplot(x=\"x\", y=\"y\", data=df, ax=ax1)\n",
237 |     "\n",
238 |     "ax1.set_xlabel(\"y_obs\")\n",
239 |     "ax1.set_ylabel(\"yhat_lim\")\n",
240 |     "\n",
241 |     "ax2 = fig.add_subplot(122)\n",
242 |     "ax2.set_ylim(0, 8)\n",
243 |     "\n",
244 |     "sns.distplot(df[df[\"x\"] == 1][\"y\"], ax=ax2, hist=False,\n",
245 |     "             kde_kws={\"alpha\": .3, \"color\": \"b\",\n",
246 |     "                      \"shade\": True, \"label\": \"1\"})\n",
247 |     "sns.distplot(df[df[\"x\"] == 0][\"y\"], ax=ax2, hist=False,\n",
248 |     "             kde_kws={\"alpha\": .3, \"color\": \"g\",\n",
249 |     "                      \"shade\": True, \"label\": \"0\"})\n",
250 |     "\n",
251 |     "print(res.deviance)\n",
252 |     "\n",
253 |     "plt.show()"
254 |    ]
255 |   },
256 |   {
257 |    "cell_type": "code",
258 |    "execution_count": null,
259 |    "metadata": {},
260 |    "outputs": [],
261 |    "source": [
262 |     "fig = plt.figure(figsize=(10, 5))\n",
263 |     "ax = fig.add_subplot(111)\n",
264 |     "ax.set_title(\"ROC curve for GLM\")\n",
265 |     "ax.set_xlabel(\"False positive rate\")\n",
266 |     "ax.set_ylabel(\"True positive rate\")\n",
267 |     "\n",
268 |     "fpr, tpr, _ = roc_curve(y_obs, yhat_lm)\n",
269 |     "plt.plot(fpr, tpr)\n",
270 |     "plt.plot([0, 1], [0, 1], color='navy', linestyle='--')\n",
271 |     "\n",
272 |     "plt.show()\n",
273 |     "\n",
274 |     "print(auc(fpr, tpr))"
275 |    ]
276 |   }
277 |  ],
278 |  "metadata": {
279 |   "kernelspec": {
280 |    "display_name": "Python 3",
281 |    "language": "python",
282 |    "name": "python3"
283 |   },
284 |   "language_info": {
285 |    "codemirror_mode": {
286 |     "name": "ipython",
287 |     "version": 3
288 |    },
289 |    "file_extension": ".py",
290 |    "mimetype": "text/x-python",
291 |    "name": "python",
292 |    "nbconvert_exporter": "python",
293 |    "pygments_lexer": "ipython3",
294 |    "version": "3.6.1"
295 |   }
296 |  },
297 |  "nbformat": 4,
298 |  "nbformat_minor": 2
299 | }
300 | 


--------------------------------------------------------------------------------
/ch08-classification/breast-cancer/breast-cancer-wisconsin.R:
--------------------------------------------------------------------------------
  1 | # 빅데이터 분별분석. 암 예측.
  2 | #
  3 | if (!file.exists("breast-cancer-wisconsin.data")){
  4 |   system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data > breast-cancer-wisconsin.data')
  5 |   system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names > breast-cancer-wisconsin.names')
  6 | }
  7 | 
  8 | rmse <- function(yi, yhat_i){
  9 |   sqrt(mean((yi - yhat_i)^2))
 10 | }
 11 | 
 12 | binomial_deviance <- function(y_obs, yhat){
 13 |   epsilon = 0.0001
 14 |   yhat = ifelse(yhat < epsilon, epsilon, yhat)
 15 |   yhat = ifelse(yhat > 1-epsilon, 1-epsilon, yhat)
 16 |   a = ifelse(y_obs==0, 0, y_obs * log(y_obs/yhat))
 17 |   b = ifelse(y_obs==1, 0, (1-y_obs) * log((1-y_obs)/(1-yhat)))
 18 |   return(2*sum(a + b))
 19 | }
 20 | 
 21 | 
 22 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
 23 |   usr <- par("usr"); on.exit(par(usr))
 24 |   par(usr = c(0, 1, 0, 1))
 25 |   r <- abs(cor(x, y))
 26 |   txt <- format(c(r, 0.123456789), digits = digits)[1]
 27 |   txt <- paste0(prefix, txt)
 28 |   if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
 29 |   text(0.5, 0.5, txt, cex = cex.cor * r)
 30 | }
 31 | 
 32 | 
 33 | 
 34 | library(dplyr)
 35 | library(ggplot2)
 36 | library(MASS)
 37 | library(glmnet)
 38 | library(randomForest)
 39 | library(gbm)
 40 | library(rpart)
 41 | library(boot)
 42 | library(data.table)
 43 | library(ROCR)
 44 | library(gridExtra)
 45 | 
 46 | data <- tbl_df(read.table("breast-cancer-wisconsin.data", strip.white = TRUE,
 47 |                           sep=",", header = FALSE, na.strings = '?'))
 48 | names(data) <- c('id', 'thickness', 'unif_cell_size', 'unif_cell_shape',
 49 |                  'marginal_adhesion', 'cell_size', 'bare_nuclei',
 50 |                  'bland_cromatin', 'normal_nucleoli', 'mitoses', 'class')
 51 | 
 52 | glimpse(data)
 53 | 
 54 | # 1. 결측치 처리
 55 | data$bare_nuclei[is.na(data$bare_nuclei)] <- median(data$bare_nuclei, na.rm = TRUE)
 56 | # 2. id 변수 제거
 57 | data <- data %>% dplyr::select(-id)
 58 | # 3. class 변수를 인자 변수로 변환
 59 | data$class <- factor(ifelse(data$class == 2, 0, 1))
 60 | 
 61 | glimpse(data)
 62 | 
 63 | 
 64 | summary(data)
 65 | 
 66 | pairs(data %>% sample_n(min(1000, nrow(data))),
 67 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
 68 |       upper.panel = panel.cor)
 69 | 
 70 | library(ggplot2)
 71 | library(dplyr)
 72 | library(gridExtra)
 73 | p1 <- data %>% ggplot(aes(class)) + geom_bar()
 74 | p2 <- data %>% ggplot(aes(class, unif_cell_size)) +
 75 |   geom_jitter(col='gray') +
 76 |   geom_boxplot(alpha=.5)
 77 | p3 <- data %>% ggplot(aes(class, bare_nuclei)) +
 78 |   geom_jitter(col='gray') +
 79 |   geom_boxplot(alpha=.5)
 80 | p4 <- data %>% ggplot(aes(unif_cell_size, bare_nuclei)) +
 81 |   geom_jitter(col='gray') + geom_smooth()
 82 | grid.arrange(p1, p2, p3, p4, ncol=2)
 83 | 
 84 | 
 85 | # 트래인셋과 테스트셋의 구분
 86 | set.seed(1606)
 87 | n <- nrow(data)
 88 | idx <- 1:n
 89 | training_idx <- sample(idx, n * .60)
 90 | idx <- setdiff(idx, training_idx)
 91 | validate_idx <- sample(idx, n * .20)
 92 | test_idx <- setdiff(idx, validate_idx)
 93 | training <- data[training_idx,]
 94 | validation <- data[validate_idx,]
 95 | test <- data[test_idx,]
 96 | 
 97 | 
 98 | #-----------------
 99 | # 로지스틱 회귀모형
100 | data_lm_full <- glm(class ~ ., data=training, family=binomial)
101 | summary(data_lm_full)
102 | 
103 | predict(data_lm_full, newdata = data[1:5,])
104 | 
105 | # 선형회귀모형에서 변수선택
106 | data_lm_full_2 <- lm(class ~ .^2, data=training)
107 | summary(data_lm_full_2)
108 | 
109 | length(coef(data_lm_full_2))
110 | 
111 | library(MASS)
112 | data_step <- stepAIC(data_lm_full,
113 |                      scope = list(upper = ~ .^2, lower = ~1))
114 | 
115 | data_step
116 | anova(data_step)
117 | summary(data_step)
118 | length(coef(data_step))
119 | 
120 | 
121 | # 모형평가
122 | y_obs <- validation$class
123 | yhat_lm <- predict(data_lm_full, newdata=validation)
124 | yhat_lm_2 <- predict(data_lm_full_2, newdata=validation)
125 | yhat_step <- predict(data_step, newdata=validation)
126 | rmse(y_obs, yhat_lm)
127 | rmse(y_obs, yhat_lm_2)
128 | rmse(y_obs, yhat_step)
129 | 
130 | library(ROCR)
131 | pred_lm <- prediction(yhat_lm, y_obs)
132 | performance(pred_lm, "auc")@y.values[[1]]
133 | binomial_deviance(y_obs, yhat_glmnet)
134 | 
135 | #-----------------
136 | # 라쏘 모형 적합
137 | # xx <- model.matrix(class ~ .^2-1, data)
138 | xx <- model.matrix(class ~ .-1, data)
139 | x <- xx[training_idx, ]
140 | y <- as.numeric(training$class)
141 | glimpse(x)
142 | 
143 | data_cvfit <- cv.glmnet(x, y, family = "binomial")
144 | plot(data_cvfit)
145 | 
146 | 
147 | coef(data_cvfit, s = c("lambda.1se"))
148 | coef(data_cvfit, s = c("lambda.min"))
149 | 
150 | 
151 | predict.cv.glmnet(data_cvfit, s="lambda.min", newx = x[1:5,])
152 | 
153 | 
154 | 
155 | 
156 | predict(ad_cvfit, s="lambda.1se", newx = x[1:5,], type='response')
157 | 
158 | y_obs <- as.numeric(validation$class)
159 | yhat_glmnet <- predict(ad_cvfit, s="lambda.1se", newx=xx[validate_idx,], type='response')
160 | yhat_glmnet <- yhat_glmnet[,1] # change to a vector from [n*1] matrix
161 | binomial_deviance(y_obs, yhat_glmnet)
162 | 
163 | 
164 | pred_glmnet <- prediction(yhat_glmnet, y_obs)
165 | performance(pred_glmnet, "auc")@y.values[[1]]
166 | 
167 | 
168 | perf_lm <- performance(pred_lm, measure = "tpr", x.measure = "fpr")
169 | perf_glmnet <- performance(pred_glmnet, measure="tpr", x.measure="fpr")
170 | plot(perf_lm, col='black', main="ROC Curve for GLM")
171 | abline(0,1)
172 | 
173 | 
174 | plot(perf_lm, col='black', main="ROC Curve")
175 | plot(perf_glmnet, col='blue', add=TRUE)
176 | abline(0,1)
177 | legend('bottomright', inset=.1,
178 |     legend=c("GLM", "glmnet"),
179 |     col=c('black', 'blue'), lty=1, lwd=2)
180 | 
181 | 
182 | 
183 | yhat_glmnet <- predict(data_cvfit, s="lambda.min", newx=xx[validate_idx,])
184 | yhat_glmnet <- yhat_glmnet[,1] # change to a vector from [n*1] matrix
185 | rmse(y_obs, yhat_glmnet)
186 | 
187 | #-----------------
188 | # 나무모형
189 | data_tr <- rpart(class ~ ., data = training)
190 | data_tr
191 | 
192 | printcp(data_tr)
193 | summary(data_tr)
194 | 
195 | opar <- par(mfrow = c(1,1), xpd = NA)
196 | plot(data_tr)
197 | text(data_tr, use.n = TRUE)
198 | par(opar)
199 | 
200 | 
201 | yhat_tr <- predict(data_tr, validation)
202 | rmse(y_obs, yhat_tr)
203 | 
204 | 
205 | #-----------------
206 | # 랜덤포레스트
207 | set.seed(1607)
208 | data_rf <- randomForest(class ~ ., training)
209 | data_rf
210 | 
211 | opar <- par(mfrow=c(1,2))
212 | plot(data_rf)
213 | varImpPlot(data_rf)
214 | par(opar)
215 | 
216 | 
217 | yhat_rf <- predict(data_rf, newdata=validation)
218 | rmse(y_obs, yhat_rf)
219 | 
220 | 
221 | #-----------------
222 | # 부스팅
223 | set.seed(1607)
224 | data_gbm <- gbm(class ~ ., data=training,
225 |                 n.trees=40000, cv.folds=3, verbose = TRUE)
226 | (best_iter = gbm.perf(data_gbm, method="cv"))
227 | 
228 | yhat_gbm <- predict(data_gbm, n.trees=best_iter, newdata=validation)
229 | rmse(y_obs, yhat_gbm)
230 | 
231 | 
232 | # 최종 모형선택과  테스트셋 오차계산
233 | data.frame(lm = rmse(y_obs, yhat_step),
234 |            glmnet = rmse(y_obs, yhat_glmnet),
235 |            rf = rmse(y_obs, yhat_rf),
236 |            gbm = rmse(y_obs, yhat_gbm)) %>%
237 |   reshape2::melt(value.name = 'rmse', variable.name = 'method')
238 | 
239 | rmse(test$class, predict(data_rf, newdata = test))
240 | 
241 | 
242 | # 회귀분석의 오차의 시각화
243 | boxplot(list(lm = y_obs-yhat_step,
244 |              glmnet = y_obs-yhat_glmnet,
245 |              rf = y_obs-yhat_rf,
246 |              gbm = y_obs-yhat_gbm), ylab="Error in Validation Set")
247 | abline(h=0, lty=2, col='blue')
248 | 
249 | 
250 | pairs(data.frame(y_obs=y_obs,
251 |                  yhat_lm=yhat_step,
252 |                  yhat_glmnet=c(yhat_glmnet),
253 |                  yhat_rf=yhat_rf,
254 |                  yhat_gbm=yhat_gbm),
255 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
256 |       upper.panel = panel.cor)
257 | 
258 | 


--------------------------------------------------------------------------------
/ch08-classification/breast-cancer/breast-cancer.R:
--------------------------------------------------------------------------------
  1 | # 빅데이터 분별분석. 암 예측.
  2 | #
  3 | if (!file.exists("wdbc.data")){
  4 |   system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data > wdbc.data')
  5 |   system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.names > wdbc.names')
  6 | }
  7 | 
  8 | rmse <- function(yi, yhat_i){
  9 |   sqrt(mean((yi - yhat_i)^2))
 10 | }
 11 | 
 12 | binomial_deviance <- function(y_obs, yhat){
 13 |   epsilon = 0.0001
 14 |   yhat = ifelse(yhat < epsilon, epsilon, yhat)
 15 |   yhat = ifelse(yhat > 1-epsilon, 1-epsilon, yhat)
 16 |   a = ifelse(y_obs==0, 0, y_obs * log(y_obs/yhat))
 17 |   b = ifelse(y_obs==1, 0, (1-y_obs) * log((1-y_obs)/(1-yhat)))
 18 |   return(2*sum(a + b))
 19 | }
 20 | 
 21 | 
 22 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
 23 |   usr <- par("usr"); on.exit(par(usr))
 24 |   par(usr = c(0, 1, 0, 1))
 25 |   r <- abs(cor(x, y))
 26 |   txt <- format(c(r, 0.123456789), digits = digits)[1]
 27 |   txt <- paste0(prefix, txt)
 28 |   if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
 29 |   text(0.5, 0.5, txt, cex = cex.cor * r)
 30 | }
 31 | 
 32 | 
 33 | 
 34 | library(tidyverse)
 35 | library(gridExtra)
 36 | library(MASS)
 37 | library(glmnet)
 38 | library(randomForest)
 39 | library(gbm)
 40 | library(rpart)
 41 | library(boot)
 42 | library(data.table)
 43 | library(ROCR)
 44 | 
 45 | data <- tbl_df(read.table("wdbc.data", strip.white = TRUE,
 46 |                           sep=",", header = FALSE))
 47 | feature_names <- c('radius', 'texture', 'perimeter', 'area', 'smoothness',
 48 |                    'compactness', 'concavity', 'concave_points', 'symmetry', 'fractal_dim')
 49 | names(data) <-
 50 |   c('id', 'class',
 51 |     paste0('mean_', feature_names),
 52 |     paste0('se_', feature_names),
 53 |     paste0('worst_', feature_names))
 54 | 
 55 | glimpse(data)
 56 | 
 57 | 
 58 | # 1. id 변수 제거
 59 | data <- data %>% dplyr::select(-id)
 60 | # 2. class 변수를 인자 변수로 변환
 61 | data$class <- factor(ifelse(data$class == 'B', 0, 1))
 62 | 
 63 | glimpse(data)
 64 | 
 65 | summary(data)
 66 | 
 67 | png("../../plots/10-1.png", 5.5*1.2, 4*1.2, units='in', pointsize=9, res=600)
 68 | pairs(data %>% dplyr::select(class, starts_with('mean_')) %>%
 69 |         sample_n(min(1000, nrow(data))),
 70 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
 71 |       upper.panel = panel.cor)
 72 | dev.off()
 73 | 
 74 | pairs(data %>% dplyr::select(class, starts_with('se_')) %>%
 75 |         sample_n(min(1000, nrow(data))),
 76 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
 77 |       upper.panel = panel.cor)
 78 | 
 79 | pairs(data %>% dplyr::select(class, starts_with('worst_')) %>%
 80 |         sample_n(min(1000, nrow(data))),
 81 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
 82 |       upper.panel = panel.cor)
 83 | 
 84 | library(ggplot2)
 85 | library(dplyr)
 86 | library(gridExtra)
 87 | p1 <- data %>% ggplot(aes(class)) + geom_bar()
 88 | p2 <- data %>% ggplot(aes(class, mean_concave_points)) +
 89 |   geom_jitter(col='gray') +
 90 |   geom_boxplot(alpha=.5)
 91 | p3 <- data %>% ggplot(aes(class, mean_radius)) +
 92 |   geom_jitter(col='gray') +
 93 |   geom_boxplot(alpha=.5)
 94 | p4 <- data %>% ggplot(aes(mean_concave_points, mean_radius)) +
 95 |   geom_jitter(col='gray') + geom_smooth()
 96 | grid.arrange(p1, p2, p3, p4, ncol=2)
 97 | 
 98 | g <- arrangeGrob(p1, p2, p3, p4, ncol=2)
 99 | ggsave("../../plots/10-2.png", g, width=5.5*1.2, height=4*1.2, units='in', dpi=600)
100 | 
101 | 
102 | # 트래인셋과 테스트셋의 구분
103 | set.seed(1606)
104 | n <- nrow(data)
105 | idx <- 1:n
106 | training_idx <- sample(idx, n * .60)
107 | idx <- setdiff(idx, training_idx)
108 | validate_idx <- sample(idx, n * .20)
109 | test_idx <- setdiff(idx, validate_idx)
110 | training <- data[training_idx,]
111 | validation <- data[validate_idx,]
112 | test <- data[test_idx,]
113 | 
114 | 
115 | #-----------------
116 | # 로지스틱 회귀모형
117 | data_lm_full <- glm(class ~ ., data=training, family=binomial)
118 | summary(data_lm_full)
119 | anova(data_lm_full)
120 | 
121 | predict(data_lm_full, newdata = data[1:5,], type='response')
122 | 
123 | table(y_obs, yhat_lm)
124 | 
125 | # 모형평가
126 | y_obs <- as.numeric(as.character(validation$class))
127 | yhat_lm <- predict(data_lm_full, newdata = validation, type='response')
128 | pred_lm <- prediction(yhat_lm, y_obs)
129 | performance(pred_lm, "auc")@y.values[[1]]
130 | binomial_deviance(y_obs, yhat_lm)
131 | 
132 | #-----------------
133 | # 라쏘 모형 적합
134 | # xx <- model.matrix(class ~ .^2-1, data)
135 | xx <- model.matrix(class ~ .-1, data)
136 | x <- xx[training_idx, ]
137 | y <- as.numeric(as.character(training$class))
138 | glimpse(x)
139 | 
140 | data_cvfit <- cv.glmnet(x, y, family = "binomial")
141 | plot(data_cvfit)
142 | 
143 | png("../../plots/10-3.png", 5.5, 4, units='in', pointsize=9, res=600)
144 | plot(data_cvfit)
145 | dev.off()
146 | 
147 | coef(data_cvfit, s = c("lambda.1se"))
148 | coef(data_cvfit, s = c("lambda.min"))
149 | 
150 | 
151 | predict.cv.glmnet(data_cvfit, s="lambda.min", newx = x[1:5,], type='response')
152 | 
153 | yhat_glmnet <- predict(data_cvfit, s="lambda.min", newx=xx[validate_idx,], type='response')
154 | yhat_glmnet <- yhat_glmnet[,1] # change to a vector from [n*1] matrix
155 | pred_glmnet <- prediction(yhat_glmnet, y_obs)
156 | performance(pred_glmnet, "auc")@y.values[[1]]
157 | binomial_deviance(y_obs, yhat_glmnet)
158 | 
159 | 
160 | #-----------------
161 | # 나무모형
162 | data_tr <- rpart(class ~ ., data = training)
163 | data_tr
164 | 
165 | printcp(data_tr)
166 | summary(data_tr)
167 | 
168 | png("../../plots/10-4.png", 5.5, 4, units='in', pointsize=9, res=600)
169 | opar <- par(mfrow = c(1,1), xpd = NA)
170 | plot(data_tr)
171 | text(data_tr, use.n = TRUE)
172 | par(opar)
173 | dev.off()
174 | 
175 | 
176 | yhat_tr <- predict(data_tr, validation)
177 | yhat_tr <- yhat_tr[,"1"]
178 | pred_tr <- prediction(yhat_tr, y_obs)
179 | performance(pred_tr, "auc")@y.values[[1]]
180 | binomial_deviance(y_obs, yhat_tr)
181 | 
182 | 
183 | #-----------------
184 | # 랜덤포레스트
185 | set.seed(1607)
186 | data_rf <- randomForest(class ~ ., training)
187 | data_rf
188 | 
189 | png("../../plots/10-5.png", 5.5*1.5, 4*1.2, units='in', pointsize=9, res=600)
190 | opar <- par(mfrow=c(1,2))
191 | plot(data_rf)
192 | varImpPlot(data_rf)
193 | par(opar)
194 | dev.off()
195 | 
196 | 
197 | yhat_rf <- predict(data_rf, newdata=validation, type='prob')[,'1']
198 | pred_rf <- prediction(yhat_rf, y_obs)
199 | performance(pred_rf, "auc")@y.values[[1]]
200 | binomial_deviance(y_obs, yhat_rf)
201 | 
202 | 
203 | #-----------------
204 | # 부스팅
205 | set.seed(1607)
206 | data_for_gbm <-
207 |   training %>%
208 |   mutate(class=as.numeric(as.character(class)))
209 | data_gbm <- gbm(class ~ ., data=data_for_gbm, distribution="bernoulli",
210 |               n.trees=50000, cv.folds=3, verbose=TRUE)
211 | (best_iter = gbm.perf(data_gbm, method="cv"))
212 | 
213 | png("../../plots/10-6.png", 5.5, 4, units='in', pointsize=9, res=600)
214 | (best_iter = gbm.perf(data_gbm, method="cv"))
215 | dev.off()
216 | 
217 | yhat_gbm <- predict(data_gbm, n.trees=best_iter, newdata=validation, type='response')
218 | pred_gbm <- prediction(yhat_gbm, y_obs)
219 | performance(pred_gbm, "auc")@y.values[[1]]
220 | binomial_deviance(y_obs, yhat_gbm)
221 | 
222 | #------------------
223 | # 최종 모형선택과  테스트셋 오차계산
224 | data.frame(method=c('lm', 'glmnet', 'rf', 'gbm'),
225 |            auc = c(performance(pred_lm, "auc")@y.values[[1]],
226 |                    performance(pred_glmnet, "auc")@y.values[[1]],
227 |                    performance(pred_rf, "auc")@y.values[[1]],
228 |                    performance(pred_gbm, "auc")@y.values[[1]]),
229 |            bin_dev = c(binomial_deviance(y_obs, yhat_lm),
230 |                        binomial_deviance(y_obs, yhat_glmnet),
231 |                        binomial_deviance(y_obs, yhat_rf),
232 |                        binomial_deviance(y_obs, yhat_gbm)))
233 | 
234 | # glmnet이 최종 승리자:
235 | y_obs_test <- as.numeric(as.character(test$class))
236 | yhat_glmnet_test <- predict(data_cvfit, s="lambda.min", newx=xx[test_idx,], type='response')
237 | yhat_glmnet_test <- yhat_glmnet_test[,1]
238 | pred_glmnet_test <- prediction(yhat_glmnet_test, y_obs_test)
239 | performance(pred_glmnet_test, "auc")@y.values[[1]]
240 | binomial_deviance(y_obs_test, yhat_glmnet_test)
241 | 
242 | # 예측값들의 상관관계
243 | 
244 | 
245 | #-----------
246 | # ROC 커브
247 | perf_lm <- performance(pred_lm, measure = "tpr", x.measure = "fpr")
248 | perf_glmnet <- performance(pred_glmnet, measure="tpr", x.measure="fpr")
249 | perf_rf <- performance(pred_rf, measure="tpr", x.measure="fpr")
250 | perf_gbm <- performance(pred_gbm, measure="tpr", x.measure="fpr")
251 | 
252 | 
253 | png("../../plots/10-7.png", 5.5, 4, units='in', pointsize=9, res=600)
254 | plot(perf_lm, col='black', main="ROC Curve")
255 | plot(perf_glmnet, add=TRUE, col='blue')
256 | plot(perf_rf, add=TRUE, col='red')
257 | plot(perf_gbm, add=TRUE, col='cyan')
258 | abline(0,1)
259 | legend('bottomright', inset=.1,
260 |     legend=c("GLM", "glmnet", "RF", "GBM"),
261 |     col=c('black', 'blue', 'red', 'cyan'), lty=1, lwd=2)
262 | dev.off()
263 | 
264 | 
265 | png("../../plots/10-8.png", 5.5, 4, units='in', pointsize=9, res=600)
266 | pairs(data.frame(y_obs=y_obs,
267 |                  yhat_lm=yhat_lm,
268 |                  yhat_glmnet=c(yhat_glmnet),
269 |                  yhat_rf=yhat_rf,
270 |                  yhat_gbm=yhat_gbm),
271 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
272 |       upper.panel = panel.cor)
273 | dev.off()
274 | 


--------------------------------------------------------------------------------
/ch08-classification/breast-cancer/breast-cancer.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 


--------------------------------------------------------------------------------
/ch08-classification/spam-detection/spam-detection.R:
--------------------------------------------------------------------------------
  1 | # 빅데이터 분별분석. 스팸 메일 예측
  2 | #
  3 | if (!file.exists("spambase.data")){
  4 |   system('curl https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data > spambase.data')
  5 |   system('curl https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.names > spambase.names')
  6 | }
  7 | 
  8 | binomial_deviance <- function(y_obs, yhat){
  9 |   epsilon = 0.0001
 10 |   yhat = ifelse(yhat < epsilon, epsilon, yhat)
 11 |   yhat = ifelse(yhat > 1-epsilon, 1-epsilon, yhat)
 12 |   a = ifelse(y_obs==0, 0, y_obs * log(y_obs/yhat))
 13 |   b = ifelse(y_obs==1, 0, (1-y_obs) * log((1-y_obs)/(1-yhat)))
 14 |   return(2*sum(a + b))
 15 | }
 16 | 
 17 | 
 18 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
 19 |   usr <- par("usr"); on.exit(par(usr))
 20 |   par(usr = c(0, 1, 0, 1))
 21 |   r <- abs(cor(x, y))
 22 |   txt <- format(c(r, 0.123456789), digits = digits)[1]
 23 |   txt <- paste0(prefix, txt)
 24 |   if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
 25 |   text(0.5, 0.5, txt, cex = cex.cor * r)
 26 | }
 27 | 
 28 | 
 29 | 
 30 | library(dplyr)
 31 | library(ggplot2)
 32 | library(MASS)
 33 | library(glmnet)
 34 | library(randomForest)
 35 | library(gbm)
 36 | library(rpart)
 37 | library(boot)
 38 | library(data.table)
 39 | library(ROCR)
 40 | library(gridExtra)
 41 | 
 42 | data <- tbl_df(read.table("spambase.data", strip.white = TRUE,
 43 |                           sep=",", header = FALSE))
 44 | names(data) <-
 45 |   c('word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our',
 46 |     'word_freq_over', 'word_freq_remove', 'word_freq_internet', 'word_freq_order', 'word_freq_mail',
 47 |     'word_freq_receive', 'word_freq_will', 'word_freq_people', 'word_freq_report', 'word_freq_addresses',
 48 |     'word_freq_free', 'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
 49 |     'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp',
 50 |     'word_freq_hpl', 'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs',
 51 |     'word_freq_telnet', 'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85',
 52 |     'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
 53 |     'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project', 'word_freq_re',
 54 |     'word_freq_edu', 'word_freq_table', 'word_freq_conference', 'char_freq_;', 'char_freq_(',
 55 |     'char_freq_[', 'char_freq_!', 'char_freq_$', 'char_freq_#', 'capital_run_length_average',
 56 |     'capital_run_length_longest', 'capital_run_length_total',
 57 |     # 'spam'
 58 |     'class'
 59 |   )
 60 | names(data)[58] <- 'class'
 61 | data$class <- factor(data$class)
 62 | 
 63 | glimpse(data)
 64 | 
 65 | summary(data)
 66 | 
 67 | png("../../plots/11-1.png", 5.5*1.2, 4*1.2, units='in', pointsize=10, res=600)
 68 | set.seed(1610)
 69 | pairs(data %>% dplyr::select(1:10, 58) %>%
 70 |         sample_n(min(1000, nrow(data))),
 71 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
 72 |       upper.panel = panel.cor)
 73 | dev.off()
 74 | 
 75 | 
 76 | png("../../plots/11-2.png", 5.5*1.2, 4*1.2, units='in', pointsize=10, res=600)
 77 | set.seed(1610)
 78 | pairs(data %>% dplyr::select(48:57, 58) %>%
 79 |         sample_n(min(1000, nrow(data))),
 80 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
 81 |       upper.panel = panel.cor)
 82 | dev.off()
 83 | 
 84 | #
 85 | tmp <- as.data.frame(cor(data[,-58], as.numeric(data$class)))
 86 | tmp <- tmp %>% rename(cor=V1)
 87 | tmp$var <- rownames(tmp)
 88 | tmp %>%
 89 |   ggplot(aes(reorder(var, cor), cor)) +
 90 |   geom_point() +
 91 |   coord_flip()
 92 | ggsave("../../plots/11-3.png", width=5.5*1.8, height=4*1.8, units='in', dpi=400)
 93 | 
 94 | 
 95 | library(ggplot2)
 96 | library(dplyr)
 97 | library(gridExtra)
 98 | p1 <- data %>% ggplot(aes(class)) + geom_bar()
 99 | p2 <- data %>% ggplot(aes(class, `char_freq_$`)) +
100 |   geom_jitter(col='gray') +
101 |   geom_boxplot(alpha=.5) +
102 |   scale_y_sqrt()
103 | p3 <- data %>% ggplot(aes(`char_freq_$`, group=class, fill=class)) +
104 |   geom_density(alpha=.5) +
105 |   scale_x_sqrt() + scale_y_sqrt()
106 | p4 <- data %>% ggplot(aes(class, capital_run_length_longest)) +
107 |   geom_jitter(col='gray') +
108 |   geom_boxplot(alpha=.5) +
109 |   scale_y_log10()
110 | grid.arrange(p1, p2, p3, p4, ncol=2)
111 | 
112 | g <- arrangeGrob(p1, p2, p3, p4, ncol=2)
113 | ggsave("../../plots/11-4.png", g, width=5.5, height=4, units='in', dpi=600)
114 | 
115 | ?'`'
116 | 
117 | 
118 | # 변수명의 특수문자 처리
119 | 
120 | old_names <- names(data)
121 | new_names <- make.names(names(data), unique = TRUE)
122 | cbind(old_names, new_names) [old_names!=new_names, ]
123 | 
124 | names(data) <- new_names
125 | 
126 | # 트래인셋과 테스트셋의 구분
127 | set.seed(1606)
128 | n <- nrow(data)
129 | idx <- 1:n
130 | training_idx <- sample(idx, n * .60)
131 | idx <- setdiff(idx, training_idx)
132 | validate_idx <- sample(idx, n * .20)
133 | test_idx <- setdiff(idx, validate_idx)
134 | training <- data[training_idx,]
135 | validation <- data[validate_idx,]
136 | test <- data[test_idx,]
137 | 
138 | 
139 | #-----------------
140 | # 로지스틱 회귀모형
141 | data_lm_full <- glm(class ~ ., data=training, family=binomial)
142 | summary(data_lm_full)
143 | 
144 | predict(data_lm_full, newdata = data[1:5,], type='response')
145 | 
146 | # 모형평가
147 | y_obs <- as.numeric(as.character(validation$class))
148 | yhat_lm <- predict(data_lm_full, newdata = validation, type='response')
149 | pred_lm <- prediction(yhat_lm, y_obs)
150 | performance(pred_lm, "auc")@y.values[[1]]
151 | binomial_deviance(y_obs, yhat_lm)
152 | 
153 | #-----------------
154 | # 라쏘 모형 적합
155 | # xx <- model.matrix(class ~ .^2-1, data)
156 | xx <- model.matrix(class ~ .-1, data)
157 | x <- xx[training_idx, ]
158 | y <- as.numeric(as.character(training$class))
159 | glimpse(x)
160 | 
161 | data_cvfit <- cv.glmnet(x, y, family = "binomial")
162 | plot(data_cvfit)
163 | 
164 | png("../../plots/11-5.png", 5.5, 4, units='in', pointsize=9, res=600)
165 | plot(data_cvfit)
166 | dev.off()
167 | 
168 | coef(data_cvfit, s = c("lambda.1se"))
169 | coef(data_cvfit, s = c("lambda.min"))
170 | 
171 | 
172 | predict.cv.glmnet(data_cvfit, s="lambda.min", newx = x[1:5,], type='response')
173 | 
174 | yhat_glmnet <- predict(data_cvfit, s="lambda.min", newx=xx[validate_idx,], type='response')
175 | yhat_glmnet <- yhat_glmnet[,1] # change to a vector from [n*1] matrix
176 | pred_glmnet <- prediction(yhat_glmnet, y_obs)
177 | performance(pred_glmnet, "auc")@y.values[[1]]
178 | binomial_deviance(y_obs, yhat_glmnet)
179 | 
180 | 
181 | #-----------------
182 | # 나무모형
183 | data_tr <- rpart(class ~ ., data = training)
184 | data_tr
185 | 
186 | printcp(data_tr)
187 | summary(data_tr)
188 | 
189 | png("../../plots/11-6.png", 5.5, 4, units='in', pointsize=9, res=600)
190 | opar <- par(mfrow = c(1,1), xpd = NA)
191 | plot(data_tr)
192 | text(data_tr, use.n = TRUE)
193 | par(opar)
194 | dev.off()
195 | 
196 | 
197 | yhat_tr <- predict(data_tr, validation)
198 | yhat_tr <- yhat_tr[,"1"]
199 | pred_tr <- prediction(yhat_tr, y_obs)
200 | performance(pred_tr, "auc")@y.values[[1]]
201 | binomial_deviance(y_obs, yhat_tr)
202 | 
203 | 
204 | #-----------------
205 | # 랜덤포레스트
206 | set.seed(1607)
207 | data_rf <- randomForest(class ~ ., data=training)
208 | data_rf
209 | 
210 | png("../../plots/11-7.png", 5.5*1.5, 4*1.2, units='in', pointsize=8, res=600)
211 | opar <- par(mfrow=c(1,2))
212 | plot(data_rf)
213 | varImpPlot(data_rf)
214 | par(opar)
215 | dev.off()
216 | 
217 | 
218 | yhat_rf <- predict(data_rf, newdata=validation, type='prob')[,'1']
219 | pred_rf <- prediction(yhat_rf, y_obs)
220 | performance(pred_rf, "auc")@y.values[[1]]
221 | binomial_deviance(y_obs, yhat_rf)
222 | 
223 | 
224 | #-----------------
225 | # 부스팅
226 | set.seed(1607)
227 | data_for_gbm <-
228 |   training %>%
229 |   mutate(class=as.numeric(as.character(class)))
230 | data_gbm <- gbm(class ~ ., data=data_for_gbm, distribution="bernoulli",
231 |               n.trees=100000, cv.folds=3, verbose=TRUE)
232 | 
233 | png("../../plots/11-8.png", 5.5, 4, units='in', pointsize=9, res=600)
234 | (best_iter = gbm.perf(data_gbm, method="cv"))
235 | dev.off()
236 | 
237 | yhat_gbm <- predict(data_gbm, n.trees=best_iter, newdata=validation, type='response')
238 | pred_gbm <- prediction(yhat_gbm, y_obs)
239 | performance(pred_gbm, "auc")@y.values[[1]]
240 | binomial_deviance(y_obs, yhat_gbm)
241 | 
242 | #------------------
243 | # 최종 모형선택과  테스트셋 오차계산
244 | data.frame(method=c('lm', 'glmnet', 'rf', 'gbm'),
245 |            auc = c(performance(pred_lm, "auc")@y.values[[1]],
246 |                    performance(pred_glmnet, "auc")@y.values[[1]],
247 |                    performance(pred_rf, "auc")@y.values[[1]],
248 |                    performance(pred_gbm, "auc")@y.values[[1]]),
249 |            bin_dev = c(binomial_deviance(y_obs, yhat_lm),
250 |                        binomial_deviance(y_obs, yhat_glmnet),
251 |                        binomial_deviance(y_obs, yhat_rf),
252 |                        binomial_deviance(y_obs, yhat_gbm)))
253 | 
254 | # glmnet이 최종 승리자인 경우:
255 | y_obs_test <- as.numeric(as.character(test$class))
256 | yhat_glmnet_test <- predict(data_cvfit, s="lambda.min", newx=xx[test_idx,], type='response')
257 | yhat_glmnet_test <- yhat_glmnet_test[,1]
258 | pred_glmnet_test <- prediction(yhat_glmnet_test, y_obs_test)
259 | performance(pred_glmnet_test, "auc")@y.values[[1]]
260 | binomial_deviance(y_obs_test, yhat_glmnet_test)
261 | 
262 | # 랜덤포레스트가 최종 승리자인 경우:
263 | y_obs_test <- as.numeric(as.character(test$class))
264 | yhat_rf_test <- predict(data_rf, newdata=test, type='prob')[,'1']
265 | pred_rf_test <- prediction(yhat_rf_test, y_obs_test)
266 | performance(pred_rf_test, "auc")@y.values[[1]]
267 | binomial_deviance(y_obs_test, yhat_rf_test)
268 | 
269 | 
270 | # 예측값들의 상관관계
271 | pairs(data.frame(y_obs=y_obs,
272 |                  yhat_lm=yhat_lm,
273 |                  yhat_glmnet=c(yhat_glmnet),
274 |                  yhat_rf=yhat_rf,
275 |                  yhat_gbm=yhat_gbm),
276 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
277 |       upper.panel = panel.cor)
278 | 
279 | 
280 | #-----------
281 | # ROC 커브
282 | perf_lm <- performance(pred_lm, measure = "tpr", x.measure = "fpr")
283 | perf_glmnet <- performance(pred_glmnet, measure="tpr", x.measure="fpr")
284 | perf_rf <- performance(pred_rf, measure="tpr", x.measure="fpr")
285 | perf_gbm <- performance(pred_gbm, measure="tpr", x.measure="fpr")
286 | 
287 | png("../../plots/11-9.png", 5.5, 4, units='in', pointsize=9, res=600)
288 | plot(perf_lm, col='black', main="ROC Curve")
289 | plot(perf_glmnet, add=TRUE, col='blue')
290 | plot(perf_rf, add=TRUE, col='red')
291 | plot(perf_gbm, add=TRUE, col='cyan')
292 | abline(0,1)
293 | legend('bottomright', inset=.1,
294 |     legend=c("GLM", "glmnet", "RF", "GBM"),
295 |     col=c('black', 'blue', 'red', 'cyan'), lty=1, lwd=2)
296 | dev.off()
297 | 
298 | 
299 | png("../../plots/11-10.png", 5.5, 4, units='in', pointsize=9, res=600)
300 | pairs(data.frame(y_obs=y_obs,
301 |                  yhat_lm=yhat_lm,
302 |                  yhat_glmnet=c(yhat_glmnet),
303 |                  yhat_rf=yhat_rf,
304 |                  yhat_gbm=yhat_gbm),
305 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
306 |       upper.panel = panel.cor)
307 | dev.off()
308 | 


--------------------------------------------------------------------------------
/ch08-classification/spam-detection/spam-detection.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 


--------------------------------------------------------------------------------
/ch08-classification/spam-detection/spam-detection.md:
--------------------------------------------------------------------------------
 1 | # spam-detection
 2 | 스팸 메일 예측 문제
 3 | 
 4 | 변수들:
 5 | 
 6 |     spam
 7 |     word_freq_make
 8 |     word_freq_address
 9 |     word_freq_all
10 |     word_freq_3d
11 |     word_freq_our
12 |     word_freq_over
13 |     word_freq_remove
14 |     word_freq_internet
15 |     word_freq_order
16 |     word_freq_mail
17 |     word_freq_receive
18 |     word_freq_will
19 |     word_freq_people
20 |     word_freq_report
21 |     word_freq_addresses
22 |     word_freq_free
23 |     word_freq_business
24 |     word_freq_email
25 |     word_freq_you
26 |     word_freq_credit
27 |     word_freq_your
28 |     word_freq_font
29 |     word_freq_000
30 |     word_freq_money
31 |     word_freq_hp
32 |     word_freq_hpl
33 |     word_freq_george
34 |     word_freq_650
35 |     word_freq_lab
36 |     word_freq_labs
37 |     word_freq_telnet
38 |     word_freq_857
39 |     word_freq_data
40 |     word_freq_415
41 |     word_freq_85
42 |     word_freq_technology
43 |     word_freq_1999
44 |     word_freq_parts
45 |     word_freq_pm
46 |     word_freq_direct
47 |     word_freq_cs
48 |     word_freq_meeting
49 |     word_freq_original
50 |     word_freq_project
51 |     word_freq_re
52 |     word_freq_edu
53 |     word_freq_table
54 |     word_freq_conference
55 |     char_freq_;
56 |     char_freq_(
57 |     char_freq_[
58 |     char_freq_!
59 |     char_freq_$
60 |     char_freq_#
61 |     capital_run_length_average
62 |     capital_run_length_longest
63 |     capital_run_length_total
64 | 


--------------------------------------------------------------------------------
/ch12-r-markdown/ch10-r-markdown.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "R Markdown 예제"
 3 | author: "권재명"
 4 | output: html_document
 5 | ---
 6 | 
 7 | ```{r setup, include=FALSE}
 8 | knitr::opts_chunk$set(echo = TRUE)
 9 | ```
10 | 
11 | R 마크다운을 사용하여 R 코드, 코드실행 결과 텍스트, 도표를 포함한 문서를 쉽게 작성할 수 있습니다.
12 | 
13 | ```{r cars, fig.width=4, fig.height=3, message=FALSE}
14 | library(ggplot2)
15 | qplot(speed, dist, data=cars) + 
16 |   geom_smooth()
17 | summary(cars)
18 | ```
19 | 


--------------------------------------------------------------------------------
/ch12-r-markdown/ch10-r-markdown.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 


--------------------------------------------------------------------------------
/ch13-regression/housing/housing.R:
--------------------------------------------------------------------------------
  1 | # 빅데이터 회귀분석. 부동산 가격 예측
  2 | #
  3 | if (!file.exists("housing.data")){
  4 |   system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data > housing.data')
  5 |   system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names > housing.names')
  6 | }
  7 | 
  8 | rmse <- function(yi, yhat_i){
  9 |   sqrt(mean((yi - yhat_i)^2))
 10 | }
 11 | 
 12 | mae <- function(yi, yhat_i){
 13 |   mean(abs(yi - yhat_i))
 14 | }
 15 | 
 16 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
 17 |   usr <- par("usr"); on.exit(par(usr))
 18 |   par(usr = c(0, 1, 0, 1))
 19 |   r <- abs(cor(x, y))
 20 |   txt <- format(c(r, 0.123456789), digits = digits)[1]
 21 |   txt <- paste0(prefix, txt)
 22 |   if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
 23 |   text(0.5, 0.5, txt, cex = cex.cor * r)
 24 | }
 25 | 
 26 | 
 27 | 
 28 | library(dplyr)
 29 | library(ggplot2)
 30 | library(MASS)
 31 | library(glmnet)
 32 | library(randomForest)
 33 | library(gbm)
 34 | library(rpart)
 35 | library(boot)
 36 | library(data.table)
 37 | library(ROCR)
 38 | library(gridExtra)
 39 | 
 40 | data <- tbl_df(read.table("housing.data", strip.white = TRUE))
 41 | names(data) <- c('crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
 42 |                  'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat', 'medv')
 43 | glimpse(data)
 44 | 
 45 | summary(data)
 46 | 
 47 | pairs(data %>% sample_n(min(1000, nrow(data))))
 48 | 
 49 | png("../../plots/13-1.png", 5.5*1.2, 4*1.2, units='in', pointsize=9, res=600)
 50 | pairs(data %>% sample_n(min(1000, nrow(data))),
 51 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
 52 |       upper.panel = panel.cor)
 53 | dev.off()
 54 | 
 55 | 
 56 | # 트래인셋과 테스트셋의 구분
 57 | set.seed(1606)
 58 | n <- nrow(data)
 59 | idx <- 1:n
 60 | training_idx <- sample(idx, n * .60)
 61 | idx <- setdiff(idx, training_idx)
 62 | validate_idx <- sample(idx, n * .20)
 63 | test_idx <- setdiff(idx, validate_idx)
 64 | training <- data[training_idx,]
 65 | validation <- data[validate_idx,]
 66 | test <- data[test_idx,]
 67 | 
 68 | 
 69 | # 선형회귀모형 (linear regression model)
 70 | data_lm_full <- lm(medv ~ ., data=training)
 71 | summary(data_lm_full)
 72 | 
 73 | predict(data_lm_full, newdata = data[1:5,])
 74 | 
 75 | # 선형회귀모형에서 변수선택
 76 | data_lm_full_2 <- lm(medv ~ .^2, data=training)
 77 | summary(data_lm_full_2)
 78 | 
 79 | length(coef(data_lm_full_2))
 80 | 
 81 | library(MASS)
 82 | data_step <- stepAIC(data_lm_full,
 83 |                    scope = list(upper = ~ .^2, lower = ~1))
 84 | 
 85 | data_step
 86 | anova(data_step)
 87 | summary(data_step)
 88 | length(coef(data_step))
 89 | 
 90 | 
 91 | # 모형평가
 92 | y_obs <- validation$medv
 93 | yhat_lm <- predict(data_lm_full, newdata=validation)
 94 | yhat_lm_2 <- predict(data_lm_full_2, newdata=validation)
 95 | yhat_step <- predict(data_step, newdata=validation)
 96 | rmse(y_obs, yhat_lm)
 97 | rmse(y_obs, yhat_lm_2)
 98 | rmse(y_obs, yhat_step)
 99 | 
100 | 
101 | # 라쏘 모형 적합
102 | xx <- model.matrix(medv ~ .^2-1, data)
103 | x <- xx[training_idx, ]
104 | y <- training$medv
105 | glimpse(x)
106 | 
107 | data_cvfit <- cv.glmnet(x, y)
108 | plot(data_cvfit)
109 | 
110 | 
111 | png("../../plots/13-2.png", 5.5, 4, units='in', pointsize=9, res=600)
112 | plot(data_cvfit)
113 | dev.off()
114 | 
115 | coef(data_cvfit, s = c("lambda.1se"))
116 | coef(data_cvfit, s = c("lambda.min"))
117 | 
118 | (tmp <- coef(data_cvfit, s = c("lambda.1se")))
119 | tmp <- tmp[,1]
120 | length(tmp[abs(tmp)>0])
121 | (tmp <- coef(data_cvfit, s = c("lambda.min")))
122 | length(tmp[abs(tmp)>0])
123 | 
124 | predict.cv.glmnet(data_cvfit, s="lambda.min", newx = x[1:5,])
125 | 
126 | y_obs <- validation$medv
127 | yhat_glmnet <- predict(data_cvfit, s="lambda.min", newx=xx[validate_idx,])
128 | yhat_glmnet <- yhat_glmnet[,1] # change to a vector from [n*1] matrix
129 | rmse(y_obs, yhat_glmnet)
130 | 
131 | # 나무모형
132 | data_tr <- rpart(medv ~ ., data = training)
133 | data_tr
134 | 
135 | printcp(data_tr)
136 | summary(data_tr)
137 | 
138 | png("../../plots/13-3.png", 5.5, 4, units='in', pointsize=9, res=600)
139 | opar <- par(mfrow = c(1,1), xpd = NA)
140 | plot(data_tr)
141 | text(data_tr, use.n = TRUE)
142 | par(opar)
143 | dev.off()
144 | 
145 | 
146 | yhat_tr <- predict(data_tr, validation)
147 | rmse(y_obs, yhat_tr)
148 | 
149 | 
150 | # 랜덤포레스트
151 | set.seed(1607)
152 | data_rf <- randomForest(medv ~ ., training)
153 | data_rf
154 | 
155 | png("../../plots/13-4.png", 5.5, 4*.8, units='in', pointsize=9, res=600)
156 | par(mfrow=c(1,2))
157 | plot(data_rf)
158 | varImpPlot(data_rf)
159 | dev.off()
160 | 
161 | 
162 | yhat_rf <- predict(data_rf, newdata=validation)
163 | rmse(y_obs, yhat_rf)
164 | 
165 | 
166 | # 부스팅
167 | set.seed(1607)
168 | data_gbm <- gbm(medv ~ ., data=training,
169 |               n.trees=40000, cv.folds=3, verbose = TRUE)
170 | 
171 | png("../../plots/13-5.png", 5.5, 4, units='in', pointsize=9, res=600)
172 | (best_iter = gbm.perf(data_gbm, method="cv"))
173 | dev.off()
174 | 
175 | yhat_gbm <- predict(data_gbm, n.trees=best_iter, newdata=validation)
176 | rmse(y_obs, yhat_gbm)
177 | 
178 | 
179 | # 최종 모형선택과  테스트셋 오차계산
180 | data.frame(lm = rmse(y_obs, yhat_step),
181 |            glmnet = rmse(y_obs, yhat_glmnet),
182 |            rf = rmse(y_obs, yhat_rf),
183 |            gbm = rmse(y_obs, yhat_gbm)) %>%
184 |   reshape2::melt(value.name = 'rmse', variable.name = 'method')
185 | 
186 | rmse(test$medv, predict(data_rf, newdata = test))
187 | 
188 | 
189 | # 회귀분석의 오차의 시각화
190 | png("../../plots/13-6.png", 5.5, 4, units='in', pointsize=9, res=600)
191 | boxplot(list(lm = y_obs-yhat_step,
192 |              glmnet = y_obs-yhat_glmnet,
193 |              rf = y_obs-yhat_rf,
194 |              gbm = y_obs-yhat_gbm), ylab="Error in Validation Set")
195 | abline(h=0, lty=2, col='blue')
196 | dev.off()
197 | 
198 | 
199 | png("../../plots/13-7.png", 5.5, 4, units='in', pointsize=9, res=600)
200 | pairs(data.frame(y_obs=y_obs,
201 |                  yhat_lm=yhat_step,
202 |                  yhat_glmnet=c(yhat_glmnet),
203 |                  yhat_rf=yhat_rf,
204 |                  yhat_gbm=yhat_gbm),
205 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
206 |       upper.panel = panel.cor)
207 | dev.off()
208 | 


--------------------------------------------------------------------------------
/ch13-regression/housing/housing.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 


--------------------------------------------------------------------------------
/ch13-regression/wine-quality/wine-quality.R:
--------------------------------------------------------------------------------
  1 | # 빅데이터 회귀분석. 와인 품질 예측
  2 | #
  3 | if (!file.exists("winequality-white.csv")){
  4 |   system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv > winequality-red.csv')
  5 |   system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv > winequality-white.csv')
  6 |   system('curl http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names > winequality.names')
  7 | }
  8 | 
  9 | rmse <- function(yi, yhat_i){
 10 |   sqrt(mean((yi - yhat_i)^2))
 11 | }
 12 | 
 13 | mae <- function(yi, yhat_i){
 14 |   mean(abs(yi - yhat_i))
 15 | }
 16 | 
 17 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
 18 |   usr <- par("usr"); on.exit(par(usr))
 19 |   par(usr = c(0, 1, 0, 1))
 20 |   r <- abs(cor(x, y))
 21 |   txt <- format(c(r, 0.123456789), digits = digits)[1]
 22 |   txt <- paste0(prefix, txt)
 23 |   if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
 24 |   text(0.5, 0.5, txt, cex = cex.cor * r)
 25 | }
 26 | 
 27 | 
 28 | 
 29 | library(dplyr)
 30 | library(ggplot2)
 31 | library(MASS)
 32 | library(glmnet)
 33 | library(randomForest)
 34 | library(gbm)
 35 | library(rpart)
 36 | library(boot)
 37 | library(data.table)
 38 | library(ROCR)
 39 | library(gridExtra)
 40 | 
 41 | data <- tbl_df(read.table("winequality-white.csv", strip.white = TRUE,
 42 |                           sep=";", header = TRUE))
 43 | glimpse(data)
 44 | 
 45 | summary(data)
 46 | 
 47 | pairs(data %>% sample_n(min(1000, nrow(data))))
 48 | 
 49 | png("../../plots/14-1.png", 5.5*1.2, 4*1.2, units='in', pointsize=10, res=600)
 50 | set.seed(1704)
 51 | pairs(data %>% sample_n(min(1000, nrow(data))),
 52 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
 53 |       upper.panel = panel.cor)
 54 | dev.off()
 55 | 
 56 | 
 57 | library(ggplot2)
 58 | library(dplyr)
 59 | library(gridExtra)
 60 | p1 <- data %>% ggplot(aes(quality)) + geom_bar()
 61 | p2 <- data %>% ggplot(aes(factor(quality), alcohol)) + geom_boxplot()
 62 | p3 <- data %>% ggplot(aes(factor(quality), density)) + geom_boxplot()
 63 | p4 <- data %>% ggplot(aes(alcohol, density)) + geom_point(alpha=.1) + geom_smooth()
 64 | grid.arrange(p1, p2, p3, p4, ncol=2)
 65 | g <- arrangeGrob(p1, p2, p3, p4, ncol=2)
 66 | ggsave("../../plots/14-2.png", g, width=5.5, height=4, units='in', dpi=600)
 67 | 
 68 | 
 69 | # 트래인셋과 테스트셋의 구분
 70 | set.seed(1606)
 71 | n <- nrow(data)
 72 | idx <- 1:n
 73 | training_idx <- sample(idx, n * .60)
 74 | idx <- setdiff(idx, training_idx)
 75 | validate_idx <- sample(idx, n * .20)
 76 | test_idx <- setdiff(idx, validate_idx)
 77 | training <- data[training_idx,]
 78 | validation <- data[validate_idx,]
 79 | test <- data[test_idx,]
 80 | 
 81 | 
 82 | # 선형회귀모형 (linear regression model)
 83 | data_lm_full <- lm(quality ~ ., data=training)
 84 | summary(data_lm_full)
 85 | 
 86 | predict(data_lm_full, newdata = data[1:5,])
 87 | 
 88 | # 선형회귀모형에서 변수선택
 89 | data_lm_full_2 <- lm(quality ~ .^2, data=training)
 90 | summary(data_lm_full_2)
 91 | 
 92 | length(coef(data_lm_full_2))
 93 | 
 94 | library(MASS)
 95 | data_step <- stepAIC(data_lm_full,
 96 |                      scope = list(upper = ~ .^2, lower = ~1))
 97 | 
 98 | data_step
 99 | anova(data_step)
100 | summary(data_step)
101 | length(coef(data_step))
102 | 
103 | 
104 | # 모형평가
105 | y_obs <- validation$quality
106 | yhat_lm <- predict(data_lm_full, newdata=validation)
107 | yhat_lm_2 <- predict(data_lm_full_2, newdata=validation)
108 | yhat_step <- predict(data_step, newdata=validation)
109 | rmse(y_obs, yhat_lm)
110 | rmse(y_obs, yhat_lm_2)
111 | rmse(y_obs, yhat_step)
112 | 
113 | 
114 | # 라쏘 모형 적합
115 | xx <- model.matrix(quality ~ .^2-1, data)
116 | # xx <- model.matrix(quality ~ .-1, data)
117 | x <- xx[training_idx, ]
118 | y <- training$quality
119 | glimpse(x)
120 | 
121 | data_cvfit <- cv.glmnet(x, y)
122 | 
123 | png("../../plots/14-3.png", 5.5, 4, units='in', pointsize=10, res=600)
124 | plot(data_cvfit)
125 | dev.off()
126 | 
127 | 
128 | coef(data_cvfit, s = c("lambda.1se"))
129 | coef(data_cvfit, s = c("lambda.min"))
130 | 
131 | (tmp <- coef(data_cvfit, s = c("lambda.1se")))
132 | length(tmp[abs(tmp)>0])
133 | (tmp <- coef(data_cvfit, s = c("lambda.min")))
134 | length(tmp[abs(tmp)>0])
135 | 
136 | predict.cv.glmnet(data_cvfit, s="lambda.min", newx = x[1:5,])
137 | 
138 | y_obs <- validation$quality
139 | yhat_glmnet <- predict(data_cvfit, s="lambda.min", newx=xx[validate_idx,])
140 | yhat_glmnet <- yhat_glmnet[,1] # change to a vector from [n*1] matrix
141 | rmse(y_obs, yhat_glmnet)
142 | 
143 | # 나무모형
144 | data_tr <- rpart(quality ~ ., data = training)
145 | data_tr
146 | 
147 | printcp(data_tr)
148 | summary(data_tr)
149 | 
150 | png("../../plots/14-4.png", 5.5, 4, units='in', pointsize=10, res=600)
151 | opar <- par(mfrow = c(1,1), xpd = NA)
152 | plot(data_tr)
153 | text(data_tr, use.n = TRUE)
154 | par(opar)
155 | dev.off()
156 | 
157 | yhat_tr <- predict(data_tr, validation)
158 | rmse(y_obs, yhat_tr)
159 | 
160 | 
161 | # 랜덤포레스트
162 | set.seed(1607)
163 | data_rf <- randomForest(quality ~ ., training)
164 | data_rf
165 | 
166 | png("../../plots/14-5.png", 5.5*1.5, 4, units='in', pointsize=9, res=600)
167 | opar <- par(mfrow=c(1,2))
168 | plot(data_rf)
169 | varImpPlot(data_rf)
170 | par(opar)
171 | dev.off()
172 | 
173 | yhat_rf <- predict(data_rf, newdata=validation)
174 | rmse(y_obs, yhat_rf)
175 | 
176 | 
177 | # 부스팅
178 | set.seed(1607)
179 | data_gbm <- gbm(quality ~ ., data=training,
180 |                 n.trees=40000, cv.folds=3, verbose = TRUE)
181 | 
182 | png("../../plots/14-6.png", 5.5, 4, units='in', pointsize=9, res=600)
183 | (best_iter = gbm.perf(data_gbm, method="cv"))
184 | dev.off()
185 | 
186 | yhat_gbm <- predict(data_gbm, n.trees=best_iter, newdata=validation)
187 | rmse(y_obs, yhat_gbm)
188 | 
189 | 
190 | # 최종 모형선택과  테스트셋 오차계산
191 | data.frame(lm = rmse(y_obs, yhat_step),
192 |            glmnet = rmse(y_obs, yhat_glmnet),
193 |            rf = rmse(y_obs, yhat_rf),
194 |            gbm = rmse(y_obs, yhat_gbm)) %>%
195 |   reshape2::melt(value.name = 'rmse', variable.name = 'method')
196 | 
197 | rmse(test$quality, predict(data_rf, newdata = test))
198 | 
199 | 
200 | # 회귀분석의 오차의 시각화
201 | boxplot(list(lm = y_obs-yhat_step,
202 |              glmnet = y_obs-yhat_glmnet,
203 |              rf = y_obs-yhat_rf,
204 |              gbm = y_obs-yhat_gbm), ylab="Error in Validation Set")
205 | abline(h=0, lty=2, col='blue')
206 | 
207 | 
208 | png("../../plots/14-7.png", 5.5, 4, units='in', pointsize=9, res=600)
209 | pairs(data.frame(y_obs=y_obs,
210 |                  yhat_lm=yhat_step,
211 |                  yhat_glmnet=c(yhat_glmnet),
212 |                  yhat_rf=yhat_rf,
213 |                  yhat_gbm=yhat_gbm),
214 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
215 |       upper.panel = panel.cor)
216 | dev.off()
217 | 


--------------------------------------------------------------------------------
/ch13-regression/wine-quality/wine-quality.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 


--------------------------------------------------------------------------------
/ch15-word-cloud/ch13-word-cloud.R:
--------------------------------------------------------------------------------
 1 | # install.packages(c("tm", "SnowballC", "wordcloud"))
 2 | 
 3 | library(tm)
 4 | library(SnowballC)
 5 | library(wordcloud)
 6 | library(dplyr)
 7 | 
 8 | data <- read.csv('JEOPARDY_CSV.csv', stringsAsFactors = FALSE,
 9 |                  nrows = 10000)
10 | dplyr::glimpse(data)
11 | 
12 | data_corpus <- Corpus(VectorSource(data$Question))
13 | data_corpus
14 | # ?Corpus
15 | # data_corpus <- tm_map(data_corpus, PlainTextDocument)
16 | # stopwords('english')
17 | 
18 | data_corpus <- tm_map(data_corpus, content_transformer(tolower))
19 | as.character(data_corpus[[1]])
20 | data_corpus <- tm_map(data_corpus, removePunctuation)
21 | as.character(data_corpus[[1]])
22 | data_corpus <- tm_map(data_corpus, removeWords, stopwords('english'))
23 | as.character(data_corpus[[1]])
24 | 
25 | data_corpus <- tm_map(data_corpus, stemDocument)
26 | as.character(data_corpus[[1]])
27 | 
28 | citation(package='wordcloud')
29 | wordcloud(data_corpus, max.words=100, random.order=FALSE,
30 |           colors=brewer.pal(8, "Dark2"))
31 | 
32 | data$Question[1]
33 | as.character(data_corpus[[1]])
34 | 
35 | png("../plots/15-1.png", 5.5, 4, units='in', pointsize=9, res=600)
36 | wordcloud(data_corpus, max.words=100, random.order=FALSE,
37 |           colors=brewer.pal(8, "Dark2"))
38 | dev.off()
39 | 


--------------------------------------------------------------------------------
/ch15-word-cloud/ch13-word-cloud.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 


--------------------------------------------------------------------------------
/figure-export-boilerplate.R:
--------------------------------------------------------------------------------
 1 | # A few useful lines to export plots
 2 | # 1. base R graph
 3 | png("../../plots/.png", 5.5, 4, units='in', pointsize=9, res=600)
 4 | dev.off()
 5 | 
 6 | # 2. single ggplot
 7 | ggsave("../../plots/.png", width=5.5, height=4, units='in', dpi=600)
 8 | 
 9 | # 3. plot matrix from library(gridExtra)
10 | g <- arrangeGrob(p1, p2, p3, p4, ncol=2)
11 | ggsave("../../plots/.png", g, width=5.5, height=4, units='in', dpi=600)
12 | 


--------------------------------------------------------------------------------
/notebooks/download-gapminder-tsv.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "본서에서는 R의 gapminder 패키지를 사용하였다.\n",
 8 |     "<https://cran.r-project.org/web/packages/gapminder/index.html>\n",
 9 |     "패키지 소스의 갭마인더 데이터를 파이썬으로 읽어들이자.\n"
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "code",
14 |    "execution_count": 1,
15 |    "metadata": {},
16 |    "outputs": [],
17 |    "source": [
18 |     "import os\n",
19 |     "import requests\n",
20 |     "import shutil\n",
21 |     "\n",
22 |     "path = \"data/gapminder.tsv\"\n",
23 |     "if not os.path.exists(\"data\"):\n",
24 |     "    os.mkdir(\"data\")\n",
25 |     "\n",
26 |     "gapminder_r_url = \"https://raw.githubusercontent.com/jennybc/gapminder/master/data-raw/07_gap-merged-with-continent.tsv\"\n",
27 |     "r = requests.get(gapminder_r_url, stream=True)\n",
28 |     "r.raise_for_status()\n",
29 |     "\n",
30 |     "with open(path, \"wb\") as f:\n",
31 |     "    r.raw.decode_content = True\n",
32 |     "    shutil.copyfileobj(r.raw, f)\n"
33 |    ]
34 |   },
35 |   {
36 |    "cell_type": "code",
37 |    "execution_count": 2,
38 |    "metadata": {},
39 |    "outputs": [
40 |     {
41 |      "name": "stdout",
42 |      "output_type": "stream",
43 |      "text": [
44 |       "country\tcontinent\tyear\tlifeExp\tpop\tgdpPercap\r\n",
45 |       "Afghanistan\tAsia\t1952\t28.801\t8425333\t779.4453145\r\n",
46 |       "Afghanistan\tAsia\t1957\t30.332\t9240934\t820.8530296\r\n",
47 |       "Afghanistan\tAsia\t1962\t31.997\t10267083\t853.10071\r\n",
48 |       "Afghanistan\tAsia\t1967\t34.02\t11537966\t836.1971382\r\n",
49 |       "Afghanistan\tAsia\t1972\t36.088\t13079460\t739.9811058\r\n",
50 |       "Afghanistan\tAsia\t1977\t38.438\t14880372\t786.11336\r\n",
51 |       "Afghanistan\tAsia\t1982\t39.854\t12881816\t978.0114388\r\n",
52 |       "Afghanistan\tAsia\t1987\t40.822\t13867957\t852.3959448\r\n",
53 |       "Afghanistan\tAsia\t1992\t41.674\t16317921\t649.3413952\r\n"
54 |      ]
55 |     }
56 |    ],
57 |    "source": [
58 |     "!head $path"
59 |    ]
60 |   }
61 |  ],
62 |  "metadata": {
63 |   "kernelspec": {
64 |    "display_name": "Python 2",
65 |    "language": "python",
66 |    "name": "python2"
67 |   },
68 |   "language_info": {
69 |    "codemirror_mode": {
70 |     "name": "ipython",
71 |     "version": 2
72 |    },
73 |    "file_extension": ".py",
74 |    "mimetype": "text/x-python",
75 |    "name": "python",
76 |    "nbconvert_exporter": "python",
77 |    "pygments_lexer": "ipython2",
78 |    "version": "2.7.14"
79 |   }
80 |  },
81 |  "nbformat": 4,
82 |  "nbformat_minor": 2
83 | }
84 | 


--------------------------------------------------------------------------------
/notebooks/session-01-official.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "data": {
 10 |       "text/plain": [
 11 |        "(3313, 6)"
 12 |       ]
 13 |      },
 14 |      "execution_count": 1,
 15 |      "metadata": {},
 16 |      "output_type": "execute_result"
 17 |     }
 18 |    ],
 19 |    "source": [
 20 |     "%matplotlib inline\n",
 21 |     "import pandas as pd\n",
 22 |     "import numpy as np\n",
 23 |     "\n",
 24 |     "gapminder_r_url = \"https://raw.githubusercontent.com/jennybc/gapminder/master/data-raw/07_gap-merged-with-continent.tsv\"\n",
 25 |     "gapminder = pd.read_csv(gapminder_r_url, sep=\"\\t\")\n",
 26 |     "gapminder.shape"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": 2,
 32 |    "metadata": {},
 33 |    "outputs": [
 34 |     {
 35 |      "data": {
 36 |       "text/html": [
 37 |        "<div>\n",
 38 |        "<style>\n",
 39 |        "    .dataframe thead tr:only-child th {\n",
 40 |        "        text-align: right;\n",
 41 |        "    }\n",
 42 |        "\n",
 43 |        "    .dataframe thead th {\n",
 44 |        "        text-align: left;\n",
 45 |        "    }\n",
 46 |        "\n",
 47 |        "    .dataframe tbody tr th {\n",
 48 |        "        vertical-align: top;\n",
 49 |        "    }\n",
 50 |        "</style>\n",
 51 |        "<table border=\"1\" class=\"dataframe\">\n",
 52 |        "  <thead>\n",
 53 |        "    <tr style=\"text-align: right;\">\n",
 54 |        "      <th></th>\n",
 55 |        "      <th>country</th>\n",
 56 |        "      <th>continent</th>\n",
 57 |        "      <th>year</th>\n",
 58 |        "      <th>lifeExp</th>\n",
 59 |        "      <th>pop</th>\n",
 60 |        "      <th>gdpPercap</th>\n",
 61 |        "    </tr>\n",
 62 |        "  </thead>\n",
 63 |        "  <tbody>\n",
 64 |        "    <tr>\n",
 65 |        "      <th>0</th>\n",
 66 |        "      <td>Afghanistan</td>\n",
 67 |        "      <td>Asia</td>\n",
 68 |        "      <td>1952</td>\n",
 69 |        "      <td>28.801</td>\n",
 70 |        "      <td>8425333</td>\n",
 71 |        "      <td>779.445314</td>\n",
 72 |        "    </tr>\n",
 73 |        "    <tr>\n",
 74 |        "      <th>1</th>\n",
 75 |        "      <td>Afghanistan</td>\n",
 76 |        "      <td>Asia</td>\n",
 77 |        "      <td>1957</td>\n",
 78 |        "      <td>30.332</td>\n",
 79 |        "      <td>9240934</td>\n",
 80 |        "      <td>820.853030</td>\n",
 81 |        "    </tr>\n",
 82 |        "    <tr>\n",
 83 |        "      <th>2</th>\n",
 84 |        "      <td>Afghanistan</td>\n",
 85 |        "      <td>Asia</td>\n",
 86 |        "      <td>1962</td>\n",
 87 |        "      <td>31.997</td>\n",
 88 |        "      <td>10267083</td>\n",
 89 |        "      <td>853.100710</td>\n",
 90 |        "    </tr>\n",
 91 |        "    <tr>\n",
 92 |        "      <th>3</th>\n",
 93 |        "      <td>Afghanistan</td>\n",
 94 |        "      <td>Asia</td>\n",
 95 |        "      <td>1967</td>\n",
 96 |        "      <td>34.020</td>\n",
 97 |        "      <td>11537966</td>\n",
 98 |        "      <td>836.197138</td>\n",
 99 |        "    </tr>\n",
100 |        "    <tr>\n",
101 |        "      <th>4</th>\n",
102 |        "      <td>Afghanistan</td>\n",
103 |        "      <td>Asia</td>\n",
104 |        "      <td>1972</td>\n",
105 |        "      <td>36.088</td>\n",
106 |        "      <td>13079460</td>\n",
107 |        "      <td>739.981106</td>\n",
108 |        "    </tr>\n",
109 |        "  </tbody>\n",
110 |        "</table>\n",
111 |        "</div>"
112 |       ],
113 |       "text/plain": [
114 |        "       country continent  year  lifeExp       pop   gdpPercap\n",
115 |        "0  Afghanistan      Asia  1952   28.801   8425333  779.445314\n",
116 |        "1  Afghanistan      Asia  1957   30.332   9240934  820.853030\n",
117 |        "2  Afghanistan      Asia  1962   31.997  10267083  853.100710\n",
118 |        "3  Afghanistan      Asia  1967   34.020  11537966  836.197138\n",
119 |        "4  Afghanistan      Asia  1972   36.088  13079460  739.981106"
120 |       ]
121 |      },
122 |      "execution_count": 2,
123 |      "metadata": {},
124 |      "output_type": "execute_result"
125 |     }
126 |    ],
127 |    "source": [
128 |     "gapminder.head()"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": 3,
134 |    "metadata": {},
135 |    "outputs": [
136 |     {
137 |      "data": {
138 |       "text/plain": [
139 |        "<matplotlib.axes._subplots.AxesSubplot at 0x110a2df10>"
140 |       ]
141 |      },
142 |      "execution_count": 3,
143 |      "metadata": {},
144 |      "output_type": "execute_result"
145 |     },
146 |     {
147 |      "data": {
148 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAD8CAYAAABgmUMCAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBo\ndHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAEphJREFUeJzt3X+w3XV95/HnS6KFsNXwI1iaYAPb\nDOo4ovGW0rJ1W7CuiCV2R1ZaWzMMbTpTdtXanRo77WK76wzOuEUdd2hZaTe4VYtUC7uwtRG1nf1D\nNAFW0OiQIoVrKLktv1SqiL77x/nc5Rouyfkk99xzzs3zMXPmfL+f8zn3+/7MzbmvfL+f7/l+U1VI\nkjSsZ4y7AEnSdDE4JEldDA5JUheDQ5LUxeCQJHUxOCRJXQwOSVIXg0OS1MXgkCR1WTXuAkbhxBNP\nrA0bNoy7DEmaKrt27fqHqlp7sH4rMjg2bNjAzp07x12GJE2VJH83TD8PVUmSuowsOJL8cZJ9Se5c\n0HZ8kh1J7mrPx7X2JHlfkj1JvpBk04L3bGn970qyZVT1SpKGM8o9jv8BvGq/tm3AzVW1Ebi5rQOc\nB2xsj63AlTAIGuAy4MeBM4HL5sNGkjQeIwuOqvob4MH9mjcD29vyduC1C9qvqYHPAmuSnAz8G2BH\nVT1YVQ8BO3hqGEmSltFyz3E8t6ruB2jPJ7X2dcB9C/rNtrana3+KJFuT7Eyyc25ubskLlyQNTMrk\neBZpqwO0P7Wx6qqqmqmqmbVrD3o2mSTpEC13cDzQDkHRnve19lnglAX91gN7D9AuSRqT5Q6OG4D5\nM6O2ANcvaH9jO7vqLOCRdijrE8ArkxzXJsVf2dokSWMysi8AJvkw8NPAiUlmGZwddTlwbZJLgHuB\nC1v3m4BXA3uAx4CLAarqwST/Gfh86/f7VbX/hLskaRmlatEpg6k2MzNTfnNc0oZtN45lu/dcfv5Y\ntnu4kuyqqpmD9ZuUyXFJ0pQwOCRJXQwOSVIXg0OS1MXgkCR1MTgkSV0MDklSF4NDktTF4JAkdTE4\nJEldDA5JUheDQ5LUxeCQJHUxOCRJXQwOSVIXg0OS1MXgkCR1MTgkSV0MDklSF4NDktTF4JAkdTE4\nJEldDA5JUheDQ5LUxeCQJHUxOCRJXQwOSVIXg0OS1MXgkCR1MTgkSV0MDklSF4NDktTF4JAkdRlL\ncCT5jSRfTHJnkg8nOTrJqUluSXJXkj9L8qzW9wfa+p72+oZx1CxJGlj24EiyDngTMFNVLwKOAi4C\n3gVcUVUbgYeAS9pbLgEeqqofBa5o/SRJY7JqjNs9Jsl3gNXA/cA5wC+217cD7wCuBDa3ZYDrgPcn\nSVXVchYs6dBt2HbjuEvQElr2PY6q+hrwbuBeBoHxCLALeLiqnmjdZoF1bXkdcF977xOt/wnLWbMk\n6UnjOFR1HIO9iFOBHwaOBc5bpOv8HkUO8NrCn7s1yc4kO+fm5paqXEnSfsYxOf4K4KtVNVdV3wE+\nBvwksCbJ/KGz9cDetjwLnALQXn8O8OD+P7SqrqqqmaqaWbt27ajHIElHrHEEx73AWUlWJwlwLvAl\n4NPA61qfLcD1bfmGtk57/VPOb0jS+IxjjuMWBpPctwJ3tBquAt4GvDXJHgZzGFe3t1wNnNDa3wps\nW+6aJUlPGstZVVV1GXDZfs13A2cu0vdbwIXLUZck6eD85rgkqYvBIUnqYnBIkroYHJKkLgaHJKmL\nwSFJ6mJwSJK6GBySpC4GhySpi8EhSepicEiSuhgckqQuBockqYvBIUnqYnBIkroYHJKkLgaHJKmL\nwSFJ6mJwSJK6GBySpC4GhySpi8EhSepicEiSuhgckqQuBockqYvBIUnqYnBIkroYHJKkLgaHJKmL\nwSFJ6mJwSJK6DBUcSV406kIkSdNh2D2OP0zyuSS/nmTNSCuSJE20oYKjqv4V8AbgFGBnkg8l+dlD\n3WiSNUmuS/LlJLuT/ESS45PsSHJXez6u9U2S9yXZk+QLSTYd6nYlSYdv6DmOqroL+B3gbcC/Bt7X\n/vD/20PY7nuBv6yq5wNnALuBbcDNVbURuLmtA5wHbGyPrcCVh7A9SdISGXaO48VJrmDwB/4c4Oeq\n6gVt+YqeDSZ5NvBy4GqAqnq8qh4GNgPbW7ftwGvb8mbgmhr4LLAmyck925QkLZ1h9zjeD9wKnFFV\nl1bVrQBVtZfBXkiP04A54E+S3JbkA0mOBZ5bVfe3n3s/cFLrvw64b8H7Z1ubJGkMhg2OVwMfqqp/\nAkjyjCSrAarqg53bXAVsAq6sqpcC3+TJw1KLySJt9ZROydYkO5PsnJub6yxJkjSsYYPjk8AxC9ZX\nt7ZDMQvMVtUtbf06BkHywPwhqPa8b0H/Uxa8fz2wd/8fWlVXVdVMVc2sXbv2EEuTJB3MsMFxdFV9\nY36lLa8+lA1W1d8D9yU5vTWdC3wJuAHY0tq2ANe35RuAN7azq84CHpk/pCVJWn6rhuz3zSSb5uc2\nkrwM+KfD2O5/AP40ybOAu4GLGYTYtUkuAe4FLmx9b2JwqGwP8FjrK0kak2GD4y3AR5PMHyI6GXj9\noW60qm4HZhZ56dxF+hZw6aFuS5K0tIYKjqr6fJLnA6czmKz+clV9Z6SVSZIm0rB7HAA/Bmxo73lp\nEqrqmpFUJUmaWEMFR5IPAv8SuB34bmsuwOCQpCPMsHscM8AL23yDJOkINuzpuHcCPzTKQiRJ02HY\nPY4TgS8l+Rzw7fnGqrpgJFVJkibWsMHxjlEWIUmaHsOejvvXSX4E2FhVn2zXqTpqtKVJkibRsJdV\n/1UG15T6o9a0DviLURUlSZpcw06OXwqcDTwK//+mTicd8B2SpBVp2OD4dlU9Pr+SZBWLXNpckrTy\nDRscf53kt4Fj2r3GPwr8r9GVJUmaVMMGxzYGd+27A/g1Bles7b3znyRpBRj2rKrvAf+9PSRJR7Bh\nr1X1VRaZ06iq05a8IknSROu5VtW8oxncZOn4pS9HkjTphprjqKp/XPD4WlW9BzhnxLVJkibQsIeq\nNi1YfQaDPZAfHElFkqSJNuyhqv+6YPkJ4B7g3y15NZKkiTfsWVU/M+pCJEnTYdhDVW890OtV9QdL\nU44kadL1nFX1Y8ANbf3ngL8B7htFUZKkydVzI6dNVfV1gCTvAD5aVb8yqsIkSZNp2EuOPA94fMH6\n48CGJa9GkjTxht3j+CDwuSQfZ/AN8p8HrhlZVZKkiTXsWVXvTPJ/gJ9qTRdX1W2jK0uSNKmGPVQF\nsBp4tKreC8wmOXVENUmSJtiwt469DHgb8PbW9Ezgf46qKEnS5Bp2j+PngQuAbwJU1V685IgkHZGG\nDY7Hq6pol1ZPcuzoSpIkTbJhg+PaJH8ErEnyq8An8aZOknREGvasqne3e40/CpwO/Keq2jHSyiRJ\nE+mgwZHkKOATVfUKwLCQpCPcQQ9VVdV3gceSPGcZ6pEkTbhhvzn+LeCOJDtoZ1YBVNWbDnXDbU9m\nJ/C1qnpN+17IRxjckvZW4Jer6vEkP8DgW+ovA/4ReH1V3XOo25UkHZ5hJ8dvBH6XwRVxdy14HI43\nA7sXrL8LuKKqNgIPAZe09kuAh6rqR4ErWj9J0pgccI8jyfOq6t6q2r6UG02yHjgfeCfw1iRhcA/z\nX2xdtgPvAK4ENrdlgOuA9ydJOz1YkrTMDrbH8RfzC0n+fAm3+x7gt4DvtfUTgIer6om2Pgusa8vr\naPf9aK8/0vp/nyRbk+xMsnNubm4JS5UkLXSw4MiC5dOWYoNJXgPsq6qFh7qySNca4rUnG6quqqqZ\nqppZu3btElQqSVrMwSbH62mWD8fZwAVJXg0cDTybwR7ImiSr2l7FemBv6z8LnMLgwoqrgOcADy5R\nLZKkTgfb4zgjyaNJvg68uC0/muTrSR49lA1W1duran1VbQAuAj5VVW8APg28rnXbAlzflm9o67TX\nP+X8hiSNzwH3OKrqqOUqhMHVdz+S5L8AtwFXt/argQ8m2cNgT+OiZaxJkrpt2Hbj2LZ9z+Xnj3wb\nw36PYySq6jPAZ9ry3cCZi/T5FnDhshYmSXpaPTdykiTJ4JAk9TE4JEldDA5JUheDQ5LUxeCQJHUx\nOCRJXQwOSVIXg0OS1MXgkCR1MTgkSV0MDklSF4NDktTF4JAkdTE4JEldDA5JUheDQ5LUxeCQJHUx\nOCRJXcZ6z3FJy2fDthvHXYJWCPc4JEldDA5JUheDQ5LUxeCQJHUxOCRJXQwOSVIXg0OS1MXgkCR1\nMTgkSV0MDklSFy85orE6Ei+Dcc/l54+7BOmwGBzSMjsSw1Iri4eqJEldlj04kpyS5NNJdif5YpI3\nt/bjk+xIcld7Pq61J8n7kuxJ8oUkm5a7ZknSk8axx/EE8JtV9QLgLODSJC8EtgE3V9VG4Oa2DnAe\nsLE9tgJXLn/JkqR5yx4cVXV/Vd3alr8O7AbWAZuB7a3bduC1bXkzcE0NfBZYk+TkZS5bktSMdY4j\nyQbgpcAtwHOr6n4YhAtwUuu2DrhvwdtmW9v+P2trkp1Jds7NzY2ybEk6oo0tOJL8C+DPgbdU1aMH\n6rpIWz2loeqqqpqpqpm1a9cuVZmSpP2MJTiSPJNBaPxpVX2sNT8wfwiqPe9r7bPAKQvevh7Yu1y1\nSpK+3zjOqgpwNbC7qv5gwUs3AFva8hbg+gXtb2xnV50FPDJ/SEuStPzG8QXAs4FfBu5Icntr+23g\ncuDaJJcA9wIXttduAl4N7AEeAy5e3nIlSQste3BU1f9l8XkLgHMX6V/ApSMtSpI0NL85LknqYnBI\nkroYHJKkLgaHJKmLwSFJ6mJwSJK6GBySpC4GhySpi8EhSeriPcflPbAldXGPQ5LUxeCQJHUxOCRJ\nXQwOSVIXg0OS1MXgkCR1MTgkSV0MDklSF4NDktTF4JAkdTE4JEldDA5JUheDQ5LUxeCQJHUxOCRJ\nXQwOSVIXg0OS1MXgkCR1MTgkSV285/gE8d7fkqaBexySpC4GhySpi8EhSeoyNcGR5FVJvpJkT5Jt\n465Hko5UUxEcSY4C/htwHvBC4BeSvHC8VUnSkWkqggM4E9hTVXdX1ePAR4DNY65Jko5I03I67jrg\nvgXrs8CPj2pjnhYrSU9vWoIji7TV93VItgJb2+o3knxl5FWN1onAP4y7iCWyksYCK2s8K2kssLLG\nc0hjybsOa5s/MkynaQmOWeCUBevrgb0LO1TVVcBVy1nUKCXZWVUz465jKaykscDKGs9KGgusrPFM\n8limZY7j88DGJKcmeRZwEXDDmGuSpCPSVOxxVNUTSf498AngKOCPq+qLYy5Lko5IUxEcAFV1E3DT\nuOtYRivmsBsrayywssazksYCK2s8EzuWVNXBe0mS1EzLHIckaUIYHGOW5Ogkn0vy/5J8McnvtfZT\nk9yS5K4kf9ZOCpgKSY5KcluS/93Wp3ks9yS5I8ntSXa2tuOT7Gjj2ZHkuHHXOawka5Jcl+TLSXYn\n+YlpHE+S09vvZP7xaJK3TONY5iX5jfY34M4kH25/Gybys2NwjN+3gXOq6gzgJcCrkpwFvAu4oqo2\nAg8Bl4yxxl5vBnYvWJ/msQD8TFW9ZMGpkduAm9t4bm7r0+K9wF9W1fOBMxj8nqZuPFX1lfY7eQnw\nMuAx4ONM4VgAkqwD3gTMVNWLGJwEdBET+tkxOMasBr7RVp/ZHgWcA1zX2rcDrx1Ded2SrAfOBz7Q\n1sOUjuUANjMYB0zReJI8G3g5cDVAVT1eVQ8zpeNZ4Fzgb6vq75jusawCjkmyClgN3M+EfnYMjgnQ\nDu3cDuwDdgB/CzxcVU+0LrMMLrsyDd4D/BbwvbZ+AtM7FhiE+F8l2dWuTgDw3Kq6H6A9nzS26vqc\nBswBf9IOJX4gybFM73jmXQR8uC1P5Viq6mvAu4F7GQTGI8AuJvSzY3BMgKr6btvlXs/ggo4vWKzb\n8lbVL8lrgH1VtWth8yJdJ34sC5xdVZsYXJn50iQvH3dBh2EVsAm4sqpeCnyTKTmU83TaMf8LgI+O\nu5bD0eZiNgOnAj8MHMvg39z+JuKzY3BMkHbY4DPAWcCatssKi1xiZUKdDVyQ5B4GVzA+h8EeyDSO\nBYCq2tue9zE4hn4m8ECSkwHa877xVdhlFpitqlva+nUMgmRaxwODP663VtUDbX1ax/IK4KtVNVdV\n3wE+BvwkE/rZMTjGLMnaJGva8jEM/gHtBj4NvK512wJcP54Kh1dVb6+q9VW1gcHhg09V1RuYwrEA\nJDk2yQ/OLwOvBO5kcLmbLa3b1Iynqv4euC/J6a3pXOBLTOl4ml/gycNUML1juRc4K8nqNi84/7uZ\nyM+OXwAcsyQvZjDpdRSDIL+2qn4/yWkM/td+PHAb8EtV9e3xVdonyU8D/7GqXjOtY2l1f7ytrgI+\nVFXvTHICcC3wPAYf+Aur6sExldklyUsYnLjwLOBu4GLavzumbDxJVjO43cJpVfVIa5vm383vAa8H\nnmDwOfkVBnMaE/fZMTgkSV08VCVJ6mJwSJK6GBySpC4GhySpi8EhSepicEiSuhgckqQuBockqcs/\nAyQbPfgFtb76AAAAAElFTkSuQmCC\n",
149 |       "text/plain": [
150 |        "<matplotlib.figure.Figure at 0x10d0bad50>"
151 |       ]
152 |      },
153 |      "metadata": {},
154 |      "output_type": "display_data"
155 |     }
156 |    ],
157 |    "source": [
158 |     "gapminder.lifeExp.plot.hist()"
159 |    ]
160 |   }
161 |  ],
162 |  "metadata": {
163 |   "kernelspec": {
164 |    "display_name": "Python 2",
165 |    "language": "python",
166 |    "name": "python2"
167 |   },
168 |   "language_info": {
169 |    "codemirror_mode": {
170 |     "name": "ipython",
171 |     "version": 2
172 |    },
173 |    "file_extension": ".py",
174 |    "mimetype": "text/x-python",
175 |    "name": "python",
176 |    "nbconvert_exporter": "python",
177 |    "pygments_lexer": "ipython2",
178 |    "version": "2.7.14"
179 |   }
180 |  },
181 |  "nbformat": 4,
182 |  "nbformat_minor": 2
183 | }
184 | 


--------------------------------------------------------------------------------
/solutions/README.md:
--------------------------------------------------------------------------------
 1 | # <따라 하며 배우는 데이터 과학> 연습문제 해답
 2 | 
 3 | - 해답 목차: <http://rpubs.com/dataninja/ipds-kr-solutions>
 4 | 
 5 | - 3장 (tidyverse) 해답: <http://rpubs.com/dataninja/ipds-kr-ch03>
 6 | - 4장 (ggplot2) 해답: <http://rpubs.com/dataninja/ipds-kr-ch04>
 7 | - 8-9장 (classification) 해답: <http://rpubs.com/dataninja/ipds-kr-ch08>
 8 | - 10-11장 (classification) 해답: <http://rpubs.com/dataninja/ipds-kr-ch10>
 9 | - 13-14장 (regression) 해답: <http://rpubs.com/dataninja/ipds-kr-ch13>
10 | - 15장 (word cloud) 해답: <http://rpubs.com/dataninja/ipds-kr-ch15>
11 | 


--------------------------------------------------------------------------------
/solutions/ch03-tidyverse-solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "<따라 하며 배우는 데이터 과학> 3장 연습문제 해답"
  3 | author: "권재명"
  4 | date: "9/27/2017"
  5 | output:
  6 |   html_document:
  7 |     toc: true
  8 |     toc_depth: 2
  9 | ---
 10 | 저자 책 웹페이지: <https://dataninja.me/ipds-kr/>
 11 | 
 12 | 
 13 | ```{r setup, include=FALSE}
 14 | knitr::opts_chunk$set(echo = TRUE, cache=TRUE)
 15 | ```
 16 | 
 17 | # 1. (갭마인더 자료 추가분석)
 18 | dplyr 패키지를 이용하여 갭마인더 데이터에서 다음 요약 통계량을 계산하라.
 19 | 
 20 | 일단은 필수패키지인 `tidyverse`를 로드합니다.
 21 | 그리고 `gapminder` 데이터 패키지를 로드합니다.
 22 | (로딩 메시지를 감추기 위해 `suppressMessages()` 명령을 사용했습니다.)
 23 | ```{r}
 24 | # install.packages("tidyverse")
 25 | # install.packages("gapminder")
 26 | suppressMessages(library(tidyverse))
 27 | suppressMessages(library(gapminder))
 28 | ```
 29 | 
 30 | 
 31 | ## a. 2007년도 나라별 일인당 국민소득
 32 | 
 33 | ```{r}
 34 | gapminder %>% 
 35 |   filter(year==2007) %>%
 36 |   select(country, gdpPercap)
 37 | ```
 38 | 
 39 | 
 40 | ## b. 2007년도 대륙별 일인당 평균수명의 평균과 중앙값
 41 | ```{r}
 42 | gapminder %>%
 43 |   filter(year==2007) %>%
 44 |   group_by(continent) %>%
 45 |   summarize(n(), mean(lifeExp), median(lifeExp))
 46 | ```
 47 | (고급문제: 위의 평균은 각 나라의 산술평균입니다. 
 48 | 만약 각 나라별 인구수를 가중치로 한 가중평균(weighted average)을 내고자
 49 | 한다면 어떻게 해야 할까요?)
 50 | 
 51 | 
 52 | 
 53 | # 2 (온라인 예제 자료 읽어 들이기)
 54 | 예제 데이터를 제공하는 다음 페이지들을 방문하여 각 페이지에서 흥미있는 데이터를 하나씩 선택하여 다운로드한 후, R에 읽어 들이는 코드를 작성하라.
 55 | 
 56 | ## a. UCI 머신러닝 리포(UCI Machine Learning Repository)
 57 | <https://goo.gl/fstR7> or <http://archive.ics.uci.edu/ml/index.php>
 58 | 
 59 | 
 60 | ## b. R 예제 데이터: 
 61 | <https://goo.gl/AlvXNr> or <https://vincentarelbundock.github.io/Rdatasets/>
 62 | (생략)
 63 | 
 64 | ## c. 머신러닝/데이터 과학 공유/경연 사이트인 캐글:
 65 | <https://www.kaggle.com/>
 66 | 
 67 | 4번문제에서 살펴볼 다음 자료를 다운로드하였습니다:
 68 | <https://www.kaggle.com/deepmatrix/imdb-5000-movie-dataset> 
 69 | 
 70 | ```{r}
 71 | df <- read_csv("imdb-5000-movie-dataset.zip")
 72 | ```
 73 | 위에서 `parsing failure` 에러가 난 이유는 무엇일까?
 74 | 에러 메시지를 자세히 살펴보면 몇 줄의 budget 변수 값이 너무
 75 | 큰 것을 알 수 있다. `read_csv()` 함수는 데이터 파일의 
 76 | 처음 1000줄을 읽고 
 77 | 자동적으로 각 변수의 변수형을 결정하는데
 78 | 크지 않은 숫자값에는 `integer`형을 배정하게 된다.
 79 | 
 80 | 하지만 문제가 되는 4 행들은 
 81 | budget 값이 `integer` 형태의 최대값보다 더 큰 값이다 보니 에러가 난 것이다.
 82 | R의 최대 `integer`값은 다음과 같다:
 83 | ```{r}
 84 | .Machine$integer.max
 85 | ```
 86 | 
 87 | 문제가 되는 행들(예를 들어 2989 행; 봉준호 감독의 <괴물>이다!)은
 88 | 예산이 120억원이다 보니 값이 너무 커진것이다.
 89 | 문제가 되는 행에서 변수값은 결측치 (NA)가 된다:
 90 | ```{r}
 91 | 12215500000 > .Machine$integer.max
 92 | df[2989,] %>% glimpse()
 93 | ```
 94 | 
 95 | 이것을 해결하는 방법은 몇가지가 있을 수 있다. `read_csv(..., col_types=)`
 96 | 옵션에서 수동으로 변수형을 integer보다 최대값이 큰 double 등으로 
 97 | 지정해줄 수 있다. (`.Machine$double.xmax`를 실행해보라)
 98 | 아니면 간편하게, `guess_max=` 옵션으로 좀 더 많은 행을 살펴보고
 99 | 변수형을 결정하라고 할 수 있다:
100 | ```{r}
101 | 
102 | df2 <- read_csv("imdb-5000-movie-dataset.zip", guess_max = 1e6)
103 | df2[2989,] %>% glimpse()
104 | ```
105 | 
106 | 
107 | 이 에러에서 알 수 있는 또 하나의 중요한 사실은
108 | budget 등이 영화 제작 국가의 통화로 표기되어 있다는 것이다
109 | (미국은 달러, 한국은 원, 등등).
110 | 이러한 사실을 모르고 분석하면 잘못된 결론을 얻을 수 있으니 주의하도록 하자.
111 | 
112 | 이러한 문제는 각 budget 변수의 분포를 나라별로 그려보면 간단히 알 수 있다:
113 | ```{r}
114 | df2 %>% ggplot(aes(budget)) + geom_histogram()
115 | ```
116 | 즉, 달러 이외의 통화를 사용하는 나라 때문에 분포에 많은 
117 | 이상치가 존재하게 된다.
118 | 
119 | 먼저, 가장 많은 영화를 만들어내는 나라를 살펴보고, 미국의 `country` 코드를
120 | 알아내도록 하자:
121 | ```{r}
122 | df2 %>% count(country) %>% arrange(-n)
123 | ```
124 | 
125 | 미국은 `country=="USA"` 임을 알 수 있다. 미국 영화들의 예산의 분포를 
126 | 그려보면 아주 심각한 이상치들이 제거된
127 | 다음과 같은  분포를 얻게 된다.
128 | ```{r}
129 | df2 %>% 
130 |   filter(country=="USA") %>%
131 |   ggplot(aes(budget)) + geom_histogram()
132 | ```
133 | 
134 | 
135 | 
136 | ## d. 위키피디아의 머신러닝 연구를 위한 데이터세트 리스트: 
137 | <https://goo.gl/SpCOlK> or <https://en.wikipedia.org/wiki/List_of_datasets_for_machine_learning_research>
138 | 
139 | (생략)
140 | 
141 | 
142 | 
143 | # 3 (범주별 요약 통계량)
144 | 위에서 읽어 들인 데이터의 범주별 요약 통계량을 작성하라. 
145 | dplyr 패키지의 `%>%` 연산자, `group_by()`, `summarize()` 함수를 사용하여야 한다.
146 | 
147 | 
148 | 위에서 읽어들인 IMDB 자료를 살펴보도록 하자.
149 | 미국에서 
150 | 각 10년간 만들어진 영화의 개수, 
151 |   영화 편당 제작비의 총액과 중간값, 
152 |   영화 편당 수익의 총액과 중간값,
153 |   그리고 ROI (총수익 / 총제작비)을 계산해 보도록 하자.
154 | ```{r}
155 | df2 %>%
156 |   filter(country=="USA") %>%
157 |   group_by(decade=floor(title_year/10)*10) %>%
158 |   summarize(n_movies=n(),
159 |             tot_budet = sum(budget, na.rm=TRUE),
160 |             median_budget = median(budget, na.rm=TRUE),
161 |             tot_gross = sum(as.numeric(gross), na.rm=TRUE),
162 |             median_gross = median(gross, na.rm=TRUE),
163 |             roi = tot_gross/tot_budet)
164 | ```
165 | (여기서 `as.numeric()`을 사용한 이유는 integer overflow 에러 때문이다.)
166 | 물론 이 분석에는 물가 상승이 반영되어 있지 않지만, 미국 영화산업의 
167 | 성장과 특이한 사항들을 알 수 있다. 예를 들어 1970년대에는 몇십편의
168 | 영화밖에 제작되지 않았지만, 편당 수익의 중간값, 그리고 ROI 값은
169 | 현재보다 크다!
170 | 
171 | 
172 | 
173 | # 4 (IMDB 자료 분석)
174 | 캐글 웹사이트에서 다음 IMDB(Internet Movie Database) 영화 정보 데이터를 다운로드하도록 하자(<https://goo.gl/R08lpm> or <https://www.kaggle.com/deepmatrix/imdb-5000-movie-dataset> 무료 캐글 계정이 필요하다). dplyr 패키지를 이용하여 다음 질문에 답하라.
175 | 
176 | ## a. 이 데이터는 어떤 변수로 이루어져 있는가?
177 | <https://www.kaggle.com/deepmatrix/imdb-5000-movie-dataset> 페이지,
178 | 그리고 <https://www.kaggle.com/deepmatrix/imdb-5000-movie-dataset/data>의 변수설명을 살펴보자.
179 | 위에서 R로 읽어들인 데이터프레임은 다음과 같다.
180 | ```{r}
181 | glimpse(df2)
182 | ```
183 | 
184 | ## b. 연도별 리뷰받은 영화의 개수는?
185 | 
186 | 
187 | ```{r}
188 | df2 %>%
189 |   group_by(title_year) %>%
190 |   count()
191 | df2 %>%
192 |   group_by(title_year) %>%
193 |   summarize(n_movies=n()) %>% 
194 |   ggplot(aes(title_year, n_movies)) + geom_point() + geom_line()
195 | ```
196 | 
197 | ## c. 연도별 리뷰평점의 개수는?
198 | ```{r}
199 | df2 %>%
200 |   group_by(title_year) %>%
201 |   summarize(avg_imdb_score = mean(imdb_score)) %>%
202 |   ggplot(aes(title_year, avg_imdb_score)) + geom_point() + geom_line()
203 | ```
204 | 
205 | (고급: 위의 시각화에 오차구간을 추가하려면? )
206 | 
207 | 
208 | # 5 (SQL 연습)
209 | ‘Online SQL Tryit Editor(<https://goo.gl/NJDGdw> 혹은 <https://www.w3schools.com/sql/trysql.asp?filename=trysql_select_all> )’에 방문해보자. 
210 | 이 페이지에서는 가상의 레스토랑의 재료 주문정보를 기록한 데이터베이스를 예제로 제공하고 있다. 
211 | 이 페이지를 이용해 다음 질문에 답하라.
212 | 
213 | ## a. 다음 질문에 대답하는 SQL 문을 작성하고 실행하라.
214 | i. Orders 테이블에서 employeeID별 주문 수는? 가장 주문 수가 많은 employeeID부터 내림차순
215 | 으로 출력하라.
216 | 
217 | ```{sql eval=FALSE}
218 | SELECT EmployeeID, count(*) AS n_orders
219 | FROM Orders
220 | GROUP BY EmployeeID
221 | ORDER BY n_orders DESC;
222 | ```
223 | 
224 | 
225 | ii. 위의 결과를 Employees 테이블과 결합하여 같은 결과에 FirstName과 LastName을 추가하여 출력하라.
226 | ```{sql eval=FALSE}
227 | SELECT a.EmployeeID, b.FirstName, b.LastName, count(*) AS n_orders
228 | FROM Orders a INNER JOIN Employees b
229 |   ON a.EmployeeID = b.EmployeeID
230 | GROUP BY b.EmployeeID, b.FirstName, b.LastName
231 | ORDER BY n_orders DESC;
232 | ```
233 | 
234 | 
235 | iii. Orders, OrderDetails, Products 테이블을 결합하여 각 OrderID별로 주문 날짜, 
236 | 주문품목 양(새 열 이름은 n_items으로), 주문 총액(열 이름은 total_price으로)을 출력하라.
237 | ```{sql eval=FALSE}
238 | SELECT a.OrderID,
239 |   a.OrderDate,
240 |   SUM(b.Quantity) as n_items,
241 |   SUM(b.Quantity * c.Price) as total_price
242 | FROM Orders a
243 |   INNER JOIN OrderDetails b
244 |     ON a.OrderID = b.OrderID
245 |   INNER JOIN Products c
246 |     ON b.ProductID = c.ProductID
247 | GROUP BY a.OrderID, a.OrderDate
248 | ORDER BY a.OrderID;
249 | ```
250 | 
251 | 
252 | 
253 | ## b. 웹페이지에는 총 8개의 테이블이 있다. 각 테이블은 각각 어떤 열로 구성되어 있는가?
254 | 
255 | 각 테이블을 클릭하면 됩니다. 각 테이블에 다음 명령을 실행해 줍니다:
256 | ```{sql eval=FALSE}
257 | SELECT * FROM Orders;
258 | ```
259 | 
260 | 
261 | ## c. [고급]
262 | 각 테이블들 간에 공통되는 열들은 어떤 것들인가
263 | (예를 들어, Orders 테이블과 Customers 테이블 모두 CustomerID 열을 가지고 있다)? 
264 | 
265 | 테이블들 간의 관계를 어떻게 나타낼 수 있을까? 
266 | 
267 | 개체-관계 모델 (entity-relationship model, ER model)은 테이블 간의 관계를 나타내는 데 많이 사용된다. 
268 | 


--------------------------------------------------------------------------------
/solutions/ch04-ggplot-solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "<따라 하며 배우는 데이터 과학> 4장 연습문제 해답"
  3 | author: "권재명"
  4 | date: "9/27/2017"
  5 | output:
  6 |   html_document:
  7 |     toc: true
  8 |     toc_depth: 3
  9 | ---
 10 | 
 11 | 저자 책 웹페이지: <https://dataninja.me/ipds-kr/>
 12 | 
 13 | 
 14 | ```{r setup, include=FALSE}
 15 | knitr::opts_chunk$set(echo = TRUE, cache=TRUE)
 16 | ```
 17 | 
 18 | 일단은 필수패키지인 `tidyverse`를 로드하자.
 19 | (로딩 메시지를 감추기 위해 `suppressMessages()` 명령을 사용.)
 20 | ```{r}
 21 | # install.packages("tidyverse")
 22 | suppressMessages(library(tidyverse))
 23 | ```
 24 | 
 25 | # 1. (IMDB 자료 시각화)
 26 | 캐글 웹사이트에서 다음 IMDB(Internet Movie Database) 영화 정보 데이터를 다운로드하도록 하자
 27 | (<https://goo.gl/R08lpm> 혹은 <https://www.kaggle.com/deepmatrix/imdb-5000-movie-dataset>, 무료 캐글 계정이 필요하다).
 28 | 
 29 | 데이터에 대해서는
 30 | 3장 연습문제 해답을 참조하자. <http://rpubs.com/dataninja/ipds-kr-solutions-ch03>
 31 | 
 32 | 데이터 `zip` 파일을 다운로드한 후, R로 자료를 읽어들이자:
 33 | ```{r}
 34 | df2 <- read_csv("imdb-5000-movie-dataset.zip", guess_max = 1e6)
 35 | ```
 36 | 
 37 | ## a. 이 데이터는 어떤 변수로 이루어져 있는가?
 38 | 
 39 | ```{r}
 40 | df2 %>% glimpse()
 41 | ```
 42 | 
 43 | ## b. 시각화를 통해 다음 질문에 답해보자
 44 | (~~분석 예는 https://goo.gl/pYPzvi 에서 찾을 수 있다~~
 45 | 아쉽게도 원링크 <https://www.kaggle.com/adhok93/d/deepmatrix/imdb-5000-movie-dataset/eda-with-plotly>
 46 | 는 삭제되었습니다)
 47 | 
 48 | 
 49 | ### i. 연도별 리뷰받은 영화의 편수는?
 50 | 
 51 | ```{r}
 52 | df2 %>%
 53 |   group_by(title_year) %>%
 54 |   summarize(n_movies=n()) %>% 
 55 |   ggplot(aes(title_year, n_movies)) + geom_point() + geom_line()
 56 | ```
 57 | 
 58 | ### ii. 연도별 리뷰평점의 변화는?
 59 | ```{r}
 60 | df2 %>%
 61 |   group_by(title_year) %>%
 62 |   summarize(avg_imdb_score = mean(imdb_score)) %>%
 63 |   ggplot(aes(title_year, avg_imdb_score)) + geom_point() + geom_line()
 64 | ```
 65 | 
 66 | 평균 점수는 점점 낮아지고 있음을 볼 수 있다.
 67 | 
 68 | (고급 분석: 이러한 평균 점수의 하락 추세의 원인은 무엇일까?)
 69 | 
 70 | 
 71 | ### iii. 영상물 등급(content_rating)에 따라서 리뷰평점의 분포에 차이가 있는가?
 72 | 
 73 | 우선 등급의 분포부터 살펴보자:
 74 | ```{r}
 75 | df2 %>%
 76 |   ggplot(aes(content_rating)) + geom_bar()
 77 | ```
 78 | 
 79 | 이로부터 대부분의 영화들의 
 80 | 영상물 등급은 다음 넷 중 하나임을 알 수 있다:
 81 | G, PG, PG-13, R. 
 82 | 아래 분석은 이 네 등급의 영화에 집중하도록 하자.
 83 | 
 84 | 각 등급에 따른 리뷰평점 분포의 병렬상자그림을 그려보면:
 85 | ```{r}
 86 | df2 %>%
 87 |   filter(content_rating %in% c("G", "PG", "PG-13", "R")) %>%
 88 |   ggplot(aes(content_rating, imdb_score)) + geom_boxplot()
 89 | ```
 90 | 
 91 | 이로부터, 리뷰평점의 중간값은 G > R > PG > PG-13 의 순서임을
 92 | 알 수 있다. 
 93 | 그리고 이상치에 가까운 
 94 | 최고의 평점을 받은 R 등급 영화들이 있음을 알 수 있다.
 95 | 
 96 | (고급: 이 최고 평점을 받은 R 등급 영화들은 무엇일까?)
 97 | 
 98 | 
 99 | 유사한 시각화로, 각 등급별로 평점의 확률밀도함수를 
100 | 겹쳐 그려볼 수도 있다:
101 | ```{r}
102 | df2 %>%
103 |   filter(content_rating %in% c("G", "PG", "PG-13", "R")) %>%
104 |   ggplot(aes(imdb_score, fill=content_rating, linetype=content_rating)) + 
105 |   geom_density(alpha=.3)
106 | ```
107 | 
108 | (필자는 색맹이므로, `fill=` 옵션으로는 
109 | 각 집단이 충분히 구분이 되지 않아서 `linetype=`
110 | 옵션도 사용하였다)
111 | 이 시각화로부터 추가적으로 알 수 있는 것은
112 | G 등급 영화 중 평점이 높은 영화의 비중이 꾀
113 | 높다는 것이다. (아마도 디즈니 영화들일까?)
114 | 
115 | 조금 더 전통적인 통계학적 가설검정을 적용하자면
116 | 분산분석 (ANOVA; Analysis of Variance) 을
117 | 해 보면  된다.
118 | ```{r}
119 | summary(lm(imdb_score ~ content_rating, 
120 |            data=df2 %>% 
121 |              filter(content_rating %in% c("G", "PG", "PG-13", "R"))))
122 | ```
123 | 자료의 개수가 워낙 많아서이기도 하지만
124 | 등급 집단간에 평점 평균이 통계적으로 유의한 차이가 있음을
125 | 알 수 있다.
126 | 
127 | 
128 | ### iv. 페이스북 좋아요 개수와 리뷰평점의 사이의 관계는?
129 | 일단 페북 좋아요 개수(`move_facebook_likes`) 변수의
130 | 분포를 살펴보자.
131 | 꼬리가 아주 긴 분포이므로, 
132 | 일단 제곱근 변환을 해 주었다.
133 | (독자들은 log10 변환도 해 보길 권한다.)
134 | 
135 | ```{r}
136 | df2 %>%
137 |   ggplot(aes(movie_facebook_likes)) +
138 |   geom_histogram() +
139 |   scale_x_sqrt()
140 | ```
141 | 
142 | 이 시각화로부터 분포의 이상한 점이 눈에 띈다.
143 | 변환 후 분포 중간에 이상한 갭이 있다는 것이다.
144 | 
145 | (아직 필자는 그 이유를 찾지 못했으니, 알아낸 분은
146 | 공유 바랍니다)
147 | 
148 | 이에 반해 평점의 분포는 상당히 정상적이다:
149 | ```{r}
150 | df2 %>%
151 |   ggplot(aes(imdb_score)) +
152 |   geom_histogram()
153 | ```
154 | 
155 | 이제, 
156 | 제곱근 변환된 좋아요 개수와 스코어 간의 산점도를 그려보자.
157 | 
158 | ```{r}
159 | df2 %>%
160 |   ggplot(aes(movie_facebook_likes, imdb_score)) + 
161 |   geom_point() +
162 |   scale_x_sqrt() +
163 |   geom_smooth()
164 | ```
165 | 
166 | 평활 곡선으로부터 양의 상관관계가 있음을 알 수 있다.
167 | 어느정도는 상식적이게도
168 | "페북 좋아요 개수가 높을수록 리뷰 평점이 높다".
169 | 
170 | 
171 | 하지만 이 분석이 정확한 분석일까?
172 | 페북을 사람들이 사용한것은 비교적 최근의 일이다.
173 | 따라서 과거의 영화는 좋은 영화이더라도 
174 | 페북의 좋아요 개수가 적을 수도 있다.
175 | 
176 | 이를 확인하기 위해 년간 페북 좋아요 개수의 분포를 
177 | 살펴보자:
178 | ```{r}
179 | df2 %>%
180 |   ggplot(aes(as.factor(title_year), movie_facebook_likes)) +
181 |   geom_boxplot() +
182 |   scale_y_sqrt()
183 | ```
184 | 
185 | 예상대로, 2010년 이전과 이후의 좋아요 개수의 분포는
186 | 무척 다르다.
187 | 
188 | 따라서, 앞서와 같은 산점도를 
189 | 그리되  2010년 이후, 그리고 미국 영화로 제한하여
190 | 시각화 해 보자:
191 | ```{r}
192 | df2 %>%
193 |   filter(title_year > 2010 & country == "USA") %>%
194 |   ggplot(aes(movie_facebook_likes, imdb_score)) + 
195 |   geom_point() +
196 |   scale_x_sqrt() +
197 |   geom_smooth()
198 | ```
199 | 
200 | 마찬가지의 자료에서, 좋아요 개수가 100개가 넘는 
201 | 데이터에 관해 두 변수간의 상관관계는 높은 편이다:
202 | ```{r}
203 | df3 <- df2 %>%
204 |   filter(title_year > 2010 & country == "USA") %>%
205 |   filter(movie_facebook_likes > 100)
206 | cor(sqrt(df3$movie_facebook_likes), df3$imdb_score)
207 | ```
208 | 
209 | 선형회귀분석을 적용하면 모수추정과 가설검정 결과도 
210 | 얻을 수 있다:
211 | ```{r}
212 | summary(lm(imdb_score ~ sqrt(movie_facebook_likes), data=df3))
213 | 
214 | ```
215 | 
216 | 
217 | 
218 | 
219 | ## c. 이 데이터의 다른 흥미있는 시각화는 어떤 것이 있을까?
220 | (생략)
221 | 
222 | # 2 (포켓몬 데이터)
223 | 캐글 웹사이트에서 다음 포켓몬 데이터를 다운로드하자
224 | (<https://goo.gl/sMPKtX>, 혹은 <https://www.kaggle.com/abcsds/pokemon> 무료 캐글 계정이 필요하다).
225 | 이 데이터를 시각화하라. 
226 | <https://goo.gl/3fxt2x> 혹은 <https://www.kaggle.com/ndrewgele/visualizing-pok-mon-stats-with-seaborn>을 참고하라.
227 | 
228 | 
229 | 웹페이지에서 `pokemon.zip` 자료를 다운받은 후
230 | 다음처럼 R로 읽어들인다:
231 | ```{r}
232 | df_pkm <- read_csv("pokemon.zip")
233 | ```
234 | 
235 | 데이터의 대강 모양은 다음과 같다:
236 | ```{r}
237 | df_pkm %>% glimpse()
238 | ```
239 | 
240 | 다양한 시각화가 가능하겠지만
241 | 위의 예제 페이지에 나온 시각화를 해 보자면:
242 | ```{r}
243 | df_pkm %>% ggplot(aes(HP)) + geom_histogram()
244 | df_pkm %>% ggplot(aes(Attack)) + geom_histogram()
245 | df_pkm %>% ggplot(aes(HP, Attack)) + geom_point(alpha=.3)
246 | ```
247 | 
248 | 이 외에 다양한 시각화가 가능하겠지만,
249 | HP 와 Attack 간의 관계가 각 Type 에 따라 어떻게 
250 | 변하는지 알고자 한다면 다음과 같은
251 | `facet_wrap()` 함수가 유용하다:
252 | 
253 | ```{r}
254 | df_pkm %>% 
255 |   ggplot(aes(HP, Attack)) + 
256 |   geom_point(alpha=.3) +
257 |   # geom_smooth() + 
258 |   facet_wrap(~`Type 1`)
259 | ```
260 | 
261 | 


--------------------------------------------------------------------------------
/solutions/ch08-classification-solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "<따라 하며 배우는 데이터 과학> 8-9장 연습문제 해답"
  3 | author: "권재명"
  4 | date: "9/27/2017"
  5 | output:
  6 |   html_document:
  7 |     toc: true
  8 |     toc_depth: 3
  9 | ---
 10 | 
 11 | 저자 책 웹페이지: <https://dataninja.me/ipds-kr/>
 12 | 
 13 | 
 14 | ```{r setup, include=FALSE}
 15 | knitr::opts_chunk$set(echo = TRUE, cache=TRUE)
 16 | ```
 17 | 
 18 | # 원 문제:
 19 | 
 20 | - Ch 8 빅데이터 분류분석 I: 기본 개념과 로지스틱 모형
 21 | - Ch 9 빅데이터 분류분석 II: 라쏘와 랜덤 포레스트
 22 | 
 23 | <https://goo.gl/hmyTre> 혹은 
 24 | <https://archive.ics.uci.edu/ml/datasets.html> 
 25 | 에서 고차원 분류분석 데이터를 찾아서 
 26 | 로지스틱 분류분석을 실행하고, 
 27 | 결과를 슬라이드 10여 장 내외로 요약하라.
 28 | 
 29 | UCI 보다는 캐글에 있는 다음 자료를 분석해 보자.
 30 | <https://www.kaggle.com/ludobenistant/hr-analytics>
 31 | 
 32 | 
 33 | # R 환경 준비
 34 | 일단은 필수패키지인 `tidyverse`, 그리고 
 35 | 머신러닝을 위한 몇가지 패키지를 로드하자.
 36 | (로딩 메시지를 감추기 위해 `suppressMessages()` 명령을 사용.)
 37 | ```{r}
 38 | # install.packages("tidyverse")
 39 | suppressMessages(library(tidyverse))
 40 | 
 41 | # install.packages(c("ROCR", "MASS", "glmnet", "randomForest", "gbm", "rpart", "boot"))
 42 | suppressMessages(library(gridExtra))
 43 | suppressMessages(library(ROCR))
 44 | suppressMessages(library(MASS))
 45 | suppressMessages(library(glmnet))
 46 | suppressMessages(library(randomForest))
 47 | suppressMessages(library(gbm))
 48 | suppressMessages(library(rpart))
 49 | suppressMessages(library(boot))
 50 | ```
 51 | 
 52 | 
 53 | 책에서 기술한대로 이항 오차 함수, 그리고 `panel.cor` 함수를 정의하자:
 54 | ```{r}
 55 | binomial_deviance <- function(y_obs, yhat){
 56 |   epsilon = 0.0001
 57 |   yhat = ifelse(yhat < epsilon, epsilon, yhat)
 58 |   yhat = ifelse(yhat > 1-epsilon, 1-epsilon, yhat)
 59 |   a = ifelse(y_obs==0, 0, y_obs * log(y_obs/yhat))
 60 |   b = ifelse(y_obs==1, 0, (1-y_obs) * log((1-y_obs)/(1-yhat)))
 61 |   return(2*sum(a + b))
 62 | }
 63 | 
 64 | # exmaple(pairs) 에서 따옴
 65 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
 66 |   usr <- par("usr"); on.exit(par(usr))
 67 |   par(usr = c(0, 1, 0, 1))
 68 |   r <- abs(cor(x, y))
 69 |   txt <- format(c(r, 0.123456789), digits = digits)[1]
 70 |   txt <- paste0(prefix, txt)
 71 |   if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
 72 |   text(0.5, 0.5, txt, cex = cex.cor * r)
 73 | }
 74 | 
 75 | ```
 76 | 
 77 | 자료를 `human-resources-analytics.zip` 파일로 다운받은 후 다음처럼 R에 읽어들인다:
 78 | 
 79 | ```{r}
 80 | df <- read_csv("human-resources-analytics.zip")
 81 | glimpse(df)
 82 | ```
 83 | 
 84 | 분석의 목적은 다른 변수를 이용하여 `left` 여부를 예측하는 것이다.
 85 | 
 86 | 각 변수들의 요약통계량을 살펴보자:
 87 | ```{r}
 88 | summary(df)
 89 | ```
 90 | 
 91 | 범주형 변수들의 도수분포는 다음과 같다:
 92 | ```{r}
 93 | table(df$left)
 94 | table(df$sales)
 95 | table(df$salary)
 96 | ```
 97 | 
 98 | 수량형 변수들간의 관계는 산점도 행렬로 살펴볼 수 있다:
 99 | ```{r}
100 | set.seed(2017)
101 | df %>% 
102 |   dplyr::select(-sales, -salary) %>% 
103 |   sample_n(500) %>% 
104 |   pairs(lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
105 |     upper.panel = panel.cor)
106 | ```
107 | (`select` 함수가 `MASS` 라이브러리에 재정의 된 관계로 `dplyr::select()`로 표기했다. )
108 | 반응변수와 큰 상관관계가 있는 변수는 satisfaction_level 임을 알 수 있고,
109 | 설명변수중 past_evaluation, number_projects, average_monthly_hours 간에
110 | 비교적 높은 상관관계가 있음을 알 수 있다.
111 | 
112 | (혹시 시간이 걸리더라도 좀 더 고급진 산점도행렬을 얻고자 한다면 다음처럼
113 | `GGally::ggparis()` 함수를 사용하자)
114 | ```{r}
115 | # install.packages("GGally")
116 | # suppressMessages(library(GGally))
117 | # set.seed(2017); df %>% sample_n(1000)  %>% GGally::ggpairs()
118 | ```
119 | 
120 | 
121 | 
122 | ## 훈련, 검증, 테스트셋의 구분
123 | 
124 | 
125 | 모형행렬은 반응변수 `left` 를 제외한 모든 변수들을
126 | `model.matrix()` 에 입력해주면 얻을 수 있다:
127 | ```{r}
128 | x <- model.matrix( ~ . - left, data=df)
129 | glimpse(x)
130 | colnames(x)
131 | dim(x)
132 | ```
133 | 모형의 차원은 $p=19$ 임을 알 수 있다.
134 | 
135 | 원 데이터를 6:4:4 비율로 훈련, 검증, 테스트셋으로 나누도록 하자.
136 | (재현 가능성을 위해 `set.seed()`를 사용했다.)
137 | ```{r}
138 | set.seed(2017)
139 | n <- nrow(df)
140 | idx <- 1:n
141 | training_idx <- sample(idx, n * .60)
142 | idx <- setdiff(idx, training_idx)
143 | validate_idx = sample(idx, n * .20)
144 | test_idx <- setdiff(idx, validate_idx)
145 | length(training_idx)
146 | length(validate_idx)
147 | length(test_idx)
148 | training <- df[training_idx,]
149 | validation <- df[validate_idx,]
150 | test <- df[test_idx,]
151 | ```
152 | 
153 | 
154 | 
155 | ## A. 로지스틱 회귀분석
156 | 
157 | ```{r}
158 | df_glm_full <- glm(left ~ ., data=training, family=binomial)
159 | summary(df_glm_full)
160 | ```
161 | 
162 | 
163 | 로지스틱 모형의 예측 정확도 지표는 다음처럼 계산하고 시각화할 수 있다:
164 | ```{r}
165 | y_obs <- validation$left
166 | yhat_lm <- predict(df_glm_full, newdata=validation, type='response')
167 | ggplot(data.frame(y_obs, yhat_lm),
168 |              aes(yhat_lm, fill=factor(y_obs))) +
169 |   geom_density(alpha=.5)
170 | binomial_deviance(y_obs, yhat_lm)
171 | pred_lm <- prediction(yhat_lm, y_obs)
172 | perf_lm <- performance(pred_lm, measure = "tpr", x.measure = "fpr")
173 | plot(perf_lm, col='black', main="ROC Curve for GLM")
174 | performance(pred_lm, "auc")@y.values[[1]]
175 | ```
176 | 
177 | 
178 | ## B. glmnet 함수를 통한 라쏘 모형, 능형회귀, 변수선택
179 | 
180 | ```{r}
181 | xx <- model.matrix(left ~ .-1, df)
182 | x <- xx[training_idx, ]
183 | y <- training$left
184 | df_cvfit <- cv.glmnet(x, y, family = "binomial")
185 | ```
186 | 
187 | 
188 | ```{r}
189 | plot(df_cvfit)
190 | ```
191 | 
192 | 
193 | ```{r}
194 | y_obs <- validation$left
195 | yhat_glmnet <- predict(df_cvfit, s="lambda.1se", newx=xx[validate_idx,], type='response')
196 | yhat_glmnet <- yhat_glmnet[,1] # change to a vectro from [n*1] matrix
197 | binomial_deviance(y_obs, yhat_glmnet)
198 | pred_glmnet <- prediction(yhat_glmnet, y_obs)
199 | perf_glmnet <- performance(pred_glmnet, measure="tpr", x.measure="fpr")
200 | performance(pred_glmnet, "auc")@y.values[[1]]
201 | ```
202 | 
203 | 
204 | ## C. 나무모형
205 | 
206 | 나무모형을 적합하는 `rpart::rpart()` 함수를 적용할 때 주의할 사항은
207 | 수량형 반응변수 `left` 를 인자로 변환해주어서 
208 | 회귀 나무모형이 아니라 분류분석 나무모형을 적합하는 것이다.
209 | ```{r}
210 | df_tr <- rpart(as.factor(left) ~ ., data = training)
211 | df_tr
212 | # printcp(df_tr)
213 | # summary(df_tr)
214 | opar <- par(mfrow = c(1,1), xpd = NA)
215 | plot(df_tr)
216 | text(df_tr, use.n = TRUE)
217 | par(opar)
218 | ```
219 | 
220 | 나무모형의 출력 결과를 살펴보면 어떤 변수들의 조합이
221 | 직원의 이직율을 높이는 지 알수 있다.
222 | 그림에서 가장 "성공"(이직)의 비율이 높은 잎(leaf)는 가장 오른쪽의 
223 | 잎, 즉:
224 | 
225 | - `3) satisfaction_level< 0.465 2551 1025 1 (0.40180321 0.59819679)`
226 | -  `7) number_project< 2.5 1069  125 1 (0.11693171 0.88306829)`
227 | - `15) last_evaluation< 0.575 985   48 1 (0.04873096 0.95126904) *`
228 | 
229 | 만족도가 낮고, 일한 프로젝트가 적고, 마지막 업무평가가 좋지 않은 집단임을 알 수 있다.
230 | 
231 | ```{r}
232 | yhat_tr <- predict(df_tr, validation)[, "1"]
233 | binomial_deviance(y_obs, yhat_tr)
234 | pred_tr <- prediction(yhat_tr, y_obs)
235 | perf_tr <- performance(pred_tr, measure = "tpr", x.measure = "fpr")
236 | performance(pred_tr, "auc")@y.values[[1]]
237 | ```
238 | 
239 | 
240 | ## D. 랜덤 포레스트 -----------
241 | `randomForest()` 함수를 적용할 때 주의할 사항은:
242 | 
243 | 1. 앞서 나무모형과 마찬가지로
244 |   수량형 반응변수 `left` 를 인자로 변환해주어서 
245 |   회귀모형이 아닌 분류분석이 실행되도록 한다.
246 | 2. 설명변수중 character 형인 두 변수 `sales`, `salary` 도 인자형으로 
247 |   바꿔줘야 한다.
248 | 
249 | 
250 | ```{r}
251 | set.seed(2017)
252 | df_rf <- randomForest(as.factor(left) ~ ., training %>%
253 |                         mutate(salary=as.factor(salary),
254 |                                sales=as.factor(sales)))
255 | df_rf
256 | ```
257 | 
258 | 랜덤포레스트 모형의 오류 감소 추세 그래프는 다음과 같다:
259 | ```{r}
260 | plot(df_rf)
261 | ```
262 | 
263 | 각 변수들의 모형에의 기여도는 다음과 같다:
264 | ```{r}
265 | varImpPlot(df_rf)
266 | ```
267 | 
268 | 예측을 실행할 때는, 훈련셋과 마찬가지의 `as.factor` 변환을 
269 | 같은 변수에 적용해야 한다:
270 | ```{r}
271 | yhat_rf <- predict(df_rf,
272 |                    newdata=validation %>%
273 |                      mutate(salary=as.factor(salary), sales=as.factor(sales)),
274 |                    type='prob')[,'1']
275 | binomial_deviance(y_obs, yhat_rf)
276 | pred_rf <- prediction(yhat_rf, y_obs)
277 | perf_rf <- performance(pred_rf, measure="tpr", x.measure="fpr")
278 | performance(pred_tr, "auc")@y.values[[1]]
279 | ```
280 | 
281 | 
282 | ## E. 부스팅 
283 | (결과 생략)
284 | 관심있는 독자는 다음 코드로 부스팅 모형을 실행할 수 있다.
285 | 앞서 랜덤포레스트 모형과 마찬가지로, 문자형 변수를 인자형 변수로 먼저 변환해
286 | 주어야 한다.
287 | ```{r eval=FALSE}
288 | set.seed(2017)
289 | df_gbm <- gbm(left ~ ., data=training %>%
290 |                      mutate(salary=as.factor(salary), sales=as.factor(sales)),
291 |              distribution="bernoulli",
292 |              n.trees=1000, cv.folds=3, verbose=TRUE)
293 | (best_iter <- gbm.perf(df_gbm, method="cv"))
294 | 
295 | 
296 | yhat_gbm <- predict(df_gbm, n.trees=best_iter,
297 |                     newdata=validation %>%
298 |                      mutate(salary=as.factor(salary), sales=as.factor(sales)), 
299 |                     type='response')
300 | binomial_deviance(y_obs, yhat_gbm)
301 | pred_gbm <- prediction(yhat_gbm, y_obs)
302 | perf_gbm <- performance(pred_gbm, measure="tpr", x.measure="fpr")
303 | performance(pred_gbm, "auc")@y.values[[1]]
304 | ```
305 | 
306 | 
307 | ## 모형 비교, 최종 모형 선택, 일반화 성능 평가
308 | 
309 | 다음과 같은 시각화로 각 예측모형들의 예측확률들의 관계를 알 수 있다:
310 | ```{r}
311 | pairs(data.frame(y_obs=y_obs,
312 |                  yhat_lm=yhat_lm,
313 |                  yhat_glmnet=c(yhat_glmnet),
314 |                  yhat_tr=yhat_tr,
315 |                  yhat_rf=yhat_rf),
316 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
317 |       upper.panel = panel.cor)
318 | ```
319 | 로지스틱 모형과 glmnet 모형은 매우 유사한 결과를 주는 것을 알 수 있다.
320 | 그리고, 나무 모형과 랜덤포레스트 모형도 상관관계가 높다.
321 | 반응변수의 관측치와 가장 상관관계가 높은, 즉 예측력이 높은 방법은 랜덤포레스트이다.
322 | 
323 | 테스트셋을 이용해 일반화 능력을 계산해보자:
324 | ```{r}
325 | y_obs_test <- test$left
326 | yhat_rf_test <- predict(df_rf,
327 |                    newdata=test %>%
328 |                      mutate(salary=as.factor(salary), sales=as.factor(sales)),
329 |                    type='prob')[,'1']
330 | binomial_deviance(y_obs_test, yhat_rf_test)
331 | pred_rf_test <- prediction(yhat_rf_test, y_obs_test)
332 | performance(pred_rf_test, "auc")@y.values[[1]]
333 | ```
334 | 
335 | 
336 | 마지막으로 ROC 커브를 통해 네 예측방법을 비교해보자.
337 | ```{r}
338 | plot(perf_lm, col='black', main="ROC Curve")
339 | plot(perf_glmnet, add=TRUE, col='blue')
340 | plot(perf_tr, add=TRUE, col='red')
341 | plot(perf_rf, add=TRUE, col='cyan')
342 | legend('bottomright', inset=.1,
343 |        legend=c("GLM", "glmnet", "Tree", "RF"),
344 |        col=c('black', 'blue', 'red', 'cyan'), lty=1, lwd=2)
345 | ```
346 | 
347 | ## 결론
348 | 자료 자체가 가상의 (synthetic), 시뮬레이트 된 자료이므로
349 | 비교적 간단한 모형인 나무모형으로도 무척 높은 예측력을 얻을 수 있었다.
350 | 변수 해석에 관해서는 나무모형 결과를 참조하라.
351 | 


--------------------------------------------------------------------------------
/solutions/ch10-classification-solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "<따라 하며 배우는 데이터 과학> 10-11장 연습문제 해답"
  3 | author: "권재명"
  4 | date: "9/28/2017"
  5 | output:
  6 |   html_document:
  7 |     toc: true
  8 |     toc_depth: 3
  9 | ---
 10 | 
 11 | 저자 책 웹페이지: <https://dataninja.me/ipds-kr/>
 12 | 
 13 | 
 14 | ```{r setup, include=FALSE}
 15 | knitr::opts_chunk$set(echo = TRUE, cache=TRUE)
 16 | ```
 17 | 
 18 | # R 환경 준비
 19 | 일단은 필수패키지인 `tidyverse`, 그리고 
 20 | 머신러닝을 위한 몇가지 패키지를 로드하자.
 21 | (로딩 메시지를 감추기 위해 `suppressMessages()` 명령을 사용.)
 22 | ```{r}
 23 | # install.packages("tidyverse")
 24 | suppressMessages(library(tidyverse))
 25 | 
 26 | # install.packages(c("ROCR", "MASS", "glmnet", "randomForest", "gbm", "rpart", "boot"))
 27 | suppressMessages(library(gridExtra))
 28 | suppressMessages(library(ROCR))
 29 | suppressMessages(library(MASS))
 30 | suppressMessages(library(glmnet))
 31 | suppressMessages(library(randomForest))
 32 | suppressMessages(library(gbm))
 33 | suppressMessages(library(rpart))
 34 | suppressMessages(library(boot))
 35 | ```
 36 | 
 37 | 책에서 기술한대로 이항 오차 함수, 그리고 `panel.cor` 함수를 정의하자:
 38 | ```{r}
 39 | binomial_deviance <- function(y_obs, yhat){
 40 |   epsilon = 0.0001
 41 |   yhat = ifelse(yhat < epsilon, epsilon, yhat)
 42 |   yhat = ifelse(yhat > 1-epsilon, 1-epsilon, yhat)
 43 |   a = ifelse(y_obs==0, 0, y_obs * log(y_obs/yhat))
 44 |   b = ifelse(y_obs==1, 0, (1-y_obs) * log((1-y_obs)/(1-yhat)))
 45 |   return(2*sum(a + b))
 46 | }
 47 | 
 48 | # exmaple(pairs) 에서 따옴
 49 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
 50 |   usr <- par("usr"); on.exit(par(usr))
 51 |   par(usr = c(0, 1, 0, 1))
 52 |   r <- abs(cor(x, y))
 53 |   txt <- format(c(r, 0.123456789), digits = digits)[1]
 54 |   txt <- paste0(prefix, txt)
 55 |   if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
 56 |   text(0.5, 0.5, txt, cex = cex.cor * r)
 57 | }
 58 | 
 59 | ```
 60 | 
 61 | 
 62 | 
 63 | # 1. (위스콘신 유방암 데이터 II)
 64 | 위스콘신 유방암 데이터 중 약간 다른 데이터인 <https://goo.gl/gY8Iri> 혹은 <http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data>
 65 |   를 분석하라. 변수에 대한 설명은
 66 | <https://goo.gl/CqLTuk> 혹은
 67 | <http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names>
 68 |   에서 볼 수 있다. 분석의 목적은 다른 10개의 변수를 사용하여 class = 2(양성; benign), 4(악성; malign) 값을 예측하는 것이다.
 69 | 
 70 | 우선 다음 명령으로 자료를 다운받자:
 71 | ```{bash eval=FALSE}
 72 | wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data 
 73 | wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names 
 74 | ```
 75 | 
 76 | R 로 자료를 읽어들인 후, 
 77 | 적절한 변수명을 부여하고, 
 78 | ID 변수 ("Sample code number")를 제거하고,
 79 | 반응변수인 `class`를 0(Benign, 원래값 = 2)과 1(Malignant, 원래값 = 4)로 변환하자.
 80 | 원 데이터 파일에서, 결측치를 `?`로 나타내고 있으므로 `read_csv()`문의  `na=` 옵션을 
 81 | 사용하여 결측치를 올바로 읽어들여야 한다.
 82 | ```{r}
 83 | df <- read_csv("breast-cancer-wisconsin.data", col_names = FALSE, na="?")
 84 | names(df) <- tolower(gsub(" ", "_",
 85 |              c("Sample code number",
 86 |                "Clump Thickness", 
 87 |                "Uniformity of Cell Size", 
 88 |                "Uniformity of Cell Shape", 
 89 |                "Marginal Adhesion", 
 90 |                "Single Epithelial Cell Size", 
 91 |                "Bare Nuclei", 
 92 |                "Bland Chromatin", 
 93 |                "Normal Nucleoli", 
 94 |                "Mitoses", 
 95 |                "Class")))
 96 | df <- df %>%
 97 |   mutate(is_malig=ifelse(class==4, 1, 0)) %>%
 98 |   dplyr::select(-sample_code_number, -class)
 99 | glimpse(df)
100 | ```
101 | 
102 | 
103 | 
104 | ##  a. (결측치 처리)
105 | 설명변수중에결측치가있는가?
106 | 어느변수에몇개의결측치가있는가?
107 | 어떻게해결하는것이좋 을까? 
108 | (관심 있는 독자는 Saar-Tsechansky & Provost (2007) 등을 참고하라.)
109 | 
110 | 결측치를 찾아내는 간단한 방법은 `summary()` 함수를 사용하는 것이다:
111 | ```{r}
112 | summary(df)
113 | ```
114 | `bare_nuclei` 변수에 16개의 결측치가 있음을 알 수 있다.
115 | 
116 | 결측치를 해결하는 다양한 방법이 있지만 여기서는 간단히 
117 | 중앙값으로 대치하도록 하자.
118 | ```{r}
119 | df <- df %>%
120 |   mutate(bare_nuclei=ifelse(is.na(bare_nuclei), 
121 |                             median(df$bare_nuclei, na.rm=TRUE),
122 |                             bare_nuclei))
123 | summary(df)
124 | ```
125 | 
126 | 
127 | ##  b. (분류분석)
128 | 결측치를 표본의 중앙값으로 대치하고 분류 예측분석을 시행하라. 
129 | 어떤 모형이 가장 성능이 좋은 가? 결과를 슬라이드 10여 장 내외로 요약하라.
130 | 
131 | 
132 | 수량형 변수들간의 관계는 산점도 행렬로 살펴볼 수 있다:
133 | ```{r}
134 | set.seed(2017)
135 | df %>% 
136 |   sample_n(500) %>% 
137 |   pairs(lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
138 |     upper.panel = panel.cor)
139 | ```
140 | (`select` 함수가 `MASS` 라이브러리에 재정의 된 관계로 `dplyr::select()`로 표기했다. )
141 | 대부분의 설명변수가 반응변수와 상관관계가 높음을 알 수 있다.
142 | 설명변수 간의 상관관계도 높은 편이다.
143 | 
144 | 
145 | 
146 | 
147 | 
148 | 
149 | 
150 | 
151 | ## 훈련, 검증, 테스트셋의 구분
152 | 
153 | 원 데이터를 6:4:4 비율로 훈련, 검증, 테스트셋으로 나누도록 하자.
154 | (재현 가능성을 위해 `set.seed()`를 사용했다.)
155 | ```{r}
156 | set.seed(2017)
157 | n <- nrow(df)
158 | idx <- 1:n
159 | training_idx <- sample(idx, n * .60)
160 | idx <- setdiff(idx, training_idx)
161 | validate_idx = sample(idx, n * .20)
162 | test_idx <- setdiff(idx, validate_idx)
163 | length(training_idx)
164 | length(validate_idx)
165 | length(test_idx)
166 | training <- df[training_idx,]
167 | validation <- df[validate_idx,]
168 | test <- df[test_idx,]
169 | ```
170 | 
171 | 
172 | 
173 | ## A. 로지스틱 회귀분석
174 | 
175 | ```{r}
176 | df_glm_full <- glm(is_malig ~ ., data=training, family=binomial)
177 | summary(df_glm_full)
178 | ```
179 | 통계적으로 유의한 변수들로는,
180 | `bare_nuclei`값이 클수록,
181 | `clump_thickness`값이 클수록, 
182 | 그리고 `bland_chromatin` 값이 클수록, 
183 | 악성일 확율이 높음을 알 수 있다.
184 | 
185 | 로지스틱 모형의 예측 정확도 지표는 다음처럼 계산하고 시각화할 수 있다:
186 | ```{r}
187 | y_obs <- validation$is_malig
188 | yhat_lm <- predict(df_glm_full, newdata=validation, type='response')
189 | ggplot(data.frame(y_obs, yhat_lm),
190 |              aes(yhat_lm, fill=factor(y_obs))) +
191 |   geom_density(alpha=.5)
192 | binomial_deviance(y_obs, yhat_lm)
193 | pred_lm <- prediction(yhat_lm, y_obs)
194 | perf_lm <- performance(pred_lm, measure = "tpr", x.measure = "fpr")
195 | performance(pred_lm, "auc")@y.values[[1]]
196 | ```
197 | 
198 | 
199 | ## B. glmnet 함수를 통한 라쏘 모형, 능형회귀, 변수선택
200 | 
201 | ```{r}
202 | xx <- model.matrix(is_malig ~ .-1, df)
203 | x <- xx[training_idx, ]
204 | y <- training$is_malig
205 | df_cvfit <- cv.glmnet(x, y, family = "binomial")
206 | ```
207 | 
208 | 
209 | ```{r}
210 | plot(df_cvfit)
211 | ```
212 | 
213 | 
214 | ```{r}
215 | y_obs <- validation$is_malig
216 | yhat_glmnet <- predict(df_cvfit, s="lambda.1se", newx=xx[validate_idx,], type='response')
217 | yhat_glmnet <- yhat_glmnet[,1] # change to a vectro from [n*1] matrix
218 | binomial_deviance(y_obs, yhat_glmnet)
219 | pred_glmnet <- prediction(yhat_glmnet, y_obs)
220 | perf_glmnet <- performance(pred_glmnet, measure="tpr", x.measure="fpr")
221 | performance(pred_glmnet, "auc")@y.values[[1]]
222 | ```
223 | 
224 | 
225 | ## C. 나무모형
226 | 
227 | 나무모형을 적합하는 `rpart::rpart()` 함수를 적용할 때 주의할 사항은
228 | 수량형 반응변수 `is_malig` 를 인자로 변환해주어서 
229 | 회귀 나무모형이 아니라 분류분석 나무모형을 적합하는 것이다.
230 | ```{r}
231 | df_tr <- rpart(as.factor(is_malig) ~ ., data = training)
232 | df_tr
233 | # printcp(df_tr)
234 | # summary(df_tr)
235 | opar <- par(mfrow = c(1,1), xpd = NA)
236 | plot(df_tr)
237 | text(df_tr, use.n = TRUE)
238 | par(opar)
239 | ```
240 | 
241 | 나무모형의 출력 결과를 살펴보면 어떤 변수들의 조합이
242 | 가장 악성 종양일 가능성이 높은지를 알 수 있다.
243 | 그림에서 가장 "악성(is_malig)"의 비율이 높은 잎(leaf)는 가장 오른쪽의 
244 | 잎이다. 즉, 다음 조건을 만족할 때, 103 케이스 중, 101개의 관측치가 악성이였다:
245 | 
246 | - `3) uniformity_of_cell_size>=3.5 131   9 1 (0.06870229 0.93129771)`
247 | - `7) bland_chromatin>=3.5 103   2 1 (0.01941748 0.98058252) *`
248 | 
249 | 
250 | 
251 | ```{r}
252 | yhat_tr <- predict(df_tr, validation)[, "1"]
253 | binomial_deviance(y_obs, yhat_tr)
254 | pred_tr <- prediction(yhat_tr, y_obs)
255 | perf_tr <- performance(pred_tr, measure = "tpr", x.measure = "fpr")
256 | performance(pred_tr, "auc")@y.values[[1]]
257 | ```
258 | 
259 | 
260 | ## D. 랜덤 포레스트
261 | `randomForest()` 함수를 적용할 때 주의할 사항은
262 | 앞서 나무모형과 마찬가지로
263 |   수량형 반응변수 `is_malig` 를 인자로 변환해주어서 
264 |   회귀모형이 아닌 분류분석이 실행되도록 한다.
265 | 
266 | 
267 | ```{r}
268 | set.seed(2017)
269 | df_rf <- randomForest(as.factor(is_malig) ~ ., training)
270 | df_rf
271 | ```
272 | 
273 | 랜덤포레스트 모형의 오류 감소 추세 그래프는 다음과 같다:
274 | ```{r}
275 | plot(df_rf)
276 | ```
277 | 
278 | 각 변수들의 모형에의 기여도는 다음과 같다:
279 | ```{r}
280 | varImpPlot(df_rf)
281 | ```
282 | 
283 | 랜덤포레스트 모형의 예측결과는 다음과 같다:
284 | ```{r}
285 | yhat_rf <- predict(df_rf, newdata=validation, type='prob')[,'1']
286 | binomial_deviance(y_obs, yhat_rf)
287 | pred_rf <- prediction(yhat_rf, y_obs)
288 | perf_rf <- performance(pred_rf, measure="tpr", x.measure="fpr")
289 | performance(pred_tr, "auc")@y.values[[1]]
290 | ```
291 | 
292 | 
293 | ## E. 부스팅 
294 | (결과 생략)
295 | 
296 | ## 모형 비교, 최종 모형 선택, 일반화 성능 평가
297 | 
298 | 다음과 같은 시각화로 각 예측모형들의 예측확률들의 관계를 알 수 있다:
299 | ```{r}
300 | pairs(data.frame(y_obs=y_obs,
301 |                  yhat_lm=yhat_lm,
302 |                  yhat_glmnet=c(yhat_glmnet),
303 |                  yhat_tr=yhat_tr,
304 |                  yhat_rf=yhat_rf),
305 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
306 |       upper.panel = panel.cor)
307 | ```
308 | 고려한 모든 모형이 유사한 예측결과를 줌을 알 수 있다.
309 | 
310 | 
311 | 검증셋 AUC 값이 가장 높은 모형은 간단한 선형모형이었다.
312 | 테스트셋을 이용해 선형모형의 일반화 능력을 계산해보자:
313 | ```{r}
314 | y_obs_test <- test$is_malig
315 | yhat_glm_test <- predict(df_glm_full, newdata=test, type='response')
316 | binomial_deviance(y_obs_test, yhat_glm_test)
317 | pred_glm_test <- prediction(yhat_glm_test, y_obs_test)
318 | performance(pred_glm_test, "auc")@y.values[[1]]
319 | ```
320 | 
321 | 
322 | 마지막으로 ROC 커브를 통해 네 예측방법을 비교해보자.
323 | ```{r}
324 | plot(perf_lm, col='black', main="ROC Curve")
325 | plot(perf_glmnet, add=TRUE, col='blue')
326 | plot(perf_tr, add=TRUE, col='red')
327 | plot(perf_rf, add=TRUE, col='cyan')
328 | legend('bottomright', inset=.1,
329 |        legend=c("GLM", "glmnet", "Tree", "RF"),
330 |        col=c('black', 'blue', 'red', 'cyan'), lty=1, lwd=2)
331 | ```
332 | 
333 | ## 결론
334 | 반응변수가 비교적 예측이 쉬운 변수이다.
335 | 간단한 로지스틱 모형으로 높은 예측력을 얻을 수 있다.
336 | 변수 해석에 관해서는 로지스틱 모형 결과와, 나무모형 결과를 참조하라.
337 | 
338 | 
339 | # 2. (위스콘신 유방암 데이터 III)
340 |   2 위스콘신 유방암 데이터 중 또 다른 데이터인 <https://goo.gl/KaZD7Y>
341 |     혹은 <http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wpbc.data>
342 |     를 분석하라. 이 진단(diagnostics) 데 이터에 대한 설명은 <https://goo.gl/mVFQUa> 혹은
343 |   <http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wpbc.names>
344 | 에서 볼 수 있다. 두 가지 분석이 가능하다. 
345 | 
346 | - (1) 2열의 outcome 변수를 예측하기[분류분석];
347 | - (2) 재발(recurrent)한 관측치들에 대해서 3열의 
348 |     재발 기간(time to recur) 예측하 기[회귀분석].
349 | 
350 | 이 중 분류분석인 (1)을 시행하라. 분석 결과를 슬라이드 10여 장 내외로 요약하라.
351 | 
352 | 우선 자료를 다운로드한다:
353 | ```{bash eval=FALSE}
354 | wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wpbc.data
355 | wget http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wpbc.names
356 | ```
357 | 
358 | (생략. 위와 같은 방법을 적용하면 된다.)
359 | 
360 | 
361 | 
362 | # 3 (스팸 데이터)
363 | 스팸 데이터 <https://goo.gl/gIZU9C> 혹은 <https://archive.ics.uci.edu/ml/datasets/Spambase> 
364 | 를 분석하라. 어떤 모형이 가장 높은 성능을 주는가? 
365 | 분석 결과를 슬 라이드 10여 장 내외로 요약하라.
366 | 
367 | (생략. 본문의 11장을 참조.)
368 | 
369 | 
370 | # 4 (고차원 분류분석)
371 | <https://archive.ics.uci.edu/ml/datasets> 혹은
372 | <https://www.kaggle.com/datasets> 에서 다른 고차원 분류분석 데이터를 찾아서 본문에 설명한 분석을 실행하고, 
373 | 결과를 슬라이드 10여 장 내외로 요약하라.
374 | 
375 | (생략. 8-9장 연습문제 해답 참조.
376 | <http://rpubs.com/dataninja/ipds-kr-ch08> )
377 | 
378 | 


--------------------------------------------------------------------------------
/solutions/ch13-regression-solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "<따라 하며 배우는 데이터 과학> 13-14장 연습문제 해답"
  3 | author: "권재명"
  4 | date: "9/29/2017"
  5 | output:
  6 |   html_document:
  7 |     toc: true
  8 |     toc_depth: 3
  9 | ---
 10 | 
 11 | 저자 책 웹페이지: <https://dataninja.me/ipds-kr/>
 12 | 
 13 | 
 14 | ```{r setup, include=FALSE}
 15 | knitr::opts_chunk$set(echo = TRUE, cache=TRUE)
 16 | ```
 17 | 
 18 | # R 환경 준비
 19 | 일단은 필수패키지인 `tidyverse`, 그리고 
 20 | 머신러닝을 위한 몇가지 패키지를 로드하자.
 21 | (로딩 메시지를 감추기 위해 `suppressMessages()` 명령을 사용.)
 22 | ```{r}
 23 | # install.packages("tidyverse")
 24 | suppressMessages(library(tidyverse))
 25 | 
 26 | # install.packages(c("ROCR", "MASS", "glmnet", "randomForest", "gbm", "rpart", "boot"))
 27 | suppressMessages(library(gridExtra))
 28 | suppressMessages(library(ROCR))
 29 | suppressMessages(library(MASS))
 30 | suppressMessages(library(glmnet))
 31 | suppressMessages(library(randomForest))
 32 | suppressMessages(library(gbm))
 33 | suppressMessages(library(rpart))
 34 | suppressMessages(library(boot))
 35 | ```
 36 | 
 37 | 책에서 기술한대로 RMSE (root mean squared error), 
 38 | MAE (median absolute error),
 39 | `panel.cor` 함수를 정의하자:
 40 | ```{r}
 41 | rmse <- function(yi, yhat_i){
 42 |   sqrt(mean((yi - yhat_i)^2))
 43 | }
 44 | 
 45 | mae <- function(yi, yhat_i){
 46 |   mean(abs(yi - yhat_i))
 47 | }
 48 | 
 49 | # exmaple(pairs) 에서 따옴
 50 | panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
 51 |   usr <- par("usr"); on.exit(par(usr))
 52 |   par(usr = c(0, 1, 0, 1))
 53 |   r <- abs(cor(x, y))
 54 |   txt <- format(c(r, 0.123456789), digits = digits)[1]
 55 |   txt <- paste0(prefix, txt)
 56 |   if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
 57 |   text(0.5, 0.5, txt, cex = cex.cor * r)
 58 | }
 59 | 
 60 | ```
 61 | 
 62 | 
 63 | 
 64 | # 13-1. (아이오와 주 주택 가격데이터 분석)
 65 | 아이오와 주의 에임스시 주택 가격데이터(De Cock, 2011)를 구하여 
 66 | 회귀분석을 행하라. 
 67 | 데이터는 <https://goo.gl/ul7Ub7>
 68 | (<https://www.kaggle.com/c/house-prices-advanced-regression-techniques>)
 69 | 혹은 
 70 | <https://goo.gl/8gKgaT> (<http://www.amstat.org/publications/jse/v19n3/decock/AmesHousing.xls>)
 71 | <https://goo.gl/qgVg2z> (<https://ww2.amstat.org/publications/jse/v19n3/decock/AmesHousing.txt>) 에서 구할 수 있다. 
 72 | 
 73 | 변수 설명은 
 74 | <https://goo.gl/2vcCfT> (<https://ww2.amstat.org/publications/jse/v19n3/decock/DataDocumentation.txt>)
 75 | 를 참조하라.
 76 | <https://ww2.amstat.org/publications/jse/v19n3/decock.pdf> 문서를 참조해도 좋다.
 77 | 
 78 | 이 데이터에 대한 회귀분석을 행하라. 
 79 | 본문에서 기술한 방법 중 어떤 회귀분 석 방법이 가장 정확한 결과를 주는가? 
 80 | 결과를 보고서로 정리하라.
 81 | 
 82 | ## 자료 취득
 83 | 우선 다음 명령으로 자료를 다운받자:
 84 | ```{bash eval=FALSE}
 85 | wget https://ww2.amstat.org/publications/jse/v19n3/decock/AmesHousing.txt
 86 | wget https://ww2.amstat.org/publications/jse/v19n3/decock/AmesHousing.xls
 87 | ```
 88 | 
 89 | R 로 자료를 읽어들인 후, 
 90 | 다음처럼 변수명을 변환하자:
 91 | 
 92 | 1. `make.names(..., unique=TRUE)` 함수로 변수명을 R 에서 사용이 쉬운 이름으로 바꾼다.
 93 | 1. 마침표(.) 대신 밑줄(_)을 사용한다.
 94 | 1. 모두 소문자로 바꾼다
 95 | 
 96 | 그리고, id 변수인 order, pid 를 제거한다.
 97 | 
 98 | ```{r}
 99 | df1 <- read_tsv("AmesHousing.txt")
100 | names(df1) <- tolower(gsub("\\.", "_", make.names(names(df1), unique=TRUE)))
101 | df1 <- df1 %>% dplyr::select(-order, -pid)
102 | glimpse(df1)
103 | ```
104 | 
105 | 
106 | ##  결측치 처리
107 | 
108 | 자료의 여러 변수에 결측치가 포함되어 있다.
109 | 결측치를 찾아내는 간단한 방법은 `summary()` 함수를 사용하는 것이다:
110 | ```{r}
111 | # summary(df)
112 | ```
113 | 
114 | 
115 | 또다른 방법은 다음처럼 `summarize_all()` + `funs()` 트릭을 이용하는 것이다:
116 | ```{r}
117 | df1 %>%
118 |   summarize_all(funs(length(which(is.na(.)))/length(.))) %>% 
119 |   glimpse()
120 | ```
121 | 이로부터 여러 변수들이 결측치를 가지고 있음을 알 수 있다.
122 | 
123 | 결측치를 해결하는 다양한 방법이 있지만 여기서는 간단히 처리한다:
124 | 
125 | 1. 수량형 변수는 중앙값으로 대치한다.
126 | 2. 문자형 변수는 ~~최빈값~~`"NA"` 문자열로 대치한다.
127 | 
128 | 아래 명령은 `mutate_if()`, `rename_all()` 함수등을 이용하여 
129 | 위의 처리를 해준다:
130 | ```{r}
131 | df2 <- df1 %>%
132 |   mutate_if(is.numeric, funs(imp=ifelse(is.na(.), median(., na.rm=TRUE), .))) %>%
133 |   # mutate_if(is.character, funs(imp=ifelse(is.na(.), sort(table(.), decreasing=TRUE)[1], .))) %>%
134 |   mutate_if(is.character, funs(imp=ifelse(is.na(.), "NA", .))) %>%
135 |   dplyr::select(ends_with("_imp")) %>%
136 |   rename_all(funs(gsub("_imp", "", .)))
137 | df2 %>% glimpse()
138 | ```
139 | 
140 | 그리고, `mo_sold` 변수는 수량형으로 읽어들였지만, 수량형보다는 범주형으로
141 | 간주하는 것이 좋을 것 같다.
142 | 이 외에 다양한 변수를 하나하나 살펴보면 다른 많은 전처리를 해 줄 수 
143 | 있겠지만, 일단 위와 같은 변환을 한 자료를 우리의 분석자료로 저장하도록 하자:
144 | 
145 | ```{r}
146 | df <- df2 %>% mutate(mo_sold=as.character(mo_sold))
147 | ```
148 | 
149 | 
150 | 
151 | ## 훈련, 검증, 테스트셋의 구분
152 | 
153 | 원 데이터를 6:4:4 비율로 훈련, 검증, 테스트셋으로 나누도록 하자.
154 | (재현 가능성을 위해 `set.seed()`를 사용했다.)
155 | ```{r}
156 | set.seed(2017)
157 | n <- nrow(df)
158 | idx <- 1:n
159 | training_idx <- sample(idx, n * .60)
160 | idx <- setdiff(idx, training_idx)
161 | validate_idx = sample(idx, n * .20)
162 | test_idx <- setdiff(idx, validate_idx)
163 | length(training_idx)
164 | length(validate_idx)
165 | length(test_idx)
166 | training <- df[training_idx,]
167 | validation <- df[validate_idx,]
168 | test <- df[test_idx,]
169 | ```
170 | 
171 | 
172 | 일부 분석 함수는 
173 | 문자형 변수를 자동적으로 인자형으로 변환하지 않으므로, 다음 
174 | 데이터셋도 만들어 두자. `mutate_if()` 함수를 이용하였다.
175 | ```{r}
176 | dff <- df %>% mutate_if(is.character, as.factor)
177 | glimpse(dff)
178 | training_f <- dff[training_idx, ]
179 | validation_f <- dff[validate_idx, ]
180 | test_f <- dff[test_idx, ]
181 | ```
182 | 
183 | 
184 | ## A. 회귀분석
185 | 
186 | 일단 모든 변수를 다 넣은 선형모형을 돌려보자:
187 | ```{r}
188 | df_lm_full <- lm(saleprice ~ ., data=training_f)
189 | summary(df_lm_full)
190 | ```
191 | 통계적으로 유의한 여러 변수들이 잡힌다.
192 | 
193 | 아쉽게도, 선형모형을 실행하려면 다음과 같은 에러가 생긴다.
194 | 훈련셋에는 없는 인자 수준이 검증 셋에 나타나기 때문이다.
195 | ```{r eval=FALSE}
196 | y_obs <- validation$saleprice
197 | yhat_lm <- predict(df_lm_full, newdata=validation_f)
198 | ```
199 | 
200 | ```
201 |  Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) : 
202 |   factor ms_zoning has new levels A (agr) 
203 | ```
204 | 
205 | 
206 | (고급문제: 위의 에러는 어떻게 해결할 수 있을까?)
207 | 
208 | 선형모형 자체는 일반적으로 높은 예측력을 보이지 않기 때문에,
209 | 다음처럼 스텝(stepwise) 절차를 통한 변수선택을 시행할 수 있다.
210 | (실행시간 관계상 생략)
211 | 독자들의 컴퓨터에서 실행해 볼 것을 권한다.
212 | 
213 | ```{r eval=FALSE}
214 | df_step <- stepAIC(df_lm_full, scope = list(upper = ~ ., lower = ~1))
215 | df_step
216 | anova(df_step)
217 | summary(df_step)
218 | length(coef(df_step))
219 | length(coef(df_lm_full))
220 | ```
221 | 참고로, 저자의 컴퓨터에서의 실행 후에
222 | 원 모형의 모수 개수는 286, 
223 | 스텝 변수선택 이후의 모수 개수는 147 이었다.
224 | 
225 | 만약 위와 같은 `df_lm_full`, `df_step` 모형이 
226 | 제대로 작동하면 다음처럼 검증셋에서의 RMSE 오차값을 구할 수 있다.
227 | ```{r eval=FALSE}
228 | y_obs <- validation$saleprice
229 | yhat_lm <- predict(df_lm_full, newdata=validation)
230 | yhat_step <- predict(df_step, newdata=validation)
231 | rmse(y_obs, yhat_lm)
232 | rmse(y_obs, yhat_step)
233 | ```
234 | 
235 | 
236 | 
237 | ## B. glmnet 함수를 통한 라쏘 모형, 능형회귀, 변수선택
238 | 
239 | ```{r}
240 | xx <- model.matrix(saleprice ~ .-1, df)
241 | x <- xx[training_idx, ]
242 | y <- training$saleprice
243 | df_cvfit <- cv.glmnet(x, y)
244 | ```
245 | 
246 | 람다 모수의 값에 따른 오차의 값의 변화 추이는 다음과 같다:
247 | ```{r}
248 | plot(df_cvfit)
249 | # coef(df_cvfit, s = c("lambda.1se"))
250 | # coef(df_cvfit, s = c("lambda.min"))
251 | ```
252 | 
253 | 
254 | 라쏘 모형의 RMSE, MAE 값은:
255 | ```{r}
256 | y_obs <- validation$saleprice
257 | yhat_glmnet <- predict(df_cvfit, s="lambda.min", newx=xx[validate_idx,])
258 | yhat_glmnet <- yhat_glmnet[,1] # change to a vector from [n*1] matrix
259 | rmse(y_obs, yhat_glmnet)
260 | mae(y_obs, yhat_glmnet)
261 | ```
262 | 
263 | 
264 | ## C. 나무모형
265 | 
266 | `rpart::rpart()` 함수를 사용해
267 | 나무 회귀분석모형을 적합하자.
268 | ```{r}
269 | df_tr <- rpart(saleprice ~ ., data = training)
270 | df_tr
271 | # printcp(df_tr)
272 | # summary(df_tr)
273 | opar <- par(mfrow = c(1,1), xpd = NA)
274 | plot(df_tr)
275 | text(df_tr, use.n = TRUE)
276 | par(opar)
277 | ```
278 | 
279 | 나무모형의 출력 결과를 살펴보면 최고의 집값으로 이어지는 변수의 조합은
280 | 다음과 같음을 알 수 있다:
281 | ```
282 |    3) overall_qual>=7.5 289 2.448313e+12 310114.40  
283 |      7) total_bsmt_sf>=1721.5 83 8.706003e+11 391959.40  
284 |       15) gr_liv_area>=2225.5 35 4.250778e+11 461694.10  
285 |         31) neighborhood=CollgCr,NoRidge,NridgHt,StoneBr 28 1.775330e+11 497176.50 *
286 | ```
287 | 
288 | 
289 | 아쉽게도 `rpart::rpart` 모형도 훈련셋에서 관측되지 않은 
290 | 인자 레벨이 나오면 
291 | 앞서와 같은 오류 메시지를 보내며 예측을 해내지 못한다:
292 | ```{r eval=FALSE}
293 | yhat_tr <- predict(df_tr, validation)
294 | # rmse(y_obs, yhat_tr)
295 | ```
296 | 
297 | 
298 | ## D. 랜덤 포레스트
299 | `randomForest()` 함수를 적용할 때
300 | X 예측변수들중 문자열 변수들은 
301 | 인자형 변수로 바꿔 줘야 한다.
302 | 앞서 만들어둔 `training_f` 를 사용한다.
303 | 
304 | ```{r}
305 | set.seed(2017)
306 | df_rf <- randomForest(saleprice ~ ., training_f)
307 | df_rf
308 | ```
309 | 
310 | 랜덤포레스트 모형의 오류 감소 추세 그래프는 다음과 같다:
311 | ```{r}
312 | plot(df_rf)
313 | ```
314 | 
315 | 각 변수들의 모형에의 기여도는 다음과 같다:
316 | ```{r}
317 | varImpPlot(df_rf)
318 | ```
319 | 
320 | 랜덤포레스트 모형의 예측결과는 다음과 같다:
321 | ```{r}
322 | yhat_rf <- predict(df_rf, newdata=validation_f)
323 | rmse(y_obs, yhat_rf)
324 | mae(y_obs, yhat_rf)
325 | ```
326 | 
327 | 
328 | ## E. 부스팅 
329 | `gbm::gbm()` 함수로 부스팅 모형을 적합할 수 있다.
330 | 랜덤포레스트와 마찬가지로 
331 | X 예측변수들중 문자열 변수들은 
332 | 인자형 변수로 바꿔 줘야 한다.
333 | (실행시간 관계상 생략)
334 | 
335 | ```{r eval=FALSE}
336 | set.seed(2017)
337 | df_gbm <- gbm(saleprice ~ ., data=training_f,
338 |               n.trees=40000, cv.folds=3, verbose = TRUE)
339 | (best_iter = gbm.perf(df_gbm, method="cv"))
340 | yhat_gbm <- predict(df_gbm, n.trees=best_iter, newdata=validation_f)
341 | rmse(y_obs, yhat_gbm)
342 | ```
343 | 
344 | 
345 | 
346 | ## 모형 비교, 최종 모형 선택, 일반화 성능 평가
347 | 검증셋에서 예측능력이 가장 높은 (RMSE 값과 MAE 값이 가장 작은)
348 | 것은 랜덤포레스트이다:
349 | ```{r}
350 | tibble(method=c("glmnet", "rf"),
351 |        rmse=c(rmse(y_obs, yhat_glmnet), rmse(y_obs, yhat_rf)), 
352 |        mae=c(mae(y_obs, yhat_glmnet), mae(y_obs, yhat_rf)))
353 | ```
354 | 
355 | 테스트셋을 이용해 랜덤포레스트모형의 일반화 능력을 계산해보자:
356 | ```{r}
357 | y_obs_test <- test$saleprice
358 | yhat_rf_test <- predict(df_rf, newdata=test_f)
359 | rmse(y_obs_test, yhat_rf_test)
360 | mae(y_obs_test, yhat_rf_test)
361 | ```
362 | 
363 | 
364 | 다음과 같은 시각화로 예측모형들의 오차의 분포를 비교할 수 있다.
365 | glmnet 에 비해 
366 | 랜덤포레스트 모형이
367 | 아주 큰 예측오차의 수가 적은 것을 알 수 있다.
368 | 즉, 랜덤포레스트 모형이 좀 더 로버스트하다고 할 수 있다.
369 | ```{r}
370 | boxplot(list(# lm = y_obs-yhat_step,
371 |              # gbm = y_obs-yhat_gbm,
372 |              glmnet = y_obs-yhat_glmnet,
373 |              rf = y_obs-yhat_rf
374 |              ), ylab="Error in Validation Set")
375 | abline(h=0, lty=2, col='blue')
376 | ```
377 | 
378 | 
379 | 다음 시각화는 glmnet 과 random forest 예측값, 그리고 실제 관측치와의 
380 | 관계를 보여준다. 
381 | RMSE, MAE 결과와 마찬가지로, 
382 | 관측값과의 상관관계도 랜덤 포레스트가 더 높다:
383 | ```{r}
384 | pairs(data.frame(y_obs=y_obs,
385 |                  # yhat_lm=yhat_lm,
386 |                  yhat_glmnet=c(yhat_glmnet),
387 |                  # yhat_tr=yhat_tr,
388 |                  yhat_rf=yhat_rf),
389 |       lower.panel=function(x,y){ points(x,y); abline(0, 1, col='red')},
390 |       upper.panel = panel.cor)
391 | ```
392 | 
393 | 
394 | ## 결론
395 | 이번 자료는 차원도 높고,
396 | 결측치도 많은 분석이 어려운 자료였다.
397 | 하지만 비교적 적은 코딩으로
398 | 예측력이 상당히 높은 
399 | 랜덤포레스트 모형을 적합할 수 있었다.
400 | 
401 | ## 추가 연구 문제
402 | 관심있는 독자는 이 데이터에서 추가로 다음 분석을 시도해 볼 것을 권한다:
403 | 
404 | 1. 훈련셋에서 관측되지 않았지만
405 |   검증/테스트셋에 나타나는 인자변수의 범주가 있을 때
406 |   `factor ... has new levels ...` 에러가 생긴다.
407 |   이 에러를 해결하려면 어떻게 하면 될까?
408 |   (위의 `df_lm` 과 `df_tr` 모형을 예로 설명하라)
409 | 1. 이 데이터는 설명변수의 차원이 무척 높다.
410 |   높은 차원을 의미가 높은 낮은 차원으로 변환하는 방법 중 하나는
411 |   주성분분석(Principal Component Analysis, PCA) 이다.
412 |   `prcomp()` 로 X변수들의 주성분 분석을 시행하라.
413 | 1. 원래 X변수들 대신 주성분 변수를 사용한 회귀분석을
414 |   주성분회귀 (principal component regression, PCR) 이라고 한다.
415 |    R의 `pls` 라이브러리를 사용하여 주성분 분석을 시행하라.
416 |    RMSE, MAE 오차의 크기는?
417 | 1. X 변수들 사이의, 그리고 X-Y변수간의 흥미로운 관계는 어떤 것이 있을까?
418 | 1. 비정형 자료등의 복잡한 고차원 (large $p$) 자료가
419 |   대량으로 있을 때 (large $n$)
420 |   딥러닝(deep learning)을 적용하여 높은 예측력을 얻을 수 있다.
421 |   딥러닝은 분류분석에 흔히 사용되지만
422 |   회귀분석에도 사용될 수 있다.
423 |   (Means Squared Error 혹은 L2를 cost function 으로 사용)
424 |   텐서플로우(tensorflow <https://www.tensorflow.org/>) 를 
425 |   사용하여 이 문제를 풀어보자.
426 |   랜덤포레스트보다 더 적은 오차를 얻을 수 있는가?
427 | 
428 | 
429 | # 14-1. (적포도주 품질 예측)
430 | 회귀분석을 본문에 기술된 적포도주 데이터(winequality-red.csv)에 실행해보라. 
431 | 결과를 슬라이드 10여 장
432 | 내외로 요약하라.
433 | 
434 | (생략; 교재 본문 참조)
435 | 
436 | 
437 | # 14-2. (전복 나이 예측)
438 | <https://goo.gl/R0Pyrt> (<http://archive.ics.uci.edu/ml/datasets/Abalone>) 
439 | 데이터에 회귀분석을 적용하고, 결과를 슬라이드 10여 장 내외로 요약하라.
440 | 
441 | (생략; 결측치가 없고, 변수 개수도 적은 간단한 문제입니다.)
442 | 
443 | - n = 4177
444 | - p = 8
445 | - 결측치? - 없음
446 | - 반응변수:	`Rings`. integer. +1.5 gives the age in years.
447 | 
448 | 
449 | # 14-3. (대기 질 예측)
450 | <https://goo.gl/etZcrE> (<http://archive.ics.uci.edu/ml/datasets/Air+Quality>)
451 | 데이터에 회귀분석을 적용하고, 결과를 슬라이드 10여 장 내외로 요약하라.
452 | 
453 | (생략; 시계열 분석 데이터로 적당합니다.)
454 | 
455 | - n = 9358
456 | - p = 15
457 | - 결측치? - 있음 (-200 값은 결측치)
458 | - 반응변수:	변수 2-11. 
459 |     기타 변수(날짜, 시간, 온도, 습도)등은 예측변수로 사용 가능.
460 | 
461 | 
462 | # 14-4. (자유 선택 과제)
463 | <https://goo.gl/hmyTre> (<https://archive.ics.uci.edu/ml/datasets.html>)
464 | 혹은 <https://goo.gl/zSrO3C> (<https://www.kaggle.com/datasets>)
465 | 에서 다른 고차원 회귀분석 데이터를 찾아서 본문에 설명한 분석을 실행하고, 
466 | 결과를 슬라이드 10여 장 내외로 요약하라.
467 | 
468 | (생략)
469 | 


--------------------------------------------------------------------------------
/solutions/ch15-nlp-solutions.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "<따라 하며 배우는 데이터 과학> 15장 연습문제 해답"
  3 | author: "권재명"
  4 | date: "9/30/2017"
  5 | output:
  6 |   html_document:
  7 |     toc: true
  8 |     toc_depth: 3
  9 | ---
 10 | 
 11 | (Author notes: Turn `cache=TRUE` in `knitr::opts_chunk()`
 12 | leads to [this error](https://stackoverflow.com/questions/42394244).
 13 | So, just use `cache=FALSE` for this markdown file)
 14 | 
 15 | ```{r setup, include=FALSE}
 16 | knitr::opts_chunk$set(echo = TRUE, cache=FALSE)
 17 | ```
 18 | 
 19 | 
 20 | 저자 책 웹페이지: <https://dataninja.me/ipds-kr/>
 21 | 
 22 | 
 23 | # R 환경 준비
 24 | 일단은 필수패키지인 `tidyverse`, 그리고 
 25 | NLP를 위한 몇가지 패키지를 로드하자.
 26 | (로딩 메시지를 감추기 위해 `suppressMessages()` 명령을 사용.)
 27 | ```{r}
 28 | # install.packages("tidyverse")
 29 | suppressMessages(library(tidyverse))
 30 | 
 31 | # install.packages(c("tm", "SnowballC", "wordcloud"))
 32 | library(tm)
 33 | library(SnowballC)
 34 | library(wordcloud)
 35 | ```
 36 | 
 37 | 
 38 | # 1. (유닉스 명령)
 39 | `JEOPARDY_CSV.csv` 파일에서 유닉스의 cut, sort, uniq 명령을 사용하여 
 40 | Round와 Category 변수의 도수 분포를 구하라.
 41 | 
 42 | ## Round 도수분포:
 43 | ```
 44 | $ cut -d ',' -f 3 JEOPARDY_CSV.csv| sort | uniq -c
 45 |    1  Round
 46 | 105912 Double Jeopardy!
 47 | 3631 Final Jeopardy!
 48 | 107384 Jeopardy!
 49 |    3 Tiebreaker
 50 | ```
 51 | 위 방법의 단점은 첫 줄의 변수이름 (`Round`)도 출력에 포함된다는 것이다.
 52 | 하지만 간편히 자료를 살펴보는데는 좋은 방법이다.
 53 | 
 54 | ## Category 도수분포:
 55 | `Category` 변수는 범주가 무척 많기 때문에,
 56 | 가장 빈도수가 높은 10가지 값만 표현했다. (`head -10`)
 57 | 이 방법의 단점은 카테고리가 따옴표 안의 문자열이므로, 쉼표를 포함할 수도
 58 | 있다는 것이다. 이러한 문제 없이 CSV 파일을 처리하려면 R이나 파이썬의
 59 | CSV 파일 처리 라이브러리를 사용해야 한다.
 60 | ```
 61 | $ cut -d ',' -f 4 JEOPARDY_CSV.csv| sort | uniq -c | sort -n -r | head -10
 62 |  547 "BEFORE & AFTER"
 63 |  519 "SCIENCE"
 64 |  496 "LITERATURE"
 65 |  418 "AMERICAN HISTORY"
 66 |  401 "POTPOURRI"
 67 |  377 "WORLD HISTORY"
 68 |  371 "WORD ORIGINS"
 69 |  351 "COLLEGES & UNIVERSITIES"
 70 |  349 "HISTORY"
 71 |  342 "SPORTS"
 72 | 
 73 | ```
 74 | 
 75 | 
 76 | # 2. R 스튜디오의 샤이니 단어 구름 예제의 코드를 살펴보자
 77 | (<https://goo.gl/Dpez2b>(<http://shiny.rstudio.com/gallery/word-cloud.html>)). 
 78 | 여기서 사용된 데이터는 무엇인가? 
 79 | 문서를 다운로드하여 문서에 대한 단어 구름을 그려보자.
 80 | 
 81 | `server.R` 파일의 주석에서 볼 수 있듯이,
 82 | <한여름밤의 꿈>,  <베니스의 상인>, <로미오와 줄리엣> 이다. 우리는
 83 | <햄릿>을 사용하도록 하자.
 84 | <http://www.gutenberg.org/cache/epub/1524/pg1524.txt>
 85 | 
 86 | 우선 햄릿 텍스트를 컴퓨터에 다운로드한다:
 87 | ```
 88 | wget http://www.gutenberg.org/cache/epub/1524/pg1524.txt
 89 | ```
 90 | 
 91 | R에서 읽어 들인 후,
 92 | 본문에 설명한 변환을 차례대로 해 준다.
 93 | 각 처리 이후에 코퍼스의 3144번째 줄이 어떻게 변환되는지
 94 | 출력해 보았다.
 95 | (햄릿의 그 유명한 "to be, or not to be..." 독백이다.)
 96 | ```{r}
 97 | text <- readLines("pg1524.txt", encoding="UTF-8")
 98 | corpus <- Corpus(VectorSource(text))
 99 | corpus <- tm_map(corpus, content_transformer(tolower))
100 | as.character(corpus[[3144]])
101 | corpus <- tm_map(corpus, removePunctuation)
102 | as.character(corpus[[3144]])
103 | corpus <- tm_map(corpus, removeNumbers)
104 | as.character(corpus[[3144]])
105 | corpus <- tm_map(corpus, removeWords,
106 |                  c(stopwords('SMART'), "thy", "thou", "thee", "and", "but"))
107 | as.character(corpus[[3144]])
108 | corpus <- tm_map(corpus, stemDocument)
109 | as.character(corpus[[3144]])
110 | ```
111 | 
112 | 위의 출력결과에서 볼 수 있듯이, 
113 | 처리과정이 개선의 여지가 있다.
114 | 예문에서의 `,--` 부분이 통째로 없어져서 `bethat` 이란 존재하지 않는
115 | 단어가 나온 것이 그것이다.
116 | 하지만 일단 위의 코퍼스를 사용하도록 하자.
117 | 
118 | 이제 단어구름을 생성할 수 있다:
119 | ```{r}
120 | wordcloud(corpus, max.words=100, random.order=FALSE,
121 |           colors=brewer.pal(8, "Dark2"))
122 | ```
123 | 
124 | 
125 | 
126 | 
127 | # 3. KoNLPy를 설치해보자.
128 | <http://konlpy-ko.readthedocs.io/ko/v0.4.3/install/> 
129 | 설명대로 
130 | 맥 컴퓨터, 파이썬 2.7 에서는 다음처럼 설치하면 된다:
131 | 
132 | ```{bash eval=FALSE}
133 | pip install konlpy     # Python 2.x
134 | ```
135 | 
136 | 
137 | # 4. (국회 의안 내용 단어 구름)
138 | KoNLPy 홈페이지에는 국회 의안의 내용의 단어 구름을 그려주는 예제가 있다
139 | (<https://goo.gl/lDJxW3>, 
140 | <http://konlpy.org/ko/latest/examples/wordcloud/>). 
141 | R로 이 예를 구현하라
142 | 
143 | 
144 | 
145 | 의안의 텍스트를 다운로드하는 함수를 파이썬에서 번역하도록 하자:
146 | ```{r}
147 | get_bill_text <- function(bill_num){
148 |   # R version of get_bill_text in KoNLPy homepage
149 |   # install.packages(c("XML", "RCurl")
150 |   suppressMessages(library(RCurl))
151 |   suppressMessages(library(XML))
152 |   url <- sprintf('http://pokr.kr/bill/%s/text', bill_num)
153 |   html <- getURL(url)
154 |   doc <- htmlParse(html, asText=TRUE, encoding = "UTF-8")
155 |   txt <- xpathSApply(doc, "//div[@id='bill-sections']/pre/text()")[[1]]
156 |   txt <- as(txt, "character") # change XMLInternalTextNode class object to chars
157 |   return(txt)
158 | }
159 | bill_txt <- get_bill_text('1904882')
160 | ```
161 | 
162 | 이제 의안 텍스트는 준비 되었다.
163 | 
164 | 
165 | 한글은 영어와는 달리 모든 단어가 아니라
166 | 명사를 추출해서 단어 구름을 그리는 것이 의미가 있다.
167 | 한글 NLP 를 위한 라이브러리 KoNLP 를 사용하도록 하자.
168 | <https://cran.r-project.org/web/packages/KoNLP/index.html>
169 | 
170 | ```{r}
171 | # install.packages("KoNLP")
172 | # The following line is added to make this document renders via knitr.
173 | # See <https://stackoverflow.com/questions/42394244/rstudio-knit-button-fail-to-load-rjava>
174 | dyn.load('/Library/Java/JavaVirtualMachines/jdk1.8.0_45.jdk/Contents/Home/jre/lib/server/libjvm.dylib')
175 | library(KoNLP)
176 | useSejongDic() # 세종 사전을 다운로드한다.
177 | nouns <- extractNoun(bill_txt) # 명사를 추출한다
178 | ```
179 | 
180 | 
181 | 이제 `nouns` 명사 벡터를 사용해 단어구름을 생성한다.
182 | 맥에서는 폰트를 설정해 주어서 한글이 깨지지 않게 해 주어야 한다.
183 | ```{r}
184 | # dplyr::glimpse(nouns)
185 | par(family='Dotum')
186 | wordcloud(nouns, max.words=100, random.order=FALSE,
187 |           colors=brewer.pal(8, "Dark2"))
188 | ```
189 | 
190 | `wordcloud2` 패키지(<https://cran.r-project.org/web/packages/wordcloud2/index.html>)
191 | 를 사용하면 자바스크립트를 이용한 단어구름을 그려준다. 
192 | 이를 위해서 단어 빈도 데이터프레임을 먼저 만들고 다음 옵션을 추가했다:
193 | 
194 | - `fontFamily = "Dotum"` : 한글폰트 사용
195 | - `shuffle=FALSE` : 재현가능하도록 랜덤화 방지
196 | - `minRotation=0, maxRotation=0,` : 단어회전 금지
197 | - `color=rep(brewer.pal(8, "Dark2"), length.out=n_words_to_show)` : 
198 |     위의 예제와 동일한 색깔 팔레트 사용
199 | 
200 | 출력결과는 다음과 같다:
201 | 
202 | ```{r}
203 | # install.packages("wordcloud2")
204 | library(wordcloud2)
205 | n_words_to_show <- 100
206 | df <- tibble(noun=nouns) %>% 
207 |   group_by(noun) %>% 
208 |   count() %>% 
209 |   arrange(desc(n)) %>%
210 |   head(n_words_to_show) %>%
211 |   as.data.frame()
212 | # df %>% glimpse()
213 | wordcloud2(df %>% head(n_words_to_show), fontFamily = "Dotum",
214 |            shuffle=FALSE, minRotation=0, maxRotation=0,
215 |            color=rep(brewer.pal(8, "Dark2"), length.out=n_words_to_show))
216 | ```
217 | 
218 | 
219 | ## 참고문헌:
220 | 
221 | - <http://konlpy.org/ko/latest/examples/wordcloud/>
222 | - <http://dbrang.tistory.com/1061>
223 | - <https://github.com/haven-jeon/KoNLP>
224 | - <http://freesearch.pe.kr/archives/4538>
225 | - <http://freesearch.pe.kr/archives/4446>
226 | 


--------------------------------------------------------------------------------
/solutions/ipds-kr-solutions-toc.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "<따라 하며 배우는 데이터 과학> 연습문제 해답"
 3 | author: "권재명"
 4 | date: "9/30/2017"
 5 | output:
 6 |   html_document
 7 | ---
 8 | 
 9 | ```{r setup, include=FALSE}
10 | knitr::opts_chunk$set(echo = TRUE, cache=TRUE)
11 | ```
12 | 
13 | - 저자 책 웹페이지: <https://dataninja.me/ipds-kr/>
14 | - 질문이나 피드백:
15 |     - <dataninjame@gmail.com>
16 |     - <https://www.facebook.com/dataninja.me/>
17 | 
18 | 
19 | ## 연습문제 해답
20 | - 3장 (tidyverse) 해답: http://rpubs.com/dataninja/ipds-kr-ch03
21 | - 4장 (ggplot2) 해답: http://rpubs.com/dataninja/ipds-kr-ch04
22 | - 8-9장 (classification) 해답: http://rpubs.com/dataninja/ipds-kr-ch08
23 | - 10-11장 (classification) 해답: http://rpubs.com/dataninja/ipds-kr-ch10
24 | - 13-14장 (regression) 해답: http://rpubs.com/dataninja/ipds-kr-ch13
25 | - 15장 (word cloud) 해답: http://rpubs.com/dataninja/ipds-kr-ch15
26 | 
27 | 연습문제 해답들은 R마크다운 (RMarkdown) 으로 작성되었습니다.
28 | 
29 | 원본 파일들은 다음 리포에서 확인할 수 있습니다.
30 | <https://github.com/jaimyoung/ipds-kr/tree/master/solutions> 
31 | 
32 | 
33 | ## 보너스: 본문 그림 4-11, 12
34 | * But it's interactive, using <https://plot.ly/r/>
35 | 
36 | ```{r}
37 | suppressMessages(library(tidyverse))
38 | library(gapminder)
39 | suppressMessages(library(plotly))
40 | 
41 | g1 <- gapminder %>% filter(year==2002) %>%
42 |   ggplot(aes(gdpPercap, lifeExp, text=country)) +
43 |   geom_point(aes(size=pop, col=continent)) + scale_x_log10() +
44 |   ggtitle("Gapminder data for 2007")
45 | suppressMessages(ggplotly(g1))
46 | 
47 | g2 <- gapminder %>%
48 |   ggplot(aes(year, lifeExp, group=country)) +
49 |   geom_line() +
50 |   facet_wrap(~ continent)
51 | suppressMessages(ggplotly(g2))
52 | ```
53 | 
54 | 


--------------------------------------------------------------------------------
/solutions/rsconnect/documents/ch03-tidyverse-solutions.Rmd/rpubs.com/rpubs/Document.dcf:
--------------------------------------------------------------------------------
 1 | name: Document
 2 | title:
 3 | username:
 4 | account: rpubs
 5 | server: rpubs.com
 6 | hostUrl: rpubs.com
 7 | appId: https://api.rpubs.com/api/v1/document/312900/0fd3d01887534faf96bda70d88336661
 8 | bundleId: https://api.rpubs.com/api/v1/document/312900/0fd3d01887534faf96bda70d88336661
 9 | url: http://rpubs.com/publish/claim/312900/dee3dbeb8d79478e9e2e9e2687b5f8b6
10 | when: 1506551142.80662
11 | 


--------------------------------------------------------------------------------
/solutions/rsconnect/documents/ch03-tidyverse-solutions.Rmd/rpubs.com/rpubs/Publish Document.dcf:
--------------------------------------------------------------------------------
 1 | name: Publish Document
 2 | title:
 3 | username:
 4 | account: rpubs
 5 | server: rpubs.com
 6 | hostUrl: rpubs.com
 7 | appId: https://api.rpubs.com/api/v1/document/312900/0fd3d01887534faf96bda70d88336661
 8 | bundleId: https://api.rpubs.com/api/v1/document/312900/0fd3d01887534faf96bda70d88336661
 9 | url: http://rpubs.com/dataninja/312900
10 | when: 1506582161.7343
11 | 


--------------------------------------------------------------------------------
/solutions/rsconnect/documents/ch04-ggplot-solutions.Rmd/rpubs.com/rpubs/Document.dcf:
--------------------------------------------------------------------------------
 1 | name: Document
 2 | title:
 3 | username:
 4 | account: rpubs
 5 | server: rpubs.com
 6 | hostUrl: rpubs.com
 7 | appId: https://api.rpubs.com/api/v1/document/313026/5a68eacd312b4766a5e261d14e483bd8
 8 | bundleId: https://api.rpubs.com/api/v1/document/313026/5a68eacd312b4766a5e261d14e483bd8
 9 | url: http://rpubs.com/publish/claim/313026/81ce8b3e38f745cda2a86d3ab9525e1b
10 | when: 1506581931.60272
11 | 


--------------------------------------------------------------------------------
/solutions/rsconnect/documents/ch04-ggplot-solutions.Rmd/rpubs.com/rpubs/Publish Document.dcf:
--------------------------------------------------------------------------------
 1 | name: Publish Document
 2 | title:
 3 | username:
 4 | account: rpubs
 5 | server: rpubs.com
 6 | hostUrl: rpubs.com
 7 | appId: https://api.rpubs.com/api/v1/document/313026/5a68eacd312b4766a5e261d14e483bd8
 8 | bundleId: https://api.rpubs.com/api/v1/document/313026/5a68eacd312b4766a5e261d14e483bd8
 9 | url: http://rpubs.com/dataninja/313026
10 | when: 1506582273.88577
11 | 


--------------------------------------------------------------------------------
/solutions/rsconnect/documents/ch08-classification-solutions.Rmd/rpubs.com/rpubs/Document.dcf:
--------------------------------------------------------------------------------
 1 | name: Document
 2 | title:
 3 | username:
 4 | account: rpubs
 5 | server: rpubs.com
 6 | hostUrl: rpubs.com
 7 | appId: https://api.rpubs.com/api/v1/document/313283/6ca45900a4c4447596ff3d3f5f441d34
 8 | bundleId: https://api.rpubs.com/api/v1/document/313283/6ca45900a4c4447596ff3d3f5f441d34
 9 | url: http://rpubs.com/publish/claim/313283/1baa6f87864840f88570f59f75c28f50
10 | when: 1506654485.96179
11 | 


--------------------------------------------------------------------------------
/solutions/rsconnect/documents/ch10-classification-solutions.Rmd/rpubs.com/rpubs/Document.dcf:
--------------------------------------------------------------------------------
 1 | name: Document
 2 | title:
 3 | username:
 4 | account: rpubs
 5 | server: rpubs.com
 6 | hostUrl: rpubs.com
 7 | appId: https://api.rpubs.com/api/v1/document/313325/e4d392ec546b493e94b7098d60fcac49
 8 | bundleId: https://api.rpubs.com/api/v1/document/313325/e4d392ec546b493e94b7098d60fcac49
 9 | url: http://rpubs.com/publish/claim/313325/884188a166c040788cd265ffc850a516
10 | when: 1506667118.96985
11 | 


--------------------------------------------------------------------------------
/solutions/rsconnect/documents/ch13-regression-solutions.Rmd/rpubs.com/rpubs/Document.dcf:
--------------------------------------------------------------------------------
 1 | name: Document
 2 | title:
 3 | username:
 4 | account: rpubs
 5 | server: rpubs.com
 6 | hostUrl: rpubs.com
 7 | appId: https://api.rpubs.com/api/v1/document/313637/3413bc21286a471a8776c89af59dd3e5
 8 | bundleId: https://api.rpubs.com/api/v1/document/313637/3413bc21286a471a8776c89af59dd3e5
 9 | url: http://rpubs.com/publish/claim/313637/9972f6940908481c8e6e58b0be86b93a
10 | when: 1506759406.5853
11 | 


--------------------------------------------------------------------------------
/solutions/rsconnect/documents/ch13-regression-solutions.Rmd/rpubs.com/rpubs/Publish Document.dcf:
--------------------------------------------------------------------------------
 1 | name: Publish Document
 2 | title:
 3 | username:
 4 | account: rpubs
 5 | server: rpubs.com
 6 | hostUrl: rpubs.com
 7 | appId: https://api.rpubs.com/api/v1/document/313637/3413bc21286a471a8776c89af59dd3e5
 8 | bundleId: https://api.rpubs.com/api/v1/document/313637/3413bc21286a471a8776c89af59dd3e5
 9 | url: http://rpubs.com/dataninja/313637
10 | when: 1506759794.84452
11 | 


--------------------------------------------------------------------------------
/solutions/rsconnect/documents/ch15-nlp-solutions.Rmd/rpubs.com/rpubs/Document.dcf:
--------------------------------------------------------------------------------
 1 | name: Document
 2 | title:
 3 | username:
 4 | account: rpubs
 5 | server: rpubs.com
 6 | hostUrl: rpubs.com
 7 | appId: https://api.rpubs.com/api/v1/document/313649/b160f22477ec43638c7a636167847492
 8 | bundleId: https://api.rpubs.com/api/v1/document/313649/b160f22477ec43638c7a636167847492
 9 | url: http://rpubs.com/publish/claim/313649/f0e1c8a531f54823882ef6c10a515081
10 | when: 1506767392.56389
11 | 


--------------------------------------------------------------------------------
/solutions/rsconnect/documents/ch15-nlp-solutions.Rmd/rpubs.com/rpubs/Publish Document.dcf:
--------------------------------------------------------------------------------
 1 | name: Publish Document
 2 | title:
 3 | username:
 4 | account: rpubs
 5 | server: rpubs.com
 6 | hostUrl: rpubs.com
 7 | appId: https://api.rpubs.com/api/v1/document/313649/b160f22477ec43638c7a636167847492
 8 | bundleId: https://api.rpubs.com/api/v1/document/313649/b160f22477ec43638c7a636167847492
 9 | url: http://rpubs.com/dataninja/313649
10 | when: 1506967427.62825
11 | 


--------------------------------------------------------------------------------
/solutions/rsconnect/documents/ipds-kr-solutions-toc.Rmd/rpubs.com/rpubs/Document.dcf:
--------------------------------------------------------------------------------
 1 | name: Document
 2 | title:
 3 | username:
 4 | account: rpubs
 5 | server: rpubs.com
 6 | hostUrl: rpubs.com
 7 | appId: https://api.rpubs.com/api/v1/document/313652/5c2495a636374e3aaaba5c3e9959ca8c
 8 | bundleId: https://api.rpubs.com/api/v1/document/313652/5c2495a636374e3aaaba5c3e9959ca8c
 9 | url: http://rpubs.com/publish/claim/313652/7e00fd947d8142aeb50e7c59e339ab8c
10 | when: 1506768825.76045
11 | 


--------------------------------------------------------------------------------
/solutions/rsconnect/documents/ipds-kr-solutions-toc.Rmd/rpubs.com/rpubs/Publish Document.dcf:
--------------------------------------------------------------------------------
 1 | name: Publish Document
 2 | title:
 3 | username:
 4 | account: rpubs
 5 | server: rpubs.com
 6 | hostUrl: rpubs.com
 7 | appId: https://api.rpubs.com/api/v1/document/313652/5c2495a636374e3aaaba5c3e9959ca8c
 8 | bundleId: https://api.rpubs.com/api/v1/document/313652/5c2495a636374e3aaaba5c3e9959ca8c
 9 | url: http://rpubs.com/dataninja/313652
10 | when: 1506769610.50296
11 | 


--------------------------------------------------------------------------------
/solutions/solutions.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 | 


--------------------------------------------------------------------------------