├── LICENSE
├── README.md
└── Source-File
    ├── 主成份分析
        ├── .Rhistory
        ├── .Rprofile
        ├── 2012MLB.csv
        ├── style.css
        ├── 主成份分析.Rmd
        └── 主成份分析.html
    ├── 分割、合併、離群值、虛擬變數
        ├── 1.PNG
        ├── 2.PNG
        ├── 3.PNG
        ├── 4.PNG
        ├── 5.PNG
        ├── 6.png
        ├── 7.PNG
        ├── 8.PNG
        ├── 9.PNG
        ├── Thumbs.db
        ├── _RData.gz
        ├── _Rhistory
        ├── style.css
        ├── 分割、合併、離群值、虛擬變數.Rmd
        └── 分割、合併、離群值、虛擬變數.html
    ├── 分群分析
        ├── .Rhistory
        ├── .Rprofile
        ├── 1.png
        ├── 2.png
        ├── style.css
        ├── 分群分析.Rmd
        └── 分群分析.html
    ├── 基本資料型態
        ├── .Rhistory
        ├── .Rprofile
        ├── 1.png
        ├── 2.png
        ├── 3.png
        ├── 4.png
        ├── 5.png
        ├── 6.png
        ├── 7.png
        ├── Payment_and_value_of_care_-_Hospital.csv
        ├── Thumbs.db
        ├── style.css
        ├── 基本資料型態.Rmd
        └── 基本資料型態.html
    ├── 套件與函式
        ├── .Rhistory
        ├── .Rprofile
        ├── 1.png
        ├── 2.png
        ├── 3.png
        ├── 4.png
        ├── 5.png
        ├── 6.png
        ├── 7.png
        ├── 8.png
        ├── 9.png
        ├── rpubs_conn
        │   └── api_id.txt
        ├── style.css
        ├── 套件與函式.Rmd
        └── 套件與函式.html
    ├── 安裝R與RStudio
        ├── 1.png
        ├── 10.png
        ├── 11.png
        ├── 12.PNG
        ├── 13.png
        ├── 14.png
        ├── 15.png
        ├── 16.png
        ├── 17.png
        ├── 18.png
        ├── 19.png
        ├── 2.png
        ├── 20.png
        ├── 21.png
        ├── 22.png
        ├── 23.PNG
        ├── 3.png
        ├── 4.png
        ├── 5.png
        ├── 6-1.png
        ├── 6-2.png
        ├── 7.png
        ├── 8.png
        ├── 9.png
        ├── Thumbs.db
        ├── _Rhistory
        ├── _Rprofile
        ├── style.css
        ├── 安裝R與RStudio.Rmd
        └── 安裝R與RStudio.html
    ├── 決策樹
        ├── _Rprofile
        ├── style.css
        ├── titanic.raw.rdata
        ├── 決策樹.Rmd
        └── 決策樹.html
    ├── 流程控制
        ├── .Rhistory
        ├── .Rprofile
        ├── style.css
        ├── 流程控制.Rmd
        └── 流程控制.html
    ├── 線性迴歸、變異數分析
        ├── .Rhistory
        ├── .Rprofile
        ├── style.css
        ├── 線性迴歸、變異數分析.Rmd
        └── 線性迴歸、變異數分析.html
    ├── 繪圖–資料視覺化
        ├── .Rhistory
        ├── .Rprofile
        ├── 1.png
        ├── 2.png
        ├── 3.png
        ├── Combination of Plots.png
        ├── Thumbs.db
        ├── style.css
        ├── 繪圖–資料視覺化.Rmd
        ├── 繪圖–資料視覺化.html
        ├── 繪圖–資料視覺化
        │   └── figure-html
        │   │   ├── unnamed-chunk-10-1.png
        │   │   ├── unnamed-chunk-11-1.png
        │   │   ├── unnamed-chunk-12-1.png
        │   │   ├── unnamed-chunk-13-1.png
        │   │   ├── unnamed-chunk-14-1.png
        │   │   ├── unnamed-chunk-15-1.png
        │   │   ├── unnamed-chunk-16-1.png
        │   │   ├── unnamed-chunk-17-1.png
        │   │   ├── unnamed-chunk-18-1.png
        │   │   ├── unnamed-chunk-19-1.png
        │   │   ├── unnamed-chunk-2-1.png
        │   │   ├── unnamed-chunk-21-1.png
        │   │   ├── unnamed-chunk-22-1.png
        │   │   ├── unnamed-chunk-23-1.png
        │   │   ├── unnamed-chunk-24-1.png
        │   │   ├── unnamed-chunk-25-1.png
        │   │   ├── unnamed-chunk-26-1.png
        │   │   ├── unnamed-chunk-3-1.png
        │   │   ├── unnamed-chunk-4-1.png
        │   │   ├── unnamed-chunk-5-1.png
        │   │   ├── unnamed-chunk-6-1.png
        │   │   ├── unnamed-chunk-7-1.png
        │   │   └── unnamed-chunk-9-1.png
        ├── 繪圖–資料視覺化_cache
        │   └── html
        │   │   ├── __packages
        │   │   ├── unnamed-chunk-5_798c07e868d8e63562f4a34c63566d0e.RData
        │   │   ├── unnamed-chunk-5_798c07e868d8e63562f4a34c63566d0e.rdb
        │   │   └── unnamed-chunk-5_798c07e868d8e63562f4a34c63566d0e.rdx
        └── 繪圖–資料視覺化_files
        │   └── figure-html
        │       ├── unnamed-chunk-10-1.png
        │       ├── unnamed-chunk-11-1.png
        │       ├── unnamed-chunk-12-1.png
        │       ├── unnamed-chunk-13-1.png
        │       ├── unnamed-chunk-14-1.png
        │       ├── unnamed-chunk-15-1.png
        │       ├── unnamed-chunk-16-1.png
        │       ├── unnamed-chunk-17-1.png
        │       ├── unnamed-chunk-18-1.png
        │       ├── unnamed-chunk-19-1.png
        │       ├── unnamed-chunk-2-1.png
        │       ├── unnamed-chunk-21-1.png
        │       ├── unnamed-chunk-22-1.png
        │       ├── unnamed-chunk-23-1.png
        │       ├── unnamed-chunk-24-1.png
        │       ├── unnamed-chunk-25-1.png
        │       ├── unnamed-chunk-26-1.png
        │       ├── unnamed-chunk-3-1.png
        │       ├── unnamed-chunk-4-1.png
        │       ├── unnamed-chunk-5-1.png
        │       ├── unnamed-chunk-6-1.png
        │       ├── unnamed-chunk-7-1.png
        │       └── unnamed-chunk-9-1.png
    ├── 遺漏值處理
        ├── .Rhistory
        ├── .Rprofile
        ├── style.css
        ├── 遺漏值處理.Rmd
        └── 遺漏值處理.html
    ├── 關聯式規則
        ├── 1.png
        ├── 2.png
        ├── _Rprofile
        ├── style.css
        ├── titanic.raw.rdata
        ├── 關聯式規則.Rmd
        └── 關聯式規則.html
    └── 類神經網路
        ├── .Rhistory
        ├── .Rprofile
        ├── 1.png
        ├── 2.png
        ├── style.css
        ├── 類神經網路.Rmd
        ├── 類神經網路.html
        ├── 類神經網路_cache
            └── html
            │   ├── __packages
            │   ├── unnamed-chunk-6_137c0ec415d12c537aa5eefed4d30894.RData
            │   ├── unnamed-chunk-6_137c0ec415d12c537aa5eefed4d30894.rdb
            │   └── unnamed-chunk-6_137c0ec415d12c537aa5eefed4d30894.rdx
        └── 類神經網路_files
            └── figure-html
                └── unnamed-chunk-6-1.png


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 POLab
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Data Mining(R programming language)
  3 | 
  4 | [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
  5 | 
  6 | 
  7 | [資料探勘課程](http://course-query.acad.ncku.edu.tw/crm/course_map/course.php?dept=N0&cono=N061700) 是國立成功大學工學院工程管理在職專班所開授的進階課程，開課教師為[李家岩](http://polab.imis.ncku.edu.tw/Bio.html)老師。
  8 | 
  9 | 內容主要為資料科學(Data Science)與大數據(Big Data)，鼓勵學生進行案例探討、分析與實作。 此 Github 網站為延伸教學資源，與 R 語言實作教學有關。
 10 | 
 11 | 課程參考書目(Reference):
 12 | 1. [Gareth James, Daniela Witten, Trevor Hastie and Robert Tibshirani, 2014. An Introduction to Statistical Learning with Applications in R. Springer.](http://www-bcf.usc.edu/~gareth/ISL/)
 13 | 2. [Hastie, T., R. Tibshirani, and J. Friedman, 2009. The Elements of Statistical Learning: Data Mining, Inference, and Prediction. 2nd ed., Springer.](https://web.stanford.edu/~hastie/ElemStatLearn/)
 14 | 
 15 | 
 16 | 此 Github 網站內容主要為資料科學家 [skydome20的R系列筆記](https://github.com/skydome20/R-Notes) ，一併致謝。(若因 Rpubs 維修而無法閱讀文章，可參考 [R系列筆記備份資源](https://github.com/skydome20/R-Notes))
 17 | 
 18 | ※好文分享：[2019/08/17-What’s next for the popular programming language R?](https://qz.com/1661487/hadley-wickham-on-the-future-of-r-python-and-the-tidyverse/)
 19 | 
 20 | --------------------------------------------------------------------------------------------------------------------------------
 21 | 
 22 | ## **助教群**    
 23 |  
 24 | [1]: http://rpubs.com/skydome20/Table
 25 | [2]: http://rpubs.com/allan811118/R_programming_00
 26 | [3]: http://rpubs.com/james_datacatcher
 27 | [4]: http://rpubs.com/jeff_datascience/DS_Notebook
 28 | [5]: https://github.com/Jacky12Cheng
 29 | 
 30 | 
 31 | | 擔任年份    |  助教名稱    |    暱稱          |         LinkedIn                          | E-mail                |
 32 | |:----------:| :----------:|:---------------:|:------------------------------------------: |:-------------------:|
 33 | | 2019下     | 詹京哲       | Ginger          | https://www.linkedin.com/in/ginger-zhan    | bmw2142@gmail.com      |
 34 | | 2019上     | 鄭宇翔     | [Jacky][5]    | https://www.linkedin.com/in/yu-hsiangCheng/    | zxcv9100207@gmail.com |
 35 | | 2018       | 洪佑鑫       | [Jeff][4]       | https://www.linkedin.com/in/hungyuhsin/    | p96064037@gs.ncku.edu.tw   |
 36 | | 2018       | 吳昭賢       | [James][3]      | https://www.linkedin.com/in/iamjameswu/    | new393988911@gmail.com |
 37 | | 2017       | 周百建       | [Allan][2]      | https://www.linkedin.com/in/iamallanchou   |    |
 38 | | 2017       | 張博凱       | Bokai           | https://www.linkedin.com/in/bo-kai-Jang    |    |
 39 | | 2016       | 洪紹嚴       | [skydome20][1]  | https://www.linkedin.com/in/skydome20      |    |
 40 | 
 41 | 
 42 | 
 43 | 
 44 | --------------------------------------------------------------------------------------------------------------------------------
 45 |   
 46 |   
 47 | ## **教學資源**
 48 | 
 49 | ### **:triangular_flag_on_post:環境建置**    
 50 |    
 51 | |  更新時間       |                      文章                                                                               |
 52 | |  :-----------:  | :-----------------------------------------------------------------------------------------------------: |
 53 | |  2017-05-19     | [安裝R與RStudio](https://rpubs.com/skydome20/R1-R_and_RStudio)   | 
 54 | 
 55 | 
 56 | 
 57 | 
 58 | ### **:triangular_flag_on_post:基本觀念 & 語法**   
 59 | 
 60 | |  更新時間       |                      文章                                                                                             |
 61 | | :-----------:   |:-----------------------------------------------------------------------------------------------------:                |
 62 | |  2017-05-19     | [基本資料型態](https://rpubs.com/skydome20/R-Note2-dataType)                       | 
 63 | |  2017-05-19     | [套件與函式](https://rpubs.com/skydome20/R-Note3-function_and_package)                             | 
 64 | |  2017-05-19     | [流程控制(for, while, ifelse, switch)](http://rpubs.com/skydome20/R-Note11-Control_Flow)       | 
 65 | 
 66 | 
 67 |    
 68 | ### **:triangular_flag_on_post:資料預處理**   
 69 |    
 70 | |  更新時間       |                      文章                                                                                                                        |
 71 | | :-----------: | :-----------------------------------------------------------------------------------------------------:                                            |
 72 | |  2017-05-19     | [分割、合併、離群值、虛擬變數](https://po-lab.github.io/Data-Mining/Source-File/分割、合併、離群值、虛擬變數/分割、合併、離群值、虛擬變數.html)  | 
 73 | |  2017-05-19     | [遺漏值處理(Impute Missing Value)](http://www.rpubs.com/skydome20/R-Note10-Missing_Value)                                  | 
 74 | 
 75 | 
 76 |    
 77 | ### **:triangular_flag_on_post:資料視覺化**  
 78 | 
 79 | |  更新時間       |                      文章                                                                                      |
 80 | |  :-----------: | :-----------------------------------------------------------------------------------------------------:         |
 81 | |  2017-05-19     | [繪圖–資料視覺化](http://rpubs.com/skydome20/R-Note4-Plotting_System)       |
 82 | 
 83 | 
 84 | 
 85 | 
 86 |    
 87 | ### **:triangular_flag_on_post:模型建構**  
 88 |    
 89 |    
 90 | 
 91 | |  更新時間        |                      文章                                                                               |
 92 | | :-----------:   | :-----------------------------------------------------------------------------------------------------:|
 93 | |  2017-05-19     | [線性迴歸與變異數分析(Linear Regression)](http://rpubs.com/skydome20/R-Note5-First_Practice)  |
 94 | |  2019-12-04     | [邏輯斯迴歸(Logistic Regression)](https://rpubs.com/ginger_zhan/logistic_regression)  |
 95 | |  2017-05-19     | [關聯式規則(Association Rule)](http://www.rpubs.com/skydome20/R-Note6-Apriori-DecisionTree)  | 
 96 | |  2018-03-03     | [特徵選取(Feature Selection)：逐步迴歸(Stepwise Regression)與套索迴歸(LASSO)](http://rpubs.com/skydome20/R-Note18-Subsets_Shrinkage_Methods)        | 
 97 | |  2019-03-16     | 特徵萃取(Feature Extraction)：[主成份分析(PCA)](http://rpubs.com/skydome20/R-Note7-PCA)與[獨立成分分析(ICA)](http://rpubs.com/skydome20/R-Note17-ICA)   | 
 98 | |  2017-05-19     | [決策樹(Decision Tree):分類與迴歸樹CART](http://rpubs.com/allan811118/R_programming_08)   | 
 99 | |  2017-05-19     | [類神經網路(Neural Networks)：倒傳遞類神經網路(Backpropagation)](http://rpubs.com/skydome20/R-Note8-ANN)  | 
100 | |  2017-05-19     | [深度學習(Deep Learning)](http://rpubs.com/skydome20/R-Note12-DigitRecognizer-Kaggle)    | 
101 | |  2017-05-19     | [分群分析(Clustering)：華德法(Ward)與K平均法(K-means)](http://www.rpubs.com/skydome20/R-Note9-Clustering)   | 
102 | |  2019-07-24     | [分群分析(Clustering)：自適應共振理論(ART)](https://rpubs.com/JackyCheng/AdaptiveResonanceTheory)   | 
103 | |  2017-05-19     | 分群分析(Clustering)：自我映射組織(SOM)與模糊適應共振理論(Fuzzy-ART)   | 
104 | |  2017-05-19     | [支援向量機(Support Vector Machine)](http://rpubs.com/skydome20/R-Note14-SVM-SVR)   |
105 | |  2018-03-03     | [集成算法(Ensemble Method)：隨機森林(Random Forest)與梯度推進(Gradient Boosting)](http://rpubs.com/skydome20/R-Note16-Ensemble_Learning)   |
106 | |  2019-02-21     | [總結(Summary)：Comparison of 13 Algorithms in 165 Datasets](https://machinelearningmastery.com/start-with-gradient-boosting/)   |
107 | 
108 | 
109 | ### **:pushpin: Python**
110 | ### **:triangular_flag_on_post: 其他介紹**
111 | |  更新時間       |                      文章                                                                                                      |
112 | | :-----------:  | :-----------------------------------------------------------------------------------------------------:                        |
113 | |  2019-11-14     | [可解釋性人工智慧 Explainable Artificial Intelligence(XAI)](https://github.com/ITingHung/Explainable-Artificial-Intelligence-XAI-) |
114 | |  2019-11-14     | [代價敏感 Cost Sensitive](https://github.com/wutsungyu/Cost-Sensitive) |
115 | 


--------------------------------------------------------------------------------
/Source-File/主成份分析/.Rhistory:
--------------------------------------------------------------------------------
 1 | # Publish the rendered note to RPubs. If an article id was saved by an
 2 | # earlier run, update that article; otherwise upload a new article and
 3 | # save the returned id for the next run.
 4 | require(markdown)
 5 | api_id.path <- 'rpubs_conn/api_id.txt'  # file holding the saved article id
 6 | note.title <- 'R7'
 7 | note.html <- 'R7.html'
 8 | # Update
 9 | if (file.exists(api_id.path)){
10 | print('Start Updating')
11 | # the id is the first field of the first line of the file
12 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = FALSE)[, 1]
13 | # update article on Rpubs
14 | update.result <- rpubsUpload(title = note.title,
15 | htmlFile = note.html,
16 | id = api.id
17 | )
18 | browseURL(update.result$continueUrl)
19 | print('update success')
20 | update.result$continueUrl
21 | # Upload
22 | }else{
23 | print('Start Uploading')
24 | # create the folder for the id file; stay quiet if it already exists
25 | dir.create(dirname(api_id.path), showWarnings = FALSE)
26 | # upload article on Rpubs; use note.html so both branches publish the same file
27 | upload.result <- rpubsUpload(title = note.title,
28 | htmlFile = note.html
29 | )
30 | # save the new article id so the next run takes the update branch
31 | write.table(upload.result$id, api_id.path, row.names = FALSE, col.names = FALSE)
32 | browseURL(upload.result$continueUrl)
33 | print('upload success')
34 | upload.result$continueUrl
35 | }
36 | 


--------------------------------------------------------------------------------
/Source-File/主成份分析/.Rprofile:
--------------------------------------------------------------------------------
1 | # Set the R option rpubs.upload.method to "internal".
2 | # NOTE(review): presumably read by rpubsUpload() to choose how the HTTP upload is performed; confirm against the package documentation.
3 | options(rpubs.upload.method = "internal")


--------------------------------------------------------------------------------
/Source-File/主成份分析/2012MLB.csv:
--------------------------------------------------------------------------------
 1 | Team,G,R,H,H1B,H2B,H3B,HR,RBI,BB,SO,SB,AVG,OBP
 2 | Texas Rangers,152,764,1444,941,283,32,188,738,448,1030,89,0.275,0.335
 3 | Los Angeles Angels,153,726,1424,972,252,20,180,691,423,1041,125,0.273,0.331
 4 | Colorado Rockies,152,718,1425,932,286,49,158,680,432,1129,96,0.272,0.329
 5 | St. Louis Cardinals,153,723,1447,983,277,35,152,691,503,1128,89,0.272,0.338
 6 | San Francisco Giants,153,683,1419,997,273,54,95,642,453,1032,111,0.27,0.327
 7 | Detroit Tigers,152,689,1379,922,267,37,153,663,490,1042,53,0.268,0.336
 8 | Kansas City Royals,152,644,1412,971,282,37,122,613,380,958,126,0.267,0.319
 9 | Boston Red Sox,154,716,1413,907,330,16,160,678,411,1129,92,0.264,0.319
10 | Minnesota Twins,153,673,1377,964,258,29,126,640,481,1004,129,0.262,0.326
11 | Washington Nationals,152,680,1377,894,281,23,179,640,448,1236,95,0.261,0.322
12 | New York Yankees,152,736,1345,849,259,12,225,710,523,1115,90,0.261,0.333
13 | Milwaukee Brewers,152,733,1362,854,284,36,188,699,447,1185,150,0.261,0.327
14 | Arizona Diamondbacks,152,693,1334,855,293,32,154,673,504,1190,87,0.261,0.329
15 | Philadelphia Phillies,153,652,1340,910,253,25,152,628,429,1021,112,0.256,0.317
16 | Chicago White Sox,152,702,1321,875,220,29,197,681,423,1124,101,0.255,0.317
17 | Cincinnati Reds,153,651,1315,834,284,28,169,618,462,1182,87,0.253,0.318
18 | New York Mets,152,612,1276,856,272,19,129,591,482,1186,72,0.25,0.317
19 | Atlanta Braves,153,672,1283,860,251,27,145,635,537,1215,93,0.249,0.321
20 | Cleveland Indians,153,620,1290,890,246,24,130,590,513,1005,102,0.249,0.321
21 | Los Angeles Dodgers,153,591,1270,894,252,21,103,563,462,1094,97,0.248,0.314
22 | San Diego Padres,153,617,1269,858,257,41,113,579,506,1173,142,0.248,0.32
23 | Baltimore Orioles,152,667,1296,826,260,16,194,635,456,1227,51,0.247,0.312
24 | Pittsburgh Pirates,152,622,1250,825,229,34,162,591,413,1271,66,0.245,0.306
25 | Miami Marlins,153,587,1260,843,250,37,130,555,458,1143,144,0.245,0.309
26 | Toronto Blue Jays,151,669,1242,804,231,21,186,631,439,1178,121,0.243,0.307
27 | Chicago Cubs,153,586,1238,821,251,36,130,545,413,1171,92,0.242,0.302
28 | Tampa Bay Rays,153,659,1229,802,241,29,157,628,539,1241,131,0.241,0.318
29 | Houston Astros,153,551,1211,821,232,27,131,514,443,1293,103,0.237,0.303
30 | Oakland Athletics,152,651,1223,767,249,29,178,622,515,1296,117,0.236,0.309
31 | Seattle Mariners,153,573,1206,824,218,26,138,543,436,1183,101,0.233,0.294
32 | 


--------------------------------------------------------------------------------
/Source-File/主成份分析/style.css:
--------------------------------------------------------------------------------
 1 | /* Whole document: serif body text. The generic family is the fallback
 2 |    when "Times New Roman" is not installed. */
 3 | body{
 4 |   font-family: "Times New Roman", serif;
 5 |   font-size: 14pt;
 6 | }
 7 | 
 8 | /* Code elements with class "r". */
 9 | code.r{
10 |   font-size: 14pt;
11 |   font-family: "Consolas", monospace;
12 | }
13 | 
14 | /* Preformatted blocks. Without the monospace fallback, systems lacking
15 |    Consolas fall back to the browser's default font, which is typically
16 |    proportional and breaks code alignment. */
17 | pre {
18 |   font-size: 14.5px;
19 |   font-family: "Consolas", monospace;
20 | }


--------------------------------------------------------------------------------
/Source-File/主成份分析/主成份分析.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "DM---主成份分析(PCA)"
  3 | author: "POLab"
  4 | date: "2017/05/19"
  5 | output:
  6 |   html_document:
  7 |       css: style.css
  8 | ---
  9 | 
 10 | <a href="https://github.com/PO-LAB/Data-Mining" target="_blank">【回到首頁】</a> 
 11 |    
 12 | ------
 13 |   
 14 | 本篇內容，會繼續介紹一些常用的資料探勘模型：   
 15 |    
 16 | ------
 17 |    
 18 | # **主成份分析(Principal Component Analysis)**   
 19 |    
 20 | 這裡拿網路上一個公開資料，2012年美國職棒MLB的資料，來進行分析，<a href="https://sites.google.com/site/rlearningsite/data/2012MLB.csv?attredirects=0" target="_blank">資料載點如下</a>。   
 21 | 
 22 | 在本篇文章中，會使用到以下函式：   
 23 | 
 24 | * prcomp()：主成份分析的基本函式   
 25 | 
 26 | * plot()：繪製陡坡圖(scree plot)，選擇多少個主成份   
 27 | 
 28 | * dotchart()：繪製主成份負荷圖(PCA loadings plot) 
 29 | 
 30 | * biplot()：繪製主成份負荷圖(PCA loadings plot)    
 31 |    
 32 | ## 1. 主成份分析
 33 | 當下載好資料後，第一步便是先讀取資料：
 34 | ```{r}
 35 | data <- read.csv("2012MLB.csv",  # 資料檔名 
 36 |                  header=T,          # 資料中的第一列，作為欄位名稱
 37 |                  sep=",")           # 將逗號視為分隔符號來讀取資料
 38 | 
 39 | head(data)
 40 | ```   
 41 |    
 42 | 在這裡，選擇**一壘安打、二壘安打、三壘安打、全壘打、打點、盜壘次數、四壞球**，這七個變數，進行主成份分析`prcomp()`：   
 43 | ```{r}
 44 | pca <- prcomp(formula = ~ H1B+H2B+H3B+HR+RBI+SB+BB,  #選擇七個變數 
 45 |               data = data,                           # 資料
 46 |               scale = TRUE)                          # 正規化資料
 47 | # 這就是我們的主成份
 48 | pca  
 49 | ```
 50 | 上面的報表是這樣解釋：   
 51 | 
 52 | * Standard deviations：特徵值開根號   
 53 | 
 54 | * Rotation：特徵向量，也就是各個主成份，所對應的線性組合(linear combination)的係數   
 55 | 
 56 | ## 2. 選擇多少個主成份？   
 57 | 
 58 | 當主成份算出來以後，接下來要做的是「選擇幾個主成份」！   
 59 | 
 60 | 我們可以繪製「陡坡圖Scree plot」以及「累積解釋圖Pareto plot」：
 61 | 
 62 | ### ‧陡坡圖(Scree plot)   
 63 | 
 64 | ```{r}
 65 | # 使用plot()函式
 66 | plot(pca,         # 放pca
 67 |      type="line", # 用直線連結每個點
 68 |      main="Scree Plot for 2012MLB") # 主標題
 69 | 
 70 | # 用藍線標示出特徵值=1的地方
 71 | abline(h=1, col="blue") # Kaiser eigenvalue-greater-than-one rule
 72 | ```
 73 |    
 74 | 根據**凱莎原則**，特徵值大於1的主成份就可以選取；而且第三個以後的主成份變異趨於平緩，因此選擇**前三個主成份**是比較好的選擇。    
 75 | 
 76 | 
 77 | ### ‧累積解釋圖(Pareto plot)   
 78 | 這裡就比較複雜，需要進行四個步驟：
 79 | 
 80 | 1. 求出每個主成份的特徵值(也就是variance = std^2)   
 81 | 
 82 | ```{r}
 83 |     vars <- (pca$sdev)^2  # 從pca中取出標準差(pca$sdev)後再平方，計算variance(特徵值)
 84 |     vars
 85 | ```   
 86 |    
 87 | 2. 計算每個主成分的解釋比例 = 各個主成份的特徵值/總特徵值   
 88 | ```{r}
 89 |     # 計算每個主成分的解釋比例 = 各個主成分的特徵值/總特徵值
 90 |     props <- vars / sum(vars)    
 91 |     props
 92 | ```   
 93 |    
 94 | 3. 累加每個主成份的解釋比例(aggregated effects)
 95 | ```{r}
 96 |     cumulative.props <- cumsum(props)  # 累加前n個元素的值
 97 |     cumulative.props
 98 | ```   
 99 | 
100 | 4. 把累積解釋比例畫成圖：
101 | ```{r}
102 |     #當我們取前三個主成份，可以解釋 70.64% 的變異
103 |     cumulative.props[3]
104 | 
105 |     # 累積解釋比例圖
106 |     plot(cumulative.props)
107 | ```
108 | 
109 | 所以原本的資料集，經過主成份分析後，會轉換成新的**以主成份代替**的資料集(pca$x)。   
110 | 以下步驟是取**前三個主成份**，作為新的資料集：
111 | ```{r}
112 | # pca$rotation 
113 | top3_pca.data <- pca$x[, 1:3]
114 | top3_pca.data 
115 | ```
116 | 
117 | ## 3. 主成份負荷 (主成份和原變數的關係)  
118 | 
119 | 每一個主成份，都是**原變數經過線性組合**後產生的值。   
120 | 
121 | 而要解釋主成份的話，就需要觀察主成份和原變數之間的關係，也就是觀察原變數在線性組合中的**係數**(特徵向量)，對主成份究竟是正面還是負面、具有多大的影響。
122 | 
123 | ```{r}
124 | # 特徵向量(原變數的線性組合)
125 | pca$rotation
126 | ```
127 |    
128 | 取**前三個主成份**的特徵向量：
129 | ```{r}
130 | top3.pca.eigenvector <- pca$rotation[, 1:3]
131 | top3.pca.eigenvector
132 | ```
133 |    
134 | 我們可以繪製主成份負荷圖，觀察原變數和主成份之間的關係：
135 | ```{r}
136 | first.pca <- top3.pca.eigenvector[, 1]   #  第一主成份
137 | second.pca <- top3.pca.eigenvector[, 2]  #  第二主成份
138 | third.pca <- top3.pca.eigenvector[, 3]   #  第三主成份
139 | ```
140 | 
141 | (以下有用到**排序**的技巧，可以參考<a href="http://www.rpubs.com/skydome20/R-Note3-function_and_package" target="_blank">前篇筆記</a> 內的`order(), sort()`)   
142 | 
143 | ### ‧第一主成份：   
144 | 
145 | **SB(盜壘)、BB(四壞球)與PC-1呈現正相關**，看起來和「上壘」有關。
146 | ```{r}
147 | # 第一主成份：由小到大排序原變數的係數
148 | first.pca[order(first.pca, decreasing=FALSE)]  
149 | # 使用dotchart，繪製主成份負荷圖
150 | dotchart(first.pca[order(first.pca, decreasing=FALSE)] ,   # 排序後的係數
151 |          main="Loading Plot for PC1",                      # 主標題
152 |          xlab="Variable Loadings",                         # x軸的標題
153 |          col="red")                                        # 顏色
154 | ```     
155 |    
156 |    
157 | ### ‧第二主成份：   
158 | **HR(全壘打)、BB(四壞球)、RBI(打點)與PC-2呈現正相關**，看起來和「打擊者」有關。   
159 | ```{r}
160 | # 第二主成份：由小到大排序原變數的係數
161 | second.pca[order(second.pca, decreasing=FALSE)]  
162 | # 使用dotchart，繪製主成份負荷圖
163 | dotchart(second.pca[order(second.pca, decreasing=FALSE)] ,  # 排序後的係數
164 |          main="Loading Plot for PC2",                       # 主標題
165 |          xlab="Variable Loadings",                          # x軸的標題
166 |          col="blue")                                        # 顏色
167 | ```     
168 |    
169 | 
170 | ### ‧第三主成份：   
171 | **H1B(一壘安打)、H2B(二壘安打)與PC-3呈現正相關**，看起來和「安打」有關。
172 | 
173 | ```{r}
174 | # 第三主成份：由小到大排序原變數的係數
175 | third.pca[order(third.pca, decreasing=FALSE)]  
176 | # 使用dotchart，繪製主成份負荷圖
177 | dotchart(third.pca[order(third.pca, decreasing=FALSE)] ,   # 排序後的係數
178 |          main="Loading Plot for PC3",                      # 主標題
179 |          xlab="Variable Loadings",                         # x軸的標題
180 |          col="purple")                                     # 顏色
181 | ```     
182 |    
183 | 我們也可以繪製另一種主成份負荷圖，觀察每個球隊擅長的特性是什麼：
184 | 
185 | * 右邊的球隊適合上壘，多以盜壘(SB)和四壞球(BB)保送見長，全壘打表現中等(e.g.編號19)   
186 | 
187 | * 左上方的球隊以力量取勝，在全壘打(HR)和打點(RBI)上有顯著的優勢(e.g.編號11)   
188 | 
189 | * 下方的球隊不擅長全壘打，但在安打上的表現遠勝於其他球隊，盜壘也有一定水準(e.g.編號5)
190 | 
191 | ```{r}
192 | # 選取 PC1 和 PC2 繪製主成份負荷圖
193 | biplot(pca, choices=1:2)  
194 | ```
195 |    
196 | (註)   
197 | 
198 | 根據2012年美國職棒的統計：   
199 | 
200 | 1. 編號19的克里夫蘭印地安人隊，累積102次盜壘、513次四壞球保送，全壘打130支。   
201 | 
202 | 2. 編號11的紐約洋基隊，全壘打產量累積225支，高居全聯盟之冠。      
203 | 
204 | 3. 編號5的舊金山巨人隊，全壘打累積95支，是所有球隊唯一未破百的隊伍；但一壘安打卻將近1000支，盜壘111次。   
205 | 
206 | ------
207 |    
208 | # **總結**  
209 | 
210 | 本篇筆記以2012年美國職棒資料進行主成份分析，練習如何挑選幾個主成份，並且根據主成份負荷進行解釋上的探討。
211 | 
212 | 主成份分析，因為會對原始資料進行轉軸，因此有時候會比較難解釋；不過換個角度思考，只要能從主成份中找到有趣的故事，那麼這一次分析往往就能發現有價值的策略或insight。   
213 | 
214 | It's still a long way to go~   
215 |    
216 | # **Reference**  
217 | 本篇筆記參考<a href="https://sites.google.com/site/rlearningsite/factor/pca" target="_blank">R的世界-主成分分析</a>製作而成。  
218 | 
219 | 
220 | 


--------------------------------------------------------------------------------
/Source-File/分割、合併、離群值、虛擬變數/1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/分割、合併、離群值、虛擬變數/1.PNG


--------------------------------------------------------------------------------
/Source-File/分割、合併、離群值、虛擬變數/2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/分割、合併、離群值、虛擬變數/2.PNG


--------------------------------------------------------------------------------
/Source-File/分割、合併、離群值、虛擬變數/3.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/分割、合併、離群值、虛擬變數/3.PNG


--------------------------------------------------------------------------------
/Source-File/分割、合併、離群值、虛擬變數/4.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/分割、合併、離群值、虛擬變數/4.PNG


--------------------------------------------------------------------------------
/Source-File/分割、合併、離群值、虛擬變數/5.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/分割、合併、離群值、虛擬變數/5.PNG


--------------------------------------------------------------------------------
/Source-File/分割、合併、離群值、虛擬變數/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/分割、合併、離群值、虛擬變數/6.png


--------------------------------------------------------------------------------
/Source-File/分割、合併、離群值、虛擬變數/7.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/分割、合併、離群值、虛擬變數/7.PNG


--------------------------------------------------------------------------------
/Source-File/分割、合併、離群值、虛擬變數/8.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/分割、合併、離群值、虛擬變數/8.PNG


--------------------------------------------------------------------------------
/Source-File/分割、合併、離群值、虛擬變數/9.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/分割、合併、離群值、虛擬變數/9.PNG


--------------------------------------------------------------------------------
/Source-File/分割、合併、離群值、虛擬變數/Thumbs.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/分割、合併、離群值、虛擬變數/Thumbs.db


--------------------------------------------------------------------------------
/Source-File/分割、合併、離群值、虛擬變數/_RData.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/分割、合併、離群值、虛擬變數/_RData.gz


--------------------------------------------------------------------------------
/Source-File/分割、合併、離群值、虛擬變數/_Rhistory:
--------------------------------------------------------------------------------
  1 | split_data <- as.data.frame(split_data)
  2 | View(split_data)
  3 | require(datasets)  # source package
  4 | data <- iris
  5 | View(data)
  6 | split_data <- split(data, sample(rep(1:2, 75)))
  7 | split_data <- as.data.frame(split_data)
  8 | View(split_data)
  9 | data <- iris
 10 | subset(data, Sepal.Length > 5)
 11 | subset_data <- subset(data, Sepal.Length > 5) # 只會出現 Sepal.Length > 5 的資料
 12 | View(subset_data)
 13 | # Chunk 1
 14 | require(datasets)  # source package
 15 | data <- iris
 16 | # Chunk 2
 17 | split_data <- split(data, sample(rep(1:2, 75)))
 18 | # Chunk 1
 19 | require(datasets)  # source package
 20 | data <- iris
 21 | # Chunk 2
 22 | split_data <- split(data, sample(rep(1:2, 75)))
 23 | # Chunk 1
 24 | require(datasets)  # source package
 25 | data <- iris
 26 | # Chunk 2
 27 | split_data <- split(data, sample(rep(1:2, 75)))
 28 | # Chunk 1
 29 | require(datasets)  # source package
 30 | data <- iris
 31 | # Chunk 2
 32 | split_data <- split(data, sample(rep(1:2, 75)))
 33 | require(datasets)  # source package
 34 | data <- iris
 35 | subset(data, Sepal.Length > 5) # 只會出現 Sepal.Length > 5 的資料
 36 | require(datasets)  # source package
 37 | data <- iris
 38 | subset(data, Sepal.Length > 5) # 只會出現 Sepal.Length > 5 的資料
 39 | data <- iris
 40 | data <- iris
 41 | subset(data, Sepal.Length > 5) # 只會出現 Sepal.Length > 5 的資料
 42 | subset(data, Sepal.Length > 5)
 43 | data1 <- subset(data, Sepal.Length > 5)
 44 | data2 <- subset(data, Sepal.Length > 5,select = Sepal.Length)
 45 | data3 <- subset(data, Sepal.Length > 5,select = - Sepal.Length
 46 | data3 <- subset(data, Sepal.Length > 5,select = - Sepal.Length)
 47 | data3 <- subset(data, Sepal.Length > 5,select = -Sepal.Length)
 48 | require(datasets)  # source package
 49 | data <- iris
 50 | data1 <- subset(data, Sepal.Length > 5)
 51 | data2 <- subset(data, Sepal.Length > 5,select = Sepal.Length)
 52 | data3 <- subset(data, Sepal.Length > 5,select = -Sepal.Length)
 53 | View(data)
 54 | View(data1)
 55 | View(data)
 56 | View(data1)
 57 | View(data2)
 58 | data2 <- subset(data, Sepal.Length > 5,select = c(Sepal.Length,species))
 59 | data1 <- subset(data, Sepal.Length > 5)
 60 | data2 <- subset(data, Sepal.Length > 5,select = c(Sepal.Length,species))
 61 | data2 <- subset(data, Sepal.Length > 5,select = c("Sepal.Length","species"))
 62 | data1 <- subset(data, Sepal.Length > 5)
 63 | data2 <- subset(data, Sepal.Length > 5,select = c("Sepal.Length","species"))
 64 | data1 <- subset(data, Sepal.Length > 5)
 65 | data <- iris
 66 | data1 <- subset(data, Sepal.Length > 5)
 67 | data2 <- subset(data, Sepal.Length > 5,select = c("Sepal.Length","species"))
 68 | data3 <- subset(data, Sepal.Length > 5,select = -Sepal.Length)
 69 | View(data)
 70 | data2 <- subset(data, Sepal.Length > 5, select = c("Sepal.Length","species"))
 71 | data2 <- subset(data, Sepal.Length > 5, select = Sepal.Length)
 72 | View(data2)
 73 | require(datasets)  # source package
 74 | data <- iris
 75 | data1 <- subset(data, Sepal.Length > 5)
 76 | View(data1)
 77 | data2 <- subset(data, Sepal.Length > 5, select = c("Sepal.Length",))
 78 | data2 <- subset(data, Sepal.Length > 5, select = c("Sepal.Length","species"))
 79 | data2 <- subset(data, Sepal.Length > 5, select = c("Sepal.Length","Species"))
 80 | data3 <- subset(data, Sepal.Length > 5,select = -Sepal.Length)
 81 | View(data2)
 82 | View(data3)
 83 | ```{r, results='hide'}
 84 | subset(data, Sepal.Length > 5,select = select = c("Sepal.Length","Species")) # 只會出現 Sepal.Length > 5 的資料且欄位只會出現 Sepal.Length 和 Species
 85 | rbind(x,y)
 86 | cbind(x,y)
 87 | x <- c(1, 2, 3)
 88 | y <- c(10, 20, 5)
 89 | x <- c(1, 2, 3)
 90 | y <- c(10, 20)
 91 | rbind(x,y)
 92 | cbind(x,y)
 93 | y <- c(10, 20, 5)
 94 | rbind(x,y)
 95 | cbind(x,y)
 96 | ID <- c(1,2,3,4)
 97 | Name <- c("A","B","C","D")
 98 | Student1 <- data.frame(ID,Name)
 99 | ID <- c(5,6,7,8)
100 | Name <- c("E","F","G","H")
101 | Student2 <- data.frame(ID,Name)
102 | union(Student1,Student2)
103 | union()
104 | Total_student <- rbind(Student1,Student2)
105 | View(Total_student)
106 | ID <- c(1,2,3,4)
107 | Name <- c("A","B","C","D")
108 | Score <- c(60,70,80,90)
109 | Sex <- c("M","F","M","M")
110 | Student1 <- data.frame(ID,Name)
111 | Student2 <- data.frame(Score,Sex)
112 | View(Student1)
113 | View(Student2)
114 | cbind(Student1,Student2)
115 | install.packages("outliers")
116 | ---
117 | title: "R_programming - (5)資料預處理(Data preprocessing)"
118 | author: "Allan (allan811118@gmail.com)"
119 | date: "2017/04/22"
120 | output:
121 | html_document:
122 | css: style.css
123 | ---
124 | <a href="http://rpubs.com/allan811118/R_programming_00" target="_blank">【回到課程大綱】</a>
125 | ------
126 | 本篇內容為「資料預處理」(或者稱**資料清洗**)的手法上。
127 | 畢竟在資料分析的流程中，其實有60~70%的時間是在進行「資料預處理」，如果沒有好的資料，後續的分析其實就可能會有很大的偏誤。
128 | 在「資料預處理」時，我們時常會遇到很多問題需要解決。
129 | 當然，也有有很多對應的小技巧，可以幫助我們處理這些問題。
130 | 首先，因為我們之後會使用到非常多的套件，故必須先更新R的版本至<a href="http://cran.csie.ntu.edu.tw/" target="_blank">【3.4.0】</a> ，而本篇內容有**資料分割**、**資料合併**、**處理離群值(outlier)**和**轉虛擬變數(Dummy variable)**等技巧！
131 | ------
132 | #**1. 資料分割**
133 | 當我們想要將一個表單切割成不同的子表單時，會使用到以下的函式:
134 | `split()`、`subset()`，以下會詳細介紹其函式的用法。
135 | 我們再拿我們熟悉的好朋友，鳶尾花資料(iris)來練習吧^_<
136 | **● 使用`split()`函式進行資料分割:**
137 | ```{r}
138 | require(datasets)  # source package
139 | data <- iris
140 | ```
141 | <img src="1.png"/>
142 | ```{r}
143 | split_data <- split(data, sample(rep(1:2, 75)))
144 | ```
145 | <img src="2.png"/>
146 | 由於rep(1:2,75)產生1,2交錯的向量，但加了前面的sample則是隨機抽取，所以向量1,2會被打亂，split會依照sample(rep(1:2,75))分組，都是1的會在同一組，都是2的也會在同一組。
147 | **● 使用`subset()`函式進行資料分割:**
148 | ```{r, results='hide'}
149 | require(datasets)  # source package
150 | data <- iris
151 | subset(data, Sepal.Length > 5) # 只會出現 Sepal.Length > 5 的資料
152 | ```
153 | <img src="3.png"/>
154 | ```{r, results='hide'}
155 | subset(data, Sepal.Length > 5,select = c("Sepal.Length","Species")) # 只會出現 Sepal.Length > 5 的資料且欄位只會出現 Sepal.Length 和 Species
156 | ```
157 | <img src="4.png"/>
158 | ```{r, results='hide'}
159 | subset(data, Sepal.Length > 5,select = - Sepal.Length) # selct = 負的代表不要出現的欄位
160 | ```
161 | <img src="5.png"/>
162 | ------
163 | #**2. 資料合併**
164 | 當我們想要將兩筆資料合併時，會使用到以下的函式:
165 | `rbind()`、`cbind()`、`merge()`，以下會詳細介紹其函式的用法。
166 | **● 使用`rbind()`函式進行資料合併:**
167 | `rbind()`可以用來追加資料，需要對應欄位(變數)名稱
168 | ```{r, echo=TRUE}
169 | # 首先先建立兩個 Data frame
170 | ID <- c(1,2,3,4)
171 | Name <- c("A","B","C","D")
172 | Student1 <- data.frame(ID,Name)
173 | ID <- c(5,6,7,8)
174 | Name <- c("E","F","G","H")
175 | Student2 <- data.frame(ID,Name)
176 | ```
177 | ```{r, echo=TRUE}
178 | # 透過 row 合併
179 | rbind(Student1,Student2)
180 | ```
181 | **● 使用`cbind()`函式進行資料合併:**
182 | `cbind()`可以用來新增變數到原本的資料表單中，不需要對應欄位(變數)名稱
183 | ```{r, echo=TRUE}
184 | # 首先先建立兩個 Data frame
185 | ID <- c(1,2,3,4)
186 | Name <- c("A","B","C","D")
187 | Score <- c(60,70,80,90)
188 | Sex <- c("M","F","M","M")
189 | Student1 <- data.frame(ID,Name)
190 | Student2 <- data.frame(Score,Sex)
191 | ```
192 | ```{r, echo=TRUE}
193 | # 透過 column 合併
194 | cbind(Student1,Student2)
195 | ```
196 | **● 使用`merge()`函式進行資料合併:**
197 | `merge()`能夠依據兩個表單中共同有的欄位(變數)名稱來合併資料
198 | ```{r}
199 | # 首先先建立兩個 data frame
200 | df1 <- data.frame(CustomerId = c(1:5), Product = c(rep("Toaster", 3), rep("Radio", 2)))
201 | df2 <- data.frame(CustomerId = c(2, 4, 6), State = c(rep("Alabama", 2), rep("Ohio", 1)))
202 | ```
203 | <img src="6.png" height="400px" width="300px" />
204 | 將兩個 data frame 透過 "CustomerId" 欄位進行合併:
205 | ```{r}
206 | #Inner join，保留兩資料集 "CustomerId" 欄位中，取交集的值來合併
207 | merge(x = df1, y = df2, by = "CustomerId")
208 | ```
209 | merge 函式的第一、二個參數是指定要合併的資料表，而 by 參數則是指定資料辨識的依據欄位
210 | ```{r}
211 | #Full join，保留兩資料集 "CustomerId" 欄位中，取聯集的值來合併
212 | merge(x = df1, y = df2, by = "CustomerId", all = TRUE)
213 | ```
214 | all 是用來詢問是否顯示所有欄位的資料
215 | ```{r}
216 | #Left join，保留 x (df1表單) "CustomerId" 欄位中的值來合併
217 | merge(x = df1, y = df2, by = "CustomerId", all.x = TRUE)
218 | ```
219 | ```{r}
220 | #Right join，保留 y (df2表單) "CustomerId" 欄位中的值來合併
221 | merge(x = df1, y = df2, by = "CustomerId", all.y = TRUE)
222 | ```
223 | 要注意，`merge()`僅針對兩筆具有共同變數的資料進行合併，由於`merge()`會針對by參數所指定的變數做交叉比對，因此該變數的編碼值必須是「單一獨立」且不能「重複」。例如「學號」、「身分證號」等，否則merge會出現個案增多的錯誤結果。
224 | 同學們也可以試試看`dplyr`套件來進行資料處理，當資料量很大時，處理速度會比一般的函式還要快一些哦~
225 | ------
226 | #**3. 處理離群值(outlier)**
227 | 在探勘的流程中，資料中是否存在離群值(outlier)，可能會嚴重影響到資料分析的結果，甚至會影響到模式建立的正確性。
228 | 因此判斷離群值的方法便相當重要，以下將介紹四種以敘述統計為基礎的離群值判斷方法，，包括標準化分數法、Hampel identifier法、盒鬚圖法和截尾平均值法。
229 | **● 標準化分數判斷法:**
230 | 將資料轉成標準化分數或Z分數進行判斷，根據常態分配的性質，約有99%資料的Z分數會落在平均值的3倍標準差之內，因此Z分數大於3或小於-3的數據將視為離群值(可自訂其他數據為切割點)
231 | ```{r}
232 | #
233 | ```
234 | ------
235 | #**4. 轉虛擬變數(Dummy variable)**
236 | ------
237 | #**總結**
238 | 由於未經處理的資料經常會有資料不完整、不一致或存在雜訊的問題。
239 | 因此在資料預處理時就要將這些問題排除，畢竟往後的分析流程都是延續我們預處理完的資料，有好品質的資料才能挖掘出好品質的資訊。
240 | 加油~
241 | ------
242 | >####**Reference - skydome20**<a href="https://rpubs.com/skydome20/Table" target="_blank">【R系列筆記】</a>
243 | View(data)
244 | scale(data, center,scale)
245 | scale(data, Sepal.Length,scale)
246 | scale(x, center=F, scale=T)
247 | scale(data, center=F, scale=T)
248 | scale(data$"iris", center=F, scale=T)
249 | boxplot(data)
250 | View(data)
251 | View(df2)
252 | View(data)
253 | scale(x, center = TRUE, scale = TRUE)
254 | scale(data, center = TRUE, scale = TRUE)
255 | View(data)
256 | scale(data$"iris", center = TRUE, scale = TRUE)
257 | scale(data, center = TRUE, scale = TRUE)
258 | View(Student1)
259 | data <- subset(data, Sepal.Length > 5,select = - Species)
260 | scale(data, center = TRUE, scale = TRUE)
261 | data <- iris
262 | data <- subset(data,select = - Species)
263 | data <- iris
264 | data <- subset(data,select = - Species)
265 | data <- scale(data, center = TRUE, scale = TRUE)
266 | View(data)
267 | data <- subset(data,Sepal.Length > 3)
268 | data <- subset(data, Sepal.Length > 3)
269 | data <- subset(data, Sepal.Width > 3)
270 | data <- iris
271 | data <- subset(data,select = - Species)
272 | scale_data <- scale(data, center = TRUE, scale = TRUE)
273 | View(scale_data)
274 | boxplot(scale_data)
275 | boxplot(scale_data)
276 | summary(scale_data)
277 | summary <- summary(scale_data)
278 | (scale_data)
279 | summary(data)
280 | boxplot(data)
281 | data <- iris
282 | boxplot(data)
283 | summary(data)
284 | boxplot(data)
285 | boxplot(data)
286 | data <- subset(data,select = - Species)
287 | boxplot(data)
288 | summary(data)
289 | data <- iris
290 | data <- subset(data,select = - Species)
291 | scale_data <- scale(data, center = TRUE, scale = TRUE)
292 | View(scale_data)
293 | data <- subset(data, Sepal.Width > 2)
294 | View(data)
295 | data <- iris
296 | data <- subset(data,select = - Species)
297 | scale_data <- scale(data, center = TRUE, scale = TRUE)
298 | View(scale_data)
299 | data <- subset(scale_data, Sepal.Width > 2)
300 | data <- subset(scale_data, Sepal.Length > 2)
301 | data <- iris
302 | data <- subset(data,select = - Species)
303 | scale_data <- scale(data, center = TRUE, scale = TRUE)
304 | subset(scale_data, Sepal.Length > 2)
305 | require(datasets)  # source package
306 | data <- iris
307 | subset(data, Sepal.Length > 5) # 只會出現 Sepal.Length > 5 的資料
308 | data <- iris
309 | data <- subset(data,select = - Species)
310 | scale_data <- scale(data, center = TRUE, scale = TRUE)
311 | View(scale_data)
312 | subset(scale_data, Sepal.Length > 2.0)
313 | scale_data <- as.data.frame(scale_data)
314 | subset(scale_data, Sepal.Length > 2.0)
315 | View(scale_data)
316 | View(scale_data)
317 | subset(scale_data, Sepal.Length > 2)
318 | subset(scale_data, Sepal.Length > 2,Sepal.Width > 2)
319 | scale_data <- as.data.frame(scale_data)
320 | subset(scale_data, Sepal.Length > 2,Sepal.Width > 2)
321 | subset(scale_data, Sepal.Length > 2)
322 | subset(scale_data, Sepal.Widthh > 2)
323 | subset(scale_data, Sepal.Width > 2)
324 | subset(scale_data, Sepal.Length > 2)
325 | subset(scale_data, Sepal.Width > 2)
326 | subset(scale_data, Petal.Length > 2)
327 | subset(scale_data, Petal.Width > 2)
328 | subset(scale_data, Sepal.Length < -2)
329 | subset(scale_data, Sepal.Width < -2)
330 | subset(scale_data, Sepal.Length < 2)
331 | data <- iris
332 | data <- subset(data,select = - Species)
333 | scale_data <- scale(data, center = TRUE, scale = TRUE)
334 | scale_data <- as.data.frame(scale_data)
335 | subset(scale_data, Sepal.Length < 2)
336 | subset(scale_data, Sepal.Length < 2)
337 | View(scale_data)
338 | scale_data <- subset(scale_data, Sepal.Length < 2)
339 | scale_data <- subset(scale_data, Sepal.Width < 2)
340 | scale_data <- subset(scale_data, Sepal.Length < 2 | Sepal.Width < 2) # 刪除Sepal.Length欄位中，Z分數小於2的值
341 | data <- iris
342 | data <- subset(data,select = - Species) # 去除不是數值型態的column
343 | scale_data <- scale(data, center = TRUE, scale = TRUE) # 標準化表單中的數值
344 | scale_data <- as.data.frame(scale_data) # 轉成Data frame型態
345 | scale_data <- subset(scale_data, Sepal.Length < 2 | Sepal.Width < 2) # 刪除Sepal.Length欄位中，Z分數小於2的值
346 | scale_data <- subset(scale_data, Sepal.Length < 2 | Sepal.Width < 2)
347 | scale_data <- subset(scale_data, Sepal.Length < 2 | Sepal.Width < 2 )
348 | scale_data <- subset(scale_data, Sepal.Length < 2 | Sepal.Width < 2 )
349 | scale_data <- subset(scale_data, Sepal.Length < 2 )
350 | scale_data <- subset(scale_data, Sepal.Length < 2 || Sepal.Width < 2 )
351 | scale_data <- subset(scale_data, Sepal.Width < 2 )
352 | scale_data <- as.data.frame(scale_data)
353 | data <- subset(data,select = - Species)
354 | scale_data <- scale(data, center = TRUE, scale = TRUE)
355 | scale_data <- as.data.frame(scale_data)
356 | scale_data <- subset(scale_data, Sepal.Width < 2 | Sepal.Width < 2)
357 | scale_data <- subset(scale_data, Sepal.Width < 2 | Sepal.Width < 2)
358 | data <- subset(data,select = - Species)
359 | scale_data <- scale(data, center = TRUE, scale = TRUE)
360 | scale_data <- as.data.frame(scale_data)
361 | scale_data <- subset(scale_data, Sepal.Width < 2 | Sepal.Width < 2)
362 | data <- subset(data,select = - Species)
363 | scale_data <- scale(data, center = TRUE, scale = TRUE)
364 | scale_data <- as.data.frame(scale_data)
365 | View(scale_data)
366 | scale_data <- subset(scale_data, Sepal.Width < 2 | Sepal.Width < 2)
367 | scale_data <- subset(scale_data, Sepal.Width < 2 & Sepal.Width < 2)
368 | scale_data <- scale(data, center = TRUE, scale = TRUE)
369 | scale_data <- as.data.frame(scale_data)
370 | scale_data <- subset(scale_data, Sepal.Width < 2 & Sepal.Width < 2)
371 | data <- iris
372 | data <- subset(data,select = - Species)
373 | scale_data <- scale(data, center = TRUE, scale = TRUE)
374 | scale_data <- as.data.frame(scale_data)
375 | scale_data <- subset(scale_data, Sepal.Width < 2 & Sepal.Width < 2)
376 | View(scale_data)
377 | scale_data <- subset(scale_data, Sepal.Width < 2|Sepal.Width < 2)
378 | data <- subset(data,select = - Species)
379 | scale_data <- scale(data, center = TRUE, scale = TRUE)
380 | scale_data <- as.data.frame(scale_data)
381 | scale_data <- subset(scale_data, Sepal.Width < 2|Sepal.Width < 2)
382 | data <- subset(data,select = - Species)
383 | scale_data <- scale(data, center = TRUE, scale = TRUE)
384 | scale_data <- as.data.frame(scale_data)
385 | scale_data <- subset(scale_data, Sepal.Width<2|Sepal.Width<2)
386 | data <- subset(data,select = - Species)
387 | scale_data <- scale(data, center = TRUE, scale = TRUE)
388 | scale_data <- as.data.frame(scale_data)
389 | View(scale_data)
390 | scale_data <- subset(scale_data, Sepal.Length < 2 | Sepal.Width < 2)
391 | scale_data <- subset(scale_data, Sepal.Length < 2 & Sepal.Width < 2)
392 | scale_data
393 | scale_data <- subset(scale_data, Sepal.Length < 2 & Sepal.Width < 2 & Petal.Length < 2 & Petal.Width < 2)
394 | data <- subset(data,select = - Species)
395 | scale_data <- scale(data, center = TRUE, scale = TRUE)
396 | scale_data <- as.data.frame(scale_data)
397 | View(scale_data)
398 | scale_data <- subset(scale_data, Sepal.Length < 2 & Sepal.Width < 2 & Petal.Length < 2 & Petal.Width < 2)
399 | scale_data <- subset(scale_data, Sepal.Length > -2 & Sepal.Width > -2 & Petal.Length > -2 & Petal.Width > -2)
400 | View(scale_data)
401 | data <- iris
402 | #data <- subset(data,select = - Species) # 去除不是數值型態的column
403 | scale_data <- scale(data, center = TRUE, scale = TRUE) # 標準化表單中的數值
404 | data <- iris
405 | data <- subset(data,select = - Species)
406 | scale_data <- scale(data, center = TRUE, scale = TRUE)
407 | scale_data <- as.data.frame(scale_data)
408 | data <- iris
409 | data <- subset(data,select = - Species)
410 | scale_data <- scale(data, center = TRUE, scale = TRUE)
411 | scale_data <- subset(scale_data, Sepal.Length < 2 & Sepal.Width < 2 & Petal.Length < 2 & Petal.Width < 2)
412 | data <- iris
413 | data <- subset(data,select = - Species)
414 | scale_data <- scale(data, center = TRUE, scale = TRUE)
415 | scale_data <- as.data.frame(scale_data)
416 | scale_data <- subset(scale_data, Sepal.Length < 2 & Sepal.Width < 2 & Petal.Length < 2 & Petal.Width < 2)
417 | scale_data <- subset(scale_data, Sepal.Length > -2 & Sepal.Width > -2 & Petal.Length > -2 & Petal.Width > -2)
418 | data <- iris
419 | data <- subset(data,select = - Species)
420 | boxplot(data)
421 | boxplot(data)
422 | summary(data)
423 | boxplot(data)
424 | boxplot(data)
425 | boxplot(data)
426 | boxplot$out(data)
427 | boxplot(data)
428 | boxplot(data)
429 | boxplot(data)
430 | boxplot.stats(data)$out
431 | boxplot(data) # 繪製盒鬚圖
432 | data <- iris
433 | data <- subset(data,select = - Species) # 去除不是數值型態的column
434 | boxplot(data) # 繪製盒鬚圖
435 | data <- iris
436 | data <- subset(data,select = - Species)
437 | boxplot(data)
438 | boxplot.stats(data)$out
439 | data <- iris
440 | data <- subset(data,select = - Species)
441 | boxplot(data)
442 | boxplot(data)
443 | summary(data)
444 | data <- iris
445 | data <- subset(data,select = - Species)
446 | boxplot(data)
447 | data1 <- boxplot.stats(data, do.conf = FALSE, do.out = FALSE)
448 | data1 <- boxplot.stats(data, do.conf = FALSE, do.out = TRUE)
449 | data1 <- boxplot.stats(data$"iris", do.conf = FALSE, do.out = TRUE)
450 | data$out
451 | data <- iris
452 | data <- subset(data,select = - Species)
453 | boxplot(data)
454 | data$out
455 | bp <- boxplot(c(1,10:20,100,120))
456 | bp$out
457 | data <- iris
458 | data <- subset(data,select = - Species)
459 | boxplot(data)
460 | data$out
461 | summary(data)
462 | identify(data)
463 | boxplot(data)
464 | boxplot(data)
465 | identify(data)
466 | summary(data)
467 | identify(data)
468 | boxplot(data)
469 | identify(data)
470 | boxplot(data)
471 | data$out
472 | View(data)
473 | rm.outlier(data, fill = FALSE, median = FALSE, opposite = FALSE)
474 | library("outliers", lib.loc="~/R/win-library/3.4")
475 | data <- iris
476 | data <- subset(data,select = - Species)
477 | boxplot(data)
478 | rm.outlier(data, fill = FALSE, median = FALSE, opposite = FALSE)
479 | rm.outlier(data$data, fill = FALSE, median = FALSE, opposite = FALSE)
480 | summary(data)
481 | summary(data)
482 | boxplot(data) # 繪製盒鬚圖
483 | data <- subset(data, Sepal.Width < 4 & Sepal.Width > 2)
484 | boxplot(data)
485 | require(datasets)  # source package
486 | data <- iris
487 | data <- subset(data,select = - Species) # 去除不是數值型態的column
488 | boxplot(data) # 繪製盒鬚圖
489 | summary(data)
490 | require(datasets)  # source package
491 | data <- iris
492 | data <- subset(data,select = - Species) # 去除不是數值型態的column
493 | boxplot(data) # 繪製盒鬚圖
494 | summary(data)
495 | boxplot(data) # 繪製盒鬚圖
496 | boxplot(data)$out
497 | View(data)
498 | rm.outlier(data)
499 | install.packages("outliers")
500 | require(outliers)
501 | rm.outlier(data)
502 | rm.outlier(data, fill = FALSE, median = FALSE, opposite = FALSE)
503 | rm.outlier(data, fill = FALSE, median = FALSE, opposite = FALSE)
504 | boxplot(data)$out
505 | rm(boxplot(data)$out)
506 | boxplot(data)$out
507 | IQR(data)
508 | x2 <- data[!(data %in% data$out)]
509 | View(x2)
510 | View(data)
511 | boxplot(data) # 繪製盒鬚圖
512 | x2 <- data[!(data %in% data$out)]
513 | 


--------------------------------------------------------------------------------
/Source-File/分割、合併、離群值、虛擬變數/style.css:
--------------------------------------------------------------------------------
 1 | /* Whole document: */
 2 | body{
 3 |   font-family:  "Times New Roman", serif; /* generic fallback when the named font is unavailable */
 4 |   font-size: 14pt;
 5 | }
 6 | 
 7 | code.r{ /* presumably the R source chunks in the knitted HTML */
 8 |   font-size: 14pt;
 9 |   font-family:  "Consolas", monospace; /* Consolas ships with Windows only; fall back to any monospace font */
10 | }
11 | 
12 | pre{ /* preformatted blocks */
13 |   font-size: 16px;
14 |   font-family:  "Times New Roman", serif; /* generic fallback when the named font is unavailable */
15 | }


--------------------------------------------------------------------------------
/Source-File/分割、合併、離群值、虛擬變數/分割、合併、離群值、虛擬變數.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "DM---分割、合併、離群值、虛擬變數"
  3 | author: "POLab"
  4 | date: "2017/05/19"
  5 | output:
  6 |   html_document:
  7 |       css: style.css
  8 | ---
  9 | 
 10 | <a href="https://github.com/PO-LAB/Data-Mining" target="_blank">【回到首頁】</a> 
 11 |    
 12 | ------
 13 |   
 14 | 本篇內容著重在「資料預處理」(或者稱**資料清洗**)的手法上。    
 15 | 
 16 | 畢竟在資料分析的流程中，其實有60~70%的時間是在進行「資料預處理」，如果沒有好的資料，後續的分析其實就可能會有很大的偏誤。
 17 | 在「資料預處理」時，我們時常會遇到很多問題需要解決。
 18 | 當然，也有很多對應的小技巧，可以幫助我們處理這些問題。
 19 | 
 20 | 首先，因為我們之後會使用到非常多的套件，故必須先更新R的版本至<a href="http://cran.csie.ntu.edu.tw/" target="_blank">【3.4.0】</a> ，而本篇內容有**資料分割**、**資料合併**、**處理離群值(outlier)**和**轉虛擬變數(Dummy variable)**等技巧！   
 21 | 
 22 | ------
 23 | 
 24 | #**1. 資料分割**   
 25 | 
 26 | 當我們想要將一個表單切割成不同的子表單時，會使用到以下的函式:
 27 | `split()`、`subset()`，以下會詳細介紹其函式的用法。
 28 | 
 29 | 我們再拿我們熟悉的好朋友，鳶尾花資料(iris)來練習吧^_<
 30 | 
 31 | **● 使用`split()`函式進行資料分割:**
 32 | ```{r}
 33 | require(datasets)  # source package
 34 | data <- iris
 35 | ```
 36 | <img src="1.png"/>
 37 | 
 38 | ```{r}
 39 | split_data <- split(data, sample(rep(1:2, 75)))
 40 | ```
 41 | <img src="2.png"/>
 42 | 由於rep(1:2,75)產生1,2交錯的向量，但加了前面的sample則是隨機抽取，所以向量1,2會被打亂，split會依照sample(rep(1:2,75))分組，都是1的會在同一組，都是2的也會在同一組。
 43 | 
 44 | **● 使用`subset()`函式進行資料分割:**
 45 | ```{r, results='hide'}
 46 | require(datasets)  # source package
 47 | data <- iris
 48 | subset(data, Sepal.Length > 5) # 只會出現 Sepal.Length > 5 的資料
 49 | ```
 50 | <img src="3.png"/>
 51 | 
 52 | ```{r, results='hide'}
 53 | subset(data, Sepal.Length == 5,select = c("Sepal.Length","Species")) # 只會出現 Sepal.Length 等於 5 的資料，且欄位只會出現 Sepal.Length 和 Species
 54 | ```
 55 | <img src="4.png"/>
 56 | 
 57 | ```{r, results='hide'}
 58 | subset(data, Sepal.Length > 5,select = - Sepal.Length) # a negative select drops the named column from the result
 59 | ```
 60 | <img src="5.png"/>
 61 | 
 62 | ------
 63 | 
 64 | #**2. 資料合併**   
 65 | 
 66 | 當我們想要將兩筆資料合併時，會使用到以下的函式:
 67 | `rbind()`、`cbind()`、`merge()`，以下會詳細介紹其函式的用法。
 68 | 
 69 | **● 使用`rbind()`函式進行資料合併:**
 70 | `rbind()`可以用來追加資料，需要對應欄位(變數)名稱
 71 | ```{r, echo=TRUE}
 72 | # 首先先建立兩個 Data frame
 73 | ID <- c(1,2,3,4)
 74 | Name <- c("A","B","C","D")
 75 | Student1 <- data.frame(ID,Name)
 76 | 
 77 | ID <- c(5,6,7,8)
 78 | Name <- c("E","F","G","H")
 79 | Student2 <- data.frame(ID,Name)
 80 | ```
 81 | 
 82 | ```{r, echo=TRUE}
 83 | # 透過 row 合併
 84 | rbind(Student1,Student2)
 85 | ```
 86 | 
 87 | **● 使用`cbind()`函式進行資料合併:**
 88 | `cbind()`可以用來新增變數到原本的資料表單中，不需要對應欄位(變數)名稱
 89 | ```{r, echo=TRUE}
 90 | # 首先先建立兩個 Data frame
 91 | ID <- c(1,2,3,4)
 92 | Name <- c("A","B","C","D")
 93 | 
 94 | Score <- c(60,70,80,90)
 95 | Sex <- c("M","F","M","M")
 96 | 
 97 | Student1 <- data.frame(ID,Name)
 98 | Student2 <- data.frame(Score,Sex)
 99 | ```
100 | 
101 | ```{r, echo=TRUE}
102 | # 透過 column 合併
103 | cbind(Student1,Student2)
104 | ```
105 | 
106 | **● 使用`merge()`函式進行資料合併:**
107 | `merge()`能夠依據兩個表單中共同有的欄位(變數)名稱來合併資料
108 | ```{r}
109 | # 首先先建立兩個 data frame
110 | df1 <- data.frame(CustomerId = c(1:5), Product = c(rep("Toaster", 3), rep("Radio", 2)))
111 | df2 <- data.frame(CustomerId = c(2, 4, 6), State = c(rep("Alabama", 2), rep("Ohio", 1)))
112 | ```
113 | <img src="6.png" height="400px" width="300px" />
114 | 
115 | 將兩個 data frame 透過 "CustomerId" 欄位進行合併:
116 | ```{r}
117 | # Inner join，保留兩資料集 "CustomerId" 欄位中，取交集的值來合併
118 | merge(x = df1, y = df2, by = "CustomerId")
119 | ```
120 | merge 函式的第一、二個參數是指定要合併的資料表，而 by 參數則是指定資料辨識的依據欄位
121 | 
122 | ```{r}
123 | # Full join，保留兩資料集 "CustomerId" 欄位中，取聯集的值來合併
124 | merge(x = df1, y = df2, by = "CustomerId", all = TRUE)
125 | ```
126 | all 參數用來決定是否保留兩個表單中所有的資料列：設為 TRUE 時，即使某筆資料在另一個表單中找不到對應的 "CustomerId"，也會被保留，缺少的欄位則以 NA 補上
127 | 
128 | ```{r}
129 | # Left join，保留 x (df1表單) "CustomerId" 欄位中的值來合併
130 | merge(x = df1, y = df2, by = "CustomerId", all.x = TRUE)
131 | ```
132 | 
133 | ```{r}
134 | # Right join，保留 y (df2表單) "CustomerId" 欄位中的值來合併
135 | merge(x = df1, y = df2, by = "CustomerId", all.y = TRUE)
136 | ```
137 | 要注意，`merge()`僅針對兩筆具有共同變數的資料進行合併，由於`merge()`會針對by參數所指定的變數做交叉比對，因此該變數的編碼值必須是「單一獨立」且不能「重複」。例如「學號」、「身分證號」等，否則merge會出現個案增多的錯誤結果。
138 | 
139 | 同學們也可以試試看`dplyr`套件來進行資料處理，當資料量很大時，處理速度會比一般的函式還要快一些哦~
140 | 
141 | ------
142 | 
143 | #**3. 處理離群值(outlier)**   
144 | 
145 | 在探勘的流程中，資料中是否存在離群值(outlier)，可能會嚴重影響到資料分析的結果，甚至會影響到模式建立的正確性。
146 | 
147 | 因此判斷離群值的方法便相當重要，以下將介紹兩種以敘述統計為基礎的離群值判斷方法，包括**標準化分數**、**盒鬚圖**。
148 | 
149 | **● 標準化分數判斷:**
150 | 將資料轉成標準化分數或Z分數進行判斷，根據常態分配的性質，約有99.7%資料的Z分數會落在平均值的3倍標準差之內，因此Z分數大於3或小於-3的數據將視為離群值(可自訂其他數據為切割點，以下範例即以2與-2作為切割點)
151 | ```{r}
152 | require(datasets)  # source package
153 | data <- iris
154 | data <- subset(data,select = - Species) # 去除不是數值型態的column
155 | scale_data <- scale(data, center = TRUE, scale = TRUE) # 標準化表單中的數值
156 | scale_data <- as.data.frame(scale_data) # 轉成Data frame型態
157 | scale_data <- subset(scale_data, Sepal.Length < 2 & Sepal.Width < 2 & Petal.Length < 2 & Petal.Width < 2) # 留下全部欄位中，Z分數小於2的值
158 | scale_data <- subset(scale_data, Sepal.Length > -2 & Sepal.Width > -2 & Petal.Length > -2 & Petal.Width > -2) # 留下全部欄位中，Z分數大於-2的值
159 | ```
160 | 
161 | **● 盒鬚圖判斷:**
162 | Tukey(1977)將變數中任何位於內籬（inner fence）與外籬（outer fence）之間的數據視為該變數的潛在離群值；另外，如果變數中有任何數據位於外籬之外，則視它們為該變數的離群值。其中內籬指的是Q1向下延伸或Q3向上延伸1.5倍IQR的距離，外籬指的是Q1向下延伸或Q3向上延伸3倍IQR的距離
163 | ```{r}
164 | require(datasets)  # package that provides the iris data set
165 | data <- iris
166 | data <- subset(data,select = - Species) # drop the Species column so only the numeric measurement columns remain
167 | boxplot(data) # draw one box-and-whisker plot per remaining column
168 | summary(data) # print summary statistics for each column
169 | data <- subset(data, Sepal.Width < 4 & Sepal.Width > 2) # keep only rows whose Sepal.Width is strictly between 2 and 4 (both conditions must hold)
170 | boxplot(data) # redraw the box plots on the filtered rows
171 | ```
172 | 
173 | 處理離群值時，首先應考量懷疑為離群值的數據是否可以被解釋，如果可以，則可依合理的原則處理，例如資料完全不合理即可移除；但如果資料經查證後，不但無誤，而且發現該離群值是來自於非常特殊的個案，我們應該深入瞭解其數據為何如此特別，且必須深入探討決定應該刪除抑或保留該數據。
174 | 
175 | 同學們也可以試試看`outliers`套件來進行離群值判斷，實際跑看看套件中的Example，不過有些時候運用比較簡單的方法處理，說不定效果還會比較好哦~
176 | 
177 | ------
178 | 
179 | #**4. 轉虛擬變數(Dummy variable)**   
180 | 
181 | 在迴歸分析（線性、羅吉斯…等）中，當自變數為類別變數時，我們都要先進行轉換**虛擬變數(Dummy variable)**的動作，以人工變數量化類別變數，通常取值為0或1。
182 | 
183 | 以下我們使用鳶尾花資料(iris)做練習~
184 | 
185 | **●使用`dummies`套件轉換 :**
186 | ```{r, message=FALSE}
187 | require(dummies)  # 轉換虛擬變數的套件
188 | data <- iris 
189 | ```
190 | <img src="7.png"/>
191 | 
192 | ```{r}
193 | alldummy_data <- dummy.data.frame(data)
194 | ```
195 | <img src="8.png"/>
196 | 
197 | 函式會自動抓取表單中，屬於類別變數的欄位轉換成虛擬變數
198 | ```{r}
199 | justdummy_data <- dummy.data.frame(data, all = F)
200 | ```
201 | <img src="9.png"/>
202 | 
203 | **all = F**，可以只顯示出轉換後的虛擬變數欄位
204 | 
205 | **●使用`model.matrix`函式轉換 :**
206 | ```{r, message=FALSE}
207 | data <- iris
208 | justdummy_data <- model.matrix(~data$Species-1) # build a matrix holding only the dummy (0/1) columns for Species; the "-1" removes the intercept column
209 | alldummy_data <- cbind(data,justdummy_data) # append the dummy columns to the original table, column-wise
210 | ```
211 | 注意:利用**model.matrix**函式轉換的欄位資料型態要為**factor**！！
212 | 
213 | 
214 | 
215 | 
216 | 引入虛擬變數會使得原本的模型變得更複雜，但對於問題的解釋會更清楚，也就是一個方程式能達到兩個方程式的概念，而且較接近現實~
217 | 
218 | 要注意，在模型中引入多個虛擬變數時，虛擬變數的個數要遵守轉換原則：**如果在類別變數欄位中有n種互斥的屬性，則只在模型中引入（n-1）個虛擬變數。**
219 | 
220 | ------
221 | 
222 | #**總結**    
223 | 
224 | 由於未經處理的資料經常會有資料不完整、不一致或存在雜訊的問題。
225 | 
226 | 因此在資料預處理時就要將這些問題排除，畢竟往後的分析流程都是延續我們預處理完的資料，有好品質的資料才能挖掘出好品質的資訊。
227 | 
228 | 加油~
229 | 
230 | ------
231 | 


--------------------------------------------------------------------------------
/Source-File/分群分析/.Rhistory:
--------------------------------------------------------------------------------
 1 | require(markdown)  # presumably provides the rpubsUpload() used below
 2 | api_id.path <- 'rpubs_conn/api_id.txt'  # file where the RPubs document id is cached
 3 | note.title <- 'R9'
 4 | note.html <- 'R9.html'
 5 | # Update: an id is already cached, so re-publish the existing document
 6 | if (file.exists(api_id.path)){
 7 | print('Start Updating')
 8 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = FALSE)[, 1]
 9 | # update article on Rpubs
10 | update.result <- rpubsUpload(title = note.title,
11 | htmlFile = note.html,
12 | id = api.id
13 | )
14 | browseURL(update.result$continueUrl)
15 | print('update success')
16 | update.result$continueUrl
17 | # Upload: no cached id yet, so publish as a new document and store its id
18 | }else{
19 | print('Start Uploading')
20 | dir.create('rpubs_conn')
21 | # upload article on Rpubs; use note.html so both branches publish the same file
22 | upload.result <- rpubsUpload(title = note.title,
23 | htmlFile = note.html
24 | )
25 | upload.result$id
26 | write.table(upload.result$id, api_id.path, row.names = FALSE, col.names = FALSE)
27 | browseURL(upload.result$continueUrl)
28 | print('upload success')
29 | upload.result$continueUrl
30 | }
31 | 


--------------------------------------------------------------------------------
/Source-File/分群分析/.Rprofile:
--------------------------------------------------------------------------------
1 | options(rpubs.upload.method = "internal")


--------------------------------------------------------------------------------
/Source-File/分群分析/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/分群分析/1.png


--------------------------------------------------------------------------------
/Source-File/分群分析/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/分群分析/2.png


--------------------------------------------------------------------------------
/Source-File/分群分析/style.css:
--------------------------------------------------------------------------------
 1 | /* Whole document: */
 2 | body{
 3 |   font-family:  "Times New Roman", serif; /* generic fallback when the named font is unavailable */
 4 |   font-size: 14pt;
 5 | }
 6 | 
 7 | code.r{ /* presumably the R source chunks in the knitted HTML */
 8 |   font-size: 14pt;
 9 |   font-family:  "Consolas", monospace; /* Consolas ships with Windows only; fall back to any monospace font */
10 | }
11 | 
12 | pre { /* preformatted blocks */
13 |   font-size: 14.5px;
14 |   font-family:  "Consolas", monospace; /* keep fixed-width alignment even without Consolas */
15 | }


--------------------------------------------------------------------------------
/Source-File/分群分析/分群分析.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "DM---分群分析(Clustering)"
  3 | author: "POLab"
  4 | date: "2017/05/19"
  5 | output:
  6 |   html_document:
  7 |       css: style.css
  8 | ---
  9 | 
 10 | <a href="https://github.com/PO-LAB/Data-Mining" target="_blank">【回到首頁】</a> 
 11 |    
 12 | ------
 13 |   
 14 | 本篇內容，會繼續介紹一些常用的資料探勘模型：   
 15 |    
 16 | ------
 17 |    
 18 | 在分群分析中，主要可以分成兩種類型：
 19 | 
 20 | - 階層式分群(Hierarchical Clustering)：不需指定分群數目，讓資料自動由上往下/由下往上結合起來。   
 21 | 
 22 | - 切割式分群(Partitional Clustering)：需事先指定分群數目，經過不斷的迭代，直到群內的變異最小。  
 23 | 
 24 | 
 25 | ------
 26 | 
 27 | ## 1. 階層式分群(Hierarchical Clustering)   
 28 | 
 29 | 這裡使用`iris`的資料：
 30 | ```{r}
 31 | head(iris)
 32 | ```     
 33 |    
 34 | 由於分群屬於「非監督式學習」的演算法，   
 35 | 因此我們先把`iris`內的品種(Species)欄位拿掉，以剩下的資料進行分群：   
 36 |    
 37 | ```{r}
 38 | data <- iris[, -5] # 因為Species是第五欄位，故移除掉
 39 | head(data)         # 現在data只剩下前四個欄位的資料
 40 | ```
 41 | 
 42 | 在階層式分群中，主要是以資料之間的「距離」遠近，來決定兩筆資料是否接近。   
 43 | R的話，我們可以使用`dist()`，來建立資料之間的「距離矩陣」(Distance Matrix)，判斷資料之間的遠與近：   
 44 |    
 45 | ```{r, message=FALSE}
 46 | E.dist <- dist(data, method="euclidean") # 歐式距離
 47 | M.dist <- dist(data, method="manhattan") # 曼哈頓距離
 48 | ```   
 49 | (由於以上的矩陣太過龐大，這裡就不顯示出來，大家可以自行在自己的電腦上觀察！)   
 50 | 
 51 | 接下來，我們就可以根據資料間的距離，來進行階層式分群，使用的函式是`hclust()`：
 52 | 
 53 | ```{r}
 54 | par(mfrow=c(1,2)) # 讓圖片以1x2的方式呈現，詳情請見(4)繪圖-資料視覺化
 55 | 
 56 | # 使用歐式距離進行分群
 57 | h.E.cluster <- hclust(E.dist)
 58 | plot(h.E.cluster, xlab="歐式距離")
 59 | 
 60 | # 使用曼哈頓距離進行分群
 61 | h.M.cluster <- hclust(M.dist) 
 62 | plot(h.M.cluster, xlab="曼哈頓距離")
 63 | ```   
 64 |       
 65 | ------
 66 |    
 67 |    
 68 | 當我們有了「距離矩陣」後，要如何把資料結合起來，不同的方法也會產生不同的效果。   
 69 |    
 70 | 一般來說，主要有以下五種方法：   
 71 |    
 72 | <img src="1.png">   
 73 | 
 74 | 而我們可以在`hclust()`裡面調整參數`method`，選擇不同的方法：   
 75 | 
 76 | ```{r, results='hide'}
 77 | hclust(E.dist, method="single")   # 最近法
 78 | hclust(E.dist, method="complete") # 最遠法
 79 | hclust(E.dist, method="average")  # 平均法
 80 | hclust(E.dist, method="centroid") # 中心法
 81 | hclust(E.dist, method="ward.D2")  # 華德法
 82 | ```   
 83 | 
 84 | 那麼，在這個例子中，我們就用**歐式距離**搭配**華德法**，來進行階層式分群：   
 85 | 
 86 | ```{r}
 87 | E.dist <- dist(data, method="euclidean")      # 歐式距離
 88 | h.cluster <- hclust(E.dist, method="ward.D2") # 華德法
 89 | 
 90 | # 視覺化
 91 | plot(h.cluster)
 92 | abline(h=9, col="red")
 93 | ```   
 94 | 
 95 | 由上圖，可以觀察最佳的分群數目是3個，   
 96 | 因此我們可以利用`cutree()`，讓整個階層的結構縮減，變成分成三群的狀態：   
 97 | ```{r}
 98 | cut.h.cluster <- cutree(h.cluster, k=3)  # 分成三群
 99 | cut.h.cluster                            # 分群結果
100 | table(cut.h.cluster, iris$Species)       # 分群結果和實際結果比較
101 | ```
102 |    
103 | 看起來，這次分群很成功地把**setosa**分到第一群；**versicolor**分到第二群；   
104 | 不過，**virginica**似乎遇到了點小麻煩？   
105 | 
106 | 讓我們回去看原始資料的分佈情況吧：   
107 | ```{r, message=FALSE, echo=FALSE}
108 |   require(ggplot2)
109 |   ggplot(data=iris) +                        # 準備畫布
110 |     geom_point(aes(x=Petal.Length,           # 散布圖
111 |                    y=Petal.Width,
112 |                    color=Species)) +         # 把不同品種的資料標上顏色
113 |     
114 |     theme_bw()                               # 改變主題背景成白色
115 | ```
116 | 
117 | 果然，圖中的右上角，有一些**virginica(藍色)**和**versicolor(綠色)**靠得十分近。
118 |    
119 | 因此他們被分到第二群也是很合理的事情！   
120 | 
121 | ------
122 | 
123 | ## 2. 切割式分群(Partitional Clustering)   
124 | 
125 | 在切割式分群裡，最常見就是**K-Cluster**的方法，並且根據分群條件的不同，可以分成：
126 | 
127 | ###1. K-Means   
128 | 使用函式是`kmeans()`：
129 | 
130 | ```{r, message=FALSE}
131 | # Partition the observations into three clusters with k-means
132 | kmeans.cluster <- kmeans(data, centers=3) 
133 | 
134 | # Within-cluster sum of squares, one value per cluster
135 | kmeans.cluster$withinss
136 | 
137 | # Cross-tabulate the assigned cluster against the true species
138 | table(kmeans.cluster$cluster, iris$Species)  
139 | 
140 | # Visualise the k-means result (fviz_cluster draws with ggplot2)
141 | require(factoextra)
142 | fviz_cluster(kmeans.cluster,           # clustering result
143 |              data = data,              # data the clusters were fitted on
144 |              geom = c("point","text"), # draw points and their labels
145 |              ellipse.type = "norm")    # normal-confidence ellipse; replaces the deprecated frame.type
146 | 
147 | ```   
148 | 
149 | ### 2. K-Medoid      
150 | 使用函式是`pam()`，在`cluster`這個套件裡面：
151 | ```{r, message=FALSE}
152 | require(cluster)
153 | 
154 | # pam = Partitioning Around Medoids (`data` is defined outside this chunk)
155 | kmedoid.cluster <- pam(data, k=3) 
156 | 
157 | # Value of the pam objective function after the build and swap phases (not a variance)
158 | kmedoid.cluster$objective
159 | 
160 | # Cross-tabulate the cluster assignments against the true species
161 | table(kmedoid.cluster$clustering, iris$Species) 
162 | 
163 | # Visualise the k-medoid result (fviz_cluster produces a ggplot2-based plot)
164 | require(factoextra)
165 | fviz_cluster(kmedoid.cluster,       # clustering result
166 |              data = data,           # data the clustering was computed on
167 |              geom = c("point"),     # draw points only
168 |              ellipse.type = "norm") # normal-distribution ellipse; replaces the deprecated `frame.type`
169 | ```
170 | 
171 | ------
172 | 
173 | ## 3. 分群的最佳數目(Optimal number of clusters)      
174 |    
175 |    
176 | 如今，你已經學會**階層式分群**和**切割式分群**的R語言要怎麼寫了！   
177 | 
178 | 不過，在進行分群時，往往會遇到一個很重要的問題，那就是：**最佳的分群數目為何？**   
179 | 
180 |    
181 | ------
182 |    
183 | 
184 | ### (1)**Elbow Method**    
185 | 
186 | 要解決這個問題，我們先回顧一下分群的目的，就是「使群內的總變異最小；使群間的總變異最大」，是吧？   
187 | 
188 | 不過要注意，分群數越多，群內的總變異(SSE)本來就會越小(極端情況下每筆資料自成一群，SSE = 0)，所以不能單純挑SSE最小的n。換句話說，我們要找的是一個數字n，使得分群數超過n之後，SSE下降的幅度明顯趨緩(也就是曲線上像「手肘」的轉折點)，那麼n = 最佳的分群數目(optimal number for clusters)！   
189 | 
190 | 這樣的方法，就被稱為**Elbow Method**！   
191 | 
192 | 在`factoextra`的套件裡，已經幫我們寫好函式`fviz_nbclust()`，可以讓我們實踐**Elbow Method**。   
193 | 
194 | 函式`fviz_nbclust()`，是基於`ggplot2`的語法，將**Elbow Method**的結果視覺化，   
195 | 概念和主成份分析中的陡坡圖(scree plot)幾乎一模一樣，相信大家會感覺相當熟悉！      
196 | 
197 | ```{r, message=FALSE}
198 | require(factoextra)
199 | 
200 | # Elbow Method 應用在 階層式分析
201 | # 注意：這裡使用的是hcut()，屬於factoextra套件，並非上面提的hclust()
202 | fviz_nbclust(data, 
203 |              FUNcluster = hcut,  # hierarchical clustering
204 |              method = "wss",     # total within sum of square
205 |              k.max = 12          # max number of clusters to consider
206 |              ) + 
207 |     
208 | labs(title="Elbow Method for HC") +
209 |     
210 | geom_vline(xintercept = 3,       # 在 X=3的地方 
211 |            linetype = 2)         # 畫一條虛線
212 | 
213 | # Elbow Method 應用在 K-Means
214 | fviz_nbclust(data, 
215 |              FUNcluster = kmeans,# K-Means
216 |              method = "wss",     # total within sum of square
217 |              k.max = 12          # max number of clusters to consider
218 |              ) +
219 |     
220 | labs(title="Elbow Method for K-Means") +
221 |     
222 | geom_vline(xintercept = 3,        # 在 X=3的地方 
223 |            linetype = 2)          # 畫一條垂直虛線
224 | 
225 | # Elbow Method 應用在 K-Medoid
226 | fviz_nbclust(data, 
227 |              FUNcluster = pam,   # K-Medoid
228 |              method = "wss",     # total within sum of square
229 |              k.max = 12          # max number of clusters to consider
230 |              ) +
231 |     
232 | labs(title="Elbow Method for K-Medoid") +
233 |     
234 | geom_vline(xintercept = 3,       # 在 X=3的地方 
235 |            linetype = 2)         # 畫一條垂直虛線
236 |     
237 | 
238 | ```   
239 | 
240 |    
241 | ------
242 |    
243 | 
244 | ### (2)**Average Silhouette Method**    
245 | 
246 | 除了計算SSE以外，另一個衡量分群效果的方法，稱為平均側影法(Average silhouette Method)。  
247 | 
248 | 側影係數(Silhouette Coefficient)會根據每個資料點(i)的內聚力和分散力，衡量分群的效果(quality)。   
249 | 
250 | 
251 | 公式如下：   
252 | 
253 | <img src="2.png">    
254 | 
255 | 其中：   
256 | 
257 | - a(i) = 資料點(i)，它與群內其他資料點的平均距離   
258 | 
259 | - b(i) = 資料點(i)，它與「其他各群」內所有資料點的平均距離(每一群各算一個平均值)，取其中的最小值   
260 | 
261 | - s(i) = 側影係數，可以視為資料點(i)，在它所屬的群內是否適當的指標      
262 | 
263 | 我們便利用這個方法，取每一個資料點的側影平均值(故稱Avg. Silhouette Method)，當作衡量最佳分群數目的準則！   
264 | 
265 | 
266 | 在R裡面，寫法和**Elbow Method**完全一模一樣，差別只在於參數`method="silhouette"`而已：   
267 | 
268 | (以下只舉K-Means為例)   
269 |    
270 | ```{r, message=FALSE}
271 | require(factoextra)
272 | 
273 | # Avg. Silhouette 應用在 K-Means
274 | fviz_nbclust(data, 
275 |              FUNcluster = kmeans,   # K-Means
276 |              method = "silhouette", # Avg. Silhouette
277 |              k.max = 12             # max number of clusters
278 |              ) +
279 |     
280 | labs(title="Avg.Silhouette Method for K-Means") 
281 | 
282 | ```
283 | 
284 | 提問：為什麼這個方法建議分2群呢？(提示：和原始資料的分布型態有關。)   
285 | 
286 |    
287 | ------
288 |    
289 | 
290 | ### (3)**SOM()**    
291 | 
292 | ```{r}
293 | require(SOMbrero)
294 | 
295 | 
296 | ```
297 | 
298 | ------   
299 | 
300 | # **總結**  
301 | 
302 | 分群(Clustering)屬於非監督式學習，主要根據資料本身的特性，來進行資料分析的一種方法。   
303 | 
304 | 實務上，當我們對資料還沒有深入了解時，便可以先使用分群方法，觀察潛藏在資料中的特性，再擬定後續分析的手法。   
305 | 
306 | 在使用分群方法時，不同的分群數目，往往會對最後的結果有巨大的影響。   
307 | 因此找到**最佳的分群數目**，是很重要的課題！   
308 | 
309 | It's still a long way to go~   
310 | 
311 | 
312 | 
313 | ------   
314 | 
315 | 
316 | # **Reference**  
317 | 本篇筆記參考<a href="http://www.sthda.com/english/wiki/determining-the-optimal-number-of-clusters-3-must-known-methods-unsupervised-machine-learning#three-popular-methods-for-determining-the-optimal-number-of-clusters" target="_blank">Determining the optimal number of clusters: 3 must known methods - Unsupervised Machine Learning </a>、
318 | 以及<a href="https://cran.r-project.org/web/packages/SOMbrero/vignettes/doc-numericSOM.html" target="_blank">Using Self-Organizing Maps with SOMbrero to cluster a numeric dataset</a>製作而成。 
319 | 


--------------------------------------------------------------------------------
/Source-File/基本資料型態/.Rhistory:
--------------------------------------------------------------------------------
 1 | require(markdown)  # NOTE(review): rpubsUpload() below is presumably provided by this package -- confirm
 2 | api_id.path <- 'rpubs_conn/api_id.txt'  # file holding the document id written after the first upload
 3 | note.title <- 'R2'
 4 | note.html <- 'R2.html'
 5 | # Update: the id file already exists, so re-publish under the stored id
 6 | if (file.exists(api_id.path)){
 7 | print('Start Updating')
 8 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = FALSE)[, 1]
 9 | # update article on Rpubs
10 | update.result <- rpubsUpload(title = note.title,
11 | htmlFile = note.html,
12 | id = api.id
13 | )
14 | browseURL(update.result$continueUrl)
15 | print('update success')
16 | update.result$continueUrl
17 | # Upload: no id file yet, so publish as a new document and store its id
18 | }else{
19 | print('Start Uploading')
20 | dir.create(dirname(api_id.path), showWarnings = FALSE)
21 | # upload article on Rpubs (note.html instead of a hard-coded 'R1.html', so the file matches note.title)
22 | upload.result <- rpubsUpload(title = note.title,
23 | htmlFile = note.html
24 | )
25 | upload.result$id
26 | write.table(upload.result$id, api_id.path, row.names = FALSE, col.names = FALSE)
27 | browseURL(upload.result$continueUrl)
28 | print('upload success')
29 | upload.result$continueUrl
30 | }
31 | require(markdown)  # NOTE(review): rpubsUpload() below is presumably provided by this package -- confirm
32 | api_id.path <- 'rpubs_conn/api_id.txt'  # file holding the document id written after the first upload
33 | note.title <- 'R2'
34 | note.html <- 'R2.html'
35 | # Update: the id file already exists, so re-publish under the stored id
36 | if (file.exists(api_id.path)){
37 | print('Start Updating')
38 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = FALSE)[, 1]
39 | # update article on Rpubs
40 | update.result <- rpubsUpload(title = note.title,
41 | htmlFile = note.html,
42 | id = api.id
43 | )
44 | browseURL(update.result$continueUrl)
45 | print('update success')
46 | update.result$continueUrl
47 | # Upload: no id file yet, so publish as a new document and store its id
48 | }else{
49 | print('Start Uploading')
50 | dir.create(dirname(api_id.path), showWarnings = FALSE)
51 | # upload article on Rpubs (note.html instead of a hard-coded 'R1.html', so the file matches note.title)
52 | upload.result <- rpubsUpload(title = note.title,
53 | htmlFile = note.html
54 | )
55 | upload.result$id
56 | write.table(upload.result$id, api_id.path, row.names = FALSE, col.names = FALSE)
57 | browseURL(upload.result$continueUrl)
58 | print('upload success')
59 | upload.result$continueUrl
60 | }
61 | 


--------------------------------------------------------------------------------
/Source-File/基本資料型態/.Rprofile:
--------------------------------------------------------------------------------
1 | options(rpubs.upload.method = "internal")  # sets the `rpubs.upload.method` option; presumably selects the transport used when publishing to RPubs -- TODO confirm


--------------------------------------------------------------------------------
/Source-File/基本資料型態/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/基本資料型態/1.png


--------------------------------------------------------------------------------
/Source-File/基本資料型態/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/基本資料型態/2.png


--------------------------------------------------------------------------------
/Source-File/基本資料型態/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/基本資料型態/3.png


--------------------------------------------------------------------------------
/Source-File/基本資料型態/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/基本資料型態/4.png


--------------------------------------------------------------------------------
/Source-File/基本資料型態/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/基本資料型態/5.png


--------------------------------------------------------------------------------
/Source-File/基本資料型態/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/基本資料型態/6.png


--------------------------------------------------------------------------------
/Source-File/基本資料型態/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/基本資料型態/7.png


--------------------------------------------------------------------------------
/Source-File/基本資料型態/Thumbs.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/基本資料型態/Thumbs.db


--------------------------------------------------------------------------------
/Source-File/基本資料型態/style.css:
--------------------------------------------------------------------------------
 1 | /* Whole document: serif body text, with a generic fallback when Times New Roman is missing */
 2 | body{
 3 |   font-family:  "Times New Roman", Times, serif;
 4 |   font-size: 14pt;
 5 | }
 6 | 
 7 | code.r{ /* elements matching code.r; falls back to any monospace font when Consolas is missing */
 8 |   font-size: 14pt;
 9 |   font-family:  "Consolas", monospace;
10 | }
11 | 
12 | pre { /* preformatted blocks; same monospace fallback as code.r */
13 |   font-size: 14.5px;
14 |   font-family:  "Consolas", monospace;
15 | }


--------------------------------------------------------------------------------
/Source-File/基本資料型態/基本資料型態.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "DM---基本資料型態"
  3 | author: "POLab"
  4 | date: "2017/05/19"
  5 | output:
  6 |   html_document:
  7 |       css: style.css
  8 | ---
  9 | 
 10 | <a href="https://github.com/PO-LAB/Data-Mining" target="_blank">【回到首頁】</a> 
 11 |  
 12 | ------
 13 | 
 14 | 要熟悉一個程式語言，第一步就是先了解定義在裡面的資料型態。而在R語言的資料型態，常用的有這些：
 15 | 
 16 | * `integer`  
 17 | * `number`
 18 | * `logic`
 19 | * `character`
 20 | * `factor`
 21 | * `vector`
 22 | * `list`
 23 | * `matrix`
 24 | * `data frame`
 25 | 
 26 | 看起來很多感覺很複雜，但不用擔心，接下來會一邊用例子、一邊介紹這些型態。
 27 | 
 28 | ------
 29 | 
 30 | #**1. integer，number，logic，character**
 31 | 這四個型態比較簡單：   
 32 | 
 33 | * 「整數」的`integer`
 34 | * 「實數」的`number`
 35 | * 「布林代數」的`logic`
 36 | * 「字串」的`character`
 37 | 
 38 | ##‧數字 (integer, number)  
 39 | 
 40 | 若要在R裡宣告一個數字，可以這麼做：
 41 | ```{r}
 42 | a <- 3     
 43 | b <- 1.6   
 44 | ```
 45 | 
 46 | 在這裡，我們稱`a`和`b`為「變數」(或是物件)。   
 47 | 而`<-`符號，會把右邊的東西，儲存到左邊的名字裡，使左邊的名字變成「變數」(如下圖)   
 48 | <img src="1.png" />   
 49 | 
 50 | 當賦值結束後，我們會需要確認變數的資料型態，這時候可以使用`str()`函式：
 51 | ```{r}
 52 | str(a)
 53 | ```
 54 | 
 55 | 上面結果顯示`num`，代表`a`的資料型態是「number」(在R裡的正式名稱是`numeric`)，而非`integer`。有點奇怪，對吧？   
 56 | 這是因為R預設的數字型態是「number」，意思是，若直接把數字存到變數中，其資料型態是「number」。
 57 | 那麼，如果想要轉換資料型態，變成`integer`呢？這時候就可以使用`as.integer()`函式：
 58 | ```{r}
 59 | a <- as.integer(3)
 60 | str(a)
 61 | ```
 62 | 
 63 | 如果，想要轉換成`number`的資料型態呢？
 64 | 應該猜到了，沒錯，就是使用`as.numeric()`的函式。**(不是`number`，而是`numeric`哦！)**   
 65 | 更棒的是，這樣的概念可以「舉一反三」！下表就是各種資料型態，所對應的轉換函式： 
 66 | 
 67 | 
 68 | | 資料型態   |      轉換函式      |
 69 | |:----------:|:------------------:| 
 70 | | integer    |  as.integer()      | 
 71 | | number     |  as.numeric()      | 
 72 | | character  |  as.character()    |    
 73 | | factor     |  as.factor()       |  
 74 | | matrix     |  as.matrix()       |  
 75 | | vector     |  as.vector()       |  
 76 | | list       |  as.list()         |  
 77 | | data frame |  as.data.frame()   |
 78 | 
 79 | 有時候，我們要確認變數的資料型態，是否為`integer`，這時可以使用`is.integer()`函式：
 80 | ```{r, results='hold'}
 81 | is.integer(a)
 82 | is.integer(b)
 83 | ```
 84 | `str()`的功能是「顯示資料型態」，而`is.integer()`則是「判斷是否為integer，回傳TRUE/FALSE」。   
 85 | 當然，這時聰明的人就會思考，是不是也可以舉一反三呢？   
 86 | 答案是，可以的：
 87 | 
 88 | | 資料型態   |      判斷函式      |
 89 | |:----------:|:------------------:| 
 90 | | integer    |  is.integer()      | 
 91 | | number     |  is.numeric()      | 
 92 | | character  |  is.character()    |    
 93 | | factor     |  is.factor()       |  
 94 | | matrix     |  is.matrix()       |  
 95 | | vector     |  is.vector()       |  
 96 | | list       |  is.list()         |  
 97 | | data frame |  is.data.frame()   |
 98 | 
 99 | 
100 | ##‧布林代數 (logic)  
101 | 
102 | 所謂的布林代數(boolean)，代表的是「真」和「假」兩種值(在R裡必須寫成全大寫的`TRUE`和`FALSE`)，常用於邏輯式上的判斷。而在R裡面，以`logic`的資料型態來表示(正式名稱是`logical`)：
103 | ```{r, results='hold'}
104 | a <- TRUE
105 | b <- FALSE
106 | str(a)        #確認a的型態
107 | is.integer(b) #判斷b是不是整數，但b是logic型態，所以回傳FALSE
108 | ```   
109 | 
110 | ##‧字串 (character)  
111 | 要定義一個`character`變數的話，注意放在右邊的值，要用雙引號(`" "`)括起來：
112 | ```{r}
113 | professor <- "Dr.Lee"
114 | ```   
115 |    
116 | 如果沒有括起來，右邊就會視為「變數」。若沒有事先定義的話，就會跳出錯誤訊息：
117 | ```{r,error=TRUE}
118 | professor <- Dr.Lee
119 | ```   
120 |    
121 | 換句話說，只要事先定義好變數，那麼變數之間，就可以互相傳遞自己儲存的值：
122 | ```{r}
123 | a <- "Dr.Lee"    # a的資料型態是character
124 | professor <- a   # a是變數，把a存到professor裡
125 | str(professor)   # 確認professor資料型態
126 | ```   
127 | 
128 | ------
129 |    
130 | #**2. vector，factor，list**   
131 | ##‧陣列 (vector)
132 | R裡面有一個十分重要的資料型態，那就是vector(陣列/向量)！   
133 | 我們在資料分析時，常常是針對一張表(table/sheet)進行處理。有時候會需要處理「某一列」(row)，或「某一行」(column)，而這樣長長的一個row/column，在R裡就被儲存成vector的形式。   
134 | 要定義一個vector，需要使用`c()`的函式：
135 | ```{r, results='hold'}
136 | a <- c(5,10,15,20,25)           # 建立一個number vector
137 | b <- c("Tom", "Henry", "John")  # 建立一個character vector
138 | a
139 | b
140 | ```
141 | 
142 | 在上面，定義一個vector變數叫做a，裡面的數字(5, 10, 15, 20, 25)就被稱為元素(element)。   
143 | `vector`的概念其實不難懂，就把它想像成一條長長的火車，每個車廂都存放一個貨品(element)。   
144 | 既然是車廂，那就一定有車廂編號，這在`vector`裡面稱為**index**，表示方式為`object[index]`。(如下圖)   
145 | <img src="2.png" />   
146 | (小練習)利用**index**，取出特定的element吧：   
147 | ```{r, results='hide'}
148 | a[3]      # Ans: 15           (取第3個element)
149 | a[1:3]    # Ans: 5 10 15      (取第1~第3個element)
150 | a[c(2,4)] # Ans: 10 20        (取第2和第4個element)
151 | ```
152 | 
153 | 在vector裡有一個需要注意的規則：「每一個element」都會是相同的型態！
154 | ```{r}
155 | a <- c(1, "john", 3) # 若是把number和character同時放入vector裡，
156 | a                    # R會自動將所有element的型態，轉變成character 
157 | 
158 | b <- c(T, 3, F)      # logic和number在vector裡的話
159 | b                    # T和F會被自動轉換成1和0，變成數字的vector
160 | ```
161 | 
162 | 當然，vector之間也可以進行數學運算：
163 | ```{r, results='hold'}
164 | a <- c(7,8,6,9,5) # 建立一個number vector
165 | b <- c(2,4,6,0,1) # 建立一個number vector
166 | 
167 | a * b             # a和b的第一個element相乘，第二個element相乘......
168 | b^3               # 對b之中的每一個element三次方
169 | b > 3             # 判斷b之中的哪些值大於 3 ，然後回傳 TRUE/FALSE
170 | ```
171 | 
172 | ##‧類別變數 (factor)  
173 | `factor`的型態，主要用來表示「類別變數」(category variable)。   
174 | (例如：性別(男、女)，年級(小一、小二....碩一、碩二)，地區(北、中、南、東)...等等。)  
175 | 在用R進行資料分析時，當遇到這類「類別變數」時，要轉換成`factor`的資料型態，再丟入模型(model)進行分析。   
176 | 
177 | `factor`的資料型態和`vector`很相似，差別在於`factor`具有額外的類別屬性(Levels)。   
178 | 要建立factor的變數，可以使用`factor()`函式：
179 | ```{r}
180 | 
181 | gender <- c("boy", "girl", "boy", "boy", "girl")  # 建立一個character vector
182 | gender <- factor(gender)   # 轉換成factor型態
183 | gender                     # Levels的屬性代表: 在這個變數裡面，存在哪些類別
184 | ```
185 | 
186 | 如果有一組類別變數，要查看裡面存在著哪些類別，可以用`levels()`函式：
187 | ```{r}
188 | levels(gender)
189 | ```
190 | 
191 | ##‧列表(list)   
192 | 還記得vector裡面，每一個element要相同型態的規定嗎？   
193 | 但有時候，我們會想要把不同資料型態的值，放到同一個變數裡面，那這時又該怎麼辦？   
194 | 在R裡面，有一種資料型態能解決這樣的問題，那就是列表(`list`)。   
195 | 舉個例子：一個人身上其實具備了許多資訊(性別，年齡，嗜好...)，可是這些資訊的型態不盡相同。因此，若我們想要儲存「一個人的所有資訊」，那就可以用`list`的資料型態(使用`list()`函式)：
196 | 
197 | ```{r}
198 | Dr.Lee <- list(gender="man", age=18, hobby=c("tease", "be teased"))
199 | Dr.Lee        # list長什麼模樣呢～
200 | str(Dr.Lee)   # 看一下list裡面的資訊～
201 | ```
202 | 
203 | 你可能已經注意到hobby這個變數。沒錯，list可以存放「任何型態」的變數，自然也包括vector。(當然也包括list)   
204 | 既然可以儲存資料，那也要可以把資料取出來才對！   
205 | R裡面有一個神奇的符號`$`，用來取出特定的資料：
206 | 
207 | ```{r, results='hold'}
208 | Dr.Lee$hobby        # Dr.Lee的嗜好
209 | Dr.Lee$age          # Dr.Lee的年紀
210 | ```
211 | 
212 | 
213 | ```{r}
214 | Dr.Lee[[3]]         # 當然，也可以用index的方式
215 | Dr.Lee[3]           # 然而，上面那行有兩個中括號，這行卻只有一個，看看結果有什麼不同吧！
216 | ```
217 | (小練習)試試看用`str()`，比較看看在list中，用"一個中括號"和"兩個中括號"，取出來的資料型態有什麼差別吧！
218 | ```{r, results='hide'}
219 | str(Dr.Lee[[3]] )     # Ans:使用兩個中括號，取出來的資料是vector
220 | str(Dr.Lee[3] )       # Ans:使用一個中括號，取出來的資料是list
221 | ```   
222 | 
223 | ------
224 | 
225 | #**3. matrix, data frame**   
226 | 之所以會把`matrix`和`data frame`放在一起介紹，是因為它們有很多觀念都很相似，使用的手法也幾乎雷同。   
227 | 
228 | ##‧矩陣(matrix)   
229 | R裡面定義矩陣的方式很簡單，就是用`matrix()`函式：
230 | ```{r}
231 | a <- matrix(c(1:6), nrow=3, ncol=2) #建立一個3x2的矩陣，依照column分別填入1~6的值
232 | a
233 | b <- matrix(c(3:8), nrow=2, ncol=3) #建立一個2x3的矩陣，依照column分別填入3~8的值
234 | b
235 | 
236 | ```
237 | 
238 | 如果我要查看a裡面，(2,2)所對應的值是什麼，可以這麼做：
239 | ```{r}
240 | a[2,2]
241 | ```
242 | 
243 | 當然，也可以查看b的「第一列」裡所有值：
244 | ```{r}
245 | b[1, ]    # 欄"空白"
246 | ```
247 | 
248 | 矩陣相乘的符號是`%*%`：
249 | ```{r}
250 | a %*% b #矩陣相乘
251 | ```
252 | 
253 | 和矩陣相關的運算函式：
254 | 
255 | * t(x)：將矩陣轉置。
256 | * %*%：矩陣相乘。
257 | * diag()：產生一個對角矩陣，或回傳矩陣的對角線向量
258 | * det()：計算矩陣行列式值，矩陣一定要是方陣(n×n)。
259 | * solve()：傳回矩陣的反矩陣，非常適合解線性方程式。
260 | * eigen()：計算矩陣的特徵向量與特徵值。
261 | 
262 | (小練習)試試看這些函式的作用：
263 | ```{r, results='hide'}
264 | # Build a 3x2 matrix filled with 6 values drawn at random from 1..100
265 | a <- matrix(sample(1:100, size=6), nrow=3, ncol=2) 
266 | # Build a 2x3 matrix filled with 6 values drawn at random from 1..100
267 | b <- matrix(sample(1:100, size=6), nrow=2, ncol=3) 
268 | # Build a 4x4 square matrix filled with 16 values drawn at random from 1..100
269 | c <- matrix(sample(1:100, size=16), nrow=4, ncol=4)
270 | 
271 | t(a)       # transpose
272 | diag(b)    # b is already a matrix, so this extracts its diagonal elements (it does not build a diagonal matrix)
273 | det(c)     # determinant (c is square: 4x4)
274 | solve(c)   # inverse matrix
275 | ```
276 | 
277 | ##‧資料框(data frame)   
278 | ```{r, echo=FALSE}
279 | Payment_and_value_of_care_._Hospital <- read.csv("Payment_and_value_of_care_-_Hospital.csv")
280 | ```
281 | 毫無疑問的，這是R語言裡最特別的功能之一：data frame！   
282 | 平常我們在分析資料，多半是使用Excel，打開裡面的Sheet，然後進行統計分析或計算。   
283 | 如果說，Excel對應的東西是Sheet，那R所對應的就是data frame。   
284 | 以下直接拿一個[資料](https://data.medicare.gov/api/views/c7us-v4mf/rows.csv?accessType=DOWNLOAD)進行介紹。   
285 | 
286 | 下載資料後，用Excel打開就會像這樣：   
287 | <img src="3.png" />   
288 |    
289 | 現在來把這個csv檔，匯入到R裡面，變成一個data frame：   
290 | 首先，找到右邊的「Import Dataset」，選取「From Text File」
291 | <img src="4.png" />   
292 |    
293 | 去找剛剛下載的csv檔，選取之後會跳出這個畫面，點「Import」   
294 | <img src="5.png" />   
295 |    
296 | 這樣就成功把資料匯入到R裡面，狀態就會像下圖一樣：
297 | <img src="6.png" />   
298 |    
299 | 讓我們用`str()`查看這個資料，會發現資料型態為**data frame**，以及詳列出每一個變數的資訊：
300 | ```{r}
301 | str(Payment_and_value_of_care_._Hospital)
302 | ```
303 | 
304 | 又或者，可以利用`data.frame()`函式，創造自己的data frame：
305 | ```{r}
306 | tmp <- data.frame(Student_ID=c(1,2,3,4,5),
307 |                   name=c("Helen", "Lun", "Leon", "Kevin", "Tommy"),
308 |                   score=c(80,36, 88.9, 97.5, 60))
309 | tmp        # data frame的型態
310 | ```
311 | 
312 | data frame裡有一些好用的手法，可以有效地協助我們達成目的。   
313 | 首先，先來一段程式碼：
314 | ```{r}
315 | tmp[4,3]
316 | ```
317 | 這裡所代表的意思是：   
318 | <img src="7.png" />   
319 | 有沒有和矩陣很像？   
320 |    
321 | 因此，如果我們要看第一個人的所有資料(第一列)：
322 | ```{r}
323 | tmp[1, ]   # 在欄的地方"空白"
324 | ```
325 | 
326 | 又或者，我們要看所有人的分數(第三欄)：
327 | ```{r}
328 | tmp[, 3]  # 在列的地方"空白"
329 | ```
330 | 
331 | 除此之外，還記得R裡面有個神奇符號`$`嗎？   
332 | 在data frame裡面，`$`可以用來指定「欄位名稱」，擷取該欄位的所有值：
333 | ```{r}
334 | tmp$name     #查看所有人的名字
335 | ```
336 | 
337 | 但有時候我們會想要特定的資訊(例如"Leon"這個人的資訊)，那這時可以這麼做：
338 | ```{r}
339 | tmp[tmp$name == "Leon", ]
340 | ```
341 | 以上包含了兩個步驟：我們先看`tmp$name == "Leon"`的結果：
342 | ```{r}
343 | tmp$name == "Leon"
344 | ```
345 | `tmp$name == "Leon"`這行指令，表示會去判斷所有人的名字(tmp$name)裡，是否有符合"Leon"的資料。如果有的話，回傳TRUE，不然就是FALSE。
346 | 
347 | 第二步，把這行條件式放到data frame的「列」中，是TRUE的「那一列資料」就會顯示出來。   
348 | 換句話說，原本的這行程式碼`tmp[tmp$name == "Leon", ]`，其實同等於下面這行程式碼：
349 | ```{r}
350 | tmp[c(FALSE, FALSE, TRUE, FALSE, FALSE), ]
351 | ```
352 | 
353 | ------
354 | 
355 | #**總結**
356 | 在R裡面，比較嶄新而且實用的資料型態，就是`vector`，`list`以及`data frame`。以經驗來說，在資料處理/分析的過程中，若是能好好運用這三種資料型態，可以達到事半功倍的效果。   
357 | 除此之外，這篇文章有提到一些實用的函式，接下來會介紹更多常見的函式，包括：<a href="http://www.rpubs.com/skydome20/R-Note3-function_and_package" target="_blank">**常見的函式、如何去下載想要的函式，如何去理解函式的狀態...等等**</a>。   
358 | It's still a long way to go~
359 | 
360 | ------
361 | 
362 | #**(額外)關於賦值**
363 | 
364 | 雖然在R中進行賦值時，除了使用符號`<-`以外，也可以使用`=`。   
365 | 但是在<a href="http://stat.ethz.ch/R-manual/R-patched/library/base/html/assignOps.html" target="_blank">**R的官方文件**</a>，又或者是<a href="https://google.github.io/styleguide/Rguide.xml" target="_blank">**Google's R Style Guide**</a>中，都強調最好不要使用`=`，因為有時候會造成<a href="http://stackoverflow.com/questions/1741820/assignment-operators-in-r-and" target="_blank">**問題**</a>。   
366 | 有興趣的話，可以參考以上連結，更進一步的了解。
367 | 
368 | 
369 | 


--------------------------------------------------------------------------------
/Source-File/套件與函式/.Rhistory:
--------------------------------------------------------------------------------
 1 | require(markdown)  # NOTE(review): rpubsUpload() below is presumably provided by this package -- confirm
 2 | api_id.path <- 'rpubs_conn/api_id.txt'  # file holding the document id written after the first upload
 3 | note.title <- 'R3'
 4 | note.html <- 'R3.html'
 5 | # Update: the id file already exists, so re-publish under the stored id
 6 | if (file.exists(api_id.path)){
 7 | print('Start Updating')
 8 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = FALSE)[, 1]
 9 | # update article on Rpubs
10 | update.result <- rpubsUpload(title = note.title,
11 | htmlFile = note.html,
12 | id = api.id
13 | )
14 | browseURL(update.result$continueUrl)
15 | print('update success')
16 | update.result$continueUrl
17 | # Upload: no id file yet, so publish as a new document and store its id
18 | }else{
19 | print('Start Uploading')
20 | dir.create(dirname(api_id.path), showWarnings = FALSE)
21 | # upload article on Rpubs (note.html instead of a hard-coded 'R1.html', so the file matches note.title)
22 | upload.result <- rpubsUpload(title = note.title,
23 | htmlFile = note.html
24 | )
25 | upload.result$id
26 | write.table(upload.result$id, api_id.path, row.names = FALSE, col.names = FALSE)
27 | browseURL(upload.result$continueUrl)
28 | print('upload success')
29 | upload.result$continueUrl
30 | }
31 | require(markdown)  # NOTE(review): rpubsUpload() below is presumably provided by this package -- confirm
32 | api_id.path <- 'rpubs_conn/api_id.txt'  # file holding the document id written after the first upload
33 | note.title <- 'R4'
34 | note.html <- 'R4.html'
35 | # Update: the id file already exists, so re-publish under the stored id
36 | if (file.exists(api_id.path)){
37 | print('Start Updating')
38 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = FALSE)[, 1]
39 | # update article on Rpubs
40 | update.result <- rpubsUpload(title = note.title,
41 | htmlFile = note.html,
42 | id = api.id
43 | )
44 | browseURL(update.result$continueUrl)
45 | print('update success')
46 | update.result$continueUrl
47 | # Upload: no id file yet, so publish as a new document and store its id
48 | }else{
49 | print('Start Uploading')
50 | dir.create(dirname(api_id.path), showWarnings = FALSE)
51 | # upload article on Rpubs (note.html instead of a hard-coded 'R1.html', so the file matches note.title)
52 | upload.result <- rpubsUpload(title = note.title,
53 | htmlFile = note.html
54 | )
55 | upload.result$id
56 | write.table(upload.result$id, api_id.path, row.names = FALSE, col.names = FALSE)
57 | browseURL(upload.result$continueUrl)
58 | print('upload success')
59 | upload.result$continueUrl
60 | }
61 | 


--------------------------------------------------------------------------------
/Source-File/套件與函式/.Rprofile:
--------------------------------------------------------------------------------
1 | options(rpubs.upload.method = "internal")  # sets the `rpubs.upload.method` option; presumably selects the transport used when publishing to RPubs -- TODO confirm


--------------------------------------------------------------------------------
/Source-File/套件與函式/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/套件與函式/1.png


--------------------------------------------------------------------------------
/Source-File/套件與函式/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/套件與函式/2.png


--------------------------------------------------------------------------------
/Source-File/套件與函式/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/套件與函式/3.png


--------------------------------------------------------------------------------
/Source-File/套件與函式/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/套件與函式/4.png


--------------------------------------------------------------------------------
/Source-File/套件與函式/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/套件與函式/5.png


--------------------------------------------------------------------------------
/Source-File/套件與函式/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/套件與函式/6.png


--------------------------------------------------------------------------------
/Source-File/套件與函式/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/套件與函式/7.png


--------------------------------------------------------------------------------
/Source-File/套件與函式/8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/套件與函式/8.png


--------------------------------------------------------------------------------
/Source-File/套件與函式/9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/套件與函式/9.png


--------------------------------------------------------------------------------
/Source-File/套件與函式/rpubs_conn/api_id.txt:
--------------------------------------------------------------------------------
1 | https://api.rpubs.com/api/v1/document/160671/b2f5e2c70a1940428a982bee35b08b9a
2 | 


--------------------------------------------------------------------------------
/Source-File/套件與函式/style.css:
--------------------------------------------------------------------------------
 1 | /* Whole document: serif body text, with a generic fallback when Times New Roman is missing */
 2 | body{
 3 |   font-family:  "Times New Roman", Times, serif;
 4 |   font-size: 14pt;
 5 | }
 6 | 
 7 | code.r{ /* elements matching code.r; falls back to any monospace font when Consolas is missing */
 8 |   font-size: 14pt;
 9 |   font-family:  "Consolas", monospace;
10 | }
11 | 
12 | pre { /* preformatted blocks; same monospace fallback as code.r */
13 |   font-size: 14.5px;
14 |   font-family:  "Consolas", monospace;
15 | }


--------------------------------------------------------------------------------
/Source-File/套件與函式/套件與函式.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "DM---套件與函式"
  3 | author: "POLab"
  4 | date: "2017/05/19"
  5 | output:
  6 |   html_document:
  7 |       css: style.css
  8 | ---
  9 | 
 10 | <a href="https://github.com/PO-LAB/Data-Mining" target="_blank">【回到首頁】</a> 
 11 | 
 12 | ------
 13 |   
 14 | 了解資料型態，在R語言中只是第一步。若要熟練地處理/分析資料，必須要掌握另一個技巧：   
 15 | **套件(package)與函式(function)。**   
 16 |    
 17 | 沒有寫過程式的人，可能對函式(function)不是那麼熟悉，但其實理解起來並不難。    
 18 | 還記得國中的數學課，有一個章節叫「代數與函數」嗎？   
 19 | 我們可能會根據某一道題目，建立一個f(x)=2x+3的式子對吧？這裡的f(x)，就是「函式」(函數)。   
 20 | 函式之所以好用，就是因為裡面已經幫你制定好規則。   
 21 | 因此，即使把不同的情況(x=2, x=3)丟入，也會根據相同的規則(f(x)=2x+3)，產出結果(f(x=2)=7, f(x=3)=9)。   
 22 |    
 23 | 由於R是一個Open Source的程式語言(簡單來說，全世界的人在R語言上進行開發，撰寫自己的函式)，因此具有豐沛的函式集，也就是「套件」(package)。   
 24 |    
 25 | 以下將會介紹R裡面一些實用的函式，以及如何根據自己的需求，搜尋到對應的套件並安裝。
 26 | 
 27 | ------
 28 | 
 29 | #**1. 函式(function)**
 30 | 
 31 | ##‧summary()   
 32 | 如果還記得的話，其實在上一篇的[文章](https://rpubs.com/skydome20/R-Note2-dataType)中，已經介紹過許多函式(`is.integer()`, `data.frame()`, `c()`, `str()`...)   
 33 | 其中，`str()`十分好用，尤其在了解一筆資料時或未知的變數時，能派上很大的用場。   
 34 | 除此之外，還有一個相當強大、具有類似功能的函式，叫做`summary()`。   
 35 | 我們先拿R內建的鳶尾花資料(iris)，比較看看`str()`和`summary()`的功用吧：
 36 | ```{r}
 37 | str(iris)
 38 | summary(iris)
 39 | ```
 40 | 你會發現，`str()`列出資料內每個欄位的狀態，   
 41 | 而`summary()`則給出每個欄位的「最大值」、「最小值」、「平均值」、「中位數」、「第一四分位數」...等等。   
 42 | 那麼，`str()`和`summary()`的差別究竟在哪裡呢？   
 43 | 這裡要教一個實用的小技巧：**當你不清楚「一個函式的用途」、「該如何使用」時，要從哪裡找到說明文件(Help)？**   
 44 | 答案很簡單！請在console裡面，輸入`?str`和`?summary`指令(如下圖)：
 45 | <img src="1.png" />  
 46 | 
 47 | ------
 48 | 
 49 | 你會發現，在右手邊的「Help」欄位，會出現說明文件，裡面包括「函式的套件」、「函式的用途」、「參數說明」、「使用上的教學示範」...   
 50 | 這個技巧，在日後會時常用到！   
 51 | **(重要觀念：當接觸一個新的函式時，第一步就是要去了解「它的用途」、以及「如何去使用」！)**   
 52 |    
 53 | (小練習)請試著在console裡，查詢`summary()`的說明文件
 54 | 
 55 | ##‧敘述統計函式   
 56 | 敘述統計是進行資料分析時，有效瀏覽(explore)、了解(recognize)資料狀態的步驟。   
 57 | 作為一個資料分析軟體，R裡面自然有許多可以協助我們進行敘述統計分析的函式：   
 58 | 
 59 | * `mean()`：平均值   
 60 | * `var()`：變異數   
 61 | * `sd()`：標準差   
 62 | * `median()`：中位數   
 63 | * `max()`：最大值   
 64 | * `min()`：最小值   
 65 | * `sum()`：總和(加總)      
 66 | * `quantile()`：分位數     
 67 | * `range()`：最小值與最大值(兩者的差即為全距)    
 68 | 
 69 | 對鳶尾花資料(iris)進行簡單的示範：   
 70 | ```{r}
 71 | # 從iris的資料集中，取"Sepal.Length"(花萼長度)這個欄位的資料出來(利用神奇符號$)
 72 | iris$Sepal.Length  
 73 | ```
 74 | 
 75 | 發現到了嗎？從data frame裡面取出來的「某一欄」資料，就是一個陣列(vector)的形式。(代表每一筆觀測值的花萼長度)   
 76 | 現在，就把上面這些函式，應用到這個陣列資料上吧(雖然有點無聊XD)：   
 77 | ```{r, results='hold'}
 78 | mean(iris$Sepal.Length)     #「花萼長度」的平均值
 79 | var(iris$Sepal.Length)      #「花萼長度」的變異數
 80 | sd(iris$Sepal.Length)       #「花萼長度」的標準差
 81 | median(iris$Sepal.Length)   #「花萼長度」的中位數
 82 | max(iris$Sepal.Length)      #「花萼長度」中的最大值
 83 | min(iris$Sepal.Length)      #「花萼長度」中的最小值
 84 | sum(iris$Sepal.Length)      #「花萼長度」加總
 85 | range(iris$Sepal.Length)    #「花萼長度」最小值和最大值(全距)
 86 | ```
 87 | 
 88 | 然而`quantile()`這個函式有點特殊：   
 89 | ```{r}
 90 | quantile(iris$Sepal.Length, probs=0.25)  # 第一四分位數 
 91 | quantile(iris$Sepal.Length, probs=0.75)  # 第三四分位數 
 92 | ```
 93 | 
 94 | 注意到了嗎？在`quantile()`這個函式裡面，除了`iris$Sepal.Length`以外，後面加了一個逗號，並且多出`probs`的參數。   
 95 | 這代表什麼意思呢，當你對函式有所困惑時，就讓我們偷看一下**說明文件**吧！
 96 | <img src="2.png" />    
 97 | 
 98 | 明白了嗎？所以：   
 99 | `probs=0.25`，代表「第一四分位數」；   
100 | `probs=0.1`，代表「在連續分布上，對應到機率0.1的那個點是多少」   
101 | 以此類推...就是這麼簡單喔！   
102 | (話說回來，你能從說明文件中注意到，`quantile()`這個函式是屬於哪個套件嗎？**(Ans:`stats`套件)**)   
103 | 
104 | ------
105 | 
106 | 那如果資料裡面，有遺漏值的話呢？
107 | ```{r}
108 | a <- c(1, 2, 3, 5, 8, 13, 21, NA, 55)
109 | sum(a)
110 | ```
111 | 
112 | 遇到資料有遺漏值的時候，函式往往會無法運作，最後結果如上面顯示是`NA`。    
113 | 但如果我們想要把「NA以外的值」加總起來，又該怎麼做？    
114 | 其實挺容易的：
115 | ```{r}
116 | sum(a, na.rm=TRUE)
117 | ```
118 | 
119 | 注意到我們在`sum()`函式裡面，多加一個參數`na.rm=TRUE`。   
120 | 其中，`na.rm`意味著"remove NA"，表示若資料裡面有遺漏值`NA`，會把它忽略，對剩下的資料進行加總。   
121 | 
122 | 我們也可以查看`sum()`的說明文件，發現在函式裡面，果然有`na.rm`的參數可以設定(預設是`FALSE`)：
123 | <img src="3.png" />   
124 | 
125 | ------
126 | 
127 | ##‧其他實用函式   
128 | 除了敘述統計的函式以外，R裡面還有一些好用的函式，常常會在處理資料時派上用場。   
129 | 以下列出個人自認好用的函式，但這裡不會多加解釋。   
130 | 若有困惑的話，先閱讀函式的說明文件，再加上自我練習，才能漸漸掌握每個函式的使用：   
131 | 
132 | ##paste(), append()   
133 | ```{r, results='hold'}
134 | # paste(): concatenate strings -- joins "Happy" and "White-Day"; `sep` is the separator placed between them
135 | paste("Happy", "White-Day", sep=" ") 
136 | 
137 | # append(): join two vectors end to end
138 | b <- c(1,2,3)
139 | c <- c(4,5,6)
140 | append(b, c)
142 | 
143 | ##rbind(), cbind()   
144 | ```{r}
145 | a <- data.frame(x=c(1,2,3), y=c("Henry", "Lun", "Kevin"))
146 | b <- data.frame(x=c(4,5,6), y=c("Helen", "Tommy", "Leon"))
147 | rbind(a,b) # rbind()：把兩個data frame，依據row串接起來
148 | cbind(a,b) # cbind()：把兩個data frame，依據column串接起來
149 | ```
150 | 
151 | ##sample(), seq()   
152 | ```{r, results='hold'}
153 | sample(x=1:100, size=10)  # 從1~100數字中，隨機挑10個數字，產生一個數列(vector)
154 | seq(from=1, to=5, by=0.5) # 產生一個「從1開始，每次加0.5，直到5為止」的數列(vector)
155 | ```
156 | 
157 | ##head(), tail()
158 | ```{r}
159 | head(iris, n=6)  # head(): 顯示data frame中的前6筆資料
160 | tail(iris, n=6)  # tail(): 顯示data frame中的後6筆資料
161 | ```
162 | 
163 | ##order(), sort()
164 | ```{r}
165 | a <- sample(x=1:100, size=10) # 從1~100數字中，隨機挑10個數字，產生一個數列(vector)
166 | a
167 | # 用order()，把數列由大排到小；從小排到大，decreasing = FALSE
168 | a[order(a, decreasing=TRUE)]   
169 | # 和 a[order(a, decreasing=TRUE)] 一樣效果
170 | sort(a, decreasing=TRUE)      
171 | ```
172 | 
173 | ##unique(), duplicated()
174 | ```{r}
175 | a <- c("A", "B", "C", "A", "A", "B")
176 | unique(a)       # 萃取資料中unique的element
177 | ```
178 | ```{r, results='hold'}
179 | duplicated(a)            # 若後面有重複的資料，函式會回傳TRUE，而第一個資料會是FALSE
180 | a[duplicated(a)==FALSE]  # 和 unique(a)一樣效果
181 | ```
182 | 
183 | ##which()
184 | ```{r}
185 | # which()：找出第幾個element是TRUE(在a裡面，第幾個element的值等於100)
186 | a <- c(68,73,99,100,56,100,85,36)
187 | which(a==100)
188 | ```
189 | 除此之外，網路上有人詳列出<a href="http://www3.nccu.edu.tw/~99354011/R%20commands%2811.09.13%29.pdf" target="_blank">各種好用的函式</a>的一覽表，當作參考。
190 | 
191 | ------
192 | 
193 | #**2. 套件(package)**
194 | 基本上，每一個函式都有對應的套件(package)。   
195 | 不過你可能會覺得奇怪，當我們在使用上面那些函式時，似乎沒怎麼琢磨在套件上？   
196 | 那是因為，R已經將那些套件內建在裡面，因此我們可以直接呼叫(call)這些函式來使用。    
197 |    
198 | 現在問題來了，如果我們想使用那些「沒有內建在R裡面」的函式呢？   
199 | 例如，我們想要使用一個強大的繪圖函式`ggplot()`：
200 | ```{r, error=TRUE}
201 | ggplot(data=CO2)
202 | ```
203 | 結果R跟你說「沒有這個函數」，這代表`ggplot()`函式沒有內建在R裡面。   
204 | 因此，你得另外**(1)安裝**對應的套件(package)，並且**(2)匯入到R**裡面才可以！
205 | 
206 | ------
207 | 
208 | ###(1)安裝套件   
209 | 我們先求助google大神，看`ggplot()`對應到的套件是哪一個？
210 | <img src="4.png" />   
211 | 光看搜尋結果，就會發現對應的套件是**`ggplot2`**。   
212 | 不過我們還是點開這個有CRAN字樣的網站：CRAN(Comprehensive R Archive Network)，是R的官方網站之一，裡面收錄了所有經過R官方核可後的函式、以及對應的說明文件，就像這樣：   
213 | <img src="5.png" />   
214 | 因此，現在我們知道要使用`ggplot()`函式，需要先下載**`ggplot2`**套件。   
215 | 
216 | ------
217 | 
218 | 既然知道套件名稱，那麼就要把套件下載下來。   
219 | 下載R的套件有兩種方式：
220 | 
221 | **[1] 用RStudio介面：**   
222 | 在RStudio上點開「Tools」->「Install Packages」 ：
223 | <img src="6.png" />   
224 |    
225 | 在中間填入想要安裝的套件名稱，點「Install」就大功告成！
226 | <img src="7.png" />   
227 |    
228 | **[2] 用R的函式(指令)：**   
229 | 直接在console裡面輸入函式`install.packages("套件名稱")`！
230 | <img src="8.png" />   
231 |    
232 | ------
233 | 
234 | ###(2)匯入到R   
235 | 當出現這樣的畫面，就代表安裝完成，這代表你已經把R的套件下載到電腦裡面了！    
236 | <img src="9.png" />   
237 |  
238 | 接下來，要使用套件裡面的函式，要將套件匯入到R裡面才可以，   
239 | 這時就要用到`library()`/`require()`，用來「匯入套件」的函式：
240 | ```{r}
241 | library(ggplot2) #匯入ggplot2套件到R裡面
242 | ggplot(data=CO2) + geom_boxplot(data=CO2,aes(x=conc, y=uptake, colour=Plant))
243 |  
244 | ```   
245 | 
246 | 如此一來，就可以順利使用`ggplot()`函式進行繪圖囉～   
247 | (P.S.關於ggplot繪圖的相關細節，會在之後的筆記中提及，請不用擔心！)   
248 | 
249 | ------
250 | 
251 | #**總結**
252 | 套件和函式是R很重要的功能，但我們不可能記住所有的套件和對應的函式。   
253 | 因此，**如何妥善運用google和說明文件**，是在R的學習之路上很重要的一環！   
254 | 
255 | 接下來，會進入比較接近「程式設計」的主題：條件式(if-else, switch)、迴圈(for, while)，與函式搭配的情形。   
256 | 謝謝大家！   
257 | It's still a long way to go~   
258 | 
259 | ------
260 | 
261 | #**(額外) Happy White Day**
262 | 今天恰好是3/14，祝大家白色情人節快樂！(<a href="http://stackoverflow.com/questions/8082429/plot-a-heart-in-r" target="_blank">R code from here</a>)
263 | ```{r}
264 | require(graphics)   # base plotting functions (plot, polygon, points)
265 | dat<- data.frame(t=seq(0, 2*pi, by=0.1) )   # curve parameter t sampled from 0 to 2*pi in steps of 0.1
266 | xhrt <- function(t) 16*sin(t)^3   # x-coordinate of the heart curve as a function of t
267 | yhrt <- function(t) 13*cos(t)-5*cos(2*t)-2*cos(3*t)-cos(4*t)   # y-coordinate of the heart curve as a function of t
268 | dat$y=yhrt(dat$t)   # evaluate y for every sampled t
269 | dat$x=xhrt(dat$t)   # evaluate x for every sampled t
270 | with(dat, plot(x,y, type="l"))   # draw the outline first; this also sets up the plot region
271 | with(dat, polygon(x,y, col="hotpink"))    # fill the outline on top of the existing plot
272 | points(c(10,-10, -15, 15), c(-10, -10, 10, 10), pch=169, font=5)   # four extra marks at fixed positions; presumably heart glyphs from the symbol font (font=5) - confirm on the target graphics device
273 | ```
274 | 


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/1.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/10.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/11.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/12.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/12.PNG


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/13.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/14.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/15.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/16.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/17.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/18.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/19.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/2.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/20.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/21.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/22.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/22.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/23.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/23.PNG


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/3.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/4.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/5.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/6-1.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/6-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/6-2.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/7.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/8.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/9.png


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/Thumbs.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/Thumbs.db


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/_Rhistory:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/安裝R與RStudio/_Rhistory


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/_Rprofile:
--------------------------------------------------------------------------------
1 | options(rpubs.upload.method = "internal")


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/style.css:
--------------------------------------------------------------------------------
 1 | /* Whole document: */
 2 | body{
 3 |   font-family:  "Times New Roman";
 4 |   font-size: 14pt;
 5 | }
 6 | 
 7 | code.r{
 8 |   font-size: 14pt;
 9 |   font-family:  "Consolas";
10 | }
11 | 
12 | pre {
13 |   font-size: 14.5px;
14 |   font-family:  "Consolas";
15 | }


--------------------------------------------------------------------------------
/Source-File/安裝R與RStudio/安裝R與RStudio.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "DM---安裝R與RStudio"
  3 | author: "POLab"
  4 | date: "2017/05/19"
  5 | output: 
  6 |   html_document:
  7 |         css: style.css
  8 | ---
  9 | <a href="https://github.com/PO-LAB/Data-Mining" target="_blank">【回到首頁】</a> 
 10 | 
 11 | ------
 12 |    
 13 | 工欲善其事，必先利其器。
 14 | 
 15 | 要學會寫程式就必須先會安裝軟體，我們的R實作課需要利用到兩個軟體，除了「R」以外，還必須額外安裝「RStudio」，是R專屬的IDE(<a href="https://en.wikipedia.org/wiki/Integrated_development_environment" target="_blank">Integrated Development Environment</a>)。   
 16 | 
 17 | 簡單來說，RStudio可以協助你更方便、輕鬆地撰寫R的程式。
 18 | 
 19 | 以下是安裝R和RStudio的圖文說明，Let's go!  
 20 | 
 21 | # (一)安裝R   
 22 | 
 23 | ------
 24 | 
 25 | >####**1. 在瀏覽器搜尋「R」**
 26 | <img src="1.png" />   
 27 | 
 28 | >####**2. 點擊第一個連結**
 29 | <img src="2.png" />   
 30 | 
 31 | >####**3. 點擊「download R」**
 32 | <img src="3.png" />   
 33 | 
 34 | >####**4. 往下滑，找到「Taiwan」，點擊第二個下載點**
 35 | <img src="4.png" />   
 36 | 
 37 | >####**5. 根據自己的電腦作業系統(windows, Linux, Mac)，點擊適合的下載連結**
 38 | <img src="5.png" />   
 39 | 
 40 | >####**6-1. (windows版)點擊「base」，點選第一個連結**
 41 | <img src="6-1.png" />   
 42 | 
 43 | >####**6-2. (Mac版)點擊第一個連結**
 44 | <img src="6-2.png" />   
 45 | 
 46 | >####**7. 最後，你會下載一個安裝檔(Windows為.exe，Mac為.pkg)，把這個安裝檔安裝起來就好囉！(一直點下一步/同意，採用預設設定就好)**
 47 | 
 48 | 
 49 | # (二)安裝RStudio   
 50 | 
 51 | ------
 52 | 
 53 | >####**1. 在瀏覽器搜尋「RStudio」，點擊第一個連結**
 54 | <img src="7.png" />    
 55 | 
 56 | >####**2. 來到這個畫面，從上方的「Products」裡點擊「RStudio」**
 57 | <img src="8.png" />   
 58 | 
 59 | >####**3. 安裝一般桌面版的RStudio**
 60 | <img src="9.png" />   
 61 | 
 62 | >####**4. 點擊左邊的版本，會進入下載點**
 63 | <img src="10.png" />    
 64 | 
 65 | >####**5. 根據自己的電腦，選擇Windows或Mac的版本**
 66 | <img src="11.png" />   
 67 | 
 68 | >####**6. 最後，和R一樣，會下載一個安裝檔(Windows為.exe，Mac為.dmg)，把這個安裝檔安裝起來就好囉！(一直點下一步/同意，採用預設設定就好)**
 69 | 
 70 | # (三)安裝完畢，開始使用RStudio   
 71 | 
 72 | ------
 73 | 
 74 | >如果有看到桌面上出現RStudio捷徑，就表示安裝完成了。
 75 | <img src="12.PNG" />   
 76 | 
 77 | >把RStudio點開來以後，展開的畫面會像這樣：
 78 | <img src="13.png" />   
 79 | 
 80 | >依照以下順序**(1.點擊「File」 -> 2.「New File」 -> 3. 「R Script」)**
 81 | <img src="14.png" />   
 82 | 
 83 | >會看到左上角成功新增了一個新的視窗，而以下是完整的RStudio介面，我們也已經完成寫R程式之前的前置作業！
 84 | <img src="15.png" />   
 85 | 
 86 | >RStudio的介面主要可以分成四個區塊：
 87 | <img src="16.png" />   
 88 | 
 89 | >直接用例子來解釋吧！(藉由在操作的過程中，慢慢熟悉RStudio的操作規則，理解基本的觀念。)   
 90 | >   
 91 | >   
 92 | >首先，在R Script裡面寫下程式，然後按「執行」查看結果；或是針對需要執行的程式碼，游標移至那行，按下Ctrl+Enter(亦可反白多行一次執行) 
 93 | <img src="17.png" />   
 94 | 
 95 | 
 96 | 
 97 | >你會發現，在正下方的Console中，顯示了上面程式碼的結果。
 98 | >而在右上角，則會顯示出你所定義好的變數名稱及概略內容。
 99 | >此外，你可能還會發現，R是用```<-```符號來進行變數的定義(assign value)
100 | <img src="18.png" />   
101 | 
102 | >當然，R裡面遵守先乘除、後加減的法則。   
103 | >(可以把這行輸入在console中，或是先在R Script寫好再執行)
104 | >```{r}
105 | 3 * 5 + 4 / 2
106 | ```
107 | 
108 | >如果要先進行加減，可以用括號括起來，就會先進行括號內的運算。   
109 | >(可以把這行輸入在console中，或是先在R Script寫好再執行)：
110 | >```{r}
111 | 3 * (5 + 4) / 2
112 | ```
113 | 
114 | >到目前為止，已經完成了「在自己的電腦裡安裝好R和RStudio」，「簡單熟悉了操作介面」的步驟。   
115 | >接下來，3/14的助教課要開始學習R裡面的<a href="http://rpubs.com/skydome20/R-Note2-dataType" target="_blank">基本資料型態</a>。   
116 | 
117 | 
118 | # (額外)介面設定
119 | 
120 | ------
121 | 
122 | >這是額外的小技巧！   
123 | >調整「RStudio」介面的「主題」、「顏色」、「字體」的風格，可以隨自身的喜好去改變。   
124 | <img src="19.png" />   
125 | 
126 | >**調整Scripts & Console的背景顏色與字體大小**
127 | >
128 | >點選「Tools」 -> 「Global Options」
129 | <img src="20.png" />
130 | 
131 | >左方找「Appearance」，就會出現這樣的畫面：
132 | <img src="21.png" />  
133 | >最後記得點選Apply再按OK就完成設定囉！
134 | 
135 | >不一樣的感受！
136 | <img src="22.png" />  
137 | 
138 | ------
139 | 
140 | >這也是額外的小技巧！
141 | >
142 | >**可以透過設定，調整Scripts視窗中的程式呈現方式(自動換行)**
143 | >
144 | >點選「Tools」 -> 「Global Options」 -> 「Code」 -> 「Editing」
145 | <img src="23.PNG" />
146 | >
147 | >
148 | >勾選「Soft-wrap R source files」，假如沒有勾選的話，當程式撰寫太長時不會自動換行。
149 | >
150 | >同學可以自行測試看看~
151 | 
152 | ------
153 | 
154 | 


--------------------------------------------------------------------------------
/Source-File/決策樹/_Rprofile:
--------------------------------------------------------------------------------
1 | options(rpubs.upload.method = "internal")


--------------------------------------------------------------------------------
/Source-File/決策樹/style.css:
--------------------------------------------------------------------------------
 1 | /* Whole document: */
 2 | body{
 3 |   font-family:  "Times New Roman";
 4 |   font-size: 14pt;
 5 | }
 6 | 
 7 | code.r{
 8 |   font-size: 14pt;
 9 |   font-family:  "Consolas";
10 | }
11 | 
12 | pre{
13 |   font-size: 16px;
14 |   font-family:  "Times New Roman";
15 | }


--------------------------------------------------------------------------------
/Source-File/決策樹/titanic.raw.rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/決策樹/titanic.raw.rdata


--------------------------------------------------------------------------------
/Source-File/決策樹/決策樹.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "DM---決策樹(Decision Tree)"
  3 | author: "POLab"
  4 | date: "2017/05/19"
  5 | output:
  6 |   html_document:
  7 |       css: style.css
  8 | ---
  9 | 
 10 | <a href="https://github.com/PO-LAB/Data-Mining" target="_blank">【回到首頁】</a> 
 11 |    
 12 | ------
 13 |   
 14 | 本篇內容，會繼續介紹常用的資料探勘模型之一：決策樹(Decision Tree)。   
 15 | 
 16 | 
 17 | 這裡拿網路上一個公開資料(鐵達尼號的乘客資料)來進行分析，<a href="http://www.rdatamining.com/data/titanic.raw.rdata?attredirects=0&d=1" target="_blank">資料載點如下</a>。   
 18 | 
 19 | ------ 
 20 |    
 21 | #**決策樹(Decision Tree)**   
 22 | 
 23 | 無論在分類或預測上，決策樹的演算法都有很好的效果。   
 24 |    
 25 | 但它最強大的地方，莫過於樹狀分支的結構：可以明顯呈現分類的規則！與一些機器學習的方法(NN, SVM...)相比，相當容易進行解釋，以及分析規則之間的關係。   
 26 | 
 27 | 這裡就簡單用CART決策樹來練習，對應的套件是`rpart`，一樣使用上次鐵達尼號的資料：
 28 | 
 29 | ```{r}
 30 | # 記得要給定資料所在的路徑(path)，例如：我把下載的資料放在C槽下：
 31 | load("C:/titanic.raw.rdata")  #匯入.rdata檔
 32 | ```    
 33 | 
 34 | ```{r, message=F,warning=F}
 35 | require(rpart)
 36 | 
 37 | # 先把資料區分成 train=0.8, test=0.2 
 38 | set.seed(22)
 39 | train.index <- sample(x=1:nrow(titanic.raw), size=ceiling(0.8*nrow(titanic.raw) ))
 40 | train <- titanic.raw[train.index, ]
 41 | test <- titanic.raw[-train.index, ]
 42 | 
 43 | # CART的模型：把存活與否的變數(Survived)當作Y，剩下的變數當作X
 44 | cart.model<- rpart(Survived ~. , 
 45 |                     data=train)
 46 | 
 47 | # 輸出各節點的細部資訊(呈現在console視窗)
 48 | cart.model
 49 | ```   
 50 | 
 51 | 要畫出決策樹(視覺化)，雖然用平常的`plot()`就可以達成   
 52 |    
 53 | 但rpart有專屬的繪圖套件`rpart.plot`，函式是`prp()`   
 54 |    
 55 | 說真的，用`prp()`畫出來的決策樹，比較好看一些：
 56 | 
 57 | ```{r, message=F,warning=F}
 58 | require(rpart.plot)	
 59 | prp(cart.model,         # the fitted rpart model to draw
 60 |     faclen=0,           # show factor level names in full (no abbreviation)
 61 |     fallen.leaves=TRUE, # place all leaf nodes at the bottom of the plot
 62 |     shadow.col="gray",  # draw a gray shadow under the node boxes
 63 |     # extra=2 labels each node with: number of correct classifications / number of observations in that node
 64 |     extra=2)  
 65 | ```   
 66 |    
 67 | (最下面節點的數字，代表：**number of correct classifications / number of observations in that node**)   
 68 |    
 69 | 根據以上決策樹，可以發現**是男生或女生**其實很重要(因為是第一個分支規則)，其次是在船上的艙位等級。   
 70 |    
 71 | 因此，我們可以這樣解釋：   
 72 | 
 73 | ```
 74 | 即使是女性，可是擁有的艙位若是最低下的(3rd)，則大概有一半的死亡機率(82/155=53%)；   
 75 | 但當妳的艙位高人一等時，則有相當高的存活機率(197/208=95%)。  
 76 | ```    
 77 | 
 78 | 又或者是：   
 79 | 
 80 | ```
 81 | 當你是男性成人時，大概有八成機率會死(1084/1348=80%)  
 82 | ```   
 83 | 
 84 | 以及
 85 | 
 86 | ```
 87 | 若是男性小孩，就和艙位等級有關：高級艙位的小孩全都獲救(13/13)，可是低艙位的小孩有七成機率(26/37=70%)會死。  
 88 | ```   
 89 | 
 90 | **(男生好可憐)**   
 91 | 
 92 | ●也可用另一個繪圖套件`partykit`，函式是`as.party()`和`plot()`
 93 | 
 94 | ```{r, message=F,warning=F}
 95 | require(partykit)	
 96 | rparty.tree <- as.party(cart.model) # 轉換cart決策樹
 97 | rparty.tree # 輸出各節點的細部資訊
 98 | plot(rparty.tree) 
 99 | ``` 
100 | 
101 | 用這個套件畫出來的圖也是蠻容易一目了然的呢!
102 | 有不一樣的感覺~
103 | 
104 | ------
105 |    
106 | 有決策樹之後，就要進行預測！   
107 |    
108 | 還記得在線性迴歸使用過的`predict()`嗎？這時就會派上用場囉(在這裡，會同時計算預測準確率)：    
109 |    
110 | ```{r}
111 | pred <- predict(cart.model, newdata=test, type="class")
112 | 
113 | # 用table看預測的情況
114 | table(real=test$Survived, predict=pred)
115 | 
116 | # 計算預測準確率 = 對角線的數量/總數量
117 | confus.matrix <- table(real=test$Survived, predict=pred)
118 | sum(diag(confus.matrix))/sum(confus.matrix) # 對角線的數量/總數量
119 | ```
120 | 
121 | ------
122 | 
123 | 結果顯示，模型在測試集中的預測能力大約77%，但模型的預測準確率還有提升的可能嗎？我們繼續對模型進行修樹~
124 | ```{r}
125 | printcp(cart.model) # 先觀察未修剪的樹，CP欄位代表樹的成本複雜度參數
126 | ``` 
127 | 
128 | ```{r}
129 | plotcp(cart.model) # 畫圖觀察未修剪的樹
130 | ```   
131 | 
132 | ```{r}
133 | prunetree_cart.model <- prune(cart.model, cp = cart.model$cptable[which.min(cart.model$cptable[,"xerror"]),"CP"]) # 利用能使決策樹具有最小誤差的CP來修剪樹
134 | ```
135 | 
136 | 修剪完決策樹之後，讓我們重新建構一次預測模型
137 | ```{r}
138 | prunetree_pred <- predict(prunetree_cart.model, newdata=test, type="class")
139 | 
140 | # 用table看預測的情況
141 | table(real=test$Survived, predict=prunetree_pred)
142 | 
143 | prunetree_confus.matrix <- table(real=test$Survived, predict=prunetree_pred)
144 | sum(diag(prunetree_confus.matrix))/sum(prunetree_confus.matrix) # 對角線的數量/總數量
145 | ```
146 | 
147 | 很顯然，模型的預測準確率並沒有提升，一樣是大約77%，這是因為我們在修剪時所挑選到滿足條件的CP值為**0.01**，而函式`rpart()`預設的CP值就是**0.01**，故前後模型的結果一致。
148 | 
149 | ------
150 | 
151 | 再來，我們爲了避免模型過度擬合(overfitting)，故要利用K-fold Cross-Validation的方法進行交叉驗證，我們使用`caret`這個套件，而K先設定為10次~
152 | ```{r, message=F,warning=F}
153 | require(caret)
154 | require(e1071)
155 | train_control <- trainControl(method="cv", number=10)
156 | train_control.model <- train(Survived~., data=train, method="rpart", trControl=train_control)
157 | train_control.model
158 | ```
159 | 
160 | 然而，我們一開始修剪樹之後所得到的決策樹模型，最佳的預測準確率大約為77%，而現在再透過交叉驗證所Tune得的參數，使得模型的最佳預測準確率大約提升為78%。
161 | 
162 | ------
163 | 
164 | #**總結**    
165 | 
166 | 在資料探勘領域中，決策樹(Decision Tree)是相當常見的方法，例如在醫學研究上，對某種特定的疾病(糖尿病，代謝症候群等)找出可以前期篩檢分類，或是預測的因子時，就常以決策樹的方法來進行，而決策樹較爲不同之處在於可用圖像化來呈現結果，即使不了解背後理論，仍可解讀並判斷。
167 | 
168 | 進行決策樹分析要注意的是，當樣本存在類別不平衡的問題時，決策樹對於小類的樣本根本無能為力，模型的效能會大打折扣。
169 | 
170 | 
171 | ------
172 | 


--------------------------------------------------------------------------------
/Source-File/流程控制/.Rhistory:
--------------------------------------------------------------------------------
 1 | getwd()
 2 | ?rpubsUpload
 3 | require(markdown)
 4 | result <- rpubsUpload(title='test',
 5 | htmlFile='R11.html',
 6 | method=getOption('rpubs.upload.method','auto')
 7 | )
 8 | result$id
 9 | ?rpubsUpload
10 | result <- rpubsUpload(title='test',
11 | id = 'https://api.rpubs.com/api/v1/document/169098/0632eea128d44edeaeb7de1a077ade2f',
12 | htmlFile='R11.html',
13 | method=getOption('rpubs.upload.method','auto')
14 | )
15 | result$continueUrl
16 | result <- rpubsUpload(title='test',
17 | #id = 'https://api.rpubs.com/api/v1/document/169098/0632eea128d44edeaeb7de1a077ade2f',
18 | htmlFile='R11.html',
19 | method=getOption('rpubs.upload.method','auto')
20 | )
21 | result$continueUrl
22 | result <- rpubsUpload(title='eeee',
23 | id = 'https://api.rpubs.com/api/v1/document/169098/0632eea128d44edeaeb7de1a077ade2f',
24 | htmlFile='R11.html',
25 | method=getOption('rpubs.upload.method','auto')
26 | )
27 | result$continueUrl
28 | browseURL(result$continueUrl)
29 | result <- rpubsUpload(title='eeee',
30 | id = 'https://api.rpubs.com/api/v1/document/169098/0632eea128d44edeaeb7de1a077ade2f',
31 | htmlFile='R11.html',
32 | method=getOption('rpubs.upload.method','auto')
33 | )
34 | result$continueUrl
35 | browseURL(result$continueUrl)
36 | result <- rpubsUpload(title='eeee',
37 | id = 'https://api.rpubs.com/api/v1/document/169098/0632eea128d44edeaeb7de1a077ade2f',
38 | htmlFile='R11.html'
39 | )
40 | result$continueUrl
41 | browseURL(result$continueUrl)
42 | result <- rpubsUpload(title='eeee',
43 | htmlFile='R11.html'
44 | )
45 | browseURL(result$continueUrl)
46 | result <- rpubsUpload(title='test',
47 | htmlFile='R11.html',
48 | method=getOption('rpubs.upload.method','auto')
49 | )
50 | browseURL(result$continueUrl)
51 | result <- rpubsUpload(title='test',
52 | htmlFile='R11.html',
53 | id = 'https://api.rpubs.com/api/v1/document/169098/0632eea128d44edeaeb7de1a077ade2f'
54 | )
55 | result$continueUrl
56 | result <- rpubsUpload(title='test',
57 | htmlFile='R11.html',
58 | id = 'https://api.rpubs.com/api/v1/document/169098/0632eea128d44edeaeb7de1a077ade2f'
59 | )
60 | browseURL(result$continueUrl)
61 | require(markdown)
62 | result <- rpubsUpload(title='test',
63 | htmlFile='R11.html',
64 | id = 'https://api.rpubs.com/api/v1/document/169098/0632eea128d44edeaeb7de1a077ade2f'
65 | )
66 | require(markdown)
67 | api_id.path <- 'rpubs_conn/api_id.txt'
68 | note.title <- 'R11'
69 | note.html <- 'R11.html'
70 | # Update
71 | if (file.exists('rpubs_conn/api_id.txt')){
72 | print('Start Updating')
73 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = F)[, 1]
74 | # update article on Rpubs
75 | update.result <- rpubsUpload(title = note.title,
76 | htmlFile = note.html,
77 | id = api.id
78 | )
79 | browseURL(update.result$continueUrl)
80 | print('update success')
81 | update.result$continueUrl
82 | # Upload
83 | }else{
84 | print('Start Uploading')
85 | dir.create('rpubs_conn')
86 | # upload article on Rpubs
87 | upload.result <- rpubsUpload(title = note.title,
88 | htmlFile = 'R1.html'
89 | )
90 | upload.result$id
91 | write.table(upload.result$id, api_id.path, row.names = F, col.names = F)
92 | browseURL(upload.result$continueUrl)
93 | print('upload success')
94 | upload.result$continueUrl
95 | }
96 | 


--------------------------------------------------------------------------------
/Source-File/流程控制/.Rprofile:
--------------------------------------------------------------------------------
1 | options(rpubs.upload.method = "internal")


--------------------------------------------------------------------------------
/Source-File/流程控制/style.css:
--------------------------------------------------------------------------------
 1 | /* Whole document: */
 2 | body{
 3 |   font-family:  "Times New Roman";
 4 |   font-size: 14pt;
 5 | }
 6 | 
 7 | code.r{
 8 |   font-size: 14pt;
 9 |   font-family:  "Consolas";
10 | }
11 | 
12 | pre {
13 |   font-size: 14.5px;
14 |   font-family:  "Consolas";
15 | }


--------------------------------------------------------------------------------
/Source-File/流程控制/流程控制.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "DM---流程控制(for, while, ifelse, switch)"
  3 | author: "POLab"
  4 | date: "2017/05/19"
  5 | output:
  6 |   html_document:
  7 |       css: style.css
  8 | ---
  9 | 
 10 | <a href="https://github.com/PO-LAB/Data-Mining" target="_blank">【回到首頁】</a> 
 11 |    
 12 | ------  
 13 | 
 14 | **<a href="http://goo.gl/jfl71c" target="_blank">流程控制</a>** 代表在程式執行時，指令、子程式或求值的順序。如果有寫過程式的人，應該會相當熟悉這類手法(for, while, if else)，以上就是所謂流程控制的**指令**。    
 15 | 
 16 |    
 17 | 
 18 | 在R裡面，流程控制的指令主要可以分為三類：   
 19 | 
 20 | * ####**邏輯判斷**：`>`、`<`、`==`、`!=`、`%in%`、`&`、`|`、`!`   
 21 | * ####**條件指令**：`if`、`else`、`ifelse`、`switch`  
 22 | * ####**迴圈指令**：`for`、`while`、`repeat`、`break`、`next`   
 23 |    
 24 |    
 25 | 在資料預處理的過程，或是在建模時使用交叉驗證(CV)，常常會需要使用這些指令，協助我們建立一套自動化的分析流程。    
 26 |    
 27 | 在許多主流的程式語言中，流程控制的思維幾乎是必備的！   
 28 | 
 29 | 當然不同語言之間，流程控制的語法不盡相同，可是只要掌握好基本的撰寫邏輯，之後在學習新的程式語言時，是可以一以貫之，只需要花心思和時間在熟悉語法上就可以。   
 30 |    
 31 | ------
 32 |    
 33 | #**1. 邏輯判斷**   
 34 | 
 35 | 在寫程式的時候，我們時常會需要判斷一些關係是否成立(例如：數字之間的大小關係，某數字是否存在於一個向量...)。
 36 | 
 37 | 在電機資訊領域(電路)，可以想成AND, OR, NOT這類的運算子。換句話說，當關係成立/不成立的時候，我們會得到`TRUE`/`FALSE`的值，然後時常搭配後面會介紹的「條件指令」一起使用。   
 38 | 
 39 | * ###**大於、小於、等於：**
 40 | ```{r}
 41 | # 相信大家小學都學過了！
 42 | 
 43 | x <- 5
 44 | 
 45 | c(
 46 |   x > 3    ,    # 1.大於
 47 |   x >= 3   ,    # 2.大於等於
 48 | 
 49 |   x < 3    ,    # 3.小於
 50 |   x <= 3   ,    # 4.小於等於
 51 |  
 52 |   x == 3   ,    # 5.等於
 53 |   x != 2        # 6.不等於
 54 | )
 55 | 
 56 | ```   
 57 | 
 58 | 
 59 | * ###**是否位於某向量內：**
 60 | ```{r}
 61 | # 在R裡面，判斷某個值(或向量)，是否存在於另一個向量之中，會使用 %in% 的符號
 62 | # 以往在判斷這類情況時，我們往往需要寫迴圈(for-loop)，一一將向量裡面的element拿出來，比對看是否成立
 63 | # R無疑提供了一個相當好用的運算子！
 64 | 
 65 | x <- 5
 66 | y <- c(0,2,3)
 67 | 
 68 | x %in% c(1,2,3,4,5)    # 值是否存在向量內
 69 | y %in% c(1,2,3,4,5)    # 向量內的各值，是否存在於另一個向量內
 70 | 
 71 | ```     
 72 | 
 73 | * ###**交集，聯集，否定：**
 74 | ```{r}
 75 | # 和電路中AND, OR, NOT的概念幾乎相似
 76 | 
 77 | x <- 5
 78 | y <- 8
 79 | 
 80 | # 括號內是True
 81 | !(x > 3)         # NOT ：非；否定(!)
 82 | # (True, False)
 83 | x > 3 & y > 10   # AND：和；交集(&)
 84 | # (True, False)
 85 | x > 3 | y > 10   # OR ：或；聯集(|)
 86 | 
 87 | 
 88 | # Difference between & and &&:
 89 | 
 90 | c(T,T,T) & c(F,T,T)    # a single & compares the two vectors element by element and returns a logical vector
 91 | c(T,T,T)[1] && c(F,T,F)[1]   # && takes length-one operands only (longer vectors are an error since R 4.3.0), so compare the first elements explicitly
 92 | 
 93 | ```
 94 |    
 95 | ------   
 96 | 
 97 | #**2. 條件指令**
 98 |    
 99 | 我們時常會遇到以下狀況：當「某些條件」成立時，要做A；反之，則做B的情況。   
100 | 
101 | 在生活中，這類的例子比比皆是。比方說，如果今天下雨的話，我就帶傘出門；反之，則不帶出門。在這個例子裡，「今天下雨」就是所謂的**條件**，「帶傘」就是A，「不帶傘」就是B。   
102 |  
103 | 在R裡面，主要有三種運用條件指令的方法：
104 | 
105 | 
106 | * ###**if 和 else的寫法：**
107 | ```{r, eval=FALSE}
108 | # 語法格式
109 | if('條件'){
110 |  '做A'
111 | }else{
112 |  '做B'
113 | }
114 | ```   
115 |   
116 | ```{r}
117 | 
118 | # 多行寫法
119 | if(3 > 2){
120 |   TRUE
121 | }else{
122 |   FALSE
123 | }
124 | 
125 | # 單行寫法
126 | if(3 > 2) TRUE else FALSE
127 | ```
128 |    
129 | * ###**ifelse的寫法：**
130 | ```{r, eval=FALSE}
131 | # 語法格式
132 | ifelse('條件', '條件若成立：做A', '條件若不成立：做B')
133 | ```   
134 | 
135 | ```{r}
136 | ifelse(2 > 3, T, F)
137 | ```
138 | 
139 | * ###**switch的寫法：**
140 | ```{r, eval=FALSE}
141 | # 語法格式
142 | switch('指定執行第幾行/哪個名稱的程式碼', 
143 |        '第一行：做A', 
144 |        '第二行：做B',
145 |        '第三行：做C',
146 |        '第四行：做D',
147 |        ...
148 |        )
149 | ```   
150 | 
151 | ```{r}
152 | # 指定第幾行
153 | switch(2,      # 指定執行第二行程式碼，故回傳4。(請自行修改數字，看不同的結果)
154 |        1+1,    # 第一行：1+1
155 |        2^2,    # 第二行：2的平方
156 |        3*6)    # 第三行：3*6
157 | 
158 | # 指定名稱
159 | switch("Tom",  # 指定執行名稱為Tom的這行程式碼，故回傳7 (請自行修改名稱，看不同的結果)
160 |        Tom = 2+5,         
161 |        Susan = 1*0,       
162 |        Helen = "Apple",
163 |        Lee = 1024)
164 | ```
165 | 
166 | 
167 | ------   
168 | 
169 | #**3. 迴圈指令**
170 |    
171 | 在處理資料的時候，我們時常會需要不斷重複相同的動作，這時候會需要使用到迴圈指令。   
172 | 
173 | 在R裡面，主要迴圈有`for`、`while`以及`repeat`，並且搭配`break`(跳出迴圈)和`next`(省略此次迴圈，執行下一次迴圈)來創造彈性的應用。
174 | 
175 | * ###**for-loop：**
176 | ```{r}
177 | # 計算 1+2+3+4...+135 的值是多少？
178 | 
179 | result <- 0
180 | 
181 | for(i in c(1:135)){ # for-loop裡，i會依序帶入1~135的值，重複進行括號內的程式碼
182 |   
183 |   # 迴圈內重複進行的動作
184 |   result <- result + i
185 | }
186 | 
187 | result
188 | ```   
189 | 
190 | * ###**while-loop：**
191 | ```{r}
192 | # 計算 1+2+3+4...+135 的值是多少？
193 | i <- 1
194 | result <- 0
195 | 
196 | while(i < 136){   # while-loop當符合裡面的條件時，就會一直重複括號內的程式碼，直到不符合為止
197 |   
198 |   # 迴圈內重複進行的動作
199 |   result <- result + i
200 |   i <- i + 1
201 | }
202 | 
203 | result
204 | ```   
205 | 
206 | * ###**repeat-loop：**
207 | ```{r}
208 | # 計算 1+2+3+4...+135 的值是多少？
209 | i <- 1
210 | result <- 0
211 | 
212 | repeat{           # repeat和while很像，差別在於條件可以寫在任何地方，並且使用break跳出迴圈
213 |   
214 |   if(i > 135) break # 當i比135大時，用break跳出迴圈
215 |   
216 |   # 迴圈內重複進行的動作
217 |   result <- result + i
218 |   i <- i + 1
219 | }
220 | 
221 | result
222 | ```   
223 | 
224 | 
225 | * ###**break和next：**
226 | ```{r}
227 | # break 主要用來跳出迴圈
228 | for(i in c(1:5)){
229 |   
230 |   if(i == 3) break  # 當i等於3的時候，跳出迴圈
231 | 
232 |   # 迴圈內重複進行的動作
233 |   print(i)  
234 | }
235 | 
236 | 
237 | # next 主要用來省略此次迴圈的行為，直接進入下一次迴圈
238 | 
239 | for(i in c(1:5)){
240 |   
241 |   if(i == 3) next  # 當i等於3的時候，省略此次迴圈(skip)的動作，從下一個i=4開始
242 | 
243 |   # 迴圈內重複進行的動作
244 |   print(i)  
245 | }
246 | 
247 | ```   
248 | 
249 | 
250 | ------   
251 | 
252 |    
253 | #**總結**    
254 | 
255 | 流程控制的基本概念和寫法相當簡單，在資料分析的過程中是相當重要的技巧之一。   
256 | 
257 | 「條件」的判斷，釐清有哪些動作是「重複」的，或是複合式的運用，這些都是在撰寫相關的程式碼前，就需要先思考過的事情，十分需要相當清晰的邏輯思考能力。   
258 | 
259 | 除此之外，唯有多加練習、累積自己的經驗，才能達到「彈性運用、符合自己的需求」的境界。    
260 | 
261 | It's still a long way to go~   


--------------------------------------------------------------------------------
/Source-File/線性迴歸、變異數分析/.Rhistory:
--------------------------------------------------------------------------------
 1 | install.packages("car")
 2 | require(car)
 3 | install.packages("nlme")
 4 | Payment_and_value_of_care_._Hospital <- read.csv("C:/Users/123/Desktop/Payment_and_value_of_care_-_Hospital.csv")
 5 | View(Payment_and_value_of_care_._Hospital)
 6 | require(markdown)
 7 | api_id.path <- 'rpubs_conn/api_id.txt'
 8 | note.title <- 'R5'
 9 | note.html <- 'R5.html'
10 | # Update
11 | if (file.exists('rpubs_conn/api_id.txt')){
12 | print('Start Updating')
13 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = F)[, 1]
14 | # update article on Rpubs
15 | update.result <- rpubsUpload(title = note.title,
16 | htmlFile = note.html,
17 | id = api.id
18 | )
19 | browseURL(update.result$continueUrl)
20 | print('update success')
21 | update.result$continueUrl
22 | # Upload
23 | }else{
24 | print('Start Uploading')
25 | dir.create('rpubs_conn')
26 | # upload article on Rpubs
27 | upload.result <- rpubsUpload(title = note.title,
28 | htmlFile = 'R1.html'
29 | )
30 | upload.result$id
31 | write.table(upload.result$id, api_id.path, row.names = F, col.names = F)
32 | browseURL(upload.result$continueUrl)
33 | print('upload success')
34 | upload.result$continueUrl
35 | }
36 | 


--------------------------------------------------------------------------------
/Source-File/線性迴歸、變異數分析/.Rprofile:
--------------------------------------------------------------------------------
1 | options(rpubs.upload.method = "internal")


--------------------------------------------------------------------------------
/Source-File/線性迴歸、變異數分析/style.css:
--------------------------------------------------------------------------------
 1 | /* Whole document: */
 2 | body{
 3 |   font-family:  "Times New Roman";
 4 |   font-size: 14pt;
 5 | }
 6 | 
 7 | code.r{
 8 |   font-size: 14pt;
 9 |   font-family:  "Consolas";
10 | }
11 | 
12 | pre {
13 |   font-size: 14.5px;
14 |   font-family:  "Consolas";
15 | }


--------------------------------------------------------------------------------
/Source-File/線性迴歸、變異數分析/線性迴歸、變異數分析.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "DM---線性迴歸、變異數分析(ANOVA)"
  3 | author: "POLab"
  4 | date: "2017/05/19"
  5 | output:
  6 |   html_document:
  7 |       css: style.css
  8 | ---
  9 | 
 10 | <a href="https://github.com/PO-LAB/Data-Mining" target="_blank">【回到首頁】</a> 
 11 |    
 12 | ------
 13 |   
 14 | 到現在，你已經學到許多R的技巧了。可是學習歸學習，有時候如果沒有實際去操作體會，其效果僅止於「知道」而已，並沒有達到「運用」的境界。   
 15 |    
 16 | 因此，這裡將會運用之前筆記中提到過的技巧(以及介紹新技巧)，進行一次簡單、完整的資料分析，帶大家體會整個資料探勘的流程。
 17 |    
 18 | ------
 19 |    
 20 | #**1. 資料集**   
 21 |    
 22 | 這裡使用的資料，是R內建的鳶尾花(iris)資料(來自於`datasets`套件)。   
 23 |    
 24 | 先用`str()`和`head()`，查看資料裡面的狀態：
 25 | 
 26 | ```{r}
 27 | require(datasets)  # source package
 28 | str(iris)          # check structure of iris
 29 | head(iris, n=6)
 30 | ```
 31 | 
 32 | 不難看出，iris的資料筆數為150筆，共有五個欄位：  
 33 | 
 34 | 1. 花萼長度(Sepal.Length)：計算單位是公分。(連續)    
 35 | 
 36 | 2. 花萼寬度(Sepal.Width)：計算單位是公分。(連續)     
 37 | 
 38 | 3. 花瓣長度(Petal.Length) ：計算單位是公分。(連續)    
 39 | 
 40 | 4. 花瓣寬度(Petal.Width)：計算單位是公分。(連續)     
 41 | 
 42 | 5. 品種(Species)：可分為Setosa，Versicolor和Virginica。(類別)   
 43 |    
 44 |    
 45 | 此外也可以用`summary()`，看各個欄位的基本統計資訊：   
 46 | ```{r}
 47 | summary(iris)
 48 | ```
 49 |    
 50 | ------
 51 |    
 52 | #**2. 繪圖**   
 53 | 
 54 | 在進行資料分析之前，先仔細觀察資料，看看能不能從裡面找到一些隱藏資訊。   
 55 |    
 56 | 例如，花萼長度(Sepal.Length)和花萼寬度(Sepal.Width)，既然都是花萼，可能會有相關，於是畫圖來看看：
 57 | 
 58 | ```{r, message=F}
 59 | #*** 附上三種繪圖系統的程式碼，以ggplot2輸出 ***#
 60 | 
 61 | ### Base Plotting System ###
 62 |   #  plot(x=iris$Sepal.Length, y=iris$Sepal.Width,pch=2)
 63 | 
 64 | ### Lattice ###
 65 |   #  require(lattice)
 66 |   #  xyplot(Sepal.Width~Sepal.Length, data=iris)
 67 | 
 68 | ### ggplot2 ###
 69 |   require(ggplot2)
 70 |   ggplot(data=iris) +
 71 |     geom_point(aes(x=Sepal.Length,
 72 |                    y=Sepal.Width)) +
 73 |     theme_bw() 
 74 | ```   
 75 |    
 76 | 嗯...好像看不出什麼東西？   
 77 |    
 78 | 那試試看花瓣長度(Petal.Length)和花瓣寬度(Petal.Width)：
 79 | 
 80 | ```{r, message=F}
 81 | #*** 附上三種繪圖系統的程式碼，以ggplot2輸出 ***#
 82 | 
 83 | ### Base Plotting System ###
 84 |   #  plot(x=iris$Petal.Length, y=iris$Petal.Width,pch=16)
 85 | 
 86 | ### Lattice ###
 87 |   #  require(lattice)
 88 |   #  xyplot(Petal.Width~Petal.Length, data=iris)
 89 | 
 90 | ### ggplot2 ###
 91 |   require(ggplot2)
 92 |   ggplot(data=iris) +                        # 準備畫布
 93 |     geom_point(aes(x=Petal.Length,           # 散布圖
 94 |                    y=Petal.Width)) +
 95 |     theme_bw()                               # 改變主題背景成白色
 96 | ```   
 97 |    
 98 | Bingo，可以觀察出來，花瓣長度和寬度之間，存在著線性關係，而且明顯分成兩群(左下角和右上角)，推測可能和種類(Species)有關，左下角的資料可能是屬於同一種類的鳶尾花。   
 99 | 
100 | 為了確認這一點，我們在上面那張圖標上顏色：   
101 |    
102 | ```{r, message=F}
103 | #*** 附上三種繪圖系統的程式碼，以ggplot2輸出 ***#
104 | 
105 | ### Base Plotting System ###
106 |   #  plot(x=iris$Petal.Length, y=iris$Petal.Width,pch=16)
107 |   #  d1 <- iris[iris$Species=="versicolor", ]
108 |   #  points(x=d1$Petal.Length, y=d1$Petal.Width,pch=16, col="green")
109 |   #  d2 <- iris[iris$Species=="setosa", ]
110 |   #  points(x=d2$Petal.Length, y=d2$Petal.Width,pch=16, col="red")
111 |   #  d3 <- iris[iris$Species=="virginica", ]
112 |   #  points(x=d3$Petal.Length, y=d3$Petal.Width,pch=16, col="blue")
113 |   #  legend("topleft", pch=16,
114 |   #         legend=c("setosa","versicolor","virginica"), 
115 |   #         col=c("red", "green", "blue")
116 |   #        )
117 | 
118 |   
119 | ### Lattice ###
120 |   # require(lattice)
121 |   # xyplot(Petal.Width~Petal.Length, 
122 |   #        data=iris, 
123 |   #        pch=16,
124 |   #        group=Species, 
125 |   #        auto.key=list(space="top",
126 |   #                      columns=3, 
127 |   #                      cex.title=1, 
128 |   #                      title="Species Labels",
129 |   #                      pch=16)  
130 |   #       )
131 |   
132 | ### ggplot2 ###
133 |   require(ggplot2)
134 |   ggplot(data=iris) +                        # 準備畫布
135 |     geom_point(aes(x=Petal.Length,           # 散布圖
136 |                    y=Petal.Width,
137 |                    color=Species)) +         # 把不同品種的資料標上顏色
138 |     
139 |     theme_bw()                               # 改變主題背景成白色
140 | ```
141 |    
142 | 並且看不同種類的鳶尾花，長度和寬度的盒鬚圖：   
143 |      
144 | ```{r, message=F, warning=F}
145 | #*** 附上三種繪圖系統的程式碼，以ggplot2輸出 ***#  
146 |   
147 | ### Base Plotting System ###
148 |     # boxplot(Petal.Length~Species, data=iris, xlab="Species", ylab="Petal.Length")
149 |     # boxplot(Petal.Width~Species, data=iris, xlab="Species", ylab="Petal.Width")
150 |     
151 |   
152 | ### Lattice ###
153 |   # require(lattice)
154 |   # bwplot(x = Petal.Length~Petal.Width | Species, data = iris)
155 | 
156 | ### ggplot2 ###
157 |   require(ggplot2)
158 |   qplot(x=Petal.Length,      
159 |         y=Petal.Width, 
160 |         data=iris, 
161 |         geom="boxplot",    # graph type is boxplot
162 |         color=Species)
163 | ```
164 |    
165 | ------
166 |    
167 | 
168 | #**3. 資料預處理**   
169 | 資料探勘的分析過程中，「資料預處理」往往是最花時間的(佔整個流程的70~80%)。   
170 |    
171 | 根據不同的資料，預處理手法也會不一樣(改變結構、類別轉啞變數、正規化...)，而在預處理之中，最常見的莫過於「遺漏值的處理」！   
172 |    
173 | 要用R檢查資料裡是否有遺漏值的存在，需要使用`is.na()`的函式：
174 | ```{r}
175 | data <- data.frame(x=c(1,2,3,NA,5),
176 |                    y=c(4,5,3,NA,NA))
177 | data  
178 | is.na(data)        # 遺漏值的地方，標註為TRUE (TRUE/FALSE矩陣的型態)
179 | table(is.na(data)) # 資料中總共有多少個遺漏值  
180 | ```
181 | 有遺漏值的資料會影響分析結果，因此我們會採取一些手段，主要可以分為兩類「移除有遺漏值的資料」、「填補遺漏值」：
182 | ```{r, results="hide"}
183 | # 移除有遺漏值的資料，以下兩種方法都可以 #
184 | data[complete.cases(data), ] # 1.使用 complete.cases() 
185 | na.omit(data)                # 2.或是使用 na.omit()
186 | ```
187 | 
188 | ```{r}
189 | # 填補遺漏值(用平均數填值) #
190 | data[is.na(data[,"y"]), "y"] <- mean(data[,"y"], na.rm=T)
191 | data
192 | ```
193 |    
194 | 現在回到iris的資料，檢查看看裡面有沒有遺漏值：   
195 | 
196 | ```{r}
197 | table(is.na(iris))
198 | ```
199 | ...看來十分完美，沒有遺漏值，所以讓我們繼續下去吧！   
200 |    
201 |       
202 | ------
203 |    
204 |    
205 | #**4. 迴歸分析**   
206 | 回歸分析是以一個或一組自變數（解釋變數、預測變項，Xi），來預測一個數值性的因變數（依變數、應變數、被預測變項，Y）。   
207 |    
208 | 相信大家都知道，簡單迴歸表示只有一個自變數(X)；複迴歸則允許多個自變數(X)存在，而兩者的依變數(Y)都只有一個。
209 | 
210 | 要在R跑線性回歸的模型，要使用函式`lm()`(Linear Model)： **model = lm(Y ~ X1+X2+…+Xk, data=…)**    
211 |    
212 | 在這裡，我們以Sepal.Length為依變數(Y)，以Sepal.Width、Petal.Length、Petal.Width為自變數(X)，進行迴歸分析：
213 | 
214 | ```{r}
215 | model <- lm(formula= Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width,
216 |             data=iris)
217 | summary(model)
218 | ```
219 | 從報表中來看，我們可以獲得許多資訊：
220 | 
221 | * Sepal.Length = 1.85600 + 0.65084xSepal.Width + 0.70913xPetal.Length - 0.55648xPetal.Width
222 | 
223 | * 根據p-value，三個自變數(X)對Y都表示顯著。
224 | 
225 | * R-squared: 0.8586 ； Adj R-squared: 0.8557，表示模型預測能力不錯。
226 | 
227 | * Residual standard error: 0.3145
228 | 
229 | 然而，當我們建立出一個線性回歸時，必須要確認其殘差(residual)是否符合下面三個假設：
230 | 
231 | 1. 常態性(Normality)
232 | 
233 | 2. 獨立性(Independence)
234 | 
235 | 3. 變異數同質性(Homogeneity of Variance)
236 | 
237 | 故，首先我們要先從回歸模型中找到殘差的值，可以使用`names()`函式，查看回歸模型內具有的資訊：
238 | ```{r}
239 | names(model)
240 | ```
241 | 其中，**residuals**就是指殘差的值(**coefficients**代表係數)，因此我們可以取出來後進行上面三個假設的檢定：
242 | ```{r}
243 | model$residuals  # full component name of the lm object; avoids relying on `$` partial matching
244 | ```
245 | 
246 | ##**常態性**   
247 | `shapiro.test()`函式可以用來檢驗殘差的常態性：   
248 | 
249 | ```{r}
250 | shapiro.test(model$residuals)  # Shapiro-Wilk normality test on the residuals (full component name, no partial matching)
251 | ```
252 | 由於**虛無假設H0:殘差服從常態分配**，因為p-value > 0.05，代表**不會拒絕H0**。   
253 |    
254 |    
255 |    
256 | ##**獨立性**   
257 | 要檢驗殘差的獨立性，需要使用套件`car`中的`durbinWatsonTest()`函式：
258 |    
259 | ```{r,message=FALSE}
260 | require(car)
261 | # 因為這個函式會自動去抓模型中的殘差，故這裡放的是模型，而不是殘差的值
262 | durbinWatsonTest(model) 
263 | ```
264 | 由於**虛無假設H0:殘差間相互獨立**，因為p-value > 0.05，代表**不會拒絕H0**。
265 |    
266 |    
267 |    
268 |    
269 | ##**變異數同質性**   
270 | 要檢驗殘差的變異數同質性，需要使用套件`car`中的`ncvTest()`函式：
271 | 
272 | ```{r,message=FALSE}
273 | require(car)
274 | # 因為這個函式會自動去抓模型中的殘差，故這裡放的是模型，而不是殘差的值
275 | ncvTest(model)
276 | ```
277 | 由於**虛無假設H0:殘差變異數具有同質性**，因為p-value < 0.05，代表**拒絕H0**。(這表示上面的線性模型無法使用)   
278 | 
279 | 
280 | ##**預測**    
281 | 最後，我們建立模型的目的，是要用來預測！
282 | 
283 | 因此，現在我們手上有一筆新的觀測值，只有Sepal.Width、Petal.Length、Petal.Width的資訊，那我們就可以用建好的迴歸模型，預測出Sepal.Length的值，這時使用`predict()`函式：
284 | 
285 | ```{r}
286 | new.iris <- data.frame(Sepal.Width=3.456, Petal.Length=1.535, Petal.Width=0.341)
287 | new.iris
288 | predict(model, new.iris)
289 | ```  
290 |       
291 | ------
292 |    
293 |    
294 | #**5. 變異數分析(anova)**   
295 | 
296 | 經過視覺化的步驟，發現三個品種鳶尾花的Petal.Width或Petal.Length(平均數)有所差異。   
297 |    
298 | 若要用統計上的檢定，要進一步地確認，就可以使用變異數分析(anova)。   
299 |    
300 | 假設檢定的對應H0和H1分別如下：   
301 | **H0:μ(Setosa)=μ(Versicolor)=μ(Virginica)**   
302 | **H1:至少有一種平均數和其他品種不相等**   
303 |    
304 | 要用one-way-anova，R的函式是`anova()`，並且事先要跑線性回歸模型：
305 | ```{r}
306 | a.lm <- lm(Petal.Width~Species, data=iris)
307 | anova(a.lm)
308 | 
309 | b.lm <- lm(Petal.Length~Species, data=iris)
310 | anova(b.lm)
311 | ```
312 |    
313 | 兩者的p-value都遠小於0.05，表示不同品種間確實有顯著差異。
314 |    
315 | ------
316 |    
317 | #**總結**  
318 | 
319 | 完成這篇筆記後，除了複習之前的技巧之外，還學到了新的技巧：**遺漏值處理，迴歸分析，變異數分析**。   
320 |    
321 | 事實上，要學會R的各種技巧並不難！難的是當我們陸續學到許多技巧後，要如何把這些技巧靈活運用在各式各樣的資料上。同時，你需要了解R再強大，充其量不過只是一個工具而已，若沒有紮實基礎與清楚的思維，也只是在舞刀耍棍罷了，實際上是派不上用場的。
322 |    
323 | 之後，會繼續介紹各種不同的模型(決策樹、類神經網路...)，在R上怎麼操作，並且根據不同的資料，導入不同(資料)處理手法。   
324 | 
325 | It's still a long way to go~   
326 |    
327 | 
328 | 


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/.Rhistory:
--------------------------------------------------------------------------------
  1 | ?with
  2 | ?hist
  3 | ?airquality
  4 | ?hist
  5 | ?airquality
  6 | ?plot
  7 | airquality$Montj
  8 | unique(airquality$Month)
  9 | ?points
 10 | ?legend
 11 | ?blox
 12 | ?boxplot
 13 | ?install.package
 14 | require(lattice)
 15 | ?histogram
 16 | ?bwplot()
 17 | require(markdown)
 18 | api_id.path <- 'rpubs_conn/api_id.txt'
 19 | note.title <- 'R4'
 20 | note.html <- 'R4.html'
 21 | # Update
 22 | if (file.exists('rpubs_conn/api_id.txt')){
 23 | print('Start Updating')
 24 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = F)[, 1]
 25 | # update article on Rpubs
 26 | update.result <- rpubsUpload(title = note.title,
 27 | htmlFile = note.html,
 28 | id = api.id
 29 | )
 30 | browseURL(update.result$continueUrl)
 31 | print('update success')
 32 | update.result$continueUrl
 33 | # Upload
 34 | }else{
 35 | print('Start Uploading')
 36 | dir.create('rpubs_conn')
 37 | # upload article on Rpubs
 38 | upload.result <- rpubsUpload(title = note.title,
 39 | htmlFile = 'R1.html'
 40 | )
 41 | upload.result$id
 42 | write.table(upload.result$id, api_id.path, row.names = F, col.names = F)
 43 | browseURL(upload.result$continueUrl)
 44 | print('upload success')
 45 | upload.result$continueUrl
 46 | }
 47 | require(datasets)
 48 | head(airquality)
 49 | hist(x=airquality$Month,
 50 | main="Histogram of Month",         # 圖片的名稱
 51 | xlab="Month",                      # X軸的名稱
 52 | ylab="Frequency")                  # Y軸的名稱
 53 | boxplot(formula = Ozone ~ Month, # Y ~ X (代表X和Y軸要放的數值)
 54 | data = airquality,       # 資料
 55 | xlab = "Month",          # X軸名稱
 56 | ylab = "Ozone (ppb)",    # Y軸名稱
 57 | col ="gray")             # 顏色
 58 | plot(x=airquality$Month,            # X軸的值
 59 | y=airquality$Temp,             # Y軸的值
 60 | main="Month to Temperature",   # 圖片名稱
 61 | xlab="Month(1~12)",            # X軸名稱
 62 | ylab="Temperature(degrees F)") # Y軸名稱
 63 | plot(x=airquality$Ozone,      # X軸的值
 64 | y=airquality$Wind,       # Y軸的值
 65 | main="Ozone to Wind",    # 圖片名稱
 66 | xlab="Ozone(ppb)",       # X軸的名稱
 67 | ylab="Wind(mph)"         # Y軸的名稱
 68 | )
 69 | # 建立一個畫布，上面已經有一張散布圖(Ozone to Wind)
 70 | plot(x=airquality$Ozone,
 71 | y=airquality$Wind,
 72 | main="Ozone to Wind",
 73 | xlab="Ozone(ppb)",
 74 | ylab="Wind(mph)",
 75 | pch=16                  # 點的圖形
 76 | )
 77 | # 現在我們要在這張圖片中，把5月的資料點用藍色標註上去
 78 | May_data <- airquality[airquality$Month==5, ]   # 找出5月的資料
 79 | # 標上藍色的點
 80 | points(x=May_data$Ozone,
 81 | y=May_data$Wind,
 82 | pch=16,                  # 點的圖形
 83 | col="blue")              # 顏色
 84 | # 同理，也可以把8月的資料點用紅色標註上去
 85 | Aug_data <- airquality[airquality$Month==8, ]   # 找出8月的資料
 86 | # 標上紅色的點
 87 | points(x=Aug_data$Ozone,
 88 | y=Aug_data$Wind,
 89 | pch=16,               # 點的圖形
 90 | col="red")            # 顏色
 91 | # 在右上角做出標示
 92 | legend("topright",                                # 表示在右上角
 93 | pch = 1,                                   # pch代表點的圖案
 94 | col = c("blue", "red", "black"),           # col代表顏色
 95 | legend = c("May", "August", "Other Month") # 顏色所對應的名稱
 96 | )
 97 | # 我們也可以畫出回歸趨勢線
 98 | lm.model <- lm(Wind~Ozone, airquality)    # 建立一個線性回歸
 99 | # 畫上回歸的趨勢線
100 | abline(lm.model,
101 | lwd=2)     # lwd 代表線的粗細
102 | # c(1,2)，表示建立一個1x2的空間，用來呈現後續的圖
103 | par(mfrow = c(1,2))
104 | # 第一張圖
105 | plot(airquality$Wind, airquality$Ozone, main = "Wind to Ozone")
106 | # 第二張圖
107 | plot(airquality$Solar.R, airquality$Ozone, main = "Solar.R to Ozone")
108 | require(lattice)  # 如果無法執行，請先install.packages("lattice")
109 | # 先把月份變成類別變數
110 | airquality$Month <- as.factor(airquality$Month)
111 | # 繪圖
112 | histogram(x= ~ Ozone | Month,  # 根據月份(Month)的條件，繪製臭氧(Ozone)的直方圖
113 | data=airquality,
114 | xlab="Ozone(ppb)",
115 | layout=c(5,1))       # 以5x1的方式呈現圖表
116 | bwplot(x = Ozone ~ Month,      # 把Month放在X軸，Ozone放在Y軸
117 | data = airquality,
118 | xlab = "Month"
119 | )
120 | # 把Ozone放在x的值；當然，可以增加月份的條件( ~ Ozone | Month)
121 | densityplot( ~ Ozone ,
122 | data=airquality
123 | )
124 | # Wind放在Z軸，Temp和Ozone放在X和Y軸，根據Month條件分別繪圖
125 | cloud(x=Wind~Temp+Ozone | Month,
126 | data=airquality
127 | )
128 | xyplot(x=Wind~Temp,         # Wind放在Y軸，Temp放在X軸
129 | data=airquality,
130 | group = Month,       # 根據Month，把資料點用顏色區分開來
131 | # auto.key參數，表示設定標籤與其他資訊
132 | auto.key=list(space="top",          # 位置在上方
133 | columns=5,            # 1x5的方式呈現標籤
134 | title="Month Labels", # 標籤名稱
135 | cex.title=1)          # 標籤字體大小
136 | )
137 | # 目的:我們想要在散布圖中，畫出標示出中位數的線 #
138 | xyplot(x=Wind~Temp | Month,  # Wind放在Y軸，Temp放在X軸，並根據Month條件分別繪圖
139 | data=airquality,
140 | layout = c(5,1),      # 以5x1的方式呈現圖
141 | # 在這裡，我們要使用panel function，畫出中位數的線
142 | panel=function(x,y){
143 | # function的寫法，會用大括號包起來，裡面表示要進行的動作：
144 | # 在這個panel function裡面，我們進行了兩個動作
145 | panel.xyplot(x, y)                    # 1.繪製x-y的散布圖
146 | panel.abline(h = median(y), lty = 2)  # 2.標示出中位數的線段
147 | }
148 | )
149 | # 目的:我們想要在散布圖中，畫出線性回歸的趨勢線 #
150 | xyplot(x=Wind~Temp ,         # Wind放在Y軸，Temp放在X軸
151 | data=airquality,
152 | # 在這裡，我們要使用panel function，畫出線性回歸的趨勢線
153 | panel=function(x,y){
154 | # function的寫法，會用大括號包起來，裡面表示要進行的動作：
155 | # 在這個panel function裡面，我們進行了三個動作
156 | panel.fill(col="gray")         # 1.改變背景顏色(gray)
157 | panel.xyplot(x, y)             # 2.繪製x-y的散布圖
158 | panel.lmline(x, y, col="red")  # 3.畫出線性回歸的趨勢線
159 | }
160 | )
161 | require(ggplot2)
162 | qplot(x=Ozone,
163 | data=airquality,
164 | geom="histogram",             # 圖形=histogram
165 | main = "Histogram of Ozone",
166 | xlab="Ozone(ppb)",
167 | binwidth = 25,                # 每25單位為一區隔
168 | fill= Month                   # 以顏色標註月份，複合式的直方圖
169 | )
170 | qplot(x=Temp,
171 | y=Ozone,
172 | data=airquality,
173 | geom="point",                         # 圖形=scatter plot
174 | main = "Scatter Plot of Ozone-Temp",
175 | xlab="Temp",
176 | ylab="Ozone(ppb)",
177 | color= Month                          # 以顏色標註月份，複合式的散布圖
178 | )
179 | qplot(x=Temp,
180 | data=airquality,
181 | geom="density",        # 圖形=density
182 | xlab="Temp",
183 | color= Month           # 以顏色標註月份，複合式的機率密度圖
184 | )
185 | qplot(x=Month,
186 | y=Ozone,
187 | data=airquality,
188 | geom="boxplot",       # 圖形=boxplot
189 | xlab="Temp",
190 | color= Month          # 以顏色標註月份，複合式的合鬚圖
191 | )
192 | # 準備一個畫布，資料集=airquality
193 | canvas <- ggplot(data=airquality)
194 | # 方才準備的畫布
195 | canvas +
196 | # 以直方圖的圖形呈現資料
197 | geom_histogram(aes(x=Ozone,     # X 放Ozone
198 | fill=Month   # 根據月份顯示不同的顏色
199 | )
200 | )
201 | # 方才準備的畫布
202 | canvas +
203 | # 以直方圖的圖形呈現資料
204 | geom_histogram(aes(x=Ozone,
205 | fill=Month)  # 以粉紅色填滿
206 | ) +
207 | # 用facet()，分別各畫一張各月份的直方圖
208 | facet_grid(.~Month)   # 因為Month放在右邊，故圖片以水平方向呈現
209 | # 準備畫布
210 | ggplot(data=airquality) +
211 | # 散布圖對應的函式是geom_point()
212 | geom_point(aes(x=Temp,  # 用aes()，描繪散布圖內的各種屬性
213 | y=Ozone,
214 | main="Scatter Plot of Ozone-Temp",
215 | color=Month)
216 | ) +
217 | # 用geom_smooth()加上趨勢線
218 | geom_smooth(aes(x=Temp,
219 | y=Ozone)) +
220 | # 用labs()，進行文字上的標註(Annotation)
221 | labs(title="Scatter of Temp-Ozone",
222 | x="Temp",
223 | y="Ozone") +
224 | # 用theme_bw(background white)，改變主題背景成白色
225 | # 更多背景設定： http://docs.ggplot2.org/current/ggtheme.html
226 | theme_bw()
227 | ggplot(data=airquality) +
228 | # 要畫線的話，對應的函式是geom_line()
229 | geom_line(aes(x=Temp,
230 | y=Ozone,
231 | color=Month)
232 | ) +
233 | # 用labs()，進行文字上的標註(Annotation)
234 | labs(title="Line Plot of Temp-Ozone",
235 | x="Temp",
236 | y="Ozone") +
237 | theme_bw()
238 | # 準備畫布
239 | ggplot(data=airquality) +
240 | # 散布圖對應的函式是geom_point()
241 | geom_point(aes(x=Temp,
242 | y=Ozone,
243 | main="Scatter Plot of Ozone-Temp",
244 | color=Month)
245 | ) +
246 | # 要畫線的話，對應的函式是geom_line()
247 | geom_line(aes(x=Temp,
248 | y=Ozone,
249 | color=Month)
250 | ) +
251 | # 用labs()，進行文字上的標註(Annotation)
252 | labs(title="Combination of Scatter and Line Plots",
253 | x="Temp",
254 | y="Ozone") +
255 | theme_bw()
256 | # 自己定義一筆新的資料
257 | df <- data.frame(sex=c("child", "teen", "adult", "old man"),
258 | perc=c(21,53,85,8)
259 | )
260 | #準備畫布
261 | ggplot(data=df) +
262 | # 先畫bar plot
263 | geom_bar(aes(x=factor(1),
264 | y=perc,
265 | fill=sex),
266 | stat = "identity"
267 | ) +
268 | # 再沿著Y，轉軸成圓餅圖
269 | coord_polar("y", start=0)
270 | 


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/.Rprofile:
--------------------------------------------------------------------------------
1 | options(rpubs.upload.method = "internal")


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/2.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/3.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/Combination of Plots.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/Combination of Plots.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/Thumbs.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/Thumbs.db


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/style.css:
--------------------------------------------------------------------------------
 1 | /* Whole document: */
 2 | body{
 3 |   font-family:  "Times New Roman";
 4 |   font-size: 14pt;
 5 | }
 6 | 
 7 | code.r{
 8 |   font-size: 14pt;
 9 |   font-family:  "Consolas";
10 | }
11 | 
12 | pre {
13 |   font-size: 14.5px;
14 |   font-family:  "Consolas";
15 | }


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "DM---繪圖–資料視覺化"
  3 | author: "POLab"
  4 | date: "2017/05/19"
  5 | output:
  6 |   html_document:
  7 |       css: style.css
  8 | ---
  9 | 
 10 | <a href="https://github.com/PO-LAB/Data-Mining" target="_blank">【回到首頁】</a> 
 11 |    
 12 | ------
 13 | 
 14 | 經過上一篇的筆記，你已經明白<a href="https://rpubs.com/skydome20/R-Note3-function_and_package" target="_blank">R的套件與函式的概念，並且知道如何使用它們</a>。想必如今你正躍躍欲試，想趕快去找一筆實際資料，運用R語言來進行分析...這樣很好！真的！   
 15 |    
 16 | 你可能已經根據前幾篇的筆記，默默開始進行練習了！   
 17 |    
 18 | 而我們今天要來學習「更實用的技巧」，也是R在推出的時候，就一直被人大力稱讚的強大功能---**繪圖(Plotting)！**    
 19 |    
 20 | ------
 21 |    
 22 | 「繪圖」最主要目的，是把資料化為各式各樣的圖表(趨勢圖、長條圖、圓餅圖、箱型圖、複合式圖表...)，時常使用Excel的人，相信對這樣的動作並不陌生。   
 23 |    
 24 | 然而，你有仔細思考過，為什麼要把資料畫成圖表嗎？   
 25 |    
 26 | 事實上，這樣的動作有一個專有名詞，叫做**資料視覺化(Data Visualization)**---其牽涉的領域包括**藝術、設計、心理學、程式軟體...**毫無疑問，「資料視覺化」這個名詞，如今已經代表一門專業學科；在網路的社群上，也有來自各方人士的熱烈討論與研究。   
 27 |    
 28 | 然而，對理工背景的人來說，**設計，藝術或心理學**，就像另一個世界的語言；對人文背景的人來說，**電腦，程式或軟體**，則是無法觸及的神話...   
 29 |    
 30 | 不不不，千萬別這麼想。其實「資料視覺化」最基本、最主要的概念，只有一句話而已：**「降低資料的理解門檻」**。   
 31 |    
 32 | ------
 33 |    
 34 | 在2016/03/26，<a href="http://www.slideshare.net/tw_dsconf/ss-60041639" target="_blank">資料科學愛好者年會所舉辦的活動</a>上，台大土木系的康仕仲教授以一張圖來解釋這樣的概念：   
 35 | <img src="1.png" />   
 36 |    
 37 | 因此，「資料視覺化」可以在**不簡化資訊情況下**，**降低複雜資料的理解門檻**；也可以說，幫助我們**以較簡單的方式**，去**理解高維度(複雜)的資料**。   
 38 |    
 39 | ------
 40 |    
 41 | 資料分析前，我們需要**瀏覽資料(explore data)**，以理解資料的狀態；資料分析後，我們需要把**分析結果呈現**出來，使人容易理解。   
 42 |    
 43 | 你會發現，針對以上這兩種情況，「資料視覺化」便佔了很重要的地位，對吧？因此，學好視覺化的技巧，絕對是有其必要性。   
 44 |    
 45 | 在R裡面，主要有三個強大的繪圖系統(Plotting System)，可以幫助我們輕易達成「資料視覺化」的目的，以下將會一一介紹：   
 46 |    
 47 | #**1. Base Plotting System**   
 48 |    
 49 | 看到Base這個單字，就知道這是R最基本、最核心的繪圖系統。   
 50 |    
 51 | 不過，可不要以為基本的東西就比較陽春！事實上，光是這最基本的繪圖系統，就已經有許多功能強大的繪圖函式了！   
 52 |    
 53 | 以下都會拿R裡面，一個關於空氣品質的資料集`airquality`來進行示範：   
 54 | ```{r}
 55 | require(datasets)
 56 | head(airquality)
 57 | ```
 58 |    
 59 | ##‧hist()
 60 | 這是畫**直方圖(Histogram)**的函式。   
 61 | 當我們想要看資料裡面，各個月份(Month)有多少筆資料，可以這樣使用：   
 62 | ```{r}
 63 | hist(x=airquality$Month, 
 64 |      main="Histogram of Month",         # 圖片的名稱
 65 |      xlab="Month",                      # X軸的名稱
 66 |      ylab="Frequency")                  # Y軸的名稱
 67 | ```
 68 |    
 69 | ##‧boxplot()
 70 | 這是畫**盒鬚圖(Box Plot)**的函式。   
 71 | 當我們想要看資料裡面，不同月份的臭氧(Ozone)數值的分布情況，可以這樣使用：   
 72 |    
 73 | ```{r}
 74 | boxplot(formula = Ozone ~ Month, # Y ~ X (代表X和Y軸要放的數值) 
 75 |         data = airquality,       # 資料
 76 |         xlab = "Month",          # X軸名稱
 77 |         ylab = "Ozone (ppb)",    # Y軸名稱
 78 |         col ="gray")             # 顏色
 79 | ```
 80 |    
 81 |    
 82 | ##‧plot()
 83 | 這是畫**散布圖(Scatter Plot)**的函式。   
 84 | 當我們想要看月份(Month)和氣溫(Temp)之間的關係，可以這樣使用：   
 85 | ```{r}
 86 | plot(x=airquality$Month,            # X軸的值
 87 |      y=airquality$Temp,             # Y軸的值
 88 |      main="Month to Temperature",   # 圖片名稱
 89 |      xlab="Month(1~12)",            # X軸名稱
 90 |      ylab="Temperature(degrees F)") # Y軸名稱       
 91 | ```
 92 | 
 93 | 或者，我們想要看臭氧(Ozone)和風(Wind)之間的關係：   
 94 | ```{r}
 95 | plot(x=airquality$Ozone,      # X軸的值
 96 |      y=airquality$Wind,       # Y軸的值
 97 |      main="Ozone to Wind",    # 圖片名稱
 98 |      xlab="Ozone(ppb)",       # X軸的名稱
 99 |      ylab="Wind(mph)"         # Y軸的名稱
100 | )
101 | ```
102 |    
103 | 很容易，對吧？   
104 |    
105 | 事實上，當我們使用`plot()`函式時，代表我們其實在R裡面建立了一塊「畫布」(canvas)...對，就是藝術家作畫的那種畫布！   
106 |    
107 | 既然是一塊畫布，就表示我們可以隨自己的意思，在畫布上面增一筆、撇一畫，增加我們想要的東西。   
108 |    
109 | 因此基於這樣的觀念，在R裡面利用Base Plotting System繪圖時，可以分成「兩階段」：
110 | 
111 | **1. 創造一張圖(Creation of a plot)**
112 | 
113 | **2. 修飾這張圖(Annotation of a plot: adding lines, points, texts...)**   
114 |    
115 | 下面是示範的程式碼：   
116 | 
117 | ```{r, include=T}
118 | # 建立一個畫布，上面已經有一張散布圖(Ozone to Wind)
119 | plot(x=airquality$Ozone,
120 |      y=airquality$Wind,
121 |      main="Ozone to Wind",
122 |      xlab="Ozone(ppb)",
123 |      ylab="Wind(mph)",
124 |      pch=16                  # 點的圖形
125 | ) 
126 | 
127 | # 現在我們要在這張圖片中，把5月的資料點用藍色標註上去
128 | May_data <- airquality[airquality$Month==5, ]   # 找出5月的資料
129 |   # 標上藍色的點
130 | points(x=May_data$Ozone,                       
131 |        y=May_data$Wind, 
132 |        pch=16,                  # 點的圖形
133 |        col="blue")              # 顏色
134 | 
135 | # 同理，也可以把8月的資料點用紅色標註上去
136 | Aug_data <- airquality[airquality$Month==8, ]   # 找出8月的資料
137 |   # 標上紅色的點
138 | points(x=Aug_data$Ozone, 
139 |        y=Aug_data$Wind, 
140 |        pch=16,               # 點的圖形
141 |        col="red")            # 顏色
142 | 
143 | # Add a legend in the top-right corner of the existing plot
144 | legend("topright",                                # position: top-right corner
145 |        pch = 16,                                  # point symbol; 16 (filled circle) matches the symbol used by plot()/points() above
146 |        col = c("blue", "red", "black"),           # one colour per legend entry
147 |        legend = c("May", "August", "Other Month") # labels, in the same order as the colours
148 |        )
149 | 
150 | # 我們也可以畫出回歸趨勢線
151 | lm.model <- lm(Wind~Ozone, airquality)    # 建立一個線性回歸
152 |   # 畫上回歸的趨勢線
153 | abline(lm.model,                          
154 |        lwd=2)     # lwd 代表線的粗細
155 | ```
156 |    
157 | (上圖是集合所有動作的成果；建議可以在自己的電腦上，一步一步慢慢執行每一行程式，就可以看見整張圖是如何變化的。)   
158 |    
159 | ------
160 |    
161 | 你會發現，用`plot()`建立出一個散布圖(畫布)後，我們又運用一些函式，在畫布上面增添想要的資訊(e.g. `points()`=畫上點；`legend()`=作上標記；`abline()`=畫上線)   
162 |    
163 | 
164 | 當然，具有類似功能的函式不只這些，這裡列出一些常用的：   
165 |    
166 | |  函式   |         功能         |
167 | |:-------:|:--------------------:| 
168 | | plot    |  繪製散布圖(畫布)    | 
169 | | lines   |  把資料點用線連接    | 
170 | | points  |  繪製新的資料點      |    
171 | | text    |  補充文字說明在圖上  |  
172 | | title   |  補充主旨            |  
173 | | mtext   |  在圖外顯示大主旨    |  
174 | | axis    |  增加軸的labels      |  
175 | 
176 |    
177 | ------
178 |   
179 | 當然有的時候，我們會想要同時畫兩張圖，這時候可以使用`par()`函式：
180 | 
181 | ```{r}
182 | # c(1,2)，表示建立一個1x2的空間，用來呈現後續的圖
183 | par(mfrow = c(1,2)) 
184 | 
185 | # 第一張圖
186 | plot(airquality$Wind, airquality$Ozone, main = "Wind to Ozone") 
187 | # 第二張圖
188 | plot(airquality$Solar.R, airquality$Ozone, main = "Solar.R to Ozone")
189 | 
190 | ```
191 |    
192 | ------
193 |    
194 | #**2. Lattice**   
195 | 這是在R裡面第二個繪圖系統。   
196 |    
197 | 和Base Plotting System不同的是，Lattice的繪圖概念，並不是「兩階段」，而是直接一筆在圖上繪製所有的資訊。   
198 |    
199 | 這裡需要注意的是，使用Lattice函式的方式，和Base Plotting System不太一樣...在Lattice裡面，函式的使用主要是這樣的型態：   
200 | <img src="2.png" />   
201 |    
202 | R裡面有一個公式(formula)的概念，在建模或畫圖時，用來表達x和y的值，以及條件變數：  
203 | 
204 | * 在`~`左邊的，是`y`的值；在`~`右邊的，是`x`的值   
205 | 
206 | * `f`和`g`代表條件變數(condition variables)，可以省略(omit)   
207 | 
208 | * `data`的部分，則放置我們要繪圖的資料集
209 |    
210 |       
211 | ------
212 |    
213 | 在使用Lattice繪圖之前，必須匯入`lattice`套件：   
214 | ```{r, message=FALSE}
215 | require(lattice)  # 如果無法執行，請先install.packages("lattice")
216 | ```
217 |    
218 | ##‧histogram()
219 | 這是Lattice中，畫**直方圖(Histogram)**的函式。   
220 | 當我們想要看資料(airquality)中，臭氧(Ozone)在不同月份(Month)下的狀態資訊，可以這樣使用：      
221 | ```{r}   
222 | # 先把月份變成類別變數
223 | airquality$Month <- as.factor(airquality$Month) 
224 | 
225 | # 繪圖
226 | histogram(x= ~ Ozone | Month,  # 根據月份(Month)的條件，繪製臭氧(Ozone)的直方圖
227 |           data=airquality,     
228 |           xlab="Ozone(ppb)",  
229 |           layout=c(5,1))       # 以5x1的方式呈現圖表
230 |    
231 | ```  
232 |    
233 |    
234 | ##‧bwplot()   
235 | 這是畫**盒鬚圖(Box Plot)**的函式。     
236 | 當我們想要看資料裡面，不同月份的臭氧(Ozone)數值的分布情況，可以這樣使用：   
237 | (可以和`boxplot()`的圖對照)      
238 | ```{r}
239 | bwplot(x = Ozone ~ Month,      # 把Month放在X軸，Ozone放在Y軸
240 |        data = airquality,     
241 |        xlab = "Month"         
242 |        )
243 | ```
244 | 
245 | ##‧densityplot()
246 | 這是畫**機率密度圖**的函式。   
247 | 當我們想要看資料裡面，臭氧(Ozone)的機率密度分布關係，可以這樣使用：
248 | ```{r}
249 | # 把Ozone放在x的值；當然，可以增加月份的條件( ~ Ozone | Month)
250 | densityplot( ~ Ozone ,      
251 |             data=airquality
252 |             )
253 | ```
254 | 
255 | ##‧cloud()
256 | 這是畫**3D圖**的函式。   
257 | 當我們想要看資料裡面，不同月份下，臭氧(Ozone)、氣溫(Temp)和風(Wind)之間的關係，可以這樣使用：   
258 | ```{r}
259 | # Wind放在Z軸，Temp和Ozone放在X和Y軸，根據Month條件分別繪圖
260 | cloud(x=Wind~Temp+Ozone | Month, 
261 |        data=airquality         
262 |       )
263 | ```
264 | 
265 | ##‧xyplot()
266 | 這是畫**散布圖(Scatter Plot)**的函式。   
267 | 當我們想要看不同月份(Month)之間，氣溫(Temp)和風(Wind)之間的關係，可以這樣使用：   
268 | 
269 | ```{r}
270 | xyplot(x=Wind~Temp,         # Wind放在Y軸，Temp放在X軸
271 |        data=airquality,     
272 |        group = Month,       # 根據Month，把資料點用顏色區分開來  
273 |        
274 |        # auto.key參數，表示設定標籤與其他資訊
275 |        auto.key=list(space="top",          # 位置在上方 
276 |                      columns=5,            # 1x5的方式呈現標籤
277 |                      title="Month Labels", # 標籤名稱
278 |                      cex.title=1)          # 標籤字體大小
279 |        )
280 | ```   
281 |    
282 | 還記得在Base Plotting System裡面，當我們畫完散布圖後，可以再隨意添加各種資訊上去嗎？可惜的是，這在Lattice是行不通的。   
283 |    
284 | 「一次把所有資訊繪在圖上」，是Lattice的精髓。而要達成這樣的目的，我們要學習一個叫`panel function`的概念。   
285 |    
286 | 簡單來說，`panel function`是用來**控制所有發生在圖表內的資訊**，只是在寫法上會稍嫌困難一些。下面會舉兩個例子，幫助大家理解`panel function`的運用與概念：   
287 | 
288 | ```{r}
289 | # 目的:我們想要在散布圖中，畫出標示出中位數的線 #
290 | 
291 | xyplot(x=Wind~Temp | Month,  # Wind放在Y軸，Temp放在X軸，並根據Month條件分別繪圖
292 |        data=airquality,      
293 |        layout = c(5,1),      # 以5x1的方式呈現圖
294 |        
295 |        # 在這裡，我們要使用panel function，畫出中位數的線
296 |        panel=function(x,y){  
297 |            # function的寫法，會用大括號包起來，裡面表示要進行的動作：
298 |            # 在這個panel function裡面，我們進行了兩個動作
299 |            panel.xyplot(x, y)                    # 1.繪製x-y的散布圖
300 |            panel.abline(h = median(y), lty = 2)  # 2.標示出中位數的線段
301 |        }
302 |        
303 | )
304 | 
305 | ```
306 | 
307 | ```{r}
308 | # 目的:我們想要在散布圖中，畫出線性回歸的趨勢線 #
309 | 
310 | xyplot(x=Wind~Temp ,         # Wind放在Y軸，Temp放在X軸
311 |        data=airquality,     
312 |       
313 |        # 在這裡，我們要使用panel function，畫出線性回歸的趨勢線
314 |        panel=function(x,y){  
315 |            # function的寫法，會用大括號包起來，裡面表示要進行的動作：
316 |            # 在這個panel function裡面，我們進行了三個動作
317 |            panel.fill(col="gray")         # 1.改變背景顏色(gray)
318 |            panel.xyplot(x, y)             # 2.繪製x-y的散布圖
319 |            panel.lmline(x, y, col="red")  # 3.畫出線性回歸的趨勢線
320 |        }
321 |        
322 | )
323 | ```
324 |    
325 | [這個網站](http://www.magesblog.com/2012/12/changing-colours-and-legends-in-lattice.html)有更多關於Lattice繪圖的技巧(顏色、標籤...)，有興趣的歡迎參考。
326 |    
327 |    
328 | ------
329 |    
330 | #**3. ggplot2**
331 | 接下來要介紹的，是在R裡面最受歡迎的第三方繪圖套件：**ggplot2**。   
332 |    
333 | `ggplot2`的繪圖概念，又和上面兩者不一樣，是基於**Grammar of Graphics**的想法，直譯來說，就是「圖形的文法」。     
334 |    
335 | 這是一個十分抽象的概念。簡單來說，就是當你在用`ggplot2`繪圖的時候，要思考圖形(graphics)中的**「名詞(noun)」、「動詞(verb)」、「形容詞(adjective)」**...就是這樣的概念！   
336 |    
337 | 在`ggplot2`中，繪圖的文法包括兩個主要屬性：   
338 | 
339 | * Aesthetic attributes (美學表現)：包括「顏色、形狀、點的大小與線的粗細」等...   
340 | 
341 | * Geometric objects (幾何屬性)：包括「點、線、盒狀圖、直條圖」等...   
342 |    
343 | 此外，還有其他次要屬性：   
344 | 
345 | * Facets：提供在同一張圖內做多個子圖的方法，只要使用Faceting功能設定子圖分類的依據參數即可。   
346 | 
347 | * Stats：將資料做統計轉換。   
348 | 
349 | * Scales：修改點線的顏色、形狀、xy軸的範圍等   
350 | 
351 |    
352 | ------
353 |    
354 | 使用ggplot2時，需要匯入`ggplot2`的套件，並且根據以下步驟進行繪圖：   
355 |    
356 | ------
357 |    
358 | ##‧qplot()
359 | 這是`ggplot2`裡面比較簡易的函式，使用上很像`plot()`的觀念，不同的是，我們可以單純利用這個函式，改變其中`geom`的參數，就能畫出**直方圖、散布圖、盒鬚圖**...等等圖形。
360 | 
361 | ###**直方圖(histogram)**
362 | 
363 | ```{r,warning=FALSE}
364 | require(ggplot2)
365 | qplot(x=Ozone,                      
366 |       data=airquality,              
367 |       geom="histogram",             # 圖形=histogram
368 |       main = "Histogram of Ozone",  
369 |       xlab="Ozone(ppb)",            
370 |       binwidth = 25,                # 每25單位為一區隔
371 |       fill= Month                   # 以顏色標註月份，複合式的直方圖
372 |       )
373 | ```
374 | 
375 | ###**散布圖(scatter plot)**
376 | ```{r,warning=FALSE}
377 | qplot(x=Temp,                               
378 |       y=Ozone,                              
379 |       data=airquality,                      
380 |       geom="point",                         # 圖形=scatter plot
381 |       main = "Scatter Plot of Ozone-Temp",  
382 |       xlab="Temp",                          
383 |       ylab="Ozone(ppb)",                    
384 |       color= Month                          # 以顏色標註月份，複合式的散布圖
385 |       )
386 | ```
387 | 
388 | ###**機率密度圖(density plot)**
389 |    
390 | ```{r,warning=FALSE}
391 | qplot(x=Temp,                             
392 |       data=airquality,                     
393 |       geom="density",        # 圖形=density
394 |       xlab="Temp",                         
395 |       color= Month           # 以顏色標註月份，複合式的機率密度圖
396 | )
397 | 
398 | ```
399 |    
400 | ###**盒鬚圖(boxplot)**
401 | 
402 | ```{r,warning=FALSE}
403 | qplot(x=Month,              # Month on the x-axis (converted to a factor earlier in this document)
404 |       y=Ozone,              # Ozone on the y-axis
405 |       data=airquality,
406 |       geom="boxplot",       # geometry = box plot
407 |       xlab="Month",         # the x-axis label must describe the x variable (Month, not Temp)
408 |       color= Month          # outline colour follows Month, one box per month
409 | )
410 | 
411 | ```
412 |    
413 | ------
414 |    
415 | `qplot()`裡面因為有許多內建的參數，因此使用起來，會讓人覺得很像是`plot()`或Lattice的繪圖函式。   
416 |    
417 | 然而，卻也因為是內建的參數，有時候在使用`qplot()`時，會覺得有些受到限制。   
418 |    
419 | 因此，在`ggplot2`裡面，有另外一個函式叫做`ggplot()`，就是使用**「圖形的文法」**來進行繪圖。   
420 | 
421 | 比起`qplot()`，`ggplot()`需要自己進行各種設定(美學/幾何)，雖然學習上比較複雜一點，卻也因此更加有彈性(flexible)和客製化(customized)的表現。   
422 |    
423 |    
424 |    
425 | ##‧ggplot()
426 | 雖然說是「圖形的文法」，但概念上和`plot()`的兩階段流程十分相似，只是這裡變成「三階段流程」： 
427 | 
428 | 1. 準備好資料，用`ggplot()`建構出圖形的「畫布」(canvas)。   
429 | 
430 | 2. 設定**Aesthetic attributes**：使用`aes(x, y, ...)`指定。   
431 | 
432 | 3. 指定**Geometric objects**：(`geom_histogram()`、`geom_point()`、`geom_line()`、`geom_polygon()`、`geom_errorbar()`...)。   
433 |    
434 | (*關於第二點的「美學表現」，我們可以在建構畫布(`ggplot()`)時設定，也可以在決定幾何圖形(`geom()`)時設定，十分彈性。)   
435 | 
436 | 在這裡，以airquality為例子，先單純準備好一個畫布：   
437 | ```{r}
438 | # 準備一個畫布，資料集=airquality
439 | canvas <- ggplot(data=airquality)
440 | ```
441 | 由於用`ggplot()`的時候，並不會輸出圖形，只是準備好畫布(canvas)而已，因此接下來我們要用`geom()`，決定要繪製何種圖表。   
442 | 
443 | ###**直方圖(histogram)**
444 | ```{r,warning=FALSE}
445 | # 方才準備的畫布
446 | canvas +
447 |     # 以直方圖的圖形呈現資料
448 |     geom_histogram(aes(x=Ozone,     # X 放Ozone
449 |                        fill=Month   # 根據月份顯示不同的顏色   
450 |                        ) 
451 |                    )     
452 | 
453 | ```
454 | 而當我們想要每一個月份，分別畫一張直方圖的話，可以使用`facet_grid()`函式(或`facet_wrap()`)。    
455 |    
456 | 其參數的形式`vertical ~ horizontal`，表示圖片是要以垂直/水平的方向呈現。  
457 |    
458 | (更多：http://www.cookbook-r.com/Graphs/Facets_%28ggplot2%29/)
459 | ```{r,warning=FALSE}
460 | # Start from the canvas prepared earlier
461 | canvas +
462 |     # Render the data as a histogram
463 |     geom_histogram(aes(x=Ozone,
464 |                        fill=Month)  # fill colour follows Month (not one fixed colour)
465 |                    ) +
466 |     
467 |     # facet_grid() draws a separate histogram panel for each month
468 |     facet_grid(.~Month)   # Month is on the right of ~, so the panels are laid out horizontally
469 | 
470 | ```
471 | 
472 | 這裡有個觀念十分重要，我們其實是用`+`符號，來連接不同的圖層(canvas、幾何圖表、美學表現)，所有圖層最後會呈現在圖表上。   
473 | 
474 | 
475 | ###**散布圖(scatter plot)**
476 | ```{r,warning=FALSE}
477 | # Prepare the canvas
478 | ggplot(data=airquality) +
479 |     
480 |     # geom_point() is the layer that draws a scatter plot
481 |     geom_point(aes(x=Temp,  # aes() maps data columns to visual properties
482 |                    y=Ozone,
483 |                    # no main= here: it is not an aesthetic; the title is set in labs() below
484 |                    color=Month)
485 |                ) +
486 |     # geom_smooth() adds a trend line
487 |     geom_smooth(aes(x=Temp,
488 |                     y=Ozone)) +
489 |     
490 |     # labs() sets the text annotations (title and axis labels)
491 |     labs(title="Scatter of Temp-Ozone",
492 |          x="Temp",
493 |          y="Ozone") +
494 |     
495 |     # theme_bw() switches to the black-and-white theme (white panel background)
496 |     # More themes: https://ggplot2.tidyverse.org/reference/ggtheme.html
497 |     theme_bw()
498 | ```
499 | 
500 | 我們也可以改用畫線的方式，呈現資料：
501 | 
502 | ```{r, warning=F}
503 | ggplot(data=airquality) +   
504 |     
505 |     # 要畫線的話，對應的函式是geom_line()
506 |     geom_line(aes(x=Temp,  
507 |                    y=Ozone,
508 |                    color=Month) 
509 |                ) +
510 |     
511 |     # 用labs()，進行文字上的標註(Annotation)
512 |     labs(title="Line Plot of Temp-Ozone",
513 |          x="Temp",
514 |          y="Ozone") +
515 |     
516 |     theme_bw()
517 | 
518 | ```
519 | 
520 | 也可以把上面那兩張圖，合併在一起：   
521 |    
522 | ###**複合式圖表**
523 |    
524 | ```{r, warning=FALSE}
525 | # Prepare the canvas
526 | ggplot(data=airquality) +
527 |     
528 |     # geom_point() is the layer that draws a scatter plot
529 |     geom_point(aes(x=Temp,
530 |                    y=Ozone,
531 |                    # no main= here: it is not an aesthetic; the title is set in labs() below
532 |                    color=Month)
533 |                ) +
534 |     # geom_line() is the layer that draws lines
535 |     geom_line(aes(x=Temp,
536 |                    y=Ozone,
537 |                    color=Month)
538 |                ) +
539 | 
540 |     # labs() sets the text annotations (title and axis labels)
541 |     labs(title="Combination of Scatter and Line Plots",
542 |          x="Temp",
543 |          y="Ozone") +
544 |     
545 |     theme_bw()
546 | ```
547 | 
548 | 
549 | ###**圓餅圖**
550 | 要用`ggplot2`畫圓餅圖，要先畫出bar plot，再沿著y軸進行轉軸：   
551 | ```{r,warning=FALSE}
552 | # 自己定義一筆新的資料
553 | df <- data.frame(sex=c("child", "teen", "adult", "old man"),
554 |                  perc=c(21,53,85,8)
555 |                  )
556 | 
557 | #準備畫布
558 | ggplot(data=df) +
559 |     
560 |     # 先畫bar plot
561 |     geom_bar(aes(x=factor(1),
562 |                  y=perc,
563 |                  fill=sex),
564 |              stat = "identity"
565 |              ) +
566 |     
567 |     # 再沿著Y，轉軸成圓餅圖
568 |     coord_polar("y", start=0)
569 | ```
570 | 
571 | 
572 | ------
573 | 
574 | #**4. 儲存圖片成png,jpeg,bmp...**   
575 | 如今，你已經學會如何運用R的三個繪圖系統，來畫出自己想要的圖。   
576 |    
577 | 但當你用R畫完圖以後，它只存在於R裡面，無法自由存取。因此，我們必須把圖片儲存到自己的電腦上，這才是我們最終的目的。   
578 |    
579 | 要用R把圖片輸出其實很簡單，例如我們要把畫好的圖片輸出成.png的型態，那我們就在前後加上`png()`和`dev.off()`，把我們繪圖的程式碼包起來，就像這樣：   
580 | <img src="3.png" />   
581 |    
582 | (jpeg對應的`jpeg()`，bmp對應的是`bmp()`，pdf對應的是`pdf()`)   
583 |    
584 | 我們拿上面ggplot2畫的「複合式圖表」對它進行輸出，程式碼就像這樣：   
585 | ```{r, eval=FALSE}
586 | # Open a PNG graphics device: everything drawn until dev.off() goes into this file
587 | png("Combination of Plots.png", width=600, height=600)
588 |     # NOTE: inside a function or a source()'d script, wrap the plot below in print()
589 |     ## Combined chart ##
590 |     ggplot(data=airquality) +
591 |         
592 |         # geom_point() is the layer that draws a scatter plot
593 |         geom_point(aes(x=Temp,
594 |                        y=Ozone,
595 |                        # no main= here: it is not an aesthetic; the title is set in labs() below
596 |                        color=Month)
597 |         ) +
598 |         # geom_line() is the layer that draws lines
599 |         geom_line(aes(x=Temp,
600 |                       y=Ozone,
601 |                       color=Month)
602 |         ) +
603 |         
604 |         # labs() sets the text annotations (title and axis labels)
605 |         labs(title="Combination of Scatter and Line Plots",
606 |              x="Temp",
607 |              y="Ozone"
608 |         ) +
609 |         
610 |         theme_bw()
611 | 
612 | dev.off() # finally, close the graphics device so the file is written
613 | ```
614 |    
615 | ------
616 |    
617 | #**總結**   
618 |    
619 | 這篇筆記由簡到難，介紹了三種R的繪圖系統。我們可以用不同的繪圖系統，去繪製相同效果的圖表(當然，外觀上可能會不太一樣)。雖然內容很多，學起來不太容易，不過人畢竟是視覺的生物，好好掌握資料視覺化(繪圖)的技巧，在資料分析中是一項很重要的能力。   
620 | 
621 | 下一篇筆記，我們開始真正進入資料分析的過程：拿一筆實際的資料，進行一次**簡單、卻完整**的分析(包含預處理、視覺化、建模、分析報表...)。   
622 | It's still a long way to go~   
623 |   
624 | ------
625 |    
626 | #**(額外)資料視覺化的資源**
627 |    
628 | * 台灣資料科學愛好者年會的演講：<a href="http://www.slideshare.net/tw_dsconf/ss-60041639" target="_blank">[資料視覺化之理論、賞析與實作---投影片]</a>  (2016/03/26)   
629 |   
630 | * <a href="http://blog.infographics.tw/" target="_blank">[資料視覺化]</a> ，與其<a href="https://www.facebook.com/data.visualize/" target="_blank">[臉書專頁]</a>    
631 | 
632 | * <a href="https://www.facebook.com/taiwanstat/?fref=ts" target="_blank">[用數據看台灣]</a>   
633 | 
634 |    
635 | 


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-10-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-11-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-11-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-12-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-13-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-13-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-14-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-14-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-15-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-16-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-16-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-17-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-17-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-18-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-18-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-19-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-19-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-21-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-21-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-22-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-22-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-23-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-23-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-24-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-24-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-25-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-25-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-26-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-26-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-6-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-7-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化/figure-html/unnamed-chunk-9-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_cache/html/__packages:
--------------------------------------------------------------------------------
1 | base
2 | methods
3 | datasets
4 | utils
5 | grDevices
6 | graphics
7 | stats
8 | 


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_cache/html/unnamed-chunk-5_798c07e868d8e63562f4a34c63566d0e.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_cache/html/unnamed-chunk-5_798c07e868d8e63562f4a34c63566d0e.RData


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_cache/html/unnamed-chunk-5_798c07e868d8e63562f4a34c63566d0e.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_cache/html/unnamed-chunk-5_798c07e868d8e63562f4a34c63566d0e.rdb


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_cache/html/unnamed-chunk-5_798c07e868d8e63562f4a34c63566d0e.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_cache/html/unnamed-chunk-5_798c07e868d8e63562f4a34c63566d0e.rdx


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-10-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-11-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-11-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-12-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-12-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-13-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-13-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-14-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-14-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-15-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-15-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-16-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-16-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-17-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-17-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-18-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-18-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-19-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-19-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-2-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-2-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-21-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-21-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-22-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-22-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-23-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-23-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-24-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-24-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-25-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-25-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-26-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-26-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-3-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-3-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-4-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-5-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-5-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-6-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-7-1.png


--------------------------------------------------------------------------------
/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/繪圖–資料視覺化/繪圖–資料視覺化_files/figure-html/unnamed-chunk-9-1.png


--------------------------------------------------------------------------------
/Source-File/遺漏值處理/.Rhistory:
--------------------------------------------------------------------------------
  1 | nhanes2
  2 | install.packages("mice")
  3 | require(mice)
  4 | methods(mice)
  5 | ?prodNA
  6 | install.packages("missForest")
  7 | methods(mice)
  8 | ?mice
  9 | ?complete
 10 | ?mice
 11 | ?with
 12 | ?pool
 13 | require(mice)
 14 | ?mice
 15 | ?pool
 16 | install.packages("SOMbrero")
 17 | require(SOMbrero)
 18 | ?trainSOM
 19 | install.packages("missForest")
 20 | install.packages("mice")
 21 | install.packages("fviz_cluster")
 22 | install.packages("factoextra")
 23 | require(markdown)
 24 | api_id.path <- 'rpubs_conn/api_id.txt'
 25 | note.title <- 'R10'
 26 | note.html <- 'R10.html'
 27 | # Update
 28 | if (file.exists('rpubs_conn/api_id.txt')){
 29 | print('Start Updating')
 30 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = F)[, 1]
 31 | # update article on Rpubs
 32 | update.result <- rpubsUpload(title = note.title,
 33 | htmlFile = note.html,
 34 | id = api.id
 35 | )
 36 | browseURL(update.result$continueUrl)
 37 | print('update success')
 38 | update.result$continueUrl
 39 | # Upload
 40 | }else{
 41 | print('Start Uploading')
 42 | dir.create('rpubs_conn')
 43 | # upload article on Rpubs
 44 | upload.result <- rpubsUpload(title = note.title,
 45 | htmlFile = 'R1.html'
 46 | )
 47 | upload.result$id
 48 | write.table(upload.result$id, api_id.path, row.names = F, col.names = F)
 49 | browseURL(upload.result$continueUrl)
 50 | print('upload success')
 51 | upload.result$continueUrl
 52 | }
 53 | tmp <- c(1,5,8,NA,5,NA,6)
 54 | is.na(tmp)
 55 | # 計算遺漏值的個數
 56 | sum(is.na(tmp))
 57 | require(datasets)
 58 | head(airquality)
 59 | hist(x=airquality$Month,
 60 | main="Histogram of Month",         # 圖片的名稱
 61 | xlab="Month",                      # X軸的名稱
 62 | ylab="Frequency")                  # Y軸的名稱
 63 | require(datasets)
 64 | head(airquality)
 65 | hist(x=airquality$Month,
 66 | main="Histogram of Month",         # 圖片的名稱
 67 | xlab="Month",                      # X軸的名稱
 68 | ylab="Frequency")                  # Y軸的名稱
 69 | boxplot(formula = Ozone ~ Month, # Y ~ X (代表X和Y軸要放的數值)
 70 | data = airquality,       # 資料
 71 | xlab = "Month",          # X軸名稱
 72 | ylab = "Ozone (ppb)",    # Y軸名稱
 73 | col ="gray")             # 顏色
 74 | plot(x=airquality$Month,            # X軸的值
 75 | y=airquality$Temp,             # Y軸的值
 76 | main="Month to Temperature",   # 圖片名稱
 77 | xlab="Month(1~12)",            # X軸名稱
 78 | ylab="Temperature(degrees F)") # Y軸名稱
 79 | plot(x=airquality$Ozone,      # X軸的值
 80 | y=airquality$Wind,       # Y軸的值
 81 | main="Ozone to Wind",    # 圖片名稱
 82 | xlab="Ozone(ppb)",       # X軸的名稱
 83 | ylab="Wind(mph)"         # Y軸的名稱
 84 | )
 85 | # 建立一個畫布，上面已經有一張散布圖(Ozone to Wind)
 86 | plot(x=airquality$Ozone,
 87 | y=airquality$Wind,
 88 | main="Ozone to Wind",
 89 | xlab="Ozone(ppb)",
 90 | ylab="Wind(mph)",
 91 | pch=16                  # 點的圖形
 92 | )
 93 | # 現在我們要在這張圖片中，把5月的資料點用藍色標註上去
 94 | May_data <- airquality[airquality$Month==5, ]   # 找出5月的資料
 95 | # 標上藍色的點
 96 | points(x=May_data$Ozone,
 97 | y=May_data$Wind,
 98 | pch=16,                  # 點的圖形
 99 | col="blue")              # 顏色
100 | # 同理，也可以把8月的資料點用紅色標註上去
101 | Aug_data <- airquality[airquality$Month==8, ]   # 找出8月的資料
102 | # 標上紅色的點
103 | points(x=Aug_data$Ozone,
104 | y=Aug_data$Wind,
105 | pch=16,               # 點的圖形
106 | col="red")            # 顏色
107 | # 在右上角做出標示
108 | legend("topright",                                # 表示在右上角
109 | pch = 1,                                   # pch代表點的圖案
110 | col = c("blue", "red", "black"),           # col代表顏色
111 | legend = c("May", "August", "Other Month") # 顏色所對應的名稱
112 | )
113 | # 我們也可以畫出回歸趨勢線
114 | lm.model <- lm(Wind~Ozone, airquality)    # 建立一個線性回歸
115 | # 畫上回歸的趨勢線
116 | abline(lm.model,
117 | lwd=2)     # lwd 代表線的粗細
118 | # c(1,2)，表示建立一個1x2的空間，用來呈現後續的圖
119 | par(mfrow = c(1,2))
120 | # 第一張圖
121 | plot(airquality$Wind, airquality$Ozone, main = "Wind to Ozone")
122 | # 第二張圖
123 | plot(airquality$Solar.R, airquality$Ozone, main = "Solar.R to Ozone")
124 | require(lattice)  # 如果無法執行，請先install.packages("lattice")
125 | # 先把月份變成類別變數
126 | airquality$Month <- as.factor(airquality$Month)
127 | # 繪圖
128 | histogram(x= ~ Ozone | Month,  # 根據月份(Month)的條件，繪製臭氧(Ozone)的直方圖
129 | data=airquality,
130 | xlab="Ozone(ppb)",
131 | layout=c(5,1))       # 以5x1的方式呈現圖表
132 | bwplot(x = Ozone ~ Month,      # 把Month放在X軸，Ozone放在Y軸
133 | data = airquality,
134 | xlab = "Month"
135 | )
136 | # 把Ozone放在x的值；當然，可以增加月份的條件( ~ Ozone | Month)
137 | densityplot( ~ Ozone ,
138 | data=airquality
139 | )
140 | # Wind放在Z軸，Temp和Ozone放在X和Y軸，根據Month條件分別繪圖
141 | cloud(x=Wind~Temp+Ozone | Month,
142 | data=airquality
143 | )
144 | xyplot(x=Wind~Temp,         # Wind放在Y軸，Temp放在X軸
145 | data=airquality,
146 | group = Month,       # 根據Month，把資料點用顏色區分開來
147 | # auto.key參數，表示設定標籤與其他資訊
148 | auto.key=list(space="top",          # 位置在上方
149 | columns=5,            # 1x5的方式呈現標籤
150 | title="Month Labels", # 標籤名稱
151 | cex.title=1)          # 標籤字體大小
152 | )
153 | # 目的:我們想要在散布圖中，畫出標示出中位數的線 #
154 | xyplot(x=Wind~Temp | Month,  # Wind放在Y軸，Temp放在X軸，並根據Month條件分別繪圖
155 | data=airquality,
156 | layout = c(5,1),      # 以5x1的方式呈現圖
157 | # 在這裡，我們要使用panel function，畫出中位數的線
158 | panel=function(x,y){
159 | # function的寫法，會用大括號包起來，裡面表示要進行的動作：
160 | # 在這個panel function裡面，我們進行了兩個動作
161 | panel.xyplot(x, y)                    # 1.繪製x-y的散布圖
162 | panel.abline(h = median(y), lty = 2)  # 2.標示出中位數的線段
163 | }
164 | )
165 | # 目的:我們想要在散布圖中，畫出線性回歸的趨勢線 #
166 | xyplot(x=Wind~Temp ,         # Wind放在Y軸，Temp放在X軸
167 | data=airquality,
168 | # 在這裡，我們要使用panel function，畫出線性回歸的趨勢線
169 | panel=function(x,y){
170 | # function的寫法，會用大括號包起來，裡面表示要進行的動作：
171 | # 在這個panel function裡面，我們進行了三個動作
172 | panel.fill(col="gray")         # 1.改變背景顏色(gray)
173 | panel.xyplot(x, y)             # 2.繪製x-y的散布圖
174 | panel.lmline(x, y, col="red")  # 3.畫出線性回歸的趨勢線
175 | }
176 | )
177 | require(ggplot2)
178 | qplot(x=Ozone,
179 | data=airquality,
180 | geom="histogram",             # 圖形=histogram
181 | main = "Histogram of Ozone",
182 | xlab="Ozone(ppb)",
183 | binwidth = 25,                # 每25單位為一區隔
184 | fill= Month                   # 以顏色標註月份，複合式的直方圖
185 | )
186 | qplot(x=Temp,
187 | y=Ozone,
188 | data=airquality,
189 | geom="point",                         # 圖形=scatter plot
190 | main = "Scatter Plot of Ozone-Temp",
191 | xlab="Temp",
192 | ylab="Ozone(ppb)",
193 | color= Month                          # 以顏色標註月份，複合式的散布圖
194 | )
195 | qplot(x=Temp,
196 | data=airquality,
197 | geom="density",        # 圖形=density
198 | xlab="Temp",
199 | color= Month           # 以顏色標註月份，複合式的機率密度圖
200 | )
201 | qplot(x=Month,
202 | y=Ozone,
203 | data=airquality,
204 | geom="boxplot",       # 圖形=boxplot
205 | xlab="Temp",
206 | color= Month          # 以顏色標註月份，複合式的合鬚圖
207 | )
208 | # 準備一個畫布，資料集=airquality
209 | canvas <- ggplot(data=airquality)
210 | # 方才準備的畫布
211 | canvas +
212 | # 以直方圖的圖形呈現資料
213 | geom_histogram(aes(x=Ozone,     # X 放Ozone
214 | fill=Month   # 根據月份顯示不同的顏色
215 | )
216 | )
217 | # 方才準備的畫布
218 | canvas +
219 | # 以直方圖的圖形呈現資料
220 | geom_histogram(aes(x=Ozone,
221 | fill=Month)  # 以粉紅色填滿
222 | ) +
223 | # 用facet()，分別各畫一張各月份的直方圖
224 | facet_grid(.~Month)   # 因為Month放在右邊，故圖片以水平方向呈現
225 | # 準備畫布
226 | ggplot(data=airquality) +
227 | # 散布圖對應的函式是geom_point()
228 | geom_point(aes(x=Temp,  # 用aes()，描繪散布圖內的各種屬性
229 | y=Ozone,
230 | main="Scatter Plot of Ozone-Temp",
231 | color=Month)
232 | ) +
233 | # 用geom_smooth()加上趨勢線
234 | geom_smooth(aes(x=Temp,
235 | y=Ozone)) +
236 | # 用labs()，進行文字上的標註(Annotation)
237 | labs(title="Scatter of Temp-Ozone",
238 | x="Temp",
239 | y="Ozone") +
240 | # 用theme_bw(background white)，改變主題背景成白色
241 | # 更多背景設定： http://docs.ggplot2.org/current/ggtheme.html
242 | theme_bw()
243 | ggplot(data=airquality) +
244 | # 要畫線的話，對應的函式是geom_line()
245 | geom_line(aes(x=Temp,
246 | y=Ozone,
247 | color=Month)
248 | ) +
249 | # 用labs()，進行文字上的標註(Annotation)
250 | labs(title="Line Plot of Temp-Ozone",
251 | x="Temp",
252 | y="Ozone") +
253 | theme_bw()
254 | # 準備畫布
255 | ggplot(data=airquality) +
256 | # 散布圖對應的函式是geom_point()
257 | geom_point(aes(x=Temp,
258 | y=Ozone,
259 | main="Scatter Plot of Ozone-Temp",
260 | color=Month)
261 | ) +
262 | # 要畫線的話，對應的函式是geom_line()
263 | geom_line(aes(x=Temp,
264 | y=Ozone,
265 | color=Month)
266 | ) +
267 | # 用labs()，進行文字上的標註(Annotation)
268 | labs(title="Combination of Scatter and Line Plots",
269 | x="Temp",
270 | y="Ozone") +
271 | theme_bw()
272 | # 自己定義一筆新的資料
273 | df <- data.frame(sex=c("child", "teen", "adult", "old man"),
274 | perc=c(21,53,85,8)
275 | )
276 | #準備畫布
277 | ggplot(data=df) +
278 | # 先畫bar plot
279 | geom_bar(aes(x=factor(1),
280 | y=perc,
281 | fill=sex),
282 | stat = "identity"
283 | ) +
284 | # 再沿著Y，轉軸成圓餅圖
285 | coord_polar("y", start=0)
286 | install.packages("DMwR")
287 | tmp <- c(1,5,8,NA,5,NA,6)
288 | is.na(tmp)
289 | # 計算遺漏值的個數
290 | sum(is.na(tmp))
291 | require(mice)
292 | require(missForest) # prodNA() function
293 | # 在iris資料內，隨機產生10%的遺漏值
294 | data <- prodNA(iris, noNA = 0.1)
295 | head(data)
296 | # 當一筆資料是完整的，回傳TRUE；當一筆資料有遺漏值，回傳FALSE
297 | complete.cases(data)
298 | # 移除有遺漏值的資料
299 | rm.data <- data[complete.cases(data), ]
300 | # 以下用平均數，來填補某一欄位的遺漏值
301 | mean.data <- data
302 | mean.1 <- mean(mean.data[, 1], na.rm = T)  # 第一欄位的平均數
303 | na.rows <- is.na(mean.data[, 1])           # 第一欄位中，有遺漏值存在的資料
304 | # 用第一欄位的平均數，填補第一欄位的遺漏值
305 | mean.data[na.rows, 1] <- mean.1
306 | require(DMwR)
307 | imputeData <- knnImputation(data)
308 | install.packages("TTR")
309 | require(DMwR)
310 | imputeData <- knnImputation(data)
311 | imputeData
312 | tmp <- c(1,5,8,NA,5,NA,6)
313 | is.na(tmp)
314 | # 計算遺漏值的個數
315 | sum(is.na(tmp))
316 | require(mice)
317 | require(missForest) # prodNA() function
318 | # 在iris資料內，隨機產生10%的遺漏值
319 | data <- prodNA(iris, noNA = 0.1)
320 | head(data)
321 | # 當一筆資料是完整的，回傳TRUE；當一筆資料有遺漏值，回傳FALSE
322 | complete.cases(data)
323 | # 移除有遺漏值的資料
324 | rm.data <- data[complete.cases(data), ]
325 | # 以下用平均數，來填補某一欄位的遺漏值
326 | mean.data <- data
327 | mean.1 <- mean(mean.data[, 1], na.rm = T)  # 第一欄位的平均數
328 | na.rows <- is.na(mean.data[, 1])           # 第一欄位中，有遺漏值存在的資料
329 | # 用第一欄位的平均數，填補第一欄位的遺漏值
330 | mean.data[na.rows, 1] <- mean.1
331 | require(DMwR)
332 | imputeData <- knnImputation(data)
333 | imputeData
334 | mice.data <- mice(data,
335 | m = 3,           # 產生三個被填補好的資料表
336 | maxit = 50,      # max iteration
337 | method = "cart", # 使用CART決策樹，進行遺漏值預測
338 | seed = 188)      # set.seed()，令抽樣每次都一樣
339 | # 原始資料(有遺漏值)
340 | data
341 | # 填補好的資料：因為m=3，所以會有三個填補好的資料集，可以用以下方式取出
342 | complete(mice.data, 1) # 1st data
343 | complete(mice.data, 2) # 2nd data
344 | complete(mice.data, 3) # 3rd data
345 | # e.g. 拿第二個資料，作為我後續分析的資料
346 | df <- complete(mice.data, 2)
347 | # 然後以df進行線性迴歸、類神經網路、主成份分析...等等
348 | getwd()
349 | require(markdown)
350 | api_id.path <- 'rpubs_conn/api_id.txt'
351 | note.title <- 'R10'
352 | note.html <- 'R10.html'
353 | # Update
354 | if (file.exists('rpubs_conn/api_id.txt')){
355 | print('Start Updating')
356 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = F)[, 1]
357 | # update article on Rpubs
358 | update.result <- rpubsUpload(title = note.title,
359 | htmlFile = note.html,
360 | id = api.id
361 | )
362 | browseURL(update.result$continueUrl)
363 | print('update success')
364 | update.result$continueUrl
365 | # Upload
366 | }else{
367 | print('Start Uploading')
368 | dir.create('rpubs_conn')
369 | # upload article on Rpubs
370 | upload.result <- rpubsUpload(title = note.title,
371 | htmlFile = note.html
372 | )
373 | upload.result$id
374 | write.table(upload.result$id, api_id.path, row.names = F, col.names = F)
375 | browseURL(upload.result$continueUrl)
376 | print('upload success')
377 | upload.result$continueUrl
378 | }
379 | getwd()
380 | require(markdown)
381 | api_id.path <- 'rpubs_conn/api_id.txt'
382 | note.title <- 'R10'
383 | note.html <- 'R10.html'
384 | # Update
385 | if (file.exists('rpubs_conn/api_id.txt')){
386 | print('Start Updating')
387 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = F)[, 1]
388 | # update article on Rpubs
389 | update.result <- rpubsUpload(title = note.title,
390 | htmlFile = note.html,
391 | id = api.id
392 | )
393 | browseURL(update.result$continueUrl)
394 | print('update success')
395 | update.result$continueUrl
396 | # Upload
397 | }else{
398 | print('Start Uploading')
399 | dir.create('rpubs_conn')
400 | # upload article on Rpubs
401 | upload.result <- rpubsUpload(title = note.title,
402 | htmlFile = note.html
403 | )
404 | upload.result$id
405 | write.table(upload.result$id, api_id.path, row.names = F, col.names = F)
406 | browseURL(upload.result$continueUrl)
407 | print('upload success')
408 | upload.result$continueUrl
409 | }
410 | require(markdown)
411 | api_id.path <- 'rpubs_conn/api_id.txt'
412 | note.title <- 'R10'
413 | note.html <- 'R10.html'
414 | # Update
415 | if (file.exists('rpubs_conn/api_id.txt')){
416 | print('Start Updating')
417 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = F)[, 1]
418 | # update article on Rpubs
419 | update.result <- rpubsUpload(title = note.title,
420 | htmlFile = note.html,
421 | id = api.id
422 | )
423 | browseURL(update.result$continueUrl)
424 | print('update success')
425 | update.result$continueUrl
426 | # Upload
427 | }else{
428 | print('Start Uploading')
429 | dir.create('rpubs_conn')
430 | # upload article on Rpubs
431 | upload.result <- rpubsUpload(title = note.title,
432 | htmlFile = note.html
433 | )
434 | upload.result$id
435 | write.table(upload.result$id, api_id.path, row.names = F, col.names = F)
436 | browseURL(upload.result$continueUrl)
437 | print('upload success')
438 | upload.result$continueUrl
439 | }
440 | 


--------------------------------------------------------------------------------
/Source-File/遺漏值處理/.Rprofile:
--------------------------------------------------------------------------------
1 | options(rpubs.upload.method = "internal")


--------------------------------------------------------------------------------
/Source-File/遺漏值處理/style.css:
--------------------------------------------------------------------------------
 1 | /* Whole document: */
 2 | body{
 3 |   font-family:  "Times New Roman";
 4 |   font-size: 14pt;
 5 | }
 6 | 
 7 | code.r{
 8 |   font-size: 14pt;
 9 |   font-family:  "Consolas";
10 | }
11 | 
12 | pre{
13 |   font-size: 16px;
14 |   font-family:  "Times New Roman";
15 | }


--------------------------------------------------------------------------------
/Source-File/遺漏值處理/遺漏值處理.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "DM---遺漏值處理(Impute Missing Value)"
  3 | author: "POLab"
  4 | date: "2017/05/19"
  5 | output:
  6 |   html_document:
  7 |       css: style.css
  8 | ---
  9 | 
 10 | <a href="https://github.com/PO-LAB/Data-Mining" target="_blank">【回到首頁】</a> 
 11 |    
 12 | ------
 13 |   
 14 | 前面幾篇，介紹了一些常用的資料探勘模型。   
 15 | 
 16 | 不過本篇內容比較不太一樣，比較著重在「資料預處理」(或者稱**資料清洗**)的手法上。    
 17 | 
 18 | 畢竟在資料分析的流程中，其實有60~70%的時間是在進行「資料預處理」。如果沒有好的資料，後續的分析其實就可能會有很大的偏誤。   
 19 | 
 20 | 在「資料預處理」時，我們時常會遇到很多問題需要解決。當然，也有很多對應的小技巧，可以幫助我們處理這些問題。   
 21 | 
 22 | 而本篇內容，主要針對一個比較重要問題：**遺漏值(Missing Value)**，進行處理！   
 23 | 
 24 | 在R裡面，遺漏值會被表現成**NA(not available)**，而我們可以使用`is.na()`的函式，確認資料中是否有遺漏值的存在：   
 25 | 
 26 | 
 27 | ```{r}
 28 | tmp <- c(1,5,8,NA,5,NA,6)
 29 | is.na(tmp)
 30 | 
 31 | # 計算遺漏值的個數
 32 | sum(is.na(tmp))
 33 | ```      
 34 | 
 35 | 在處理遺漏值時，大多數的人都會「直接移除資料」或是用「平均值來填補遺漏值」，但這樣的做法並不推薦：前者會讓資料減少，後者不會產生任何資訊。
 36 |    
 37 |    
 38 | 因此在遺漏值處理的手法上，最推崇的就是「k-Nearest Neighbours」或「`mice`套件」來填補遺漏值。其中，mice的全名為**Multivariate Imputation via Chained Equations**。   
 39 |    
 40 | 兩者的概念很簡單，都是先用資料探勘的方法「模擬遺漏值」後，再進行「填補(impute)」。詳情會在下面介紹。   
 41 |    
 42 |    
 43 | ------
 44 |    
 45 |    
 46 | 我們先使用`iris`的資料集，讓資料中隨機產生遺漏值，再來練習剛剛介紹的處理手法：   
 47 | 
 48 | ```{r, message=FALSE}
 49 | require(missForest) # provides prodNA()
 50 | set.seed(188)       # fix the RNG so the same cells are blanked on every knit
 51 | # Randomly replace 10% of the cells of iris with NA (missing values)
 52 | data <- prodNA(iris, noNA = 0.1)
 53 | # Preview the first rows; any NA shown stands for Not-Available (a missing value)
 54 | head(data)
 55 | ```   
 56 |    
 57 | 接著介紹剛剛提及的四種處理遺漏值的手法：
 58 | 
 59 | ------
 60 |   
 61 | 
 62 | ## 1. 直接移除有遺漏值的資料   
 63 | 
 64 | ```{r}
 65 | # 當一筆資料是完整的，回傳TRUE；當一筆資料有遺漏值，回傳FALSE
 66 | complete.cases(data)
 67 | 
 68 | # 移除有遺漏值的資料
 69 | rm.data <- data[complete.cases(data), ]
 70 | ```
 71 | 
 72 | 可是這麼做不太好，因為會造成資訊損失(information loss)。   
 73 | 
 74 | 所以我們常會採取「填補遺漏值」的手法，也就是下面即將介紹的！   
 75 | 
 76 | ------
 77 |   
 78 | 
 79 | ## 2. 用「平均數」、「第一四分位數」...來填補遺漏值：   
 80 | 
 81 | 
 82 | ```{r, warning=FALSE}
 83 | # Mean imputation for a single column, done on a copy so `data` itself is left unchanged
 84 | mean.data <- data
 85 | 
 86 | mean.1 <- mean(mean.data[, 1], na.rm = TRUE)  # mean of column 1, ignoring the NAs
 87 | na.rows <- is.na(mean.data[, 1])              # TRUE for the rows where column 1 is missing
 88 | 
 89 | # Overwrite the missing entries of column 1 with that column's mean
 90 | mean.data[na.rows, 1] <- mean.1
 91 | ```
 92 | 
 93 | ------
 94 |   
 95 | 
 96 | ## 3. 用K-Nearest Neighbours填補遺漏值：   
 97 | 
 98 | K-Nearest Neighbours(KNN)運用在遺漏值填補上的想法很簡單：   
 99 |   
100 | 現在有一群學生的成績，包含國文、數學、自然，但老師不小心弄丟小明的國文考卷，於是小明的「國文」分數是遺漏值。   
101 | 
102 | 如果在不重考的狀況下，我們要給小明一個分數，該怎麼做？   
103 | 
104 | KNN的概念告訴我們，應該先看小明「數學和自然」的分數，看和哪些同學(K位)很相近，然後再拿那些同學(K位)的國文分數，取平均或加權平均(或是其他手法)後，當作小明的分數來填補。   
105 | 
106 | 一句話概括：「就是找和自己很像的K個鄰居，然後從他們身上複製自己所沒有的東西。」   
107 |    
108 | 這就是用KNN來填補遺漏值的想法。   
109 | 
110 | ```{r, message=FALSE}
111 | require(DMwR)  # provides knnImputation(); NOTE(review): confirm DMwR is still installable from CRAN
112 | imputeData <- knnImputation(data)  # fill the NAs in `data` by k-nearest-neighbour imputation, default arguments
113 | head(imputeData)                   # preview the first rows of the imputed data
114 | 
115 | ```
116 | 
117 | ------
118 | 
119 | 
120 | ## 4. 用MICE填補遺漏值：   
121 | 
122 | 在MICE裡面，提供了很多資料探勘的模型(linear regression, logistic regression, cart, random forest, bootstrap......)，來針對遺漏值進行預測！   
123 | 
124 | 概念很簡單：現在我們有欄位V1,V2,V3......Vn，每個欄位裡面都有遺漏值。   
125 | 
126 | 當我們要填補V1的遺漏值時，就先把V2,V3......Vn的欄位當作自變數(X)，把V1當作應變數(Y)，並且進行建模，然後用預測的結果來填補V1的遺漏值。   
127 | 
128 | 同理，針對V2，就用V1,V3......Vn建模，然後用預測的結果來填補V2的遺漏值。     
129 | 
130 | (由於這個函式，背後有使用Gibbs sampling(一種抽樣手法)。所以，即使使用某個模型進行遺漏值填補，也會因為抽樣手法，造成最後填補的結果有些許不同)   
131 | 
132 | ```{r, results='hide', message=FALSE}
133 | require(mice)
134 | mice.data <- mice(data,
135 |                   m = 3,           # produce three imputed copies of the data
136 |                   maxit = 50,      # maximum number of iterations
137 |                   method = "cart", # predict the missing values with CART decision trees
138 |                   seed = 188)      # seed for the random sampling, so every run gives the same result
139 | 
140 | # The original data (still containing missing values)
141 | data
142 | 
143 | # The imputed data: because m = 3 there are three completed data sets, retrieved as follows
144 | 
145 | complete(mice.data, 1) # 1st imputed data set
146 | complete(mice.data, 2) # 2nd imputed data set
147 | complete(mice.data, 3) # 3rd imputed data set
148 | ```
149 | (由於上面資料集龐大，故在此不顯示出來！)   
150 | 
151 | 現在，我們可以任取其中一個「填補好的資料」，來進行後續的建模了！   
152 | 
153 | ```{r}
154 | # e.g. 拿第二個資料，作為我後續分析的資料
155 | df <- complete(mice.data, 2)
156 | head(df)
157 | # 然後以df進行線性迴歸、類神經網路、主成份分析...等等
158 | 
159 | ```
160 |    
161 | ------   
162 | 
163 |    
164 | # **總結**    
165 | 
166 | 在資料預處理時，「遺漏值處理」是很重要的步驟，最好還是選擇「填補遺漏值」的方式，才不會造成資訊損失。   
167 | 
168 | 在R裡面，其實有提供很多強大的套件，可以幫我們處理遺漏值！   
169 | 
170 | 本篇只簡單介紹`mice`套件，網路上有神人整理出**五種處理遺漏值的強大套件**，裡面都有詳細的範例：<a href="http://www.analyticsvidhya.com/blog/2016/03/tutorial-powerful-packages-imputing-missing-values/" target="_blank">Tutorial on 5 Powerful R Packages used for imputing missing values</a>，有興趣的話可以參考！      
171 | 
172 | 
173 | It's still a long way to go~   


--------------------------------------------------------------------------------
/Source-File/關聯式規則/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/關聯式規則/1.png


--------------------------------------------------------------------------------
/Source-File/關聯式規則/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/關聯式規則/2.png


--------------------------------------------------------------------------------
/Source-File/關聯式規則/_Rprofile:
--------------------------------------------------------------------------------
1 | options(rpubs.upload.method = "internal")


--------------------------------------------------------------------------------
/Source-File/關聯式規則/style.css:
--------------------------------------------------------------------------------
 1 | /* Whole document: */
 2 | body{
 3 |   font-family:  "Times New Roman";
 4 |   font-size: 14pt;
 5 | }
 6 | 
 7 | code.r{
 8 |   font-size: 14pt;
 9 |   font-family:  "Consolas";
10 | }
11 | 
12 | pre {
13 |   font-size: 14.5px;
14 |   font-family:  "Consolas";
15 | }


--------------------------------------------------------------------------------
/Source-File/關聯式規則/titanic.raw.rdata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/關聯式規則/titanic.raw.rdata


--------------------------------------------------------------------------------
/Source-File/關聯式規則/關聯式規則.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "DM---關聯式規則(Association rule)"
  3 | author: "POLab"
  4 | date: "2017/05/19"
  5 | output:
  6 |   html_document:
  7 |       css: style.css
  8 | ---
  9 | 
 10 | <a href="https://github.com/PO-LAB/Data-Mining" target="_blank">【回到首頁】</a> 
 11 |    
 12 | ------
 13 |   
 14 | 本篇內容會介紹關聯式規則(Association rule)的資料探勘模型。
 15 | 
 16 | 首先，還是要再提醒一下，記得要更新R的版本至<a href="http://cran.csie.ntu.edu.tw/" target="_blank">【3.4.0】</a>唷！而之後會用到的`arules`套件版本為<a href="https://cran.r-project.org/web/packages/arules/index.html" target="_blank">【1.5-2】</a>
 17 | 
 18 | ------
 19 |    
 20 | # **關聯式規則(Apriori)**   
 21 |    
 22 | 這裡拿網路上一個公開資料(鐵達尼號的乘客資料)來進行分析，<a href="http://www.rdatamining.com/data/titanic.raw.rdata?attredirects=0&d=1" target="_blank">資料載點如下</a>。   
 23 | 
 24 | 下載之後，你會發現資料的型態並非熟悉的.csv。   
 25 |    
 26 | 因此我們要用**函式**把資料匯入到R裡面，使用的函式是`load()`：   
 27 | 
 28 | ```{r}
 29 | # Pass load() the path to the downloaded file; a path relative to this .Rmd works because titanic.raw.rdata sits in the same folder
 30 | load("titanic.raw.rdata")  # import the .rdata file (provides the titanic.raw object used below)
 31 | ```   
 32 |    
 33 | **(補充：如果我們要匯入.csv檔，除了之前教的"Import Dataset"方法以外，也可以用函式`read.csv()`)**   
 34 | ```{r, eval=FALSE}
 35 | data <- read.csv("C:/Users/Allan/Desktop/R_programmimg/Allan/06/data.csv")
 36 | ```
 37 |    
 38 |    
 39 | 並且用`str()`看這筆資料的狀態
 40 | ```{r}
 41 | str(titanic.raw)
 42 | ```
 43 | 
 44 | 可以發現資料裡面有四個欄位：
 45 | 
 46 | 1. Class：乘客的艙位等級  
 47 | 
 48 | 2. Sex：乘客性別    
 49 | 
 50 | 3. Age ：乘客年齡   
 51 | 
 52 | 4. Survived：沉船之後，乘客是否存活？
 53 | 
 54 |    
 55 | 關於鐵達尼號的故事，大家應該都耳熟能詳。而當我們說「女性比較容易存活」、以及「男性船員幾乎活不下來」，相信也沒人會反對吧？但是凡事講求證據~   
 56 |    
 57 | 事實上證明其實並不難，而其中一個方法，就是本篇即將用到的第一個方法：**關聯式規則(apriori)**！(對應的套件：`arules`)
 58 |    
 59 | ```{r, message=F, warning=F, results='hide'}
 60 | require(arules) # apriori關聯式規則的套件
 61 | ```
 62 | 
 63 | 還記得apriori演算法是怎麼運作的嗎？我們需要設定：
 64 | 
 65 | * 最小支持度(min support)：「規則」在資料內具有普遍性   
 66 | 
 67 | * 最小信賴度(min confidence)：「規則」要有一定的信心水準   
 68 | 
 69 |    
 70 | 而我們想要探討的規則，形式如下：「在A情況下，會存活與否」   
 71 |    
 72 | 換句話說，可以寫成**A => 存活與否**，所以把Survived這個變數放在`=>`的右手邊(right hand side)
 73 | 
 74 | ```{r, results='hide'}
 75 | # apriori rules with rhs containing "Survived" only
 76 | 
 77 | rule <- apriori(titanic.raw, 
 78 |                 # min support & confidence, 最小規則長度(lhs+rhs)
 79 |                 parameter=list(minlen=3, supp=0.1, conf=0.7),  
 80 |                 appearance = list(default="lhs",
 81 |                                   rhs=c("Survived=No", "Survived=Yes") 
 82 |                                   # 右手邊顯示的特徵
 83 |                                   )
 84 |                 )  
 85 | ```   
 86 | 
 87 | 要觀察rule需要使用`inspect()`的函式：
 88 | 
 89 | ```{r}
 90 | inspect(rule)
 91 | ```
 92 |    
 93 | 根據lift，由大到小排序rule：   
 94 | ```{r}
 95 | sort.rule <- sort(rule, by="lift")
 96 | inspect(sort.rule)
 97 | ```
 98 | 
 99 | 看第一個關聯規則：「若身分是成人女性 => 則會存活」，lift=2.3 > 1，表示預測結果比原本表現好。   
100 | 
101 | 然而，有發現到問題嗎？   
102 |    
103 | 第六個關聯規則(#編號8)「若身分是男性成人船員 => 不會存活」，對比於第五個關聯規則(#編號4)：「若身分是男性船員 => 不會存活」，其實看不到任何有用的資訊！   
104 |    
105 | 而且，第六個規則的lift <= 第五個規則的lift，   當發生這樣的情況時，我們就可以說：第六個關聯規則是**多餘的(redundant)**。   
106 |    
107 | 多餘的關聯規則，會造成分析上的雜訊，因此需要刪除它們，但該怎麼做呢？   
108 |    
109 | 首先，先看某項規則是否為其他規則的子集(subset)：
110 | ```{r}
111 | subset.matrix <- as.matrix(is.subset(x=sort.rule, y=sort.rule))
112 | ```
113 | 輸出的格式會像這樣：
114 | 
115 | <img src="2.png" />   
116 | 
117 | 上面的結果要解釋很簡單：在X的項目，如果是Y項目的子集(subset)，就會回傳TRUE。   
118 | (當你用RStudio打開**subset.matrix**這個變數時，會看見一個8x8的矩陣)
119 | 
120 | 之後再進行以下步驟：
121 | ```{r}
122 | # Blank out the lower triangle and the diagonal so only the upper triangle is kept
123 | subset.matrix[lower.tri(subset.matrix, diag=TRUE)] <- NA
124 | 
125 | # Count the TRUEs in each column; a column with at least one TRUE marks a redundant rule
126 | redundant <- colSums(subset.matrix, na.rm=TRUE) >= 1
127 | 
128 | # Drop the redundant rules
129 | sort.rule <- sort.rule[!redundant]
130 | 
131 | inspect(sort.rule)
132 | ```
133 | 經過以上步驟後，多餘的規則消失了！   
134 | 
135 | 在R裡面，關聯式規則還提供一個視覺化的套件`arulesViz`，可以觀察每條規則的三項指標分布情況：
136 | 
137 | ```{r,message=F, warning=F }
138 | require(arulesViz)
139 | plot(sort.rule)
140 | ```
141 | 
142 | 而且也可以將規則視覺化，知道當初災難發生以後，在什麼樣的條件下比較容易存活/死亡：
143 | ```{r}
144 | plot(sort.rule, method="graph", control=list(type="items"))
145 | plot(sort.rule, method="grouped")
146 | ```
147 | 
148 | 現在對照一開始的假設：**「女性比較容易存活」、「男性船員幾乎死光光」**...(好殘酷)！   
149 | 
150 | 而這些規則就是我們的證據了！   
151 | 
152 | 
153 | ------
154 |    
155 | # **總結**  
156 | 
157 | 和一些機器學習方法只會專注在「預測準確率」的概念不太一樣，關聯式規則在「解釋」上具有十分強大的優勢。   
158 | 
159 | 的確，在現實中，有時候確實只要「高的預測準確率」的模型，就可以達成許多目標。    
160 |    
161 | 可是也千萬別忘記了，我們正在進行的是「資料分析」，也就是去「解釋」資料中的故事。   
162 |    
163 | 事實上，有時候這個步驟反而會比「準確率」還重要。這一點，稍微思考一下就能理解了：唯有去探討資料的故事，才有機會發現到**有趣的跡象、失敗的原因、違背直覺的現象......**等等。   
164 | 
165 | 
166 | ------
167 |    
168 | # **Reference**  
169 | 
170 | <a href="http://www.rdatamining.com/examples/association-rules
171 | " target="_blank">http://www.rdatamining.com/examples/association-rules
172 | </a>   
173 |    
174 | 
175 | <a href="https://www.kaggle.com/c/titanic
176 | " target="_blank">https://www.kaggle.com/c/titanic
177 | </a>   
178 | 
179 | ------
180 | 


--------------------------------------------------------------------------------
/Source-File/類神經網路/.Rhistory:
--------------------------------------------------------------------------------
 1 | install.packages("neuralnet")
 2 | require(markdown)
 3 | api_id.path <- 'rpubs_conn/api_id.txt'
 4 | note.title <- 'R8'
 5 | note.html <- 'R8.html'
 6 | # Update
 7 | if (file.exists('rpubs_conn/api_id.txt')){
 8 | print('Start Updating')
 9 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = F)[, 1]
10 | # update article on Rpubs
11 | update.result <- rpubsUpload(title = note.title,
12 | htmlFile = note.html,
13 | id = api.id
14 | )
15 | browseURL(update.result$continueUrl)
16 | print('update success')
17 | update.result$continueUrl
18 | # Upload
19 | }else{
20 | print('Start Uploading')
21 | dir.create('rpubs_conn')
22 | # upload article on Rpubs
23 | upload.result <- rpubsUpload(title = note.title,
24 | htmlFile = 'R1.html'
25 | )
26 | upload.result$id
27 | write.table(upload.result$id, api_id.path, row.names = F, col.names = F)
28 | browseURL(upload.result$continueUrl)
29 | print('upload success')
30 | upload.result$continueUrl
31 | }
32 | require(markdown)
33 | api_id.path <- 'rpubs_conn/api_id.txt'
34 | note.title <- 'R8'
35 | note.html <- 'R8.html'
36 | # Update
37 | if (file.exists('rpubs_conn/api_id.txt')){
38 | print('Start Updating')
39 | api.id <- read.table(api_id.path, nrows=1, stringsAsFactors = F)[, 1]
40 | # update article on Rpubs
41 | update.result <- rpubsUpload(title = note.title,
42 | htmlFile = note.html,
43 | id = api.id
44 | )
45 | browseURL(update.result$continueUrl)
46 | print('update success')
47 | update.result$continueUrl
48 | # Upload
49 | }else{
50 | print('Start Uploading')
51 | dir.create('rpubs_conn')
52 | # upload article on Rpubs
53 | upload.result <- rpubsUpload(title = note.title,
54 | htmlFile = 'R1.html'
55 | )
56 | upload.result$id
57 | write.table(upload.result$id, api_id.path, row.names = F, col.names = F)
58 | browseURL(upload.result$continueUrl)
59 | print('upload success')
60 | upload.result$continueUrl
61 | }
62 | # 記得要給定資料所在的路徑(path)，例如：我把下載的資料放在C槽下：
63 | load("titanic.raw.rdata")  #匯入.rdata檔
64 | data <- read.csv("C:/data.csv")
65 | require(arules) # apriori關聯式法則的套件
66 | rule <- apriori(titanic.raw,
67 | # min support & confidence, 最小規則長度(lhs+rhs)
68 | parameter=list(minlen=3, supp=0.1, conf=0.7),
69 | appearance = list(default="lhs",
70 | rhs=c("Survived=No", "Survived=Yes")
71 | # 右手邊顯示的特徵
72 | )
73 | )
74 | inspect(rule)
75 | load("titanic.raw.rdata")  #匯入.rdata檔
76 | load("titanic.raw.rdata")  #匯入.rdata檔
77 | getwd(*)
78 | getwd()
79 | 


--------------------------------------------------------------------------------
/Source-File/類神經網路/.Rprofile:
--------------------------------------------------------------------------------
1 | options(rpubs.upload.method = "internal")


--------------------------------------------------------------------------------
/Source-File/類神經網路/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/類神經網路/1.png


--------------------------------------------------------------------------------
/Source-File/類神經網路/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/類神經網路/2.png


--------------------------------------------------------------------------------
/Source-File/類神經網路/style.css:
--------------------------------------------------------------------------------
 1 | /* Whole document: */
 2 | body{
 3 |   font-family:  "Times New Roman";
 4 |   font-size: 14pt;
 5 | }
 6 | 
 7 | code.r{
 8 |   font-size: 14pt;
 9 |   font-family:  "Consolas";
10 | }
11 | 
12 | pre {
13 |   font-size: 14.5px;
14 |   font-family:  "Consolas";
15 | }


--------------------------------------------------------------------------------
/Source-File/類神經網路/類神經網路.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "DM---類神經網路(BPN)"
  3 | author: "POLab"
  4 | date: "2017/05/19"
  5 | output:
  6 |   html_document:
  7 |       css: style.css
  8 | ---
  9 | 
 10 | <a href="https://github.com/PO-LAB/Data-Mining" target="_blank">【回到首頁】</a> 
 11 |    
 12 | ------  
 13 |   
 14 | 本篇內容，會繼續介紹一些常用的資料探勘模型：   
 15 |    
 16 | ------
 17 |    
 18 | 
 19 | 要使用(倒傳遞)類神經網路，R提供一個可以設定「多個隱藏層」的套件，叫做`neuralnet`。    
 20 | 
 21 | 然而，當使用類神經網路時，有一個議題十分重要，那就是我們究竟該決定「多少隱藏層和節點」？   
 22 | 
 23 | 理論上來說，我們會針對層數和節點數進行調整，看怎麼樣的組合會有最小的MSE(RMSE)，這樣的動作叫做**tune parameters**。   
 24 | 
 25 | 幸好，R提供一個套件`caret`，可以協助我們達成這樣的目的；否則的話，我們就需要自己撰寫迴圈(loop)和判斷式(if-else)，那會是一個十分複雜且龐大的工程。   
 26 | 
 27 | 接下來，會以R內建的iris資料，進行「倒傳遞類神經網路(bpn)」的示範：
 28 | 
 29 | # **(倒傳遞)類神經網路(Artificial Neural Network)**   
 30 | 
 31 | 首先，以下是必須安裝的套件：
 32 | ```{r, message=FALSE, warning=FALSE}
 33 | require(neuralnet) # for neuralnet(), nn model
 34 | require(nnet)      # for class.ind()
 35 | require(caret)     # for train(), tune parameters
 36 | 
 37 | ```
 38 | 
 39 | 很直觀的，**Sepal.Length、Sepal.Width、Petal.Length、Petal.Width**會是input nodes，而**Species**是output node。
 40 | 
 41 | 然而，由於**Species**是類別變數(也就是「分類」的問題)，類神經網路無法直接處理。   
 42 | 
 43 | 因此這個時候，必須先將**Species**，轉變成啞變數(dummy variables)的型態。
 44 | 
 45 | ```{r}
 46 | data <- iris
 47 | 
 48 | # 因為Species是類別型態，這邊轉換成三個output nodes，使用的是class.ind函式()
 49 | head(class.ind(data$Species))
 50 | 
 51 | # 並和原始的資料合併在一起，cbind意即column-bind
 52 | data <- cbind(data, class.ind(data$Species))
 53 | 
 54 | # 原始資料就會變成像這樣
 55 | head(data)
 56 | ```
 57 | 
 58 | 而在建構formula時，就可以寫成**setosa + versicolor + virginica ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width**。
 59 | 
 60 | ```{r}
 61 | formula.bpn <- setosa + versicolor + virginica ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
 62 | ```
 63 | 
 64 | 在訓練bpn模型的時候，使用的是`neuralnet()`函式：
 65 | 
 66 | ```{r}
 67 | bpn <- neuralnet(formula = formula.bpn, 
 68 |                   data = data,
 69 |                   hidden = c(2),       # one hidden layer with 2 nodes
 70 |                   learningrate = 0.01, # learning rate; NOTE(review): neuralnet documents this as used only by algorithm = "backprop" — confirm it has any effect here
 71 |                   threshold = 0.01,    # stopping criterion: threshold on the partial derivatives of the error function
 72 |                   stepmax = 5e5        # maximum number of training steps = 500000 (5*10^5)
 73 | 
 74 |                   )
 75 | 
 76 | # Draw the fitted network
 77 | plot(bpn)
 78 | ```
 79 | <img src="1.png">   
 80 | 
 81 | 基本上，這就是一個類神經網路的模型。
 82 | 
 83 | ------
 84 | 
 85 | # **Tuning Parameters**   
 86 | 
 87 | 當使用不同的隱藏層數和節點數，類神經網路的模型表現與可靠度就會改變。   
 88 | 
 89 | 基本上，當遇到需要tuning parameters的問題時，就會需要觀察不同參數組合的MSE(RMSE)；當MSE最小的情況發生時，我們就可以視為是最佳的參數組合(optimal parameters)。   
 90 | 
 91 | 在R裡面，`caret`是十分強大的套件，許多需要tune parameters的問題都可以靠它來解決，而最常用的函式就是`train()`。   
 92 | 
 93 | 在繼續做下去之前，我們先把原始的資料集，分成80%的train set和20%的test set。   
 94 | 使用的手法十分簡單：可以想像現在手上資料有一百筆，那我們就隨機從裡面開始抽樣，隨機抽出80筆當成train set，剩下20筆當作test set。   
 95 | 
 96 | 以下就是在做上面的動作：   
 97 | 
 98 | ```{r}
 99 | # Training-set size: 80% of the number of rows in data, rounded down
100 | smp.size <- floor(nrow(data) * 0.8)
101 | # Fix the random seed so that the sampled split is the same on every run
102 | set.seed(131)
103 | # Randomly pick smp.size row indices to form the training set
104 | train.ind <- sample(x = seq_len(nrow(data)), size = smp.size)
105 | # Sampled rows become train; every remaining row becomes test
106 | train <- data[train.ind, ]
107 | test <- data[-train.ind, ]
108 | ```
109 | 
110 | 然後我們根據train set，來進行tune parameters。   
111 | 
112 | (註：下面的code實際上會運行比較長的時間，當使用不同的資料集時，有時候可能會跑數天以上，需要特別留意。)   
113 | 
114 | ```{r, warning=FALSE, cache=TRUE}
115 | # Tune the network size with caret
116 | model <- train(form=formula.bpn,     # model formula
117 |                data=train,           # training set only
118 |                method="neuralnet",   # caret's neuralnet method (bpn)
119 |                # NOTE(review): train() builds the response through model.frame(), so the left-hand side "setosa + versicolor + virginica" is presumably evaluated as an arithmetic sum (always 1 for indicator columns) rather than as three outputs -- verify before trusting the RMSE values
120 |                # Key step: try every combination in the grid (first layer 1-4 nodes; second layer 0-4 nodes; third layer fixed at 0)
121 |                # and see which number of hidden layers / nodes per layer gives the smallest RMSE
122 |                tuneGrid = expand.grid(.layer1=c(1:4), .layer2=c(0:4), .layer3=c(0)),               
123 |                
124 |                # The remaining arguments use the same values as the neuralnet() call earlier in this document
125 |                learningrate = 0.01,  # learning rate
126 |                threshold = 0.01,     # stopping criterion: threshold on the partial derivatives of the error function
127 |                stepmax = 5e5         # maximum number of training steps = 500000 (5*10^5)
128 |                )
129 | 
130 | # Printing the model reports the best combination; the original author's run selected 1 node in the first hidden layer and 2 nodes in the second
131 | model
132 | 
133 | # Plot RMSE against the parameter combinations
134 | plot(model)
135 | ```
136 |    
137 | 所以我們就以兩層隱藏層(1,2)，重新訓練類神經網路模型：
138 | 
139 | ```{r}
140 | bpn <- neuralnet(formula = formula.bpn, 
141 |                   data = train,        # training set only
142 |                   hidden = c(1,2),     # first hidden layer: 1 node; second hidden layer: 2 nodes
143 |                   learningrate = 0.01, # learning rate; NOTE(review): the neuralnet docs say this is only used by algorithm = "backprop", which is not set here -- confirm it has any effect
144 |                   threshold = 0.01,    # stopping criterion: threshold on the partial derivatives of the error function
145 |                   stepmax = 5e5        # maximum number of training steps = 500000 (5*10^5)
146 | 
147 |                   )
148 | 
149 | # Draw the re-trained bpn model
150 | plot(bpn)
151 | ```
152 | 
153 | <img src="2.png">   
154 | 
155 | 
156 | ------
157 | 
158 | 
159 | # **預測**   
160 | 
161 | 接下來，就用訓練好的模型(bpn)預測test set：
162 | 
163 | ```{r}
164 | # Predict the test set with the bpn model
165 | # The data passed in may contain only the input-node values,
166 | # so only the first four columns of test are handed to the model
167 | pred <- compute(bpn, test[, 1:4])  
168 | 
169 | # Raw network outputs
170 | pred$net.result
171 | 
172 | # Round the outputs to get 0/1 indicators (NOTE(review): rounding is not guaranteed to leave exactly one 1 per row -- confirm on the actual output)
173 | pred.result <- round(pred$net.result)
174 | pred.result
175 | 
176 | # Convert the result to a data frame
177 | pred.result <- as.data.frame(pred.result)
178 | 
179 | ```
180 | 
181 | 把預測結果轉回Species的型態：
182 | 
183 | ```{r}
184 | # Add a Species column; rows whose rounded outputs contain no 1 keep ""
185 | pred.result$Species <- ""
186 | 
187 | # Map the indicator columns back to species labels (column order follows the
188 | # formula: 1 = setosa, 2 = versicolor, 3 = virginica). Vectorised instead of a
189 | # row-by-row loop; if several columns are 1, the later column still wins.
190 | pred.result$Species[pred.result[, 1] == 1] <- "setosa"
191 | pred.result$Species[pred.result[, 2] == 1] <- "versicolor"
192 | pred.result$Species[pred.result[, 3] == 1] <- "virginica"
193 | 
194 | pred.result
195 | 
196 | ```
197 | 
198 | 接下來，看實際值和預測結果的差異：
199 | 
200 | ```{r}
201 | # Confusion matrix of actual vs. predicted species (the original author reports 96.67% accuracy)
202 | table(real    = test$Species, 
203 |       predict = pred.result$Species)
204 | 
205 | ```
206 | 
207 | 
208 | ------
209 |    
210 | # **總結**  
211 | 
212 | 類神經網路是一個很強大的方法，屬於機器學習的範疇，因此在預測上有很好的效果，可是最大的問題則是難以解釋。   
213 |    
214 | 在資工的領域中，類神經網路是人工智慧(機器學習)的一個分支；而具有多層隱藏層的類神經網路，則屬於深度學習(deep learning)的範疇。   
215 |    
216 | 
217 | 最近世界知名的AlphaGo(Google的人工智慧)，其內部結構，就是一個多達<a href="http://www.bnext.com.tw/article/view/id/38923" target="_blank">十三層隱藏層的類神經網路</a>。  
218 | 
219 | It's still a long way to go~   
220 |    
221 | 
222 | 
223 | 


--------------------------------------------------------------------------------
/Source-File/類神經網路/類神經網路_cache/html/__packages:
--------------------------------------------------------------------------------
 1 | base
 2 | methods
 3 | datasets
 4 | utils
 5 | grDevices
 6 | graphics
 7 | stats
 8 | neuralnet
 9 | nnet
10 | lattice
11 | ggplot2
12 | caret
13 | 


--------------------------------------------------------------------------------
/Source-File/類神經網路/類神經網路_cache/html/unnamed-chunk-6_137c0ec415d12c537aa5eefed4d30894.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/類神經網路/類神經網路_cache/html/unnamed-chunk-6_137c0ec415d12c537aa5eefed4d30894.RData


--------------------------------------------------------------------------------
/Source-File/類神經網路/類神經網路_cache/html/unnamed-chunk-6_137c0ec415d12c537aa5eefed4d30894.rdb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/類神經網路/類神經網路_cache/html/unnamed-chunk-6_137c0ec415d12c537aa5eefed4d30894.rdb


--------------------------------------------------------------------------------
/Source-File/類神經網路/類神經網路_cache/html/unnamed-chunk-6_137c0ec415d12c537aa5eefed4d30894.rdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/類神經網路/類神經網路_cache/html/unnamed-chunk-6_137c0ec415d12c537aa5eefed4d30894.rdx


--------------------------------------------------------------------------------
/Source-File/類神經網路/類神經網路_files/figure-html/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PO-LAB/Data-Mining/c4a6c3c64591502464fed49357f78d3feda643ac/Source-File/類神經網路/類神經網路_files/figure-html/unnamed-chunk-6-1.png


--------------------------------------------------------------------------------