├── .gitignore
├── 01-data-frames
│   ├── R-data-frames.Rmd
│   ├── R-data-frames.md
│   ├── R-data-frames_files
│   │   └── figure-html
│   │       ├── unnamed-chunk-31-1.png
│   │       ├── unnamed-chunk-32-1.png
│   │       ├── unnamed-chunk-35-1.png
│   │       ├── unnamed-chunk-36-1.png
│   │       ├── unnamed-chunk-38-1.png
│   │       ├── unnamed-chunk-40-1.png
│   │       ├── unnamed-chunk-42-1.png
│   │       ├── unnamed-chunk-45-1.png
│   │       ├── unnamed-chunk-46-1.png
│   │       └── unnamed-chunk-49-1.png
│   ├── README.md
│   ├── python-data-frames.ipynb
│   ├── python-data-frames
│   │   ├── output_110_1.png
│   │   ├── output_116_1.png
│   │   ├── output_138_1.png
│   │   ├── output_144_1.png
│   │   ├── output_146_1.png
│   │   ├── output_150_1.png
│   │   ├── output_68_2.png
│   │   ├── output_70_1.png
│   │   ├── output_78_1.png
│   │   ├── output_80_1.png
│   │   └── python-data-frames.md
│   ├── tb_deaths_100.csv
│   ├── tb_existing_100.csv
│   └── tb_new_100.csv
├── 02-exploratory-data-analysis
│   └── README.md
├── 03-dimensionality-reduction-and-clustering
│   ├── README.md
│   ├── dimensionality-clustering_files
│   │   └── figure-html
│   │       ├── unnamed-chunk-10-1.png
│   │       ├── unnamed-chunk-11-1.png
│   │       ├── unnamed-chunk-12-1.png
│   │       ├── unnamed-chunk-13-1.png
│   │       ├── unnamed-chunk-14-1.png
│   │       ├── unnamed-chunk-15-1.png
│   │       ├── unnamed-chunk-16-1.png
│   │       ├── unnamed-chunk-23-1.png
│   │       ├── unnamed-chunk-27-1.png
│   │       ├── unnamed-chunk-28-1.png
│   │       ├── unnamed-chunk-3-1.png
│   │       ├── unnamed-chunk-5-1.png
│   │       ├── unnamed-chunk-6-1.png
│   │       ├── unnamed-chunk-7-1.png
│   │       ├── unnamed-chunk-8-1.png
│   │       └── unnamed-chunk-9-1.png
│   ├── dimensionality-reduction-clustering-python.ipynb
│   ├── dimensionality-reduction-clustering-r.Rmd
│   ├── dimensionality-reduction-clustering_files
│   │   ├── dimensionality-reduction-clustering_24_0.png
│   │   ├── dimensionality-reduction-clustering_28_0.png
│   │   ├── dimensionality-reduction-clustering_30_1.png
│   │   ├── dimensionality-reduction-clustering_33_1.png
│   │   └── dimensionality-reduction-clustering_43_1.png
│   └── tb_existing_100.csv
├── 04-sentiment-analysis
│   ├── README.md
│   ├── original_test_data.csv
│   ├── original_train_data.csv
│   ├── sentiment-analysis-R.Rmd
│   ├── sentiment-analysis-py.ipynb
│   ├── test_data.csv
│   └── train_data.tsv
├── 05-regularisation
│   ├── regularisation-py.ipynb
│   └── tb_new_100.csv
├── LICENSE
├── README.md
└── apps
    ├── information-retrieval
    │   ├── README.md
    │   ├── Vector Space.ipynb
    │   ├── Vector Space.md
    │   ├── tf-idf.ipynb
    │   └── tf-idf.md
    ├── kaggle-analytics-edge-15
    │   ├── Competition_TextData.R
    │   ├── README.md
    │   ├── add_corpus_abstract.R
    │   ├── add_corpus_all.R
    │   ├── add_corpus_headline.R
    │   ├── add_corpus_snippet.R
    │   ├── data
    │   │   ├── NYTimesBlogTest.csv
    │   │   ├── NYTimesBlogTrain.csv
    │   │   └── SampleSubmission.csv
    │   ├── explore.R
    │   ├── loader.R
    │   ├── main.R
    │   ├── results
    │   │   ├── SubmissionHeadlineLog.csv
    │   │   ├── SubmissionHeadlineRF.csv
    │   │   ├── SubmissionMetaTextDateRF.csv
    │   │   ├── SubmissionMetaTextRF.csv
    │   │   ├── SubmissionRF_3corpora_10000.csv
    │   │   ├── SubmissionRF_all_corpora_10000.csv
    │   │   ├── SubmissionRF_all_corpora_500.csv
    │   │   ├── SubmissionRF_eval.csv
    │   │   ├── SubmissionRF_glmterms995-2_nt10000.csv
    │   │   └── SubmissionRF_nt10000.csv
    │   ├── split_eval.R
    │   ├── train_glm.R
    │   └── train_random_forest.R
    ├── sentimentclassifier
    │   ├── GUI.png
    │   ├── README.md
    │   ├── genesis.txt
    │   ├── luther.txt
    │   ├── ratm.txt
    │   ├── rem.txt
    │   ├── revelation_john.txt
    │   ├── server.R
    │   ├── server_reactions.png
    │   ├── train_data.tsv
    │   └── ui.R
    ├── wine-quality-data-analysis
    │   ├── README.Rmd
    │   ├── README.html
    │   ├── README.md
    │   ├── README_files
    │   │   └── figure-html
    │   │       ├── unnamed-chunk-11-1.png
    │   │       ├── unnamed-chunk-11.png
    │   │       ├── unnamed-chunk-12-1.png
    │   │       ├── unnamed-chunk-12.png
    │   │       ├── unnamed-chunk-15-1.png
    │   │       ├── unnamed-chunk-15.png
    │   │       ├── unnamed-chunk-16-1.png
    │   │       ├── unnamed-chunk-16.png
    │   │       ├── unnamed-chunk-19-1.png
    │   │       ├── unnamed-chunk-19.png
    │   │       ├── unnamed-chunk-20-1.png
    │   │       ├── unnamed-chunk-20.png
    │   │       ├── unnamed-chunk-23-1.png
    │   │       ├── unnamed-chunk-23.png
    │   │       ├── unnamed-chunk-24-1.png
    │   │       ├── unnamed-chunk-24.png
    │   │       ├── unnamed-chunk-25-1.png
    │   │       ├── unnamed-chunk-25.png
    │   │       ├── unnamed-chunk-27-1.png
    │   │       ├── unnamed-chunk-27.png
    │   │       ├── unnamed-chunk-28-1.png
    │   │       ├── unnamed-chunk-28.png
    │   │       ├── unnamed-chunk-7-1.png
    │   │       └── unnamed-chunk-7.png
    │   ├── data
    │   │   ├── wineQualityReds.csv
    │   │   └── wineQualityWhites.csv
    │   └── figure
    │       ├── unnamed-chunk-11.png
    │       ├── unnamed-chunk-12.png
    │       ├── unnamed-chunk-15.png
    │       ├── unnamed-chunk-16.png
    │       ├── unnamed-chunk-19.png
    │       ├── unnamed-chunk-20.png
    │       ├── unnamed-chunk-23.png
    │       ├── unnamed-chunk-24.png
    │       ├── unnamed-chunk-25.png
    │       ├── unnamed-chunk-27.png
    │       ├── unnamed-chunk-28.png
    │       └── unnamed-chunk-7.png
    └── winerama
        └── README.md
/.gitignore:
--------------------------------------------------------------------------------
ipython_notebook_spark.out
*.gz
*.pyc
**/.ipynb_checkpoints/
.ipynb_checkpoints/
**/metastore_db/
**/derby.log
**/rsconnect/
--------------------------------------------------------------------------------
/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-31-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-31-1.png
--------------------------------------------------------------------------------
/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-32-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-32-1.png
--------------------------------------------------------------------------------
/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-35-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-35-1.png
--------------------------------------------------------------------------------
/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-36-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-36-1.png
--------------------------------------------------------------------------------
/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-38-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-38-1.png
--------------------------------------------------------------------------------
/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-40-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-40-1.png
--------------------------------------------------------------------------------
/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-42-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-42-1.png
--------------------------------------------------------------------------------
/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-45-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-45-1.png
--------------------------------------------------------------------------------
/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-46-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-46-1.png
--------------------------------------------------------------------------------
/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-49-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/R-data-frames_files/figure-html/unnamed-chunk-49-1.png
--------------------------------------------------------------------------------
/01-data-frames/python-data-frames/output_110_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/python-data-frames/output_110_1.png
--------------------------------------------------------------------------------
/01-data-frames/python-data-frames/output_116_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/python-data-frames/output_116_1.png
--------------------------------------------------------------------------------
/01-data-frames/python-data-frames/output_138_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/python-data-frames/output_138_1.png
--------------------------------------------------------------------------------
/01-data-frames/python-data-frames/output_144_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/python-data-frames/output_144_1.png
--------------------------------------------------------------------------------
/01-data-frames/python-data-frames/output_146_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/python-data-frames/output_146_1.png
--------------------------------------------------------------------------------
/01-data-frames/python-data-frames/output_150_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/python-data-frames/output_150_1.png
--------------------------------------------------------------------------------
/01-data-frames/python-data-frames/output_68_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/python-data-frames/output_68_2.png
--------------------------------------------------------------------------------
/01-data-frames/python-data-frames/output_70_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/python-data-frames/output_70_1.png
--------------------------------------------------------------------------------
/01-data-frames/python-data-frames/output_78_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/python-data-frames/output_78_1.png
--------------------------------------------------------------------------------
/01-data-frames/python-data-frames/output_80_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/01-data-frames/python-data-frames/output_80_1.png
--------------------------------------------------------------------------------
/01-data-frames/tb_deaths_100.csv:
--------------------------------------------------------------------------------
"TB mortality, all forms (per 100 000 population per year)",1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007
Afghanistan,50,49,48,47,47,46,46,45,43,43,41,39,36,37,34,33,31,30
Albania,4,4,4,4,4,5,5,5,5,5,4,4,4,4,4,4,3,3
Algeria,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
American Samoa,5,2,0,2,2,2,0,9,2,1,1,1,1,0,1,1,1,0
Andorra,4,4,4,4,3,3,3,2,3,3,2,3,3,2,2,2,2,2
Angola,60,60,61,62,63,64,66,48,56,53,75,49,34,30,37,39,29,33
Anguilla,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
Antigua and Barbuda,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Argentina,8,8,7,7,7,7,6,6,6,6,5,6,5,5,5,5,5,4
Armenia,5,5,5,6,7,8,9,9,9,11,12,12,12,12,11,10,10,10
Australia,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Austria,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1
Azerbaijan,5,5,5,6,6,7,8,8,9,9,10,10,10,11,10,10,10,10
Bahamas,11,11,11,11,11,11,11,10,11,8,7,7,9,9,9,9,9,9
Bahrain,9,8,8,7,7,7,7,6,6,6,5,5,4,4,4,4,4,5
Bangladesh,77,75,72,70,68,66,63,62,60,58,58,57,55,53,52,48,45,45
Barbados,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0
Belarus,5,4,5,5,7,7,8,9,9,10,10,10,8,8,8,8,8,8
Belgium,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1
Belize,8,9,9,9,9,9,4,6,7,8,5,3,3,5,7,4,5,7
Benin,15,15,15,15,15,15,17,17,18,17,17,17,19,20,19,19,18,18
Bermuda,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Bhutan,101,95,90,85,81,76,72,69,63,63,60,58,55,53,51,48,47,44
Bolivia,45,43,42,41,40,38,33,33,32,31,30,28,28,27,27,26,26,25
Bosnia and Herzegovina,15,15,15,15,14,13,13,12,12,10,9,8,8,8,7,8,8,7
Botswana,46,51,55,58,61,63,60,59,65,76,84,106,123,154,162,165,189,194
Brazil,7,7,7,7,6,7,7,7,6,6,6,6,6,6,5,5,4,4
British Virgin Islands,4,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2
Brunei Darussalam,10,10,10,10,10,10,10,10,10,6,9,7,6,7,7,6,5,7
Bulgaria,4,4,5,5,5,5,5,6,6,5,5,6,6,5,5,5,5,5
Burkina Faso,32,35,37,39,41,43,45,49,53,58,63,69,74,76,76,74,71,69
Burundi,43,49,56,61,65,68,71,71,86,81,87,98,106,110,110,109,107,102
Cambodia,119,119,120,120,120,118,120,117,115,112,111,109,104,100,96,92,91,89
Cameroon,24,28,31,34,38,41,44,49,51,54,56,59,58,53,53,45,42,39
Canada,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Cape Verde,50,48,47,46,45,44,44,43,43,42,42,32,41,41,40,32,32,31
Cayman Islands,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,1,0
Central African Republic,47,53,58,64,69,74,56,87,95,103,113,116,110,134,136,120,102,100
Chad,31,34,37,40,43,46,55,65,72,66,73,79,89,103,99,94,91,90
Chile,4,4,4,3,3,3,3,3,2,2,2,2,2,2,2,2,1,1
China,25,24,24,23,22,22,21,21,20,20,20,20,19,18,17,16,15,15
Colombia,9,9,9,9,9,8,8,8,8,7,6,7,7,7,7,6,5,5
Comoros,15,14,13,12,11,10,10,11,9,10,9,8,8,8,7,7,7,6
"Congo, Rep.",44,49,52,55,57,59,92,102,78,118,43,57,66,99,91,91,96,90
Cook Islands,0,2,10,8,6,3,2,1,0,4,1,10,2,0,5,1,2,4
Costa Rica,3,3,3,3,3,3,2,2,2,1,1,1,1,1,1,1,1,1
Croatia,12,12,12,12,11,11,10,10,10,9,7,7,7,7,7,7,6,6
Cuba,3,3,3,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1
Cyprus,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0
Czech Republic,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1
Cote d'Ivoire,48,56,61,67,72,78,84,91,99,110,125,151,145,148,147,145,136,128
"Korea, Dem. Rep.",113,113,112,112,111,111,111,111,111,110,105,98,90,86,83,85,83,65
"Congo, Dem. Rep.",35,39,41,44,47,51,53,58,59,65,73,79,86,87,87,86,85,82
Denmark,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Djibouti,126,127,129,132,136,139,78,73,101,94,105,102,133,137,146,148,154,157
Dominica,3,3,3,3,3,3,2,2,2,2,2,3,2,2,2,3,1,2
Dominican Republic,27,27,26,26,25,25,22,24,23,22,21,20,17,15,15,14,14,13
Ecuador,41,40,38,37,36,35,35,32,33,32,31,30,27,26,25,25,24,23
Egypt,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,2
El Salvador,15,14,14,13,13,12,13,11,11,10,10,10,9,9,8,8,8,7
Equatorial Guinea,19,22,23,25,27,29,36,38,37,40,44,83,89,93,69,68,66,87
Eritrea,20,20,21,21,22,22,23,22,21,14,14,14,14,14,15,16,17,16
Estonia,4,4,5,5,6,7,8,9,10,10,10,9,9,9,8,7,7,6
Ethiopia,40,45,50,55,59,64,64,65,69,76,79,88,92,97,98,98,96,92
Fiji,9,8,8,7,7,6,6,6,5,5,5,4,4,4,4,4,3,4
Finland,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
France,3,3,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1
French Polynesia,7,6,9,9,10,11,6,4,5,5,5,5,3,3,2,3,2,3
Gabon,46,45,46,47,50,49,55,59,63,70,86,44,65,63,56,72,74,76
Gambia,38,38,38,38,39,39,40,40,40,64,65,67,46,47,48,50,54,55
Georgia,7,6,7,7,8,9,10,13,12,13,14,14,14,14,13,11,9,9
Germany,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1
Ghana,59,58,58,59,59,60,62,58,58,57,55,54,54,53,53,53,52,52
Greece,4,4,4,4,3,3,3,3,3,3,3,3,2,2,2,2,2,2
Grenada,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Guam,11,10,10,11,15,10,9,9,9,9,4,5,6,5,5,3,5,2
Guatemala,13,13,13,13,13,13,13,13,13,13,13,13,13,13,12,12,12,12
Guinea,28,29,30,32,33,35,35,38,41,43,46,49,53,56,59,65,66,70
Guinea-Bissau,40,40,41,42,43,44,46,52,56,58,42,44,48,47,42,45,43,44
Guyana,6,7,6,8,10,13,15,16,17,18,18,21,22,24,22,22,22,24
Haiti,81,82,83,84,84,84,85,84,82,80,79,78,76,74,73,71,71,71
Honduras,17,16,16,15,15,14,16,16,12,10,6,7,7,9,10,10,10,10
Hungary,6,6,6,6,6,6,7,6,6,5,5,4,4,4,3,3,3,2
Iceland,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
India,42,42,42,43,43,44,44,45,45,44,43,40,38,35,31,30,29,28
Indonesia,92,89,85,82,79,75,73,70,67,63,62,59,54,51,47,43,40,39
Iran,4,4,5,4,5,5,5,4,4,4,4,4,3,3,3,3,3,3
Iraq,12,12,12,12,12,12,8,8,8,8,10,10,9,9,10,10,11,11
Ireland,2,2,2,2,2,2,1,2,2,1,2,1,1,1,1,1,1,1
Israel,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Italy,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Jamaica,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Japan,6,6,5,5,5,5,5,5,5,4,4,4,4,3,3,3,3,3
Jordan,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Kazakhstan,8,7,7,7,7,8,8,12,12,17,15,17,18,20,19,18,18,17
Kenya,25,27,33,41,49,61,69,83,93,102,113,105,103,100,98,90,73,65
Kiribati,116,111,107,103,100,96,81,82,65,63,62,71,68,55,45,49,46,49
Kuwait,5,5,5,4,4,4,4,3,3,3,3,3,3,3,3,3,2,2
Kyrgyzstan,8,8,8,9,9,11,14,16,18,19,20,20,20,19,18,18,18,18
Laos,38,37,36,35,34,33,30,29,28,28,28,27,27,26,26,24,24,24
Latvia,4,5,5,5,6,8,10,12,12,12,12,12,12,11,10,9,9,8
Lebanon,5,5,5,5,5,5,4,4,3,3,3,2,2,2,2,2,2,2
Lesotho,28,31,34,38,43,49,51,41,57,219,76,79,92,89,93,115,85,263
Liberia,57,59,61,63,65,66,60,62,58,60,65,65,57,65,56,63,60,62
Libyan Arab Jamahiriya,5,5,5,5,5,5,4,4,4,2,2,2,2,1,1,1,1,1
Lithuania,4,5,5,6,6,7,8,9,11,11,10,10,10,9,9,8,7,9
Luxembourg,2,2,2,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1
Madagascar,38,38,38,38,38,38,38,38,39,40,41,42,43,43,44,47,45,48
Malawi,62,73,85,96,105,110,111,117,119,118,126,119,123,119,116,116,109,102
Malaysia,21,21,20,20,19,19,19,24,24,24,18,18,19,18,19,18,18,18
Maldives,7,7,6,6,5,5,5,7,7,7,7,7,6,5,5,5,5,4
Mali,74,74,74,75,75,76,77,79,81,83,85,86,86,87,88,89,90,90
Malta,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Mauritania,63,63,63,64,64,64,66,64,73,75,78,81,83,85,67,76,75,75
Mauritius,5,5,4,4,4,4,5,5,4,3,3,4,4,4,3,4,4,4
Mexico,11,10,10,9,8,8,7,7,6,6,5,4,4,4,4,3,3,2
"Micronesia, Fed. Sts.",33,31,29,28,26,24,23,27,26,25,20,19,17,16,15,14,13,9
Monaco,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Mongolia,48,47,47,46,45,44,38,38,38,37,37,36,34,35,29,28,21,29
Montserrat,2,2,2,2,2,2,2,2,1,1,2,2,2,1,2,1,2,1
Morocco,13,13,13,12,12,12,11,11,11,10,10,10,9,9,9,9,9,8
Mozambique,37,42,46,50,55,61,68,77,87,98,111,124,133,139,140,138,133,127
Myanmar,52,53,53,53,54,54,47,48,49,47,43,39,33,23,18,13,13,13
Namibia,84,96,107,119,132,146,57,55,57,71,87,82,100,97,109,118,123,102
Nauru,19,32,30,29,9,27,27,26,26,7,4,8,5,5,19,6,24,3
Nepal,51,49,48,47,46,45,43,41,39,32,29,28,27,25,24,24,23,23
Netherlands,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Netherlands Antilles,3,3,2,2,2,2,2,2,2,2,2,1,1,2,1,1,1,1
New Caledonia,10,10,10,7,6,6,4,8,7,5,4,4,3,3,2,2,2,2
New Zealand,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Nicaragua,18,17,16,16,15,14,13,13,12,12,11,11,10,10,9,9,9,6
Niger,36,36,36,37,37,37,39,35,35,35,35,36,37,36,38,37,38,38
Nigeria,35,40,43,47,52,56,60,66,72,78,86,93,100,103,102,101,97,93
Niue,13,13,12,12,11,11,15,0,0,40,0,0,82,0,0,0,0,0
Northern Mariana Islands,13,17,26,16,15,15,16,24,23,19,10,13,8,9,7,6,8,7
Norway,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Oman,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Pakistan,49,49,49,49,49,49,49,49,48,49,49,48,45,44,42,38,34,29
Palau,12,8,5,30,47,21,7,7,4,31,8,7,6,8,5,12,6,8
Panama,7,7,7,7,7,7,7,8,7,7,7,6,4,5,4,4,4,4
Papua New Guinea,68,67,66,66,65,64,66,64,58,60,59,58,54,52,48,48,51,60
Paraguay,12,12,12,12,12,12,9,12,13,13,12,12,12,12,12,11,10,10
Peru,34,32,31,29,27,26,29,28,23,25,24,22,21,21,19,18,16,16
Philippines,87,86,84,83,81,82,81,78,74,70,57,53,50,47,46,44,41,41
Poland,8,8,8,8,8,7,7,6,6,5,5,5,4,4,4,4,4,3
Portugal,7,6,6,6,6,6,6,5,5,5,5,4,4,5,4,4,4,3
Puerto Rico,2,2,2,2,2,2,2,1,1,1,1,1,1,1,1,1,1,1
Qatar,6,6,6,6,7,8,7,7,7,7,7,7,7,7,6,6,7,7
"Korea, Rep.",19,16,15,13,12,11,11,10,9,8,9,8,9,9,9,10,10,10
Moldova,9,8,9,9,11,12,14,15,18,19,20,20,20,20,19,19,19,19
Romania,8,9,10,11,12,13,15,15,17,18,19,19,20,20,19,19,18,16
Russian Federation,7,7,7,8,10,12,15,17,19,19,21,21,20,20,20,18,18,18
Rwanda,67,73,76,79,82,85,89,92,96,105,118,130,138,141,142,138,131,128
Saint Kitts and Nevis,2,2,2,2,2,2,2,2,1,1,2,2,1,2,2,2,2,1
Saint Lucia,3,3,3,3,3,3,2,1,2,2,2,2,2,2,2,2,2,2
Saint Vincent and the Grenadines,5,5,5,5,5,5,5,5,5,5,4,4,4,4,4,4,4,5
Samoa,5,4,4,4,4,4,4,4,5,3,3,4,2,3,3,3,3,3
San Marino,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Sao Tome and Principe,38,37,36,35,34,33,32,31,30,30,30,29,29,27,28,27,26,26
Saudi Arabia,5,4,4,4,5,5,5,5,6,6,6,5,5,5,5,5,5,5
Senegal,42,42,42,42,42,43,43,45,47,50,50,52,55,55,58,60,62,64
Seychelles,9,8,8,8,7,7,5,4,5,7,5,5,4,5,4,5,5,5
Sierra Leone,53,57,60,65,69,73,72,78,85,90,97,103,111,120,128,135,142,149
Singapore,6,6,6,5,5,5,5,5,5,5,4,4,4,3,3,3,2,3
Slovakia,7,7,7,8,7,7,6,6,5,5,4,4,4,3,3,3,3,3
Slovenia,5,5,5,4,4,4,4,4,4,4,3,3,3,2,2,2,2,2
Solomon Islands,70,66,61,57,53,50,42,39,37,36,33,32,30,28,26,24,23,21
Somalia,86,85,84,84,83,82,76,75,75,75,74,69,69,65,55,52,57,63
South Africa,78,78,82,112,114,116,126,133,144,129,183,213,194,227,233,249,232,230
Spain,6,6,5,5,5,4,4,4,4,4,4,3,3,3,3,3,3,3
Sri Lanka,10,10,10,10,10,10,10,9,9,9,10,9,9,9,9,7,8,8
Sudan,62,63,64,65,65,66,67,68,63,64,64,66,64,66,67,68,69,71
Suriname,14,13,10,11,10,11,12,15,18,18,18,18,19,20,22,24,26,29
Swaziland,79,83,86,96,113,137,163,200,240,324,362,415,237,239,277,309,322,317
Sweden,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1
Switzerland,2,2,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
Syrian Arab Republic,6,5,5,5,5,4,4,4,4,3,3,3,3,3,2,2,2,2
Tajikistan,20,17,12,9,9,12,16,17,19,20,23,26,30,31,35,38,42,46
Thailand,30,31,31,31,31,31,31,30,28,26,25,22,22,21,21,21,21,21
"Macedonia, FYR",11,11,11,11,11,10,10,9,9,8,7,6,6,6,5,5,5,5
Timor-Leste,73,72,72,71,71,70,70,70,70,70,70,70,37,45,46,46,48,47
Togo,88,91,94,98,102,104,108,119,116,120,122,124,131,129,130,131,133,138
Tokelau,33,33,33,26,0,68,0,12,12,0,0,0,24,0,12,0,0,0
Tonga,6,6,6,5,5,5,3,4,2,4,3,5,3,3,4,3,3,2
Trinidad and Tobago,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Tunisia,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3
Turkey,8,8,8,8,7,7,7,7,7,6,5,5,5,5,5,5,5,5
Turkmenistan,9,9,9,9,9,8,9,11,13,14,13,13,12,12,11,10,10,9
Turks and Caicos Islands,5,5,4,4,4,4,4,4,3,1,1,3,2,2,2,2,2,1
Tuvalu,62,59,56,53,50,48,46,44,43,41,40,39,37,36,35,29,30,17
Uganda,69,106,115,125,129,133,130,99,100,98,103,104,110,114,110,107,100,93
Ukraine,6,5,6,6,7,7,8,9,10,11,12,13,14,15,15,15,14,15
United Arab Emirates,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2
United Kingdom,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2
Tanzania,42,48,53,58,64,69,74,80,82,86,89,88,92,91,88,86,82,78
Virgin Islands (U.S.),3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2,2,2
United States of America,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0
Uruguay,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
Uzbekistan,10,9,9,11,11,11,10,11,11,12,12,14,15,15,16,16,16,16
Vanuatu,31,30,28,27,25,24,23,21,20,18,16,14,16,15,14,15,12,12
Venezuela,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
Viet Nam,33,33,32,32,32,31,29,26,25,25,25,25,24,24,24,24,24,24
Wallis et Futuna,15,40,7,19,19,10,10,12,10,10,11,2,28,12,5,4,7,3
West Bank and Gaza,6,5,5,5,5,5,5,5,4,4,4,4,4,4,3,3,3,3
Yemen,18,17,18,17,17,16,16,15,14,13,12,12,12,11,11,10,10,10
Zambia,126,153,182,203,218,229,238,247,250,258,255,274,156,138,137,138,128,115
Zimbabwe,140,161,176,193,208,225,239,253,141,155,185,197,216,253,253,263,268,265
--------------------------------------------------------------------------------
/01-data-frames/tb_existing_100.csv:
--------------------------------------------------------------------------------
"TB prevalence, all forms (per 100 000 population per year)",1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007
Afghanistan,436,429,422,415,407,397,397,387,374,373,346,326,304,308,283,267,251,238
Albania,42,40,41,42,42,43,42,44,43,42,40,34,32,32,29,29,26,22
Algeria,45,44,44,43,43,42,43,44,45,46,48,49,50,51,52,53,55,56
American Samoa,42,14,4,18,17,22,0,25,12,8,8,6,5,6,9,11,9,5
Andorra,39,37,35,33,32,30,28,23,24,22,20,20,21,18,19,18,17,19
Angola,514,514,513,512,510,508,512,363,414,384,530,335,307,281,318,331,302,294
Anguilla,38,38,37,37,36,35,35,36,36,36,35,35,35,35,35,34,34,34
Antigua and Barbuda,16,15,15,14,13,12,12,11,11,9,8,9,7,9,8,8,9,9
Argentina,96,91,86,82,78,74,71,67,63,58,52,51,42,41,39,39,37,35
Armenia,52,49,51,55,60,68,74,75,74,86,94,99,97,91,85,79,79,81
Australia,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6
Austria,18,17,16,15,15,14,13,13,12,12,11,11,11,10,10,10,10,10
Azerbaijan,58,55,57,61,67,76,85,91,100,106,113,117,99,109,90,85,86,86
Bahamas,54,53,52,52,53,54,54,54,55,46,45,45,51,51,50,50,50,51
Bahrain,120,113,108,101,97,92,89,86,83,67,57,56,55,53,48,45,45,60
Bangladesh,639,623,608,594,579,576,550,535,516,492,500,491,478,458,444,416,392,387
Barbados,8,8,7,7,6,6,6,6,5,5,5,4,4,4,4,4,3,3
Belarus,62,54,59,62,75,82,91,98,109,113,110,100,89,68,68,68,69,69
Belgium,16,15,15,15,15,14,13,13,12,12,12,13,12,11,11,11,10,9
Belize,65,64,62,59,57,55,37,41,53,53,39,36,36,40,42,38,41,46
Benin,140,138,135,132,129,125,127,129,130,128,128,129,137,139,134,135,134,135
Bermuda,10,10,9,9,8,8,8,8,7,7,7,6,6,6,6,6,6,6
Bhutan,924,862,804,750,699,651,620,597,551,538,515,512,472,460,443,412,406,363
Bolivia,377,362,347,333,320,306,271,264,254,248,238,229,223,218,211,205,202,198
Bosnia and Herzegovina,160,156,154,150,143,134,131,125,96,80,70,63,66,63,55,58,58,55
Botswana,344,355,351,349,347,349,336,349,371,413,445,497,535,586,598,599,621,622
Brazil,124,119,114,109,104,100,97,93,88,86,83,80,77,72,63,60,56,60
British Virgin Islands,32,30,28,26,25,23,22,21,20,19,19,18,18,17,16,17,16,16
Brunei Darussalam,91,91,91,91,91,91,91,88,88,93,108,85,78,73,63,55,59,65
Bulgaria,43,48,54,57,58,57,59,65,68,68,64,63,52,42,40,41,40,41
Burkina Faso,179,196,208,221,233,246,251,271,286,308,338,368,398,419,426,421,411,403
Burundi,288,302,292,293,305,322,339,346,424,412,455,522,581,619,639,654,657,647
Cambodia,928,905,881,858,836,811,810,789,777,764,758,750,728,712,696,676,672,664
Cameroon,188,199,200,199,197,197,196,207,212,219,228,241,240,227,228,213,201,195
Canada,7,7,7,6,6,6,5,5,5,5,5,4,4,4,4,4,4,4
Cape Verde,449,438,428,418,408,398,394,391,387,384,380,283,374,370,367,278,285,280
Cayman Islands,10,10,9,9,8,8,8,8,7,7,7,5,6,6,5,5,6,5
Central African Republic,318,336,342,350,356,365,270,395,419,449,485,495,468,566,574,507,437,425
Chad,251,272,282,294,304,315,354,408,433,390,420,450,502,573,548,518,505,497
Chile,45,41,38,35,32,30,28,25,24,22,21,19,19,18,15,15,13,12
China,327,321,315,309,303,303,290,283,276,273,269,265,259,241,220,206,200,194
Colombia,88,85,82,79,76,73,71,69,67,61,51,62,60,58,55,53,44,43
Comoros,188,177,167,157,148,140,130,155,120,143,112,103,104,107,99,91,86,83
"Congo, Rep.",209,222,231,243,255,269,424,457,367,545,313,354,402,509,477,482,511,485
Cook Islands,0,10,57,47,38,19,10,13,0,40,12,29,11,0,15,9,16,31
Costa Rica,30,28,27,26,25,24,23,22,21,19,14,14,15,14,12,12,12,11
Croatia,126,123,121,118,113,106,103,102,99,89,76,73,69,68,67,65,65,54
Cuba,32,29,26,24,22,20,18,17,15,14,13,12,11,10,9,8,8,7
Cyprus,14,13,13,12,11,11,11,10,7,7,9,8,7,6,6,6,6,6
Czech Republic,22,22,22,21,21,21,21,21,19,18,16,14,13,12,11,11,10,9
Cote d'Ivoire,292,304,306,309,312,319,329,350,376,413,472,571,561,590,604,613,597,582
"Korea, Dem. Rep.",841,828,815,802,788,775,775,775,775,770,713,650,577,527,499,508,500,441
"Congo, Dem. Rep.",275,306,327,352,376,411,420,466,472,528,592,643,697,708,710,702,692,666
Denmark,12,12,11,10,10,9,9,8,8,8,7,7,7,7,7,6,7,6
Djibouti,"1,485","1,477","1,463","1,442","1,414","1,381",720,669,698,701,761,775,932,960,"1,034","1,046","1,093","1,104"
Dominica,24,24,24,23,23,22,22,18,20,20,20,22,20,20,20,21,13,19
Dominican Republic,183,173,164,156,148,141,135,132,128,122,119,115,102,93,90,85,84,82
Ecuador,282,271,259,249,238,228,221,212,207,200,194,185,170,162,155,155,148,140
Egypt,48,47,47,45,45,44,51,46,43,40,36,34,32,31,29,28,27,27
El Salvador,133,126,119,112,105,99,97,80,76,72,69,66,62,60,57,52,50,48
Equatorial Guinea,169,181,187,194,200,207,216,222,236,253,274,441,470,490,370,366,358,469
Eritrea,245,245,242,239,235,232,232,225,203,114,114,111,118,110,122,127,133,134
Estonia,50,50,56,66,77,85,88,98,102,105,72,68,62,56,50,46,44,39
Ethiopia,312,337,351,366,383,403,396,397,420,464,486,539,569,601,613,612,604,579
Fiji,68,65,62,58,55,53,49,49,46,40,42,35,36,29,33,31,30,30
Finland,14,12,11,10,9,10,10,10,9,9,8,7,7,6,6,5,5,5
France,21,20,19,18,17,16,15,15,14,14,13,12,12,12,12,11,11,11
French Polynesia,67,55,91,83,93,107,55,48,56,54,40,42,32,29,28,31,31,32
Gabon,359,340,325,318,316,293,312,320,359,366,434,249,302,299,288,332,358,379
Gambia,350,350,349,347,344,341,324,321,311,485,491,499,335,343,341,366,399,404
Georgia,51,48,50,54,59,66,73,104,87,90,98,95,95,94,90,86,83,83
Germany,15,15,14,14,13,13,12,11,11,10,9,8,7,6,6,6,5,5
Ghana,533,519,502,480,455,432,426,388,384,382,368,358,359,358,359,357,355,353
Greece,30,29,27,25,24,23,22,22,21,20,19,18,18,17,17,16,16,16
Grenada,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6
Guam,103,101,96,110,146,93,91,89,87,86,44,45,44,47,41,42,39,36
Guatemala,113,111,108,106,103,100,95,94,93,92,90,91,89,89,86,85,84,87
Guinea,241,248,255,262,269,275,277,293,305,317,332,346,363,380,391,425,426,448
Guinea-Bissau,404,403,402,399,395,390,390,387,385,386,273,276,305,296,287,283,270,276
Guyana,39,43,34,43,50,67,78,81,90,93,98,112,126,136,130,132,133,136
Haiti,479,464,453,443,435,429,428,426,417,407,403,397,388,380,377,368,368,366
Honduras,141,133,128,123,119,115,114,112,106,98,70,70,72,71,72,71,70,71
Hungary,67,68,70,72,73,73,74,72,67,47,43,39,36,33,29,26,22,19
Iceland,5,4,4,4,4,4,3,3,3,3,3,3,2,2,2,3,3,3
India,586,577,566,555,542,525,517,501,487,476,443,411,389,349,311,299,290,283
Indonesia,443,430,417,404,392,380,369,359,348,335,326,314,297,287,274,261,251,244
Iran,50,51,56,54,55,55,61,52,45,41,40,38,37,35,32,31,29,27
Iraq,88,88,88,88,88,88,84,84,82,80,71,69,65,67,71,75,78,79
Ireland,19,18,18,17,15,14,12,12,12,12,12,11,10,10,10,10,10,11
Israel,11,10,10,9,9,8,8,8,8,7,7,7,6,6,6,6,6,6
Italy,11,10,10,9,9,8,9,8,7,7,7,7,7,6,6,6,6,6
Jamaica,10,10,10,10,9,9,7,7,7,7,7,7,7,7,7,7,7,7
Japan,62,60,58,56,53,51,50,50,49,48,45,41,39,36,34,32,30,28
Jordan,19,18,17,17,16,15,20,18,12,11,11,9,9,9,9,8,9,9
Kazakhstan,95,87,85,84,85,94,109,137,163,134,141,148,150,155,152,147,144,139
Kenya,125,120,134,152,177,207,233,277,313,351,393,384,392,402,410,388,340,319
Kiribati,"1,026","1,006",986,966,947,928,910,853,571,556,546,607,587,477,439,419,405,423
Kuwait,89,84,80,75,72,68,66,64,61,35,33,33,30,29,29,30,25,25
Kyrgyzstan,90,93,93,93,101,118,141,165,147,146,156,169,153,145,139,136,135,134
Laos,428,424,420,415,411,407,373,360,352,344,344,337,330,324,313,298,291,289
Latvia,56,57,59,63,75,91,77,89,92,95,91,89,85,78,72,66,61,55
Lebanon,64,64,63,62,62,59,64,54,50,37,35,30,26,24,22,21,23,23
Lesotho,225,231,229,228,232,242,248,264,298,518,356,370,399,408,414,421,408,568
Liberia,476,473,469,465,462,461,418,424,396,403,435,437,382,429,370,416,393,398
Libyan Arab Jamahiriya,46,45,45,43,43,42,41,38,36,23,22,22,21,20,19,18,18,17
Lithuania,64,66,71,79,89,98,110,119,125,120,115,96,83,72,72,66,65,69
Luxembourg,19,18,17,16,15,14,14,13,13,12,11,11,11,10,10,10,10,9
Madagascar,367,368,369,369,370,370,339,345,346,352,359,371,382,375,384,408,400,417
Malawi,380,376,365,355,353,348,337,342,345,349,362,350,358,353,346,342,324,305
Malaysia,159,158,156,155,153,151,147,173,170,167,135,133,132,128,128,126,123,121
Maldives,143,130,118,107,97,88,88,101,89,94,96,84,83,69,71,63,69,48
Mali,640,631,621,609,597,583,573,566,565,567,571,573,572,578,584,589,593,599
Malta,10,9,9,8,8,7,7,7,7,6,6,6,5,5,5,5,5,5
Mauritania,585,587,590,592,594,595,622,615,612,615,619,624,632,642,494,565,556,559
Mauritius,53,51,50,48,47,45,62,61,45,40,39,42,40,39,38,39,39,39
Mexico,101,93,86,80,74,68,64,58,52,48,42,38,35,33,31,27,25,23
"Micronesia, Fed. Sts.",263,253,244,234,225,217,204,287,276,265,173,171,152,142,128,124,112,100
Monaco,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2
Mongolia,477,477,477,477,477,477,333,342,307,281,297,273,258,258,233,232,217,234
Montserrat,14,14,14,14,14,13,13,13,13,13,13,13,13,10,13,10,12,8
Morocco,134,130,127,123,119,116,107,106,105,99,98,95,87,91,89,85,82,80
Mozambique,287,313,328,343,356,369,386,408,432,461,499,535,556,569,567,551,528,504
Myanmar,411,400,389,379,370,361,298,309,312,298,267,238,202,175,168,161,161,162
Namibia,650,685,687,683,671,658,387,395,411,442,481,506,544,560,572,570,556,532
Nauru,170,285,280,274,90,263,258,253,248,44,44,56,57,48,162,121,174,33
Nepal,629,607,585,564,543,523,498,473,448,363,312,304,285,271,260,247,246,240
Netherlands,11,10,10,9,9,8,8,8,8,7,7,7,6,6,6,6,6,6
Netherlands Antilles,28,27,25,24,23,22,21,20,19,18,17,17,17,16,16,15,15,15
New Caledonia,112,107,104,76,69,60,58,97,97,51,51,43,34,28,29,29,25,25
New Zealand,10,10,9,9,10,11,10,10,11,11,11,10,10,10,10,9,8,7
Nicaragua,145,137,129,122,114,108,100,97,93,89,85,80,79,73,69,68,64,56
Niger,317,318,319,319,319,318,322,292,281,281,278,280,288,275,287,285,289,292
Nigeria,282,307,321,336,350,366,379,399,423,452,489,526,563,575,573,563,543,521
Niue,118,115,113,111,109,106,202,0,0,114,0,0,506,0,0,0,0,0
Northern Mariana Islands,142,201,301,194,186,185,188,331,334,220,135,120,95,83,80,83,83,72
Norway,8,8,8,7,7,6,6,6,6,5,5,5,5,5,4,4,4,4
Oman,40,36,29,25,22,22,15,15,14,14,13,14,13,13,12,13,13,14
Pakistan,430,428,427,426,424,422,421,421,415,420,413,406,376,355,333,289,260,223
Palau,96,66,43,260,414,187,53,92,54,376,104,102,69,64,31,102,74,71
Panama,74,73,71,70,69,68,67,67,65,64,60,51,48,49,44,44,44,45
Papua New Guinea,498,498,497,497,496,496,494,493,491,489,486,482,477,471,463,453,441,430
Paraguay,95,93,92,91,89,88,71,92,92,91,90,89,88,85,85,81,74,73
Peru,394,368,343,320,298,278,270,251,230,222,210,198,187,182,167,155,143,136 145 | Philippines,799,783,766,750,735,719,705,689,669,649,600,578,561,542,534,520,505,500 146 | Poland,88,87,86,85,83,79,74,68,63,58,53,50,35,34,33,31,29,28 147 | Portugal,51,49,47,45,44,43,42,41,39,38,36,34,33,32,29,27,24,23 148 | Puerto Rico,17,15,17,18,18,18,15,13,12,10,9,8,7,6,6,6,6,5 149 | Qatar,71,69,69,74,84,89,87,84,75,78,78,78,75,71,71,69,77,81 150 | "Korea, Rep.",223,196,174,150,142,132,105,98,89,107,113,112,126,108,112,118,122,126 151 | Moldova,105,99,103,111,122,138,157,171,191,203,215,174,211,176,152,151,151,151 152 | Romania,118,125,134,147,159,167,174,184,129,194,197,206,180,185,178,148,138,128 153 | Russian Federation,69,64,70,78,91,111,132,142,155,160,164,158,148,140,135,121,117,115 154 | Rwanda,190,211,226,243,259,278,297,316,339,383,442,503,549,581,607,607,595,590 155 | Saint Kitts and Nevis,17,17,16,16,16,15,16,15,11,12,15,13,12,14,13,15,14,12 156 | Saint Lucia,26,26,25,25,25,24,23,17,16,18,20,18,17,19,18,18,18,18 157 | Saint Vincent and the Grenadines,45,45,44,43,42,42,42,41,38,41,35,36,36,34,36,36,34,39 158 | Samoa,36,35,34,33,32,31,35,33,50,31,27,33,28,28,24,27,26,25 159 | San Marino,9,9,8,8,7,7,7,7,7,6,6,6,6,5,5,5,5,5 160 | Sao Tome and Principe,346,335,325,315,304,295,290,285,290,276,272,266,261,266,255,256,252,240 161 | Saudi Arabia,68,60,59,60,64,67,71,73,76,72,67,65,62,60,60,60,62,65 162 | Senegal,380,379,379,378,377,376,372,388,397,424,420,430,443,441,454,456,461,468 163 | Seychelles,113,110,106,103,100,96,66,59,71,90,52,53,42,66,52,57,56,55 164 | Sierra Leone,465,479,492,504,517,534,525,565,602,636,675,696,743,784,830,866,902,941 165 | Singapore,52,52,53,50,49,49,50,50,48,44,39,36,34,32,31,28,27,27 166 | Slovakia,55,56,59,59,56,51,46,42,38,35,32,30,29,26,25,21,20,20 167 | Slovenia,66,62,59,57,53,50,35,35,32,29,27,25,22,21,19,16,16,15 168 | Solomon Islands,625,593,563,534,506,480,380,354,339,322,300,286,277,254,229,204,197,180 169 | 
Somalia,597,587,577,566,555,543,465,444,446,431,414,398,391,362,334,325,341,352 170 | South Africa,769,726,676,620,562,502,480,466,465,426,515,581,586,649,676,707,690,692 171 | Spain,44,42,40,37,35,34,33,30,30,28,27,26,26,25,24,24,24,23 172 | Sri Lanka,109,106,104,102,99,97,102,93,90,89,107,99,88,89,87,75,80,79 173 | Sudan,409,404,402,402,403,405,409,417,378,382,375,389,363,371,376,384,391,402 174 | Suriname,109,100,79,80,76,78,88,101,118,122,115,113,113,120,126,136,146,155 175 | Swaziland,629,590,527,477,448,441,460,504,556,647,740,832,693,739,776,788,801,812 176 | Sweden,5,5,6,6,5,5,5,4,4,4,4,4,4,4,4,4,4,5 177 | Switzerland,14,13,12,11,10,10,9,8,8,8,7,6,6,6,6,5,5,5 178 | Syrian Arab Republic,94,89,84,80,75,71,67,61,54,48,41,37,35,33,31,30,29,27 179 | Tajikistan,193,162,112,79,85,106,134,141,159,169,191,221,248,256,277,282,301,322 180 | Thailand,336,319,307,297,291,285,285,279,256,231,223,194,197,189,188,184,189,192 181 | "Macedonia, FYR",92,90,89,86,83,77,74,73,72,65,56,39,40,37,34,34,34,33 182 | Timor-Leste,706,694,681,669,656,644,644,644,644,644,644,644,345,359,367,370,385,378 183 | Togo,702,687,668,647,628,614,613,658,637,647,656,669,701,693,702,713,726,750 184 | Tokelau,139,140,143,112,0,301,0,112,112,0,0,0,112,0,112,0,0,0 185 | Tonga,45,44,43,43,42,41,38,38,31,34,34,42,35,36,39,32,34,28 186 | Trinidad and Tobago,17,17,17,16,16,16,16,16,15,15,15,16,15,15,15,15,15,15 187 | Tunisia,49,46,49,51,51,49,48,46,44,31,30,28,27,26,27,27,28,28 188 | Turkey,83,79,77,73,68,62,62,63,64,57,49,45,44,43,44,44,32,34 189 | Turkmenistan,105,99,101,97,92,80,92,114,137,142,130,115,110,103,98,91,85,75 190 | Turks and Caicos Islands,42,40,37,35,33,31,30,29,28,17,16,23,23,22,22,22,18,17 191 | Tuvalu,593,573,554,535,518,500,484,467,452,437,422,408,394,381,368,245,261,203 192 | Uganda,206,313,342,377,394,418,419,342,357,359,391,411,447,476,472,469,450,426 193 | Ukraine,67,64,67,72,75,78,87,93,104,109,120,128,133,135,132,113,99,102 194 | United Arab 
Emirates,47,44,42,39,38,36,34,33,31,30,27,27,27,25,25,24,24,24 195 | United Kingdom,9,9,10,10,9,9,9,9,9,9,9,9,9,10,10,11,11,12 196 | Tanzania,215,228,240,252,269,283,301,324,333,347,364,367,383,380,373,364,353,337 197 | Virgin Islands (U.S.),30,28,27,25,24,23,19,18,17,19,19,18,18,17,17,16,16,16 198 | United States of America,7,7,7,7,6,6,6,5,5,4,4,4,4,4,3,3,3,3 199 | Uruguay,35,34,33,32,31,30,28,27,28,28,27,25,27,25,23,24,25,23 200 | Uzbekistan,114,105,102,118,116,119,111,122,129,134,139,148,144,152,149,144,134,140 201 | Vanuatu,278,268,259,250,242,234,226,218,211,159,143,128,149,128,118,131,104,102 202 | Venezuela,46,45,44,43,42,42,41,41,40,39,39,41,41,39,38,38,38,39 203 | Viet Nam,365,361,358,354,350,346,312,273,261,253,248,243,235,234,226,227,222,220 204 | Wallis et Futuna,126,352,64,174,172,93,123,213,107,105,103,13,275,147,63,57,60,25 205 | West Bank and Gaza,55,54,54,52,52,50,49,46,44,42,40,39,37,36,35,33,32,31 206 | Yemen,265,261,263,253,250,244,233,207,194,175,164,154,149,146,138,137,135,130 207 | Zambia,436,456,494,526,556,585,602,626,634,657,658,680,517,478,468,453,422,387 208 | Zimbabwe,409,417,415,419,426,439,453,481,392,430,479,523,571,632,652,680,699,714 -------------------------------------------------------------------------------- /01-data-frames/tb_new_100.csv: -------------------------------------------------------------------------------- 1 | "TB incidence, all forms (per 100 000 population per year)",1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007 2 | Afghanistan,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168 3 | Albania,25,24,25,26,26,27,27,28,28,27,25,23,23,22,21,20,18,17 4 | Algeria,38,38,39,40,41,42,43,44,46,47,48,49,50,51,53,54,55,57 5 | American Samoa,21,7,2,9,9,11,0,12,6,8,6,6,4,5,9,10,7,5 6 | Andorra,36,34,32,30,29,27,26,26,25,23,22,21,21,20,20,19,19,19 7 | Angola,205,209,214,218,222,226,231,236,240,245,250,255,260,265,270,276,281,287 8 | 
Anguilla,24,24,24,24,23,23,23,23,23,23,23,22,22,22,22,22,22,22 9 | Antigua and Barbuda,10,10,9,9,8,8,8,7,7,7,6,6,6,6,6,6,6,5 10 | Argentina,60,57,55,53,51,49,47,45,44,42,40,39,37,36,35,33,32,31 11 | Armenia,33,32,33,37,41,47,53,58,63,67,71,72,72,71,71,72,72,72 12 | Australia,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6 13 | Austria,23,22,21,20,19,18,17,17,16,15,14,14,14,13,13,13,13,12 14 | Azerbaijan,35,34,36,39,43,50,56,62,67,71,75,77,77,76,76,77,77,77 15 | Bahamas,44,44,44,44,44,44,44,44,44,44,44,44,44,44,44,44,44,44 16 | Bahrain,76,72,68,64,61,58,57,55,52,49,47,46,45,43,42,42,41,40 17 | Bangladesh,264,261,259,256,253,251,248,246,244,241,239,236,234,232,229,227,225,223 18 | Barbados,7,6,6,6,6,5,5,5,5,4,4,4,4,4,4,4,4,4 19 | Belarus,38,34,38,40,49,54,60,65,72,76,73,66,60,60,61,61,61,61 20 | Belgium,20,19,18,19,19,18,17,16,15,15,16,16,15,14,13,13,13,12 21 | Belize,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40 22 | Benin,77,77,78,79,80,80,81,82,83,84,85,86,86,87,88,89,90,91 23 | Bermuda,7,6,6,6,6,5,5,5,5,4,4,4,4,4,4,4,4,4 24 | Bhutan,540,516,492,470,449,428,409,391,373,356,340,325,310,296,283,270,258,246 25 | Bolivia,255,247,240,233,226,220,213,207,201,195,190,184,179,174,169,164,159,155 26 | Bosnia and Herzegovina,94,92,93,92,89,84,83,81,79,71,63,58,56,55,53,52,52,51 27 | Botswana,307,341,364,390,415,444,468,503,542,588,640,692,740,772,780,770,751,731 28 | Brazil,84,81,78,76,73,71,69,67,64,62,60,58,57,55,53,51,50,48 29 | British Virgin Islands,19,18,17,16,15,15,14,14,13,12,12,11,11,11,11,11,10,10 30 | Brunei Darussalam,58,58,58,58,58,58,58,58,58,93,102,70,73,64,53,48,59,59 31 | Bulgaria,27,30,34,37,38,38,39,43,45,45,44,44,43,39,39,40,39,39 32 | Burkina Faso,95,105,112,120,128,137,145,155,168,182,198,214,229,239,241,238,232,226 33 | Burundi,154,171,182,196,208,223,235,252,272,295,321,347,371,387,391,387,377,367 34 | Cambodia,585,579,574,568,563,557,552,546,541,536,530,525,520,515,510,505,500,495 35 | 
Cameroon,81,89,95,102,109,116,123,132,142,154,168,181,194,202,204,202,197,192 36 | Canada,10,9,9,8,8,7,7,7,7,6,6,6,6,5,5,5,5,5 37 | Cape Verde,175,174,172,171,169,168,166,165,163,162,160,159,157,156,155,153,152,151 38 | Cayman Islands,7,6,6,6,6,5,5,5,5,4,4,4,4,4,4,4,4,4 39 | Central African Republic,145,161,172,184,196,209,221,237,256,277,302,327,349,364,368,363,354,345 40 | Chad,125,139,149,159,170,181,191,205,221,240,262,283,302,315,318,315,307,299 41 | Chile,38,35,33,31,29,27,25,24,22,21,19,18,17,16,15,14,13,12 42 | China,116,115,114,113,112,111,110,109,108,106,105,104,103,102,101,100,99,98 43 | Colombia,53,52,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35 44 | Comoros,85,82,79,75,72,69,67,64,61,59,56,54,52,50,48,46,44,42 45 | "Congo, Rep.",169,188,200,215,229,245,258,277,299,324,353,382,408,425,430,425,414,403 46 | Cook Islands,0,6,37,31,24,12,6,13,0,20,7,14,7,0,8,8,8,15 47 | Costa Rica,18,18,17,17,16,16,15,15,14,14,14,13,13,12,12,12,11,11 48 | Croatia,74,73,73,72,70,66,65,64,62,56,49,46,44,43,42,41,41,40 49 | Cuba,25,23,22,20,18,17,16,14,13,12,11,10,10,9,8,8,7,6 50 | Cyprus,9,9,8,8,7,7,7,7,6,6,6,6,5,5,5,5,5,5 51 | Czech Republic,21,21,21,20,20,20,20,20,19,17,15,14,13,12,11,10,10,9 52 | Cote d'Ivoire,177,196,209,224,239,255,269,289,312,338,368,398,425,444,448,443,432,420 53 | "Korea, Dem. Rep.",344,344,344,344,344,344,344,344,344,344,344,344,344,344,344,344,344,344 54 | "Congo, Dem. 
Rep.",165,182,195,209,222,238,251,269,290,315,343,371,396,413,417,413,402,392 55 | Denmark,15,14,14,13,12,12,11,11,10,10,9,9,9,9,8,8,8,8 56 | Djibouti,582,594,606,618,630,642,655,668,681,695,708,722,737,751,766,781,797,813 57 | Dominica,15,15,15,14,14,14,14,14,14,14,14,14,14,14,14,13,13,13 58 | Dominican Republic,114,111,108,104,101,99,96,93,90,88,85,83,80,78,76,73,71,69 59 | Ecuador,167,162,157,153,148,144,140,136,132,128,124,121,117,114,111,107,104,101 60 | Egypt,37,36,36,35,34,34,33,31,29,28,27,26,26,24,23,22,22,21 61 | El Salvador,82,79,75,72,69,66,64,61,58,56,54,51,49,47,45,43,41,40 62 | Equatorial Guinea,108,119,127,136,145,155,164,176,190,206,224,242,259,270,273,270,263,256 63 | Eritrea,72,73,74,76,77,78,79,81,82,84,85,86,88,89,91,92,94,95 64 | Estonia,32,32,35,42,48,53,58,64,66,68,66,62,55,50,46,43,40,38 65 | Ethiopia,159,176,188,201,215,229,242,260,280,304,331,358,383,399,403,398,388,378 66 | Fiji,51,48,46,43,41,39,37,35,33,32,30,28,27,26,24,23,22,21 67 | Finland,18,16,14,12,12,13,13,13,12,12,10,10,9,8,7,6,6,6 68 | France,26,25,23,22,21,20,19,19,18,17,16,16,15,15,15,14,14,14 69 | French Polynesia,34,27,45,42,47,54,43,45,51,45,29,29,29,22,26,27,30,27 70 | Gabon,153,150,148,151,156,151,166,174,200,210,254,271,285,283,296,325,366,406 71 | Gambia,185,189,193,196,200,204,208,212,217,221,225,230,234,239,244,248,253,258 72 | Georgia,39,37,39,43,47,54,62,67,73,78,82,84,84,83,83,84,84,84 73 | Germany,20,19,19,18,17,16,15,15,14,13,11,10,9,8,8,7,7,6 74 | Ghana,223,222,220,219,218,217,216,214,213,212,211,210,209,207,206,205,204,203 75 | Greece,33,32,30,28,27,26,25,24,23,22,21,20,20,19,19,18,18,18 76 | Grenada,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4 77 | Guam,51,50,48,55,73,46,45,45,44,43,39,44,35,40,33,42,29,34 78 | Guatemala,74,74,73,72,72,71,70,70,69,68,68,67,67,66,65,65,64,63 79 | Guinea,119,126,132,139,147,154,163,171,180,190,200,211,222,234,246,259,273,287 80 | Guinea-Bissau,158,161,164,167,170,174,177,181,184,188,192,195,199,203,207,211,216,220 81 | 
Guyana,27,31,26,34,41,55,64,65,71,73,79,91,104,115,118,123,122,122 82 | Haiti,306,306,306,306,306,306,306,306,306,306,306,306,306,306,306,306,306,306 83 | Honduras,98,95,92,89,87,84,82,80,77,75,73,71,69,67,65,63,61,59 84 | Hungary,41,42,44,46,48,48,49,47,44,40,36,33,31,28,25,22,19,17 85 | Iceland,6,6,5,5,5,5,4,4,4,4,4,3,3,2,3,3,4,4 86 | India,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168 87 | Indonesia,343,335,327,319,311,304,297,290,283,276,270,263,257,251,245,239,234,228 88 | Iran,36,37,41,39,40,40,39,35,32,31,31,30,29,27,25,24,23,22 89 | Iraq,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56 90 | Ireland,24,24,23,22,20,18,16,15,16,15,14,13,13,12,12,13,13,13 91 | Israel,14,13,13,12,11,11,11,10,10,9,9,8,8,8,8,8,8,8 92 | Italy,14,13,12,12,11,11,10,10,10,9,9,8,8,8,8,8,7,7 93 | Jamaica,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 94 | Japan,47,46,44,42,41,39,38,38,37,37,34,32,29,28,26,24,23,21 95 | Jordan,17,16,15,14,14,13,13,11,10,9,9,8,8,7,8,8,8,7 96 | Kazakhstan,58,54,54,54,55,62,72,90,111,130,141,146,148,148,144,137,133,129 97 | Kenya,112,114,135,160,192,224,258,302,344,382,405,408,419,436,441,406,371,353 98 | Kiribati,513,503,493,483,474,464,455,446,437,428,420,412,403,396,388,380,372,365 99 | Kuwait,45,42,40,37,36,34,33,32,31,29,28,27,26,25,25,24,24,24 100 | Kyrgyzstan,55,58,58,59,65,77,94,110,125,130,135,135,133,128,125,124,123,121 101 | Laos,179,177,175,173,172,170,168,167,165,163,162,160,159,157,156,154,153,151 102 | Latvia,34,36,37,41,49,60,73,82,85,85,83,83,79,73,68,63,58,53 103 | Lebanon,50,49,49,48,48,46,41,35,32,29,27,23,20,18,17,17,18,19 104 | Lesotho,184,201,218,244,280,323,362,409,461,519,553,576,613,635,643,639,638,637 105 | Liberia,199,203,207,211,215,219,223,228,232,237,242,246,251,256,261,266,272,277 106 | Libyan Arab Jamahiriya,30,29,30,29,28,28,27,25,24,23,22,22,21,20,19,18,18,17 107 | Lithuania,40,42,46,51,58,65,73,80,82,80,77,73,73,68,65,63,66,68 108 | Luxembourg,23,22,21,19,19,18,17,16,16,15,14,14,14,13,13,13,12,12 109 | 
Madagascar,177,181,185,189,192,196,200,205,209,213,217,222,226,231,236,241,246,251 110 | Malawi,258,286,314,343,373,390,389,401,412,417,425,414,416,410,405,391,368,346 111 | Malaysia,118,117,117,116,115,114,113,112,111,110,109,108,108,107,106,105,104,103 112 | Maldives,129,121,114,108,102,96,90,85,80,75,71,67,63,59,56,53,50,47 113 | Mali,275,277,280,282,285,287,290,292,295,297,300,303,305,308,311,313,316,319 114 | Malta,11,11,10,9,9,9,8,8,8,7,7,7,7,6,6,6,6,6 115 | Mauritania,228,232,237,241,246,251,256,261,266,272,277,282,288,294,300,305,312,318 116 | Mauritius,28,27,27,27,26,26,26,25,25,25,24,24,24,24,23,23,23,22 117 | Mexico,61,57,54,50,47,44,41,39,36,34,32,30,28,26,24,23,21,20 118 | "Micronesia, Fed. Sts.",188,181,174,168,161,155,149,143,138,133,128,123,118,114,109,105,101,97 119 | Monaco,4,4,4,4,4,3,3,3,3,3,3,3,3,2,2,2,2,2 120 | Mongolia,205,205,205,205,205,205,205,205,205,205,205,205,205,205,205,205,205,205 121 | Montenegro,,,,,,,,,,,,,,,,33,32, 122 | Montserrat,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8 123 | Morocco,149,145,141,137,133,129,125,122,118,115,112,109,106,103,100,97,94,92 124 | Mozambique,181,201,214,230,245,262,276,297,320,347,378,408,436,455,460,454,443,431 125 | Myanmar,171,171,171,171,171,171,171,171,171,171,171,171,171,171,171,171,171,171 126 | Namibia,322,357,381,409,435,465,491,527,568,616,671,726,776,809,817,808,787,767 127 | Nauru,85,143,140,137,45,132,129,127,124,22,44,33,55,33,110,121,132,33 128 | Nepal,243,238,233,229,224,220,216,211,207,203,199,195,191,187,184,180,176,173 129 | Netherlands,14,13,13,12,11,11,10,10,10,9,9,8,8,8,8,8,8,8 130 | Netherlands Antilles,14,13,13,12,11,11,10,10,10,9,9,8,8,8,8,8,8,7 131 | New Caledonia,93,89,87,63,57,50,58,48,48,41,49,31,32,19,29,22,22,22 132 | New Zealand,10,10,9,9,10,10,10,10,10,11,11,9,10,10,9,9,8,7 133 | Nicaragua,108,103,98,94,89,85,81,78,74,71,68,64,62,59,56,53,51,49 134 | Niger,125,127,130,133,135,138,141,143,146,149,152,155,158,161,164,168,171,174 135 | 
Nigeria,131,145,155,166,176,188,199,214,230,250,272,294,314,328,331,327,319,311 136 | Niue,59,58,56,55,54,53,101,0,0,57,0,0,253,0,0,0,0,0 137 | Northern Mariana Islands,71,101,150,97,93,92,94,166,167,110,121,90,80,66,75,79,69,58 138 | Norway,10,10,9,9,8,8,8,7,7,7,6,6,6,6,6,6,6,6 139 | Oman,26,23,19,16,14,14,13,13,12,12,12,13,12,12,11,12,12,13 140 | Pakistan,181,181,181,181,181,181,181,181,181,181,181,181,181,181,181,181,181,181 141 | Palau,64,44,28,172,275,124,32,92,54,188,52,51,62,50,28,55,66,60 142 | Panama,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47 143 | Papua New Guinea,250,250,250,250,250,250,250,250,250,250,250,250,250,250,250,250,250,250 144 | Paraguay,60,60,60,60,60,60,60,59,59,59,59,59,59,59,59,58,58,58 145 | Peru,317,301,285,270,255,242,229,217,205,195,184,174,165,156,148,140,133,126 146 | Philippines,393,386,380,373,366,360,353,347,341,335,329,323,317,312,306,301,295,290 147 | Poland,52,52,53,53,52,51,48,45,42,38,35,33,32,30,28,27,26,25 148 | Portugal,67,65,62,61,59,59,57,55,53,49,46,45,43,41,37,34,32,30 149 | Puerto Rico,11,10,11,12,12,12,11,10,9,8,7,6,5,5,5,5,4,4 150 | Qatar,60,59,59,64,72,76,71,64,63,66,66,64,60,56,57,58,64,70 151 | "Korea, Rep.",165,145,129,112,106,98,93,87,80,71,72,74,83,80,83,85,87,90 152 | Moldova,65,62,65,71,79,91,103,113,122,130,138,141,140,139,139,140,140,141 153 | Romania,74,79,86,96,105,112,116,121,125,131,136,143,145,146,140,134,125,115 154 | Russian Federation,45,42,46,51,60,73,86,94,100,106,113,112,108,105,105,106,108,110 155 | Rwanda,167,185,197,212,225,241,254,273,294,319,348,376,402,419,423,418,408,397 156 | Saint Kitts and Nevis,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9 157 | Saint Lucia,16,16,16,16,15,15,15,15,15,15,15,15,15,15,15,14,14,14 158 | Saint Vincent and the Grenadines,27,27,27,27,27,27,26,26,26,26,26,26,25,25,25,25,25,25 159 | Samoa,32,31,30,29,28,27,26,26,25,24,23,23,22,21,21,20,19,19 160 | San Marino,12,11,11,10,9,9,9,8,8,8,7,7,7,7,7,6,6,6 161 | Sao Tome and 
Principe,135,133,131,129,126,124,122,120,118,116,114,112,110,108,106,105,103,101 162 | Saudi Arabia,43,38,37,38,41,43,45,46,48,48,47,46,44,43,42,43,45,46 163 | Senegal,195,198,202,206,211,215,219,223,228,232,237,241,246,251,256,261,266,272 164 | Seychelles,43,43,42,41,40,40,39,38,38,37,37,36,35,35,34,33,33,32 165 | Sierra Leone,207,220,233,248,263,279,297,315,334,355,377,400,425,451,479,509,540,574 166 | Singapore,50,50,51,48,47,47,48,48,46,43,37,35,33,32,30,28,27,27 167 | Slovakia,40,41,44,45,44,41,37,35,31,29,26,25,24,22,19,17,17,17 168 | Slovenia,43,40,38,37,34,33,32,30,27,25,23,21,19,17,16,14,14,13 169 | Solomon Islands,312,296,281,267,253,240,228,216,205,195,185,175,166,158,150,142,135,128 170 | Somalia,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249 171 | South Africa,301,301,302,305,309,317,332,360,406,479,576,683,780,852,898,925,940,948 172 | Spain,56,53,50,47,45,43,41,40,38,36,35,33,33,32,31,31,30,30 173 | Sri Lanka,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60 174 | Sudan,174,178,181,185,189,192,196,200,204,208,212,216,221,225,229,234,239,243 175 | Suriname,66,61,49,51,49,51,58,67,79,83,79,79,80,86,91,100,108,116 176 | Swaziland,267,266,260,267,293,337,398,474,558,691,801,916,994,"1,075","1,127","1,141","1,169","1,198" 177 | Sweden,7,7,7,7,7,6,6,6,6,5,5,5,5,5,5,6,6,6 178 | Switzerland,18,18,16,14,14,13,12,11,11,10,9,8,8,8,8,7,7,6 179 | Syrian Arab Republic,61,57,54,51,49,46,43,41,39,37,35,33,31,29,28,26,25,24 180 | Tajikistan,112,95,66,47,51,65,82,86,97,105,117,136,153,165,181,192,211,231 181 | Thailand,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142 182 | "Macedonia, FYR",54,53,53,52,51,48,47,47,45,41,36,33,32,31,31,30,30,29 183 | Timor-Leste,322,322,322,322,322,322,322,322,322,322,322,322,322,322,322,322,322,322 184 | Togo,308,314,320,326,333,339,346,353,360,367,374,382,389,397,405,413,421,429 185 | Tokelau,69,70,72,56,0,150,0,56,56,0,0,0,56,0,56,0,0,0 186 | 
Tonga,34,33,32,32,31,31,30,29,29,28,28,27,27,26,26,25,25,24 187 | Trinidad and Tobago,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11 188 | Tunisia,31,30,31,32,33,31,31,30,28,27,25,24,23,23,24,24,25,26 189 | Turkey,49,48,47,45,43,40,40,41,41,36,31,29,28,28,28,29,29,30 190 | Turkmenistan,64,62,64,62,59,52,60,74,89,93,92,88,85,79,75,70,69,68 191 | Turks and Caicos Islands,26,24,23,22,21,20,19,18,18,17,16,15,15,15,14,14,14,14 192 | Tuvalu,296,287,277,268,259,250,242,234,226,218,211,204,197,191,184,178,172,166 193 | Uganda,163,250,272,296,306,319,314,320,326,324,340,360,386,396,385,370,350,330 194 | Ukraine,41,40,43,46,48,51,58,63,71,76,84,91,95,97,97,102,102,102 195 | United Arab Emirates,30,28,27,25,24,23,22,21,20,19,18,18,18,17,17,16,16,16 196 | United Kingdom,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,14,15,15 197 | Tanzania,178,196,213,229,249,271,290,308,317,327,339,346,352,344,337,325,311,297 198 | Virgin Islands (U.S.),19,18,17,16,15,15,14,14,13,12,12,11,11,11,11,10,10,10 199 | United States of America,9,10,10,9,9,8,7,7,6,6,6,5,5,5,5,5,4,4 200 | Uruguay,28,27,27,27,26,26,26,25,25,25,24,24,24,23,23,23,23,22 201 | Uzbekistan,68,64,63,73,73,76,72,80,85,89,93,102,110,113,113,117,115,113 202 | Vanuatu,139,134,130,125,121,117,113,109,105,102,98,95,92,89,86,83,80,77 203 | Venezuela,35,35,35,35,35,35,35,35,34,34,34,34,34,34,34,34,34,34 204 | Viet Nam,202,200,198,196,195,193,191,189,187,185,183,182,180,178,176,175,173,171 205 | Wallis et Futuna,63,176,32,87,86,47,62,107,54,53,52,7,141,111,48,52,46,15 206 | West Bank and Gaza,35,34,34,33,33,32,31,29,28,27,26,25,24,23,22,21,21,20 207 | Yemen,133,131,132,127,125,122,119,111,106,102,100,96,93,89,85,82,79,76 208 | Zambia,297,349,411,460,501,536,554,576,583,603,602,627,632,652,623,588,547,506 209 | Zimbabwe,329,364,389,417,444,474,501,538,580,628,685,740,791,825,834,824,803,782 -------------------------------------------------------------------------------- 
/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-10-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-10-1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-13-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-13-1.png -------------------------------------------------------------------------------- 
/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-14-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-14-1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-15-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-23-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-23-1.png -------------------------------------------------------------------------------- 
/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-27-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-27-1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-28-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-28-1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-3-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-3-1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-5-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-5-1.png -------------------------------------------------------------------------------- 
/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-7-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-7-1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-8-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-8-1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-9-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-clustering_files/figure-html/unnamed-chunk-9-1.png -------------------------------------------------------------------------------- 
/03-dimensionality-reduction-and-clustering/dimensionality-reduction-clustering-r.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Dimensionality Reduction and Clustering" 3 | author: "Jose A. Dianes" 4 | date: "27 July 2015" 5 | output: 6 | html_document: 7 | keep_md: yes 8 | --- 9 | 10 | ## Getting data 11 | 12 | In R, you use `read.csv` to read CSV files into `data.frame` variables. Although the R function `read.csv` can work with URLs, https is a problem for R in many cases, so you need to use a package like RCurl to get around it. 13 | 14 | ```{r} 15 | library(RCurl) 16 | 17 | # Get and process existing cases file 18 | existing_cases_file <- getURL("https://docs.google.com/spreadsheets/d/1X5Jp7Q8pTs3KLJ5JBWKhncVACGsg5v4xu6badNs4C7I/pub?gid=0&output=csv") 19 | existing_df <- read.csv(text = existing_cases_file, row.names=1, stringsAsFactors=F) 20 | existing_df[c(1,2,3,4,5,6,15,16,17,18)] <- 21 | lapply( existing_df[c(1,2,3,4,5,6,15,16,17,18)], 22 | function(x) { as.integer(gsub(',', '', x)) }) 23 | ``` 24 | 25 | 26 | ## PCA 27 | 28 | The default R package `stats` comes with the function `prcomp()` to perform principal component analysis. This means that we don’t need to install anything (although there are other options using external packages). This is perhaps the quickest way to do a PCA, and I recommend calling `?prcomp` in your R console if you're interested in the details of how to fine-tune the PCA process with this function. 29 | 30 | ```{r} 31 | pca_existing <- prcomp(existing_df, scale. = TRUE) 32 | ``` 33 | 34 | The resulting object contains several pieces of information related to the principal component analysis. We are interested in the scores, which we have in `pca_existing$x`. We got 18 different principal components. 
Remember that the total number of PCs corresponds to the total number of variables in the dataset, although we normally don't use all of them but just the subset that suits our purposes. 35 | 36 | In our case we will use the first two. How much variation is explained by each one? In R we can use the `plot` function that comes with the PCA result for that. 37 | 38 | ```{r} 39 | plot(pca_existing) 40 | ``` 41 | 42 | Most variation is explained by the first PC. So let's use the first two PCs to represent all of our countries in a scatterplot. 43 | 44 | ```{r} 45 | scores_existing_df <- as.data.frame(pca_existing$x) 46 | # Show first two PCs for head countries 47 | head(scores_existing_df[1:2]) 48 | ``` 49 | 50 | Now that we have them in a data frame, we can use them with `plot`. 51 | 52 | ```{r, fig.height=10,fig.width=9} 53 | plot(PC1~PC2, data=scores_existing_df, 54 | main= "Existing TB cases per 100K distribution", 55 | cex = .1, lty = "solid") 56 | text(PC1~PC2, data=scores_existing_df, 57 | labels=rownames(existing_df), 58 | cex=.8) 59 | ``` 60 | 61 | Let's now colour each country according to its mean value across all the years. We will use the functions `rgb`, `colorRamp`, and `rescale` to create a colour palette from yellow (lower values) to blue (higher values). 62 | 63 | ```{r, fig.height=10,fig.width=9} 64 | library(scales) 65 | ramp <- colorRamp(c("yellow", "blue")) 66 | colours_by_mean <- rgb( 67 | ramp( as.vector(rescale(rowMeans(existing_df),c(0,1)))), 68 | max = 255 ) 69 | plot(PC1~PC2, data=scores_existing_df, 70 | main= "Existing TB cases per 100K distribution", 71 | cex = .1, lty = "solid", col=colours_by_mean) 72 | text(PC1~PC2, data=scores_existing_df, 73 | labels=rownames(existing_df), 74 | cex=.8, col=colours_by_mean) 75 | ``` 76 | 77 | Now let's associate colour with the total sum. 
78 | 79 | ```{r, fig.height=10,fig.width=9} 80 | ramp <- colorRamp(c("yellow", "blue")) 81 | colours_by_sum <- rgb( 82 | ramp( as.vector(rescale(rowSums(existing_df),c(0,1)))), 83 | max = 255 ) 84 | plot(PC1~PC2, data=scores_existing_df, 85 | main= "Existing TB cases per 100K distribution", 86 | cex = .1, lty = "solid", col=colours_by_sum) 87 | text(PC1~PC2, data=scores_existing_df, 88 | labels=rownames(existing_df), 89 | cex=.8, col=colours_by_sum) 90 | ``` 91 | 92 | And finally let's associate it with the difference between the first and last years, as a simple way to measure the change over time. 93 | 94 | 95 | ```{r} 96 | existing_df_change <- existing_df$X2007 - existing_df$X1990 97 | ramp <- colorRamp(c("yellow", "blue")) 98 | colours_by_change <- rgb( 99 | ramp( as.vector(rescale(existing_df_change,c(0,1)))), 100 | max = 255 ) 101 | plot(PC1~PC2, data=scores_existing_df, 102 | main= "Existing TB cases per 100K distribution", 103 | cex = .1, lty = "solid", col=colours_by_change) 104 | text(PC1~PC2, data=scores_existing_df, 105 | labels=rownames(existing_df), 106 | cex=.8, col=colours_by_change) 107 | ``` 108 | 109 | As we can see, the colour gradation mostly changes in the direction of the second principal component. That is, while the first PC captures most of the variation within our dataset and this variation is based on the total cases in the 1990-2007 range, the second PC is largely affected by the change over time. 110 | 111 | ## Clustering 112 | 113 | Obtaining clusters in R is as simple as calling `kmeans`. The function has several parameters, but we will just use all the defaults and start trying different values of k. 114 | 115 | Let's start with `k=3`, assuming that, at the very least, there are countries in a really bad situation, countries in a good situation, and some in between. 
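As an aside, a common heuristic for picking candidate values of k is to plot the total within-cluster sum of squares for a range of k and look for an "elbow". The sketch below is self-contained: it uses a small simulated matrix (`toy_df`, a stand-in I introduce here) so it runs on its own; in our case `existing_df` would take its place.

```{r}
set.seed(1234)
# Three well-separated row groups stand in for the real data
toy_df <- rbind(matrix(rnorm(50 * 18, mean = 50,  sd = 10), nrow = 50),
                matrix(rnorm(50 * 18, mean = 300, sd = 10), nrow = 50),
                matrix(rnorm(50 * 18, mean = 700, sd = 10), nrow = 50))
# Total within-cluster sum of squares for k = 1..8
wss <- sapply(1:8, function(k) kmeans(toy_df, centers = k, nstart = 10)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares")
```

The curve flattens sharply once k reaches the true number of groups, which is the "elbow" we look for.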
116 | 117 | ```{r} 118 | set.seed(1234) 119 | existing_clustering <- kmeans(existing_df, centers = 3) 120 | ``` 121 | 122 | The result contains a list with components: 123 | 124 | - `cluster`: A vector of integers indicating the cluster to which each point is allocated. 125 | - `centers`: A matrix of cluster centres. 126 | - `withinss`: The within-cluster sum of squared distances for each cluster. 127 | - `size`: The number of points in each cluster. 128 | 129 | Let's colour our previous scatter plot based on which cluster each country belongs to. 130 | 131 | ```{r, fig.height=10,fig.width=9} 132 | existing_cluster_groups <- existing_clustering$cluster 133 | plot(PC1~PC2, data=scores_existing_df, 134 | main= "Existing TB cases per 100K distribution", 135 | cex = .1, lty = "solid", col=existing_cluster_groups) 136 | text(PC1~PC2, data=scores_existing_df, 137 | labels=rownames(existing_df), 138 | cex=.8, col=existing_cluster_groups) 139 | ``` 140 | 141 | Most clusters are based on the first PC. That means that clusters are just defined in terms of the total number of cases per 100K and not how the data evolved over time (PC2). So let's try with `k=4` and see if some of these clusters are refined in the direction of the second PC. 142 | 143 | ```{r} 144 | set.seed(1234) 145 | existing_clustering <- kmeans(existing_df, centers = 4) 146 | existing_cluster_groups <- existing_clustering$cluster 147 | plot(PC1~PC2, data=scores_existing_df, 148 | main= "Existing TB cases per 100K distribution", 149 | cex = .1, lty = "solid", col=existing_cluster_groups) 150 | text(PC1~PC2, data=scores_existing_df, 151 | labels=rownames(existing_df), 152 | cex=.8, col=existing_cluster_groups) 153 | ``` 154 | 155 | There is more refinement, but again it is in the direction of the first PC. Let's try then with `k=5`. 
156 | 157 | ```{r} 158 | set.seed(1234) 159 | existing_clustering <- kmeans(existing_df, centers = 5) 160 | existing_cluster_groups <- existing_clustering$cluster 161 | plot(PC1~PC2, data=scores_existing_df, 162 | main= "Existing TB cases per 100K distribution", 163 | cex = .1, lty = "solid", col=existing_cluster_groups) 164 | text(PC1~PC2, data=scores_existing_df, 165 | labels=rownames(existing_df), 166 | cex=.8, col=existing_cluster_groups) 167 | ``` 168 | 169 | There we have it. Right in the middle we have a cluster that has been split into two different ones in the direction of the second PC. What if we try `k=6`? 170 | 171 | ```{r} 172 | set.seed(1234) 173 | existing_clustering <- kmeans(existing_df, centers = 6) 174 | existing_cluster_groups <- existing_clustering$cluster 175 | plot(PC1~PC2, data=scores_existing_df, 176 | main= "Existing TB cases per 100K distribution", 177 | cex = .1, lty = "solid", col=existing_cluster_groups) 178 | text(PC1~PC2, data=scores_existing_df, 179 | labels=rownames(existing_df), 180 | cex=.8, col=existing_cluster_groups) 181 | ``` 182 | 183 | We get a diagonal split in the second cluster from the top. That surely contains some interesting information, but let's revert to our `k=5` case; later on we will see how to use a different refinement process when clusters are too tight, like those at the top of the plot. 184 | 185 | ```{r} 186 | set.seed(1234) 187 | existing_clustering <- kmeans(existing_df, centers = 5) 188 | existing_cluster_groups <- existing_clustering$cluster 189 | plot(PC1~PC2, data=scores_existing_df, 190 | main= "Existing TB cases per 100K distribution", 191 | cex = .1, lty = "solid", col=existing_cluster_groups) 192 | text(PC1~PC2, data=scores_existing_df, 193 | labels=rownames(existing_df), 194 | cex=.8, col=existing_cluster_groups) 195 | ``` 196 | 197 | ## Analysing clusters 198 | 199 | Most of the work in this section is about data frame indexing. 
There isn't anything sophisticated about the code we will use, so we will pick one of our languages and perform the whole thing in it (we will use R this time). In order to analyse each cluster, let's add a column to our data frame containing the cluster ID. We will use that for subsetting. 200 | 201 | ```{r} 202 | existing_df$cluster <- existing_clustering$cluster 203 | table(existing_df$cluster) 204 | ``` 205 | 206 | The last line shows how many countries we have in each cluster. 207 | 208 | ### Centroids comparison chart 209 | 210 | Let's start by creating a line chart that compares the time series for each cluster centroid. This chart will help us better understand our cluster results. 211 | 212 | ```{r} 213 | xrange <- 1990:2007 214 | plot(xrange, existing_clustering$centers[1,], 215 | type='l', xlab="Year", 216 | ylab="Existing cases per 100K", 217 | col = 1, 218 | ylim=c(0,1000)) 219 | for (i in 2:nrow(existing_clustering$centers)) { 220 | lines(xrange, existing_clustering$centers[i,], 221 | col = i) 222 | } 223 | legend(x=1990, y=1000, 224 | lty=1, cex = 0.5, 225 | ncol = 5, 226 | col=1:nrow(existing_clustering$centers), 227 | legend=paste("Cluster",1:nrow(existing_clustering$centers))) 228 | ``` 229 | 230 | ### Cluster 1 231 | 232 | Cluster 1 contains just 16 countries. These are: 233 | 234 | ```{r} 235 | rownames(subset(existing_df, cluster==1)) 236 | ``` 237 | 238 | The centroid that represents them is: 239 | 240 | ```{r} 241 | existing_clustering$centers[1,] 242 | ``` 243 | 244 | These are by all means the countries with the most tuberculosis cases every year. We can see in the chart that this is the top line, although the number of cases descends progressively. 245 | 246 | ### Cluster 2 247 | 248 | Cluster 2 contains 30 countries. 
These are: 249 | 250 | ```{r} 251 | rownames(subset(existing_df, cluster==2)) 252 | ``` 253 | 254 | The centroid that represents them is: 255 | 256 | ```{r} 257 | existing_clustering$centers[2,] 258 | ``` 259 | 260 | It is a relatively large cluster. These are still countries with lots of cases, but definitely fewer than in the first cluster. Here we see countries such as India and China, the largest countries on Earth (from a previous tutorial we know that China itself has reduced its cases by 85%), and American countries such as Peru and Bolivia. In fact, this is the cluster with the fastest decrease in the number of existing cases, as we see in the line chart. 261 | 262 | ### Cluster 3 263 | 264 | This is an important one. Cluster 3 contains just 20 countries. These are: 265 | 266 | ```{r} 267 | rownames(subset(existing_df, cluster==3)) 268 | ``` 269 | 270 | The centroid that represents them is: 271 | 272 | ```{r} 273 | existing_clustering$centers[3,] 274 | ``` 275 | 276 | This is the only cluster where the number of cases has increased over the years, and it is about to overtake the first position by 2007. Each of these countries is probably in the middle of a humanitarian crisis and likely being affected by other infectious diseases such as HIV. We can confirm here that PC2 mostly encodes exactly that: the percentage of variation over time in the number of existing cases. 277 | 278 | ### Cluster 4 279 | 280 | The fourth cluster contains 51 countries. 281 | 282 | ```{r} 283 | rownames(subset(existing_df, cluster==4)) 284 | ``` 285 | 286 | It is represented by its centroid. 287 | 288 | ```{r} 289 | existing_clustering$centers[4,] 290 | ``` 291 | 292 | This cluster is pretty close to the last and largest one. It contains many American countries, some European countries, etc. Some of them are large and rich, such as Russia or Brazil. Structurally, the difference from the countries in Cluster 5 may reside in a larger number of cases per 100K. 
They also seem to be decreasing the number of cases slightly faster than Cluster 5. These two reasons made k-means put them into a different group. 293 | 294 | ### Cluster 5 295 | 296 | The last and biggest cluster contains 90 countries. 297 | 298 | ```{r} 299 | rownames(subset(existing_df, cluster==5)) 300 | ``` 301 | 302 | It is represented by its centroid. 303 | 304 | ```{r} 305 | existing_clustering$centers[5,] 306 | ``` 307 | 308 | This cluster is quite heterogeneous and probably needs further refinement. However, it is a good grouping when compared to other, more distant clusters. In any case it contains the countries with the fewest existing cases in our set. 309 | 310 | ### A second level of clustering 311 | 312 | So let's do just that quickly. Let's re-cluster the 90 countries in our Cluster 5 in order to further refine them. Let's use 2 as the number of clusters, since we are just interested in seeing whether there are actually two different clusters within Cluster 5. The reader can of course try to go further and use more than 2 centers. 313 | 314 | ```{r} 315 | # subset the original dataset 316 | cluster5_df <- subset(existing_df, cluster==5) 317 | # do the clustering, leaving out the cluster ID column 318 | set.seed(1234) 319 | cluster5_clustering <- kmeans(cluster5_df[,-19], centers = 2) 320 | # assign the sub-cluster number to the data set for Cluster 5 321 | cluster5_df$cluster <- cluster5_clustering$cluster 322 | ``` 323 | 324 | Now we can plot them in order to see if there are actual differences. 
325 | 326 | ```{r} 327 | xrange <- 1990:2007 328 | plot(xrange, cluster5_clustering$centers[1,], 329 | type='l', xlab="Year", 330 | ylab="Existing cases per 100K", 331 | col = 1, 332 | ylim=c(0,200)) 333 | for (i in 2:nrow(cluster5_clustering$centers)) { 334 | lines(xrange, cluster5_clustering$centers[i,], 335 | col = i) 336 | } 337 | legend(x=1990, y=200, 338 | lty=1, cex = 0.5, 339 | ncol = 5, 340 | col=1:nrow(cluster5_clustering$centers), 341 | legend=paste0("Cluster 5.",1:nrow(cluster5_clustering$centers))) 342 | ``` 343 | 344 | There are actually different tendencies in our data. We can see that there is a group of countries in our original Cluster 5 that is decreasing the number of cases at a faster rate, trying to catch up with those countries with a lower number of existing TB cases per 100K. 345 | 346 | ```{r} 347 | rownames(subset(cluster5_df, cluster5_df$cluster==2)) 348 | ``` 349 | 350 | Meanwhile, the countries with fewer cases and also a slower decreasing rate are: 351 | 352 | ```{r} 353 | rownames(subset(cluster5_df, cluster5_df$cluster==1)) 354 | ``` 355 | 356 | However, we likely wouldn't obtain these clusters by just increasing the number of centers by one in our first clustering of the original dataset. As we said, Cluster 5 seemed like a very cohesive group when compared with more distant countries. This two-step clustering process is a useful technique that we can use with any dataset we want to explore. 
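To keep working with a single cluster column after this second pass, the sub-clusters can be folded back into the original labels, for example relabelling sub-cluster 2 as a new cluster 6. The snippet below is a self-contained sketch using toy stand-ins (`parent` and `sub` are hypothetical vectors I introduce here) for `existing_df$cluster` and `cluster5_clustering$cluster`; with the real objects, the same relabelling line applies.

```{r}
# Toy stand-ins: first-level cluster label per country, and the
# second-level labels computed for the rows that fell in cluster 5
parent <- c(1, 5, 3, 5, 5, 2, 4, 5)
sub <- c(1, 2, 2, 1)
# Keep sub-cluster 1 as cluster 5; promote sub-cluster 2 to a new cluster 6
parent[parent == 5] <- ifelse(sub == 1, 5, 6)
table(parent)
```

This gives one flat labelling with six clusters, which plugs straight back into the `subset` and plotting code used above.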
357 | 358 | 359 | 360 | 361 | -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-reduction-clustering_files/dimensionality-reduction-clustering_24_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-reduction-clustering_files/dimensionality-reduction-clustering_24_0.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-reduction-clustering_files/dimensionality-reduction-clustering_28_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-reduction-clustering_files/dimensionality-reduction-clustering_28_0.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-reduction-clustering_files/dimensionality-reduction-clustering_30_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-reduction-clustering_files/dimensionality-reduction-clustering_30_1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-reduction-clustering_files/dimensionality-reduction-clustering_33_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-reduction-clustering_files/dimensionality-reduction-clustering_33_1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/dimensionality-reduction-clustering_files/dimensionality-reduction-clustering_43_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/03-dimensionality-reduction-and-clustering/dimensionality-reduction-clustering_files/dimensionality-reduction-clustering_43_1.png -------------------------------------------------------------------------------- /03-dimensionality-reduction-and-clustering/tb_existing_100.csv: -------------------------------------------------------------------------------- 1 | "TB prevalence, all forms (per 100 000 population per year)",1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007 2 | Afghanistan,436,429,422,415,407,397,397,387,374,373,346,326,304,308,283,267,251,238 3 | Albania,42,40,41,42,42,43,42,44,43,42,40,34,32,32,29,29,26,22 4 | Algeria,45,44,44,43,43,42,43,44,45,46,48,49,50,51,52,53,55,56 5 | American Samoa,42,14,4,18,17,22,0,25,12,8,8,6,5,6,9,11,9,5 6 | Andorra,39,37,35,33,32,30,28,23,24,22,20,20,21,18,19,18,17,19 7 | Angola,514,514,513,512,510,508,512,363,414,384,530,335,307,281,318,331,302,294 8 | Anguilla,38,38,37,37,36,35,35,36,36,36,35,35,35,35,35,34,34,34 9 | Antigua and Barbuda,16,15,15,14,13,12,12,11,11,9,8,9,7,9,8,8,9,9 10 | Argentina,96,91,86,82,78,74,71,67,63,58,52,51,42,41,39,39,37,35 11 | Armenia,52,49,51,55,60,68,74,75,74,86,94,99,97,91,85,79,79,81 12 | Australia,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6 13 | Austria,18,17,16,15,15,14,13,13,12,12,11,11,11,10,10,10,10,10 14 | 
Azerbaijan,58,55,57,61,67,76,85,91,100,106,113,117,99,109,90,85,86,86 15 | Bahamas,54,53,52,52,53,54,54,54,55,46,45,45,51,51,50,50,50,51 16 | Bahrain,120,113,108,101,97,92,89,86,83,67,57,56,55,53,48,45,45,60 17 | Bangladesh,639,623,608,594,579,576,550,535,516,492,500,491,478,458,444,416,392,387 18 | Barbados,8,8,7,7,6,6,6,6,5,5,5,4,4,4,4,4,3,3 19 | Belarus,62,54,59,62,75,82,91,98,109,113,110,100,89,68,68,68,69,69 20 | Belgium,16,15,15,15,15,14,13,13,12,12,12,13,12,11,11,11,10,9 21 | Belize,65,64,62,59,57,55,37,41,53,53,39,36,36,40,42,38,41,46 22 | Benin,140,138,135,132,129,125,127,129,130,128,128,129,137,139,134,135,134,135 23 | Bermuda,10,10,9,9,8,8,8,8,7,7,7,6,6,6,6,6,6,6 24 | Bhutan,924,862,804,750,699,651,620,597,551,538,515,512,472,460,443,412,406,363 25 | Bolivia,377,362,347,333,320,306,271,264,254,248,238,229,223,218,211,205,202,198 26 | Bosnia and Herzegovina,160,156,154,150,143,134,131,125,96,80,70,63,66,63,55,58,58,55 27 | Botswana,344,355,351,349,347,349,336,349,371,413,445,497,535,586,598,599,621,622 28 | Brazil,124,119,114,109,104,100,97,93,88,86,83,80,77,72,63,60,56,60 29 | British Virgin Islands,32,30,28,26,25,23,22,21,20,19,19,18,18,17,16,17,16,16 30 | Brunei Darussalam,91,91,91,91,91,91,91,88,88,93,108,85,78,73,63,55,59,65 31 | Bulgaria,43,48,54,57,58,57,59,65,68,68,64,63,52,42,40,41,40,41 32 | Burkina Faso,179,196,208,221,233,246,251,271,286,308,338,368,398,419,426,421,411,403 33 | Burundi,288,302,292,293,305,322,339,346,424,412,455,522,581,619,639,654,657,647 34 | Cambodia,928,905,881,858,836,811,810,789,777,764,758,750,728,712,696,676,672,664 35 | Cameroon,188,199,200,199,197,197,196,207,212,219,228,241,240,227,228,213,201,195 36 | Canada,7,7,7,6,6,6,5,5,5,5,5,4,4,4,4,4,4,4 37 | Cape Verde,449,438,428,418,408,398,394,391,387,384,380,283,374,370,367,278,285,280 38 | Cayman Islands,10,10,9,9,8,8,8,8,7,7,7,5,6,6,5,5,6,5 39 | Central African Republic,318,336,342,350,356,365,270,395,419,449,485,495,468,566,574,507,437,425 40 | 
Chad,251,272,282,294,304,315,354,408,433,390,420,450,502,573,548,518,505,497 41 | Chile,45,41,38,35,32,30,28,25,24,22,21,19,19,18,15,15,13,12 42 | China,327,321,315,309,303,303,290,283,276,273,269,265,259,241,220,206,200,194 43 | Colombia,88,85,82,79,76,73,71,69,67,61,51,62,60,58,55,53,44,43 44 | Comoros,188,177,167,157,148,140,130,155,120,143,112,103,104,107,99,91,86,83 45 | "Congo, Rep.",209,222,231,243,255,269,424,457,367,545,313,354,402,509,477,482,511,485 46 | Cook Islands,0,10,57,47,38,19,10,13,0,40,12,29,11,0,15,9,16,31 47 | Costa Rica,30,28,27,26,25,24,23,22,21,19,14,14,15,14,12,12,12,11 48 | Croatia,126,123,121,118,113,106,103,102,99,89,76,73,69,68,67,65,65,54 49 | Cuba,32,29,26,24,22,20,18,17,15,14,13,12,11,10,9,8,8,7 50 | Cyprus,14,13,13,12,11,11,11,10,7,7,9,8,7,6,6,6,6,6 51 | Czech Republic,22,22,22,21,21,21,21,21,19,18,16,14,13,12,11,11,10,9 52 | Cote d'Ivoire,292,304,306,309,312,319,329,350,376,413,472,571,561,590,604,613,597,582 53 | "Korea, Dem. Rep.",841,828,815,802,788,775,775,775,775,770,713,650,577,527,499,508,500,441 54 | "Congo, Dem. 
Rep.",275,306,327,352,376,411,420,466,472,528,592,643,697,708,710,702,692,666 55 | Denmark,12,12,11,10,10,9,9,8,8,8,7,7,7,7,7,6,7,6 56 | Djibouti,"1,485","1,477","1,463","1,442","1,414","1,381",720,669,698,701,761,775,932,960,"1,034","1,046","1,093","1,104" 57 | Dominica,24,24,24,23,23,22,22,18,20,20,20,22,20,20,20,21,13,19 58 | Dominican Republic,183,173,164,156,148,141,135,132,128,122,119,115,102,93,90,85,84,82 59 | Ecuador,282,271,259,249,238,228,221,212,207,200,194,185,170,162,155,155,148,140 60 | Egypt,48,47,47,45,45,44,51,46,43,40,36,34,32,31,29,28,27,27 61 | El Salvador,133,126,119,112,105,99,97,80,76,72,69,66,62,60,57,52,50,48 62 | Equatorial Guinea,169,181,187,194,200,207,216,222,236,253,274,441,470,490,370,366,358,469 63 | Eritrea,245,245,242,239,235,232,232,225,203,114,114,111,118,110,122,127,133,134 64 | Estonia,50,50,56,66,77,85,88,98,102,105,72,68,62,56,50,46,44,39 65 | Ethiopia,312,337,351,366,383,403,396,397,420,464,486,539,569,601,613,612,604,579 66 | Fiji,68,65,62,58,55,53,49,49,46,40,42,35,36,29,33,31,30,30 67 | Finland,14,12,11,10,9,10,10,10,9,9,8,7,7,6,6,5,5,5 68 | France,21,20,19,18,17,16,15,15,14,14,13,12,12,12,12,11,11,11 69 | French Polynesia,67,55,91,83,93,107,55,48,56,54,40,42,32,29,28,31,31,32 70 | Gabon,359,340,325,318,316,293,312,320,359,366,434,249,302,299,288,332,358,379 71 | Gambia,350,350,349,347,344,341,324,321,311,485,491,499,335,343,341,366,399,404 72 | Georgia,51,48,50,54,59,66,73,104,87,90,98,95,95,94,90,86,83,83 73 | Germany,15,15,14,14,13,13,12,11,11,10,9,8,7,6,6,6,5,5 74 | Ghana,533,519,502,480,455,432,426,388,384,382,368,358,359,358,359,357,355,353 75 | Greece,30,29,27,25,24,23,22,22,21,20,19,18,18,17,17,16,16,16 76 | Grenada,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,6 77 | Guam,103,101,96,110,146,93,91,89,87,86,44,45,44,47,41,42,39,36 78 | Guatemala,113,111,108,106,103,100,95,94,93,92,90,91,89,89,86,85,84,87 79 | Guinea,241,248,255,262,269,275,277,293,305,317,332,346,363,380,391,425,426,448 80 | 
Guinea-Bissau,404,403,402,399,395,390,390,387,385,386,273,276,305,296,287,283,270,276 81 | Guyana,39,43,34,43,50,67,78,81,90,93,98,112,126,136,130,132,133,136 82 | Haiti,479,464,453,443,435,429,428,426,417,407,403,397,388,380,377,368,368,366 83 | Honduras,141,133,128,123,119,115,114,112,106,98,70,70,72,71,72,71,70,71 84 | Hungary,67,68,70,72,73,73,74,72,67,47,43,39,36,33,29,26,22,19 85 | Iceland,5,4,4,4,4,4,3,3,3,3,3,3,2,2,2,3,3,3 86 | India,586,577,566,555,542,525,517,501,487,476,443,411,389,349,311,299,290,283 87 | Indonesia,443,430,417,404,392,380,369,359,348,335,326,314,297,287,274,261,251,244 88 | Iran,50,51,56,54,55,55,61,52,45,41,40,38,37,35,32,31,29,27 89 | Iraq,88,88,88,88,88,88,84,84,82,80,71,69,65,67,71,75,78,79 90 | Ireland,19,18,18,17,15,14,12,12,12,12,12,11,10,10,10,10,10,11 91 | Israel,11,10,10,9,9,8,8,8,8,7,7,7,6,6,6,6,6,6 92 | Italy,11,10,10,9,9,8,9,8,7,7,7,7,7,6,6,6,6,6 93 | Jamaica,10,10,10,10,9,9,7,7,7,7,7,7,7,7,7,7,7,7 94 | Japan,62,60,58,56,53,51,50,50,49,48,45,41,39,36,34,32,30,28 95 | Jordan,19,18,17,17,16,15,20,18,12,11,11,9,9,9,9,8,9,9 96 | Kazakhstan,95,87,85,84,85,94,109,137,163,134,141,148,150,155,152,147,144,139 97 | Kenya,125,120,134,152,177,207,233,277,313,351,393,384,392,402,410,388,340,319 98 | Kiribati,"1,026","1,006",986,966,947,928,910,853,571,556,546,607,587,477,439,419,405,423 99 | Kuwait,89,84,80,75,72,68,66,64,61,35,33,33,30,29,29,30,25,25 100 | Kyrgyzstan,90,93,93,93,101,118,141,165,147,146,156,169,153,145,139,136,135,134 101 | Laos,428,424,420,415,411,407,373,360,352,344,344,337,330,324,313,298,291,289 102 | Latvia,56,57,59,63,75,91,77,89,92,95,91,89,85,78,72,66,61,55 103 | Lebanon,64,64,63,62,62,59,64,54,50,37,35,30,26,24,22,21,23,23 104 | Lesotho,225,231,229,228,232,242,248,264,298,518,356,370,399,408,414,421,408,568 105 | Liberia,476,473,469,465,462,461,418,424,396,403,435,437,382,429,370,416,393,398 106 | Libyan Arab Jamahiriya,46,45,45,43,43,42,41,38,36,23,22,22,21,20,19,18,18,17 107 | 
Lithuania,64,66,71,79,89,98,110,119,125,120,115,96,83,72,72,66,65,69 108 | Luxembourg,19,18,17,16,15,14,14,13,13,12,11,11,11,10,10,10,10,9 109 | Madagascar,367,368,369,369,370,370,339,345,346,352,359,371,382,375,384,408,400,417 110 | Malawi,380,376,365,355,353,348,337,342,345,349,362,350,358,353,346,342,324,305 111 | Malaysia,159,158,156,155,153,151,147,173,170,167,135,133,132,128,128,126,123,121 112 | Maldives,143,130,118,107,97,88,88,101,89,94,96,84,83,69,71,63,69,48 113 | Mali,640,631,621,609,597,583,573,566,565,567,571,573,572,578,584,589,593,599 114 | Malta,10,9,9,8,8,7,7,7,7,6,6,6,5,5,5,5,5,5 115 | Mauritania,585,587,590,592,594,595,622,615,612,615,619,624,632,642,494,565,556,559 116 | Mauritius,53,51,50,48,47,45,62,61,45,40,39,42,40,39,38,39,39,39 117 | Mexico,101,93,86,80,74,68,64,58,52,48,42,38,35,33,31,27,25,23 118 | "Micronesia, Fed. Sts.",263,253,244,234,225,217,204,287,276,265,173,171,152,142,128,124,112,100 119 | Monaco,3,3,3,3,3,3,3,3,2,2,2,2,2,2,2,2,2,2 120 | Mongolia,477,477,477,477,477,477,333,342,307,281,297,273,258,258,233,232,217,234 121 | Montserrat,14,14,14,14,14,13,13,13,13,13,13,13,13,10,13,10,12,8 122 | Morocco,134,130,127,123,119,116,107,106,105,99,98,95,87,91,89,85,82,80 123 | Mozambique,287,313,328,343,356,369,386,408,432,461,499,535,556,569,567,551,528,504 124 | Myanmar,411,400,389,379,370,361,298,309,312,298,267,238,202,175,168,161,161,162 125 | Namibia,650,685,687,683,671,658,387,395,411,442,481,506,544,560,572,570,556,532 126 | Nauru,170,285,280,274,90,263,258,253,248,44,44,56,57,48,162,121,174,33 127 | Nepal,629,607,585,564,543,523,498,473,448,363,312,304,285,271,260,247,246,240 128 | Netherlands,11,10,10,9,9,8,8,8,8,7,7,7,6,6,6,6,6,6 129 | Netherlands Antilles,28,27,25,24,23,22,21,20,19,18,17,17,17,16,16,15,15,15 130 | New Caledonia,112,107,104,76,69,60,58,97,97,51,51,43,34,28,29,29,25,25 131 | New Zealand,10,10,9,9,10,11,10,10,11,11,11,10,10,10,10,9,8,7 132 | Nicaragua,145,137,129,122,114,108,100,97,93,89,85,80,79,73,69,68,64,56 
133 | Niger,317,318,319,319,319,318,322,292,281,281,278,280,288,275,287,285,289,292 134 | Nigeria,282,307,321,336,350,366,379,399,423,452,489,526,563,575,573,563,543,521 135 | Niue,118,115,113,111,109,106,202,0,0,114,0,0,506,0,0,0,0,0 136 | Northern Mariana Islands,142,201,301,194,186,185,188,331,334,220,135,120,95,83,80,83,83,72 137 | Norway,8,8,8,7,7,6,6,6,6,5,5,5,5,5,4,4,4,4 138 | Oman,40,36,29,25,22,22,15,15,14,14,13,14,13,13,12,13,13,14 139 | Pakistan,430,428,427,426,424,422,421,421,415,420,413,406,376,355,333,289,260,223 140 | Palau,96,66,43,260,414,187,53,92,54,376,104,102,69,64,31,102,74,71 141 | Panama,74,73,71,70,69,68,67,67,65,64,60,51,48,49,44,44,44,45 142 | Papua New Guinea,498,498,497,497,496,496,494,493,491,489,486,482,477,471,463,453,441,430 143 | Paraguay,95,93,92,91,89,88,71,92,92,91,90,89,88,85,85,81,74,73 144 | Peru,394,368,343,320,298,278,270,251,230,222,210,198,187,182,167,155,143,136 145 | Philippines,799,783,766,750,735,719,705,689,669,649,600,578,561,542,534,520,505,500 146 | Poland,88,87,86,85,83,79,74,68,63,58,53,50,35,34,33,31,29,28 147 | Portugal,51,49,47,45,44,43,42,41,39,38,36,34,33,32,29,27,24,23 148 | Puerto Rico,17,15,17,18,18,18,15,13,12,10,9,8,7,6,6,6,6,5 149 | Qatar,71,69,69,74,84,89,87,84,75,78,78,78,75,71,71,69,77,81 150 | "Korea, Rep.",223,196,174,150,142,132,105,98,89,107,113,112,126,108,112,118,122,126 151 | Moldova,105,99,103,111,122,138,157,171,191,203,215,174,211,176,152,151,151,151 152 | Romania,118,125,134,147,159,167,174,184,129,194,197,206,180,185,178,148,138,128 153 | Russian Federation,69,64,70,78,91,111,132,142,155,160,164,158,148,140,135,121,117,115 154 | Rwanda,190,211,226,243,259,278,297,316,339,383,442,503,549,581,607,607,595,590 155 | Saint Kitts and Nevis,17,17,16,16,16,15,16,15,11,12,15,13,12,14,13,15,14,12 156 | Saint Lucia,26,26,25,25,25,24,23,17,16,18,20,18,17,19,18,18,18,18 157 | Saint Vincent and the Grenadines,45,45,44,43,42,42,42,41,38,41,35,36,36,34,36,36,34,39 158 | 
Samoa,36,35,34,33,32,31,35,33,50,31,27,33,28,28,24,27,26,25 159 | San Marino,9,9,8,8,7,7,7,7,7,6,6,6,6,5,5,5,5,5 160 | Sao Tome and Principe,346,335,325,315,304,295,290,285,290,276,272,266,261,266,255,256,252,240 161 | Saudi Arabia,68,60,59,60,64,67,71,73,76,72,67,65,62,60,60,60,62,65 162 | Senegal,380,379,379,378,377,376,372,388,397,424,420,430,443,441,454,456,461,468 163 | Seychelles,113,110,106,103,100,96,66,59,71,90,52,53,42,66,52,57,56,55 164 | Sierra Leone,465,479,492,504,517,534,525,565,602,636,675,696,743,784,830,866,902,941 165 | Singapore,52,52,53,50,49,49,50,50,48,44,39,36,34,32,31,28,27,27 166 | Slovakia,55,56,59,59,56,51,46,42,38,35,32,30,29,26,25,21,20,20 167 | Slovenia,66,62,59,57,53,50,35,35,32,29,27,25,22,21,19,16,16,15 168 | Solomon Islands,625,593,563,534,506,480,380,354,339,322,300,286,277,254,229,204,197,180 169 | Somalia,597,587,577,566,555,543,465,444,446,431,414,398,391,362,334,325,341,352 170 | South Africa,769,726,676,620,562,502,480,466,465,426,515,581,586,649,676,707,690,692 171 | Spain,44,42,40,37,35,34,33,30,30,28,27,26,26,25,24,24,24,23 172 | Sri Lanka,109,106,104,102,99,97,102,93,90,89,107,99,88,89,87,75,80,79 173 | Sudan,409,404,402,402,403,405,409,417,378,382,375,389,363,371,376,384,391,402 174 | Suriname,109,100,79,80,76,78,88,101,118,122,115,113,113,120,126,136,146,155 175 | Swaziland,629,590,527,477,448,441,460,504,556,647,740,832,693,739,776,788,801,812 176 | Sweden,5,5,6,6,5,5,5,4,4,4,4,4,4,4,4,4,4,5 177 | Switzerland,14,13,12,11,10,10,9,8,8,8,7,6,6,6,6,5,5,5 178 | Syrian Arab Republic,94,89,84,80,75,71,67,61,54,48,41,37,35,33,31,30,29,27 179 | Tajikistan,193,162,112,79,85,106,134,141,159,169,191,221,248,256,277,282,301,322 180 | Thailand,336,319,307,297,291,285,285,279,256,231,223,194,197,189,188,184,189,192 181 | "Macedonia, FYR",92,90,89,86,83,77,74,73,72,65,56,39,40,37,34,34,34,33 182 | Timor-Leste,706,694,681,669,656,644,644,644,644,644,644,644,345,359,367,370,385,378 183 | 
Togo,702,687,668,647,628,614,613,658,637,647,656,669,701,693,702,713,726,750 184 | Tokelau,139,140,143,112,0,301,0,112,112,0,0,0,112,0,112,0,0,0 185 | Tonga,45,44,43,43,42,41,38,38,31,34,34,42,35,36,39,32,34,28 186 | Trinidad and Tobago,17,17,17,16,16,16,16,16,15,15,15,16,15,15,15,15,15,15 187 | Tunisia,49,46,49,51,51,49,48,46,44,31,30,28,27,26,27,27,28,28 188 | Turkey,83,79,77,73,68,62,62,63,64,57,49,45,44,43,44,44,32,34 189 | Turkmenistan,105,99,101,97,92,80,92,114,137,142,130,115,110,103,98,91,85,75 190 | Turks and Caicos Islands,42,40,37,35,33,31,30,29,28,17,16,23,23,22,22,22,18,17 191 | Tuvalu,593,573,554,535,518,500,484,467,452,437,422,408,394,381,368,245,261,203 192 | Uganda,206,313,342,377,394,418,419,342,357,359,391,411,447,476,472,469,450,426 193 | Ukraine,67,64,67,72,75,78,87,93,104,109,120,128,133,135,132,113,99,102 194 | United Arab Emirates,47,44,42,39,38,36,34,33,31,30,27,27,27,25,25,24,24,24 195 | United Kingdom,9,9,10,10,9,9,9,9,9,9,9,9,9,10,10,11,11,12 196 | Tanzania,215,228,240,252,269,283,301,324,333,347,364,367,383,380,373,364,353,337 197 | Virgin Islands (U.S.),30,28,27,25,24,23,19,18,17,19,19,18,18,17,17,16,16,16 198 | United States of America,7,7,7,7,6,6,6,5,5,4,4,4,4,4,3,3,3,3 199 | Uruguay,35,34,33,32,31,30,28,27,28,28,27,25,27,25,23,24,25,23 200 | Uzbekistan,114,105,102,118,116,119,111,122,129,134,139,148,144,152,149,144,134,140 201 | Vanuatu,278,268,259,250,242,234,226,218,211,159,143,128,149,128,118,131,104,102 202 | Venezuela,46,45,44,43,42,42,41,41,40,39,39,41,41,39,38,38,38,39 203 | Viet Nam,365,361,358,354,350,346,312,273,261,253,248,243,235,234,226,227,222,220 204 | Wallis et Futuna,126,352,64,174,172,93,123,213,107,105,103,13,275,147,63,57,60,25 205 | West Bank and Gaza,55,54,54,52,52,50,49,46,44,42,40,39,37,36,35,33,32,31 206 | Yemen,265,261,263,253,250,244,233,207,194,175,164,154,149,146,138,137,135,130 207 | Zambia,436,456,494,526,556,585,602,626,634,657,658,680,517,478,468,453,422,387 208 | 
Zimbabwe,409,417,415,419,426,439,453,481,392,430,479,523,571,632,652,680,699,714 -------------------------------------------------------------------------------- /04-sentiment-analysis/sentiment-analysis-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Sentiment Analysis" 3 | author: "Jose A. Dianes" 4 | date: "3 August 2015" 5 | output: 6 | html_document: 7 | keep_md: yes 8 | --- 9 | 10 | 11 | ## Getting and preparing data 12 | 13 | In R, you use `read.csv` to read CSV files into `data.frame` variables. Although the R function `read.csv` can work with URLs, https is a problem for R in many cases, so you need to use a package like RCurl to get around it. Moreover, from the Kaggle page description we know that the file is tab-separated, there is no header, and we need to disable quoting, since some sentences include quotes that would otherwise break the file parsing. 14 | 15 | 16 | ```{r} 17 | library(RCurl) 18 | 19 | test_data_url <- "https://kaggle2.blob.core.windows.net/competitions-data/inclass/2558/testdata.txt?sv=2012-02-12&se=2015-08-06T10%3A32%3A23Z&sr=b&sp=r&sig=a8lqVKO0%2FLjN4hMrFo71sPcnMzltKk1HN8m7OPolArw%3D" 20 | train_data_url <- "https://kaggle2.blob.core.windows.net/competitions-data/inclass/2558/training.txt?sv=2012-02-12&se=2015-08-06T10%3A34%3A08Z&sr=b&sp=r&sig=meGjVzfSsvayeJiDdKY9S6C9ep7qW8v74M6XzON0YQk%3D" 21 | 22 | test_data_file <- getURL(test_data_url) 23 | train_data_file <- getURL(train_data_url) 24 | 25 | train_data_df <- read.csv( 26 | text = train_data_file, 27 | sep='\t', 28 | header=FALSE, 29 | quote = "", 30 | stringsAsFactors=FALSE, 31 | col.names=c("Sentiment", "Text")) 32 | test_data_df <- read.csv( 33 | text = test_data_file, 34 | sep='\t', 35 | header=FALSE, 36 | quote = "", 37 | stringsAsFactors=FALSE, 38 | col.names=c("Text")) 39 | # we need to convert Sentiment to factor 40 | train_data_df$Sentiment <- as.factor(train_data_df$Sentiment) 41 | ``` 42 | 43 | Now we have our data
in data frames. We have 7086 sentences for the training data and 33052 sentences for the test data. The sentences are in a column named `Text` and the sentiment tag (just for training data) in a column named `Sentiment`. Let's have a look at the first few lines of the training data. 44 | 45 | ```{r} 46 | head(train_data_df) 47 | ``` 48 | 49 | We can also get a glimpse at how tags are distributed. In R we can use `table`. 50 | 51 | ```{r} 52 | table(train_data_df$Sentiment) 53 | ``` 54 | 55 | That is, we have data more or less evenly distributed, with 3091 negatively tagged sentences, and 3995 positively tagged sentences. How long on average are our sentences in words? 56 | 57 | ```{r} 58 | mean(sapply(sapply(train_data_df$Text, strsplit, " "), length)) 59 | ``` 60 | 61 | About 10.8 words in length. 62 | 63 | ## Preparing a corpus 64 | 65 | > In linguistics, a corpus (plural corpora) or text corpus is a large and structured set of texts (nowadays usually electronically stored and processed). They are used to do statistical analysis and hypothesis testing, checking occurrences or validating linguistic rules within a specific language territory. 66 | > Source: [Wikipedia](https://en.wikipedia.org/wiki/Text_corpus) 67 | 68 | In this section we will process our text sentences and create a corpus. We will also extract important words and establish them as input variables for our classifier. 69 | 70 | ```{r} 71 | library(tm) 72 | corpus <- Corpus(VectorSource(c(train_data_df$Text, test_data_df$Text))) 73 | corpus 74 | ``` 75 | 76 | Let's explain what we just did. First we used both test and train data, since we need to consider every possible word in our corpus. Then we created a `VectorSource`, which is the input type for the `Corpus` function defined in the package `tm`. That gives us a `VCorpus` object, which is basically a collection of content+metadata objects, where the content contains our sentences. For example, the content of the first document looks like this.
77 | 78 | ```{r} 79 | corpus[[1]]$content 80 | ``` 81 | 82 | In order to make use of this corpus, we need to transform its contents as follows. 83 | 84 | ```{r} 85 | corpus <- tm_map(corpus, tolower) 86 | corpus <- tm_map(corpus, PlainTextDocument) 87 | corpus <- tm_map(corpus, removePunctuation) 88 | corpus <- tm_map(corpus, removeWords, stopwords("english")) 89 | corpus <- tm_map(corpus, stripWhitespace) 90 | corpus <- tm_map(corpus, stemDocument) 91 | ``` 92 | 93 | First we put everything in lowercase. The second transformation is needed in order to have each document in the format we will need later on. Then we remove punctuation and English stopwords, strip whitespace, and [stem](https://en.wikipedia.org/wiki/Stemming) each word. The first entry now looks like this. 94 | 95 | ```{r} 96 | corpus[[1]]$content 97 | ``` 98 | 99 | On our way to finding input features for our classifier, we want to put this corpus in the shape of a document-term matrix. A document-term matrix is a numeric matrix containing a column for each distinct word in our whole corpus, and a row for each document. A given cell contains the frequency of a given term in a given document. 100 | 101 | This is how we do it in R. 102 | 103 | ```{r} 104 | dtm <- DocumentTermMatrix(corpus) 105 | dtm 106 | ``` 107 | 108 | If we consider each column as a term for our model, we will end up with a very complex model with 8383 different features. This will make the model slow and probably not very efficient. Some terms or words are more important than others, and we want to remove those that are not so important. We can use the function `removeSparseTerms` from the `tm` package, where we pass the matrix and a number that gives the maximal allowed sparsity for a term in our corpus. For example, if we want terms that appear in at least 1% of the documents we can do as follows. 109 | 110 | ```{r} 111 | sparse <- removeSparseTerms(dtm, 0.99) 112 | sparse 113 | ``` 114 | 115 | We end up with just 85 terms.
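To get a feel for how this sparsity threshold behaves, here is a small sketch on a made-up three-sentence corpus (the toy sentences are ours, not the Kaggle data): a value close to 1 keeps nearly every term, while a lower value keeps only terms shared across most documents.

```r
library(tm)

# A toy corpus, just to illustrate the sparsity threshold
toy_corpus <- Corpus(VectorSource(c(
  "the movie was great great fun",
  "the movie was awful",
  "such a great movie indeed")))
toy_dtm <- DocumentTermMatrix(toy_corpus)

# A threshold close to 1 keeps every term of this tiny corpus...
removeSparseTerms(toy_dtm, 0.99)
# ...while a lower one keeps only terms present in most of the documents
removeSparseTerms(toy_dtm, 0.4)
```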
The closer that value is to 1, the more terms we will keep in our `sparse` object, since the number of documents a term needs to appear in becomes smaller. 116 | 117 | Now we want to convert this matrix into a data frame that we can use to train a classifier in the next section. 118 | 119 | ```{r} 120 | important_words_df <- as.data.frame(as.matrix(sparse)) 121 | colnames(important_words_df) <- make.names(colnames(important_words_df)) 122 | # split into train and test 123 | important_words_train_df <- head(important_words_df, nrow(train_data_df)) 124 | important_words_test_df <- tail(important_words_df, nrow(test_data_df)) 125 | 126 | # Add to original dataframes 127 | train_data_words_df <- cbind(train_data_df, important_words_train_df) 128 | test_data_words_df <- cbind(test_data_df, important_words_test_df) 129 | 130 | # Get rid of the original Text field 131 | train_data_words_df$Text <- NULL 132 | test_data_words_df$Text <- NULL 133 | ``` 134 | 135 | Now we are ready to train our first classifier. 136 | 137 | ## A bag-of-words linear classifier 138 | 139 | The approach we are using here is called a [bag-of-words model](https://en.wikipedia.org/wiki/Bag-of-words_model). In this kind of model we simplify documents to a multiset of term frequencies. That means that, for our model, a document sentiment tag will depend on what words appear in that document, discarding any grammar or word order but keeping multiplicity. 140 | 141 | But first of all we need to split our train data into train and test data. Why do we do that if we already have a testing set? Simple. The test set from the Kaggle competition doesn't have tags at all (obviously). If we want to assess our model accuracy we need a test set with sentiment tags to compare our results. We will split using `sample.split` from the [`caTools`](https://cran.r-project.org/web/packages/caTools/index.html) package.
142 | 143 | ```{r} 144 | library(caTools) 145 | set.seed(1234) 146 | # first we create an index with 85% TRUE values based on Sentiment 147 | spl <- sample.split(train_data_words_df$Sentiment, .85) 148 | # now we use it to split our data into train and test 149 | eval_train_data_df <- train_data_words_df[spl==T,] 150 | eval_test_data_df <- train_data_words_df[spl==F,] 151 | ``` 152 | 153 | Building linear models is something that is at the very heart of R. Therefore it is very easy, requiring just a single function call. 154 | 155 | ```{r} 156 | log_model <- glm(Sentiment~., data=eval_train_data_df, family=binomial) 157 | summary(log_model) 158 | ``` 159 | 160 | The first parameter is a formula in the form `Output~Input`, where the `.` on the input side means to use every single variable but the output one. Then we pass the data frame and `family=binomial`, which means we want to use logistic regression. 161 | 162 | The summary function gives us really good insight into the model we just built. The coefficients section lists all the input variables used in the model. A series of asterisks at the very end of each one gives us its importance, with `***` being the greatest significance level, and `**` or `*` being also important. These stars relate to the values in the `Pr(>|z|)` column. For example, we see that the stem `awesom` has great significance, with a high positive `Estimate` value. That means that a document with that stem is very likely to be tagged with sentiment 1 (positive). We see the opposite case with the stem `hate`. We also see that there are many terms that don't seem to have great significance. 163 | 164 | So let's use our model with the test data. 165 | 166 | ```{r} 167 | log_pred <- predict(log_model, newdata=eval_test_data_df, type="response") 168 | ``` 169 | 170 | The previous `predict` called with `type="response"` will return probabilities (see [logistic regression](https://en.wikipedia.org/wiki/Logistic_regression)).
Let's say that we want a .5 threshold for a document to be classified as positive (Sentiment tag equals 1). Then we can calculate accuracy as follows. 171 | 172 | ```{r} 173 | # Calculate accuracy based on prob 174 | table(eval_test_data_df$Sentiment, log_pred>.5) 175 | ``` 176 | 177 | The cases where our model performed properly are given by the diagonal. 178 | 179 | ```{r} 180 | (453 + 590) / nrow(eval_test_data_df) 181 | ``` 182 | 183 | This is a very good accuracy. It seems that our bag-of-words approach works nicely with this particular problem. 184 | 185 | We know we don't have tags on the given test dataset. Still, we will try something. We will use our model to tag its entries, then take a random sample of them and visually inspect how they are tagged. We can do this quickly in R as follows. 186 | 187 | ```{r} 188 | log_pred_test <- predict(log_model, newdata=test_data_words_df, type="response") 189 | 190 | test_data_df$Sentiment <- log_pred_test>.5 191 | 192 | set.seed(1234) 193 | spl_test <- sample.split(test_data_df$Sentiment, .0005) 194 | test_data_sample_df <- test_data_df[spl_test==T,] 195 | ``` 196 | 197 | So let's check which entries have been classified as positive. 198 | 199 | ```{r} 200 | test_data_sample_df[test_data_sample_df$Sentiment==T, c('Text')] 201 | ``` 202 | 203 | And negative ones. 204 | 205 | ```{r} 206 | test_data_sample_df[test_data_sample_df$Sentiment==F, c('Text')] 207 | ``` 208 | 209 | So judge for yourself. Is our classifier doing a good job at all?
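As a final note, the threshold-then-count pattern we used for accuracy can be wrapped in a small helper, so different cut-offs are easy to compare. The function name `accuracy_at` and the toy labels below are our own illustration, not part of the original analysis.

```r
# Hypothetical helper: accuracy of probability predictions at a given cut-off
accuracy_at <- function(actual, probs, threshold = 0.5) {
  # turn probabilities into 0/1 predictions at the chosen threshold
  predicted <- as.integer(probs > threshold)
  # compare with the actual tags (a factor of "0"/"1" levels)
  mean(predicted == as.integer(as.character(actual)))
}

# Toy usage with made-up tags and probabilities
actual <- factor(c(1, 0, 1, 1, 0))
probs <- c(0.9, 0.2, 0.6, 0.4, 0.1)
accuracy_at(actual, probs)  # 0.8 on this toy example
```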
210 | 211 | -------------------------------------------------------------------------------- /05-regularisation/tb_new_100.csv: -------------------------------------------------------------------------------- 1 | "TB incidence, all forms (per 100 000 population per year)",1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007 2 | Afghanistan,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168 3 | Albania,25,24,25,26,26,27,27,28,28,27,25,23,23,22,21,20,18,17 4 | Algeria,38,38,39,40,41,42,43,44,46,47,48,49,50,51,53,54,55,57 5 | American Samoa,21,7,2,9,9,11,0,12,6,8,6,6,4,5,9,10,7,5 6 | Andorra,36,34,32,30,29,27,26,26,25,23,22,21,21,20,20,19,19,19 7 | Angola,205,209,214,218,222,226,231,236,240,245,250,255,260,265,270,276,281,287 8 | Anguilla,24,24,24,24,23,23,23,23,23,23,23,22,22,22,22,22,22,22 9 | Antigua and Barbuda,10,10,9,9,8,8,8,7,7,7,6,6,6,6,6,6,6,5 10 | Argentina,60,57,55,53,51,49,47,45,44,42,40,39,37,36,35,33,32,31 11 | Armenia,33,32,33,37,41,47,53,58,63,67,71,72,72,71,71,72,72,72 12 | Australia,7,7,7,7,7,7,7,7,6,6,6,6,6,6,6,6,6,6 13 | Austria,23,22,21,20,19,18,17,17,16,15,14,14,14,13,13,13,13,12 14 | Azerbaijan,35,34,36,39,43,50,56,62,67,71,75,77,77,76,76,77,77,77 15 | Bahamas,44,44,44,44,44,44,44,44,44,44,44,44,44,44,44,44,44,44 16 | Bahrain,76,72,68,64,61,58,57,55,52,49,47,46,45,43,42,42,41,40 17 | Bangladesh,264,261,259,256,253,251,248,246,244,241,239,236,234,232,229,227,225,223 18 | Barbados,7,6,6,6,6,5,5,5,5,4,4,4,4,4,4,4,4,4 19 | Belarus,38,34,38,40,49,54,60,65,72,76,73,66,60,60,61,61,61,61 20 | Belgium,20,19,18,19,19,18,17,16,15,15,16,16,15,14,13,13,13,12 21 | Belize,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40,40 22 | Benin,77,77,78,79,80,80,81,82,83,84,85,86,86,87,88,89,90,91 23 | Bermuda,7,6,6,6,6,5,5,5,5,4,4,4,4,4,4,4,4,4 24 | Bhutan,540,516,492,470,449,428,409,391,373,356,340,325,310,296,283,270,258,246 25 | Bolivia,255,247,240,233,226,220,213,207,201,195,190,184,179,174,169,164,159,155 
26 | Bosnia and Herzegovina,94,92,93,92,89,84,83,81,79,71,63,58,56,55,53,52,52,51 27 | Botswana,307,341,364,390,415,444,468,503,542,588,640,692,740,772,780,770,751,731 28 | Brazil,84,81,78,76,73,71,69,67,64,62,60,58,57,55,53,51,50,48 29 | British Virgin Islands,19,18,17,16,15,15,14,14,13,12,12,11,11,11,11,11,10,10 30 | Brunei Darussalam,58,58,58,58,58,58,58,58,58,93,102,70,73,64,53,48,59,59 31 | Bulgaria,27,30,34,37,38,38,39,43,45,45,44,44,43,39,39,40,39,39 32 | Burkina Faso,95,105,112,120,128,137,145,155,168,182,198,214,229,239,241,238,232,226 33 | Burundi,154,171,182,196,208,223,235,252,272,295,321,347,371,387,391,387,377,367 34 | Cambodia,585,579,574,568,563,557,552,546,541,536,530,525,520,515,510,505,500,495 35 | Cameroon,81,89,95,102,109,116,123,132,142,154,168,181,194,202,204,202,197,192 36 | Canada,10,9,9,8,8,7,7,7,7,6,6,6,6,5,5,5,5,5 37 | Cape Verde,175,174,172,171,169,168,166,165,163,162,160,159,157,156,155,153,152,151 38 | Cayman Islands,7,6,6,6,6,5,5,5,5,4,4,4,4,4,4,4,4,4 39 | Central African Republic,145,161,172,184,196,209,221,237,256,277,302,327,349,364,368,363,354,345 40 | Chad,125,139,149,159,170,181,191,205,221,240,262,283,302,315,318,315,307,299 41 | Chile,38,35,33,31,29,27,25,24,22,21,19,18,17,16,15,14,13,12 42 | China,116,115,114,113,112,111,110,109,108,106,105,104,103,102,101,100,99,98 43 | Colombia,53,52,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35 44 | Comoros,85,82,79,75,72,69,67,64,61,59,56,54,52,50,48,46,44,42 45 | "Congo, Rep.",169,188,200,215,229,245,258,277,299,324,353,382,408,425,430,425,414,403 46 | Cook Islands,0,6,37,31,24,12,6,13,0,20,7,14,7,0,8,8,8,15 47 | Costa Rica,18,18,17,17,16,16,15,15,14,14,14,13,13,12,12,12,11,11 48 | Croatia,74,73,73,72,70,66,65,64,62,56,49,46,44,43,42,41,41,40 49 | Cuba,25,23,22,20,18,17,16,14,13,12,11,10,10,9,8,8,7,6 50 | Cyprus,9,9,8,8,7,7,7,7,6,6,6,6,5,5,5,5,5,5 51 | Czech Republic,21,21,21,20,20,20,20,20,19,17,15,14,13,12,11,10,10,9 52 | Cote 
d'Ivoire,177,196,209,224,239,255,269,289,312,338,368,398,425,444,448,443,432,420 53 | "Korea, Dem. Rep.",344,344,344,344,344,344,344,344,344,344,344,344,344,344,344,344,344,344 54 | "Congo, Dem. Rep.",165,182,195,209,222,238,251,269,290,315,343,371,396,413,417,413,402,392 55 | Denmark,15,14,14,13,12,12,11,11,10,10,9,9,9,9,8,8,8,8 56 | Djibouti,582,594,606,618,630,642,655,668,681,695,708,722,737,751,766,781,797,813 57 | Dominica,15,15,15,14,14,14,14,14,14,14,14,14,14,14,14,13,13,13 58 | Dominican Republic,114,111,108,104,101,99,96,93,90,88,85,83,80,78,76,73,71,69 59 | Ecuador,167,162,157,153,148,144,140,136,132,128,124,121,117,114,111,107,104,101 60 | Egypt,37,36,36,35,34,34,33,31,29,28,27,26,26,24,23,22,22,21 61 | El Salvador,82,79,75,72,69,66,64,61,58,56,54,51,49,47,45,43,41,40 62 | Equatorial Guinea,108,119,127,136,145,155,164,176,190,206,224,242,259,270,273,270,263,256 63 | Eritrea,72,73,74,76,77,78,79,81,82,84,85,86,88,89,91,92,94,95 64 | Estonia,32,32,35,42,48,53,58,64,66,68,66,62,55,50,46,43,40,38 65 | Ethiopia,159,176,188,201,215,229,242,260,280,304,331,358,383,399,403,398,388,378 66 | Fiji,51,48,46,43,41,39,37,35,33,32,30,28,27,26,24,23,22,21 67 | Finland,18,16,14,12,12,13,13,13,12,12,10,10,9,8,7,6,6,6 68 | France,26,25,23,22,21,20,19,19,18,17,16,16,15,15,15,14,14,14 69 | French Polynesia,34,27,45,42,47,54,43,45,51,45,29,29,29,22,26,27,30,27 70 | Gabon,153,150,148,151,156,151,166,174,200,210,254,271,285,283,296,325,366,406 71 | Gambia,185,189,193,196,200,204,208,212,217,221,225,230,234,239,244,248,253,258 72 | Georgia,39,37,39,43,47,54,62,67,73,78,82,84,84,83,83,84,84,84 73 | Germany,20,19,19,18,17,16,15,15,14,13,11,10,9,8,8,7,7,6 74 | Ghana,223,222,220,219,218,217,216,214,213,212,211,210,209,207,206,205,204,203 75 | Greece,33,32,30,28,27,26,25,24,23,22,21,20,20,19,19,18,18,18 76 | Grenada,5,5,5,5,5,4,4,4,4,4,4,4,4,4,4,4,4,4 77 | Guam,51,50,48,55,73,46,45,45,44,43,39,44,35,40,33,42,29,34 78 | Guatemala,74,74,73,72,72,71,70,70,69,68,68,67,67,66,65,65,64,63 
79 | Guinea,119,126,132,139,147,154,163,171,180,190,200,211,222,234,246,259,273,287 80 | Guinea-Bissau,158,161,164,167,170,174,177,181,184,188,192,195,199,203,207,211,216,220 81 | Guyana,27,31,26,34,41,55,64,65,71,73,79,91,104,115,118,123,122,122 82 | Haiti,306,306,306,306,306,306,306,306,306,306,306,306,306,306,306,306,306,306 83 | Honduras,98,95,92,89,87,84,82,80,77,75,73,71,69,67,65,63,61,59 84 | Hungary,41,42,44,46,48,48,49,47,44,40,36,33,31,28,25,22,19,17 85 | Iceland,6,6,5,5,5,5,4,4,4,4,4,3,3,2,3,3,4,4 86 | India,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168,168 87 | Indonesia,343,335,327,319,311,304,297,290,283,276,270,263,257,251,245,239,234,228 88 | Iran,36,37,41,39,40,40,39,35,32,31,31,30,29,27,25,24,23,22 89 | Iraq,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56,56 90 | Ireland,24,24,23,22,20,18,16,15,16,15,14,13,13,12,12,13,13,13 91 | Israel,14,13,13,12,11,11,11,10,10,9,9,8,8,8,8,8,8,8 92 | Italy,14,13,12,12,11,11,10,10,10,9,9,8,8,8,8,8,7,7 93 | Jamaica,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 94 | Japan,47,46,44,42,41,39,38,38,37,37,34,32,29,28,26,24,23,21 95 | Jordan,17,16,15,14,14,13,13,11,10,9,9,8,8,7,8,8,8,7 96 | Kazakhstan,58,54,54,54,55,62,72,90,111,130,141,146,148,148,144,137,133,129 97 | Kenya,112,114,135,160,192,224,258,302,344,382,405,408,419,436,441,406,371,353 98 | Kiribati,513,503,493,483,474,464,455,446,437,428,420,412,403,396,388,380,372,365 99 | Kuwait,45,42,40,37,36,34,33,32,31,29,28,27,26,25,25,24,24,24 100 | Kyrgyzstan,55,58,58,59,65,77,94,110,125,130,135,135,133,128,125,124,123,121 101 | Laos,179,177,175,173,172,170,168,167,165,163,162,160,159,157,156,154,153,151 102 | Latvia,34,36,37,41,49,60,73,82,85,85,83,83,79,73,68,63,58,53 103 | Lebanon,50,49,49,48,48,46,41,35,32,29,27,23,20,18,17,17,18,19 104 | Lesotho,184,201,218,244,280,323,362,409,461,519,553,576,613,635,643,639,638,637 105 | Liberia,199,203,207,211,215,219,223,228,232,237,242,246,251,256,261,266,272,277 106 | Libyan Arab 
Jamahiriya,30,29,30,29,28,28,27,25,24,23,22,22,21,20,19,18,18,17 107 | Lithuania,40,42,46,51,58,65,73,80,82,80,77,73,73,68,65,63,66,68 108 | Luxembourg,23,22,21,19,19,18,17,16,16,15,14,14,14,13,13,13,12,12 109 | Madagascar,177,181,185,189,192,196,200,205,209,213,217,222,226,231,236,241,246,251 110 | Malawi,258,286,314,343,373,390,389,401,412,417,425,414,416,410,405,391,368,346 111 | Malaysia,118,117,117,116,115,114,113,112,111,110,109,108,108,107,106,105,104,103 112 | Maldives,129,121,114,108,102,96,90,85,80,75,71,67,63,59,56,53,50,47 113 | Mali,275,277,280,282,285,287,290,292,295,297,300,303,305,308,311,313,316,319 114 | Malta,11,11,10,9,9,9,8,8,8,7,7,7,7,6,6,6,6,6 115 | Mauritania,228,232,237,241,246,251,256,261,266,272,277,282,288,294,300,305,312,318 116 | Mauritius,28,27,27,27,26,26,26,25,25,25,24,24,24,24,23,23,23,22 117 | Mexico,61,57,54,50,47,44,41,39,36,34,32,30,28,26,24,23,21,20 118 | "Micronesia, Fed. Sts.",188,181,174,168,161,155,149,143,138,133,128,123,118,114,109,105,101,97 119 | Monaco,4,4,4,4,4,3,3,3,3,3,3,3,3,2,2,2,2,2 120 | Mongolia,205,205,205,205,205,205,205,205,205,205,205,205,205,205,205,205,205,205 121 | Montenegro,,,,,,,,,,,,,,,,33,32, 122 | Montserrat,9,9,9,9,9,8,8,8,8,8,8,8,8,8,8,8,8,8 123 | Morocco,149,145,141,137,133,129,125,122,118,115,112,109,106,103,100,97,94,92 124 | Mozambique,181,201,214,230,245,262,276,297,320,347,378,408,436,455,460,454,443,431 125 | Myanmar,171,171,171,171,171,171,171,171,171,171,171,171,171,171,171,171,171,171 126 | Namibia,322,357,381,409,435,465,491,527,568,616,671,726,776,809,817,808,787,767 127 | Nauru,85,143,140,137,45,132,129,127,124,22,44,33,55,33,110,121,132,33 128 | Nepal,243,238,233,229,224,220,216,211,207,203,199,195,191,187,184,180,176,173 129 | Netherlands,14,13,13,12,11,11,10,10,10,9,9,8,8,8,8,8,8,8 130 | Netherlands Antilles,14,13,13,12,11,11,10,10,10,9,9,8,8,8,8,8,8,7 131 | New Caledonia,93,89,87,63,57,50,58,48,48,41,49,31,32,19,29,22,22,22 132 | New 
Zealand,10,10,9,9,10,10,10,10,10,11,11,9,10,10,9,9,8,7 133 | Nicaragua,108,103,98,94,89,85,81,78,74,71,68,64,62,59,56,53,51,49 134 | Niger,125,127,130,133,135,138,141,143,146,149,152,155,158,161,164,168,171,174 135 | Nigeria,131,145,155,166,176,188,199,214,230,250,272,294,314,328,331,327,319,311 136 | Niue,59,58,56,55,54,53,101,0,0,57,0,0,253,0,0,0,0,0 137 | Northern Mariana Islands,71,101,150,97,93,92,94,166,167,110,121,90,80,66,75,79,69,58 138 | Norway,10,10,9,9,8,8,8,7,7,7,6,6,6,6,6,6,6,6 139 | Oman,26,23,19,16,14,14,13,13,12,12,12,13,12,12,11,12,12,13 140 | Pakistan,181,181,181,181,181,181,181,181,181,181,181,181,181,181,181,181,181,181 141 | Palau,64,44,28,172,275,124,32,92,54,188,52,51,62,50,28,55,66,60 142 | Panama,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47 143 | Papua New Guinea,250,250,250,250,250,250,250,250,250,250,250,250,250,250,250,250,250,250 144 | Paraguay,60,60,60,60,60,60,60,59,59,59,59,59,59,59,59,58,58,58 145 | Peru,317,301,285,270,255,242,229,217,205,195,184,174,165,156,148,140,133,126 146 | Philippines,393,386,380,373,366,360,353,347,341,335,329,323,317,312,306,301,295,290 147 | Poland,52,52,53,53,52,51,48,45,42,38,35,33,32,30,28,27,26,25 148 | Portugal,67,65,62,61,59,59,57,55,53,49,46,45,43,41,37,34,32,30 149 | Puerto Rico,11,10,11,12,12,12,11,10,9,8,7,6,5,5,5,5,4,4 150 | Qatar,60,59,59,64,72,76,71,64,63,66,66,64,60,56,57,58,64,70 151 | "Korea, Rep.",165,145,129,112,106,98,93,87,80,71,72,74,83,80,83,85,87,90 152 | Moldova,65,62,65,71,79,91,103,113,122,130,138,141,140,139,139,140,140,141 153 | Romania,74,79,86,96,105,112,116,121,125,131,136,143,145,146,140,134,125,115 154 | Russian Federation,45,42,46,51,60,73,86,94,100,106,113,112,108,105,105,106,108,110 155 | Rwanda,167,185,197,212,225,241,254,273,294,319,348,376,402,419,423,418,408,397 156 | Saint Kitts and Nevis,10,10,10,10,10,10,10,10,10,10,10,10,10,10,9,9,9,9 157 | Saint Lucia,16,16,16,16,15,15,15,15,15,15,15,15,15,15,15,14,14,14 158 | Saint Vincent and the 
Grenadines,27,27,27,27,27,27,26,26,26,26,26,26,25,25,25,25,25,25 159 | Samoa,32,31,30,29,28,27,26,26,25,24,23,23,22,21,21,20,19,19 160 | San Marino,12,11,11,10,9,9,9,8,8,8,7,7,7,7,7,6,6,6 161 | Sao Tome and Principe,135,133,131,129,126,124,122,120,118,116,114,112,110,108,106,105,103,101 162 | Saudi Arabia,43,38,37,38,41,43,45,46,48,48,47,46,44,43,42,43,45,46 163 | Senegal,195,198,202,206,211,215,219,223,228,232,237,241,246,251,256,261,266,272 164 | Seychelles,43,43,42,41,40,40,39,38,38,37,37,36,35,35,34,33,33,32 165 | Sierra Leone,207,220,233,248,263,279,297,315,334,355,377,400,425,451,479,509,540,574 166 | Singapore,50,50,51,48,47,47,48,48,46,43,37,35,33,32,30,28,27,27 167 | Slovakia,40,41,44,45,44,41,37,35,31,29,26,25,24,22,19,17,17,17 168 | Slovenia,43,40,38,37,34,33,32,30,27,25,23,21,19,17,16,14,14,13 169 | Solomon Islands,312,296,281,267,253,240,228,216,205,195,185,175,166,158,150,142,135,128 170 | Somalia,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249,249 171 | South Africa,301,301,302,305,309,317,332,360,406,479,576,683,780,852,898,925,940,948 172 | Spain,56,53,50,47,45,43,41,40,38,36,35,33,33,32,31,31,30,30 173 | Sri Lanka,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60 174 | Sudan,174,178,181,185,189,192,196,200,204,208,212,216,221,225,229,234,239,243 175 | Suriname,66,61,49,51,49,51,58,67,79,83,79,79,80,86,91,100,108,116 176 | Swaziland,267,266,260,267,293,337,398,474,558,691,801,916,994,"1,075","1,127","1,141","1,169","1,198" 177 | Sweden,7,7,7,7,7,6,6,6,6,5,5,5,5,5,5,6,6,6 178 | Switzerland,18,18,16,14,14,13,12,11,11,10,9,8,8,8,8,7,7,6 179 | Syrian Arab Republic,61,57,54,51,49,46,43,41,39,37,35,33,31,29,28,26,25,24 180 | Tajikistan,112,95,66,47,51,65,82,86,97,105,117,136,153,165,181,192,211,231 181 | Thailand,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142,142 182 | "Macedonia, FYR",54,53,53,52,51,48,47,47,45,41,36,33,32,31,31,30,30,29 183 | 
Timor-Leste,322,322,322,322,322,322,322,322,322,322,322,322,322,322,322,322,322,322 184 | Togo,308,314,320,326,333,339,346,353,360,367,374,382,389,397,405,413,421,429 185 | Tokelau,69,70,72,56,0,150,0,56,56,0,0,0,56,0,56,0,0,0 186 | Tonga,34,33,32,32,31,31,30,29,29,28,28,27,27,26,26,25,25,24 187 | Trinidad and Tobago,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11,11 188 | Tunisia,31,30,31,32,33,31,31,30,28,27,25,24,23,23,24,24,25,26 189 | Turkey,49,48,47,45,43,40,40,41,41,36,31,29,28,28,28,29,29,30 190 | Turkmenistan,64,62,64,62,59,52,60,74,89,93,92,88,85,79,75,70,69,68 191 | Turks and Caicos Islands,26,24,23,22,21,20,19,18,18,17,16,15,15,15,14,14,14,14 192 | Tuvalu,296,287,277,268,259,250,242,234,226,218,211,204,197,191,184,178,172,166 193 | Uganda,163,250,272,296,306,319,314,320,326,324,340,360,386,396,385,370,350,330 194 | Ukraine,41,40,43,46,48,51,58,63,71,76,84,91,95,97,97,102,102,102 195 | United Arab Emirates,30,28,27,25,24,23,22,21,20,19,18,18,18,17,17,16,16,16 196 | United Kingdom,12,12,12,12,12,12,12,12,12,12,12,12,12,13,13,14,15,15 197 | Tanzania,178,196,213,229,249,271,290,308,317,327,339,346,352,344,337,325,311,297 198 | Virgin Islands (U.S.),19,18,17,16,15,15,14,14,13,12,12,11,11,11,11,10,10,10 199 | United States of America,9,10,10,9,9,8,7,7,6,6,6,5,5,5,5,5,4,4 200 | Uruguay,28,27,27,27,26,26,26,25,25,25,24,24,24,23,23,23,23,22 201 | Uzbekistan,68,64,63,73,73,76,72,80,85,89,93,102,110,113,113,117,115,113 202 | Vanuatu,139,134,130,125,121,117,113,109,105,102,98,95,92,89,86,83,80,77 203 | Venezuela,35,35,35,35,35,35,35,35,34,34,34,34,34,34,34,34,34,34 204 | Viet Nam,202,200,198,196,195,193,191,189,187,185,183,182,180,178,176,175,173,171 205 | Wallis et Futuna,63,176,32,87,86,47,62,107,54,53,52,7,141,111,48,52,46,15 206 | West Bank and Gaza,35,34,34,33,33,32,31,29,28,27,26,25,24,23,22,21,21,20 207 | Yemen,133,131,132,127,125,122,119,111,106,102,100,96,93,89,85,82,79,76 208 | 
Zambia,297,349,411,460,501,536,554,576,583,603,602,627,632,652,623,588,547,506 209 | Zimbabwe,329,364,389,417,444,474,501,538,580,628,685,740,791,825,834,824,803,782 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This repository contains a variety of content; some developed by Jose A. Dianes, and some from third-parties. The third-party content is distributed under the license provided by those parties. 2 | 3 | The content developed by Jose A. Dianes is distributed under the following license: 4 | 5 | Copyright 2016 Jose A Dianes 6 | 7 | Licensed under the Apache License, Version 2.0 (the "License"); 8 | you may not use this file except in compliance with the License. 9 | You may obtain a copy of the License at 10 | 11 | http://www.apache.org/licenses/LICENSE-2.0 12 | 13 | Unless required by applicable law or agreed to in writing, software 14 | distributed under the License is distributed on an "AS IS" BASIS, 15 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | See the License for the specific language governing permissions and 17 | limitations under the License. 
18 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Science Engineering, your way 2 | 3 | [![Join the chat at https://gitter.im/jadianes/data-science-your-way](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/jadianes/data-science-your-way?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 4 | 5 | ##### An introduction to different Data Science engineering concepts and Applications using Python and R 6 | 7 | This series of tutorials on Data Science engineering will compare how different concepts in the discipline can be implemented in the two dominant ecosystems nowadays: R and Python. 8 | 9 | We will do this from a neutral point of view. Our opinion is that each environment has good and bad things, and any data scientist should know how to use both in order to be as prepared as possible for the job market or to start a personal project. 10 | 11 | To get a feeling of what is going on regarding this hot topic, we refer the reader to [DataCamp's Data Science War](http://blog.datacamp.com/r-or-python-for-data-analysis/) infographic. Their infographic explores what the strengths of **R** are over **Python** and vice versa, and aims to provide a basic comparison between these two programming languages from a data science and statistics perspective. 12 | 13 | Far from being a repetition of the previous, our series of tutorials will go hands-on into how to actually perform different data science tasks, such as working with data frames, doing aggregations, or creating statistical models in the areas of supervised and unsupervised learning. 14 | 15 | We will use real-world datasets, and we will build some real data products. This will help us to quickly transfer what we learn here to actual data analysis situations.
16 | 17 | If you are interested in Big Data products, then you might find our series of [tutorials on using Apache Spark and Python](https://github.com/jadianes/spark-py-notebooks) or [using R on Apache Spark (SparkR)](https://github.com/jadianes/spark-r-notebooks) interesting. 18 | 19 | ## Tutorials 20 | 21 | This is a growing list of tutorials explaining concepts and applications in Python and R. 22 | 23 | ### [Introduction to Data Frames](https://github.com/jadianes/data-science-your-way/blob/master/01-data-frames/README.md) 24 | 25 | An introduction to the basic data structure and how to use it in Python/Pandas and R. 26 | 27 | ### [Exploratory Data Analysis](https://github.com/jadianes/data-science-your-way/blob/master/02-exploratory-data-analysis/README.md) 28 | 29 | About this important task in any data science engineering project. 30 | 31 | ### [Dimensionality Reduction and Clustering](https://github.com/jadianes/data-science-your-way/blob/master/03-dimensionality-reduction-and-clustering/README.md) 32 | About using Principal Component Analysis and k-means Clustering to better represent and understand our data. 33 | 34 | ### [Text Mining and Sentiment Classification](https://github.com/jadianes/data-science-your-way/blob/master/04-sentiment-analysis/README.md) 35 | 36 | How to use text mining techniques to analyse the positive or non-positive sentiment of text documents using just *linear methods*. 37 | 38 | ## Applications 39 | 40 | These are some of the applications we have built using the concepts explained in the tutorials. 41 | 42 | ### [A web-based Sentiment Classifier using R and Shiny](https://github.com/jadianes/data-science-your-way/blob/master/apps/sentimentclassifier/README.md) 43 | 44 | How to build a web application where we can upload text documents to be sentiment-analysed using the R-based framework [Shiny](http://shiny.rstudio.com/).
45 | 46 | ### [Building Data Products with Python](https://github.com/jadianes/data-science-your-way/blob/master/apps/winerama/README.md) 47 | 48 | Using a [wine reviews and recommendations website](http://jadianes.koding.io:8000/reviews/) as a leitmotif, this series of tutorials, with [its own separate repository](https://github.com/jadianes/winerama-recommender-tutorial) tagged by lessons, digs into how to use Python technologies such as Django, Pandas, or Scikit-learn, in order to build data products. 49 | 50 | ### [Red Wine Quality Data analysis with R](https://github.com/jadianes/data-science-your-way/blob/master/apps/wine-quality-data-analysis/README.md) 51 | 52 | Using R and ggplot2, we perform Exploratory Data Analysis of this reference dataset about wine quality. 53 | 54 | ### [Information Retrieval algorithms with Python](https://github.com/jadianes/data-science-your-way/blob/master/apps/information-retrieval/README.md) 55 | 56 | Where we show our own implementation of a couple of Information Retrieval algorithms: the vector space model and tf-idf. 57 | 58 | ### [Kaggle - The Analytics Edge (Spring 2015)](https://github.com/jadianes/data-science-your-way/blob/master/apps/kaggle-analytics-edge-15/) 59 | 60 | My solution to this Kaggle competition. It was part of the edX MOOC [The Analytics Edge](https://www.edx.org/course/analytics-edge-mitx-15-071x-0). I highly recommend this on-line course. It is one of the most applied courses I have ever taken on using R for data analysis and machine learning. 61 | 62 | ## Contributing 63 | 64 | Contributions are welcome! For bug reports or requests please [submit an issue](https://github.com/jadianes/data-science-your-way/issues). 65 | 66 | ## Contact 67 | 68 | Feel free to contact me to discuss any issues, questions, or comments.
69 | 70 | * Twitter: [@ja_dianes](https://twitter.com/ja_dianes) 71 | * GitHub: [jadianes](https://github.com/jadianes) 72 | * LinkedIn: [jadianes](https://www.linkedin.com/in/jadianes) 73 | * Website: [jadianes.me](http://data.jadianes.com) 74 | 75 | ## License 76 | 77 | This repository contains a variety of content; some developed by Jose A. Dianes, and some from third-parties. The third-party content is distributed under the license provided by those parties. 78 | 79 | The content developed by Jose A. Dianes is distributed under the following license: 80 | 81 | Copyright 2016 Jose A Dianes 82 | 83 | Licensed under the Apache License, Version 2.0 (the "License"); 84 | you may not use this file except in compliance with the License. 85 | You may obtain a copy of the License at 86 | 87 | http://www.apache.org/licenses/LICENSE-2.0 88 | 89 | Unless required by applicable law or agreed to in writing, software 90 | distributed under the License is distributed on an "AS IS" BASIS, 91 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 92 | See the License for the specific language governing permissions and 93 | limitations under the License. 94 | -------------------------------------------------------------------------------- /apps/information-retrieval/README.md: -------------------------------------------------------------------------------- 1 | IrPy Notebooks 2 | ==== 3 | 4 | Information Retrieval algorithms and data structures in Python. 
Mostly in-memory, using dictionaries. 5 | 6 | ## Notebooks 7 | 8 | [On tf-idf](https://github.com/jadianes/data-science-your-way/blob/master/apps/information-retrieval/tf-idf.ipynb) 9 | [Scoring using the Vector Space Model](https://github.com/jadianes/data-science-your-way/blob/master/apps/information-retrieval/Vector%20Space.ipynb) 10 | 11 | -------------------------------------------------------------------------------- /apps/information-retrieval/Vector Space.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "", 4 | "signature": "sha256:e586c7ce52656e5b15e6a5af32250e16c3a08411e7da2c1afb89acb747e3d8d6" 5 | }, 6 | "nbformat": 3, 7 | "nbformat_minor": 0, 8 | "worksheets": [ 9 | { 10 | "cells": [ 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "## Scoring using the Vector Space Model" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "Previously we discussed tf-idf as a way to calculate how relevant a search term is given a set of indexed documents. For multiple terms, we used the *overlap score measure*, consisting of the sum of the *tf-idf* for each term in the given input. A more general and flexible way of scoring multi-term searches is using the **vector space model**. " 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "In the vector space model, each document in the corpus is represented by a vector. The search input terms are also represented by a vector. Scoring search results then consists of vector operations between the document vectors and the search-terms vector. \n", 30 | "\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "But what is each of these vectors made of? Basically they define term frequencies. That is, each dimension in a vector represents the term frequency for a given term.
Then, a document is represented in a multi-dimensional space by a vector of the frequencies of each of the words in the corpus. Equally, a search input is represented by a vector in the same space but using the input terms. " 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### Python implementation " 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "The following is a Python representation of this concept. " 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "collapsed": false, 57 | "input": [ 58 | "import re\n", 59 | "\n", 60 | "class IrIndex:\n", 61 | " \"\"\"An in-memory inverted index\"\"\"\n", 62 | "\n", 63 | " pattern = re.compile(\"^\s+|\s*,*\s*|\s+$\")\n", 64 | "\n", 65 | " def __init__(self):\n", 66 | " self.index = {}\n", 67 | " self.documents = []\n", 68 | " self.vectors = []\n", 69 | "\n", 70 | " def index_document(self, document):\n", 71 | " # split\n", 72 | " terms = [word for word in self.pattern.split(document)]\n", 73 | " # add to documents\n", 74 | " self.documents.append(document)\n", 75 | " document_pos = len(self.documents) - 1\n", 76 | " # add posts to index, while creating document vector\n", 77 | " vector = {}\n", 78 | " for term in terms:\n", 79 | " if term not in self.index:\n", 80 | " self.index[term] = []\n", 81 | " self.index[term].append(document_pos)\n", 82 | " if term not in vector:\n", 83 | " vector[term] = 1\n", 84 | " else:\n", 85 | " vector[term] += 1\n", 86 | " # add the vector\n", 87 | " self.vectors.append(vector)" 88 | ], 89 | "language": "python", 90 | "metadata": {}, 91 | "outputs": [], 92 | "prompt_number": 27 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "We use the same `IrIndex` class and a boolean search schema. The difference is when calculating scores. We don't store a precalculated `tf` structure anymore but operate vectors directly.
\n", 99 | "\n", 100 | "In terms of complexity, when indexing we have moved from: \n", 101 | "* Recalculate every `tf` entry when indexing a new document. This involves lookup + sum for each term in the document. \n", 102 | "\n", 103 | "To: \n", 104 | "* Indexing stage: store a new document as a vector of `tf` values. Here we save the recalculation of `tf` entries. \n", 105 | "\n", 106 | "So as we can see, the indexing stage is simpler (and more scalable) when using the vector space model. This scalability gain when indexing is not to be overlooked. In an index with hundreds of thousands or even millions of terms, indexing a new large document and recalculating term frequencies at a global scale can be costly. We could calculate term frequencies when searching instead, but then using the vector space model makes even more sense. \n", 107 | "\n", 108 | "Next, the search and scoring part. " 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "collapsed": false, 114 | "input": [ 115 | "from numpy import array, dot\n", 116 | "from math import log\n", 117 | "\n", 118 | "def create_tfidf_list(self, *args):\n", 119 | " if len(args) == 1:\n", 120 | " res = [tf for tf in args[0].itervalues()]\n", 121 | " elif len(args) == 2:\n", 122 | " res = []\n", 123 | " for term in args[0].iterkeys():\n", 124 | " if term in args[1]:\n", 125 | " idf = log(float(len(self.documents)) / float(len(self.index[term])))\n", 126 | " res.append(args[1][term] * idf)\n", 127 | " else:\n", 128 | " res.append(0)\n", 129 | " return res\n", 130 | "\n", 131 | "def create_tf_dictionary(self, terms):\n", 132 | " res = {}\n", 133 | " for term in self.pattern.split(terms):\n", 134 | " if term not in res:\n", 135 | " res[term] = terms.count(term)\n", 136 | " return res\n", 137 | "\n", 138 | "def vector_space_search(self, terms):\n", 139 | " res = []\n", 140 | " hits = {}\n", 141 | " # create a numeric vector from terms\n", 142 | " terms_tf_dictionary = self.create_tf_dictionary(terms)\n", 143 | " 
terms_tfidf_list = self.create_tfidf_list(terms_tf_dictionary)\n", 144 | " # create a numeric vector for each hitting document\n", 145 | " hitting_terms = [term for term in self.pattern.split(terms) if term in self.index]\n", 146 | " for term in hitting_terms: # for each term having at least one hit...\n", 147 | " for post in self.index[term]: # for each document create the numeric vector\n", 148 | " if post not in hits:\n", 149 | " tfidf_list = self.create_tfidf_list(terms_tf_dictionary, self.vectors[post])\n", 150 | " hits[post] = tfidf_list\n", 151 | " # do the dot products\n", 152 | " for post in hits.iterkeys():\n", 153 | " score = dot(array(terms_tfidf_list), array(hits[post]))\n", 154 | " res.append((score, self.documents[post]))\n", 155 | " return res\n", 156 | "\n", 157 | "\n", 158 | "IrIndex.create_tf_dictionary = create_tf_dictionary\n", 159 | "IrIndex.create_tfidf_list = create_tfidf_list\n", 160 | "IrIndex.vector_space_search = vector_space_search" 161 | ], 162 | "language": "python", 163 | "metadata": {}, 164 | "outputs": [], 165 | "prompt_number": 28 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "At search stage, we have moved from: \n", 172 | "\n", 173 | "* Calculate the `idf` and access the `tf` lookup table for each search term and document hit. Sum the resulting `tf-idf` values for each document hit. This is done using two for loops, one of them including another nested internal for loop. \n", 174 | "\n", 175 | "To: \n", 176 | "* Access the `index` lookup table for any of the search terms and perform dot-product with the resulting vectors. The latter is an overhead introduced by this approach in the search stage. \n", 177 | "\n", 178 | "The new search stage has introduced vector dot products where there were just sums (although using nested loops) when the vector space model was not used. However, the data structures and their usage have been simplified.
Note that we build the vectors from pre-calculated dictionaries. Doing so we can determine the dimensions from the search query vector. " 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "### Other benefits of the Vector Space Model " 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "But what other benefits come with the vector space model? These are some of them: \n", 193 | "* Treating queries as vectors allows us to simplify data structures and calculations. Where we used two dictionaries and loops, now we use a single dictionary and linear algebra. \n", 194 | "* The compact and easy-to-operate vector representation leaves the door open to different weighting and transformation schemas that were difficult to apply before (or at least the results were not as clean). \n", 195 | "* Vectors can be the input of additional Information Retrieval and Machine Learning techniques including supervised (e.g. classification) and unsupervised (e.g. clustering, frequent pattern mining). \n" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "### Examples " 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "Let us now recall our sample wine-related mini-corpus in order to see if we get similar results using the new Vector Space Model. Remember that results are given unsorted. Just pay attention to the scores.
" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "collapsed": false, 215 | "input": [ 216 | "index = IrIndex()\n", 217 | "index.index_document(\"Bruno Clair Chambertin Clos de Beze 2001, Bourgogne, France\")\n", 218 | "index.index_document(\"Bruno Clair Chambertin Clos de Beze 2005, Bourgogne, France\")\n", 219 | "index.index_document(\"Bruno Clair Clos Saint Jaques 2001, Bourgogne, France\")\n", 220 | "index.index_document(\"Bruno Clair Clos Saint Jaques 2002, Bourgogne, France\")\n", 221 | "index.index_document(\"Bruno Clair Clos Saint Jaques 2005, Bourgogne, France\")\n", 222 | "index.index_document(\"Coche-Dury Bourgogne Chardonay 2005, Bourgogne, France\")\n", 223 | "index.index_document(\"Chateau Margaux 1982, Bordeaux, France\")\n", 224 | "index.index_document(\"Chateau Margaux 1996, Bordeaux, France\")\n", 225 | "index.index_document(\"Chateau Latour 1982, Bordeaux, France\")\n", 226 | "index.index_document(\"Domaine Raveneau Le Clos 2001, Bourgogne, France\")" 227 | ], 228 | "language": "python", 229 | "metadata": {}, 230 | "outputs": [], 231 | "prompt_number": 29 232 | }, 233 | { 234 | "cell_type": "code", 235 | "collapsed": false, 236 | "input": [ 237 | "index.vector_space_search(\"hello\")" 238 | ], 239 | "language": "python", 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "metadata": {}, 244 | "output_type": "pyout", 245 | "prompt_number": 30, 246 | "text": [ 247 | "[]" 248 | ] 249 | } 250 | ], 251 | "prompt_number": 30 252 | }, 253 | { 254 | "cell_type": "code", 255 | "collapsed": false, 256 | "input": [ 257 | "index.vector_space_search(\"Bordeaux\")" 258 | ], 259 | "language": "python", 260 | "metadata": {}, 261 | "outputs": [ 262 | { 263 | "metadata": {}, 264 | "output_type": "pyout", 265 | "prompt_number": 31, 266 | "text": [ 267 | "[(1.2039728043259361, 'Chateau Latour 1982, Bordeaux, France'),\n", 268 | " (1.2039728043259361, 'Chateau Margaux 1982, Bordeaux, France'),\n", 269 | " (1.2039728043259361, 'Chateau Margaux 1996, 
Bordeaux, France')]" 270 | ] 271 | } 272 | ], 273 | "prompt_number": 31 274 | }, 275 | { 276 | "cell_type": "code", 277 | "collapsed": false, 278 | "input": [ 279 | "index.vector_space_search(\"Margaux\")" 280 | ], 281 | "language": "python", 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "metadata": {}, 286 | "output_type": "pyout", 287 | "prompt_number": 32, 288 | "text": [ 289 | "[(1.6094379124341003, 'Chateau Margaux 1982, Bordeaux, France'),\n", 290 | " (1.6094379124341003, 'Chateau Margaux 1996, Bordeaux, France')]" 291 | ] 292 | } 293 | ], 294 | "prompt_number": 32 295 | }, 296 | { 297 | "cell_type": "code", 298 | "collapsed": false, 299 | "input": [ 300 | "index.vector_space_search(\"Bourgogne\")" 301 | ], 302 | "language": "python", 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "metadata": {}, 307 | "output_type": "pyout", 308 | "prompt_number": 33, 309 | "text": [ 310 | "[(0.22314355131420976,\n", 311 | " 'Bruno Clair Chambertin Clos de Beze 2001, Bourgogne, France'),\n", 312 | " (0.22314355131420976,\n", 313 | " 'Bruno Clair Chambertin Clos de Beze 2005, Bourgogne, France'),\n", 314 | " (0.22314355131420976,\n", 315 | " 'Bruno Clair Clos Saint Jaques 2001, Bourgogne, France'),\n", 316 | " (0.22314355131420976,\n", 317 | " 'Bruno Clair Clos Saint Jaques 2002, Bourgogne, France'),\n", 318 | " (0.22314355131420976,\n", 319 | " 'Bruno Clair Clos Saint Jaques 2005, Bourgogne, France'),\n", 320 | " (0.44628710262841953,\n", 321 | " 'Coche-Dury Bourgogne Chardonay 2005, Bourgogne, France'),\n", 322 | " (0.22314355131420976, 'Domaine Raveneau Le Clos 2001, Bourgogne, France')]" 323 | ] 324 | } 325 | ], 326 | "prompt_number": 33 327 | }, 328 | { 329 | "cell_type": "code", 330 | "collapsed": false, 331 | "input": [ 332 | "index.vector_space_search(\"hello Bordeaux\")" 333 | ], 334 | "language": "python", 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "metadata": {}, 339 | "output_type": "pyout", 340 | "prompt_number": 34, 341 | "text": [ 
342 | "[(1.2039728043259361, 'Chateau Latour 1982, Bordeaux, France'),\n", 343 | " (1.2039728043259361, 'Chateau Margaux 1982, Bordeaux, France'),\n", 344 | " (1.2039728043259361, 'Chateau Margaux 1996, Bordeaux, France')]" 345 | ] 346 | } 347 | ], 348 | "prompt_number": 34 349 | }, 350 | { 351 | "cell_type": "code", 352 | "collapsed": false, 353 | "input": [ 354 | "index.vector_space_search(\"Bourgogne Bordeaux\")" 355 | ], 356 | "language": "python", 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "metadata": {}, 361 | "output_type": "pyout", 362 | "prompt_number": 35, 363 | "text": [ 364 | "[(0.22314355131420976,\n", 365 | " 'Bruno Clair Chambertin Clos de Beze 2001, Bourgogne, France'),\n", 366 | " (0.22314355131420976,\n", 367 | " 'Bruno Clair Chambertin Clos de Beze 2005, Bourgogne, France'),\n", 368 | " (0.22314355131420976,\n", 369 | " 'Bruno Clair Clos Saint Jaques 2001, Bourgogne, France'),\n", 370 | " (0.22314355131420976,\n", 371 | " 'Bruno Clair Clos Saint Jaques 2002, Bourgogne, France'),\n", 372 | " (0.22314355131420976,\n", 373 | " 'Bruno Clair Clos Saint Jaques 2005, Bourgogne, France'),\n", 374 | " (0.44628710262841953,\n", 375 | " 'Coche-Dury Bourgogne Chardonay 2005, Bourgogne, France'),\n", 376 | " (1.2039728043259361, 'Chateau Margaux 1982, Bordeaux, France'),\n", 377 | " (1.2039728043259361, 'Chateau Margaux 1996, Bordeaux, France'),\n", 378 | " (1.2039728043259361, 'Chateau Latour 1982, Bordeaux, France'),\n", 379 | " (0.22314355131420976, 'Domaine Raveneau Le Clos 2001, Bourgogne, France')]" 380 | ] 381 | } 382 | ], 383 | "prompt_number": 35 384 | }, 385 | { 386 | "cell_type": "code", 387 | "collapsed": false, 388 | "input": [ 389 | "index.vector_space_search(\"Margaux Bordeaux\")" 390 | ], 391 | "language": "python", 392 | "metadata": {}, 393 | "outputs": [ 394 | { 395 | "metadata": {}, 396 | "output_type": "pyout", 397 | "prompt_number": 36, 398 | "text": [ 399 | "[(1.2039728043259361, 'Chateau Latour 1982, Bordeaux, France'),\n", 
400 | " (2.8134107167600364, 'Chateau Margaux 1982, Bordeaux, France'),\n", 401 | " (2.8134107167600364, 'Chateau Margaux 1996, Bordeaux, France')]" 402 | ] 403 | } 404 | ], 405 | "prompt_number": 36 406 | } 407 | ], 408 | "metadata": {} 409 | } 410 | ] 411 | } -------------------------------------------------------------------------------- /apps/information-retrieval/Vector Space.md: -------------------------------------------------------------------------------- 1 | 2 | ## Scoring using the Vector Space Model 3 | 4 | Previously we discussed tf-idf as a way to calculate how relevant a search term 5 | is given a set of indexed documents. For multiple terms, we used the 6 | *overlap score measure*, consisting of the sum of the *tf-idf* for each term in 7 | the given input. A more general and flexible way of scoring multi-term searches 8 | is using the **vector space model**. 9 | 10 | In the vector space model, each document in the corpus is represented by a 11 | vector. The search input terms are also represented by a vector. Scoring search 12 | results then consists of vector operations between the document vectors and the 13 | search-terms vector. 14 | 15 | 16 | 17 | But what is each of these vectors made of? Basically they define term 18 | frequencies. That is, each dimension in a vector represents the term frequency 19 | for a given term. Then, a document is represented in a multi-dimensional space 20 | by a vector of the frequencies of each of the words in the corpus. Equally, a 21 | search input is represented by a vector in the same space but using the input 22 | terms. 23 | 24 | ### Python implementation 25 | 26 | The following is a Python representation of this concept.
27 | 28 | 29 | import re 30 | 31 | class IrIndex: 32 | """An in-memory inverted index""" 33 | 34 | pattern = re.compile("^\s+|\s*,*\s*|\s+$") 35 | 36 | def __init__(self): 37 | self.index = {} 38 | self.documents = [] 39 | self.vectors = [] 40 | 41 | def index_document(self, document): 42 | # split 43 | terms = [word for word in self.pattern.split(document)] 44 | # add to documents 45 | self.documents.append(document) 46 | document_pos = len(self.documents) - 1 47 | # add posts to index, while creating document vector 48 | vector = {} 49 | for term in terms: 50 | if term not in self.index: 51 | self.index[term] = [] 52 | self.index[term].append(document_pos) 53 | if term not in vector: 54 | vector[term] = 1 55 | else: 56 | vector[term] += 1 57 | # add the vector 58 | self.vectors.append(vector) 59 | 60 | We use the same `IrIndex` class and a boolean search schema. The difference is 61 | when calculating scores. We don't store a precalculated `tf` structure anymore 62 | but operate vectors directly. 63 | 64 | In terms of complexity, when indexing we have moved from: 65 | * Recalculate every `tf` entry when indexing a new document. This involves 66 | lookup + sum for each term in the document. 67 | 68 | To: 69 | * Indexing stage: store a new document as a vector of `tf` values. Here we save 70 | the recalculation of `tf` entries. 71 | 72 | So as we can see, the indexing stage is simpler (and more scalable) when using 73 | the vector space model. This scalability gain when indexing is not to be 74 | overlooked. In an index with hundreds of thousands or even millions of terms, 75 | indexing a new large document and recalculating term frequencies at a global 76 | scale can be costly. We could calculate term frequencies when searching instead, 77 | but then using the vector space model makes even more sense. 78 | 79 | Next, the search and scoring part.
80 | 81 | 82 | from numpy import array, dot 83 | from math import log 84 | 85 | def create_tfidf_list(self, *args): 86 | if len(args) == 1: 87 | res = [tf for tf in args[0].itervalues()] 88 | elif len(args) == 2: 89 | res = [] 90 | for term in args[0].iterkeys(): 91 | if term in args[1]: 92 | idf = log(float(len(self.documents)) / float(len(self.index[term]))) 93 | res.append(args[1][term] * idf) 94 | else: 95 | res.append(0) 96 | return res 97 | 98 | def create_tf_dictionary(self, terms): 99 | res = {} 100 | for term in self.pattern.split(terms): 101 | if term not in res: 102 | res[term] = terms.count(term) 103 | return res 104 | 105 | def vector_space_search(self, terms): 106 | res = [] 107 | hits = {} 108 | # create a numeric vector from terms 109 | terms_tf_dictionary = self.create_tf_dictionary(terms) 110 | terms_tfidf_list = self.create_tfidf_list(terms_tf_dictionary) 111 | # create a numeric vector for each hitting document 112 | hitting_terms = [term for term in self.pattern.split(terms) if term in self.index] 113 | for term in hitting_terms: # for each term having at least one hit... 114 | for post in self.index[term]: # for each document create the numeric vector 115 | if post not in hits: 116 | tfidf_list = self.create_tfidf_list(terms_tf_dictionary, self.vectors[post]) 117 | hits[post] = tfidf_list 118 | # do the dot products 119 | for post in hits.iterkeys(): 120 | score = dot(array(terms_tfidf_list), array(hits[post])) 121 | res.append((score, self.documents[post])) 122 | return res 123 | 124 | 125 | IrIndex.create_tf_dictionary = create_tf_dictionary 126 | IrIndex.create_tfidf_list = create_tfidf_list 127 | IrIndex.vector_space_search = vector_space_search 128 | 129 | At search stage, we have moved from: 130 | 131 | * Calculate the `idf` and access the `tf` lookup table for each search term and 132 | document hit. Sum the resulting `tf-idf` values for each document hit.
This is 133 | done using two for loops, one of them including another nested internal for 134 | loop. 135 | 136 | To: 137 | * Access the `index` lookup table for any of the search terms and perform dot- 138 | product with the resulting vectors. The latter is an overhead introduced by this 139 | approach in the search stage. 140 | 141 | The new search stage has introduced vector dot products where there were just 142 | sums (although using nested loops) when the vector space model was not used. 143 | However, the data structures and their usage have been simplified. Note that we 144 | build the vectors from pre-calculated dictionaries. Doing so we can determine 145 | the dimensions from the search query vector. 146 | 147 | ### Other benefits of the Vector Space Model 148 | 149 | But what other benefits come with the vector space model? These are some of 150 | them: 151 | * Treating queries as vectors allows us to simplify data structures and 152 | calculations. Where we used two dictionaries and loops, now we use a single 153 | dictionary and linear algebra. 154 | * The compact and easy-to-operate vector representation leaves the door open to 155 | different weighting and transformation schemas that were difficult to apply 156 | before (or at least the results were not as clean). 157 | * Vectors can be the input of additional Information Retrieval and Machine 158 | Learning techniques including supervised (e.g. classification) and unsupervised 159 | (e.g. clustering, frequent pattern mining). 160 | 161 | 162 | ### Examples 163 | 164 | Let us now recall our sample wine-related mini-corpus in order to see if we get 165 | similar results using the new Vector Space Model. Remember that results are 166 | given unsorted. Just pay attention to the scores.
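Before running the examples, the scoring scheme itself can be sanity-checked in isolation. The sketch below uses plain NumPy, independently of the `IrIndex` class; the variable names (`query_vector`, `doc_vector`, `doc_freq`) are ours for illustration, and the corpus statistics are hard-coded from the ten-document wine mini-corpus used in the examples ("Margaux" appears in 2 documents, "Bordeaux" in 3). It reproduces the score the query "Margaux Bordeaux" assigns to a Chateau Margaux document: the dot product of the query's raw-tf vector with the document's tf-idf vector.

```python
from math import log

import numpy as np

# Statistics from the ten-document wine mini-corpus used in the examples.
n_docs = 10
doc_freq = {"Margaux": 2, "Bordeaux": 3}

# Query "Margaux Bordeaux": raw term frequencies, as create_tf_dictionary builds them.
query_vector = np.array([1.0, 1.0])

# Document "Chateau Margaux 1982, Bordeaux, France": tf * idf per query term,
# with idf = log(N / document frequency), as in create_tfidf_list.
doc_vector = np.array([1 * log(n_docs / doc_freq[t]) for t in ("Margaux", "Bordeaux")])

score = float(np.dot(query_vector, doc_vector))
print(score)  # 2.8134107167600364, matching the "Margaux Bordeaux" example
```

This is exactly log(10/2) + log(10/3), which is why the "Margaux Bordeaux" query scores the Margaux documents higher than the Latour one (which only matches "Bordeaux").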
167 | 168 | 169 | index = IrIndex() 170 | index.index_document("Bruno Clair Chambertin Clos de Beze 2001, Bourgogne, France") 171 | index.index_document("Bruno Clair Chambertin Clos de Beze 2005, Bourgogne, France") 172 | index.index_document("Bruno Clair Clos Saint Jaques 2001, Bourgogne, France") 173 | index.index_document("Bruno Clair Clos Saint Jaques 2002, Bourgogne, France") 174 | index.index_document("Bruno Clair Clos Saint Jaques 2005, Bourgogne, France") 175 | index.index_document("Coche-Dury Bourgogne Chardonay 2005, Bourgogne, France") 176 | index.index_document("Chateau Margaux 1982, Bordeaux, France") 177 | index.index_document("Chateau Margaux 1996, Bordeaux, France") 178 | index.index_document("Chateau Latour 1982, Bordeaux, France") 179 | index.index_document("Domaine Raveneau Le Clos 2001, Bourgogne, France") 180 | 181 | 182 | index.vector_space_search("hello") 183 | 184 | 185 | 186 | 187 | [] 188 | 189 | 190 | 191 | 192 | index.vector_space_search("Bordeaux") 193 | 194 | 195 | 196 | 197 | [(1.2039728043259361, 'Chateau Latour 1982, Bordeaux, France'), 198 | (1.2039728043259361, 'Chateau Margaux 1982, Bordeaux, France'), 199 | (1.2039728043259361, 'Chateau Margaux 1996, Bordeaux, France')] 200 | 201 | 202 | 203 | 204 | index.vector_space_search("Margaux") 205 | 206 | 207 | 208 | 209 | [(1.6094379124341003, 'Chateau Margaux 1982, Bordeaux, France'), 210 | (1.6094379124341003, 'Chateau Margaux 1996, Bordeaux, France')] 211 | 212 | 213 | 214 | 215 | index.vector_space_search("Bourgogne") 216 | 217 | 218 | 219 | 220 | [(0.22314355131420976, 221 | 'Bruno Clair Chambertin Clos de Beze 2001, Bourgogne, France'), 222 | (0.22314355131420976, 223 | 'Bruno Clair Chambertin Clos de Beze 2005, Bourgogne, France'), 224 | (0.22314355131420976, 225 | 'Bruno Clair Clos Saint Jaques 2001, Bourgogne, France'), 226 | (0.22314355131420976, 227 | 'Bruno Clair Clos Saint Jaques 2002, Bourgogne, France'), 228 | (0.22314355131420976, 229 | 'Bruno Clair Clos Saint Jaques 
2005, Bourgogne, France'), 230 | (0.44628710262841953, 231 | 'Coche-Dury Bourgogne Chardonay 2005, Bourgogne, France'), 232 | (0.22314355131420976, 'Domaine Raveneau Le Clos 2001, Bourgogne, France')] 233 | 234 | 235 | 236 | 237 | index.vector_space_search("hello Bordeaux") 238 | 239 | 240 | 241 | 242 | [(1.2039728043259361, 'Chateau Latour 1982, Bordeaux, France'), 243 | (1.2039728043259361, 'Chateau Margaux 1982, Bordeaux, France'), 244 | (1.2039728043259361, 'Chateau Margaux 1996, Bordeaux, France')] 245 | 246 | 247 | 248 | 249 | index.vector_space_search("Bourgogne Bordeaux") 250 | 251 | 252 | 253 | 254 | [(0.22314355131420976, 255 | 'Bruno Clair Chambertin Clos de Beze 2001, Bourgogne, France'), 256 | (0.22314355131420976, 257 | 'Bruno Clair Chambertin Clos de Beze 2005, Bourgogne, France'), 258 | (0.22314355131420976, 259 | 'Bruno Clair Clos Saint Jaques 2001, Bourgogne, France'), 260 | (0.22314355131420976, 261 | 'Bruno Clair Clos Saint Jaques 2002, Bourgogne, France'), 262 | (0.22314355131420976, 263 | 'Bruno Clair Clos Saint Jaques 2005, Bourgogne, France'), 264 | (0.44628710262841953, 265 | 'Coche-Dury Bourgogne Chardonay 2005, Bourgogne, France'), 266 | (1.2039728043259361, 'Chateau Margaux 1982, Bordeaux, France'), 267 | (1.2039728043259361, 'Chateau Margaux 1996, Bordeaux, France'), 268 | (1.2039728043259361, 'Chateau Latour 1982, Bordeaux, France'), 269 | (0.22314355131420976, 'Domaine Raveneau Le Clos 2001, Bourgogne, France')] 270 | 271 | 272 | 273 | 274 | index.vector_space_search("Margaux Bordeaux") 275 | 276 | 277 | 278 | 279 | [(1.2039728043259361, 'Chateau Latour 1982, Bordeaux, France'), 280 | (2.8134107167600364, 'Chateau Margaux 1982, Bordeaux, France'), 281 | (2.8134107167600364, 'Chateau Margaux 1996, Bordeaux, France')] 282 | 283 | 284 | -------------------------------------------------------------------------------- /apps/information-retrieval/tf-idf.md: -------------------------------------------------------------------------------- 1 | 
2 | # Term Frequency - Inverse Document Frequency 101 3 | 4 | Let's program here a basic and beautiful *Information Retrieval* concept such as 5 | *[tf-idf](http://en.wikipedia.org/wiki/Tf%E2%80%93idf)*. In order to do so, we 6 | will first define a basic in-memory search engine that allows us to add documents 7 | and search for them. The search results will contain relevant documents together 8 | with the *tf-idf* value. 9 | 10 | 11 | from math import log 12 | import re 13 | 14 | class IrIndex: 15 | """An in-memory inverted index""" 16 | 17 | pattern = re.compile("^\s+|\s*,*\s*|\s+$") 18 | 19 | def __init__(self): 20 | self.index = {} 21 | self.documents = [] 22 | self.tf = {} 23 | 24 | def index_document(self, document): 25 | ## split 26 | terms = [word for word in self.pattern.split(document)] 27 | 28 | ## add to documents 29 | self.documents.append(document) 30 | document_pos = len(self.documents)-1 31 | 32 | ## add posts to index, updating tf, idf 33 | for term in terms: 34 | if term not in self.index: 35 | self.index[term] = [] 36 | self.tf[term] = [] 37 | self.index[term].append(document_pos) 38 | self.tf[term].append(terms.count(term)) 39 | 40 | 41 | def tf_idf(self, term): 42 | ## get tf for each document 43 | if term in self.tf: 44 | res = [] 45 | for tf, post in zip(self.tf[term], self.index[term]): 46 | idf = log( float( len(self.documents) ) / float( len(self.tf[term]) ) ) 47 | res.append((tf * idf, self.documents[post])) 48 | return res 49 | else: 50 | return [] 51 | 52 | 53 | Now we create our empty index. 54 | 55 | 56 | index = IrIndex() 57 | 58 | Add some documents...
59 | 
60 | 
61 |     index.index_document("Bruno Clair Chambertin Clos de Beze 2001, Bourgogne, France")
62 |     index.index_document("Bruno Clair Chambertin Clos de Beze 2005, Bourgogne, France")
63 |     index.index_document("Bruno Clair Clos Saint Jaques 2001, Bourgogne, France")
64 |     index.index_document("Bruno Clair Clos Saint Jaques 2002, Bourgogne, France")
65 |     index.index_document("Bruno Clair Clos Saint Jaques 2005, Bourgogne, France")
66 |     index.index_document("Coche-Dury Bourgogne Chardonay 2005, Bourgogne, France")
67 |     index.index_document("Chateau Margaux 1982, Bordeaux, France")
68 |     index.index_document("Chateau Margaux 1996, Bordeaux, France")
69 |     index.index_document("Chateau Latour 1982, Bordeaux, France")
70 |     index.index_document("Domaine Raveneau Le Clos 2001, Bourgogne, France")
71 | 
72 | Let's try some terms. First, we search for a term that doesn't exist in any of
73 | our documents.
74 | 
75 | 
76 |     index.tf_idf("hello")
77 | 
78 | 
79 | 
80 | 
81 |     []
82 | 
83 | 
84 | 
85 | Next, let's try a term that appears in a few documents.
86 | 
87 | 
88 |     index.tf_idf("Bordeaux")
89 | 
90 | 
91 | 
92 | 
93 |     [(1.2039728043259361, 'Chateau Margaux 1982, Bordeaux, France'),
94 |      (1.2039728043259361, 'Chateau Margaux 1996, Bordeaux, France'),
95 |      (1.2039728043259361, 'Chateau Latour 1982, Bordeaux, France')]
96 | 
97 | 
98 | 
99 | Now a term with a higher idf: 'Margaux' is **less common** or **more
100 | specific** than 'Bordeaux', so we see higher scores in general.
101 | 
102 | 
103 |     index.tf_idf("Margaux")
104 | 
105 | 
106 | 
107 | 
108 |     [(1.6094379124341003, 'Chateau Margaux 1982, Bordeaux, France'),
109 |      (1.6094379124341003, 'Chateau Margaux 1996, Bordeaux, France')]
110 | 
111 | 
112 | 
113 | Next, a term with a tf higher than 1 for one of the documents. We search for
114 | 'Bourgogne', a **more common term** in our index, so it has a **lower idf**. That
115 | means lower scores in general, but higher ones for documents with a **higher tf**.
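Before moving on, note that the single-term scores above can be reproduced by hand. With N = 10 documents indexed, each result is tf × ln(N / p), where p is the number of postings stored for the term. The following quick check is an illustration added here, not part of the original notebook:

```python
from math import log

N = 10  # documents in our index

# 'Bordeaux' has 3 postings, each with tf = 1
print(1 * log(N / 3.0))   # 1.2039728043259361, as returned above

# 'Margaux' has only 2 postings, hence the higher idf
print(1 * log(N / 2.0))   # 1.6094379124341003
```

This is the same computation `tf_idf` performs posting by posting; rarer terms get a larger ln(N / p) factor.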
116 | 
117 | 
118 |     index.tf_idf("Bourgogne")
119 | 
120 | 
121 | 
122 | 
123 |     [(0.22314355131420976,
124 |       'Bruno Clair Chambertin Clos de Beze 2001, Bourgogne, France'),
125 |      (0.22314355131420976,
126 |       'Bruno Clair Chambertin Clos de Beze 2005, Bourgogne, France'),
127 |      (0.22314355131420976,
128 |       'Bruno Clair Clos Saint Jaques 2001, Bourgogne, France'),
129 |      (0.22314355131420976,
130 |       'Bruno Clair Clos Saint Jaques 2002, Bourgogne, France'),
131 |      (0.22314355131420976,
132 |       'Bruno Clair Clos Saint Jaques 2005, Bourgogne, France'),
133 |      (0.44628710262841953,
134 |       'Coche-Dury Bourgogne Chardonay 2005, Bourgogne, France'),
135 |      (0.44628710262841953,
136 |       'Coche-Dury Bourgogne Chardonay 2005, Bourgogne, France'),
137 |      (0.22314355131420976, 'Domaine Raveneau Le Clos 2001, Bourgogne, France')]
138 | 
139 | Notice that the Coche-Dury wine appears twice in the results: the index stores
140 | one posting per term *occurrence*, so a document containing 'Bourgogne' twice
141 | contributes two postings.
142 | 
143 | ## Multi-term search
144 | 
145 | To do multi-term search, we will sum the tf-idf values for each term per
146 | document. For that, we add a new method, `tf_idf_multi`, to our index.
147 | 
148 |     def tf_idf_multi(self, terms):
149 |         res = []
150 |         hits = {}
151 |         # sum tf-idfs for each hitting document
152 |         hitting_terms = [term for term in self.pattern.split(terms) if term in self.tf]
153 |         for term in hitting_terms:  # for each term having at least one hit...
154 |             for tf, post in zip(self.tf[term], self.index[term]):  # store the tf-idfs in hits for a later sum
155 |                 if post not in hits:
156 |                     hits[post] = []
157 |                 idf = log(float(len(self.documents)) / float(len(self.tf[term])))
158 |                 hits[post].append(tf * idf)
159 |         # sum the hits for each post
160 |         for post in hits:
161 |             tfidf = sum(hits[post])
162 |             res.append((tfidf, self.documents[post]))
163 | 
164 |         return res
165 | 
166 | 
167 |     IrIndex.tf_idf_multi = tf_idf_multi
168 | First, let's check that it works the same for single-term queries.
169 | 170 | 171 | index.tf_idf_multi("hello") 172 | 173 | 174 | 175 | 176 | [] 177 | 178 | 179 | 180 | 181 | index.tf_idf_multi("Bordeaux") 182 | 183 | 184 | 185 | 186 | [(1.2039728043259361, 'Chateau Latour 1982, Bordeaux, France'), 187 | (1.2039728043259361, 'Chateau Margaux 1982, Bordeaux, France'), 188 | (1.2039728043259361, 'Chateau Margaux 1996, Bordeaux, France')] 189 | 190 | 191 | 192 | 193 | index.tf_idf_multi("Margaux") 194 | 195 | 196 | 197 | 198 | [(1.6094379124341003, 'Chateau Margaux 1982, Bordeaux, France'), 199 | (1.6094379124341003, 'Chateau Margaux 1996, Bordeaux, France')] 200 | 201 | 202 | 203 | 204 | index.tf_idf_multi("Bourgogne") 205 | 206 | 207 | 208 | 209 | [(0.22314355131420976, 210 | 'Bruno Clair Chambertin Clos de Beze 2001, Bourgogne, France'), 211 | (0.22314355131420976, 212 | 'Bruno Clair Chambertin Clos de Beze 2005, Bourgogne, France'), 213 | (0.22314355131420976, 214 | 'Bruno Clair Clos Saint Jaques 2001, Bourgogne, France'), 215 | (0.22314355131420976, 216 | 'Bruno Clair Clos Saint Jaques 2002, Bourgogne, France'), 217 | (0.22314355131420976, 218 | 'Bruno Clair Clos Saint Jaques 2005, Bourgogne, France'), 219 | (0.8925742052568391, 220 | 'Coche-Dury Bourgogne Chardonay 2005, Bourgogne, France'), 221 | (0.22314355131420976, 'Domaine Raveneau Le Clos 2001, Bourgogne, France')] 222 | 223 | 224 | 225 | We try now with a multi term search where one of the terms doesn't hit any 226 | document. 227 | 228 | 229 | index.tf_idf_multi("hello Bordeaux") 230 | 231 | 232 | 233 | 234 | [(1.2039728043259361, 'Chateau Latour 1982, Bordeaux, France'), 235 | (1.2039728043259361, 'Chateau Margaux 1982, Bordeaux, France'), 236 | (1.2039728043259361, 'Chateau Margaux 1996, Bordeaux, France')] 237 | 238 | 239 | 240 | Multi-term, with disjoint results. 
241 | 242 | 243 | index.tf_idf_multi("Bourgogne Bordeaux") 244 | 245 | 246 | 247 | 248 | [(0.22314355131420976, 249 | 'Bruno Clair Chambertin Clos de Beze 2001, Bourgogne, France'), 250 | (0.22314355131420976, 251 | 'Bruno Clair Chambertin Clos de Beze 2005, Bourgogne, France'), 252 | (0.22314355131420976, 253 | 'Bruno Clair Clos Saint Jaques 2001, Bourgogne, France'), 254 | (0.22314355131420976, 255 | 'Bruno Clair Clos Saint Jaques 2002, Bourgogne, France'), 256 | (0.22314355131420976, 257 | 'Bruno Clair Clos Saint Jaques 2005, Bourgogne, France'), 258 | (0.8925742052568391, 259 | 'Coche-Dury Bourgogne Chardonay 2005, Bourgogne, France'), 260 | (1.2039728043259361, 'Chateau Margaux 1982, Bordeaux, France'), 261 | (1.2039728043259361, 'Chateau Margaux 1996, Bordeaux, France'), 262 | (1.2039728043259361, 'Chateau Latour 1982, Bordeaux, France'), 263 | (0.22314355131420976, 'Domaine Raveneau Le Clos 2001, Bourgogne, France')] 264 | 265 | 266 | 267 | And finally, a multi-term where some results have more than one term hitting. We 268 | see how the score is increased. 269 | 270 | 271 | index.tf_idf_multi("Margaux Bordeaux") 272 | 273 | 274 | 275 | 276 | [(1.2039728043259361, 'Chateau Latour 1982, Bordeaux, France'), 277 | (2.8134107167600364, 'Chateau Margaux 1982, Bordeaux, France'), 278 | (2.8134107167600364, 'Chateau Margaux 1996, Bordeaux, France')] 279 | 280 | 281 | 282 | With this we complete our introduction to the concept of Term Frequency - 283 | Inverse Document Frequency. 
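As a compact recap (an addition to the original text, for illustration only), the whole idea fits in a few lines. The sketch below computes idf from the true *document frequency* — the number of distinct documents containing the term — so its scores can differ slightly from the occurrence-based postings used by `IrIndex` above:

```python
from collections import Counter
from math import log
import re


def tf_idf_search(documents, term):
    """Score documents for a single term with tf * ln(N / df).

    df is the number of distinct documents containing the term.
    """
    # same tokenization as the tutorial: split on whitespace and commas
    counts = [Counter(t for t in re.split(r"[\s,]+", doc) if t) for doc in documents]
    df = sum(1 for c in counts if c[term] > 0)
    if df == 0:
        return []
    idf = log(len(documents) / df)
    return [(c[term] * idf, doc) for c, doc in zip(counts, documents) if c[term]]


docs = [
    "Chateau Margaux 1982, Bordeaux, France",
    "Chateau Margaux 1996, Bordeaux, France",
    "Chateau Latour 1982, Bordeaux, France",
]
# both Margaux wines score ln(3/2), the Latour does not match at all
print(tf_idf_search(docs, "Margaux"))
```

Multi-term search is then just a per-document sum of these single-term scores, exactly as `tf_idf_multi` does.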
284 | -------------------------------------------------------------------------------- /apps/kaggle-analytics-edge-15/Competition_TextData.R: -------------------------------------------------------------------------------- 1 | # KAGGLE COMPETITION - DEALING WITH THE TEXT DATA 2 | 3 | # This script file is intended to help you deal with the text data provided in the competition data files 4 | 5 | # If you haven't already, start by reading the data into R 6 | # Make sure you have downloaded these files from the Kaggle website, and have navigated to the directory where you saved the files on your computer 7 | 8 | # We are adding in the argument stringsAsFactors=FALSE, since we have some text fields 9 | 10 | NewsTrain = read.csv("data/NYTimesBlogTrain.csv", stringsAsFactors=FALSE) 11 | 12 | NewsTest = read.csv("data/NYTimesBlogTest.csv", stringsAsFactors=FALSE) 13 | 14 | # Now, let's load the "tm" package 15 | 16 | library(tm) 17 | 18 | # Then create a corpus from the headline variable. You can use other variables in the dataset for text analytics, but we will just show you how to use this particular variable. 19 | # Note that we are creating a corpus out of the training and testing data. 20 | 21 | CorpusHeadline = Corpus(VectorSource(c(NewsTrain$Headline, NewsTest$Headline))) 22 | 23 | # You can go through all of the standard pre-processing steps like we did in Unit 5: 24 | 25 | CorpusHeadline = tm_map(CorpusHeadline, tolower) 26 | 27 | # Remember this extra line is needed after running the tolower step: 28 | 29 | CorpusHeadline = tm_map(CorpusHeadline, PlainTextDocument) 30 | 31 | CorpusHeadline = tm_map(CorpusHeadline, removePunctuation) 32 | 33 | CorpusHeadline = tm_map(CorpusHeadline, removeWords, stopwords("english")) 34 | 35 | CorpusHeadline = tm_map(CorpusHeadline, stemDocument) 36 | 37 | # Now we are ready to convert our corpus to a DocumentTermMatrix, remove sparse terms, and turn it into a data frame. 
38 | # We selected one particular threshold to remove sparse terms, but remember that you can try different numbers! 39 | 40 | dtm = DocumentTermMatrix(CorpusHeadline) 41 | 42 | sparse = removeSparseTerms(dtm, 0.99) 43 | 44 | HeadlineWords = as.data.frame(as.matrix(sparse)) 45 | 46 | # Let's make sure our variable names are okay for R: 47 | 48 | colnames(HeadlineWords) = make.names(colnames(HeadlineWords)) 49 | 50 | # Now we need to split the observations back into the training set and testing set. 51 | # To do this, we can use the head and tail functions in R. 52 | # The head function takes the first "n" rows of HeadlineWords (the first argument to the head function), where "n" is specified by the second argument to the head function. 53 | # So here we are taking the first nrow(NewsTrain) observations from HeadlineWords, and putting them in a new data frame called "HeadlineWordsTrain" 54 | 55 | HeadlineWordsTrain = head(HeadlineWords, nrow(NewsTrain)) 56 | 57 | # The tail function takes the last "n" rows of HeadlineWords (the first argument to the tail function), where "n" is specified by the second argument to the tail function. 58 | # So here we are taking the last nrow(NewsTest) observations from HeadlineWords, and putting them in a new data frame called "HeadlineWordsTest" 59 | 60 | HeadlineWordsTest = tail(HeadlineWords, nrow(NewsTest)) 61 | 62 | # Note that this split of HeadlineWords works to properly put the observations back into the training and testing sets, because of how we combined them together when we first made our corpus. 63 | 64 | # Before building models, we want to add back the original variables from our datasets. We'll add back the dependent variable to the training set, and the WordCount variable to both datasets. You might want to add back more variables to use in your model - we'll leave this up to you! 
65 | 66 | HeadlineWordsTrain$Popular = NewsTrain$Popular 67 | 68 | HeadlineWordsTrain$WordCount = NewsTrain$WordCount 69 | HeadlineWordsTest$WordCount = NewsTest$WordCount 70 | 71 | # Remember that you can always look at the structure of these data frames to understand what we have created 72 | 73 | 74 | # Now let's create a logistic regression model using all of the variables: 75 | 76 | HeadlineWordsLog = glm(Popular ~ ., data=HeadlineWordsTrain, family=binomial) 77 | 78 | # And make predictions on our test set: 79 | 80 | PredTest = predict(HeadlineWordsLog, newdata=HeadlineWordsTest, type="response") 81 | 82 | # Now we can prepare our submission file for Kaggle: 83 | 84 | MySubmission = data.frame(UniqueID = NewsTest$UniqueID, Probability1 = PredTest) 85 | 86 | write.csv(MySubmission, "SubmissionHeadlineLog.csv", row.names=FALSE) 87 | 88 | # You should upload the submission "SubmissionHeadlineLog.csv" on the Kaggle website to use this as a submission to the competition 89 | 90 | # This script file was just designed to help you get started - to do well in the competition, you will need to build better models! -------------------------------------------------------------------------------- /apps/kaggle-analytics-edge-15/README.md: -------------------------------------------------------------------------------- 1 | # Kaggle 15.071x - The Analytics Edge (Spring 2015) 2 | 3 | Files for my solution to [this Kaggle competition](https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015) 4 | 5 | ## Task description 6 | 7 | The task description can be found [here](https://www.kaggle.com/c/15-071x-the-analytics-edge-competition-spring-2015): 8 | 9 | *In this competition, we challenge you to develop an analytics model that will help the New York Times understand the features of a blog post that make it popular.* 10 | 11 | ## Files description 12 | 13 | ### `main.R` 14 | 15 | The main script calling the others in order to generate a prediction. 
16 | 
17 | 
18 | ### `loader.R`
19 | 
20 | Loads the data into a data frame.
21 | 
22 | 
23 | ### `add_corpus_XXX.R`
24 | 
25 | Different scripts that generate a corpus from the text fields and add the
26 | resulting terms as predictors.
27 | 
28 | This process includes fitting logistic regression models to determine significant terms, in order to do variable selection.
29 | 
30 | ### `split_eval.R`
31 | 
32 | Splits the training data into training and test sets. TODO: do cross-validation.
33 | 
34 | ### `train_random_forest.R`
35 | 
36 | Trains a **Random Forest** and makes predictions.
37 | 
38 | 
39 | ### `results` folder
40 | 
41 | Contains the different predictions as CSV files.
42 | 
43 | ## Future work
44 | 
45 | - Use ensemble methods.
46 | - Do cross-validation for better parameter selection.
47 | - Impute missing values for section/subsection. Right now empty values are taken as a factor level and therefore as meaningful for predictions.
48 | - Do more exploratory data analysis, especially for misclassified cases (e.g. use word clouds).
49 | - Filter out non-meaningful frequent terms in the corpus (e.g. *new*, *york*, etc.)
50 | -------------------------------------------------------------------------------- /apps/kaggle-analytics-edge-15/add_corpus_abstract.R: -------------------------------------------------------------------------------- 1 | # Prepare corpus using snippet 2 | corpusAbstract <- Corpus(VectorSource(c(newsTrain$Abstract, newsTest$Abstract))) 3 | corpusAbstract <- tm_map(corpusAbstract, tolower) 4 | corpusAbstract <- tm_map(corpusAbstract, PlainTextDocument) 5 | corpusAbstract <- tm_map(corpusAbstract, removePunctuation) 6 | corpusAbstract <- tm_map(corpusAbstract, removeWords, stopwords("english")) 7 | corpusAbstract <- tm_map(corpusAbstract, stripWhitespace) 8 | corpusAbstract <- tm_map(corpusAbstract, stemDocument) 9 | 10 | # Generate term matrix 11 | dtmAbstract <- DocumentTermMatrix(corpusAbstract) 12 | sparseAbstract <- removeSparseTerms(dtmAbstract, 0.995) 13 | abstractWords <- as.data.frame(as.matrix(sparseAbstract)) 14 | 15 | colnames(abstractWords) <- make.names(colnames(abstractWords)) 16 | colnames(abstractWords) <- paste0("A_", colnames(abstractWords)) 17 | 18 | # Find most significative terms 19 | abstractWordsTrain2 <- head(abstractWords, nrow(newsTrain)) 20 | abstractWordsTrain2$Popular <- newsTrain$Popular 21 | logModelAbstractWords <- glm(Popular~., data=abstractWordsTrain2, family=binomial) 22 | abstract_three_star_terms <- names(which(summary(logModelAbstractWords)$coefficients[,4]<0.001)) 23 | abstract_two_star_terms <- names(which(summary(logModelAbstractWords)$coefficients[,4]<0.01)) 24 | abstract_one_star_terms <- names(which(summary(logModelAbstractWords)$coefficients[,4]<0.05)) 25 | 26 | # Leave just those terms that are different between popular and unpopular articles 27 | abstractWords <- subset(abstractWords, 28 | select=names(abstractWords) %in% abstract_three_star_terms) 29 | 30 | # split again 31 | abstractWordsTrain <- head(abstractWords, nrow(newsTrain)) 32 | abstractWordsTest <- tail(abstractWords, nrow(newsTest)) 33 | 34 | # Add to 
dataframes 35 | newsTrain <- cbind(newsTrain, abstractWordsTrain) 36 | newsTest <- cbind(newsTest, abstractWordsTest) 37 | 38 | # Explore a bit 39 | # ... 40 | 41 | # Remove original text variables 42 | newsTrain$Abstract <- NULL 43 | newsTest$Abstract <- NULL 44 | -------------------------------------------------------------------------------- /apps/kaggle-analytics-edge-15/add_corpus_all.R: -------------------------------------------------------------------------------- 1 | # Prepare corpus using snippet 2 | # newsTrain$AllText <- do.call(paste, newsTrain[,c("Headline","Snippet","Abstract")]) 3 | # newsTest$AllText <- do.call(paste, newsTest[,c("Headline","Snippet","Abstract")]) 4 | newsTrain$AllText <- do.call(paste, newsTrain[,c("Headline","Snippet")]) 5 | newsTest$AllText <- do.call(paste, newsTest[,c("Headline","Snippet")]) 6 | 7 | corpusAll <- Corpus(VectorSource(c(newsTrain$AllText, newsTest$AllText))) 8 | corpusAll <- tm_map(corpusAll, tolower) 9 | corpusAll <- tm_map(corpusAll, PlainTextDocument) 10 | corpusAll <- tm_map(corpusAll, removePunctuation) 11 | corpusAll <- tm_map(corpusAll, removeWords, stopwords("english")) 12 | corpusAll <- tm_map(corpusAll, stripWhitespace) 13 | corpusAll <- tm_map(corpusAll, stemDocument) 14 | 15 | # Generate term matrix 16 | dtmAll <- DocumentTermMatrix(corpusAll) 17 | sparseAll <- removeSparseTerms(dtmAll, 0.99) 18 | allWords <- as.data.frame(as.matrix(sparseAll)) 19 | 20 | colnames(allWords) <- make.names(colnames(allWords)) 21 | 22 | # Find most significative terms 23 | allWordsTrain2 <- head(allWords, nrow(newsTrain)) 24 | allWordsTrain2$Popular <- newsTrain$Popular 25 | logModelAllWords <- glm(Popular~., data=allWordsTrain2, family=binomial) 26 | all_three_star_terms <- names(which(summary(logModelAllWords)$coefficients[,4]<0.001)) 27 | all_two_star_terms <- names(which(summary(logModelAllWords)$coefficients[,4]<0.01)) 28 | all_one_star_terms <- names(which(summary(logModelAllWords)$coefficients[,4]<0.05)) 29 | 30 | 
# Leave just those terms that are different between popular and unpopular articles 31 | allWords <- subset(allWords, 32 | select=names(allWords) %in% all_one_star_terms) 33 | 34 | # Split again 35 | allWordsTrain <- head(allWords, nrow(newsTrain)) 36 | allWordsTest <- tail(allWords, nrow(newsTest)) 37 | 38 | # Add to dataframes 39 | newsTrain <- cbind(newsTrain, allWordsTrain) 40 | newsTest <- cbind(newsTest, allWordsTest) 41 | 42 | # Explore a bit 43 | # ... 44 | 45 | # Remove original text variables 46 | newsTrain$AllText <- NULL 47 | newsTest$AllText <- NULL 48 | 49 | -------------------------------------------------------------------------------- /apps/kaggle-analytics-edge-15/add_corpus_headline.R: -------------------------------------------------------------------------------- 1 | # Prepare corpus using headline 2 | corpusHeadline <- Corpus(VectorSource(c(newsTrain$Headline, newsTest$Headline))) 3 | corpusHeadline <- tm_map(corpusHeadline, tolower) 4 | corpusHeadline <- tm_map(corpusHeadline, PlainTextDocument) 5 | corpusHeadline <- tm_map(corpusHeadline, removePunctuation) 6 | corpusHeadline <- tm_map(corpusHeadline, removeWords, stopwords("english")) 7 | corpusHeadline <- tm_map(corpusHeadline, stripWhitespace) 8 | corpusHeadline <- tm_map(corpusHeadline, stemDocument) 9 | 10 | # Generate term matrix 11 | dtm <- DocumentTermMatrix(corpusHeadline) 12 | sparse <- removeSparseTerms(dtm, 0.99) 13 | headlineWords <- as.data.frame(as.matrix(sparse)) 14 | 15 | colnames(headlineWords) <- make.names(colnames(headlineWords)) 16 | colnames(headlineWords) <- paste0("H_", colnames(headlineWords)) 17 | 18 | # Find most significative terms 19 | headlineWordsTrain2 <- head(headlineWords, nrow(newsTrain)) 20 | headlineWordsTrain2$Popular <- newsTrain$Popular 21 | logModelHeadlineWords <- glm(Popular~., data=headlineWordsTrain2, family=binomial) 22 | three_star_terms <- names(which(summary(logModelHeadlineWords)$coefficients[,4]<0.001)) 23 | two_star_terms <- 
names(which(summary(logModelHeadlineWords)$coefficients[,4]<0.01)) 24 | one_star_terms <- names(which(summary(logModelHeadlineWords)$coefficients[,4]<0.05)) 25 | 26 | # Leave just those terms that are different between popular and unpopular articles 27 | headlineWords <- subset(headlineWords, 28 | select=names(headlineWords) %in% one_star_terms) 29 | 30 | 31 | # Filter out common frequent terms 32 | # headlineWordsCountsPopular <- colSums(subset(headlineWords, Popular==T)) 33 | # headlineWordsCountsUnpopular <- colSums(subset(headlineWords, Popular==F)) 34 | # topPopular <- tail(sort(headlineWordsCountsPopular), 20) 35 | # topUnpopular <- tail(sort(headlineWordsCountsUnpopular), 20) 36 | 37 | # Leave just those terms that are different between popular and unpopular articles 38 | # headlineWords <- subset(headlineWords, 39 | # select=names(headlineWords) %in% setdiff(names(topPopular), names(topUnpopular)) 40 | # ) 41 | 42 | # Split again 43 | headlineWordsTrain <- head(headlineWords, nrow(newsTrain)) 44 | headlineWordsTest <- tail(headlineWords, nrow(newsTest)) 45 | 46 | # Add to dataframes 47 | newsTrain <- cbind(newsTrain, headlineWordsTrain) 48 | newsTest <- cbind(newsTest, headlineWordsTest) 49 | 50 | # Remove original text variables 51 | newsTrain$Headline <- NULL 52 | newsTest$Headline <- NULL 53 | 54 | -------------------------------------------------------------------------------- /apps/kaggle-analytics-edge-15/add_corpus_snippet.R: -------------------------------------------------------------------------------- 1 | # Prepare corpus using snippet 2 | corpusSnippet <- Corpus(VectorSource(c(newsTrain$Snippet, newsTest$Snippet))) 3 | corpusSnippet <- tm_map(corpusSnippet, tolower) 4 | corpusSnippet <- tm_map(corpusSnippet, PlainTextDocument) 5 | corpusSnippet <- tm_map(corpusSnippet, removePunctuation) 6 | corpusSnippet <- tm_map(corpusSnippet, removeWords, stopwords("english")) 7 | corpusSnippet <- tm_map(corpusSnippet, stripWhitespace) 8 | corpusSnippet <- 
tm_map(corpusSnippet, stemDocument) 9 | 10 | # Generate term matrix 11 | dtmSnippet <- DocumentTermMatrix(corpusSnippet) 12 | sparseSnippet <- removeSparseTerms(dtmSnippet, 0.99) 13 | snippetWords <- as.data.frame(as.matrix(sparseSnippet)) 14 | 15 | colnames(snippetWords) <- make.names(colnames(snippetWords)) 16 | colnames(snippetWords) <- paste0("S_", colnames(snippetWords)) 17 | 18 | # Find most significative terms 19 | snippetWordsTrain2 <- head(snippetWords, nrow(newsTrain)) 20 | snippetWordsTrain2$Popular <- newsTrain$Popular 21 | logModelSnippetWords <- glm(Popular~., data=snippetWordsTrain2, family=binomial) 22 | snippet_three_star_terms <- names(which(summary(logModelSnippetWords)$coefficients[,4]<0.001)) 23 | snippet_two_star_terms <- names(which(summary(logModelSnippetWords)$coefficients[,4]<0.01)) 24 | snippet_one_star_terms <- names(which(summary(logModelSnippetWords)$coefficients[,4]<0.05)) 25 | 26 | # Leave just those terms that are different between popular and unpopular articles 27 | snippetWords <- subset(snippetWords, 28 | select=names(snippetWords) %in% snippet_one_star_terms) 29 | 30 | # Split again 31 | snippetWordsTrain <- head(snippetWords, nrow(newsTrain)) 32 | snippetWordsTest <- tail(snippetWords, nrow(newsTest)) 33 | 34 | # Add to dataframes 35 | newsTrain <- cbind(newsTrain, snippetWordsTrain) 36 | newsTest <- cbind(newsTest, snippetWordsTest) 37 | 38 | # Explore a bit 39 | # ... 
40 | 41 | # Remove original text variables 42 | newsTrain$Snippet <- NULL 43 | newsTest$Snippet <- NULL 44 | 45 | -------------------------------------------------------------------------------- /apps/kaggle-analytics-edge-15/explore.R: -------------------------------------------------------------------------------- 1 | # A bit of exploratory data analysis to see which of these variables make 2 | # a difference between popular articles 3 | 4 | summary(newsTrain) 5 | table(newsTrain$Popular) 6 | 7 | table(newsTrain$Popular, newsTrain$NewsDesk) 8 | 9 | table(newsTrain$Popular, newsTrain$SectionName) 10 | 11 | table(newsTrain$Popular, newsTrain$SubsectionName) 12 | 13 | summary(newsTrain$WordCount) 14 | table(newsTrain$Popular, newsTrain$WordCount>374) 15 | 16 | table(newsTrain$Popular, newsTrain$MonthOfTheYear) 17 | table(newsTrain$Popular, newsTrain$DayOfTheWeek) 18 | hoursPopular <- data.frame(table(newsTrain$Popular, newsTrain$HourOfTheDay)) 19 | 20 | # Let's plot this 21 | library(ggplot2) 22 | ggplot(hoursPopular, aes(x=Var2, y=Freq)) + geom_line(aes(group=Var1, color=Var1)) 23 | 24 | # explore headline corpus 25 | headlineWordsCountsPopular <- colSums(subset(headlineWords, Popular==T)) 26 | headlineWordsCountsUnpopular <- colSums(subset(headlineWords, Popular==F)) 27 | headlineWordsCountsPopular 28 | topPopular <- tail(sort(headlineWordsCountsPopular), 100) 29 | topUnpopular <- tail(sort(headlineWordsCountsUnpopular), 100) 30 | topPopular[names(topPopular) %in% names(topUnpopular)] 31 | topUnpopular[names(topUnpopular) %in% names(topPopular)] 32 | 33 | headlineWordsPopularDiff <- subset(headlineWords, select=names(headlineWords) %in% setdiff(names(topPopular), names(topUnpopular))) 34 | -------------------------------------------------------------------------------- /apps/kaggle-analytics-edge-15/loader.R: -------------------------------------------------------------------------------- 1 | library(tm) 2 | library(ROCR) 3 | library(rpart) 4 | library(rpart.plot) 
5 | library(caTools)
6 | library(randomForest)
7 | library(caret)
8 | library(e1071)
9 | 
10 | newsTrain <- read.csv("data/NYTimesBlogTrain.csv", stringsAsFactors=FALSE)
11 | newsTest <- read.csv("data/NYTimesBlogTest.csv", stringsAsFactors=FALSE)
12 | 
13 | # Bind the data in order to do the transformations just once
14 | newsAll <- rbind(newsTrain[,-9], newsTest)
15 | 
16 | 
17 | # Extract date information
18 | newsAll$PubDate <- strptime(newsAll$PubDate, format="%Y-%m-%d %H:%M:%S")
19 | newsAll$DayOfTheWeek <- as.factor(weekdays(newsAll$PubDate))
20 | # We saw during exploration that month doesn't make a difference
21 | # newsAll$MonthOfTheYear <- as.factor(months(newsAll$PubDate))
22 | newsAll$HourOfTheDay <- as.factor(newsAll$PubDate$hour)
23 | newsAll$PubDate <- NULL # Get rid of the original dates
24 | 
25 | # Convert to factors
26 | newsAll$NewsDesk <- as.factor(newsAll$NewsDesk)
27 | newsAll$SectionName <- as.factor(newsAll$SectionName)
28 | newsAll$SubsectionName <- as.factor(newsAll$SubsectionName)
29 | 
30 | # Split again
31 | Popular <- as.factor(newsTrain$Popular)
32 | newsTrain <- head(newsAll, nrow(newsTrain))
33 | newsTrain$Popular <- Popular
34 | newsTest <- tail(newsAll, nrow(newsTest))
35 | 
-------------------------------------------------------------------------------- /apps/kaggle-analytics-edge-15/main.R: --------------------------------------------------------------------------------

1 | source("loader.R")
2 | # source("add_corpus_headline.R")
3 | # source("add_corpus_snippet.R")
4 | # source("add_corpus_abstract.R")
5 | source("add_corpus_all.R")
6 | 
7 | # Remove these as needed
8 | newsTrain$Headline <- NULL
9 | newsTest$Headline <- NULL
10 | 
11 | newsTrain$Abstract <- NULL
12 | newsTest$Abstract <- NULL
13 | 
14 | newsTrain$Snippet <- NULL
15 | newsTest$Snippet <- NULL
16 | 
17 | source("split_eval.R")
18 | source("train_random_forest.R")
19 | source("train_glm.R")
20 | 
21 | # 0.9472158 without any corpus at all, ntree=500
22 | # 0.9473081
ntree=1000 23 | # 0.9474844 ntree=5000 24 | # 0.9475809 ntree=10000 25 | # 0.9474634 ntree=25000 26 | # 0.9474508 ntree=50000 27 | 28 | # 0.9463428 just header at .99 (42 terms) 29 | # 0.944299 just header at .995 (142 terms) 30 | # 0.9449369 top 100 terms diff, ntree=10000 (30 terms) 31 | # 0.9446683 top 100 terms diff, ntree=500 (30 terms) 32 | # 0.9474214 top 50 terms diff, ntree=500 (27 terms) 33 | # 0.9477488 top 25 terms diff, ntree=500 (15 terms) 34 | # 0.9484497 top 20 terms diff, ntree=500 (13 terms) 35 | # 0.9487225 top 20 terms diff, ntree=10000 (13 terms) 36 | # 0.9463932 top 15 terms diff, ntree=500 (9 terms) 37 | # 0.9481391 top 17 terms diff, ntree=500 (10 terms) 38 | # 0.9483322 top 21 terms (12 terms) 39 | 40 | 41 | # 0.9375378 just snippet at .99 (229 vars) 42 | # 0.9415668 just snippet at .995 (532 vars) 43 | 44 | # 0.9376007 just abstract at .99 (223 vars) 45 | # 0.9387507 just abstract at .995 (525 vars) 46 | 47 | # 0.9387549 with header+snippet at .995/.995 48 | # 0.939013 with header+snippet at .99/.99 49 | -------------------------------------------------------------------------------- /apps/kaggle-analytics-edge-15/split_eval.R: -------------------------------------------------------------------------------- 1 | # Create train and test splits from the original train data, that is labeled 2 | set.seed(1234) 3 | spl <- sample.split(newsTrain$Popular, .80) 4 | evalNewsTrain <- newsTrain[spl==T,] 5 | evalNewsTest <- newsTrain[spl==F,] 6 | -------------------------------------------------------------------------------- /apps/kaggle-analytics-edge-15/train_glm.R: -------------------------------------------------------------------------------- 1 | set.seed(1234) 2 | logModel <- glm(Popular~.-UniqueID, data=evalNewsTrain, family=binomial) 3 | summary(logModel) 4 | 5 | logModel <- glm( 6 | Popular~ 7 | I(NewsDesk=="Culture") + I(NewsDesk=="OpEd") + I(NewsDesk=="Styles") + I(NewsDesk=="TStyle") + 8 | I(SectionName=="Multimedia") + 
I(SubsectionName=="Room For Debate") + I(SubsectionName=="The Public Editor") +
9 | WordCount +
10 | I(DayOfTheWeek=="Monday") + I(DayOfTheWeek=="Sunday") +
11 | I(HourOfTheDay=="7") + I(HourOfTheDay=="19")
12 | -UniqueID,
13 | data=evalNewsTrain,
14 | family=binomial)
15 | summary(logModel)
16 | 
17 | logModel3 <- glm(
18 | Popular ~
19 | SubsectionName +
20 | DayOfTheWeekMonday + DayOfTheWeekSunday +
21 | HourOfTheDay7 + HourOfTheDay12 + HourOfTheDay13 + HourOfTheDay19 +
22 | WordCount - UniqueID,
23 | data=evalNewsTrain, family=binomial)
24 | summary(logModel3)
25 | 
26 | logPred <- predict(logModel3, newdata=evalNewsTest, type="response")
27 | 
28 | # Calculate AUC
29 | logRocr <- prediction(logPred, evalNewsTest$Popular)
30 | logAuc <- as.numeric(performance(logRocr, "auc")@y.values)
31 | logAuc
32 | 
33 | # Calculate accuracy
34 | table(evalNewsTest$Popular, logPred>.5)
-------------------------------------------------------------------------------- /apps/kaggle-analytics-edge-15/train_random_forest.R: --------------------------------------------------------------------------------

1 | # do the train
2 | set.seed(1234)
3 | rfModel <- randomForest(
4 | Popular ~ .
- UniqueID, 5 | data=evalNewsTrain, ntree=500) 6 | 7 | # Calculate AUC 8 | rfPred <- predict(rfModel, newdata=evalNewsTest, type="prob") 9 | rfRocr <- prediction(rfPred[,2], evalNewsTest$Popular) 10 | rfAuc <- as.numeric(performance(rfRocr, "auc")@y.values) 11 | rfAuc 12 | 13 | # do cv 14 | set.seed(1234) 15 | x <- newsTrain 16 | x$Popular <- NULL 17 | y <- newsTrain$Popular 18 | rf.cv <- rfcv(x, y, cv.fold=10, recursive=TRUE) 19 | with(rf.cv, plot(n.var, error.cv)) 20 | 21 | # Calculate accuracy 22 | table(evalNewsTest$Popular, rfPred>.5) 23 | 24 | # Save to file 25 | set.seed(1234) 26 | rfModelSubmission <- randomForest(Popular~.-UniqueID, data=newsTrain, ntree=10000) 27 | rfPredSubmission <- predict(rfModelSubmission, newdata=newsTest, type="prob") 28 | mySubmission <- data.frame( 29 | UniqueID = newsTest$UniqueID, 30 | Probability1 = abs(rfPredSubmission[,2]) 31 | ) 32 | write.csv(mySubmission, "SubmissionRF_all_corpora_10000.csv", row.names=FALSE) 33 | -------------------------------------------------------------------------------- /apps/sentimentclassifier/GUI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/apps/sentimentclassifier/GUI.png -------------------------------------------------------------------------------- /apps/sentimentclassifier/genesis.txt: -------------------------------------------------------------------------------- 1 | In the beginning God created the heavens and the earth. 2 Now the earth was formless and empty, darkness was over the surface of the deep, and the Spirit of God was hovering over the waters. 2 | 3 | 3 And God said, “Let there be light,” and there was light. 4 God saw that the light was good, and he separated the light from the darkness. 5 God called the light “day,” and the darkness he called “night.” And there was evening, and there was morning—the first day. 
4 | 5 | 6 And God said, “Let there be a vault between the waters to separate water from water.” 7 So God made the vault and separated the water under the vault from the water above it. And it was so. 8 God called the vault “sky.” And there was evening, and there was morning—the second day. 6 | 7 | 9 And God said, “Let the water under the sky be gathered to one place, and let dry ground appear.” And it was so. 10 God called the dry ground “land,” and the gathered waters he called “seas.” And God saw that it was good. 8 | 9 | 11 Then God said, “Let the land produce vegetation: seed-bearing plants and trees on the land that bear fruit with seed in it, according to their various kinds.” And it was so. 12 The land produced vegetation: plants bearing seed according to their kinds and trees bearing fruit with seed in it according to their kinds. And God saw that it was good. 13 And there was evening, and there was morning—the third day. 10 | 11 | 14 And God said, “Let there be lights in the vault of the sky to separate the day from the night, and let them serve as signs to mark sacred times, and days and years, 15 and let them be lights in the vault of the sky to give light on the earth.” And it was so. 16 God made two great lights—the greater light to govern the day and the lesser light to govern the night. He also made the stars. 17 God set them in the vault of the sky to give light on the earth, 18 to govern the day and the night, and to separate light from darkness. And God saw that it was good. 19 And there was evening, and there was morning—the fourth day. 12 | 13 | 20 And God said, “Let the water teem with living creatures, and let birds fly above the earth across the vault of the sky.” 21 So God created the great creatures of the sea and every living thing with which the water teems and that moves about in it, according to their kinds, and every winged bird according to its kind. And God saw that it was good. 
22 God blessed them and said, “Be fruitful and increase in number and fill the water in the seas, and let the birds increase on the earth.” 23 And there was evening, and there was morning—the fifth day. 14 | 15 | 24 And God said, “Let the land produce living creatures according to their kinds: the livestock, the creatures that move along the ground, and the wild animals, each according to its kind.” And it was so. 25 God made the wild animals according to their kinds, the livestock according to their kinds, and all the creatures that move along the ground according to their kinds. And God saw that it was good. 16 | 17 | 26 Then God said, “Let us make mankind in our image, in our likeness, so that they may rule over the fish in the sea and the birds in the sky, over the livestock and all the wild animals,[a] and over all the creatures that move along the ground.” 18 | 19 | 27 So God created mankind in his own image, 20 | in the image of God he created them; 21 | male and female he created them. 22 | 28 God blessed them and said to them, “Be fruitful and increase in number; fill the earth and subdue it. Rule over the fish in the sea and the birds in the sky and over every living creature that moves on the ground.” 23 | 24 | 29 Then God said, “I give you every seed-bearing plant on the face of the whole earth and every tree that has fruit with seed in it. They will be yours for food. 30 And to all the beasts of the earth and all the birds in the sky and all the creatures that move along the ground—everything that has the breath of life in it—I give every green plant for food.” And it was so. 25 | 26 | 31 God saw all that he had made, and it was very good. And there was evening, and there was morning—the sixth day. 
-------------------------------------------------------------------------------- /apps/sentimentclassifier/luther.txt: -------------------------------------------------------------------------------- 1 | I am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation. 2 | 3 | Five score years ago, a great American, in whose symbolic shadow we stand today, signed the Emancipation Proclamation. This momentous decree came as a great beacon light of hope to millions of Negro slaves who had been seared in the flames of withering injustice. It came as a joyous daybreak to end the long night of their captivity. 4 | 5 | But one hundred years later, the Negro still is not free. One hundred years later, the life of the Negro is still sadly crippled by the manacles of segregation and the chains of discrimination. One hundred years later, the Negro lives on a lonely island of poverty in the midst of a vast ocean of material prosperity. One hundred years later, the Negro is still languished in the corners of American society and finds himself an exile in his own land. And so we've come here today to dramatize a shameful condition. 6 | 7 | In a sense we've come to our nation's capital to cash a check. When the architects of our republic wrote the magnificent words of the Constitution and the Declaration of Independence, they were signing a promissory note to which every American was to fall heir. This note was a promise that all men, yes, black men as well as white men, would be guaranteed the "unalienable Rights" of "Life, Liberty and the pursuit of Happiness." It is obvious today that America has defaulted on this promissory note, insofar as her citizens of color are concerned. Instead of honoring this sacred obligation, America has given the Negro people a bad check, a check which has come back marked "insufficient funds." 8 | 9 | But we refuse to believe that the bank of justice is bankrupt. 
We refuse to believe that there are insufficient funds in the great vaults of opportunity of this nation. And so, we've come to cash this check, a check that will give us upon demand the riches of freedom and the security of justice. 10 | 11 | We have also come to this hallowed spot to remind America of the fierce urgency of Now. This is no time to engage in the luxury of cooling off or to take the tranquilizing drug of gradualism. Now is the time to make real the promises of democracy. Now is the time to rise from the dark and desolate valley of segregation to the sunlit path of racial justice. Now is the time to lift our nation from the quicksands of racial injustice to the solid rock of brotherhood. Now is the time to make justice a reality for all of God's children. 12 | 13 | It would be fatal for the nation to overlook the urgency of the moment. This sweltering summer of the Negro's legitimate discontent will not pass until there is an invigorating autumn of freedom and equality. Nineteen sixty-three is not an end, but a beginning. And those who hope that the Negro needed to blow off steam and will now be content will have a rude awakening if the nation returns to business as usual. And there will be neither rest nor tranquility in America until the Negro is granted his citizenship rights. The whirlwinds of revolt will continue to shake the foundations of our nation until the bright day of justice emerges. 14 | 15 | But there is something that I must say to my people, who stand on the warm threshold which leads into the palace of justice: In the process of gaining our rightful place, we must not be guilty of wrongful deeds. Let us not seek to satisfy our thirst for freedom by drinking from the cup of bitterness and hatred. We must forever conduct our struggle on the high plane of dignity and discipline. We must not allow our creative protest to degenerate into physical violence. 
Again and again, we must rise to the majestic heights of meeting physical force with soul force. 16 | 17 | The marvelous new militancy which has engulfed the Negro community must not lead us to a distrust of all white people, for many of our white brothers, as evidenced by their presence here today, have come to realize that their destiny is tied up with our destiny. And they have come to realize that their freedom is inextricably bound to our freedom. 18 | 19 | We cannot walk alone. 20 | 21 | And as we walk, we must make the pledge that we shall always march ahead. 22 | 23 | We cannot turn back. 24 | 25 | There are those who are asking the devotees of civil rights, "When will you be satisfied?" We can never be satisfied as long as the Negro is the victim of the unspeakable horrors of police brutality. We can never be satisfied as long as our bodies, heavy with the fatigue of travel, cannot gain lodging in the motels of the highways and the hotels of the cities. *We cannot be satisfied as long as the negro's basic mobility is from a smaller ghetto to a larger one. We can never be satisfied as long as our children are stripped of their self-hood and robbed of their dignity by signs stating: "For Whites Only."* We cannot be satisfied as long as a Negro in Mississippi cannot vote and a Negro in New York believes he has nothing for which to vote. No, no, we are not satisfied, and we will not be satisfied until "justice rolls down like waters, and righteousness like a mighty stream."¹ 26 | 27 | I am not unmindful that some of you have come here out of great trials and tribulations. Some of you have come fresh from narrow jail cells. And some of you have come from areas where your quest -- quest for freedom left you battered by the storms of persecution and staggered by the winds of police brutality. You have been the veterans of creative suffering. Continue to work with the faith that unearned suffering is redemptive. 
Go back to Mississippi, go back to Alabama, go back to South Carolina, go back to Georgia, go back to Louisiana, go back to the slums and ghettos of our northern cities, knowing that somehow this situation can and will be changed. 28 | 29 | Let us not wallow in the valley of despair, I say to you today, my friends. 30 | 31 | And so even though we face the difficulties of today and tomorrow, I still have a dream. It is a dream deeply rooted in the American dream. 32 | 33 | I have a dream that one day this nation will rise up and live out the true meaning of its creed: "We hold these truths to be self-evident, that all men are created equal." 34 | 35 | I have a dream that one day on the red hills of Georgia, the sons of former slaves and the sons of former slave owners will be able to sit down together at the table of brotherhood. 36 | 37 | I have a dream that one day even the state of Mississippi, a state sweltering with the heat of injustice, sweltering with the heat of oppression, will be transformed into an oasis of freedom and justice. 38 | 39 | I have a dream that my four little children will one day live in a nation where they will not be judged by the color of their skin but by the content of their character. 40 | 41 | I have a dream today! 42 | 43 | I have a dream that one day, down in Alabama, with its vicious racists, with its governor having his lips dripping with the words of "interposition" and "nullification" -- one day right there in Alabama little black boys and black girls will be able to join hands with little white boys and white girls as sisters and brothers. 44 | 45 | I have a dream today! 
46 | 47 | I have a dream that one day every valley shall be exalted, and every hill and mountain shall be made low, the rough places will be made plain, and the crooked places will be made straight; "and the glory of the Lord shall be revealed and all flesh shall see it together."2 48 | 49 | This is our hope, and this is the faith that I go back to the South with. 50 | 51 | With this faith, we will be able to hew out of the mountain of despair a stone of hope. With this faith, we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith, we will be able to work together, to pray together, to struggle together, to go to jail together, to stand up for freedom together, knowing that we will be free one day. 52 | 53 | And this will be the day -- this will be the day when all of God's children will be able to sing with new meaning: 54 | 55 | My country 'tis of thee, sweet land of liberty, of thee I sing. 56 | 57 | Land where my fathers died, land of the Pilgrim's pride, 58 | 59 | From every mountainside, let freedom ring! 60 | 61 | And if America is to be a great nation, this must become true. 62 | 63 | And so let freedom ring from the prodigious hilltops of New Hampshire. 64 | 65 | Let freedom ring from the mighty mountains of New York. 66 | 67 | Let freedom ring from the heightening Alleghenies of Pennsylvania. 68 | 69 | Let freedom ring from the snow-capped Rockies of Colorado. 70 | 71 | Let freedom ring from the curvaceous slopes of California. 72 | 73 | But not only that: 74 | 75 | Let freedom ring from Stone Mountain of Georgia. 76 | 77 | Let freedom ring from Lookout Mountain of Tennessee. 78 | 79 | Let freedom ring from every hill and molehill of Mississippi. 80 | 81 | From every mountainside, let freedom ring. 
82 | 83 | And when this happens, and when we allow freedom ring, when we let it ring from every village and every hamlet, from every state and every city, we will be able to speed up that day when all of God's children, black men and white men, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the old Negro spiritual: 84 | 85 | Free at last! Free at last! 86 | 87 | Thank God Almighty, we are free at last!3 -------------------------------------------------------------------------------- /apps/sentimentclassifier/ratm.txt: -------------------------------------------------------------------------------- 1 | Killing in the name of! 2 | Some of those that work forces, are the same that burn crosses 3 | Some of those that work forces, are the same that burn crosses 4 | Some of those that work forces, are the same that burn crosses 5 | Some of those that work forces, are the same that burn crosses 6 | Huh! 7 | 8 | Killing in the name of! 9 | Killing in the name of 10 | 11 | And now you do what they told ya 12 | And now you do what they told ya 13 | And now you do what they told ya 14 | And now you do what they told ya 15 | And now you do what they told ya 16 | And now you do what they told ya 17 | And now you do what they told ya 18 | And now you do what they told ya 19 | And now you do what they told ya 20 | And now you do what they told ya 21 | And now you do what they told ya 22 | But now you do what they told ya 23 | Well now you do what they told ya 24 | 25 | Those who died are justified, for wearing the badge, they're the chosen whites 26 | You justify those that died by wearing the badge, they're the chosen whites 27 | Those who died are justified, for wearing the badge, they're the chosen whites 28 | You justify those that died by wearing the badge, they're the chosen whites 29 | 30 | Some of those that work forces, are the same that burn crosses 31 | Some of those that work forces, are the same that burn crosses 32 
| Some of those that work forces, are the same that burn crosses 33 | Some of those that work forces, are the same that burn crosses 34 | Uggh! 35 | 36 | Killing in the name of! 37 | Killing in the name of 38 | 39 | And now you do what they told ya 40 | And now you do what they told ya 41 | And now you do what they told ya 42 | And now you do what they told ya 43 | And now you do what they told ya, now you're under control (7 times) 44 | And now you do what they told ya, now you're under control 45 | And now you do what they told ya, now you're under control 46 | And now you do what they told ya, now you're under control 47 | And now you do what they told ya, now you're under control 48 | And now you do what they told ya, now you're under control 49 | And now you do what they told ya, now you're under control 50 | And now you do what they told ya! 51 | 52 | Those who died are justified, for wearing the badge, they're the chosen whites 53 | You justify those that died by wearing the badge, they're the chosen whites 54 | Those who died are justified, for wearing the badge, they're the chosen whites 55 | You justify those that died by wearing the badge, they're the chosen whites 56 | Come on! 57 | 58 | Yeah! Come on! 59 | 60 | Fuck you, I won't do what you tell me 61 | Fuck you, I won't do what you tell me 62 | Fuck you, I won't do what you tell me 63 | Fuck you, I won't do what you tell me 64 | Fuck you, I won't do what you tell me 65 | Fuck you, I won't do what you tell me 66 | Fuck you, I won't do what you tell me 67 | Fuck you, I won't do what you tell me 68 | Fuck you, I won't do what you tell me! 69 | Fuck you, I won't do what you tell me! 70 | Fuck you, I won't do what you tell me! 71 | Fuck you, I won't do what you tell me! 72 | Fuck you, I won't do what you tell me! 73 | Fuck you, I won't do what you tell me! 74 | Fuck you, I won't do what you tell me! 75 | Fuck you, I won't do what you tell me! 76 | Motherfucker! 77 | Uggh! 
-------------------------------------------------------------------------------- /apps/sentimentclassifier/rem.txt: -------------------------------------------------------------------------------- 1 | Shiny happy people laughing 2 | 3 | Meet me in the crowd, people, people 4 | Throw your love around, love me, love me 5 | Take it into town, happy, happy 6 | Put it in the ground where the flowers grow 7 | Gold and silver shine 8 | 9 | Shiny happy people holding hands 10 | Shiny happy people holding hands 11 | Shiny happy people laughing 12 | 13 | Everyone around, love them, love them 14 | Put it in your hands, take it, take it 15 | There's no time to cry, happy, happy 16 | Put it in your heart where tomorrow shines 17 | Gold and silver shine 18 | 19 | Shiny happy people holding hands 20 | Shiny happy people holding hands 21 | Shiny happy people laughing 22 | 23 | Whoa, here we go 24 | 25 | Shiny happy people holding hands 26 | Shiny happy people holding hands 27 | Shiny happy people laughing 28 | 29 | Shiny happy people holding hands 30 | Shiny happy people holding hands 31 | Shiny happy people laughing 32 | 33 | Shiny happy people holding hands 34 | Shiny happy people holding hands 35 | Shiny happy people laughing 36 | 37 | Shiny happy people holding hands 38 | People, happy people 39 | People -------------------------------------------------------------------------------- /apps/sentimentclassifier/server.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | library(tm) 3 | library(SnowballC) 4 | library(randomForest) 5 | 6 | options(shiny.maxRequestSize=3*1024^2) 7 | options(mc.cores=1) 8 | 9 | build_model <- function(new_data_df, sparsity) { 10 | # Create new data corpus 11 | new_corpus <- Corpus(VectorSource(new_data_df$Text)) 12 | new_corpus <- tm_map(new_corpus, content_transformer(tolower)) 13 | new_corpus <- tm_map(new_corpus, removePunctuation) 14 | new_corpus <- tm_map(new_corpus, removeWords, 
stopwords("english")) 15 | new_corpus <- tm_map(new_corpus, stripWhitespace) 16 | new_corpus <- tm_map(new_corpus, stemDocument) 17 | message("build_model: corpus DONE") 18 | 19 | # create document-term matrix 20 | new_dtm <- DocumentTermMatrix(new_corpus) 21 | new_dtm <- removeSparseTerms(new_dtm, sparsity) 22 | new_dtm_df <- as.data.frame(as.matrix(new_dtm)) 23 | colnames(new_dtm_df) <- make.names(colnames(new_dtm_df)) 24 | message("build_model: ", "term matrix created for new data with ", ncol(new_dtm_df), " variables") 25 | 26 | # intersect corpora and prepare final training data 27 | common_names = intersect(colnames(train_dtm_df),colnames(new_dtm_df)) 28 | new_dtm_df <- subset(new_dtm_df, select=names(new_dtm_df) %in% common_names) 29 | message("build_model: ", "new data term matrix reduced to ", ncol(new_dtm_df), " variables") 30 | 31 | model_train_data_df <- cbind(train_data_df, subset(train_dtm_df, select=names(train_dtm_df) %in% common_names)) 32 | model_train_data_df$Text <- NULL 33 | message("build_model: ", "final training data created with ", ncol(model_train_data_df)-1, " variables") 34 | 35 | # train classifier 36 | message("build_model: ", "training classifier...") 37 | model <- randomForest(Sentiment~.,data=model_train_data_df, ntree=50) 38 | message("build_model: ", "classifier training DONE!") 39 | 40 | list(model, new_dtm_df) 41 | } 42 | 43 | 44 | shinyServer(function(input, output) { 45 | 46 | output$contents <- renderTable({ 47 | results() 48 | }) 49 | 50 | output$status <- renderText({ 51 | if (!is.null(train_dtm_df)) 52 | return("Ready!") 53 | return("Starting...") 54 | }) 55 | 56 | output$distribution <- renderPlot({ 57 | if (is.null(results())) 58 | return(NULL) 59 | d <- density( 60 | as.numeric(results()$Prob > input$threshold) 61 | ) 62 | plot( 63 | d, 64 | xlim = c(0, 1), 65 | main=paste0("Sentiment Distribution (Prob > ", input$threshold, ")") 66 | ) 67 | polygon(d, col="lightgrey", border="lightgrey") 68 | abline(v = 
input$threshold, col = "blue") 69 | }) 70 | 71 | results <- reactive({ 72 | inFile <- input$file1 73 | 74 | if (is.null(inFile)) 75 | return(NULL) 76 | 77 | # load input data 78 | new_data_df <- read.csv( 79 | inFile$datapath, 80 | sep='\t', 81 | header=FALSE, 82 | quote = "", 83 | stringsAsFactors=FALSE, 84 | col.names=c("Text") 85 | ) 86 | message("renderTable: ", "input file loaded") 87 | 88 | model_and_data <- build_model(new_data_df, input$sparsity) 89 | 90 | message("renderTable: ", "making predictions...") 91 | pred <- predict(model_and_data[[1]], newdata=model_and_data[[2]], type="prob") 92 | message("renderTable: ", "predictions DONE") 93 | 94 | new_data_df$Prob <- pred[,2] 95 | 96 | new_data_df 97 | }) 98 | }) 99 | 100 | # Load train data 101 | train_data_df <- read.csv( 102 | file = 'train_data.tsv', 103 | sep='\t', 104 | quote = "", 105 | header=FALSE, 106 | stringsAsFactors=FALSE, 107 | col.names=c("Sentiment", "Text") 108 | ) 109 | train_data_df$Sentiment <- as.factor(train_data_df$Sentiment) 110 | message(paste("There are", nrow(train_data_df), "rows in training dataset")) 111 | 112 | # Create training corpus for later re-use 113 | train_corpus <- Corpus(VectorSource(train_data_df$Text)) 114 | #message("init: corpus created with length ", length(train_corpus)) 115 | train_corpus <- tm_map(train_corpus, content_transformer(tolower)) 116 | #message("init: corpus lowercased with length ", length(train_corpus)) 117 | train_corpus <- tm_map(train_corpus, removePunctuation) 118 | #message("init: corpus punctuation removed with length ", length(train_corpus)) 119 | train_corpus <- tm_map(train_corpus, removeWords, stopwords("english")) 120 | #message("init: corpus stopwords removed with length ", length(train_corpus)) 121 | train_corpus <- tm_map(train_corpus, stripWhitespace) 122 | #message("init: corpus space stripped with length ", length(train_corpus)) 123 | train_corpus <- tm_map(train_corpus, stemDocument) 124 | #message("init: corpus stemmed with 
length ", length(train_corpus)) 125 | message("init: training corpus DONE") 126 | 127 | # create document-term matrix 128 | train_dtm <- DocumentTermMatrix(train_corpus) 129 | train_dtm <- removeSparseTerms(train_dtm, 0.995) 130 | message(paste0("init: training dtm created (", ncol(train_dtm), " terms in training corpus)")) 131 | train_dtm_df <- data.frame(as.matrix(train_dtm)) 132 | message(paste0("init: training dtm converted to df (", ncol(train_dtm), " terms in training corpus)")) 133 | colnames(train_dtm_df) <- make.names(colnames(train_dtm_df)) 134 | message(paste0("init: training dtm DONE (", ncol(train_dtm_df), " terms in training corpus)")) 135 | 136 | 137 | -------------------------------------------------------------------------------- /apps/sentimentclassifier/server_reactions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/apps/sentimentclassifier/server_reactions.png -------------------------------------------------------------------------------- /apps/sentimentclassifier/ui.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | shinyUI(fluidPage( 4 | 5 | # Application title 6 | headerPanel("Text Sentiment Analyser"), 7 | 8 | sidebarLayout( 9 | # the control panel 10 | sidebarPanel( 11 | helpText("Starting...", 12 | textOutput("status") 13 | ), 14 | tags$hr(), 15 | fileInput('file1', 'Choose text File', 16 | accept=c('text/tsv', 17 | 'text/tab-separated-values,text/plain', 18 | '.tsv')), 19 | tags$hr(), 20 | sliderInput("threshold", 21 | "Positive sentiment threshold", 22 | min = .1, 23 | max = .99, 24 | value = .5), 25 | tags$hr(), 26 | sliderInput("sparsity", 27 | "Max. 
term sparsity", 28 | min = .1, 29 | max = .99, 30 | value = .95) 31 | ), 32 | 33 | # Show a plot of the generated distribution 34 | mainPanel( 35 | plotOutput('distribution') 36 | ) 37 | ), 38 | tags$hr(), 39 | fluidRow( 40 | # the results detail panel 41 | column(12, 42 | tableOutput('contents') 43 | ) 44 | ) 45 | )) 46 | -------------------------------------------------------------------------------- /apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-11-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-11-1.png -------------------------------------------------------------------------------- /apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-11.png -------------------------------------------------------------------------------- /apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-12-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-12-1.png -------------------------------------------------------------------------------- /apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-12.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-12.png -------------------------------------------------------------------------------- /apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-15-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-15-1.png -------------------------------------------------------------------------------- /apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-15.png -------------------------------------------------------------------------------- /apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-16-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-16-1.png -------------------------------------------------------------------------------- /apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jadianes/data-science-your-way/ec3fb379e98b1492abd67ec4fa57c75ee3cafab5/apps/wine-quality-data-analysis/README_files/figure-html/unnamed-chunk-16.png -------------------------------------------------------------------------------- 
(Binary PNG figure assets under `/apps/wine-quality-data-analysis/README_files/figure-html/` and `/apps/wine-quality-data-analysis/figure/` are stored as raw.githubusercontent.com URLs; the image data is omitted here.)

/apps/winerama/README.md:
--------------------------------------------------------------------------------

# Winerama

###### a web recommender tutorial using Python, Django, and Pandas

This repository contains the code for the web application at different stages of development, available as git tags:

- [`stage-0`](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-0): an empty repo.
- [`stage-0.1`](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-0.1): a Django project with one app called `reviews`. The app defines the model entities.
- [`stage-0.2`](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-0.2): admin site up and running for our model entities `Wine` and `Review`.
- [`stage-0.3`](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-0.3): views and templates are available.
- [`stage-0.4`](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-0.4): an add-review form has been added.
- [`stage-0.5`](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-0.5): template reuse.
- [**stage-1**](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-1): Bootstrap 3 for Django added.
- [`stage-1.1`](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-1.1): `add_review` now requires login. Login templates and menu session links added.
- [`stage-1.2`](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-1.2): a user reviews page created.
- [**stage-2**](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-2): user management done.
- [`stage-2.1`](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-2.1): scripts to load CSV data are available, and the data has been loaded.
- [`stage-2.2`](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-2.2): an empty wine suggestions view has been added.
- [`stage-2.3`](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-2.3): the suggestions view now shows wines not yet reviewed by the user.
- [`stage-2.4`](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-2.4): a cluster model object added, with three manually created clusters.
- [`stage-2.5`](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-2.5): the suggestions view now makes use of cluster information.
- [**stage-3**](https://github.com/jadianes/winerama-recommender-tutorial/tree/stage-3): K-means-clustering-based recommendations provided.


## Tutorials

The following tutorials guide you through each of the git tags above while teaching different concepts of data product development with Python.

#### [A Wine Review Website using Django and Bootstrap](https://github.com/jadianes/winerama-recommender-tutorial/blob/master/tutorials/tutorial_1.md)

#### [Adding User management](https://github.com/jadianes/winerama-recommender-tutorial/blob/master/tutorials/tutorial_2.md)

#### [Providing wine recommendations using K-Means](https://github.com/jadianes/winerama-recommender-tutorial/blob/master/tutorials/tutorial_3.md)
--------------------------------------------------------------------------------
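The recommendation idea behind stages 2.4 through 3 can be sketched roughly as follows: cluster users by their rating vectors with K-means, then suggest wines a user has not reviewed, ranked by how cluster-mates rated them. This is a minimal illustration, not the tutorial's actual code: the toy `ratings` data, the `suggest` helper, and the choice of `k` are all assumptions made here for the example.

```python
# Hypothetical sketch of cluster-based wine suggestions (NOT the
# tutorial's code): K-means over a user x wine rating matrix.
import pandas as pd
from sklearn.cluster import KMeans

# Toy ratings: one row per (user, wine, score 1-5). Invented data.
ratings = pd.DataFrame({
    "user":  ["ana", "ana", "bob", "bob", "cai", "cai"],
    "wine":  ["rioja", "malbec", "rioja", "syrah", "malbec", "syrah"],
    "score": [5, 4, 5, 2, 4, 5],
})

# Pivot to a user x wine matrix; unrated wines become 0.
matrix = ratings.pivot_table(index="user", columns="wine",
                             values="score", fill_value=0)

# Fit K-means (k=2 here purely for the toy data; the tutorial's
# stage-2.4 creates three clusters).
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(matrix)
clusters = pd.Series(km.labels_, index=matrix.index)

def suggest(user, n=1):
    """Top-n wines the user has not rated, ranked by the mean score
    given by users in the same cluster."""
    mates = clusters[clusters == clusters[user]].index
    seen = set(ratings.loc[ratings.user == user, "wine"])
    candidates = ratings[ratings.user.isin(mates)
                         & ~ratings.wine.isin(seen)]
    return (candidates.groupby("wine").score.mean()
            .sort_values(ascending=False).head(n).index.tolist())
```

In the real app the same logic would sit behind the suggestions view, with ratings coming from the `Review` model instead of a hand-built DataFrame.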