├── .gitignore ├── LICENSE ├── README.md ├── data ├── Advertising.csv ├── Credit.csv ├── WineData.csv ├── airquality.csv ├── churn-bigml-20.csv ├── churn-bigml-80.csv ├── cuse_binary.csv ├── horseshoe_crab.csv ├── hsb2.csv ├── hsb2_modified.csv ├── iris.csv ├── kaggle-titanic-gender_submission.csv ├── kaggle-titanic-test.csv ├── kaggle-titanic-train.csv ├── mtcars.csv ├── prostate.csv ├── saved-mtcars │ ├── .part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv.crc │ ├── _SUCCESS │ └── part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv ├── saved-twitter │ ├── .part-00000.crc │ ├── _SUCCESS │ └── part-00000 ├── titanic │ ├── gender_submission.csv │ ├── test.csv │ └── train.csv └── twitter.txt ├── delete-readme.txt ├── images ├── simple-nlp-pipeline.png └── spark-pipeline.png ├── index.Rmd ├── index.html ├── legacy ├── 01_entry_points_to_spark.Rmd ├── 02_rdd_object.Rmd ├── 03_dataframe_object.Rmd ├── HashingTF-and-CountVectorizer.Rmd ├── categorical-data.Rmd ├── continuous-to-categorical-variable.Rmd ├── conversion-between-rdd-and-dataframe.Rmd ├── cross-validation-in-r.Rmd ├── decision-tree-classification.Rmd ├── dttreeC.Rmd ├── dttreeC.html ├── dttreeR.Rmd ├── fnn.Rmd ├── index.Rmd ├── information-extraction.Rmd ├── install.Rmd ├── k-folds-cross-validation.Rmd ├── kmeans.Rmd ├── linear-regression.Rmd ├── linearRegression.Rmd ├── logistic-regression.Rmd ├── machine-learning-framework.Rmd ├── nlp-and-nltk-basics.Rmd ├── nlpC.Rmd ├── nlpLDA.Rmd ├── pyspark-on-jupyter.Rmd ├── pyspark-on-rodeo.Rmd ├── pyspark-vectors.Rmd ├── pyspark.ml.feature-module.Rmd ├── r-markdown-header.Rmd ├── randomforest.Rmd ├── randomforestC.Rmd ├── regularization.Rmd ├── sna.Rmd ├── spark-on-jetstream-cloud.Rmd └── tf-idf.Rmd ├── link-spark-with-jupyter.md ├── logo.jpg ├── notebooks ├── 01-data-strcture │ ├── .gitignore │ ├── .ipynb_checkpoints │ │ ├── 1.1-rdd-checkpoint.ipynb │ │ ├── 1.2-dataframe-checkpoint.ipynb │ │ ├── 1.3-conversion-between-rdd-and-dataframe-checkpoint.ipynb │ │ └── 1.4-merge-and-split-columns-checkpoint.ipynb │ ├── 1.1-rdd.ipynb │ ├── 1.2-dataframe.ipynb │ ├── 1.3-conversion-between-rdd-and-dataframe.ipynb │ └── 1.4-merge-and-split-columns.ipynb ├── 02-data-manipulation │ ├── .ipynb_checkpoints │ │ ├── 2.1-map-functions-checkpoint.ipynb │ │ ├── 2.2-aggregate-functions-checkpoint.ipynb │ │ ├── 2.3-continuous-variable-to-categorical-variable-checkpoint.ipynb │ │ ├── 2.4-first-data-check-checkpoint.ipynb │ │ ├── 2.7.1-column-expression-checkpoint.ipynb │ │ ├── 2.7.3-boolean-column-expression-checkpoint.ipynb │ │ ├── 2.8-sql-functions-to-extend-column-expressions-checkpoint.ipynb │ │ └── 2.9-user-defined-sql-function (udf)-checkpoint.ipynb │ ├── 2.1-map-functions.ipynb │ ├── 2.2-aggregate-functions.ipynb │ ├── 2.3-continuous-variable-to-categorical-variable.ipynb │ ├── 2.4-first-data-check.ipynb │ ├── 2.5-subset-dataframe-by-row.ipynb │ ├── 2.6-subset-dataframe-by-column.ipynb │ ├── 2.7.1-column-expression.ipynb │ ├── 2.7.2-dot-column-expression.ipynb │ ├── 2.7.3-boolean-column-expression.ipynb │ ├── 2.8-sql-functions-to-extend-column-expressions.ipynb │ ├── 2.9-user-defined-sql-function (udf).ipynb │ └── import-and-export-data.ipynb ├── 03-data-preparation │ ├── stringindexer-and-onehotencoder.ipynb │ └── vector-assembler.ipynb ├── 04-miscellaneous │ ├── .ipynb_checkpoints │ │ └── user-defined-sql-function (udf)-checkpoint.ipynb │ ├── TF-IDF.ipynb │ ├── add-python-files-to-spark-cluster.ipynb │ ├── dense-vs-sparse-vectors.ipynb │ ├── issues-and-solutions.ipynb │ ├── pipeline.ipynb │ └── 
sql-functions.ipynb ├── 05-module-turning │ ├── cross-validation.ipynb │ └── regularization.ipynb ├── 06-machine-learning │ ├── classification │ │ ├── binary-classification.ipynb │ │ ├── decision-tree-classification.ipynb │ │ ├── gradient-boost-tree-classification.ipynb │ │ ├── logistic-regression.ipynb │ │ ├── naive-bayes-classification.ipynb │ │ └── random-forest-classification.ipynb │ └── regression │ │ ├── generalized-linear-regression.ipynb │ │ └── linear-regression.ipynb ├── 07-natural-language-processing │ ├── nlp-and-nltk-basics.ipynb │ ├── nlp-information-extraction.ipynb │ └── skills-needed-for-nlp-jobs.ipynb └── ipynb │ ├── .ipynb_checkpoints │ ├── DecisionTree-checkpoint.ipynb │ ├── Feedforward neural network(1)-checkpoint.ipynb │ ├── HashingTF-and-CountVectorizer-checkpoint.ipynb │ ├── NaiveBayes-checkpoint.ipynb │ └── RDD-manipulation-checkpoint.ipynb │ ├── Categoricaldata.ipynb │ ├── DataWrangling.ipynb │ ├── DecisionTree.ipynb │ ├── DecisionTreeC3.ipynb │ ├── DecisionTreeC7.ipynb │ ├── DecisionTreeR.ipynb │ ├── Feedforward neural network(1).ipynb │ ├── Feedforward neural network.ipynb │ ├── HashingTF-and-CountVectorizer.ipynb │ ├── LinearRegression.ipynb │ ├── NaiveBayes.ipynb │ ├── Natural Language Processing nb.ipynb │ ├── PysparkCluster.ipynb │ ├── RandomForest.ipynb │ ├── Regression.ipynb │ ├── derby.log │ ├── preproc.py │ └── vector.ipynb ├── pyFiles ├── .idea │ ├── misc.xml │ ├── modules.xml │ ├── pyFiles.iml │ └── workspace.xml └── my_module.py └── vakata-jstree-3.3.5 ├── .gitignore ├── LICENSE-MIT ├── README.md ├── bower.json ├── component.json ├── composer.json ├── demo ├── README.md └── basic │ ├── index.html │ └── root.json ├── dist ├── jstree.js ├── jstree.min.js └── themes │ ├── default-dark │ ├── 32px.png │ ├── 40px.png │ ├── style.css │ ├── style.min.css │ └── throbber.gif │ └── default │ ├── 32px.png │ ├── 40px.png │ ├── style.css │ ├── style.min.css │ └── throbber.gif ├── gruntfile.js ├── jstree.jquery.json ├── package.json ├── src ├── intro.js ├── jstree.changed.js ├── jstree.checkbox.js ├── jstree.conditionalselect.js ├── jstree.contextmenu.js ├── jstree.dnd.js ├── jstree.js ├── jstree.massload.js ├── jstree.search.js ├── jstree.sort.js ├── jstree.state.js ├── jstree.types.js ├── jstree.unique.js ├── jstree.wholerow.js ├── misc.js ├── outro.js ├── sample.js ├── themes │ ├── base.less │ ├── default-dark │ │ ├── 32px.png │ │ ├── 40px.png │ │ ├── style.css │ │ ├── style.less │ │ └── throbber.gif │ ├── default │ │ ├── 32px.png │ │ ├── 40px.png │ │ ├── style.css │ │ ├── style.less │ │ └── throbber.gif │ ├── main.less │ ├── mixins.less │ └── responsive.less └── vakata-jstree.js └── test ├── unit ├── index.html ├── libs │ ├── qunit.css │ └── qunit.js └── test.js └── visual ├── desktop └── index.html ├── mobile └── index.html └── screenshots ├── desktop ├── .png ├── desktop.png └── home.png └── mobile ├── .png ├── home.png └── mobile.png /.gitignore: -------------------------------------------------------------------------------- 1 | /.Rproj.user 2 | /.Rhistory 3 | .RData 4 | .Ruserdata 5 | .DS_Store 6 | *.ipybn 7 | /.ipynb_checkpoints 8 | /.idea 9 | .Rproj.user 10 | metastore_db 11 | *_cache 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Ming Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the 
"Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [Learning Apache Spark](https://mingchen0919.github.io/learning-apache-spark/) 2 | 3 | 4 | Website: https://mingchen0919.github.io/learning-apache-spark/index.html 5 | -------------------------------------------------------------------------------- /data/Advertising.csv: -------------------------------------------------------------------------------- 1 | TV,Radio,Newspaper,Sales 230.1,37.8,69.2,22.1 44.5,39.3,45.1,10.4 17.2,45.9,69.3,9.3 151.5,41.3,58.5,18.5 180.8,10.8,58.4,12.9 8.7,48.9,75,7.2 57.5,32.8,23.5,11.8 120.2,19.6,11.6,13.2 8.6,2.1,1,4.8 199.8,2.6,21.2,10.6 66.1,5.8,24.2,8.6 214.7,24,4,17.4 23.8,35.1,65.9,9.2 97.5,7.6,7.2,9.7 204.1,32.9,46,19 195.4,47.7,52.9,22.4 67.8,36.6,114,12.5 281.4,39.6,55.8,24.4 69.2,20.5,18.3,11.3 147.3,23.9,19.1,14.6 218.4,27.7,53.4,18 237.4,5.1,23.5,12.5 13.2,15.9,49.6,5.6 228.3,16.9,26.2,15.5 62.3,12.6,18.3,9.7 262.9,3.5,19.5,12 142.9,29.3,12.6,15 240.1,16.7,22.9,15.9 248.8,27.1,22.9,18.9 70.6,16,40.8,10.5 292.9,28.3,43.2,21.4 112.9,17.4,38.6,11.9 97.2,1.5,30,9.6 265.6,20,0.3,17.4 95.7,1.4,7.4,9.5 290.7,4.1,8.5,12.8 266.9,43.8,5,25.4 74.7,49.4,45.7,14.7 43.1,26.7,35.1,10.1 228,37.7,32,21.5 202.5,22.3,31.6,16.6 177,33.4,38.7,17.1 293.6,27.7,1.8,20.7 206.9,8.4,26.4,12.9 25.1,25.7,43.3,8.5 175.1,22.5,31.5,14.9 89.7,9.9,35.7,10.6 239.9,41.5,18.5,23.2 227.2,15.8,49.9,14.8 66.9,11.7,36.8,9.7 199.8,3.1,34.6,11.4 100.4,9.6,3.6,10.7 216.4,41.7,39.6,22.6 182.6,46.2,58.7,21.2 262.7,28.8,15.9,20.2 198.9,49.4,60,23.7 7.3,28.1,41.4,5.5 136.2,19.2,16.6,13.2 210.8,49.6,37.7,23.8 210.7,29.5,9.3,18.4 53.5,2,21.4,8.1 261.3,42.7,54.7,24.2 239.3,15.5,27.3,15.7 102.7,29.6,8.4,14 131.1,42.8,28.9,18 69,9.3,0.9,9.3 31.5,24.6,2.2,9.5 139.3,14.5,10.2,13.4 237.4,27.5,11,18.9 216.8,43.9,27.2,22.3 199.1,30.6,38.7,18.3 109.8,14.3,31.7,12.4 26.8,33,19.3,8.8 129.4,5.7,31.3,11 213.4,24.6,13.1,17 16.9,43.7,89.4,8.7 27.5,1.6,20.7,6.9 120.5,28.5,14.2,14.2 5.4,29.9,9.4,5.3 116,7.7,23.1,11 76.4,26.7,22.3,11.8 239.8,4.1,36.9,12.3 75.3,20.3,32.5,11.3 68.4,44.5,35.6,13.6 213.5,43,33.8,21.7 193.2,18.4,65.7,15.2 76.3,27.5,16,12 110.7,40.6,63.2,16 88.3,25.5,73.4,12.9 109.8,47.8,51.4,16.7 134.3,4.9,9.3,11.2 28.6,1.5,33,7.3 217.7,33.5,59,19.4 250.9,36.5,72.3,22.2 107.4,14,10.9,11.5 163.3,31.6,52.9,16.9 197.6,3.5,5.9,11.7 184.9,21,22,15.5 289.7,42.3,51.2,25.4 135.2,41.7,45.9,17.2 222.4,4.3,49.8,11.7 296.4,36.3,100.9,23.8 280.2,10.1,21.4,14.8 187.9,17.2,17.9,14.7 
238.2,34.3,5.3,20.7 137.9,46.4,59,19.2 25,11,29.7,7.2 90.4,0.3,23.2,8.7 13.1,0.4,25.6,5.3 255.4,26.9,5.5,19.8 225.8,8.2,56.5,13.4 241.7,38,23.2,21.8 175.7,15.4,2.4,14.1 209.6,20.6,10.7,15.9 78.2,46.8,34.5,14.6 75.1,35,52.7,12.6 139.2,14.3,25.6,12.2 76.4,0.8,14.8,9.4 125.7,36.9,79.2,15.9 19.4,16,22.3,6.6 141.3,26.8,46.2,15.5 18.8,21.7,50.4,7 224,2.4,15.6,11.6 123.1,34.6,12.4,15.2 229.5,32.3,74.2,19.7 87.2,11.8,25.9,10.6 7.8,38.9,50.6,6.6 80.2,0,9.2,8.8 220.3,49,3.2,24.7 59.6,12,43.1,9.7 0.7,39.6,8.7,1.6 265.2,2.9,43,12.7 8.4,27.2,2.1,5.7 219.8,33.5,45.1,19.6 36.9,38.6,65.6,10.8 48.3,47,8.5,11.6 25.6,39,9.3,9.5 273.7,28.9,59.7,20.8 43,25.9,20.5,9.6 184.9,43.9,1.7,20.7 73.4,17,12.9,10.9 193.7,35.4,75.6,19.2 220.5,33.2,37.9,20.1 104.6,5.7,34.4,10.4 96.2,14.8,38.9,11.4 140.3,1.9,9,10.3 240.1,7.3,8.7,13.2 243.2,49,44.3,25.4 38,40.3,11.9,10.9 44.7,25.8,20.6,10.1 280.7,13.9,37,16.1 121,8.4,48.7,11.6 197.6,23.3,14.2,16.6 171.3,39.7,37.7,19 187.8,21.1,9.5,15.6 4.1,11.6,5.7,3.2 93.9,43.5,50.5,15.3 149.8,1.3,24.3,10.1 11.7,36.9,45.2,7.3 131.7,18.4,34.6,12.9 172.5,18.1,30.7,14.4 85.7,35.8,49.3,13.3 188.4,18.1,25.6,14.9 163.5,36.8,7.4,18 117.2,14.7,5.4,11.9 234.5,3.4,84.8,11.9 17.9,37.6,21.6,8 206.8,5.2,19.4,12.2 215.4,23.6,57.6,17.1 284.3,10.6,6.4,15 50,11.6,18.4,8.4 164.5,20.9,47.4,14.5 19.6,20.1,17,7.6 168.4,7.1,12.8,11.7 222.4,3.4,13.1,11.5 276.9,48.9,41.8,27 248.4,30.2,20.3,20.2 170.2,7.8,35.2,11.7 276.7,2.3,23.7,11.8 165.6,10,17.6,12.6 156.6,2.6,8.3,10.5 218.5,5.4,27.4,12.2 56.2,5.7,29.7,8.7 287.6,43,71.8,26.2 253.8,21.3,30,17.6 205,45.1,19.6,22.6 139.5,2.1,26.6,10.3 191.1,28.7,18.2,17.3 286,13.9,3.7,15.9 18.7,12.1,23.4,6.7 39.5,41.1,5.8,10.8 75.5,10.8,6,9.9 17.2,4.1,31.6,5.9 166.8,42,3.6,19.6 149.7,35.6,6,17.3 38.2,3.7,13.8,7.6 94.2,4.9,8.1,9.7 177,9.3,6.4,12.8 283.6,42,66.2,25.5 232.1,8.6,8.7,13.4 -------------------------------------------------------------------------------- /data/airquality.csv: -------------------------------------------------------------------------------- 1 | "ozone","solar.r","wind","temp","month","day" 2 | 41,190,7.4,67,5,1 3 | 36,118,8,72,5,2 4 | 12,149,12.6,74,5,3 5 | 18,313,11.5,62,5,4 6 | NA,NA,14.3,56,5,5 7 | 28,NA,14.9,66,5,6 8 | 23,299,8.6,65,5,7 9 | 19,99,13.8,59,5,8 10 | 8,19,20.1,61,5,9 11 | NA,194,8.6,69,5,10 12 | 7,NA,6.9,74,5,11 13 | 16,256,9.7,69,5,12 14 | 11,290,9.2,66,5,13 15 | 14,274,10.9,68,5,14 16 | 18,65,13.2,58,5,15 17 | 14,334,11.5,64,5,16 18 | 34,307,12,66,5,17 19 | 6,78,18.4,57,5,18 20 | 30,322,11.5,68,5,19 21 | 11,44,9.7,62,5,20 22 | 1,8,9.7,59,5,21 23 | 11,320,16.6,73,5,22 24 | 4,25,9.7,61,5,23 25 | 32,92,12,61,5,24 26 | NA,66,16.6,57,5,25 27 | NA,266,14.9,58,5,26 28 | NA,NA,8,57,5,27 29 | 23,13,12,67,5,28 30 | 45,252,14.9,81,5,29 31 | 115,223,5.7,79,5,30 32 | 37,279,7.4,76,5,31 33 | NA,286,8.6,78,6,1 34 | NA,287,9.7,74,6,2 35 | NA,242,16.1,67,6,3 36 | NA,186,9.2,84,6,4 37 | NA,220,8.6,85,6,5 38 | NA,264,14.3,79,6,6 39 | 29,127,9.7,82,6,7 40 | NA,273,6.9,87,6,8 41 | 71,291,13.8,90,6,9 42 | 39,323,11.5,87,6,10 43 | NA,259,10.9,93,6,11 44 | NA,250,9.2,92,6,12 45 | 23,148,8,82,6,13 46 | NA,332,13.8,80,6,14 47 | NA,322,11.5,79,6,15 48 | 21,191,14.9,77,6,16 49 | 37,284,20.7,72,6,17 50 | 20,37,9.2,65,6,18 51 | 12,120,11.5,73,6,19 52 | 13,137,10.3,76,6,20 53 | NA,150,6.3,77,6,21 54 | NA,59,1.7,76,6,22 55 | NA,91,4.6,76,6,23 56 | NA,250,6.3,76,6,24 57 | NA,135,8,75,6,25 58 | NA,127,8,78,6,26 59 | NA,47,10.3,73,6,27 60 | NA,98,11.5,80,6,28 61 | NA,31,14.9,77,6,29 62 | NA,138,8,83,6,30 63 | 135,269,4.1,84,7,1 64 | 49,248,9.2,85,7,2 65 | 32,236,9.2,81,7,3 
66 | NA,101,10.9,84,7,4 67 | 64,175,4.6,83,7,5 68 | 40,314,10.9,83,7,6 69 | 77,276,5.1,88,7,7 70 | 97,267,6.3,92,7,8 71 | 97,272,5.7,92,7,9 72 | 85,175,7.4,89,7,10 73 | NA,139,8.6,82,7,11 74 | 10,264,14.3,73,7,12 75 | 27,175,14.9,81,7,13 76 | NA,291,14.9,91,7,14 77 | 7,48,14.3,80,7,15 78 | 48,260,6.9,81,7,16 79 | 35,274,10.3,82,7,17 80 | 61,285,6.3,84,7,18 81 | 79,187,5.1,87,7,19 82 | 63,220,11.5,85,7,20 83 | 16,7,6.9,74,7,21 84 | NA,258,9.7,81,7,22 85 | NA,295,11.5,82,7,23 86 | 80,294,8.6,86,7,24 87 | 108,223,8,85,7,25 88 | 20,81,8.6,82,7,26 89 | 52,82,12,86,7,27 90 | 82,213,7.4,88,7,28 91 | 50,275,7.4,86,7,29 92 | 64,253,7.4,83,7,30 93 | 59,254,9.2,81,7,31 94 | 39,83,6.9,81,8,1 95 | 9,24,13.8,81,8,2 96 | 16,77,7.4,82,8,3 97 | 78,NA,6.9,86,8,4 98 | 35,NA,7.4,85,8,5 99 | 66,NA,4.6,87,8,6 100 | 122,255,4,89,8,7 101 | 89,229,10.3,90,8,8 102 | 110,207,8,90,8,9 103 | NA,222,8.6,92,8,10 104 | NA,137,11.5,86,8,11 105 | 44,192,11.5,86,8,12 106 | 28,273,11.5,82,8,13 107 | 65,157,9.7,80,8,14 108 | NA,64,11.5,79,8,15 109 | 22,71,10.3,77,8,16 110 | 59,51,6.3,79,8,17 111 | 23,115,7.4,76,8,18 112 | 31,244,10.9,78,8,19 113 | 44,190,10.3,78,8,20 114 | 21,259,15.5,77,8,21 115 | 9,36,14.3,72,8,22 116 | NA,255,12.6,75,8,23 117 | 45,212,9.7,79,8,24 118 | 168,238,3.4,81,8,25 119 | 73,215,8,86,8,26 120 | NA,153,5.7,88,8,27 121 | 76,203,9.7,97,8,28 122 | 118,225,2.3,94,8,29 123 | 84,237,6.3,96,8,30 124 | 85,188,6.3,94,8,31 125 | 96,167,6.9,91,9,1 126 | 78,197,5.1,92,9,2 127 | 73,183,2.8,93,9,3 128 | 91,189,4.6,93,9,4 129 | 47,95,7.4,87,9,5 130 | 32,92,15.5,84,9,6 131 | 20,252,10.9,80,9,7 132 | 23,220,10.3,78,9,8 133 | 21,230,10.9,75,9,9 134 | 24,259,9.7,73,9,10 135 | 44,236,14.9,81,9,11 136 | 21,259,15.5,76,9,12 137 | 28,238,6.3,77,9,13 138 | 9,24,10.9,71,9,14 139 | 13,112,11.5,71,9,15 140 | 46,237,6.9,78,9,16 141 | 18,224,13.8,67,9,17 142 | 13,27,10.3,76,9,18 143 | 24,238,10.3,68,9,19 144 | 16,201,8,82,9,20 145 | 13,238,12.6,64,9,21 146 | 23,14,9.2,71,9,22 147 | 36,139,10.3,81,9,23 148 | 7,49,10.3,69,9,24 149 | 14,20,16.6,63,9,25 150 | 30,193,6.9,70,9,26 151 | NA,145,13.2,77,9,27 152 | 14,191,14.3,75,9,28 153 | 18,131,8,76,9,29 154 | 20,223,11.5,68,9,30 155 | -------------------------------------------------------------------------------- /data/horseshoe_crab.csv: -------------------------------------------------------------------------------- 1 | C,S,W,Wt,Sa 2 | 2,3,28.3,3.05,8 3 | 3,3,26,2.6,4 4 | 3,3,25.6,2.15,0 5 | 4,2,21,1.85,0 6 | 2,3,29,3,1 7 | 1,2,25,2.3,3 8 | 4,3,26.2,1.3,0 9 | 2,3,24.9,2.1,0 10 | 2,1,25.7,2,8 11 | 2,3,27.5,3.15,6 12 | 1,1,26.1,2.8,5 13 | 3,3,28.9,2.8,4 14 | 2,1,30.3,3.6,3 15 | 2,3,22.9,1.6,4 16 | 3,3,26.2,2.3,3 17 | 3,3,24.5,2.05,5 18 | 2,3,30,3.05,8 19 | 2,3,26.2,2.4,3 20 | 2,3,25.4,2.25,6 21 | 2,3,25.4,2.25,4 22 | 4,3,27.5,2.9,0 23 | 4,3,27,2.25,3 24 | 2,2,24,1.7,0 25 | 2,1,28.7,3.2,0 26 | 3,3,26.5,1.97,1 27 | 2,3,24.5,1.6,1 28 | 3,3,27.3,2.9,1 29 | 2,3,26.5,2.3,4 30 | 2,3,25,2.1,2 31 | 3,3,22,1.4,0 32 | 1,1,30.2,3.28,2 33 | 2,2,25.4,2.3,0 34 | 2,1,24.9,2.3,6 35 | 4,3,25.8,2.25,10 36 | 3,3,27.2,2.4,5 37 | 2,3,30.5,3.32,3 38 | 4,3,25,2.1,8 39 | 2,3,30,3,9 40 | 2,1,22.9,1.6,0 41 | 2,3,23.9,1.85,2 42 | 2,3,26,2.28,3 43 | 2,3,25.8,2.2,0 44 | 3,3,29,3.28,4 45 | 1,1,26.5,2.35,0 46 | 3,3,22.5,1.55,0 47 | 2,3,23.8,2.1,0 48 | 3,3,24.3,2.15,0 49 | 2,1,26,2.3,14 50 | 4,3,24.7,2.2,0 51 | 2,1,22.5,1.6,1 52 | 2,3,28.7,3.15,3 53 | 1,1,29.3,3.2,4 54 | 2,1,26.7,2.7,5 55 | 4,3,23.4,1.9,0 56 | 1,1,27.7,2.5,6 57 | 2,3,28.2,2.6,6 58 | 4,3,24.7,2.1,5 59 | 2,1,25.7,2,5 60 | 2,1,27.8,2.75,0 61 | 
3,1,27,2.45,3 62 | 2,3,29,3.2,10 63 | 3,3,25.6,2.8,7 64 | 3,3,24.2,1.9,0 65 | 3,3,25.7,1.2,0 66 | 3,3,23.1,1.65,0 67 | 2,3,28.5,3.05,0 68 | 2,1,29.7,3.85,5 69 | 3,3,23.1,1.55,0 70 | 3,3,24.5,2.2,1 71 | 2,3,27.5,2.55,1 72 | 2,3,26.3,2.4,1 73 | 2,3,27.8,3.25,3 74 | 2,3,31.9,3.33,2 75 | 2,3,25,2.4,5 76 | 3,3,26.2,2.22,0 77 | 3,3,28.4,3.2,3 78 | 1,2,24.5,1.95,6 79 | 2,3,27.9,3.05,7 80 | 2,2,25,2.25,6 81 | 3,3,29,2.92,3 82 | 2,1,31.7,3.73,4 83 | 2,3,27.6,2.85,4 84 | 4,3,24.5,1.9,0 85 | 3,3,23.8,1.8,0 86 | 2,3,28.2,3.05,8 87 | 3,3,24.1,1.8,0 88 | 1,1,28,2.62,0 89 | 1,1,26,2.3,9 90 | 3,2,24.7,1.9,0 91 | 2,3,25.8,2.65,0 92 | 1,1,27.1,2.95,8 93 | 2,3,27.4,2.7,5 94 | 3,3,26.7,2.6,2 95 | 2,1,26.8,2.7,5 96 | 1,3,25.8,2.6,0 97 | 4,3,23.7,1.85,0 98 | 2,3,27.9,2.8,6 99 | 2,1,30,3.3,5 100 | 2,3,25,2.1,4 101 | 2,3,27.7,2.9,5 102 | 2,3,28.3,3,15 103 | 4,3,25.5,2.25,0 104 | 2,3,26,2.15,5 105 | 2,3,26.2,2.4,0 106 | 3,3,23,1.65,1 107 | 2,2,22.9,1.6,0 108 | 2,3,25.1,2.1,5 109 | 3,1,25.9,2.55,4 110 | 4,1,25.5,2.75,0 111 | 2,1,26.8,2.55,0 112 | 2,1,29,2.8,1 113 | 3,3,28.5,3,1 114 | 2,2,24.7,2.55,4 115 | 2,3,29,3.1,1 116 | 2,3,27,2.5,6 117 | 4,3,23.7,1.8,0 118 | 3,3,27,2.5,6 119 | 2,3,24.2,1.65,2 120 | 4,3,22.5,1.47,4 121 | 2,3,25.1,1.8,0 122 | 2,3,24.9,2.2,0 123 | 2,3,27.5,2.63,6 124 | 2,1,24.3,2,0 125 | 2,3,29.5,3.02,4 126 | 2,3,26.2,2.3,0 127 | 2,3,24.7,1.95,4 128 | 3,2,29.8,3.5,4 129 | 4,3,25.7,2.15,0 130 | 3,3,26.2,2.17,2 131 | 4,3,27,2.63,0 132 | 3,3,24.8,2.1,0 133 | 2,1,23.7,1.95,0 134 | 2,3,28.2,3.05,11 135 | 2,3,25.2,2,1 136 | 2,2,23.2,1.95,4 137 | 4,3,25.8,2,3 138 | 4,3,27.5,2.6,0 139 | 2,2,25.7,2,0 140 | 2,3,26.8,2.65,0 141 | 3,3,27.5,3.1,3 142 | 3,1,28.5,3.25,9 143 | 2,3,28.5,3,3 144 | 1,1,27.4,2.7,6 145 | 2,3,27.2,2.7,3 146 | 3,3,27.1,2.55,0 147 | 2,3,28,2.8,1 148 | 2,1,26.5,1.3,0 149 | 3,3,23,1.8,0 150 | 3,2,26,2.2,3 151 | 3,2,24.5,2.25,0 152 | 2,3,25.8,2.3,0 153 | 4,3,23.5,1.9,0 154 | 4,3,26.7,2.45,0 155 | 3,3,25.5,2.25,0 156 | 2,3,28.2,2.87,1 157 | 2,1,25.2,2,1 158 | 2,3,25.3,1.9,2 159 | 3,3,25.7,2.1,0 160 | 4,3,29.3,3.23,12 161 | 3,3,23.8,1.8,6 162 | 2,3,27.4,2.9,3 163 | 2,3,26.2,2.02,2 164 | 2,1,28,2.9,4 165 | 2,1,28.4,3.1,5 166 | 2,1,33.5,5.2,7 167 | 2,3,25.8,2.4,0 168 | 3,3,24,1.9,10 169 | 2,1,23.1,2,0 170 | 2,3,28.3,3.2,0 171 | 2,3,26.5,2.35,4 172 | 2,3,26.5,2.75,7 173 | 3,3,26.1,2.75,3 174 | 2,2,24.5,2,0 -------------------------------------------------------------------------------- /data/iris.csv: -------------------------------------------------------------------------------- 1 | sepal_length,sepal_width,petal_length,petal_width,species 2 | 5.1,3.5,1.4,0.2,setosa 3 | 4.9,3,1.4,0.2,setosa 4 | 4.7,3.2,1.3,0.2,setosa 5 | 4.6,3.1,1.5,0.2,setosa 6 | 5,3.6,1.4,0.2,setosa 7 | 5.4,3.9,1.7,0.4,setosa 8 | 4.6,3.4,1.4,0.3,setosa 9 | 5,3.4,1.5,0.2,setosa 10 | 4.4,2.9,1.4,0.2,setosa 11 | 4.9,3.1,1.5,0.1,setosa 12 | 5.4,3.7,1.5,0.2,setosa 13 | 4.8,3.4,1.6,0.2,setosa 14 | 4.8,3,1.4,0.1,setosa 15 | 4.3,3,1.1,0.1,setosa 16 | 5.8,4,1.2,0.2,setosa 17 | 5.7,4.4,1.5,0.4,setosa 18 | 5.4,3.9,1.3,0.4,setosa 19 | 5.1,3.5,1.4,0.3,setosa 20 | 5.7,3.8,1.7,0.3,setosa 21 | 5.1,3.8,1.5,0.3,setosa 22 | 5.4,3.4,1.7,0.2,setosa 23 | 5.1,3.7,1.5,0.4,setosa 24 | 4.6,3.6,1,0.2,setosa 25 | 5.1,3.3,1.7,0.5,setosa 26 | 4.8,3.4,1.9,0.2,setosa 27 | 5,3,1.6,0.2,setosa 28 | 5,3.4,1.6,0.4,setosa 29 | 5.2,3.5,1.5,0.2,setosa 30 | 5.2,3.4,1.4,0.2,setosa 31 | 4.7,3.2,1.6,0.2,setosa 32 | 4.8,3.1,1.6,0.2,setosa 33 | 5.4,3.4,1.5,0.4,setosa 34 | 5.2,4.1,1.5,0.1,setosa 35 | 5.5,4.2,1.4,0.2,setosa 36 | 4.9,3.1,1.5,0.1,setosa 37 | 
5,3.2,1.2,0.2,setosa 38 | 5.5,3.5,1.3,0.2,setosa 39 | 4.9,3.1,1.5,0.1,setosa 40 | 4.4,3,1.3,0.2,setosa 41 | 5.1,3.4,1.5,0.2,setosa 42 | 5,3.5,1.3,0.3,setosa 43 | 4.5,2.3,1.3,0.3,setosa 44 | 4.4,3.2,1.3,0.2,setosa 45 | 5,3.5,1.6,0.6,setosa 46 | 5.1,3.8,1.9,0.4,setosa 47 | 4.8,3,1.4,0.3,setosa 48 | 5.1,3.8,1.6,0.2,setosa 49 | 4.6,3.2,1.4,0.2,setosa 50 | 5.3,3.7,1.5,0.2,setosa 51 | 5,3.3,1.4,0.2,setosa 52 | 7,3.2,4.7,1.4,versicolor 53 | 6.4,3.2,4.5,1.5,versicolor 54 | 6.9,3.1,4.9,1.5,versicolor 55 | 5.5,2.3,4,1.3,versicolor 56 | 6.5,2.8,4.6,1.5,versicolor 57 | 5.7,2.8,4.5,1.3,versicolor 58 | 6.3,3.3,4.7,1.6,versicolor 59 | 4.9,2.4,3.3,1,versicolor 60 | 6.6,2.9,4.6,1.3,versicolor 61 | 5.2,2.7,3.9,1.4,versicolor 62 | 5,2,3.5,1,versicolor 63 | 5.9,3,4.2,1.5,versicolor 64 | 6,2.2,4,1,versicolor 65 | 6.1,2.9,4.7,1.4,versicolor 66 | 5.6,2.9,3.6,1.3,versicolor 67 | 6.7,3.1,4.4,1.4,versicolor 68 | 5.6,3,4.5,1.5,versicolor 69 | 5.8,2.7,4.1,1,versicolor 70 | 6.2,2.2,4.5,1.5,versicolor 71 | 5.6,2.5,3.9,1.1,versicolor 72 | 5.9,3.2,4.8,1.8,versicolor 73 | 6.1,2.8,4,1.3,versicolor 74 | 6.3,2.5,4.9,1.5,versicolor 75 | 6.1,2.8,4.7,1.2,versicolor 76 | 6.4,2.9,4.3,1.3,versicolor 77 | 6.6,3,4.4,1.4,versicolor 78 | 6.8,2.8,4.8,1.4,versicolor 79 | 6.7,3,5,1.7,versicolor 80 | 6,2.9,4.5,1.5,versicolor 81 | 5.7,2.6,3.5,1,versicolor 82 | 5.5,2.4,3.8,1.1,versicolor 83 | 5.5,2.4,3.7,1,versicolor 84 | 5.8,2.7,3.9,1.2,versicolor 85 | 6,2.7,5.1,1.6,versicolor 86 | 5.4,3,4.5,1.5,versicolor 87 | 6,3.4,4.5,1.6,versicolor 88 | 6.7,3.1,4.7,1.5,versicolor 89 | 6.3,2.3,4.4,1.3,versicolor 90 | 5.6,3,4.1,1.3,versicolor 91 | 5.5,2.5,4,1.3,versicolor 92 | 5.5,2.6,4.4,1.2,versicolor 93 | 6.1,3,4.6,1.4,versicolor 94 | 5.8,2.6,4,1.2,versicolor 95 | 5,2.3,3.3,1,versicolor 96 | 5.6,2.7,4.2,1.3,versicolor 97 | 5.7,3,4.2,1.2,versicolor 98 | 5.7,2.9,4.2,1.3,versicolor 99 | 6.2,2.9,4.3,1.3,versicolor 100 | 5.1,2.5,3,1.1,versicolor 101 | 5.7,2.8,4.1,1.3,versicolor 102 | 6.3,3.3,6,2.5,virginica 103 | 5.8,2.7,5.1,1.9,virginica 104 | 7.1,3,5.9,2.1,virginica 105 | 6.3,2.9,5.6,1.8,virginica 106 | 6.5,3,5.8,2.2,virginica 107 | 7.6,3,6.6,2.1,virginica 108 | 4.9,2.5,4.5,1.7,virginica 109 | 7.3,2.9,6.3,1.8,virginica 110 | 6.7,2.5,5.8,1.8,virginica 111 | 7.2,3.6,6.1,2.5,virginica 112 | 6.5,3.2,5.1,2,virginica 113 | 6.4,2.7,5.3,1.9,virginica 114 | 6.8,3,5.5,2.1,virginica 115 | 5.7,2.5,5,2,virginica 116 | 5.8,2.8,5.1,2.4,virginica 117 | 6.4,3.2,5.3,2.3,virginica 118 | 6.5,3,5.5,1.8,virginica 119 | 7.7,3.8,6.7,2.2,virginica 120 | 7.7,2.6,6.9,2.3,virginica 121 | 6,2.2,5,1.5,virginica 122 | 6.9,3.2,5.7,2.3,virginica 123 | 5.6,2.8,4.9,2,virginica 124 | 7.7,2.8,6.7,2,virginica 125 | 6.3,2.7,4.9,1.8,virginica 126 | 6.7,3.3,5.7,2.1,virginica 127 | 7.2,3.2,6,1.8,virginica 128 | 6.2,2.8,4.8,1.8,virginica 129 | 6.1,3,4.9,1.8,virginica 130 | 6.4,2.8,5.6,2.1,virginica 131 | 7.2,3,5.8,1.6,virginica 132 | 7.4,2.8,6.1,1.9,virginica 133 | 7.9,3.8,6.4,2,virginica 134 | 6.4,2.8,5.6,2.2,virginica 135 | 6.3,2.8,5.1,1.5,virginica 136 | 6.1,2.6,5.6,1.4,virginica 137 | 7.7,3,6.1,2.3,virginica 138 | 6.3,3.4,5.6,2.4,virginica 139 | 6.4,3.1,5.5,1.8,virginica 140 | 6,3,4.8,1.8,virginica 141 | 6.9,3.1,5.4,2.1,virginica 142 | 6.7,3.1,5.6,2.4,virginica 143 | 6.9,3.1,5.1,2.3,virginica 144 | 5.8,2.7,5.1,1.9,virginica 145 | 6.8,3.2,5.9,2.3,virginica 146 | 6.7,3.3,5.7,2.5,virginica 147 | 6.7,3,5.2,2.3,virginica 148 | 6.3,2.5,5,1.9,virginica 149 | 6.5,3,5.2,2,virginica 150 | 6.2,3.4,5.4,2.3,virginica 151 | 5.9,3,5.1,1.8,virginica 152 | 
-------------------------------------------------------------------------------- /data/kaggle-titanic-gender_submission.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,1 35 | 925,1 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,1 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,1 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,1 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,1 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,1 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 | 1103,0 214 | 1104,0 215 | 1105,1 216 | 1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 273 | 1163,0 
274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 | 1175,1 286 | 1176,1 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,1 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,1 368 | 1258,0 369 | 1259,1 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | -------------------------------------------------------------------------------- /data/mtcars.csv: -------------------------------------------------------------------------------- 1 | ,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb 2 | Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4 3 | Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4 4 | Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1 5 | Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1 6 | Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2 7 | Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1 8 | Duster 360,14.3,8,360,245,3.21,3.57,15.84,0,0,3,4 9 | Merc 240D,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2 10 | Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2 11 | Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4 12 | Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4 13 | Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3 14 | Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3 15 | Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18,0,0,3,3 16 | Cadillac Fleetwood,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4 17 | Lincoln Continental,10.4,8,460,215,3,5.424,17.82,0,0,3,4 18 | Chrysler Imperial,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4 19 | Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1 20 | Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2 21 | Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1 22 | Toyota Corona,21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1 23 | Dodge Challenger,15.5,8,318,150,2.76,3.52,16.87,0,0,3,2 24 | AMC Javelin,15.2,8,304,150,3.15,3.435,17.3,0,0,3,2 25 | Camaro Z28,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4 26 | Pontiac Firebird,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2 27 | 
Fiat X1-9,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1 28 | Porsche 914-2,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2 29 | Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2 30 | Ford Pantera L,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4 31 | Ferrari Dino,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6 32 | Maserati Bora,15,8,301,335,3.54,3.57,14.6,0,1,5,8 33 | Volvo 142E,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2 -------------------------------------------------------------------------------- /data/saved-mtcars/.part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/data/saved-mtcars/.part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv.crc -------------------------------------------------------------------------------- /data/saved-mtcars/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/data/saved-mtcars/_SUCCESS -------------------------------------------------------------------------------- /data/saved-mtcars/part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv: -------------------------------------------------------------------------------- 1 | x_rown_ames,x_mpg,x_cyl,x_disp,x_hp,x_drat,x_wt,x_qsec,x_vs,x_am,x_gear,x_carb 2 | Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4 3 | Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4 4 | Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1 5 | Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1 6 | Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2 7 | Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1 8 | Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4 9 | Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2 10 | Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2 11 | Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4 12 | Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4 13 | Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3 14 | Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3 15 | Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18.0,0,0,3,3 16 | Cadillac Fleetwood,10.4,8,472.0,205,2.93,5.25,17.98,0,0,3,4 17 | Lincoln Continental,10.4,8,460.0,215,3.0,5.424,17.82,0,0,3,4 18 | Chrysler Imperial,14.7,8,440.0,230,3.23,5.345,17.42,0,0,3,4 19 | Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1 20 | Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2 21 | Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1 22 | Toyota Corona,21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1 23 | Dodge Challenger,15.5,8,318.0,150,2.76,3.52,16.87,0,0,3,2 24 | AMC Javelin,15.2,8,304.0,150,3.15,3.435,17.3,0,0,3,2 25 | Camaro Z28,13.3,8,350.0,245,3.73,3.84,15.41,0,0,3,4 26 | Pontiac Firebird,19.2,8,400.0,175,3.08,3.845,17.05,0,0,3,2 27 | Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,1,1,4,1 28 | Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,0,1,5,2 29 | Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2 30 | Ford Pantera L,15.8,8,351.0,264,4.22,3.17,14.5,0,1,5,4 31 | Ferrari Dino,19.7,6,145.0,175,3.62,2.77,15.5,0,1,5,6 32 | Maserati Bora,15.0,8,301.0,335,3.54,3.57,14.6,0,1,5,8 33 | Volvo 142E,21.4,4,121.0,109,4.11,2.78,18.6,1,1,4,2 34 | -------------------------------------------------------------------------------- /data/saved-twitter/.part-00000.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/data/saved-twitter/.part-00000.crc -------------------------------------------------------------------------------- /data/saved-twitter/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/data/saved-twitter/_SUCCESS -------------------------------------------------------------------------------- /data/saved-twitter/part-00000: -------------------------------------------------------------------------------- 1 | Fresh install of XP on new computer. Sweet relief! fuck vista 1018769417 1.0 2 | Well. Now I know where to go when I want my knives. #ChiChevySXSW http://post.ly/RvDl 10284216536 1.0 3 | "Literally six weeks before I can take off ""SSC Chair"" off my email. Its like the torturous 4th mile before everything stops hurting." 10298589026 1.0 4 | Mitsubishi i MiEV - Wikipedia, the free encyclopedia - http://goo.gl/xipe Cutest car ever! 109017669432377344 1.0 5 | 'Cheap Eats in SLP' - http://t.co/4w8gRp7 109642968603963392 1.0 6 | Teenage Mutant Ninja Turtle art is never a bad thing... http://bit.ly/aDMHyW 10995492579 1.0 7 | New demographic survey of online video viewers: http://bit.ly/cx8b7I via @KellyOlexa 11713360136 1.0 8 | hi all - i'm going to be tweeting things lookstat at the @lookstat twitter account. please follow me there 1208319583 1.0 9 | Holy carp, no. That movie will seriously suffer for it. RT @MouseInfo: Anyone excited for The Little Mermaid in 3D? 121330835726155776 1.0 10 | "Did I really need to learn ""I bought a box and put in it things"" in arabic? This is the most random book ever." 
12358025545 1.0 11 | -------------------------------------------------------------------------------- /data/titanic/gender_submission.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,1 35 | 925,1 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,1 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,1 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,1 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,1 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,1 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 | 1103,0 214 | 1104,0 215 | 1105,1 216 | 1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 
273 | 1163,0 274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 | 1175,1 286 | 1176,1 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,1 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,1 368 | 1258,0 369 | 1259,1 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | -------------------------------------------------------------------------------- /data/twitter.txt: -------------------------------------------------------------------------------- 1 | Fresh install of XP on new computer. Sweet relief! fuck vista 1018769417 1.0 2 | Well. Now I know where to go when I want my knives. #ChiChevySXSW http://post.ly/RvDl 10284216536 1.0 3 | "Literally six weeks before I can take off ""SSC Chair"" off my email. Its like the torturous 4th mile before everything stops hurting." 10298589026 1.0 4 | Mitsubishi i MiEV - Wikipedia, the free encyclopedia - http://goo.gl/xipe Cutest car ever! 109017669432377344 1.0 5 | 'Cheap Eats in SLP' - http://t.co/4w8gRp7 109642968603963392 1.0 6 | Teenage Mutant Ninja Turtle art is never a bad thing... http://bit.ly/aDMHyW 10995492579 1.0 7 | New demographic survey of online video viewers: http://bit.ly/cx8b7I via @KellyOlexa 11713360136 1.0 8 | hi all - i'm going to be tweeting things lookstat at the @lookstat twitter account. please follow me there 1208319583 1.0 9 | Holy carp, no. That movie will seriously suffer for it. RT @MouseInfo: Anyone excited for The Little Mermaid in 3D? 121330835726155776 1.0 10 | "Did I really need to learn ""I bought a box and put in it things"" in arabic? This is the most random book ever." 
12358025545 1.0 11 | -------------------------------------------------------------------------------- /delete-readme.txt: -------------------------------------------------------------------------------- 1 | # Learning apache spark 2 | 3 | **[Ming Chen](https://github.com/MingChen0919) & [Wenqiang Feng](http://web.utk.edu/~wfeng1/)** 4 | 5 | ## Introduction 6 | 7 | This repository contains mainly notes from learning Apache Spark by [Ming Chen](https://github.com/MingChen0919) & [Wenqiang Feng](http://web.utk.edu/~wfeng1/). We try to use the detailed demo code and examples to show how to use pyspark for big data mining. **If you find your work wasn't cited in this note, please feel free to let us know.** 8 | 9 | ## Content 10 | 11 | * ***Cheat Sheets*** 12 | + [Python Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PythonForDataScience.pdf) 13 | + [Pandas Basics](http://datacamp-community.s3.amazonaws.com/3857975e-e12f-406a-b3e8-7d627217e952) 14 | + [Numpy Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Numpy_Python_Cheat_Sheet.pdf) 15 | + [Scikit-Learn](http://datacamp-community.s3.amazonaws.com/5433fa18-9f43-44cc-b228-74672efcd116) 16 | + [RDD Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_Cheat_Sheet_Python.pdf) 17 | + [DataFrame Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_SQL_Cheat_Sheet_Python.pdf) 18 | + [Apache Spark Cheat Sheet](https://hackr.io/tutorials/learn-apache-spark) 19 | 20 | * ***Data Manipulation*** 21 | + [Entry Points to Spark](entry-points-to-spark.ipynb) 22 | + [RDD Object](rdd-object.ipynb) 23 | + [DataFrame Object](dataframe-object.ipynb) 24 | + [RDD and DataFrame conversion](conversion-between-rdd-and-dataframe.ipynb) 25 | + [Categorical Data, `StringIndexer` and `OneHotEncoder`](stringindexer-and-onehotencoder.ipynb) 26 | + [Continuous variables to categorical variables](Continuous-variable-to-categorical-variable.ipynb) 27 | + [Import and export data](import-and-export-data.ipynb) 28 | + [Subset data](subset-data.ipynb): 29 | * select rows by index 30 | * select rows by logical criteria 31 | * select columns by index 32 | * select columns by names 33 | * select columns by regex pattern 34 | + [`udf()` function and SQL data types](udf-and-sql-types.ipynb): 35 | * use `udf()` function 36 | * difference between `ArrayType` and `StructType` 37 | + [Pipeline](pipeline.ipynb) 38 | + [Dense and sparse vectors](dense-vs-sparse-vectors.ipynb) 39 | + [Assemble feature columns into a `featuresCol` column with `VectorAssembler`](vector-assembler.ipynb) 40 | + [TF-IDF, HashingTF and CountVectorizer](TF-IDF.ipynb) 41 | + Feature processing: 42 | - [First data check](first-data-check.ipynb) 43 | + [SQL functions](sql-functions.ipynb) 44 | + [Add py Files to cluster](add-py-files-to-spark-cluster.ipynb) 45 | 46 | * ***Machine Learning*** 47 | + [Machine Learning Framework](machine-learning-framework.Rmd) 48 | + **Regression** 49 | 50 | - [Linear regression](linear-regression.ipynb) 51 | - [Logistic regression](logistic-regression.ipynb) 52 | 53 | + **Classification** 54 | 55 | - [Naive bayes classification](naive-bayes-classification.ipynb) 56 | - [Decision tree](decision-tree-classification.ipynb) 57 | - [Random forest classification](random-forest-classification.ipynb) 58 | - [Gradient boost tree classification](gradient-boost-tree-classification.ipynb) 59 | 60 | * **Model Tuning** 61 | + [Regularization](regularization.ipynb) 62 | + [Cross-validation](cross-validation.ipynb) 63 | 64 | * 
**Natural Language Processing** 65 | + [NLP and NLTK Basics](nlp-and-nltk-basics.ipynb) 66 | + [NLP Information Extraction](nlp-information-extraction.ipynb) 67 | 68 | ### Acknowledgement 69 | 70 | We would like to thank Jian Sun and Zhongbo Li at the University of Tennessee at Knoxville for the valuable discussion, and the generous anonymous authors who provided detailed solutions and source code on the internet. Without that help, this repository would not have been possible. Wenqiang also would like to thank the Institute for Mathematics and Its Applications (IMA) at the University of Minnesota, Twin Cities for support during his IMA Data Scientist Fellow visit. 71 | 72 | ### Feedback and suggestions 73 | 74 | Your comments and suggestions are highly appreciated. We are more than happy to receive corrections, suggestions, or feedback through email (Ming Chen: mchen33@utk.edu, Wenqiang Feng: wfeng1@utk.edu) for improvement. 75 | -------------------------------------------------------------------------------- /images/simple-nlp-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/images/simple-nlp-pipeline.png -------------------------------------------------------------------------------- /images/spark-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/images/spark-pipeline.png -------------------------------------------------------------------------------- /index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: '[Learning Apache Spark](https://github.com/MingChen0919/learning-apache-spark)' 3 | output: 4 | html_document: 5 | highlight: pygments 6 | --- 7 | 8 | ```{r setup, include=FALSE, warning=FALSE, message=FALSE} 9 | knitr::opts_knit$set(progress = FALSE) 10 | knitr::opts_chunk$set(error = TRUE, echo = FALSE) 11 | library(htmltools) 12 | ``` 13 | 14 | ```{r, echo=FALSE} 15 | # for the css theme to work, tags cannot be added directly 16 | # as raw HTML tags as below; 17 | # they have to be added from a code chunk with the htmltools functions!
18 | css_link = tags$link() 19 | css_link$attribs = list(rel="stylesheet", href="vakata-jstree-3.3.5/dist/themes/default/style.min.css") 20 | css_link 21 | ``` 22 | 23 | ```{r, eval=FALSE, echo=FALSE} 24 | # this code chunk is purely for adding comments 25 | # the section below adds the jQuery and jstree JavaScript files 26 | ``` 27 | 28 | 29 | 30 | ```{r, eval=FALSE, echo=FALSE} 31 | # this code chunk is purely for adding comments 32 | # the JavaScript code below builds the file tree interface 33 | # see this post on how to open a leaf node's hyperlink on click: https://stackoverflow.com/questions/18611317/how-to-get-i-get-leaf-nodes-in-jstree-to-open-their-hyperlink-when-clicked-when 34 | ``` 35 | 43 | 44 | 45 | ```{r} 46 | file_tree = function(dir = '.'){ 47 | # # get the OUTPUT_DIR folder data: dataset_NUMBER_files 48 | # report_files_path = Sys.getenv('REPORT_FILES_PATH') 49 | # output_dir = tail(strsplit(report_files_path, '/')[[1]], 1) 50 | 51 | files = list.files(path = dir, recursive = FALSE, full.names = TRUE) 52 | # the listing also includes directories; remove them 53 | files = files[!dir.exists(files)] 54 | dirs = list.dirs(path = dir, recursive = FALSE, full.names = TRUE) 55 | # exclude the .ipynb_checkpoints folder 56 | # ipynb_checkpoints = grep(pattern = 'ipynb_checkpoints', x = dirs) 57 | # dirs = dirs[-ipynb_checkpoints] 58 | github_repo_url = 'https://github.com/MingChen0919/learning-apache-spark/blob/master/' 59 | tags$ul( 60 | { 61 | if (length(files) > 0) { 62 | lapply(files, function(x){ 63 | path_end = tail(strsplit(x, '/')[[1]],1) 64 | li_item = tags$li(tags$a(path_end, href=paste0(github_repo_url, x))) 65 | li_item$attribs = list('data-jstree'='{"icon":"jstree-file"}') 66 | li_item 67 | }) 68 | } 69 | }, 70 | { 71 | if (length(dirs) > 0) { 72 | lapply(dirs, function(x){ 73 | path_end = tail(strsplit(x, '/')[[1]],1) 74 | if (!(path_end %in% c('vakata-jstree-3.3.5', '.ipynb_checkpoints', 'spark-warehouse', 'ipynb'))) { 75 | li_item = tags$li(path_end, file_tree(x)) 76 | li_item$attribs = list('data-jstree' = '{"icon":"jstree-folder"}', class=list('jstree-open')) 77 | li_item 78 | } 79 | }) 80 | } 81 | } 82 | ) 83 | } 84 | ``` 85 | 86 | 87 | **[Ming Chen](https://github.com/MingChen0919) & [Wenqiang Feng](http://web.utk.edu/~wfeng1/)** 88 | 89 | ## Introduction 90 | 91 | This repository mainly contains notes from learning Apache Spark by [Ming Chen](https://github.com/MingChen0919) & [Wenqiang Feng](http://web.utk.edu/~wfeng1/). We use detailed demo code and examples to show how to use PySpark for big data mining. 
**If you find your work wasn't cited in this note, please feel free to let us know.** 92 | 93 | 94 | ## Cheat Sheets 95 | 96 | + [Python Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PythonForDataScience.pdf) 97 | + [Pandas Basics](http://datacamp-community.s3.amazonaws.com/3857975e-e12f-406a-b3e8-7d627217e952) 98 | + [Numpy Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Numpy_Python_Cheat_Sheet.pdf) 99 | + [Scikit-Learn](http://datacamp-community.s3.amazonaws.com/5433fa18-9f43-44cc-b228-74672efcd116) 100 | + [RDD Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_Cheat_Sheet_Python.pdf) 101 | + [DataFrame Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_SQL_Cheat_Sheet_Python.pdf) 102 | + [Apache Spark Cheat Sheet](https://hackr.io/tutorials/learn-apache-spark) 103 | 104 | ## Contents 105 | 106 | ```{r, echo=FALSE} 107 | # create a div container to hold the file tree interface 108 | tags$div( 109 | id="jstree", 110 | file_tree('notebooks') 111 | ) 112 | ``` 113 | 114 | 115 | ## Feedback and suggestions 116 | 117 | Your comments and suggestions are highly appreciated. We are more than happy to receive corrections, suggestions, or feedback for improvement. 118 | -------------------------------------------------------------------------------- /legacy/01_entry_points_to_spark.Rmd: -------------------------------------------------------------------------------- 1 | 2 | 3 | ```{r setup, include=FALSE} 4 | knitr::opts_chunk$set(echo = TRUE, eval=FALSE) 5 | ``` 6 | 7 | 8 | # Entry points to a Spark cluster 9 | 10 | There are two main entry points to a Spark cluster: 11 | 12 | * **SparkContext**: creates **RDDs** and broadcast variables on the cluster. 13 | * **SparkSession**: creates **DataFrames** (`pyspark.sql.dataframe.DataFrame`).
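In Spark 2.x these two entry points overlap: a `SparkSession` wraps a `SparkContext` and exposes it as `spark.sparkContext`. As a minimal sketch (assuming the `spark` session instance created in the next section), both kinds of objects are reachable from the session alone:

```{python eval=FALSE}
# The session exposes its underlying SparkContext, so both entry
# points are available from a single object.
rdd = spark.sparkContext.parallelize([1, 2, 3])                     # RDD via the wrapped SparkContext
df = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'letter'])  # DataFrame via the session
```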
14 | 15 | # Create entry point instances 16 | 17 | * Create a **SparkContext** instance: 18 | 19 | ```{python eval=FALSE} 20 | from pyspark import SparkContext 21 | sc = SparkContext(master = 'local') 22 | ``` 23 | 24 | * Create a **SparkSession** instance: 25 | 26 | ```{python eval=FALSE} 27 | from pyspark.sql import SparkSession 28 | spark = SparkSession.builder \ 29 | .appName("Python Spark SQL basic example") \ 30 | .config("spark.some.config.option", "some-value") \ 31 | .getOrCreate() 32 | ``` -------------------------------------------------------------------------------- /legacy/03_dataframe_object.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "DataFrame object" 3 | author: "Ming Chen" 4 | date: "6/4/2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE) 10 | ``` 11 | 12 | 13 | # Content 14 | 15 | * [Create a DataFrame object](#create-a-dataframe-object) 16 | * [Column instance](#column-instance) 17 | * [DataFrame column methods](#dataframe-column-methods) 18 | 19 | ## Create a DataFrame object 20 | 21 | ```{python} 22 | mtcars = spark.read.csv(path='data/mtcars.csv', 23 | sep=',', 24 | encoding='UTF-8', 25 | comment=None, 26 | header=True, 27 | inferSchema=True) 28 | ``` 29 | 30 | ```{python} 31 | mtcars.show(n=5, truncate=False) 32 | ``` 33 | 34 | ```{python} 35 | +-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+ 36 | |model |mpg |cyl|disp |hp |drat|wt |qsec |vs |am |gear|carb| 37 | +-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+ 38 | |Mazda RX4 |21.0|6 |160.0|110|3.9 |2.62 |16.46|0 |1 |4 |4 | 39 | |Mazda RX4 Wag |21.0|6 |160.0|110|3.9 |2.875|17.02|0 |1 |4 |4 | 40 | |Datsun 710 |22.8|4 |108.0|93 |3.85|2.32 |18.61|1 |1 |4 |1 | 41 | |Hornet 4 Drive |21.4|6 |258.0|110|3.08|3.215|19.44|1 |0 |3 |1 | 42 | |Hornet Sportabout|18.7|8 |360.0|175|3.15|3.44 |17.02|0 |0 |3 |2 | 43 | +-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+ 44 | only showing top 5 rows 45 | ``` 46 | 47 | 48 | ## Column instance 49 | 50 | Column instances can be created in two ways: 51 | 52 | 1. directly select a column out of a *DataFrame*: `df.colName` 53 | 2. create one from a column expression: `df.colName + 1` 54 | 55 | Technically, there is only one way to create a column instance, since a column expression itself starts from an existing column instance. 56 | 57 | **Remember how to create column instances: they are usually the starting point for operating on DataFrame columns.** 58 | 59 | The `Column` class comes with methods that operate on a column instance. ***In addition, almost all functions from the `pyspark.sql.functions` module take one or more column instances as arguments***. These functions are essential tools for data manipulation. 60 | 61 | ## DataFrame column methods 62 | 63 | ### Methods that take column names as arguments: 64 | 65 | * `corr(col1, col2)`: two column names. 66 | * `cov(col1, col2)`: two column names. 67 | * `crosstab(col1, col2)`: two column names. 68 | * `describe(*cols)`: ***`*cols` refers to column names (strings) only.*** 69 | 70 | ### Methods that take column names or column expressions or **both** as arguments: 71 | 72 | * `cube(*cols)`: column names (string) or column expressions or **both**. 73 | * `drop(*cols)`: ***a list of column names OR a single column expression.*** 74 | * `groupBy(*cols)`: column name (string) or column expression or **both**.
75 | * `rollup(*cols)`: column name (string) or column expression or **both**.
76 | * `select(*cols)`: column name (string) or column expression or **both**.
77 | * `sort(*cols, **kwargs)`: column name (string) or column expression or **both**.
78 | * `sortWithinPartitions(*cols, **kwargs)`: column name (string) or column expression or **both**.
79 | * `orderBy(*cols, **kwargs)`: column name (string) or column expression or **both**.
80 | * `sampleBy(col, fractions, seed=None)`: a column name.
81 | * `toDF(*cols)`: **a list of column names (string).**
82 | * `withColumn(colName, col)`: `colName` refers to a column name; `col` refers to a column expression.
83 | * `withColumnRenamed(existing, new)`: takes column names as arguments.
84 | * `filter(condition)`: **`condition`** refers to a column expression that returns `BooleanType` values.
85 | 
-------------------------------------------------------------------------------- /legacy/HashingTF-and-CountVectorizer.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "HashingTF and CountVectorizer"
3 | author: "Wenqiang & Ming Chen"
4 | date: "3/23/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 | 
12 | 
13 | ## HashingTF and CountVectorizer
14 | 
15 | ### Load data
16 | 
17 | ```{python}
18 | twitter = spark.createDataFrame([
19 |         ('Wenqiang is a spark expert', 'Wenqiang', 1.0),
20 |         ('Ming is learning spark', 'Ming', 0.0)],
21 |         ['text', 'id', 'label']
22 | )
23 | ```
24 | 
25 | ```{python}
26 | twitter.show()
27 | ```
28 | 
29 | ```{python}
30 | +--------------------+--------+-----+
31 | |                text|      id|label|
32 | +--------------------+--------+-----+
33 | |Wenqiang is a spa...|Wenqiang|  1.0|
34 | |Ming is learning ...|    Ming|  0.0|
35 | +--------------------+--------+-----+
36 | ```
37 | 
38 | 
39 | ### Tokenization
40 | 
41 | ```{python}
42 | from pyspark.ml.feature import Tokenizer
43 | ```
44 | 
45 | ```{python}
46 | tokenizer_mod = Tokenizer(inputCol='text', outputCol='tokens')
47 | twitter_tokens = tokenizer_mod.transform(twitter)
48 | twitter_tokens.show()
49 | ```
50 | 
51 | ```{python}
52 | +--------------------+--------+-----+--------------------+
53 | |                text|      id|label|              tokens|
54 | +--------------------+--------+-----+--------------------+
55 | |Wenqiang is a spa...|Wenqiang|  1.0|[wenqiang, is, a,...|
56 | |Ming is learning ...|    Ming|  0.0|[ming, is, learni...|
57 | +--------------------+--------+-----+--------------------+
58 | ```
59 | 
60 | 
61 | ### HashingTF
62 | 
63 | ```{python}
64 | from pyspark.ml.feature import HashingTF
65 | hashingTF_mod = HashingTF(numFeatures=pow(2,4), inputCol='tokens', outputCol='features')
66 | hashingTF_twitter = hashingTF_mod.transform(twitter_tokens)
67 | ```
68 | 
69 | ```{python}
70 | hashingTF_twitter.show(truncate=False)
71 | ```
72 | 
73 | ```{python}
74 | +--------------------------+--------+-----+--------------------------------+---------------------------------+
75 | |text                      |id      |label|tokens                          |features                         |
76 | +--------------------------+--------+-----+--------------------------------+---------------------------------+
77 | |Wenqiang is a spark expert|Wenqiang|1.0  |[wenqiang, is, a, spark, expert]|(16,[1,2,9,13],[2.0,1.0,1.0,1.0])|
78 | |Ming is learning spark    |Ming    |0.0  |[ming, is, learning, spark]     |(16,[0,1,14],[1.0,2.0,1.0])      |
79 | +--------------------------+--------+-----+--------------------------------+---------------------------------+
80 | ```
81 | 
82 | 
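83 | Note the 2.0 at index 1 in both feature vectors above, even though no token repeats within either document: with only `numFeatures = 16` buckets, distinct tokens can collide and their counts add up. This is consistent with the two shared tokens, 'is' and 'spark', hashing to the same bucket. A quick way to check which bucket each token lands in (a sketch; the `indexOf` method is an assumption that holds for `pyspark.ml.feature.HashingTF` in Spark 3.0+):
84 | 
85 | ```{python}
86 | # map each token to its hash bucket; colliding tokens share an index
87 | for token in ['wenqiang', 'is', 'a', 'spark', 'expert']:
88 |     print(token, hashingTF_mod.indexOf(token))
89 | ```
90 | 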
91 | ### CountVectorizer
92 | 
93 | ```{python}
94 | from pyspark.ml.feature import CountVectorizer
95 | count_vectorizer = CountVectorizer(vocabSize=pow(2,4), inputCol='tokens', outputCol='features')
96 | countVectorizer_mod = count_vectorizer.fit(twitter_tokens)
97 | countVectorizer_twitter = countVectorizer_mod.transform(twitter_tokens)
98 | ```
99 | 
100 | ```{python}
101 | countVectorizer_twitter.show(truncate=False)
102 | ```
103 | 
104 | ```{python}
105 | +--------------------------+--------+-----+--------------------------------+-------------------------------------+
106 | |text                      |id      |label|tokens                          |features                             |
107 | +--------------------------+--------+-----+--------------------------------+-------------------------------------+
108 | |Wenqiang is a spark expert|Wenqiang|1.0  |[wenqiang, is, a, spark, expert]|(7,[0,1,2,3,5],[1.0,1.0,1.0,1.0,1.0])|
109 | |Ming is learning spark    |Ming    |0.0  |[ming, is, learning, spark]     |(7,[0,1,4,6],[1.0,1.0,1.0,1.0])      |
110 | +--------------------------+--------+-----+--------------------------------+-------------------------------------+
111 | ```
112 | 
113 | 
114 | 
115 | 
116 | 
117 | 
-------------------------------------------------------------------------------- /legacy/continuous-to-categorical-variable.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Continuous to categorical data"
3 | author: "Ming Chen"
4 | date: "6/9/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 | 
12 | ## Convert continuous variables to categorical variables
13 | 
14 | There are two functions we can use to split a continuous variable into categories:
15 | 
16 | * `pyspark.ml.feature.Binarizer`: split a column of continuous features given a threshold.
17 | * `pyspark.ml.feature.Bucketizer`: split a column of continuous features into categories given several split points.
18 |     + with $n + 1$ split points, there are $n$ categories (buckets).
19 | 20 | **Create some data** 21 | 22 | ```{python} 23 | import numpy as np 24 | import pandas as pd 25 | np.random.seed(seed=1234) 26 | pdf = pd.DataFrame({ 27 | 'x1': np.random.randn(10), 28 | 'x2': np.random.rand(10)*10 29 | }) 30 | np.random.seed(seed=None) 31 | df = spark.createDataFrame(pdf) 32 | df.show() 33 | 34 | +--------------------+------------------+ 35 | | x1| x2| 36 | +--------------------+------------------+ 37 | | 0.47143516373249306| 6.834629351721363| 38 | | -1.1909756947064645| 7.127020269829002| 39 | | 1.4327069684260973|3.7025075479039495| 40 | | -0.3126518960917129| 5.611961860656249| 41 | | -0.7205887333650116| 5.030831653078097| 42 | | 0.8871629403077386|0.1376844959068224| 43 | | 0.8595884137174165| 7.728266216123741| 44 | | -0.6365235044173491| 8.826411906361166| 45 | |0.015696372114428918| 3.648859839013723| 46 | | -2.2426849541854055| 6.153961784334937| 47 | +--------------------+------------------+ 48 | ``` 49 | 50 | **`Binarize` the column `x1` and `Bucketize` the column `x2`** 51 | 52 | ```{python} 53 | from pyspark.ml.feature import Binarizer, Bucketizer 54 | # threshold = 0 for binarizer 55 | binarizer = Binarizer(threshold=0, inputCol='x1', outputCol='x1_new') 56 | # provide 5 split points to generate 4 buckets 57 | bucketizer = Bucketizer(splits=[0, 2.5, 5, 7.5, 10], inputCol='x2', outputCol='x2_new') 58 | 59 | # pipeline stages 60 | from pyspark.ml import Pipeline 61 | stages = [binarizer, bucketizer] 62 | pipeline = Pipeline(stages=stages) 63 | 64 | # fit the pipeline model and transform the data 65 | pipeline.fit(df).transform(df).show() 66 | 67 | +--------------------+------------------+------+------+ 68 | | x1| x2|x1_new|x2_new| 69 | +--------------------+------------------+------+------+ 70 | | 0.47143516373249306| 6.834629351721363| 1.0| 2.0| 71 | | -1.1909756947064645| 7.127020269829002| 0.0| 2.0| 72 | | 1.4327069684260973|3.7025075479039495| 1.0| 1.0| 73 | | -0.3126518960917129| 5.611961860656249| 0.0| 2.0| 74 | | -0.7205887333650116| 5.030831653078097| 0.0| 2.0| 75 | | 0.8871629403077386|0.1376844959068224| 1.0| 0.0| 76 | | 0.8595884137174165| 7.728266216123741| 1.0| 3.0| 77 | | -0.6365235044173491| 8.826411906361166| 0.0| 3.0| 78 | |0.015696372114428918| 3.648859839013723| 1.0| 1.0| 79 | | -2.2426849541854055| 6.153961784334937| 0.0| 2.0| 80 | +--------------------+------------------+------+------+ 81 | ``` 82 | 83 | 84 | -------------------------------------------------------------------------------- /legacy/cross-validation-in-r.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Cross-validation in R" 3 | author: "Ming Chen" 4 | date: "6/5/2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | library(boot) 11 | ``` 12 | 13 | 14 | ## Prepare data 15 | 16 | ```{r} 17 | horseshoe_crab = read.csv("data/horseshoe_crab.csv") 18 | horseshoe_crab$C = as.factor(horseshoe_crab$C) 19 | horseshoe_crab$S = as.factor(horseshoe_crab$S) 20 | y = numeric() 21 | y[horseshoe_crab$Sa != 0] = 1 22 | y[horseshoe_crab$Sa == 0] = 0 23 | horseshoe_crab$y = y 24 | ``` 25 | 26 | ## Split data into training and test datasets 27 | 28 | ```{r} 29 | training_index = sort(sample(nrow(horseshoe_crab), nrow(horseshoe_crab)*0.8)) 30 | training = horseshoe_crab[training_index, ] 31 | test = horseshoe_crab[-training_index, ] 32 | ``` 33 | 34 | 35 | ## Build cross validation model 36 | 37 | ```{r} 38 | glm_logit = glm(formula = y ~ C + S + W + Wt, data = 
training,
39 |                 family = 'binomial')
40 | # 4-fold cross-validation
41 | cv_glm_4 = cv.glm(data = training, glmfit = glm_logit, K = 4)
42 | ```
43 | 
-------------------------------------------------------------------------------- /legacy/index.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Learning Apache Spark"
3 | output: html_document
4 | ---
5 | 
6 | **All materials have been converted to notebooks (in ipynb format) and moved to the GitHub repository. Click here to go to the [repository](https://github.com/MingChen0919/learning-apache-spark/blob/master/README.md).**
-------------------------------------------------------------------------------- /legacy/install.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Installations"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/17/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | 
12 | ### **Caution**: Before you start the following steps, please make sure you have already installed
13 | 
14 | - [Java JDK](http://www.oracle.com/technetwork/java/javase/downloads/index-jsp-138363.html)
15 | - [IPython and Python](https://ipython.org/install.html)
16 | - If you use this method to install Spark, you can skip the [Spark on Jupyter section](pyspark-on-jupyter.html)
17 | 
18 | ### 1. Download Apache Spark from the official website
19 | Weblink: [Download Apache Spark™](http://spark.apache.org/downloads.html)
20 | 
21 | ### 2. Installation
22 | 
23 | The pre-built version doesn't need installation. You can
24 | use it as soon as you unpack it.
25 | 
26 | ### 3. Set path link
27 | 
28 | This is the most difficult step for beginners. However, it can be easily solved via Min RK's [`findspark`](https://github.com/minrk/findspark).
29 | 
30 | - install findspark
31 | ```{python eval=FALSE}
32 | pip install findspark
33 | ```
34 | - open `ipython` in terminal and import findspark
35 | ```{python eval=FALSE}
36 | import findspark
37 | findspark.init()
38 | ```
39 | - find the Spark path
40 | ```{python eval=FALSE}
41 | findspark.find()
42 | ```
43 | ```{python eval=FALSE}
44 | Out[3]: '/Users/wenqiangfeng/spark/'
45 | ```
46 | - open `ipython --profile=myprofile` in terminal then run the following code
47 | ```{python eval=FALSE}
48 | findspark.init('/Users/wenqiangfeng/spark/', edit_profile=True)
49 | ```
50 | ```{python eval=FALSE}
51 | findspark.init('/Users/wenqiangfeng/spark/', edit_rc=True)
52 | ```
53 | 
54 | ### Note:
55 | 
56 | - This will also help you to set up the `ipython notebook` or `Jupyter`.
You may run the following code in the terminal to double-check:
57 | ```{python eval=FALSE}
58 | jupyter notebook
59 | ```
60 | 
61 | * If your PySpark still doesn't work, you need to check your `.profile` or `.bash_profile` and add the following path to it
62 | 
63 |     + check `.profile` or `.bash_profile` at the terminal
64 |     + add the path to your `.profile` or `.bash_profile`
65 | ```{bash eval=FALSE}
66 | vim ~/.profile
67 | ```
68 | 
69 | 
70 | ```{bash eval=FALSE}
71 | # Added for Pyspark
72 | export SPARK_HOME=YOUR_PATH/apache-spark/libexec
73 | export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
74 | export PYSPARK_DRIVER_PYTHON="jupyter"
75 | export PYSPARK_DRIVER_PYTHON_OPTS="notebook"
76 | ```
77 | 
78 | 
79 | 
80 | 
-------------------------------------------------------------------------------- /legacy/k-folds-cross-validation.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "K-folds Cross Validation"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/20/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 | 
12 | ## Training/validation/test data sets
13 | 
14 | * **Training set**: the data set for training your models.
15 | * **Validation set**: the data set used for testing the performance of the models you have built with the training set. Based on the performance, you choose the best (final) model.
16 | * **Test set**: use this data set to test the performance of your final model.
17 | 
18 | ## K-folds cross-validation steps (k=4 as an example)
19 | 
20 | * step 1: split your data into a training set and a test set (for example 80% training and 20% test). The test set will never be used in model training and selection.
21 | * step 2: split the training set into k (k=4) equal subsets: 3 subsets for training + 1 subset for validation.
22 | * step 3: train your models with the 3 subsets and calculate a performance score with the remaining 1 subset.
23 | * step 4: choose a different subset for validation and then repeat step 3 until every subset has been used as a validation subset.
24 | * step 5: for a k=4 fold cross-validation, each trained model should have been validated by 4 subsets and therefore has 4 performance scores. Calculate the average of these 4 performance scores for each model. Use the average score to select the best, final model.
25 | * step 6: apply your final model to the **untouched** test data and see how it performs.
26 | 
27 | ## Example of k-folds cross-validation
28 | 
29 | * **Build parameter grids**
30 |     + parameter grid: a combination of all variable parameters in your model.
31 |     + example: If I want to train a logistic regression model on 4 different *regParam* and 3 different *elasticNetParam*, I will have 3 x 4 = 12 models to train and validate.
32 | 
33 | ```{python}
34 | from pyspark.ml.classification import LogisticRegression
35 | blor = LogisticRegression(featuresCol='indexed_features', labelCol='label', family='binomial')
36 | 
37 | from pyspark.ml.tuning import ParamGridBuilder
38 | param_grid = ParamGridBuilder().\
39 |     addGrid(blor.regParam, [0, 0.5, 1, 2]).\
40 |     addGrid(blor.elasticNetParam, [0, 0.5, 1]).\
41 |     build()
42 | ```
43 | 
44 | ```{python}
45 | # the first 2 elements in param_grid
46 | [{Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty.
For alpha = 1, it is an L1 penalty.'): 0,
47 |   Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='regParam', doc='regularization parameter (>= 0).'): 0},
48 |  {Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5,
49 |   Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='regParam', doc='regularization parameter (>= 0).'): 0}]
50 | ```
51 | 
52 | * **Split data into training and test sets**
53 |     + Refer to the [logistic regression page](logistic-regression.html) to see what data we used and how the training and test sets were generated.
54 | 
55 | * **Run k-fold (k=4) cross-validation**
56 | ```{python}
57 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
58 | evaluator = BinaryClassificationEvaluator()
59 | 
60 | from pyspark.ml.tuning import CrossValidator
61 | cv = CrossValidator(estimator=blor, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)
62 | 
63 | cvModel = cv.fit(training)
64 | ```
65 | 
66 | * **Find the best model**
67 |     + best model ID
68 | 
69 | ```{python}
70 | cvModel.bestModel
71 | ```
72 | 
73 | ```{python}
74 | LogisticRegression_41fe9f7454164180f433
75 | ```
76 | 
77 |     + average cross-validation metrics
78 |     + the 10th model has the highest score and is the best model
79 |     + *regParam* = 2 and *elasticNetParam* = 0, i.e., a ridge regularization method.
80 | 
81 | ```{python}
82 | cvModel.avgMetrics
83 | ```
84 | 
85 | ```{python}
86 | [0.8191225353777875,
87 |  0.8191225353777875,
88 |  0.8191225353777875,
89 |  0.8243105196624104,
90 |  0.5,
91 |  0.5,
92 |  0.8247709310997127,
93 |  0.5,
94 |  0.5,
95 |  0.8259072947360763,
96 |  0.5,
97 |  0.5]
98 | ```
99 | 
100 | 
101 | ```{python}
102 | param_grid[9]
103 | ```
104 | 
105 | ```{python}
106 | {Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty.
For alpha = 1, it is an L1 penalty.'): 0, Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='regParam', doc='regularization parameter (>= 0).'): 2}
107 | ```
108 | 
109 |     + Model comparison (not finished)
110 | 
111 | 
112 | ```{python}
113 | # new model
114 | blor = LogisticRegression(featuresCol='indexed_features', labelCol='label', family='binomial')
115 | model = blor.fit(training)
116 | evaluator.evaluate(model.transform(training))
117 | evaluator.evaluate(model.transform(test))
118 | 
119 | new_blor = LogisticRegression(featuresCol='indexed_features', labelCol='label', family='binomial', regParam=0.5, elasticNetParam=0)
120 | new_model = new_blor.fit(training)
121 | evaluator.evaluate(new_model.transform(training))
122 | evaluator.evaluate(new_model.transform(test))
123 | ```
124 | 
-------------------------------------------------------------------------------- /legacy/linear-regression.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Linear Regression"
3 | author: "Ming Chen"
4 | date: "6/5/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 | 
12 | # Linear regression
13 | 
14 | ## Linear regression without cross-validation
15 | 
16 | **Import data**
17 | 
18 | ```{python}
19 | ad = spark.read.csv('data/Advertising.csv', header=True, inferSchema=True)
20 | ad.show(5)
21 | 
22 | +-----+-----+---------+-----+
23 | |   TV|Radio|Newspaper|Sales|
24 | +-----+-----+---------+-----+
25 | |230.1| 37.8|     69.2| 22.1|
26 | | 44.5| 39.3|     45.1| 10.4|
27 | | 17.2| 45.9|     69.3|  9.3|
28 | |151.5| 41.3|     58.5| 18.5|
29 | |180.8| 10.8|     58.4| 12.9|
30 | +-----+-----+---------+-----+
31 | only showing top 5 rows
32 | ```
33 | 
34 | **Transform data structure**
35 | 
36 | ```{python}
37 | from pyspark.ml.linalg import Vectors
38 | ad_df = ad.rdd.map(lambda x: [Vectors.dense(x[0:3]), x[-1]]).toDF(['features', 'label'])
39 | ad_df.show(5)
40 | 
41 | +-----------------+-----+
42 | |         features|label|
43 | +-----------------+-----+
44 | |[230.1,37.8,69.2]| 22.1|
45 | | [44.5,39.3,45.1]| 10.4|
46 | | [17.2,45.9,69.3]|  9.3|
47 | |[151.5,41.3,58.5]| 18.5|
48 | |[180.8,10.8,58.4]| 12.9|
49 | +-----------------+-----+
50 | only showing top 5 rows
51 | ```
52 | 
53 | **Build linear regression model**
54 | 
55 | ```{python}
56 | from pyspark.ml.regression import LinearRegression
57 | lr = LinearRegression(featuresCol = 'features', labelCol = 'label')
58 | ```
59 | 
60 | **Fit the model**
61 | 
62 | ```{python}
63 | lr_model = lr.fit(ad_df)
64 | ```
65 | 
66 | **Model evaluation**
67 | 
68 | ```{python}
69 | from pyspark.ml.evaluation import RegressionEvaluator
70 | evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label')
71 | ad_pred = lr_model.transform(ad_df)  # generate predictions to evaluate
72 | evaluator.evaluate(ad_pred, {evaluator.metricName: "r2"})
73 | 
74 | 0.897210638178952
75 | ```
76 | 
77 | **Compare results with results from R**
78 | 
79 | The comparison below shows that the linear regression analyses from pyspark and R obtained very close results.
80 | 
81 | ```{python}
82 | # intercept and coefficients from R
83 | advertise = read.csv('data/Advertising.csv', header = TRUE)
84 | lr_ad = lm(Sales~., data = advertise)
85 | lr_ad$coefficients
86 | 
87 |  (Intercept)           TV        Radio    Newspaper 
88 |  2.938889369  0.045764645  0.188530017 -0.001037493 
89 | 
90 | # intercept and coefficients from pyspark
91 | lr_model.intercept
92 | 
93 | 2.9388893694594134
94 | 
95 | lr_model.coefficients
96 | 
97 | DenseVector([0.0458, 0.1885, -0.001])
98 | 
99 | # R squared from R
100 | summary(lr_ad)$r.squared
101 | 
102 | 0.8972106
103 | 
104 | # R squared from pyspark
105 | evaluator.evaluate(ad_pred, {evaluator.metricName: "r2"})
106 | 
107 | 0.897210638178952
108 | ```
109 | 
110 | 
111 | ## Linear regression with cross-validation
112 | 
113 | **Training and test datasets**
114 | 
115 | ```{python}
116 | ## split data into training and test datasets
117 | training, test = ad_df.randomSplit([0.8, 0.2], seed=123)
118 | ```
119 | 
120 | **Build cross-validation model**
121 | 
122 | ```{python}
123 | ##===== build cross-validation model =====
124 | 
125 | # estimator
126 | lr = LinearRegression(featuresCol = 'features', labelCol = 'label')
127 | 
128 | # parameter grid
129 | from pyspark.ml.tuning import ParamGridBuilder
130 | param_grid = ParamGridBuilder().\
131 |     addGrid(lr.regParam, [0, 0.5, 1]).\
132 |     addGrid(lr.elasticNetParam, [0, 0.5, 1]).\
133 |     build()
134 | 
135 | # evaluator
136 | evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label', metricName='r2')
137 | 
138 | # cross-validation model
139 | from pyspark.ml.tuning import CrossValidator
140 | cv = CrossValidator(estimator=lr, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)
141 | ```
142 | 
143 | **Fit cross-validation model**
144 | 
145 | ```{python}
146 | cv_model = cv.fit(training)
147 | ```
148 | 
149 | **Prediction**
150 | 
151 | ```{python}
152 | pred_training_cv = cv_model.transform(training)
153 | pred_test_cv = cv_model.transform(test)
154 | ```
155 | 
156 | **Evaluation**
157 | 
158 | ```{python}
159 | # performance on training data
160 | evaluator.evaluate(pred_training_cv)
161 | 
162 | 0.8982486958337326
163 | 
164 | # performance on test data
165 | evaluator.evaluate(pred_test_cv)
166 | 
167 | 0.8896562076565583
168 | ```
169 | 
170 | 
171 | **Intercept and coefficients**
172 | 
173 | ```{python}
174 | cv_model.bestModel.intercept
175 | 
176 | 3.075068686285647
177 | 
178 | cv_model.bestModel.coefficients
179 | 
180 | DenseVector([0.0465, 0.1809, -0.0011])
181 | ```
182 | 
183 | **Get parameter values from the best model**
184 | 
185 | Parameters can be extracted by calling the Java property.
186 | 
187 | ```{python}
188 | print('best regParam: ' + str(cv_model.bestModel._java_obj.getRegParam()) + "\n" +
189 |     'best ElasticNetParam:' + str(cv_model.bestModel._java_obj.getElasticNetParam()))
190 | 
191 | best regParam: 0.0
192 | best ElasticNetParam:0.0
193 | ```
194 | 
195 | 
196 | 
-------------------------------------------------------------------------------- /legacy/pyspark-on-jupyter.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Spark on Jupyter"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/5/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | 
12 | ### 1. Install jupyter with conda
13 | 
14 | ```{python eval=FALSE}
15 | conda install jupyter
16 | ```
17 | 
18 | ### 2.
Get the `jupyter` binary executable path
19 | 
20 | ```{python eval=FALSE}
21 | which jupyter
22 | ```
23 | ```{python eval=FALSE}
24 | # output
25 | /Users/mingchen/anaconda2/bin/jupyter
26 | ```
27 | 
28 | ### 3. Link spark with jupyter
29 | 
30 | ```{python eval=FALSE}
31 | export PYSPARK_DRIVER_PYTHON=/Users/mingchen/anaconda2/bin/jupyter
32 | export PYSPARK_DRIVER_PYTHON_OPTS="notebook --NotebookApp.open_browser=False --NotebookApp.ip='*' --NotebookApp.port=8880"
33 | ```
34 | 
35 | You can also add the two environment variables to the `~/.bash_profile` file to permanently link Spark with Jupyter.
36 | 
37 | ### 4. Run jupyter notebook
38 | 
39 | ```{python eval=FALSE}
40 | pyspark
41 | ```
42 | 
43 | Then go to [http://127.0.0.1:8880](http://127.0.0.1:8880)
44 | 
-------------------------------------------------------------------------------- /legacy/pyspark-on-rodeo.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Pyspark On Rodeo"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/5/2017"
5 | output: html_document
6 | ---
7 | 
8 | ### 1. Install Rodeo on Mac
9 | 
10 | * Download DMG file [https://www.yhat.com/products/rodeo/](https://www.yhat.com/products/rodeo/)
11 | 
12 | ### 2. Install `apache-spark` with homebrew
13 | 
14 | ```{bash eval=FALSE}
15 | brew install apache-spark
16 | ```
17 | 
18 | ### 3. Locate the `python` directory within the apache-spark root directory
19 | 
20 | ```{bash eval=FALSE}
21 | /usr/local/Cellar/apache-spark/2.1.0/libexec/python
22 | ```
23 | 
24 | 
25 | ### 4. Set environment variable
26 | 
27 | * Open Rodeo, go to **settings**->**ENVIRONMENT VARIABLES**
28 | * Add the path `/usr/local/Cellar/apache-spark/2.1.0/libexec/python` to `PYTHONPATH`
29 | 
30 | ### 5. Test pyspark on Rodeo
31 | 
32 | Run the following command
33 | 
34 | ```{python eval=FALSE}
35 | from pyspark import SparkConf, SparkContext
36 | 
37 | conf = SparkConf().setAppName("myAppName")
38 | sc = SparkContext(conf=conf)
39 | sc
40 | ```
41 | 
42 | You should get something like this
43 | 
44 | ```{python eval=FALSE}
45 | 
46 | ```
-------------------------------------------------------------------------------- /legacy/pyspark-vectors.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Pyspark Vectors"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/18/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | ## Remark:
12 | 
13 | - You can download the complete [ipython notebook](./ipynb/vector.ipynb) for this session.
14 | 
15 | 
16 | ## Dense vector vs. Sparse vector
17 | 
18 | * Both dense vectors and sparse vectors are homogeneous and can only hold numeric data.
19 | 
20 | * `DenseVector` takes a single argument and is much like an R vector.
21 | * `SparseVector` only stores non-zero values. It uses three pieces of information to achieve this: the vector size, the indices (positions) of the non-zero entries, and their corresponding values. With this information, you can figure out which entries in the vector are zero, and therefore reconstruct the complete vector.
22 | 
23 | ## Example:
24 | 
25 | + set up spark context and SparkSession
26 | 
27 | ```{python eval=FALSE}
28 | from pyspark import SparkConf, SparkContext
29 | ## set up spark context
30 | from pyspark.sql import SQLContext
31 | sc = SparkContext()
32 | sqlContext = SQLContext(sc)
33 | ## set up SparkSession
34 | from pyspark.sql import SparkSession
35 | 
36 | spark = SparkSession \
37 |     .builder \
38 |     .appName("Python Spark SQL basic example") \
39 |     .config("spark.some.config.option", "some-value") \
40 |     .getOrCreate()
41 | ```
42 | 
43 | + import `Vectors` from the pyspark library
44 | 
45 | ```{python eval=FALSE}
46 | from pyspark.ml.linalg import Vectors
47 | ```
48 | 
49 | + dense vector
50 | 
51 | ```{python eval=FALSE}
52 | densevector = Vectors.dense([1,3,4,2.5])
53 | densevector
54 | ```
55 | 
56 | ```{python eval=FALSE}
57 | # output
58 | DenseVector([1.0, 3.0, 4.0, 2.5])
59 | ```
60 | 
61 | ```{python eval=FALSE}
62 | densevector.toArray()
63 | ```
64 | 
65 | ```{python eval=FALSE}
66 | # output
67 | array([ 1. ,  3. ,  4. ,  2.5])
68 | ```
69 | + sparse vector
70 |     + The sparse vector below is a representation of the vector [ 0. ,  3. ,  0. ,  4.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ]
71 | 
72 | ```{python eval=FALSE}
73 | sparseVector = Vectors.sparse(10, [1, 3], [3.0, 4.5])
74 | sparseVector.toArray()
75 | ```
76 | 
77 | ```{python eval=FALSE}
78 | # output
79 | array([ 0. ,  3. ,  0. ,  4.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ])
80 | ```
-------------------------------------------------------------------------------- /legacy/pyspark.ml.feature-module.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "pyspark.ml.feature module"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/15/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | 
12 | ## Introduction
13 | 
14 | This module provides a set of functions, methods and classes which act on **features**. A feature is like a column from a data frame or table. You can see that most functions or classes take parameters like `inputCol`, `featuresCol`, `outputCol`, `labelCol`. These parameters specify the names of the columns (features) that you want to work on.
15 | 
16 | ## class pairs and `fit/transform` functions
17 | 
18 | I found that there are a lot of class pairs in this module. For example:
19 | 
20 | * `ChiSqSelector` and `ChiSqSelectorModel`
21 | * `CountVectorizer` and `CountVectorizerModel`
22 | * `IDF` and `IDFModel`
23 | * a lot of other pairs ...
24 | 
25 | The first class in a pair has functions to build a model (instructions about how you want to transform your data). The second class in a pair does the actual data transformation; see the sketch below.
26 | 
27 | * The `fit` function belongs to the first class; it fits the built model to your data and returns an instance of the second class.
28 | * The `transform` function belongs to the second class and does the actual data transformation.
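29 | 
30 | As a minimal sketch of this pattern, here is the `CountVectorizer`/`CountVectorizerModel` pair (the DataFrame `df` and its `tokens` column are assumed placeholders for illustration, not from the original text):
31 | 
32 | ```{python eval=FALSE}
33 | from pyspark.ml.feature import CountVectorizer
34 | # the first class holds the instructions (columns, vocabulary size, ...)
35 | count_vectorizer = CountVectorizer(inputCol='tokens', outputCol='features')
36 | # fit() learns a vocabulary from df and returns a CountVectorizerModel
37 | count_vectorizer_model = count_vectorizer.fit(df)
38 | # transform() on the model does the actual data transformation
39 | result = count_vectorizer_model.transform(df)
40 | ```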
-------------------------------------------------------------------------------- /legacy/r-markdown-header.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "r-markdown-header"
3 | author: "Ming Chen"
4 | output: html_document
5 | ---
6 | 
7 | 
8 | 
15 | 
16 | ```{r setup, include=FALSE}
17 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
18 | ```
19 | 
20 | 
21 | ## Create SparkContext & SparkSession
22 | 
23 | **SparkContext**
24 | 
25 | ```{python}
26 | from pyspark import SparkContext
27 | sc = SparkContext(master = 'local')
28 | ```
29 | 
30 | **SparkSession**
31 | 
32 | ```{python}
33 | from pyspark.sql import SparkSession
34 | spark = SparkSession.builder \
35 |     .appName("Learning Apache Spark") \
36 |     .config("spark.some.config.option", "some-value") \
37 |     .getOrCreate()
38 | ```
39 | 
-------------------------------------------------------------------------------- /legacy/randomforest.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Random Forest Regression"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "February 19, 2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | ### Remark:
12 | 
13 | - You can download the complete [ipython notebook](./ipynb/RandomForest.ipynb) for this tutorial session.
14 | 
15 | ### 1. Set up spark context and SparkSession
16 | 
17 | ```{python eval=FALSE}
18 | from pyspark.sql import SparkSession
19 | 
20 | spark = SparkSession \
21 |     .builder \
22 |     .appName("Python Spark Random Forest Regression") \
23 |     .config("spark.some.config.option", "some-value") \
24 |     .getOrCreate()
25 | ```
26 | 
27 | ### 2. Load dataset
28 | ```{python eval=FALSE}
29 | df = spark.read.format('com.databricks.spark.csv').\
30 |     options(header='true', \
31 |     inferschema='true').load("./data/WineData.csv",header=True);
32 | ```
33 | 
34 | ```{python eval=FALSE}
35 | df.printSchema()
36 | ```
37 | ```{python eval=FALSE}
38 | #output
39 | root
40 |  |-- fixed acidity: double (nullable = true)
41 |  |-- volatile acidity: double (nullable = true)
42 |  |-- citric acid: double (nullable = true)
43 |  |-- residual sugar: double (nullable = true)
44 |  |-- chlorides: double (nullable = true)
45 |  |-- free sulfur dioxide: double (nullable = true)
46 |  |-- total sulfur dioxide: double (nullable = true)
47 |  |-- density: double (nullable = true)
48 |  |-- pH: double (nullable = true)
49 |  |-- sulphates: double (nullable = true)
50 |  |-- alcohol: double (nullable = true)
51 |  |-- quality: integer (nullable = true)
52 | ```
53 | 
54 | 
55 | ### 3.
Convert the data to dense vectors
56 | ```{python eval=FALSE}
57 | from pyspark.sql import Row
58 | from pyspark.ml.linalg import Vectors
59 | ```
60 | ```{python eval=FALSE}
61 | def transData(data):
62 |     return data.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])
63 | ```
64 | 
65 | ```{python eval=FALSE}
66 | transformed = transData(df)
67 | transformed.show(6)
68 | ```
69 | 
70 | ```{python eval=FALSE}
71 | #output
72 | +--------------------+-----+
73 | |            features|label|
74 | +--------------------+-----+
75 | |[7.4,0.7,0.0,1.9,...|    5|
76 | |[7.8,0.88,0.0,2.6...|    5|
77 | |[7.8,0.76,0.04,2....|    5|
78 | |[11.2,0.28,0.56,1...|    6|
79 | |[7.4,0.7,0.0,1.9,...|    5|
80 | |[7.4,0.66,0.0,1.8...|    5|
81 | +--------------------+-----+
82 | only showing top 6 rows
83 | ```
84 | 
85 | ```{python eval=FALSE}
86 | from pyspark.ml import Pipeline
87 | from pyspark.ml.regression import RandomForestRegressor
88 | from pyspark.ml.feature import VectorIndexer
89 | from pyspark.ml.evaluation import RegressionEvaluator
90 | ```
91 | ### 4. Split the data into training and test sets (30% held out for testing)
92 | ```{python eval=FALSE}
93 | # Split the data into training and test sets (30% held out for testing)
94 | (trainingData, testData) = transformed.randomSplit([0.7, 0.3])
95 | ```
96 | ### 5. Train a RandomForest model
97 | 
98 | ```{python eval=FALSE}
99 | # Train a RandomForest model.
100 | rf = RandomForestRegressor()
101 | model = rf.fit(trainingData)
102 | ```
103 | ### 6. Make predictions
104 | 
105 | ```{python eval=FALSE}
106 | # Make predictions.
107 | predictions = model.transform(testData)
108 | ```
109 | ### 7. Show results
110 | ```{python eval=FALSE}
111 | # Select example rows to display.
112 | predictions.select("prediction", "label", "features").show(5)
113 | ```
114 | 
115 | ```{python eval=FALSE}
116 | #output
117 | +------------------+-----+--------------------+
118 | |        prediction|label|            features|
119 | +------------------+-----+--------------------+
120 | | 6.489667556875804|    7|[4.9,0.42,0.0,2.1...|
121 | | 6.267301910170284|    7|[5.1,0.42,0.0,1.8...|
122 | |6.0526786505470245|    7|[5.1,0.585,0.0,1....|
123 | | 5.257985010985523|    5|[5.2,0.32,0.25,1....|
124 | | 5.943264423589821|    7|[5.2,0.48,0.04,1....|
125 | +------------------+-----+--------------------+
126 | ```
127 | 
128 | ### 8. Model Evaluation
129 | ```{python eval=FALSE}
130 | # Select (prediction, true label) and compute test error
131 | evaluator = RegressionEvaluator(
132 |     labelCol="label", predictionCol="prediction", metricName="rmse")
133 | rmse = evaluator.evaluate(predictions)
134 | print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
135 | ```
136 | 
137 | ```{python eval=FALSE}
138 | Root Mean Squared Error (RMSE) on test data = 0.659148
139 | ```
140 | 
141 | 
142 | 
-------------------------------------------------------------------------------- /legacy/randomforestC.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Random Forest Regression"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "February 19, 2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | ### Remark:
12 | 
13 | - You can download the complete [ipython notebook](./ipynb/RandomForest.ipynb) for this tutorial session.
14 | 
15 | ### 1.
Set up spark context and SparkSession
16 | 
17 | ```{python eval=FALSE}
18 | from pyspark.sql import SparkSession
19 | 
20 | spark = SparkSession \
21 |     .builder \
22 |     .appName("Python Spark Random Forest Regression") \
23 |     .config("spark.some.config.option", "some-value") \
24 |     .getOrCreate()
25 | ```
26 | 
27 | ### 2. Load dataset
28 | ```{python eval=FALSE}
29 | df = spark.read.format('com.databricks.spark.csv').\
30 |     options(header='true', \
31 |     inferschema='true').load("./data/WineData.csv",header=True);
32 | ```
33 | 
34 | ```{python eval=FALSE}
35 | df.printSchema()
36 | ```
37 | ```{python eval=FALSE}
38 | #output
39 | root
40 |  |-- fixed acidity: double (nullable = true)
41 |  |-- volatile acidity: double (nullable = true)
42 |  |-- citric acid: double (nullable = true)
43 |  |-- residual sugar: double (nullable = true)
44 |  |-- chlorides: double (nullable = true)
45 |  |-- free sulfur dioxide: double (nullable = true)
46 |  |-- total sulfur dioxide: double (nullable = true)
47 |  |-- density: double (nullable = true)
48 |  |-- pH: double (nullable = true)
49 |  |-- sulphates: double (nullable = true)
50 |  |-- alcohol: double (nullable = true)
51 |  |-- quality: integer (nullable = true)
52 | ```
53 | 
54 | 
55 | ### 3. Convert the data to dense vectors
56 | ```{python eval=FALSE}
57 | from pyspark.sql import Row
58 | from pyspark.ml.linalg import Vectors
59 | ```
60 | ```{python eval=FALSE}
61 | def transData(data):
62 |     return data.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])
63 | ```
64 | 
65 | ```{python eval=FALSE}
66 | transformed = transData(df)
67 | transformed.show(6)
68 | ```
69 | 
70 | ```{python eval=FALSE}
71 | #output
72 | +--------------------+-----+
73 | |            features|label|
74 | +--------------------+-----+
75 | |[7.4,0.7,0.0,1.9,...|    5|
76 | |[7.8,0.88,0.0,2.6...|    5|
77 | |[7.8,0.76,0.04,2....|    5|
78 | |[11.2,0.28,0.56,1...|    6|
79 | |[7.4,0.7,0.0,1.9,...|    5|
80 | |[7.4,0.66,0.0,1.8...|    5|
81 | +--------------------+-----+
82 | only showing top 6 rows
83 | ```
84 | 
85 | ```{python eval=FALSE}
86 | from pyspark.ml import Pipeline
87 | from pyspark.ml.regression import RandomForestRegressor
88 | from pyspark.ml.feature import VectorIndexer
89 | from pyspark.ml.evaluation import RegressionEvaluator
90 | ```
91 | ### 4. Split the data into training and test sets (30% held out for testing)
92 | ```{python eval=FALSE}
93 | # Split the data into training and test sets (30% held out for testing)
94 | (trainingData, testData) = transformed.randomSplit([0.7, 0.3])
95 | ```
96 | ### 5. Train a RandomForest model
97 | 
98 | ```{python eval=FALSE}
99 | # Train a RandomForest model.
100 | rf = RandomForestRegressor()
101 | model = rf.fit(trainingData)
102 | ```
103 | ### 6. Make predictions
104 | 
105 | ```{python eval=FALSE}
106 | # Make predictions.
107 | predictions = model.transform(testData)
108 | ```
109 | ### 7. Show results
110 | ```{python eval=FALSE}
111 | # Select example rows to display.
112 | predictions.select("prediction", "label", "features").show(5)
113 | ```
114 | 
115 | ```{python eval=FALSE}
116 | #output
117 | +------------------+-----+--------------------+
118 | |        prediction|label|            features|
119 | +------------------+-----+--------------------+
120 | | 6.489667556875804|    7|[4.9,0.42,0.0,2.1...|
121 | | 6.267301910170284|    7|[5.1,0.42,0.0,1.8...|
122 | |6.0526786505470245|    7|[5.1,0.585,0.0,1....|
123 | | 5.257985010985523|    5|[5.2,0.32,0.25,1....|
124 | | 5.943264423589821|    7|[5.2,0.48,0.04,1....|
125 | +------------------+-----+--------------------+
126 | ```
127 | 
128 | ### 8. Model Evaluation
129 | ```{python eval=FALSE}
130 | # Select (prediction, true label) and compute test error
131 | evaluator = RegressionEvaluator(
132 |     labelCol="label", predictionCol="prediction", metricName="rmse")
133 | rmse = evaluator.evaluate(predictions)
134 | print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
135 | ```
136 | 
137 | ```{python eval=FALSE}
138 | Root Mean Squared Error (RMSE) on test data = 0.659148
139 | ```
140 | 
141 | 
142 | 
-------------------------------------------------------------------------------- /legacy/regularization.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Regularization"
3 | author: "Ming Chen"
4 | date: "6/5/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 | 
12 | ## Regularization
13 | 
14 | Regularization is the technique used to solve the overfitting problem. An overfitted model means that the model can predict very well with the training data, but performs poorly on independent validation data.
15 | 
16 | When we add more predictors to our model, we will almost necessarily decrease the **Residual Sum of Squares** (RSS; a smaller RSS indicates a better model). This increases the complexity of our model and makes our model only perform well on the training data (overfitting).
17 | 
18 | To balance the RSS and model overfitting, we introduce a penalty for adding new predictors (coefficients $\beta_j \neq 0$) to the model.
19 | 
20 | ## LASSO regularization and Ridge regularization
21 | 
22 | * **LASSO**: $\min \{RSS + \lambda\sum_{j=1}^{p}|\beta_j|\}$
23 | * **Ridge**: $\min \{RSS + \lambda\sum_{j=1}^{p}\beta_j^2\}$
24 | 
25 | ## Elastic Net regularization
26 | 
27 | Elastic net is a regularization method that linearly combines the penalties of the lasso and ridge methods.
28 | 
29 | * **elastic net**: $\min \{RSS + \lambda_1\sum_{j=1}^{p}|\beta_j| + \lambda_2\sum_{j=1}^{p}\beta_j^2\}$
30 | 
31 | ## *regParam* and *elasticNetParam* parameters in regression models
32 | 
33 | * **regParam**: regularization parameter $\lambda$
34 | * **elasticNetParam**: the ElasticNet mixing parameter $\alpha \in [0, 1]$, which balances the L1 (lasso) and L2 (ridge) penalties
35 | * **Scenarios**:
36 |     + *regParam* = $0$, *elasticNetParam* = $0$: no regularization applied, $\lambda = 0$
37 |     + *regParam* $\neq 0$, *elasticNetParam* = $1$: lasso regularization applied
38 |     + *regParam* $\neq 0$, *elasticNetParam* = $0$: ridge regularization applied
39 |     + *regParam* $\neq 0$, $0 <$ *elasticNetParam* $< 1$: elastic net regularization applied
-------------------------------------------------------------------------------- /legacy/sna.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Social Network Analysis"
3 | author: "Wenqiang Feng"
4 | date: "4/7/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | 
12 | ## R Markdown
13 | 
14 | This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
15 | 
16 | When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
17 | 
18 | ```{r cars}
19 | summary(cars)
20 | ```
21 | 
22 | ## Including Plots
23 | 
24 | You can also embed plots, for example:
25 | 
26 | ```{r pressure, echo=FALSE}
27 | plot(pressure)
28 | ```
29 | 
30 | Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
31 | 
-------------------------------------------------------------------------------- /legacy/spark-on-jetstream-cloud.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Spark on Jetstream Cloud"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "3/8/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 | 
12 | 
13 | ## Set up apache spark on jetstream
14 | 
15 | * Install linuxbrew and spark
16 | 
17 | ```{python}
18 | sudo apt-get install -y ruby
19 | ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Linuxbrew/install/master/install)"
20 | 
21 | echo 'export PATH="/home/mchen33/.linuxbrew/bin:$PATH"' >>~/.bash_profile
22 | echo 'export MANPATH="/home/mchen33/.linuxbrew/share/man:$MANPATH"' >>~/.bash_profile
23 | echo 'export INFOPATH="/home/mchen33/.linuxbrew/share/info:$INFOPATH"' >>~/.bash_profile
24 | 
25 | source ~/.bash_profile
26 | 
27 | sudo apt-get install build-essential
28 | 
29 | brew install apache-spark
30 | 
31 | ## install java
32 | sudo apt-get install -y default-jre
33 | ```
34 | 
35 | 
36 | ```{python}
37 | export SPARK_LOCAL_IP="127.0.0.1"
38 | ```
39 | 
40 | 
41 | 
-------------------------------------------------------------------------------- /link-spark-with-jupyter.md: --------------------------------------------------------------------------------
1 | ## Install jupyter with conda
2 | 
3 | ```
4 | conda install jupyter
5 | ```
6 | 
7 | ## Get the `jupyter` binary executable path
8 | 
9 | ```
10 | which jupyter
11 | ```
12 | 
13 | output
14 | 
15 | ```
16 | /Users/mingchen/anaconda2/bin/jupyter
17 | ```
18 | 
19 | ## Link spark with jupyter
20 | 
21 | ```
22 | export PYSPARK_DRIVER_PYTHON=/Users/mingchen/anaconda2/bin/jupyter
23 | export PYSPARK_DRIVER_PYTHON_OPTS="notebook --NotebookApp.open_browser=False --NotebookApp.ip='*' --NotebookApp.port=8880"
24 | ```
25 | 
26 | You can also add the two environment variables to the `~/.bash_profile` file to permanently link Spark with Jupyter.
27 | 
28 | ## Run jupyter notebook
29 | 
30 | ```
31 | pyspark
32 | ```
33 | 
34 | Then go to [http://127.0.0.1:8880](http://127.0.0.1:8880)
35 | 
-------------------------------------------------------------------------------- /logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/logo.jpg
-------------------------------------------------------------------------------- /notebooks/01-data-strcture/.gitignore: --------------------------------------------------------------------------------
1 | spark-warehouse
2 | 
-------------------------------------------------------------------------------- /notebooks/02-data-manipulation/.ipynb_checkpoints/2.3-continuous-variable-to-categorical-variable-checkpoint.ipynb: --------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "# create entry points to spark\n",
10 |     "try:\n",
11 |     "    sc.stop()\n",
12 |     "except:\n",
13 |     "    pass\n",
14 |     "from pyspark import SparkContext, SparkConf\n",
15 |     "from pyspark.sql import SparkSession\n",
16 |     "sc=SparkContext()\n",
17 |     "spark = SparkSession(sparkContext=sc)"
18 |    ]
19 |   },
20 |   {
21 |    "cell_type": "markdown",
22 |    "metadata": {},
23 |    "source": [
24 |     "# Convert continuous 
variables to categorical variables" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "There are two functions we can use to split a continuous variable into categories:\n", 32 | "\n", 33 | "* `pyspark.ml.feature.Binarizer`: split a column of continuous features given a threshold\n", 34 | "* `pyspark.ml.feature.Bucktizer`: split a column of continuous features into categories given several breaking points.\n", 35 | " + with n+1 split points, there are n categories (buckets).\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Create some data" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "+--------------------+------------------+\n", 55 | "| x1| x2|\n", 56 | "+--------------------+------------------+\n", 57 | "| 0.47143516373249306| 6.834629351721363|\n", 58 | "| -1.1909756947064645| 7.127020269829002|\n", 59 | "| 1.4327069684260973|3.7025075479039495|\n", 60 | "| -0.3126518960917129| 5.611961860656249|\n", 61 | "| -0.7205887333650116| 5.030831653078097|\n", 62 | "| 0.8871629403077386|0.1376844959068224|\n", 63 | "| 0.8595884137174165| 7.728266216123741|\n", 64 | "| -0.6365235044173491| 8.826411906361166|\n", 65 | "|0.015696372114428918| 3.648859839013723|\n", 66 | "| -2.2426849541854055| 6.153961784334937|\n", 67 | "+--------------------+------------------+\n", 68 | "\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "import numpy as np\n", 74 | "import pandas as pd\n", 75 | "np.random.seed(seed=1234)\n", 76 | "pdf = pd.DataFrame({\n", 77 | " 'x1': np.random.randn(10),\n", 78 | " 'x2': np.random.rand(10)*10\n", 79 | " })\n", 80 | "np.random.seed(seed=None)\n", 81 | "df = spark.createDataFrame(pdf)\n", 82 | "df.show()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Binarize the column x1 and Bucketize the column x2" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 3, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "+--------------------+------------------+------+------+\n", 102 | "| x1| x2|x1_new|x2_new|\n", 103 | "+--------------------+------------------+------+------+\n", 104 | "| 0.47143516373249306| 6.834629351721363| 1.0| 2.0|\n", 105 | "| -1.1909756947064645| 7.127020269829002| 0.0| 2.0|\n", 106 | "| 1.4327069684260973|3.7025075479039495| 1.0| 1.0|\n", 107 | "| -0.3126518960917129| 5.611961860656249| 0.0| 2.0|\n", 108 | "| -0.7205887333650116| 5.030831653078097| 0.0| 2.0|\n", 109 | "| 0.8871629403077386|0.1376844959068224| 1.0| 0.0|\n", 110 | "| 0.8595884137174165| 7.728266216123741| 1.0| 3.0|\n", 111 | "| -0.6365235044173491| 8.826411906361166| 0.0| 3.0|\n", 112 | "|0.015696372114428918| 3.648859839013723| 1.0| 1.0|\n", 113 | "| -2.2426849541854055| 6.153961784334937| 0.0| 2.0|\n", 114 | "+--------------------+------------------+------+------+\n", 115 | "\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "from pyspark.ml.feature import Binarizer, Bucketizer\n", 121 | "# threshold = 0 for binarizer\n", 122 | "binarizer = Binarizer(threshold=0, inputCol='x1', outputCol='x1_new')\n", 123 | "# provide 5 split points to generate 4 buckets\n", 124 | "bucketizer = Bucketizer(splits=[0, 2.5, 5, 7.5, 10], inputCol='x2', outputCol='x2_new')\n", 125 | "\n", 126 | "# pipeline stages\n", 127 | "from pyspark.ml 
import Pipeline\n", 128 | "stages = [binarizer, bucketizer]\n", 129 | "pipeline = Pipeline(stages=stages)\n", 130 | "\n", 131 | "# fit the pipeline model and transform the data\n", 132 | "pipeline.fit(df).transform(df).show()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 3", 148 | "language": "python", 149 | "name": "python3" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.6.5" 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 2 166 | } 167 | -------------------------------------------------------------------------------- /notebooks/02-data-manipulation/2.3-continuous-variable-to-categorical-variable.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# create entry points to spark\n", 10 | "try:\n", 11 | " sc.stop()\n", 12 | "except:\n", 13 | " pass\n", 14 | "from pyspark import SparkContext, SparkConf\n", 15 | "from pyspark.sql import SparkSession\n", 16 | "sc=SparkContext()\n", 17 | "spark = SparkSession(sparkContext=sc)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Convert continuous variables to categorical variables" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "There are two functions we can use to split a continuous variable into categories:\n", 32 | "\n", 33 | "* `pyspark.ml.feature.Binarizer`: split a column of continuous features given a threshold\n", 34 | "* `pyspark.ml.feature.Bucktizer`: split a column of continuous features into categories given several breaking points.\n", 35 | " + with n+1 split points, there are n categories (buckets).\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Create some data" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "+--------------------+------------------+\n", 55 | "| x1| x2|\n", 56 | "+--------------------+------------------+\n", 57 | "| 0.47143516373249306| 6.834629351721363|\n", 58 | "| -1.1909756947064645| 7.127020269829002|\n", 59 | "| 1.4327069684260973|3.7025075479039495|\n", 60 | "| -0.3126518960917129| 5.611961860656249|\n", 61 | "| -0.7205887333650116| 5.030831653078097|\n", 62 | "| 0.8871629403077386|0.1376844959068224|\n", 63 | "| 0.8595884137174165| 7.728266216123741|\n", 64 | "| -0.6365235044173491| 8.826411906361166|\n", 65 | "|0.015696372114428918| 3.648859839013723|\n", 66 | "| -2.2426849541854055| 6.153961784334937|\n", 67 | "+--------------------+------------------+\n", 68 | "\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "import numpy as np\n", 74 | "import pandas as pd\n", 75 | "np.random.seed(seed=1234)\n", 76 | "pdf = pd.DataFrame({\n", 77 | " 'x1': np.random.randn(10),\n", 78 | " 'x2': np.random.rand(10)*10\n", 79 | " })\n", 80 | "np.random.seed(seed=None)\n", 81 | "df = 
spark.createDataFrame(pdf)\n", 82 | "df.show()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Binarize the column x1 and Bucketize the column x2" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 3, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "+--------------------+------------------+------+------+\n", 102 | "| x1| x2|x1_new|x2_new|\n", 103 | "+--------------------+------------------+------+------+\n", 104 | "| 0.47143516373249306| 6.834629351721363| 1.0| 2.0|\n", 105 | "| -1.1909756947064645| 7.127020269829002| 0.0| 2.0|\n", 106 | "| 1.4327069684260973|3.7025075479039495| 1.0| 1.0|\n", 107 | "| -0.3126518960917129| 5.611961860656249| 0.0| 2.0|\n", 108 | "| -0.7205887333650116| 5.030831653078097| 0.0| 2.0|\n", 109 | "| 0.8871629403077386|0.1376844959068224| 1.0| 0.0|\n", 110 | "| 0.8595884137174165| 7.728266216123741| 1.0| 3.0|\n", 111 | "| -0.6365235044173491| 8.826411906361166| 0.0| 3.0|\n", 112 | "|0.015696372114428918| 3.648859839013723| 1.0| 1.0|\n", 113 | "| -2.2426849541854055| 6.153961784334937| 0.0| 2.0|\n", 114 | "+--------------------+------------------+------+------+\n", 115 | "\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "from pyspark.ml.feature import Binarizer, Bucketizer\n", 121 | "# threshold = 0 for binarizer\n", 122 | "binarizer = Binarizer(threshold=0, inputCol='x1', outputCol='x1_new')\n", 123 | "# provide 5 split points to generate 4 buckets\n", 124 | "bucketizer = Bucketizer(splits=[0, 2.5, 5, 7.5, 10], inputCol='x2', outputCol='x2_new')\n", 125 | "\n", 126 | "# pipeline stages\n", 127 | "from pyspark.ml import Pipeline\n", 128 | "stages = [binarizer, bucketizer]\n", 129 | "pipeline = Pipeline(stages=stages)\n", 130 | "\n", 131 | "# fit the pipeline model and transform the data\n", 132 | "pipeline.fit(df).transform(df).show()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 3", 148 | "language": "python", 149 | "name": "python3" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.6.5" 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 2 166 | } 167 | -------------------------------------------------------------------------------- /notebooks/02-data-manipulation/2.7.1-column-expression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# create entry points to spark\n", 10 | "try:\n", 11 | " sc.stop()\n", 12 | "except:\n", 13 | " pass\n", 14 | "from pyspark import SparkContext, SparkConf\n", 15 | "from pyspark.sql import SparkSession\n", 16 | "sc=SparkContext()\n", 17 | "spark = SparkSession(sparkContext=sc)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Column expression\n", 25 | "\n", 26 | "A Spark **column instance** is **NOT a column of values** from the **DataFrame**: when you crate a column instance, it does not give 
you the actual values of that column in the DataFrame. I find it makes more sense to think of a **column instance as a column of expressions**. These expressions are evaluated by other methods (e.g., the **select()**, **groupBy()**, and **orderBy()** methods of **pyspark.sql.DataFrame**)." 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Example data" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n", 46 | "| model| mpg|cyl| disp| hp|drat| wt| qsec| vs| am|gear|carb|\n", 47 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n", 48 | "| Mazda RX4|21.0| 6|160.0|110| 3.9| 2.62|16.46| 0| 1| 4| 4|\n", 49 | "| Mazda RX4 Wag|21.0| 6|160.0|110| 3.9|2.875|17.02| 0| 1| 4| 4|\n", 50 | "| Datsun 710|22.8| 4|108.0| 93|3.85| 2.32|18.61| 1| 1| 4| 1|\n", 51 | "| Hornet 4 Drive|21.4| 6|258.0|110|3.08|3.215|19.44| 1| 0| 3| 1|\n", 52 | "|Hornet Sportabout|18.7| 8|360.0|175|3.15| 3.44|17.02| 0| 0| 3| 2|\n", 53 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n", 54 | "only showing top 5 rows\n", 55 | "\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "mtcars = spark.read.csv('../../data/mtcars.csv', inferSchema=True, header=True)\n", 61 | "mtcars = mtcars.withColumnRenamed('_c0', 'model')\n", 62 | "mtcars.show(5)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Use dot (.) to select column from DataFrame" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "Column" 81 | ] 82 | }, 83 | "execution_count": 4, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "mpg_col = mtcars.mpg\n", 90 | "mpg_col" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "## Modify a column to generate a new column" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 5, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "Column" 109 | ] 110 | }, 111 | "execution_count": 5, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "mpg_col + 1" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "+-----------+\n", 130 | "|(mpg * 100)|\n", 131 | "+-----------+\n", 132 | "| 2100.0|\n", 133 | "| 2100.0|\n", 134 | "| 2280.0|\n", 135 | "| 2140.0|\n", 136 | "| 1870.0|\n", 137 | "+-----------+\n", 138 | "only showing top 5 rows\n", 139 | "\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "mtcars.select(mpg_col * 100).show(5)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "The **pyspark.sql.Column** class has many methods that act on a column and return a new column instance."
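A quick sketch of the idea (it reuses the `mtcars` DataFrame loaded above; the `hp_per_wt` name is purely illustrative): building an expression produces a `Column` object but computes nothing, and the work only happens when a DataFrame method such as `select()` or `orderBy()` evaluates it.

```python
# Building expressions yields Column objects, not data:
ratio = (mtcars.hp / mtcars.wt).alias('hp_per_wt')  # still just a Column
flag = mtcars.cyl.isin([4, 6])                      # also a Column

# The expressions are evaluated only when a DataFrame method consumes them:
mtcars.select('model', ratio, flag).show(5)
mtcars.orderBy(mtcars.mpg.desc()).show(5)
```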
152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 7, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "+----------------+\n", 164 | "|(gear IN (2, 3))|\n", 165 | "+----------------+\n", 166 | "| false|\n", 167 | "| false|\n", 168 | "| false|\n", 169 | "| true|\n", 170 | "| true|\n", 171 | "+----------------+\n", 172 | "only showing top 5 rows\n", 173 | "\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "mtcars.select(mtcars.gear.isin([2,3])).show(5)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 8, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "Column" 190 | ] 191 | }, 192 | "execution_count": 8, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "mtcars.mpg.asc()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [] 209 | } 210 | ], 211 | "metadata": { 212 | "kernelspec": { 213 | "display_name": "Python 3", 214 | "language": "python", 215 | "name": "python3" 216 | }, 217 | "language_info": { 218 | "codemirror_mode": { 219 | "name": "ipython", 220 | "version": 3 221 | }, 222 | "file_extension": ".py", 223 | "mimetype": "text/x-python", 224 | "name": "python", 225 | "nbconvert_exporter": "python", 226 | "pygments_lexer": "ipython3", 227 | "version": "3.6.5" 228 | } 229 | }, 230 | "nbformat": 4, 231 | "nbformat_minor": 2 232 | } 233 | -------------------------------------------------------------------------------- /notebooks/02-data-manipulation/2.7.2-dot-column-expression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# create entry points to spark\n", 12 | "try:\n", 13 | " sc.stop()\n", 14 | "except:\n", 15 | " pass\n", 16 | "from pyspark import SparkContext, SparkConf\n", 17 | "from pyspark.sql import SparkSession\n", 18 | "sc=SparkContext()\n", 19 | "spark = SparkSession(sparkContext=sc)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Example data" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n", 39 | "| model| mpg|cyl| disp| hp|drat| wt| qsec| vs| am|gear|carb|\n", 40 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n", 41 | "| Mazda RX4|21.0| 6|160.0|110| 3.9| 2.62|16.46| 0| 1| 4| 4|\n", 42 | "| Mazda RX4 Wag|21.0| 6|160.0|110| 3.9|2.875|17.02| 0| 1| 4| 4|\n", 43 | "| Datsun 710|22.8| 4|108.0| 93|3.85| 2.32|18.61| 1| 1| 4| 1|\n", 44 | "| Hornet 4 Drive|21.4| 6|258.0|110|3.08|3.215|19.44| 1| 0| 3| 1|\n", 45 | "|Hornet Sportabout|18.7| 8|360.0|175|3.15| 3.44|17.02| 0| 0| 3| 2|\n", 46 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n", 47 | "only showing top 5 rows\n", 48 | "\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "mtcars = spark.read.csv('../../../data/mtcars.csv', inferSchema=True, header=True)\n", 54 | "mtcars = mtcars.withColumnRenamed('_c0', 'model')\n", 55 | "mtcars.show(5)" 56 | ] 57 | }, 58 
| { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Dot (.) column expression\n", 63 | "\n", 64 | "Create a column expression that will return the original column values." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "Column" 76 | ] 77 | }, 78 | "execution_count": 3, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "mpg_col_exp = mtcars.mpg\n", 85 | "mpg_col_exp" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "+----+\n", 98 | "| mpg|\n", 99 | "+----+\n", 100 | "|21.0|\n", 101 | "|21.0|\n", 102 | "|22.8|\n", 103 | "|21.4|\n", 104 | "|18.7|\n", 105 | "+----+\n", 106 | "only showing top 5 rows\n", 107 | "\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "mtcars.select(mpg_col_exp).show(5)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.5.0" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 2 146 | } 147 | -------------------------------------------------------------------------------- /notebooks/04-miscellaneous/add-python-files-to-spark-cluster.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "The `SparkContext.addPyFile()` function can be used to add Python (.py) files. We can define objects and variables in these files and make them available to the Spark cluster."
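As a minimal sketch of why this matters (the module and function names mirror the repo's `pyFiles/my_module.py`; the RDD is made up for illustration), a file shipped with `addPyFile()` becomes importable inside functions that run on the executors, not just on the driver:

```python
# assumes sc.addPyFile('pyFiles/my_module.py') has already been called
from my_module import sum_two_variables

rdd = sc.parallelize([(1, 2), (3, 4)])
# the lambda executes on the workers, where my_module is now importable
rdd.map(lambda pair: sum_two_variables(*pair)).collect()  # [3, 7]
```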
8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Create a SparkContext object" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from pyspark import SparkConf, SparkContext, SparkFiles\n", 26 | "from pyspark.sql import SparkSession" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "sc = SparkContext(conf=SparkConf())" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "# Add py files" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "sc.addPyFile('pyFiles/my_module.py')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "'/private/var/folders/2_/kb60z5_j0k91tyh740s1zhn40000gn/T/spark-4f959e9f-4af6-490e-afce-02e1582aae8d/userFiles-8b1c073b-4c82-467a-b9ff-021aa3067abe/my_module.py'" 67 | ] 68 | }, 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "SparkFiles.get('my_module.py')" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "# Use **my_module.py**\n", 83 | "We can import `my_module` as a python module" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": { 90 | "collapsed": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "from my_module import *" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 6, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "True" 106 | ] 107 | }, 108 | "execution_count": 6, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "addPyFiles_is_successfull()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 7, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "9" 126 | ] 127 | }, 128 | "execution_count": 7, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "sum_two_variables(4,5)" 135 | ] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "Python 3", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.5.0" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 2 159 | } 160 | -------------------------------------------------------------------------------- /notebooks/04-miscellaneous/dense-vs-sparse-vectors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from pyspark import SparkConf, SparkContext\n", 12 | "from pyspark.sql import SparkSession" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | 
"execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "sc = SparkContext(conf=SparkConf())\n", 24 | "spark = SparkSession(sparkContext=sc)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "from pyspark.ml.linalg import Vector, DenseVector, SparseVector" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# Dense vector and sparse vector" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "A vector can be represented in dense and sparse formats. A dense vector is a regular vector that has each elements printed. A sparse vector use three components to represent a vector but with less memory." 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 22, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "DenseVector([1.0, 0.0, 0.0, 0.0, 4.5, 0.0])" 61 | ] 62 | }, 63 | "execution_count": 22, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "dv = DenseVector([1.0,0.,0.,0.,4.5,0])\n", 70 | "dv" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Three components of a sparse vector\n", 78 | "\n", 79 | "* vector size\n", 80 | "* indices of active elements\n", 81 | "* values of active elements\n", 82 | "\n", 83 | "In the above dense vector:\n", 84 | "\n", 85 | "* vector size = 6\n", 86 | "* indices of active elements = [0, 4]\n", 87 | "* values of active elements = [1.0, 4.5]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "We can use the `SparseVector()` function to create a sparse vector. The first argument is the vector size, the second\n", 95 | "argument is a dictionary. The keys are indices of active elements and the values are values of active elements." 
96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 23, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "SparseVector(6, {0: 1.0, 4: 4.5})" 107 | ] 108 | }, 109 | "execution_count": 23, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "sv = SparseVector(6, {0:1.0, 4:4.5})\n", 116 | "sv" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## Convert sparse vector to dense vector" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 30, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/plain": [ 134 | "DenseVector([1.0, 0.0, 0.0, 0.0, 4.5, 0.0])" 135 | ] 136 | }, 137 | "execution_count": 30, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "DenseVector(sv.toArray())" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Convert dense vector to sparse vector" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 33, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "{0: 1.0, 4: 4.5}" 162 | ] 163 | }, 164 | "execution_count": 33, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "active_elements_dict = {index: value for index, value in enumerate(dv) if value != 0}\n", 171 | "active_elements_dict" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 34, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "SparseVector(6, {0: 1.0, 4: 4.5})" 183 | ] 184 | }, 185 | "execution_count": 34, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "SparseVector(len(dv), active_elements_dict)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [] 202 | } 203 | ], 204 | "metadata": { 205 | "kernelspec": { 206 | "display_name": "Python 3", 207 | "language": "python", 208 | "name": "python3" 209 | }, 210 | "language_info": { 211 | "codemirror_mode": { 212 | "name": "ipython", 213 | "version": 3 214 | }, 215 | "file_extension": ".py", 216 | "mimetype": "text/x-python", 217 | "name": "python", 218 | "nbconvert_exporter": "python", 219 | "pygments_lexer": "ipython3", 220 | "version": "3.5.0" 221 | } 222 | }, 223 | "nbformat": 4, 224 | "nbformat_minor": 2 225 | } 226 | -------------------------------------------------------------------------------- /notebooks/04-miscellaneous/issues-and-solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Issues and Solutions" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**load pyspark environment permission denied**\n", 15 | "\n", 16 | "This issue might be caused by a recent macOS update to Sierra 10.12.5."
17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "```\n", 24 | "\n", 25 | "```" 26 | ] 27 | } 28 | ], 29 | "metadata": { 30 | "kernelspec": { 31 | "display_name": "Python 3", 32 | "language": "python", 33 | "name": "python3" 34 | }, 35 | "language_info": { 36 | "codemirror_mode": { 37 | "name": "ipython", 38 | "version": 3 39 | }, 40 | "file_extension": ".py", 41 | "mimetype": "text/x-python", 42 | "name": "python", 43 | "nbconvert_exporter": "python", 44 | "pygments_lexer": "ipython3", 45 | "version": "3.6.1" 46 | } 47 | }, 48 | "nbformat": 4, 49 | "nbformat_minor": 2 50 | } 51 | -------------------------------------------------------------------------------- /notebooks/05-module-turning/cross-validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Cross-validation\n", 8 | "---" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "## Training/validation/test data sets\n", 16 | "\n", 17 | "* **Training set**: the data set for training your models.\n", 18 | "* **Validation set**: the data set used for testing the performance of the models you have built with the training set. Based on the performance, you choose the best (final) model.\n", 19 | "* **Test set**: use this data set to test the performance of your final model.\n", 20 | "\n", 21 | "## K-folds cross validation steps (k=4 as an example).\n", 22 | "\n", 23 | "* step 1: split your data into training set and test set (for example 80% training and 20% test). The test set will never be used in model training and selection. \n", 24 | "* step 2: split the training set into k (k=4) equal subsets: 3 subsets for training + 1 subset for validation.\n", 25 | "* step 3: train your models with the 3 subsets and calculate a performance score with the remaining 1 subset.\n", 26 | "* step 4: choose a different subset for validation and then repeat step 3 until every subset has been used as a validation subset.\n", 27 | "* step 5: for a k=4 fold cross validation, each trained model should have been validated by 4 subsets and therefore has 4 performance scores. Calculate the average of these 4 performance scores for each model. 
Use the average score to select the best, final model.\n", 28 | "* step 6: apply your final model to the **untouched** test data and see how it performs.\n", 29 | "\n", 30 | "## Example of k-folds cross validation\n", 31 | "\n", 32 | "### Build parameter grids\n", 33 | "\n", 34 | "* parameter grid: all combinations of the tunable parameter values in your model.\n", 35 | "* example: If I want to train a logistic regression model on 4 different *regParam* and 3 different *elasticNetParam*, I will have 3 x 4 = 12 models to train and validate.\n", 36 | " " 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "from pyspark.ml.classification import LogisticRegression\n", 48 | "blor = LogisticRegression(featuresCol='indexed_features', labelCol='label', family='binomial')\n", 49 | "\n", 50 | "from pyspark.ml.tuning import ParamGridBuilder\n", 51 | "param_grid = ParamGridBuilder().\\\n", 52 | " addGrid(blor.regParam, [0, 0.5, 1, 2]).\\\n", 53 | " addGrid(blor.elasticNetParam, [0, 0.5, 1]).\\\n", 54 | " build()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "### Split data into training and test sets\n", 62 | "* Refer to the [logistic regression page](logistic-regression.ipynb) to see what data we used and how the training and test sets were generated.\n", 63 | "\n", 64 | "### Run k (k=4) folds cross validation" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", 76 | "evaluator = BinaryClassificationEvaluator()\n", 77 | "\n", 78 | "from pyspark.ml.tuning import CrossValidator\n", 79 | "cv = CrossValidator(estimator=blor, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)\n", 80 | "\n", 81 | "cvModel = cv.fit(training)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.6.1" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /notebooks/05-module-turning/regularization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Regularization\n", 8 | "---" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "## Regularization\n", 16 | "\n", 17 | "Regularization is a technique used to solve the overfitting problem. An overfitted model means that the model can predict very well with the training data, but performs poorly with independent validation data.\n", 18 | "\n", 19 | "When we add more predictors to our model, we will almost necessarily decrease the **Residual Sum of Squares** (RSS; smaller RSS indicates better model). 
This increases the complexity of our model and makes our model only perform well on the training data (overfitting).\n", 20 | "\n", 21 | "To balance the RSS and model overfitting, we introduce a penalty for adding new predictors (coefficients $\beta_j \neq 0$) to the model.\n", 22 | "\n", 23 | "\n", 24 | "\n", 25 | "\n", 26 | "\n", 27 | "\n", 28 | "\n", 29 | "\n", 30 | "## LASSO regularization and Ridge regularization\n", 31 | "\n", 32 | "* **LASSO**: $min \{RSS + \lambda\sum_{j=1}^{p}|\beta_j|\}$\n", 33 | "* **Ridge**: $min \{RSS + \lambda\sum_{j=1}^{p}\beta_j^2\}$\n", 34 | "\n", 35 | "## Elastic Net regularization\n", 36 | "\n", 37 | "Elastic net is a regularization method that linearly combines the penalties of the lasso and ridge methods.\n", 38 | "\n", 39 | "* **elastic net**: $min \{RSS + \lambda[\frac{1-\alpha}{2}\sum_{j=1}^{p}\beta_j^2 + \alpha\sum_{j=1}^{p}|\beta_j|]\}$\n", 40 | "\n", 41 | "Reference: https://spark.apache.org/docs/2.1.1/ml-classification-regression.html\n", 42 | "\n", 43 | "## *regParam* and *elasticNetParam* parameters in regression models\n", 44 | "\n", 45 | "* **regParam**: corresponds to $\lambda$\n", 46 | "* **elasticNetParam** corresponds to $\alpha$. When $\alpha = 0$, it is ridge regularization (L2 penalty). When $\alpha = 1$, it is lasso regularization (L1 penalty)." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "Python 3", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.6.1" 76 | } 77 | }, 78 | "nbformat": 4, 79 | "nbformat_minor": 2 80 | } 81 | -------------------------------------------------------------------------------- /notebooks/ipynb/.ipynb_checkpoints/HashingTF-and-CountVectorizer-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/ipynb/.ipynb_checkpoints/NaiveBayes-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/ipynb/.ipynb_checkpoints/RDD-manipulation-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/ipynb/derby.log: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------- 2 | Wed Mar 22 19:59:25 EDT 2017: 3 | Booting Derby version The Apache Software Foundation - Apache Derby - 10.12.1.1 - (1704137): instance a816c00e-015a-f875-fa3e-0000108dd888 4 | on database directory /Users/mingchen/GoogleDrive/R-projects/learning-apache-spark/ipynb/metastore_db with class
loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@3bcf99f7 5 | Loaded from file:/usr/local/Cellar/apache-spark/2.1.0/libexec/jars/derby-10.12.1.1.jar 6 | java.vendor=Oracle Corporation 7 | java.runtime.version=1.8.0_51-b16 8 | user.dir=/Users/mingchen/GoogleDrive/R-projects/learning-apache-spark/ipynb 9 | os.name=Mac OS X 10 | os.arch=x86_64 11 | os.version=10.12.3 12 | derby.system.home=null 13 | Database Class Loader started - derby.database.classpath='' 14 | -------------------------------------------------------------------------------- /notebooks/ipynb/preproc.py: -------------------------------------------------------------------------------- 1 | from nltk.stem.wordnet import WordNetLemmatizer 2 | from nltk.corpus import stopwords 3 | from nltk import pos_tag 4 | import string 5 | import re 6 | import langid 7 | 8 | # Convert to float format 9 | def string_to_float(x): 10 | return float(x) 11 | 12 | # Use langid module to classify the language to make sure we are applying the correct cleanup actions for English 13 | # https://github.com/saffsd/langid.py 14 | def check_lang(data_str): 15 | predict_lang = langid.classify(data_str) 16 | if predict_lang[1] >= .9: 17 | language = predict_lang[0] 18 | else: 19 | language = 'NA' 20 | return language 21 | 22 | 23 | # Stop words usually refer to the most common words in a language; there is no single universal list of stop words used 24 | # by all natural language processing tools. 25 | # Reduces Dimensionality 26 | # removes stop words of a single Tweet (cleaned_str/row/document) 27 | def remove_stops(data_str): 28 | # expects a string 29 | stops = set(stopwords.words("english")) 30 | list_pos = 0 31 | cleaned_str = '' 32 | text = data_str.split() 33 | for word in text: 34 | if word not in stops: 35 | # rebuild cleaned_str 36 | if list_pos == 0: 37 | cleaned_str = word 38 | else: 39 | cleaned_str = cleaned_str + ' ' + word 40 | list_pos += 1 41 | return cleaned_str 42 | 43 | 44 | # catch-all to remove other 'words' that I felt didn't add a lot of value 45 | # Reduces Dimensionality, gets rid of a lot of unique urls 46 | def remove_features(data_str): 47 | # compile regex 48 | url_re = re.compile('https?://(www.)?\w+\.\w+(/\w+)*/?') 49 | punc_re = re.compile('[%s]' % re.escape(string.punctuation)) 50 | num_re = re.compile('(\\d+)') 51 | mention_re = re.compile('@(\w+)') 52 | alpha_num_re = re.compile("^[a-z0-9_.]+$") 53 | # convert to lowercase 54 | data_str = data_str.lower() 55 | # remove hyperlinks 56 | data_str = url_re.sub(' ', data_str) 57 | # remove @mentions 58 | data_str = mention_re.sub(' ', data_str) 59 | # remove punctuation 60 | data_str = punc_re.sub(' ', data_str) 61 | # remove numeric 'words' 62 | data_str = num_re.sub(' ', data_str) 63 | # remove non a-z 0-9 characters and words shorter than 3 characters 64 | list_pos = 0 65 | cleaned_str = '' 66 | for word in data_str.split(): 67 | if list_pos == 0: 68 | if alpha_num_re.match(word) and len(word) > 2: 69 | cleaned_str = word 70 | else: 71 | cleaned_str = ' ' 72 | else: 73 | if alpha_num_re.match(word) and len(word) > 2: 74 | cleaned_str = cleaned_str + ' ' + word 75 | else: 76 | cleaned_str += ' ' 77 | list_pos += 1 78 | return cleaned_str 79 | 80 | 81 | # Process of classifying words into their parts of speech and labeling them accordingly is known as part-of-speech 82 | # tagging, POS-tagging, or simply tagging. Parts of speech are also known as word classes or lexical categories. 
The 83 | # collection of tags used for a particular task is known as a tagset. Our emphasis in this chapter is on exploiting 84 | # tags, and tagging text automatically. 85 | # http://www.nltk.org/book/ch05.html 86 | def tag_and_remove(data_str): 87 | cleaned_str = ' ' 88 | # noun tags 89 | nn_tags = ['NN', 'NNP', 'NNP', 'NNPS', 'NNS'] 90 | # adjectives 91 | jj_tags = ['JJ', 'JJR', 'JJS'] 92 | # verbs 93 | vb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'] 94 | nltk_tags = nn_tags + jj_tags + vb_tags 95 | 96 | # break string into 'words' 97 | text = data_str.split() 98 | 99 | # tag the text and keep only those with the right tags 100 | tagged_text = pos_tag(text) 101 | for tagged_word in tagged_text: 102 | if tagged_word[1] in nltk_tags: 103 | cleaned_str += tagged_word[0] + ' ' 104 | 105 | return cleaned_str 106 | 107 | 108 | # Tweets are going to use different forms of a word, such as organize, organizes, and 109 | # organizing. Additionally, there are families of derivationally related words with similar meanings, such as democracy, 110 | # democratic, and democratization. In many situations, it seems as if it would be useful for a search for one of these 111 | # words to return documents that contain another word in the set. 112 | # Reduces Dimensionality and boosts numerical measures like TFIDF 113 | 114 | # http://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html 115 | # lemmatization of a single Tweet (cleaned_str/row/document) 116 | def lemmatize(data_str): 117 | # expects a string 118 | list_pos = 0 119 | cleaned_str = '' 120 | lmtzr = WordNetLemmatizer() 121 | text = data_str.split() 122 | tagged_words = pos_tag(text) 123 | for word in tagged_words: 124 | if 'v' in word[1].lower(): 125 | lemma = lmtzr.lemmatize(word[0], pos='v') 126 | else: 127 | lemma = lmtzr.lemmatize(word[0], pos='n') 128 | if list_pos == 0: 129 | cleaned_str = lemma 130 | else: 131 | cleaned_str = cleaned_str + ' ' + lemma 132 | list_pos += 1 133 | return cleaned_str 134 | 135 | 136 | # check to see if a row only contains whitespace 137 | def check_blanks(data_str): 138 | is_blank = str(data_str.isspace()) 139 | return is_blank 140 | -------------------------------------------------------------------------------- /notebooks/ipynb/vector.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from pyspark import SparkConf, SparkContext\n", 12 | "## set up spark context\n", 13 | "from pyspark.sql import SQLContext\n", 14 | "sc = SparkContext()\n", 15 | "sqlContext = SQLContext(sc)\n", 16 | "## set up SparkSession\n", 17 | "from pyspark.sql import SparkSession\n", 18 | "\n", 19 | "spark = SparkSession \\\n", 20 | " .builder \\\n", 21 | " .appName(\"Python Spark SQL basic example\") \\\n", 22 | " .config(\"spark.some.config.option\", \"some-value\") \\\n", 23 | " .getOrCreate()" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from pyspark.ml.linalg import Vectors\n", 35 | "densevector = Vectors.dense([1,3,4,2.5])" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "DenseVector([1.0, 3.0, 4.0, 2.5])" 49 | ] 50 | }, 51 | 
"execution_count": 3, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "densevector" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "array([ 1. , 3. , 4. , 2.5])" 71 | ] 72 | }, 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "densevector.toArray()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "sparseVector = Vectors.sparse(10, [1, 3], [3.0, 4.5])" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 6, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "array([ 0. , 3. , 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. ])" 104 | ] 105 | }, 106 | "execution_count": 6, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "sparseVector.toArray()" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 2", 119 | "language": "python", 120 | "name": "python2" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 2 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython2", 132 | "version": "2.7.6" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 1 137 | } 138 | -------------------------------------------------------------------------------- /pyFiles/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /pyFiles/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /pyFiles/.idea/pyFiles.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /pyFiles/my_module.py: -------------------------------------------------------------------------------- 1 | def addPyFiles_is_successfull(): 2 | return(True) 3 | 4 | def sum_two_variables(a, b): 5 | return(sum([a,b])) 6 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/.gitignore: -------------------------------------------------------------------------------- 1 | /debug 2 | /jstree.sublime-project 3 | /jstree.sublime-workspace 4 | /bower_components 5 | /node_modules 6 | /site 7 | /nuget 8 | /demo/filebrowser/data/root 9 | /npm.txt 10 | /libs 11 | /docs 12 | /dist/libs 13 | /.vscode 14 | /.idea -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 Ivan Bozhanov 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated documentation 5 | files (the "Software"), to deal in the Software without 6 | restriction, including 
without limitation the rights to use, 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the 9 | Software is furnished to do so, subject to the following 10 | conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "jstree", 3 | "license": "MIT", 4 | "version": "3.3.5", 5 | "main" : [ 6 | "./dist/jstree.js", 7 | "./dist/themes/default/style.css" 8 | ], 9 | "ignore": [ 10 | "**/.*", 11 | "docs", 12 | "demo", 13 | "libs", 14 | "node_modules", 15 | "test", 16 | "libs", 17 | "jstree.jquery.json", 18 | "gruntfile.js", 19 | "package.json", 20 | "bower.json", 21 | "component.json", 22 | "LICENCE-MIT", 23 | "README.md" 24 | ], 25 | "dependencies": { 26 | "jquery": ">=1.9.1" 27 | }, 28 | "keywords": [ 29 | "ui", 30 | "tree", 31 | "jstree" 32 | ] 33 | } 34 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/component.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "jstree", 3 | "repo": "vakata/jstree", 4 | "description": "jsTree is jquery plugin, that provides interactive trees.", 5 | "version": "3.3.5", 6 | "license": "MIT", 7 | "keywords": [ 8 | "ui", 9 | "tree", 10 | "jstree" 11 | ], 12 | "scripts": [ 13 | "dist/jstree.js", 14 | "dist/jstree.min.js" 15 | ], 16 | "images": [ 17 | "dist/themes/default/32px.png", 18 | "dist/themes/default/40px.png", 19 | "dist/themes/default/throbber.gif" 20 | ], 21 | "styles": [ 22 | "dist/themes/default/style.css", 23 | "dist/themes/default/style.min.css" 24 | ], 25 | "dependencies": { 26 | "components/jquery": ">=1.9.1" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "vakata/jstree", 3 | "description": "jsTree is jquery plugin, that provides interactive trees.", 4 | "type": "component", 5 | "homepage": "http://jstree.com", 6 | "license": "MIT", 7 | "support": { 8 | "issues": "https://github.com/vakata/jstree/issues", 9 | "forum": "https://groups.google.com/forum/#!forum/jstree", 10 | "source": "https://github.com/vakata/jstree" 11 | }, 12 | "authors": [ 13 | { 14 | "name": "Ivan Bozhanov", 15 | "email": "jstree@jstree.com" 16 | } 17 | ], 18 | "require": { 19 | "components/jquery": ">=1.9.1" 20 | }, 21 | "suggest": { 22 | "robloach/component-installer": "Allows installation of Components via Composer" 23 | }, 24 | "extra": { 25 | "component": { 26 | "scripts": [ 27 | "dist/jstree.js" 28 | ], 29 | "styles": [ 30 | "dist/themes/default/style.css" 31 | ], 32 | "images": [ 33 | 
"dist/themes/default/32px.png", 34 | "dist/themes/default/40px.png", 35 | "dist/themes/default/throbber.gif" 36 | ], 37 | "files": [ 38 | "dist/jstree.min.js", 39 | "dist/themes/default/style.min.css", 40 | "dist/themes/default/32px.png", 41 | "dist/themes/default/40px.png", 42 | "dist/themes/default/throbber.gif" 43 | ] 44 | } 45 | } 46 | } -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/demo/README.md: -------------------------------------------------------------------------------- 1 | ## PHP demos moved to new repository 2 | https://github.com/vakata/jstree-php-demos -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/demo/basic/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | jstree basic demos 6 | 12 | 13 | 14 | 15 |

[demo page rendered as plain text during extraction; the markup and scripts of index.html were lost. Recoverable section headings: HTML demo (an inline markup tree: Root node, with Child node 1 and Child node 2), Inline data demo, Data format demo, AJAX demo, Lazy loading demo, Callback function data demo, and Interaction and events demo ("either click the button or a node in the tree").]
45 | 46 | 47 | 48 | 49 | 145 | 146 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/demo/basic/root.json: -------------------------------------------------------------------------------- 1 | [{"id":1,"text":"Root node","children":[{"id":2,"text":"Child node 1"},{"id":3,"text":"Child node 2"}]}] -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/dist/themes/default-dark/32px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default-dark/32px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/dist/themes/default-dark/40px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default-dark/40px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/dist/themes/default-dark/throbber.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default-dark/throbber.gif -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/dist/themes/default/32px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default/32px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/dist/themes/default/40px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default/40px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/dist/themes/default/throbber.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default/throbber.gif -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/jstree.jquery.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "jstree", 3 | "title": "jsTree", 4 | "description": "Tree view for jQuery", 5 | "version": "3.3.5", 6 | "homepage": "http://jstree.com", 7 | "keywords": [ 8 | "ui", 9 | "tree", 10 | "jstree" 11 | ], 12 | "author": { 13 | "name": "Ivan Bozhanov", 14 | "email": "jstree@jstree.com", 15 | "url": "http://vakata.com" 16 | }, 17 | "licenses": [ 18 | { 19 | "type": "MIT", 20 | "url": "https://github.com/vakata/jstree/blob/master/LICENSE-MIT" 21 | } 22 | ], 23 | "bugs": "https://github.com/vakata/jstree/issues", 24 | "demo": "http://jstree.com/demo", 25 | "dependencies": { 26 | "jquery": ">=1.9.1" 27 | } 28 | } -------------------------------------------------------------------------------- 
/vakata-jstree-3.3.5/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "jstree", 3 | "title": "jsTree", 4 | "description": "jQuery tree plugin", 5 | "version": "3.3.5", 6 | "homepage": "http://jstree.com", 7 | "main": "./dist/jstree.js", 8 | "author": { 9 | "name": "Ivan Bozhanov", 10 | "email": "jstree@jstree.com", 11 | "url": "http://vakata.com" 12 | }, 13 | "repository": { 14 | "type": "git", 15 | "url": "git://github.com/vakata/jstree.git" 16 | }, 17 | "bugs": { 18 | "url": "https://github.com/vakata/jstree/issues" 19 | }, 20 | "license": "MIT", 21 | "licenses": [ 22 | { 23 | "type": "MIT", 24 | "url": "https://github.com/vakata/jstree/blob/master/LICENSE-MIT" 25 | } 26 | ], 27 | "keywords": [], 28 | "devDependencies": { 29 | "dox": "~0.4.4", 30 | "grunt": "~0.4.0", 31 | "grunt-contrib-concat": "*", 32 | "grunt-contrib-copy": "*", 33 | "grunt-contrib-imagemin": "~0.4.0", 34 | "grunt-contrib-jshint": "*", 35 | "grunt-contrib-less": "~0.8.2", 36 | "grunt-contrib-qunit": "~v0.3.0", 37 | "grunt-contrib-uglify": "*", 38 | "grunt-contrib-watch": "~0.5.3", 39 | "grunt-phantomcss-gitdiff": "0.0.7", 40 | "grunt-resemble-cli": "0.0.8", 41 | "grunt-text-replace": "~0.3.11" 42 | }, 43 | "dependencies": { 44 | "jquery": ">=1.9.1" 45 | }, 46 | "npmName": "jstree", 47 | "npmFileMap": [ 48 | { 49 | "basePath": "/dist/", 50 | "files": [ 51 | "jstree.min.js", 52 | "themes/**/*.png", 53 | "themes/**/*.gif", 54 | "themes/**/*.min.css" 55 | ] 56 | } 57 | ] 58 | } 59 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/intro.js: -------------------------------------------------------------------------------- 1 | /*globals jQuery, define, module, exports, require, window, document, postMessage */ 2 | (function (factory) { 3 | "use strict"; 4 | if (typeof define === 'function' && define.amd) { 5 | define(['jquery'], factory); 6 | } 7 | else if(typeof module !== 'undefined' && module.exports) { 8 | module.exports = factory(require('jquery')); 9 | } 10 | else { 11 | factory(jQuery); 12 | } 13 | }(function ($, undefined) { 14 | "use strict"; 15 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/jstree.changed.js: -------------------------------------------------------------------------------- 1 | /** 2 | * ### Changed plugin 3 | * 4 | * This plugin adds more information to the `changed.jstree` event. The new data is contained in the `changed` event data property, and contains a lists of `selected` and `deselected` nodes. 
5 | */ 6 | /*globals jQuery, define, exports, require, document */ 7 | (function (factory) { 8 | "use strict"; 9 | if (typeof define === 'function' && define.amd) { 10 | define('jstree.changed', ['jquery','jstree'], factory); 11 | } 12 | else if(typeof exports === 'object') { 13 | factory(require('jquery'), require('jstree')); 14 | } 15 | else { 16 | factory(jQuery, jQuery.jstree); 17 | } 18 | }(function ($, jstree, undefined) { 19 | "use strict"; 20 | 21 | if($.jstree.plugins.changed) { return; } 22 | 23 | $.jstree.plugins.changed = function (options, parent) { 24 | var last = []; 25 | this.trigger = function (ev, data) { 26 | var i, j; 27 | if(!data) { 28 | data = {}; 29 | } 30 | if(ev.replace('.jstree','') === 'changed') { 31 | data.changed = { selected : [], deselected : [] }; 32 | var tmp = {}; 33 | for(i = 0, j = last.length; i < j; i++) { 34 | tmp[last[i]] = 1; 35 | } 36 | for(i = 0, j = data.selected.length; i < j; i++) { 37 | if(!tmp[data.selected[i]]) { 38 | data.changed.selected.push(data.selected[i]); 39 | } 40 | else { 41 | tmp[data.selected[i]] = 2; 42 | } 43 | } 44 | for(i = 0, j = last.length; i < j; i++) { 45 | if(tmp[last[i]] === 1) { 46 | data.changed.deselected.push(last[i]); 47 | } 48 | } 49 | last = data.selected.slice(); 50 | } 51 | /** 52 | * triggered when selection changes (the "changed" plugin enhances the original event with more data) 53 | * @event 54 | * @name changed.jstree 55 | * @param {Object} node 56 | * @param {Object} action the action that caused the selection to change 57 | * @param {Array} selected the current selection 58 | * @param {Object} changed an object containing two properties `selected` and `deselected` - both arrays of node IDs, which were selected or deselected since the last changed event 59 | * @param {Object} event the event (if any) that triggered this changed event 60 | * @plugin changed 61 | */ 62 | parent.trigger.call(this, ev, data); 63 | }; 64 | this.refresh = function (skip_loading, forget_state) { 65 | last = []; 66 | return parent.refresh.apply(this, arguments); 67 | }; 68 | }; 69 | })); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/jstree.conditionalselect.js: -------------------------------------------------------------------------------- 1 | /** 2 | * ### Conditionalselect plugin 3 | * 4 | * This plugin allows defining a callback to allow or deny node selection by user input (activate node method). 5 | */ 6 | /*globals jQuery, define, exports, require, document */ 7 | (function (factory) { 8 | "use strict"; 9 | if (typeof define === 'function' && define.amd) { 10 | define('jstree.conditionalselect', ['jquery','jstree'], factory); 11 | } 12 | else if(typeof exports === 'object') { 13 | factory(require('jquery'), require('jstree')); 14 | } 15 | else { 16 | factory(jQuery, jQuery.jstree); 17 | } 18 | }(function ($, jstree, undefined) { 19 | "use strict"; 20 | 21 | if($.jstree.plugins.conditionalselect) { return; } 22 | 23 | /** 24 | * a callback (function) which is invoked in the instance's scope and receives two arguments - the node and the event that triggered the `activate_node` call. Returning false prevents working with the node, returning true allows invoking activate_node. Defaults to returning `true`. 
25 | * @name $.jstree.defaults.checkbox.visible 26 | * @plugin checkbox 27 | */ 28 | $.jstree.defaults.conditionalselect = function () { return true; }; 29 | $.jstree.plugins.conditionalselect = function (options, parent) { 30 | // own function 31 | this.activate_node = function (obj, e) { 32 | if(this.settings.conditionalselect.call(this, this.get_node(obj), e)) { 33 | return parent.activate_node.call(this, obj, e); 34 | } 35 | }; 36 | }; 37 | 38 | })); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/jstree.massload.js: -------------------------------------------------------------------------------- 1 | /** 2 | * ### Massload plugin 3 | * 4 | * Adds massload functionality to jsTree, so that multiple nodes can be loaded in a single request (only useful with lazy loading). 5 | */ 6 | /*globals jQuery, define, exports, require, document */ 7 | (function (factory) { 8 | "use strict"; 9 | if (typeof define === 'function' && define.amd) { 10 | define('jstree.massload', ['jquery','jstree'], factory); 11 | } 12 | else if(typeof exports === 'object') { 13 | factory(require('jquery'), require('jstree')); 14 | } 15 | else { 16 | factory(jQuery, jQuery.jstree); 17 | } 18 | }(function ($, jstree, undefined) { 19 | "use strict"; 20 | 21 | if($.jstree.plugins.massload) { return; } 22 | 23 | /** 24 | * massload configuration 25 | * 26 | * It is possible to set this to a standard jQuery-like AJAX config. 27 | * In addition to the standard jQuery ajax options here you can supply functions for `data` and `url`, the functions will be run in the current instance's scope and a param will be passed indicating which node IDs need to be loaded, the return value of those functions will be used. 28 | * 29 | * You can also set this to a function, that function will receive the node IDs being loaded as argument and a second param which is a function (callback) which should be called with the result. 30 | * 31 | * Both the AJAX and the function approach rely on the same return value - an object where the keys are the node IDs, and the value is the children of that node as an array. 
32 | * 33 | * { 34 | * "id1" : [{ "text" : "Child of ID1", "id" : "c1" }, { "text" : "Another child of ID1", "id" : "c2" }], 35 | * "id2" : [{ "text" : "Child of ID2", "id" : "c3" }] 36 | * } 37 | * 38 | * @name $.jstree.defaults.massload 39 | * @plugin massload 40 | */ 41 | $.jstree.defaults.massload = null; 42 | $.jstree.plugins.massload = function (options, parent) { 43 | this.init = function (el, options) { 44 | this._data.massload = {}; 45 | parent.init.call(this, el, options); 46 | }; 47 | this._load_nodes = function (nodes, callback, is_callback, force_reload) { 48 | var s = this.settings.massload, 49 | nodesString = JSON.stringify(nodes), 50 | toLoad = [], 51 | m = this._model.data, 52 | i, j, dom; 53 | if (!is_callback) { 54 | for(i = 0, j = nodes.length; i < j; i++) { 55 | if(!m[nodes[i]] || ( (!m[nodes[i]].state.loaded && !m[nodes[i]].state.failed) || force_reload) ) { 56 | toLoad.push(nodes[i]); 57 | dom = this.get_node(nodes[i], true); 58 | if (dom && dom.length) { 59 | dom.addClass("jstree-loading").attr('aria-busy',true); 60 | } 61 | } 62 | } 63 | this._data.massload = {}; 64 | if (toLoad.length) { 65 | if($.isFunction(s)) { 66 | return s.call(this, toLoad, $.proxy(function (data) { 67 | var i, j; 68 | if(data) { 69 | for(i in data) { 70 | if(data.hasOwnProperty(i)) { 71 | this._data.massload[i] = data[i]; 72 | } 73 | } 74 | } 75 | for(i = 0, j = nodes.length; i < j; i++) { 76 | dom = this.get_node(nodes[i], true); 77 | if (dom && dom.length) { 78 | dom.removeClass("jstree-loading").attr('aria-busy',false); 79 | } 80 | } 81 | parent._load_nodes.call(this, nodes, callback, is_callback, force_reload); 82 | }, this)); 83 | } 84 | if(typeof s === 'object' && s && s.url) { 85 | s = $.extend(true, {}, s); 86 | if($.isFunction(s.url)) { 87 | s.url = s.url.call(this, toLoad); 88 | } 89 | if($.isFunction(s.data)) { 90 | s.data = s.data.call(this, toLoad); 91 | } 92 | return $.ajax(s) 93 | .done($.proxy(function (data,t,x) { 94 | var i, j; 95 | if(data) { 96 | for(i in data) { 97 | if(data.hasOwnProperty(i)) { 98 | this._data.massload[i] = data[i]; 99 | } 100 | } 101 | } 102 | for(i = 0, j = nodes.length; i < j; i++) { 103 | dom = this.get_node(nodes[i], true); 104 | if (dom && dom.length) { 105 | dom.removeClass("jstree-loading").attr('aria-busy',false); 106 | } 107 | } 108 | parent._load_nodes.call(this, nodes, callback, is_callback, force_reload); 109 | }, this)) 110 | .fail($.proxy(function (f) { 111 | parent._load_nodes.call(this, nodes, callback, is_callback, force_reload); 112 | }, this)); 113 | } 114 | } 115 | } 116 | return parent._load_nodes.call(this, nodes, callback, is_callback, force_reload); 117 | }; 118 | this._load_node = function (obj, callback) { 119 | var data = this._data.massload[obj.id], 120 | rslt = null, dom; 121 | if(data) { 122 | rslt = this[typeof data === 'string' ? '_append_html_data' : '_append_json_data']( 123 | obj, 124 | typeof data === 'string' ? 
$($.parseHTML(data)).filter(function () { return this.nodeType !== 3; }) : data, 125 | function (status) { callback.call(this, status); } 126 | ); 127 | dom = this.get_node(obj.id, true); 128 | if (dom && dom.length) { 129 | dom.removeClass("jstree-loading").attr('aria-busy',false); 130 | } 131 | delete this._data.massload[obj.id]; 132 | return rslt; 133 | } 134 | return parent._load_node.call(this, obj, callback); 135 | }; 136 | }; 137 | })); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/jstree.sort.js: -------------------------------------------------------------------------------- 1 | /** 2 | * ### Sort plugin 3 | * 4 | * Automatically sorts all siblings in the tree according to a sorting function. 5 | */ 6 | /*globals jQuery, define, exports, require */ 7 | (function (factory) { 8 | "use strict"; 9 | if (typeof define === 'function' && define.amd) { 10 | define('jstree.sort', ['jquery','jstree'], factory); 11 | } 12 | else if(typeof exports === 'object') { 13 | factory(require('jquery'), require('jstree')); 14 | } 15 | else { 16 | factory(jQuery, jQuery.jstree); 17 | } 18 | }(function ($, jstree, undefined) { 19 | "use strict"; 20 | 21 | if($.jstree.plugins.sort) { return; } 22 | 23 | /** 24 | * the settings function used to sort the nodes. 25 | * It is executed in the tree's context, accepts two nodes as arguments and should return `1` or `-1`. 26 | * @name $.jstree.defaults.sort 27 | * @plugin sort 28 | */ 29 | $.jstree.defaults.sort = function (a, b) { 30 | //return this.get_type(a) === this.get_type(b) ? (this.get_text(a) > this.get_text(b) ? 1 : -1) : this.get_type(a) >= this.get_type(b); 31 | return this.get_text(a) > this.get_text(b) ? 1 : -1; 32 | }; 33 | $.jstree.plugins.sort = function (options, parent) { 34 | this.bind = function () { 35 | parent.bind.call(this); 36 | this.element 37 | .on("model.jstree", $.proxy(function (e, data) { 38 | this.sort(data.parent, true); 39 | }, this)) 40 | .on("rename_node.jstree create_node.jstree", $.proxy(function (e, data) { 41 | this.sort(data.parent || data.node.parent, false); 42 | this.redraw_node(data.parent || data.node.parent, true); 43 | }, this)) 44 | .on("move_node.jstree copy_node.jstree", $.proxy(function (e, data) { 45 | this.sort(data.parent, false); 46 | this.redraw_node(data.parent, true); 47 | }, this)); 48 | }; 49 | /** 50 | * used to sort a node's children 51 | * @private 52 | * @name sort(obj [, deep]) 53 | * @param {mixed} obj the node 54 | * @param {Boolean} deep if set to `true` nodes are sorted recursively. 
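 * As a usage sketch, a custom comparator can be supplied through the plugin setting - e.g. a case-insensitive alphabetical ordering (illustrative only; `get_text` returns a node's text):
 *
 *	$.jstree.defaults.sort = function (a, b) {
 *		return this.get_text(a).toLowerCase() > this.get_text(b).toLowerCase() ? 1 : -1;
 *	};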
55 | * @plugin sort 56 | * 57 | */ 58 | this.sort = function (obj, deep) { 59 | var i, j; 60 | obj = this.get_node(obj); 61 | if(obj && obj.children && obj.children.length) { 62 | obj.children.sort($.proxy(this.settings.sort, this)); 63 | if(deep) { 64 | for(i = 0, j = obj.children_d.length; i < j; i++) { 65 | this.sort(obj.children_d[i], false); 66 | } 67 | } 68 | } 69 | }; 70 | }; 71 | 72 | // include the sort plugin by default 73 | // $.jstree.defaults.plugins.push("sort"); 74 | })); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/jstree.state.js: -------------------------------------------------------------------------------- 1 | /** 2 | * ### State plugin 3 | * 4 | * Saves the state of the tree (selected nodes, opened nodes) on the user's computer using available options (localStorage, cookies, etc.) 5 | */ 6 | /*globals jQuery, define, exports, require */ 7 | (function (factory) { 8 | "use strict"; 9 | if (typeof define === 'function' && define.amd) { 10 | define('jstree.state', ['jquery','jstree'], factory); 11 | } 12 | else if(typeof exports === 'object') { 13 | factory(require('jquery'), require('jstree')); 14 | } 15 | else { 16 | factory(jQuery, jQuery.jstree); 17 | } 18 | }(function ($, jstree, undefined) { 19 | "use strict"; 20 | 21 | if($.jstree.plugins.state) { return; } 22 | 23 | var to = false; 24 | /** 25 | * stores all defaults for the state plugin 26 | * @name $.jstree.defaults.state 27 | * @plugin state 28 | */ 29 | $.jstree.defaults.state = { 30 | /** 31 | * A string for the key to use when saving the current tree (change if using multiple trees in your project). Defaults to `jstree`. 32 | * @name $.jstree.defaults.state.key 33 | * @plugin state 34 | */ 35 | key : 'jstree', 36 | /** 37 | * A space separated list of events that trigger a state save. Defaults to `changed.jstree open_node.jstree close_node.jstree check_node.jstree uncheck_node.jstree`. 38 | * @name $.jstree.defaults.state.events 39 | * @plugin state 40 | */ 41 | events : 'changed.jstree open_node.jstree close_node.jstree check_node.jstree uncheck_node.jstree', 42 | /** 43 | * Time in milliseconds after which the state will expire. Defaults to `false`, meaning the saved state never expires. 44 | * @name $.jstree.defaults.state.ttl 45 | * @plugin state 46 | */ 47 | ttl : false, 48 | /** 49 | * A function that will be executed prior to restoring state with one argument - the state object. Can be used to clear unwanted parts of the state. 50 | * @name $.jstree.defaults.state.filter 51 | * @plugin state 52 | */ 53 | filter : false, 54 | /** 55 | * Should loaded nodes be restored? Setting this to `true` means the whole tree may end up loaded for some users - use with caution. Defaults to `false`. 56 | * @name $.jstree.defaults.state.preserve_loaded 57 | * @plugin state 58 | */ 59 | preserve_loaded : false 60 | }; 61 | $.jstree.plugins.state = function (options, parent) { 62 | this.bind = function () { 63 | parent.bind.call(this); 64 | var bind = $.proxy(function () { 65 | this.element.on(this.settings.state.events, $.proxy(function () { 66 | if(to) { clearTimeout(to); } 67 | to = setTimeout($.proxy(function () { this.save_state(); }, this), 100); 68 | }, this)); 69 | /** 70 | * triggered when the state plugin is finished restoring the state (and immediately after ready if there is no state to restore).
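 * A small usage sketch (the `#tree` selector is illustrative): code that should only run once the saved state has been applied can listen for this event:
 *
 *	$('#tree').on('state_ready.jstree', function (e, data) {
 *		// the selection and open nodes from the previous session are in place here
 *	});
 *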
71 | * @event 72 | * @name state_ready.jstree 73 | * @plugin state 74 | */ 75 | this.trigger('state_ready'); 76 | }, this); 77 | this.element 78 | .on("ready.jstree", $.proxy(function (e, data) { 79 | this.element.one("restore_state.jstree", bind); 80 | if(!this.restore_state()) { bind(); } 81 | }, this)); 82 | }; 83 | /** 84 | * save the state 85 | * @name save_state() 86 | * @plugin state 87 | */ 88 | this.save_state = function () { 89 | var tm = this.get_state(); 90 | if (!this.settings.state.preserve_loaded) { 91 | delete tm.core.loaded; 92 | } 93 | var st = { 'state' : tm, 'ttl' : this.settings.state.ttl, 'sec' : +(new Date()) }; 94 | $.vakata.storage.set(this.settings.state.key, JSON.stringify(st)); 95 | }; 96 | /** 97 | * restore the state from the user's computer 98 | * @name restore_state() 99 | * @plugin state 100 | */ 101 | this.restore_state = function () { 102 | var k = $.vakata.storage.get(this.settings.state.key); 103 | if(!!k) { try { k = JSON.parse(k); } catch(ex) { return false; } } 104 | if(!!k && k.ttl && k.sec && +(new Date()) - k.sec > k.ttl) { return false; } 105 | if(!!k && k.state) { k = k.state; } 106 | if(!!k && $.isFunction(this.settings.state.filter)) { k = this.settings.state.filter.call(this, k); } 107 | if(!!k) { 108 | if (!this.settings.state.preserve_loaded) { 109 | delete k.core.loaded; 110 | } 111 | this.element.one("set_state.jstree", function (e, data) { data.instance.trigger('restore_state', { 'state' : $.extend(true, {}, k) }); }); 112 | this.set_state(k); 113 | return true; 114 | } 115 | return false; 116 | }; 117 | /** 118 | * clear the state on the user's computer 119 | * @name clear_state() 120 | * @plugin state 121 | */ 122 | this.clear_state = function () { 123 | return $.vakata.storage.del(this.settings.state.key); 124 | }; 125 | }; 126 | 127 | (function ($, undefined) { 128 | $.vakata.storage = { 129 | // simply specifying the functions in FF throws an error 130 | set : function (key, val) { return window.localStorage.setItem(key, val); }, 131 | get : function (key) { return window.localStorage.getItem(key); }, 132 | del : function (key) { return window.localStorage.removeItem(key); } 133 | }; 134 | }($)); 135 | 136 | // include the state plugin by default 137 | // $.jstree.defaults.plugins.push("state"); 138 | })); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/jstree.wholerow.js: -------------------------------------------------------------------------------- 1 | /** 2 | * ### Wholerow plugin 3 | * 4 | * Makes each node appear block level, which makes selection easier. May cause slowdowns for large trees in old browsers.
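 * A minimal enabling sketch (the `#tree` selector is illustrative) - the plugin registers no defaults of its own and is switched on through the `plugins` array:
 *
 *	$('#tree').jstree({ 'plugins' : ['wholerow'] });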
5 | */ 6 | /*globals jQuery, define, exports, require */ 7 | (function (factory) { 8 | "use strict"; 9 | if (typeof define === 'function' && define.amd) { 10 | define('jstree.wholerow', ['jquery','jstree'], factory); 11 | } 12 | else if(typeof exports === 'object') { 13 | factory(require('jquery'), require('jstree')); 14 | } 15 | else { 16 | factory(jQuery, jQuery.jstree); 17 | } 18 | }(function ($, jstree, undefined) { 19 | "use strict"; 20 | 21 | if($.jstree.plugins.wholerow) { return; } 22 | 23 | var div = document.createElement('DIV'); 24 | div.setAttribute('unselectable','on'); 25 | div.setAttribute('role','presentation'); 26 | div.className = 'jstree-wholerow'; 27 | div.innerHTML = ' '; 28 | $.jstree.plugins.wholerow = function (options, parent) { 29 | this.bind = function () { 30 | parent.bind.call(this); 31 | 32 | this.element 33 | .on('ready.jstree set_state.jstree', $.proxy(function () { 34 | this.hide_dots(); 35 | }, this)) 36 | .on("init.jstree loading.jstree ready.jstree", $.proxy(function () { 37 | //div.style.height = this._data.core.li_height + 'px'; 38 | this.get_container_ul().addClass('jstree-wholerow-ul'); 39 | }, this)) 40 | .on("deselect_all.jstree", $.proxy(function (e, data) { 41 | this.element.find('.jstree-wholerow-clicked').removeClass('jstree-wholerow-clicked'); 42 | }, this)) 43 | .on("changed.jstree", $.proxy(function (e, data) { 44 | this.element.find('.jstree-wholerow-clicked').removeClass('jstree-wholerow-clicked'); 45 | var tmp = false, i, j; 46 | for(i = 0, j = data.selected.length; i < j; i++) { 47 | tmp = this.get_node(data.selected[i], true); 48 | if(tmp && tmp.length) { 49 | tmp.children('.jstree-wholerow').addClass('jstree-wholerow-clicked'); 50 | } 51 | } 52 | }, this)) 53 | .on("open_node.jstree", $.proxy(function (e, data) { 54 | this.get_node(data.node, true).find('.jstree-clicked').parent().children('.jstree-wholerow').addClass('jstree-wholerow-clicked'); 55 | }, this)) 56 | .on("hover_node.jstree dehover_node.jstree", $.proxy(function (e, data) { 57 | if(e.type === "hover_node" && this.is_disabled(data.node)) { return; } 58 | this.get_node(data.node, true).children('.jstree-wholerow')[e.type === "hover_node"?"addClass":"removeClass"]('jstree-wholerow-hovered'); 59 | }, this)) 60 | .on("contextmenu.jstree", ".jstree-wholerow", $.proxy(function (e) { 61 | if (this._data.contextmenu) { 62 | e.preventDefault(); 63 | var tmp = $.Event('contextmenu', { metaKey : e.metaKey, ctrlKey : e.ctrlKey, altKey : e.altKey, shiftKey : e.shiftKey, pageX : e.pageX, pageY : e.pageY }); 64 | $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor").first().trigger(tmp); 65 | } 66 | }, this)) 67 | /*! 
68 | .on("mousedown.jstree touchstart.jstree", ".jstree-wholerow", function (e) { 69 | if(e.target === e.currentTarget) { 70 | var a = $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor"); 71 | e.target = a[0]; 72 | a.trigger(e); 73 | } 74 | }) 75 | */ 76 | .on("click.jstree", ".jstree-wholerow", function (e) { 77 | e.stopImmediatePropagation(); 78 | var tmp = $.Event('click', { metaKey : e.metaKey, ctrlKey : e.ctrlKey, altKey : e.altKey, shiftKey : e.shiftKey }); 79 | $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor").first().trigger(tmp).focus(); 80 | }) 81 | .on("dblclick.jstree", ".jstree-wholerow", function (e) { 82 | e.stopImmediatePropagation(); 83 | var tmp = $.Event('dblclick', { metaKey : e.metaKey, ctrlKey : e.ctrlKey, altKey : e.altKey, shiftKey : e.shiftKey }); 84 | $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor").first().trigger(tmp).focus(); 85 | }) 86 | .on("click.jstree", ".jstree-leaf > .jstree-ocl", $.proxy(function (e) { 87 | e.stopImmediatePropagation(); 88 | var tmp = $.Event('click', { metaKey : e.metaKey, ctrlKey : e.ctrlKey, altKey : e.altKey, shiftKey : e.shiftKey }); 89 | $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor").first().trigger(tmp).focus(); 90 | }, this)) 91 | .on("mouseover.jstree", ".jstree-wholerow, .jstree-icon", $.proxy(function (e) { 92 | e.stopImmediatePropagation(); 93 | if(!this.is_disabled(e.currentTarget)) { 94 | this.hover_node(e.currentTarget); 95 | } 96 | return false; 97 | }, this)) 98 | .on("mouseleave.jstree", ".jstree-node", $.proxy(function (e) { 99 | this.dehover_node(e.currentTarget); 100 | }, this)); 101 | }; 102 | this.teardown = function () { 103 | if(this.settings.wholerow) { 104 | this.element.find(".jstree-wholerow").remove(); 105 | } 106 | parent.teardown.call(this); 107 | }; 108 | this.redraw_node = function(obj, deep, callback, force_render) { 109 | obj = parent.redraw_node.apply(this, arguments); 110 | if(obj) { 111 | var tmp = div.cloneNode(true); 112 | //tmp.style.height = this._data.core.li_height + 'px'; 113 | if($.inArray(obj.id, this._data.core.selected) !== -1) { tmp.className += ' jstree-wholerow-clicked'; } 114 | if(this._data.core.focused && this._data.core.focused === obj.id) { tmp.className += ' jstree-wholerow-hovered'; } 115 | obj.insertBefore(tmp, obj.childNodes[0]); 116 | } 117 | return obj; 118 | }; 119 | }; 120 | // include the wholerow plugin by default 121 | // $.jstree.defaults.plugins.push("wholerow"); 122 | })); 123 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/outro.js: -------------------------------------------------------------------------------- 1 | })); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/sample.js: -------------------------------------------------------------------------------- 1 | /*global jQuery */ 2 | // wrap in IIFE and pass jQuery as $ 3 | (function ($, undefined) { 4 | "use strict"; 5 | 6 | // some private plugin stuff if needed 7 | var private_var = null; 8 | 9 | // extending the defaults 10 | $.jstree.defaults.sample = { 11 | sample_option : 'sample_val' 12 | }; 13 | 14 | // the actual plugin code 15 | $.jstree.plugins.sample = function (options, parent) { 16 | // own function 17 | this.sample_function = function (arg) { 18 | // you can chain this method if needed and available 19 | if(parent.sample_function) { parent.sample_function.call(this, arg); } 20 | }; 21 | 22 | // 
*SPECIAL* FUNCTIONS 23 | this.init = function (el, options) { 24 | // do not forget parent 25 | parent.init.call(this, el, options); 26 | }; 27 | // bind events if needed 28 | this.bind = function () { 29 | // call parent function first 30 | parent.bind.call(this); 31 | // do(stuff); 32 | }; 33 | // unbind events if needed (all in jquery namespace are taken care of by the core) 34 | this.unbind = function () { 35 | // do(stuff); 36 | // call parent function last 37 | parent.unbind.call(this); 38 | }; 39 | this.teardown = function () { 40 | // do not forget parent 41 | parent.teardown.call(this); 42 | }; 43 | // state management - get and restore 44 | this.get_state = function () { 45 | // always get state from parent first 46 | var state = parent.get_state.call(this); 47 | // add own stuff to state 48 | state.sample = { 'var' : 'val' }; 49 | return state; 50 | }; 51 | this.set_state = function (state, callback) { 52 | // only process your part if parent returns true 53 | // there will be multiple times with false 54 | if(parent.set_state.call(this, state, callback)) { 55 | // check the key you set above 56 | if(state.sample) { 57 | // do(stuff); // like calling this.sample_function(state.sample.var); 58 | // remove your part of the state, call again and RETURN FALSE, the next cycle will be TRUE 59 | delete state.sample; 60 | this.set_state(state, callback); 61 | return false; 62 | } 63 | // return true if your state is gone (cleared in the previous step) 64 | return true; 65 | } 66 | // parent was false - return false too 67 | return false; 68 | }; 69 | // node transportation 70 | this.get_json = function (obj, options, flat) { 71 | // get the node from the parent 72 | var tmp = parent.get_json.call(this, obj, options, flat), i, j; 73 | if($.isArray(tmp)) { 74 | for(i = 0, j = tmp.length; i < j; i++) { 75 | tmp[i].sample = 'value'; 76 | } 77 | } 78 | else { 79 | tmp.sample = 'value'; 80 | } 81 | // return the original / modified node 82 | return tmp; 83 | }; 84 | }; 85 | 86 | // attach to document ready if needed 87 | $(function () { 88 | // do(stuff); 89 | }); 90 | 91 | // you can include the sample plugin in all instances by default 92 | $.jstree.defaults.plugins.push("sample"); 93 | })(jQuery); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/base.less: -------------------------------------------------------------------------------- 1 | // base jstree 2 | .jstree-node, .jstree-children, .jstree-container-ul { display:block; margin:0; padding:0; list-style-type:none; list-style-image:none; } 3 | .jstree-node { white-space:nowrap; } 4 | .jstree-anchor { display:inline-block; color:black; white-space:nowrap; padding:0 4px 0 1px; margin:0; vertical-align:top; } 5 | .jstree-anchor:focus { outline:0; } 6 | .jstree-anchor, .jstree-anchor:link, .jstree-anchor:visited, .jstree-anchor:hover, .jstree-anchor:active { text-decoration:none; color:inherit; } 7 | .jstree-icon { display:inline-block; text-decoration:none; margin:0; padding:0; vertical-align:top; text-align:center; } 8 | .jstree-icon:empty { display:inline-block; text-decoration:none; margin:0; padding:0; vertical-align:top; text-align:center; } 9 | .jstree-ocl { cursor:pointer; } 10 | .jstree-leaf > .jstree-ocl { cursor:default; } 11 | .jstree .jstree-open > .jstree-children { display:block; } 12 | .jstree .jstree-closed > .jstree-children, 13 | .jstree .jstree-leaf > .jstree-children { display:none; } 14 | .jstree-anchor > .jstree-themeicon { margin-right:2px; } 15 | 
.jstree-no-icons .jstree-themeicon, 16 | .jstree-anchor > .jstree-themeicon-hidden { display:none; } 17 | .jstree-hidden, .jstree-node.jstree-hidden { display:none; } 18 | 19 | // base jstree rtl 20 | .jstree-rtl { 21 | .jstree-anchor { padding:0 1px 0 4px; } 22 | .jstree-anchor > .jstree-themeicon { margin-left:2px; margin-right:0; } 23 | .jstree-node { margin-left:0; } 24 | .jstree-container-ul > .jstree-node { margin-right:0; } 25 | } 26 | 27 | // base jstree wholerow 28 | .jstree-wholerow-ul { 29 | position:relative; 30 | display:inline-block; 31 | min-width:100%; 32 | .jstree-leaf > .jstree-ocl { cursor:pointer; } 33 | .jstree-anchor, .jstree-icon { position:relative; } 34 | .jstree-wholerow { width:100%; cursor:pointer; position:absolute; left:0; -webkit-user-select:none; -moz-user-select:none; -ms-user-select:none; user-select:none; } 35 | } 36 | 37 | // base contextmenu 38 | .jstree-contextmenu .jstree-anchor { 39 | -webkit-user-select: none; /* disable selection/Copy of UIWebView */ 40 | -webkit-touch-callout: none; /* disable the IOS popup when long-press on a link */ 41 | } 42 | .vakata-context { 43 | display:none; 44 | &, ul { margin:0; padding:2px; position:absolute; background:#f5f5f5; border:1px solid #979797; box-shadow:2px 2px 2px #999999; } 45 | ul { list-style:none; left:100%; margin-top:-2.7em; margin-left:-4px; } 46 | .vakata-context-right ul { left:auto; right:100%; margin-left:auto; margin-right:-4px; } 47 | li { 48 | list-style:none; 49 | > a { 50 | display:block; padding:0 2em 0 2em; text-decoration:none; width:auto; color:black; white-space:nowrap; line-height:2.4em; text-shadow:1px 1px 0 white; border-radius:1px; 51 | &:hover { position:relative; background-color:#e8eff7; box-shadow:0 0 2px #0a6aa1; } 52 | &.vakata-context-parent { background-image:url("data:image/gif;base64,R0lGODlhCwAHAIAAACgoKP///yH5BAEAAAEALAAAAAALAAcAAAIORI4JlrqN1oMSnmmZDQUAOw=="); background-position:right center; background-repeat:no-repeat; } 53 | } 54 | > a:focus { outline:0; } 55 | } 56 | .vakata-context-hover > a { position:relative; background-color:#e8eff7; box-shadow:0 0 2px #0a6aa1; } 57 | .vakata-context-separator { 58 | > a, > a:hover { background:white; border:0; border-top:1px solid #e2e3e3; height:1px; min-height:1px; max-height:1px; padding:0; margin:0 0 0 2.4em; border-left:1px solid #e0e0e0; text-shadow:0 0 0 transparent; box-shadow:0 0 0 transparent; border-radius:0; } 59 | } 60 | .vakata-contextmenu-disabled { 61 | a, a:hover { color:silver; background-color:transparent; border:0; box-shadow:0 0 0; } 62 | } 63 | li > a { 64 | > i { text-decoration:none; display:inline-block; width:2.4em; height:2.4em; background:transparent; margin:0 0 0 -2em; vertical-align:top; text-align:center; line-height:2.4em; } 65 | > i:empty { width:2.4em; line-height:2.4em; } 66 | .vakata-contextmenu-sep { display:inline-block; width:1px; height:2.4em; background:white; margin:0 0.5em 0 0; border-left:1px solid #e2e3e3; } 67 | } 68 | .vakata-contextmenu-shortcut { font-size:0.8em; color:silver; opacity:0.5; display:none; } 69 | } 70 | .vakata-context-rtl { 71 | ul { left:auto; right:100%; margin-left:auto; margin-right:-4px; } 72 | li > a.vakata-context-parent { background-image:url("data:image/gif;base64,R0lGODlhCwAHAIAAACgoKP///yH5BAEAAAEALAAAAAALAAcAAAINjI+AC7rWHIsPtmoxLAA7"); background-position:left center; background-repeat:no-repeat; } 73 | .vakata-context-separator > a { margin:0 2.4em 0 0; border-left:0; border-right:1px solid #e2e3e3;} 74 | .vakata-context-left ul { right:auto; 
left:100%; margin-left:-4px; margin-right:auto; } 75 | li > a { 76 | > i { margin:0 -2em 0 0; } 77 | .vakata-contextmenu-sep { margin:0 0 0 0.5em; border-left-color:white; background:#e2e3e3; } 78 | } 79 | } 80 | 81 | // base drag'n'drop 82 | #jstree-marker { position: absolute; top:0; left:0; margin:-5px 0 0 0; padding:0; border-right:0; border-top:5px solid transparent; border-bottom:5px solid transparent; border-left:5px solid; width:0; height:0; font-size:0; line-height:0; } 83 | #jstree-dnd { 84 | line-height:16px; 85 | margin:0; 86 | padding:4px; 87 | .jstree-icon, 88 | .jstree-copy { display:inline-block; text-decoration:none; margin:0 2px 0 0; padding:0; width:16px; height:16px; } 89 | .jstree-ok { background:green; } 90 | .jstree-er { background:red; } 91 | .jstree-copy { margin:0 2px 0 2px; } 92 | } 93 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default-dark/32px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default-dark/32px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default-dark/40px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default-dark/40px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default-dark/style.less: -------------------------------------------------------------------------------- 1 | /* jsTree default dark theme */ 2 | @theme-name: default-dark; 3 | @hovered-bg-color: #555; 4 | @hovered-shadow-color: #555; 5 | @disabled-color: #666666; 6 | @disabled-bg-color: #333333; 7 | @clicked-bg-color: #5fa2db; 8 | @clicked-shadow-color: #666666; 9 | @clicked-gradient-color-1: #5fa2db; 10 | @clicked-gradient-color-2: #5fa2db; 11 | @search-result-color: #ffffff; 12 | @mobile-wholerow-bg-color: #333333; 13 | @mobile-wholerow-shadow: #111111; 14 | @mobile-wholerow-bordert: #666; 15 | @mobile-wholerow-borderb: #000; 16 | @responsive: true; 17 | @image-path: ""; 18 | @base-height: 40px; 19 | 20 | @import "../mixins.less"; 21 | @import "../base.less"; 22 | @import "../main.less"; 23 | 24 | .jstree-@{theme-name} { 25 | background:#333; 26 | .jstree-anchor { color:#999; text-shadow:1px 1px 0 rgba(0,0,0,0.5); } 27 | .jstree-clicked, .jstree-checked { color:white; } 28 | .jstree-hovered { color:white; } 29 | #jstree-marker& { 30 | border-left-color:#999; 31 | background:transparent; 32 | } 33 | .jstree-anchor > .jstree-icon { opacity:0.75; } 34 | .jstree-clicked > .jstree-icon, 35 | .jstree-hovered > .jstree-icon, 36 | .jstree-checked > .jstree-icon { opacity:1; } 37 | } 38 | // theme variants 39 | .jstree-@{theme-name} { 40 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAACAQMAAAB49I5GAAAABlBMVEUAAACZmZl+9SADAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMOBgAAGAAJMwQHdQAAAABJRU5ErkJggg=="); } 41 | &.jstree-rtl .jstree-last { background:transparent; } 42 | } 43 | .jstree-@{theme-name}-small { 44 | &.jstree-rtl .jstree-node { 
background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABIAAAACAQMAAABv1h6PAAAABlBMVEUAAACZmZl+9SADAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMHBgAAiABBI4gz9AAAAABJRU5ErkJggg=="); } 45 | &.jstree-rtl .jstree-last { background:transparent; } 46 | } 47 | .jstree-@{theme-name}-large { 48 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAACAQMAAAAD0EyKAAAABlBMVEUAAACZmZl+9SADAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjgIIGBgABCgCBvVLXcAAAAABJRU5ErkJggg=="); } 49 | &.jstree-rtl .jstree-last { background:transparent; } 50 | } -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default-dark/throbber.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default-dark/throbber.gif -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default/32px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default/32px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default/40px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default/40px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default/style.less: -------------------------------------------------------------------------------- 1 | /* jsTree default theme */ 2 | @theme-name: default; 3 | @hovered-bg-color: #e7f4f9; 4 | @hovered-shadow-color: #cccccc; 5 | @disabled-color: #666666; 6 | @disabled-bg-color: #efefef; 7 | @clicked-bg-color: #beebff; 8 | @clicked-shadow-color: #999999; 9 | @clicked-gradient-color-1: #beebff; 10 | @clicked-gradient-color-2: #a8e4ff; 11 | @search-result-color: #8b0000; 12 | @mobile-wholerow-bg-color: #ebebeb; 13 | @mobile-wholerow-shadow: #666666; 14 | @mobile-wholerow-bordert: rgba(255,255,255,0.7); 15 | @mobile-wholerow-borderb: rgba(64,64,64,0.2); 16 | @responsive: true; 17 | @image-path: ""; 18 | @base-height: 40px; 19 | 20 | @import "../mixins.less"; 21 | @import "../base.less"; 22 | @import "../main.less"; -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default/throbber.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default/throbber.gif -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/main.less: -------------------------------------------------------------------------------- 1 | .jstree-@{theme-name} { 2 | .jstree-node, 3 | .jstree-icon { background-repeat:no-repeat; background-color:transparent; } 4 | .jstree-anchor, 5 | .jstree-animated, 6 | .jstree-wholerow { transition:background-color 0.15s, box-shadow 0.15s; } 7 | .jstree-hovered { background:@hovered-bg-color; border-radius:2px; box-shadow:inset 
0 0 1px @hovered-shadow-color; } 8 | .jstree-context { background:@hovered-bg-color; border-radius:2px; box-shadow:inset 0 0 1px @hovered-shadow-color; } 9 | .jstree-clicked { background:@clicked-bg-color; border-radius:2px; box-shadow:inset 0 0 1px @clicked-shadow-color; } 10 | .jstree-no-icons .jstree-anchor > .jstree-themeicon { display:none; } 11 | .jstree-disabled { 12 | background:transparent; color:@disabled-color; 13 | &.jstree-hovered { background:transparent; box-shadow:none; } 14 | &.jstree-clicked { background:@disabled-bg-color; } 15 | > .jstree-icon { opacity:0.8; filter: url("data:image/svg+xml;utf8,#jstree-grayscale"); /* Firefox 10+ */ filter: gray; /* IE6-9 */ -webkit-filter: grayscale(100%); /* Chrome 19+ & Safari 6+ */ } 16 | } 17 | // search 18 | .jstree-search { font-style:italic; color:@search-result-color; font-weight:bold; } 19 | // checkboxes 20 | .jstree-no-checkboxes .jstree-checkbox { display:none !important; } 21 | &.jstree-checkbox-no-clicked { 22 | .jstree-clicked { 23 | background:transparent; 24 | box-shadow:none; 25 | &.jstree-hovered { background:@hovered-bg-color; } 26 | } 27 | > .jstree-wholerow-ul .jstree-wholerow-clicked { 28 | background:transparent; 29 | &.jstree-wholerow-hovered { background:@hovered-bg-color; } 30 | } 31 | } 32 | // stripes 33 | > .jstree-striped { min-width:100%; display:inline-block; background:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAkCAMAAAB/qqA+AAAABlBMVEUAAAAAAAClZ7nPAAAAAnRSTlMNAMM9s3UAAAAXSURBVHjajcEBAQAAAIKg/H/aCQZ70AUBjAATb6YPDgAAAABJRU5ErkJggg==") left top repeat; } 34 | // wholerow 35 | > .jstree-wholerow-ul .jstree-hovered, 36 | > .jstree-wholerow-ul .jstree-clicked { background:transparent; box-shadow:none; border-radius:0; } 37 | .jstree-wholerow { -moz-box-sizing:border-box; -webkit-box-sizing:border-box; box-sizing:border-box; } 38 | .jstree-wholerow-hovered { background:@hovered-bg-color; } 39 | .jstree-wholerow-clicked { .gradient(@clicked-gradient-color-1, @clicked-gradient-color-2); } 40 | } 41 | 42 | // theme variants 43 | .jstree-@{theme-name} { 44 | .jstree-theme(24px, "@{image-path}32px.png", 32px); 45 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAACAQMAAAB49I5GAAAABlBMVEUAAAAdHRvEkCwcAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMOBgAAGAAJMwQHdQAAAABJRU5ErkJggg=="); } 46 | &.jstree-rtl .jstree-last { background:transparent; } 47 | } 48 | .jstree-@{theme-name}-small { 49 | .jstree-theme(18px, "@{image-path}32px.png", 32px); 50 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABIAAAACAQMAAABv1h6PAAAABlBMVEUAAAAdHRvEkCwcAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMHBgAAiABBI4gz9AAAAABJRU5ErkJggg=="); } 51 | &.jstree-rtl .jstree-last { background:transparent; } 52 | } 53 | .jstree-@{theme-name}-large { 54 | .jstree-theme(32px, "@{image-path}32px.png", 32px); 55 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAACAQMAAAAD0EyKAAAABlBMVEUAAAAdHRvEkCwcAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjgIIGBgABCgCBvVLXcAAAAABJRU5ErkJggg=="); } 56 | &.jstree-rtl .jstree-last { background:transparent; } 57 | } 58 | 59 | // mobile theme attempt 60 | @media (max-width: 768px) { 61 | #jstree-dnd.jstree-dnd-responsive when (@responsive = true) { 62 | line-height:@base-height; font-weight:bold; font-size:1.1em; text-shadow:1px 1px white; 63 | > i { background:transparent; width:@base-height; height:@base-height; } 64 | > .jstree-ok { 
background-image:url("@{image-path}@{base-height}.png"); background-position:0 -(@base-height * 5); background-size:(@base-height * 3) (@base-height * 6); } 65 | > .jstree-er { background-image:url("@{image-path}@{base-height}.png"); background-position:-(@base-height * 1) -(@base-height * 5); background-size:(@base-height * 3) (@base-height * 6); } 66 | } 67 | #jstree-marker.jstree-dnd-responsive when (@responsive = true) { 68 | border-left-width:10px; 69 | border-top-width:10px; 70 | border-bottom-width:10px; 71 | margin-top:-10px; 72 | } 73 | } 74 | 75 | .jstree-@{theme-name}-responsive when (@responsive = true) { 76 | @import "responsive.less"; 77 | } 78 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/mixins.less: -------------------------------------------------------------------------------- 1 | .gradient (@color1; @color2) { 2 | background:@color1; 3 | background: -webkit-linear-gradient(top, @color1 0%,@color2 100%); 4 | background: linear-gradient(to bottom, @color1 0%,@color2 100%); 5 | } 6 | 7 | .jstree-theme (@base-height, @image, @image-height) { 8 | @correction: (@image-height - @base-height) / 2; 9 | 10 | .jstree-node { min-height:@base-height; line-height:@base-height; margin-left:@base-height; min-width:@base-height; } 11 | .jstree-anchor { line-height:@base-height; height:@base-height; } 12 | .jstree-icon { width:@base-height; height:@base-height; line-height:@base-height; } 13 | .jstree-icon:empty { width:@base-height; height:@base-height; line-height:@base-height; } 14 | &.jstree-rtl .jstree-node { margin-right:@base-height; } 15 | .jstree-wholerow { height:@base-height; } 16 | 17 | .jstree-node, 18 | .jstree-icon { background-image:url("@{image}"); } 19 | .jstree-node { background-position:-(@image-height * 9 + @correction) -@correction; background-repeat:repeat-y; } 20 | .jstree-last { background:transparent; } 21 | 22 | .jstree-open > .jstree-ocl { background-position:-(@image-height * 4 + @correction) -@correction; } 23 | .jstree-closed > .jstree-ocl { background-position:-(@image-height * 3 + @correction) -@correction; } 24 | .jstree-leaf > .jstree-ocl { background-position:-(@image-height * 2 + @correction) -@correction; } 25 | 26 | .jstree-themeicon { background-position:-(@image-height * 8 + @correction) -@correction; } 27 | 28 | > .jstree-no-dots { 29 | .jstree-node, 30 | .jstree-leaf > .jstree-ocl { background:transparent; } 31 | .jstree-open > .jstree-ocl { background-position:-(@image-height * 1 + @correction) -@correction; } 32 | .jstree-closed > .jstree-ocl { background-position:-@correction -@correction; } 33 | } 34 | 35 | .jstree-disabled { 36 | background:transparent; 37 | &.jstree-hovered { 38 | background:transparent; 39 | } 40 | &.jstree-clicked { 41 | background:#efefef; 42 | } 43 | } 44 | 45 | .jstree-checkbox { 46 | background-position:-(@image-height * 5 + @correction) -@correction; 47 | &:hover { background-position:-(@image-height * 5 + @correction) -(@image-height * 1 + @correction); } 48 | } 49 | 50 | &.jstree-checkbox-selection .jstree-clicked, .jstree-checked { 51 | > .jstree-checkbox { 52 | background-position:-(@image-height * 7 + @correction) -@correction; 53 | &:hover { background-position:-(@image-height * 7 + @correction) -(@image-height * 1 + @correction); } 54 | } 55 | } 56 | .jstree-anchor { 57 | > .jstree-undetermined { 58 | background-position:-(@image-height * 6 + @correction) -@correction; 59 | &:hover { 60 | background-position:-(@image-height * 6 + 
@correction) -(@image-height * 1 + @correction); 61 | } 62 | } 63 | } 64 | .jstree-checkbox-disabled { opacity:0.8; filter: url("data:image/svg+xml;utf8,#jstree-grayscale"); /* Firefox 10+ */ filter: gray; /* IE6-9 */ -webkit-filter: grayscale(100%); /* Chrome 19+ & Safari 6+ */ } 65 | 66 | > .jstree-striped { background-size:auto (@base-height * 2); } 67 | 68 | &.jstree-rtl { 69 | .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAACAQMAAAB49I5GAAAABlBMVEUAAAAdHRvEkCwcAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMOBgAAGAAJMwQHdQAAAABJRU5ErkJggg=="); background-position: 100% 1px; background-repeat:repeat-y; } 70 | .jstree-last { background:transparent; } 71 | .jstree-open > .jstree-ocl { background-position:-(@image-height * 4 + @correction) -(@image-height * 1 + @correction); } 72 | .jstree-closed > .jstree-ocl { background-position:-(@image-height * 3 + @correction) -(@image-height * 1 + @correction); } 73 | .jstree-leaf > .jstree-ocl { background-position:-(@image-height * 2 + @correction) -(@image-height * 1 + @correction); } 74 | > .jstree-no-dots { 75 | .jstree-node, 76 | .jstree-leaf > .jstree-ocl { background:transparent; } 77 | .jstree-open > .jstree-ocl { background-position:-(@image-height * 1 + @correction) -(@image-height * 1 + @correction); } 78 | .jstree-closed > .jstree-ocl { background-position:-@correction -(@image-height * 1 + @correction); } 79 | } 80 | } 81 | .jstree-themeicon-custom { background-color:transparent; background-image:none; background-position:0 0; } 82 | 83 | > .jstree-container-ul .jstree-loading > .jstree-ocl { background:url("@{image-path}throbber.gif") center center no-repeat; } 84 | 85 | .jstree-file { background:url("@{image}") -(@image-height * 3 + @correction) -(@image-height * 2 + @correction) no-repeat; } 86 | .jstree-folder { background:url("@{image}") -(@image-height * 8 + @correction) -(@correction) no-repeat; } 87 | 88 | > .jstree-container-ul > .jstree-node { margin-left:0; margin-right:0; } 89 | 90 | // drag'n'drop 91 | #jstree-dnd& { 92 | line-height:@base-height; padding:0 4px; 93 | .jstree-ok, 94 | .jstree-er { background-image:url("@{image-path}32px.png"); background-repeat:no-repeat; background-color:transparent; } 95 | i { background:transparent; width:@base-height; height:@base-height; line-height:@base-height; } 96 | .jstree-ok { background-position: -(@correction) -(@image-height * 2 + @correction); } 97 | .jstree-er { background-position: -(@image-height * 1 + @correction) -(@image-height * 2 + @correction); } 98 | } 99 | 100 | // ellipsis 101 | .jstree-ellipsis { overflow: hidden; } 102 | // base height + PADDINGS! 
103 | .jstree-ellipsis .jstree-anchor { width: calc(100% ~"-" (@base-height + 5px)); text-overflow: ellipsis; overflow: hidden; } 104 | .jstree-ellipsis.jstree-no-icons .jstree-anchor { width: calc(100% ~"-" 5px); } 105 | } 106 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/responsive.less: -------------------------------------------------------------------------------- 1 | @media (max-width: 768px) { 2 | // background image 3 | .jstree-icon { background-image:url("@{image-path}@{base-height}.png"); } 4 | 5 | .jstree-node, 6 | .jstree-leaf > .jstree-ocl { background:transparent; } 7 | 8 | .jstree-node { min-height:@base-height; line-height:@base-height; margin-left:@base-height; min-width:@base-height; white-space:nowrap; } 9 | .jstree-anchor { line-height:@base-height; height:@base-height; } 10 | .jstree-icon, .jstree-icon:empty { width:@base-height; height:@base-height; line-height:@base-height; } 11 | 12 | > .jstree-container-ul > .jstree-node { margin-left:0; } 13 | &.jstree-rtl .jstree-node { margin-left:0; margin-right:@base-height; background:transparent; } 14 | &.jstree-rtl .jstree-container-ul > .jstree-node { margin-right:0; } 15 | 16 | .jstree-ocl, 17 | .jstree-themeicon, 18 | .jstree-checkbox { background-size:(@base-height * 3) (@base-height * 6); } 19 | .jstree-leaf > .jstree-ocl, 20 | &.jstree-rtl .jstree-leaf > .jstree-ocl { background:transparent; } 21 | .jstree-open > .jstree-ocl { background-position:0 0px !important; } 22 | .jstree-closed > .jstree-ocl { background-position:0 -(@base-height * 1) !important; } 23 | &.jstree-rtl .jstree-closed > .jstree-ocl { background-position:-(@base-height * 1) 0px !important; } 24 | 25 | .jstree-themeicon { background-position:-(@base-height * 1) -(@base-height * 1); } 26 | 27 | .jstree-checkbox, .jstree-checkbox:hover { background-position:-(@base-height * 1) -(@base-height * 2); } 28 | &.jstree-checkbox-selection .jstree-clicked > .jstree-checkbox, 29 | &.jstree-checkbox-selection .jstree-clicked > .jstree-checkbox:hover, 30 | .jstree-checked > .jstree-checkbox, 31 | .jstree-checked > .jstree-checkbox:hover { background-position:0 -(@base-height * 2); } 32 | .jstree-anchor > .jstree-undetermined, .jstree-anchor > .jstree-undetermined:hover { background-position:0 -(@base-height * 3); } 33 | 34 | .jstree-anchor { font-weight:bold; font-size:1.1em; text-shadow:1px 1px white; } 35 | 36 | > .jstree-striped { background:transparent; } 37 | .jstree-wholerow { border-top:1px solid @mobile-wholerow-bordert; border-bottom:1px solid @mobile-wholerow-borderb; background:@mobile-wholerow-bg-color; height:@base-height; } 38 | .jstree-wholerow-hovered { background:@hovered-bg-color; } 39 | .jstree-wholerow-clicked { background:@clicked-bg-color; } 40 | 41 | // thanks to PHOTONUI 42 | .jstree-children .jstree-last > .jstree-wholerow { box-shadow: inset 0 -6px 3px -5px @mobile-wholerow-shadow; } 43 | .jstree-children .jstree-open > .jstree-wholerow { box-shadow: inset 0 6px 3px -5px @mobile-wholerow-shadow; border-top:0; } 44 | .jstree-children .jstree-open + .jstree-open { box-shadow:none; } 45 | 46 | // experiment 47 | .jstree-node, 48 | .jstree-icon, 49 | .jstree-node > .jstree-ocl, 50 | .jstree-themeicon, 51 | .jstree-checkbox { background-image:url("@{image-path}@{base-height}.png"); background-size:(@base-height * 3) (@base-height * 6); } 52 | 53 | .jstree-node { background-position:-(@base-height * 2) 0; background-repeat:repeat-y; } 54 | .jstree-last { 
background:transparent; } 55 | .jstree-leaf > .jstree-ocl { background-position:-(@base-height * 1) -(@base-height * 3); } 56 | .jstree-last > .jstree-ocl { background-position:-(@base-height * 1) -(@base-height * 4); } 57 | /* 58 | .jstree-open > .jstree-ocl, 59 | .jstree-closed > .jstree-ocl { border-radius:20px; background-color:white; } 60 | */ 61 | 62 | .jstree-themeicon-custom { background-color:transparent; background-image:none; background-position:0 0; } 63 | .jstree-file { background:url("@{image-path}@{base-height}.png") 0 -(@base-height * 4) no-repeat; background-size:(@base-height * 3) (@base-height * 6); } 64 | .jstree-folder { background:url("@{image-path}@{base-height}.png") -(@base-height * 1) -(@base-height * 1) no-repeat; background-size:(@base-height * 3) (@base-height * 6); } 65 | 66 | > .jstree-container-ul > .jstree-node { margin-left:0; margin-right:0; } 67 | } -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/vakata-jstree.js: -------------------------------------------------------------------------------- 1 | (function (factory) { 2 | "use strict"; 3 | if (typeof define === 'function' && define.amd) { 4 | define('jstree.checkbox', ['jquery','jstree'], factory); 5 | } 6 | else if(typeof exports === 'object') { 7 | factory(require('jquery'), require('jstree')); 8 | } 9 | else { 10 | factory(jQuery); 11 | } 12 | }(function ($, undefined) { 13 | "use strict"; 14 | if(document.registerElement && Object && Object.create) { 15 | var proto = Object.create(HTMLElement.prototype); 16 | proto.createdCallback = function () { 17 | var c = { core : {}, plugins : [] }, i; 18 | for(i in $.jstree.plugins) { 19 | if($.jstree.plugins.hasOwnProperty(i) && this.attributes[i]) { 20 | c.plugins.push(i); 21 | if(this.getAttribute(i) && JSON.parse(this.getAttribute(i))) { 22 | c[i] = JSON.parse(this.getAttribute(i)); 23 | } 24 | } 25 | } 26 | for(i in $.jstree.defaults.core) { 27 | if($.jstree.defaults.core.hasOwnProperty(i) && this.attributes[i]) { 28 | c.core[i] = JSON.parse(this.getAttribute(i)) || this.getAttribute(i); 29 | } 30 | } 31 | $(this).jstree(c); 32 | }; 33 | // proto.attributeChangedCallback = function (name, previous, value) { }; 34 | try { 35 | document.registerElement("vakata-jstree", { prototype: proto }); 36 | } catch(ignore) { } 37 | } 38 | })); 39 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/unit/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Basic Test Suite 6 | 7 | 8 | 9 | 10 | 11 |
12 | this had better work. <!-- #qunit-fixture text, asserted by test.js; the rest of the page's markup was stripped during extraction -->
13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/unit/libs/qunit.css: -------------------------------------------------------------------------------- 1 | /** 2 | * QUnit v1.12.0 - A JavaScript Unit Testing Framework 3 | * 4 | * http://qunitjs.com 5 | * 6 | * Copyright 2012 jQuery Foundation and other contributors 7 | * Released under the MIT license. 8 | * http://jquery.org/license 9 | */ 10 | 11 | /** Font Family and Sizes */ 12 | 13 | #qunit-tests, #qunit-header, #qunit-banner, #qunit-testrunner-toolbar, #qunit-userAgent, #qunit-testresult { 14 | font-family: "Helvetica Neue Light", "HelveticaNeue-Light", "Helvetica Neue", Calibri, Helvetica, Arial, sans-serif; 15 | } 16 | 17 | #qunit-testrunner-toolbar, #qunit-userAgent, #qunit-testresult, #qunit-tests li { font-size: small; } 18 | #qunit-tests { font-size: smaller; } 19 | 20 | 21 | /** Resets */ 22 | 23 | #qunit-tests, #qunit-header, #qunit-banner, #qunit-userAgent, #qunit-testresult, #qunit-modulefilter { 24 | margin: 0; 25 | padding: 0; 26 | } 27 | 28 | 29 | /** Header */ 30 | 31 | #qunit-header { 32 | padding: 0.5em 0 0.5em 1em; 33 | 34 | color: #8699a4; 35 | background-color: #0d3349; 36 | 37 | font-size: 1.5em; 38 | line-height: 1em; 39 | font-weight: normal; 40 | 41 | border-radius: 5px 5px 0 0; 42 | -moz-border-radius: 5px 5px 0 0; 43 | -webkit-border-top-right-radius: 5px; 44 | -webkit-border-top-left-radius: 5px; 45 | } 46 | 47 | #qunit-header a { 48 | text-decoration: none; 49 | color: #c2ccd1; 50 | } 51 | 52 | #qunit-header a:hover, 53 | #qunit-header a:focus { 54 | color: #fff; 55 | } 56 | 57 | #qunit-testrunner-toolbar label { 58 | display: inline-block; 59 | padding: 0 .5em 0 .1em; 60 | } 61 | 62 | #qunit-banner { 63 | height: 5px; 64 | } 65 | 66 | #qunit-testrunner-toolbar { 67 | padding: 0.5em 0 0.5em 2em; 68 | color: #5E740B; 69 | background-color: #eee; 70 | overflow: hidden; 71 | } 72 | 73 | #qunit-userAgent { 74 | padding: 0.5em 0 0.5em 2.5em; 75 | background-color: #2b81af; 76 | color: #fff; 77 | text-shadow: rgba(0, 0, 0, 0.5) 2px 2px 1px; 78 | } 79 | 80 | #qunit-modulefilter-container { 81 | float: right; 82 | } 83 | 84 | /** Tests: Pass/Fail */ 85 | 86 | #qunit-tests { 87 | list-style-position: inside; 88 | } 89 | 90 | #qunit-tests li { 91 | padding: 0.4em 0.5em 0.4em 2.5em; 92 | border-bottom: 1px solid #fff; 93 | list-style-position: inside; 94 | } 95 | 96 | #qunit-tests.hidepass li.pass, #qunit-tests.hidepass li.running { 97 | display: none; 98 | } 99 | 100 | #qunit-tests li strong { 101 | cursor: pointer; 102 | } 103 | 104 | #qunit-tests li a { 105 | padding: 0.5em; 106 | color: #c2ccd1; 107 | text-decoration: none; 108 | } 109 | #qunit-tests li a:hover, 110 | #qunit-tests li a:focus { 111 | color: #000; 112 | } 113 | 114 | #qunit-tests li .runtime { 115 | float: right; 116 | font-size: smaller; 117 | } 118 | 119 | .qunit-assert-list { 120 | margin-top: 0.5em; 121 | padding: 0.5em; 122 | 123 | background-color: #fff; 124 | 125 | border-radius: 5px; 126 | -moz-border-radius: 5px; 127 | -webkit-border-radius: 5px; 128 | } 129 | 130 | .qunit-collapsed { 131 | display: none; 132 | } 133 | 134 | #qunit-tests table { 135 | border-collapse: collapse; 136 | margin-top: .2em; 137 | } 138 | 139 | #qunit-tests th { 140 | text-align: right; 141 | vertical-align: top; 142 | padding: 0 .5em 0 0; 143 | } 144 | 145 | #qunit-tests td { 146 | vertical-align: top; 147 | } 148 | 149 | #qunit-tests pre { 150 | margin: 0; 151 | white-space: pre-wrap; 
152 | word-wrap: break-word; 153 | } 154 | 155 | #qunit-tests del { 156 | background-color: #e0f2be; 157 | color: #374e0c; 158 | text-decoration: none; 159 | } 160 | 161 | #qunit-tests ins { 162 | background-color: #ffcaca; 163 | color: #500; 164 | text-decoration: none; 165 | } 166 | 167 | /*** Test Counts */ 168 | 169 | #qunit-tests b.counts { color: black; } 170 | #qunit-tests b.passed { color: #5E740B; } 171 | #qunit-tests b.failed { color: #710909; } 172 | 173 | #qunit-tests li li { 174 | padding: 5px; 175 | background-color: #fff; 176 | border-bottom: none; 177 | list-style-position: inside; 178 | } 179 | 180 | /*** Passing Styles */ 181 | 182 | #qunit-tests li li.pass { 183 | color: #3c510c; 184 | background-color: #fff; 185 | border-left: 10px solid #C6E746; 186 | } 187 | 188 | #qunit-tests .pass { color: #528CE0; background-color: #D2E0E6; } 189 | #qunit-tests .pass .test-name { color: #366097; } 190 | 191 | #qunit-tests .pass .test-actual, 192 | #qunit-tests .pass .test-expected { color: #999999; } 193 | 194 | #qunit-banner.qunit-pass { background-color: #C6E746; } 195 | 196 | /*** Failing Styles */ 197 | 198 | #qunit-tests li li.fail { 199 | color: #710909; 200 | background-color: #fff; 201 | border-left: 10px solid #EE5757; 202 | white-space: pre; 203 | } 204 | 205 | #qunit-tests > li:last-child { 206 | border-radius: 0 0 5px 5px; 207 | -moz-border-radius: 0 0 5px 5px; 208 | -webkit-border-bottom-right-radius: 5px; 209 | -webkit-border-bottom-left-radius: 5px; 210 | } 211 | 212 | #qunit-tests .fail { color: #000000; background-color: #EE5757; } 213 | #qunit-tests .fail .test-name, 214 | #qunit-tests .fail .module-name { color: #000000; } 215 | 216 | #qunit-tests .fail .test-actual { color: #EE5757; } 217 | #qunit-tests .fail .test-expected { color: green; } 218 | 219 | #qunit-banner.qunit-fail { background-color: #EE5757; } 220 | 221 | 222 | /** Result */ 223 | 224 | #qunit-testresult { 225 | padding: 0.5em 0.5em 0.5em 2.5em; 226 | 227 | color: #2b81af; 228 | background-color: #D2E0E6; 229 | 230 | border-bottom: 1px solid white; 231 | } 232 | #qunit-testresult .module-name { 233 | font-weight: bold; 234 | } 235 | 236 | /** Fixture */ 237 | 238 | #qunit-fixture { 239 | position: absolute; 240 | top: -10000px; 241 | left: -10000px; 242 | width: 1000px; 243 | height: 1000px; 244 | } -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/unit/test.js: -------------------------------------------------------------------------------- 1 | test('basic test', function() { 2 | expect(1); 3 | ok(true, 'this had better work.'); 4 | }); 5 | 6 | 7 | test('can access the DOM', function() { 8 | expect(1); 9 | var fixture = document.getElementById('qunit-fixture'); 10 | equal(fixture.innerText || fixture.textContent, 'this had better work.', 'should be able to access the DOM.'); 11 | }); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/desktop/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Light theme visual tests 6 | 7 | 8 | 9 | 10 | 11 |
12 | <!-- lines 12-33: markup stripped during extraction; the rendered text shows an "asdf" demo tree, a tree with "Node 01" through "Node 05" (open nodes each holding two "Node" children), and two small "full" / "asdf" trees -->
34 | 35 | 36 | 37 | 43 | 44 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/mobile/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Mobile theme visual tests 6 | 7 | 8 | 9 | 10 | 11 |
12 | <!-- lines 12-31: markup stripped during extraction; the rendered text shows the same demo content as the desktop test - a tree with "Node 01" through "Node 05" (open nodes each holding two "Node" children) and two small "full" / "asdf" trees -->
32 | 33 | 34 | 35 | 41 | 42 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/screenshots/desktop/.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/desktop/.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/screenshots/desktop/desktop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/desktop/desktop.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/screenshots/desktop/home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/desktop/home.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/screenshots/mobile/.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/mobile/.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/screenshots/mobile/home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/mobile/home.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/screenshots/mobile/mobile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/mobile/mobile.png --------------------------------------------------------------------------------