├── .gitignore ├── LICENSE ├── README.md ├── data ├── Advertising.csv ├── Credit.csv ├── WineData.csv ├── airquality.csv ├── churn-bigml-20.csv ├── churn-bigml-80.csv ├── cuse_binary.csv ├── horseshoe_crab.csv ├── hsb2.csv ├── hsb2_modified.csv ├── iris.csv ├── kaggle-titanic-gender_submission.csv ├── kaggle-titanic-test.csv ├── kaggle-titanic-train.csv ├── mtcars.csv ├── prostate.csv ├── saved-mtcars │ ├── .part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv.crc │ ├── _SUCCESS │ └── part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv ├── saved-twitter │ ├── .part-00000.crc │ ├── _SUCCESS │ └── part-00000 ├── titanic │ ├── gender_submission.csv │ ├── test.csv │ └── train.csv └── twitter.txt ├── delete-readme.txt ├── images ├── simple-nlp-pipeline.png └── spark-pipeline.png ├── index.Rmd ├── index.html ├── legacy ├── 01_entry_points_to_spark.Rmd ├── 02_rdd_object.Rmd ├── 03_dataframe_object.Rmd ├── HashingTF-and-CountVectorizer.Rmd ├── categorical-data.Rmd ├── continuous-to-categorical-variable.Rmd ├── conversion-between-rdd-and-dataframe.Rmd ├── cross-validation-in-r.Rmd ├── decision-tree-classification.Rmd ├── dttreeC.Rmd ├── dttreeC.html ├── dttreeR.Rmd ├── fnn.Rmd ├── index.Rmd ├── information-extraction.Rmd ├── install.Rmd ├── k-folds-cross-validation.Rmd ├── kmeans.Rmd ├── linear-regression.Rmd ├── linearRegression.Rmd ├── logistic-regression.Rmd ├── machine-learning-framework.Rmd ├── nlp-and-nltk-basics.Rmd ├── nlpC.Rmd ├── nlpLDA.Rmd ├── pyspark-on-jupyter.Rmd ├── pyspark-on-rodeo.Rmd ├── pyspark-vectors.Rmd ├── pyspark.ml.feature-module.Rmd ├── r-markdown-header.Rmd ├── randomforest.Rmd ├── randomforestC.Rmd ├── regularization.Rmd ├── sna.Rmd ├── spark-on-jetstream-cloud.Rmd └── tf-idf.Rmd ├── link-spark-with-jupyter.md ├── logo.jpg ├── notebooks ├── 01-data-strcture │ ├── .gitignore │ ├── .ipynb_checkpoints │ │ ├── 1.1-rdd-checkpoint.ipynb │ │ ├── 1.2-dataframe-checkpoint.ipynb │ │ ├── 1.3-conversion-between-rdd-and-dataframe-checkpoint.ipynb │ │ └── 1.4-merge-and-split-columns-checkpoint.ipynb │ ├── 1.1-rdd.ipynb │ ├── 1.2-dataframe.ipynb │ ├── 1.3-conversion-between-rdd-and-dataframe.ipynb │ └── 1.4-merge-and-split-columns.ipynb ├── 02-data-manipulation │ ├── .ipynb_checkpoints │ │ ├── 2.1-map-functions-checkpoint.ipynb │ │ ├── 2.2-aggregate-functions-checkpoint.ipynb │ │ ├── 2.3-continuous-variable-to-categorical-variable-checkpoint.ipynb │ │ ├── 2.4-first-data-check-checkpoint.ipynb │ │ ├── 2.7.1-column-expression-checkpoint.ipynb │ │ ├── 2.7.3-boolean-column-expression-checkpoint.ipynb │ │ ├── 2.8-sql-functions-to-extend-column-expressions-checkpoint.ipynb │ │ └── 2.9-user-defined-sql-function (udf)-checkpoint.ipynb │ ├── 2.1-map-functions.ipynb │ ├── 2.2-aggregate-functions.ipynb │ ├── 2.3-continuous-variable-to-categorical-variable.ipynb │ ├── 2.4-first-data-check.ipynb │ ├── 2.5-subset-dataframe-by-row.ipynb │ ├── 2.6-subset-dataframe-by-column.ipynb │ ├── 2.7.1-column-expression.ipynb │ ├── 2.7.2-dot-column-expression.ipynb │ ├── 2.7.3-boolean-column-expression.ipynb │ ├── 2.8-sql-functions-to-extend-column-expressions.ipynb │ ├── 2.9-user-defined-sql-function (udf).ipynb │ └── import-and-export-data.ipynb ├── 03-data-preparation │ ├── stringindexer-and-onehotencoder.ipynb │ └── vector-assembler.ipynb ├── 04-miscellaneous │ ├── .ipynb_checkpoints │ │ └── user-defined-sql-function (udf)-checkpoint.ipynb │ ├── TF-IDF.ipynb │ ├── add-python-files-to-spark-cluster.ipynb │ ├── dense-vs-sparse-vectors.ipynb │ ├── issues-and-solutions.ipynb │ ├── pipeline.ipynb │ └── 
sql-functions.ipynb ├── 05-module-turning │ ├── cross-validation.ipynb │ └── regularization.ipynb ├── 06-machine-learning │ ├── classification │ │ ├── binary-classification.ipynb │ │ ├── decision-tree-classification.ipynb │ │ ├── gradient-boost-tree-classification.ipynb │ │ ├── logistic-regression.ipynb │ │ ├── naive-bayes-classification.ipynb │ │ └── random-forest-classification.ipynb │ └── regression │ │ ├── generalized-linear-regression.ipynb │ │ └── linear-regression.ipynb ├── 07-natural-language-processing │ ├── nlp-and-nltk-basics.ipynb │ ├── nlp-information-extraction.ipynb │ └── skills-needed-for-nlp-jobs.ipynb └── ipynb │ ├── .ipynb_checkpoints │ ├── DecisionTree-checkpoint.ipynb │ ├── Feedforward neural network(1)-checkpoint.ipynb │ ├── HashingTF-and-CountVectorizer-checkpoint.ipynb │ ├── NaiveBayes-checkpoint.ipynb │ └── RDD-manipulation-checkpoint.ipynb │ ├── Categoricaldata.ipynb │ ├── DataWrangling.ipynb │ ├── DecisionTree.ipynb │ ├── DecisionTreeC3.ipynb │ ├── DecisionTreeC7.ipynb │ ├── DecisionTreeR.ipynb │ ├── Feedforward neural network(1).ipynb │ ├── Feedforward neural network.ipynb │ ├── HashingTF-and-CountVectorizer.ipynb │ ├── LinearRegression.ipynb │ ├── NaiveBayes.ipynb │ ├── Natural Language Processing nb.ipynb │ ├── PysparkCluster.ipynb │ ├── RandomForest.ipynb │ ├── Regression.ipynb │ ├── derby.log │ ├── preproc.py │ └── vector.ipynb ├── pyFiles ├── .idea │ ├── misc.xml │ ├── modules.xml │ ├── pyFiles.iml │ └── workspace.xml └── my_module.py └── vakata-jstree-3.3.5 ├── .gitignore ├── LICENSE-MIT ├── README.md ├── bower.json ├── component.json ├── composer.json ├── demo ├── README.md └── basic │ ├── index.html │ └── root.json ├── dist ├── jstree.js ├── jstree.min.js └── themes │ ├── default-dark │ ├── 32px.png │ ├── 40px.png │ ├── style.css │ ├── style.min.css │ └── throbber.gif │ └── default │ ├── 32px.png │ ├── 40px.png │ ├── style.css │ ├── style.min.css │ └── throbber.gif ├── gruntfile.js ├── jstree.jquery.json ├── package.json ├── src ├── intro.js ├── jstree.changed.js ├── jstree.checkbox.js ├── jstree.conditionalselect.js ├── jstree.contextmenu.js ├── jstree.dnd.js ├── jstree.js ├── jstree.massload.js ├── jstree.search.js ├── jstree.sort.js ├── jstree.state.js ├── jstree.types.js ├── jstree.unique.js ├── jstree.wholerow.js ├── misc.js ├── outro.js ├── sample.js ├── themes │ ├── base.less │ ├── default-dark │ │ ├── 32px.png │ │ ├── 40px.png │ │ ├── style.css │ │ ├── style.less │ │ └── throbber.gif │ ├── default │ │ ├── 32px.png │ │ ├── 40px.png │ │ ├── style.css │ │ ├── style.less │ │ └── throbber.gif │ ├── main.less │ ├── mixins.less │ └── responsive.less └── vakata-jstree.js └── test ├── unit ├── index.html ├── libs │ ├── qunit.css │ └── qunit.js └── test.js └── visual ├── desktop └── index.html ├── mobile └── index.html └── screenshots ├── desktop ├── .png ├── desktop.png └── home.png └── mobile ├── .png ├── home.png └── mobile.png /.gitignore: -------------------------------------------------------------------------------- 1 | /.Rproj.user 2 | /.Rhistory 3 | .RData 4 | .Ruserdata 5 | .DS_Store 6 | *.ipybn 7 | /.ipynb_checkpoints 8 | /.idea 9 | .Rproj.user 10 | metastore_db 11 | *_cache 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Ming Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the 
"Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [Learning Apache Spark](https://mingchen0919.github.io/learning-apache-spark/) 2 | 3 | 4 | Website: https://mingchen0919.github.io/learning-apache-spark/index.html 5 | -------------------------------------------------------------------------------- /data/Advertising.csv: -------------------------------------------------------------------------------- 1 | TV,Radio,Newspaper,Sales 230.1,37.8,69.2,22.1 44.5,39.3,45.1,10.4 17.2,45.9,69.3,9.3 151.5,41.3,58.5,18.5 180.8,10.8,58.4,12.9 8.7,48.9,75,7.2 57.5,32.8,23.5,11.8 120.2,19.6,11.6,13.2 8.6,2.1,1,4.8 199.8,2.6,21.2,10.6 66.1,5.8,24.2,8.6 214.7,24,4,17.4 23.8,35.1,65.9,9.2 97.5,7.6,7.2,9.7 204.1,32.9,46,19 195.4,47.7,52.9,22.4 67.8,36.6,114,12.5 281.4,39.6,55.8,24.4 69.2,20.5,18.3,11.3 147.3,23.9,19.1,14.6 218.4,27.7,53.4,18 237.4,5.1,23.5,12.5 13.2,15.9,49.6,5.6 228.3,16.9,26.2,15.5 62.3,12.6,18.3,9.7 262.9,3.5,19.5,12 142.9,29.3,12.6,15 240.1,16.7,22.9,15.9 248.8,27.1,22.9,18.9 70.6,16,40.8,10.5 292.9,28.3,43.2,21.4 112.9,17.4,38.6,11.9 97.2,1.5,30,9.6 265.6,20,0.3,17.4 95.7,1.4,7.4,9.5 290.7,4.1,8.5,12.8 266.9,43.8,5,25.4 74.7,49.4,45.7,14.7 43.1,26.7,35.1,10.1 228,37.7,32,21.5 202.5,22.3,31.6,16.6 177,33.4,38.7,17.1 293.6,27.7,1.8,20.7 206.9,8.4,26.4,12.9 25.1,25.7,43.3,8.5 175.1,22.5,31.5,14.9 89.7,9.9,35.7,10.6 239.9,41.5,18.5,23.2 227.2,15.8,49.9,14.8 66.9,11.7,36.8,9.7 199.8,3.1,34.6,11.4 100.4,9.6,3.6,10.7 216.4,41.7,39.6,22.6 182.6,46.2,58.7,21.2 262.7,28.8,15.9,20.2 198.9,49.4,60,23.7 7.3,28.1,41.4,5.5 136.2,19.2,16.6,13.2 210.8,49.6,37.7,23.8 210.7,29.5,9.3,18.4 53.5,2,21.4,8.1 261.3,42.7,54.7,24.2 239.3,15.5,27.3,15.7 102.7,29.6,8.4,14 131.1,42.8,28.9,18 69,9.3,0.9,9.3 31.5,24.6,2.2,9.5 139.3,14.5,10.2,13.4 237.4,27.5,11,18.9 216.8,43.9,27.2,22.3 199.1,30.6,38.7,18.3 109.8,14.3,31.7,12.4 26.8,33,19.3,8.8 129.4,5.7,31.3,11 213.4,24.6,13.1,17 16.9,43.7,89.4,8.7 27.5,1.6,20.7,6.9 120.5,28.5,14.2,14.2 5.4,29.9,9.4,5.3 116,7.7,23.1,11 76.4,26.7,22.3,11.8 239.8,4.1,36.9,12.3 75.3,20.3,32.5,11.3 68.4,44.5,35.6,13.6 213.5,43,33.8,21.7 193.2,18.4,65.7,15.2 76.3,27.5,16,12 110.7,40.6,63.2,16 88.3,25.5,73.4,12.9 109.8,47.8,51.4,16.7 134.3,4.9,9.3,11.2 28.6,1.5,33,7.3 217.7,33.5,59,19.4 250.9,36.5,72.3,22.2 107.4,14,10.9,11.5 163.3,31.6,52.9,16.9 197.6,3.5,5.9,11.7 184.9,21,22,15.5 289.7,42.3,51.2,25.4 135.2,41.7,45.9,17.2 222.4,4.3,49.8,11.7 296.4,36.3,100.9,23.8 280.2,10.1,21.4,14.8 187.9,17.2,17.9,14.7 
238.2,34.3,5.3,20.7 137.9,46.4,59,19.2 25,11,29.7,7.2 90.4,0.3,23.2,8.7 13.1,0.4,25.6,5.3 255.4,26.9,5.5,19.8 225.8,8.2,56.5,13.4 241.7,38,23.2,21.8 175.7,15.4,2.4,14.1 209.6,20.6,10.7,15.9 78.2,46.8,34.5,14.6 75.1,35,52.7,12.6 139.2,14.3,25.6,12.2 76.4,0.8,14.8,9.4 125.7,36.9,79.2,15.9 19.4,16,22.3,6.6 141.3,26.8,46.2,15.5 18.8,21.7,50.4,7 224,2.4,15.6,11.6 123.1,34.6,12.4,15.2 229.5,32.3,74.2,19.7 87.2,11.8,25.9,10.6 7.8,38.9,50.6,6.6 80.2,0,9.2,8.8 220.3,49,3.2,24.7 59.6,12,43.1,9.7 0.7,39.6,8.7,1.6 265.2,2.9,43,12.7 8.4,27.2,2.1,5.7 219.8,33.5,45.1,19.6 36.9,38.6,65.6,10.8 48.3,47,8.5,11.6 25.6,39,9.3,9.5 273.7,28.9,59.7,20.8 43,25.9,20.5,9.6 184.9,43.9,1.7,20.7 73.4,17,12.9,10.9 193.7,35.4,75.6,19.2 220.5,33.2,37.9,20.1 104.6,5.7,34.4,10.4 96.2,14.8,38.9,11.4 140.3,1.9,9,10.3 240.1,7.3,8.7,13.2 243.2,49,44.3,25.4 38,40.3,11.9,10.9 44.7,25.8,20.6,10.1 280.7,13.9,37,16.1 121,8.4,48.7,11.6 197.6,23.3,14.2,16.6 171.3,39.7,37.7,19 187.8,21.1,9.5,15.6 4.1,11.6,5.7,3.2 93.9,43.5,50.5,15.3 149.8,1.3,24.3,10.1 11.7,36.9,45.2,7.3 131.7,18.4,34.6,12.9 172.5,18.1,30.7,14.4 85.7,35.8,49.3,13.3 188.4,18.1,25.6,14.9 163.5,36.8,7.4,18 117.2,14.7,5.4,11.9 234.5,3.4,84.8,11.9 17.9,37.6,21.6,8 206.8,5.2,19.4,12.2 215.4,23.6,57.6,17.1 284.3,10.6,6.4,15 50,11.6,18.4,8.4 164.5,20.9,47.4,14.5 19.6,20.1,17,7.6 168.4,7.1,12.8,11.7 222.4,3.4,13.1,11.5 276.9,48.9,41.8,27 248.4,30.2,20.3,20.2 170.2,7.8,35.2,11.7 276.7,2.3,23.7,11.8 165.6,10,17.6,12.6 156.6,2.6,8.3,10.5 218.5,5.4,27.4,12.2 56.2,5.7,29.7,8.7 287.6,43,71.8,26.2 253.8,21.3,30,17.6 205,45.1,19.6,22.6 139.5,2.1,26.6,10.3 191.1,28.7,18.2,17.3 286,13.9,3.7,15.9 18.7,12.1,23.4,6.7 39.5,41.1,5.8,10.8 75.5,10.8,6,9.9 17.2,4.1,31.6,5.9 166.8,42,3.6,19.6 149.7,35.6,6,17.3 38.2,3.7,13.8,7.6 94.2,4.9,8.1,9.7 177,9.3,6.4,12.8 283.6,42,66.2,25.5 232.1,8.6,8.7,13.4 -------------------------------------------------------------------------------- /data/airquality.csv: -------------------------------------------------------------------------------- 1 | "ozone","solar.r","wind","temp","month","day" 2 | 41,190,7.4,67,5,1 3 | 36,118,8,72,5,2 4 | 12,149,12.6,74,5,3 5 | 18,313,11.5,62,5,4 6 | NA,NA,14.3,56,5,5 7 | 28,NA,14.9,66,5,6 8 | 23,299,8.6,65,5,7 9 | 19,99,13.8,59,5,8 10 | 8,19,20.1,61,5,9 11 | NA,194,8.6,69,5,10 12 | 7,NA,6.9,74,5,11 13 | 16,256,9.7,69,5,12 14 | 11,290,9.2,66,5,13 15 | 14,274,10.9,68,5,14 16 | 18,65,13.2,58,5,15 17 | 14,334,11.5,64,5,16 18 | 34,307,12,66,5,17 19 | 6,78,18.4,57,5,18 20 | 30,322,11.5,68,5,19 21 | 11,44,9.7,62,5,20 22 | 1,8,9.7,59,5,21 23 | 11,320,16.6,73,5,22 24 | 4,25,9.7,61,5,23 25 | 32,92,12,61,5,24 26 | NA,66,16.6,57,5,25 27 | NA,266,14.9,58,5,26 28 | NA,NA,8,57,5,27 29 | 23,13,12,67,5,28 30 | 45,252,14.9,81,5,29 31 | 115,223,5.7,79,5,30 32 | 37,279,7.4,76,5,31 33 | NA,286,8.6,78,6,1 34 | NA,287,9.7,74,6,2 35 | NA,242,16.1,67,6,3 36 | NA,186,9.2,84,6,4 37 | NA,220,8.6,85,6,5 38 | NA,264,14.3,79,6,6 39 | 29,127,9.7,82,6,7 40 | NA,273,6.9,87,6,8 41 | 71,291,13.8,90,6,9 42 | 39,323,11.5,87,6,10 43 | NA,259,10.9,93,6,11 44 | NA,250,9.2,92,6,12 45 | 23,148,8,82,6,13 46 | NA,332,13.8,80,6,14 47 | NA,322,11.5,79,6,15 48 | 21,191,14.9,77,6,16 49 | 37,284,20.7,72,6,17 50 | 20,37,9.2,65,6,18 51 | 12,120,11.5,73,6,19 52 | 13,137,10.3,76,6,20 53 | NA,150,6.3,77,6,21 54 | NA,59,1.7,76,6,22 55 | NA,91,4.6,76,6,23 56 | NA,250,6.3,76,6,24 57 | NA,135,8,75,6,25 58 | NA,127,8,78,6,26 59 | NA,47,10.3,73,6,27 60 | NA,98,11.5,80,6,28 61 | NA,31,14.9,77,6,29 62 | NA,138,8,83,6,30 63 | 135,269,4.1,84,7,1 64 | 49,248,9.2,85,7,2 65 | 32,236,9.2,81,7,3 
66 | NA,101,10.9,84,7,4 67 | 64,175,4.6,83,7,5 68 | 40,314,10.9,83,7,6 69 | 77,276,5.1,88,7,7 70 | 97,267,6.3,92,7,8 71 | 97,272,5.7,92,7,9 72 | 85,175,7.4,89,7,10 73 | NA,139,8.6,82,7,11 74 | 10,264,14.3,73,7,12 75 | 27,175,14.9,81,7,13 76 | NA,291,14.9,91,7,14 77 | 7,48,14.3,80,7,15 78 | 48,260,6.9,81,7,16 79 | 35,274,10.3,82,7,17 80 | 61,285,6.3,84,7,18 81 | 79,187,5.1,87,7,19 82 | 63,220,11.5,85,7,20 83 | 16,7,6.9,74,7,21 84 | NA,258,9.7,81,7,22 85 | NA,295,11.5,82,7,23 86 | 80,294,8.6,86,7,24 87 | 108,223,8,85,7,25 88 | 20,81,8.6,82,7,26 89 | 52,82,12,86,7,27 90 | 82,213,7.4,88,7,28 91 | 50,275,7.4,86,7,29 92 | 64,253,7.4,83,7,30 93 | 59,254,9.2,81,7,31 94 | 39,83,6.9,81,8,1 95 | 9,24,13.8,81,8,2 96 | 16,77,7.4,82,8,3 97 | 78,NA,6.9,86,8,4 98 | 35,NA,7.4,85,8,5 99 | 66,NA,4.6,87,8,6 100 | 122,255,4,89,8,7 101 | 89,229,10.3,90,8,8 102 | 110,207,8,90,8,9 103 | NA,222,8.6,92,8,10 104 | NA,137,11.5,86,8,11 105 | 44,192,11.5,86,8,12 106 | 28,273,11.5,82,8,13 107 | 65,157,9.7,80,8,14 108 | NA,64,11.5,79,8,15 109 | 22,71,10.3,77,8,16 110 | 59,51,6.3,79,8,17 111 | 23,115,7.4,76,8,18 112 | 31,244,10.9,78,8,19 113 | 44,190,10.3,78,8,20 114 | 21,259,15.5,77,8,21 115 | 9,36,14.3,72,8,22 116 | NA,255,12.6,75,8,23 117 | 45,212,9.7,79,8,24 118 | 168,238,3.4,81,8,25 119 | 73,215,8,86,8,26 120 | NA,153,5.7,88,8,27 121 | 76,203,9.7,97,8,28 122 | 118,225,2.3,94,8,29 123 | 84,237,6.3,96,8,30 124 | 85,188,6.3,94,8,31 125 | 96,167,6.9,91,9,1 126 | 78,197,5.1,92,9,2 127 | 73,183,2.8,93,9,3 128 | 91,189,4.6,93,9,4 129 | 47,95,7.4,87,9,5 130 | 32,92,15.5,84,9,6 131 | 20,252,10.9,80,9,7 132 | 23,220,10.3,78,9,8 133 | 21,230,10.9,75,9,9 134 | 24,259,9.7,73,9,10 135 | 44,236,14.9,81,9,11 136 | 21,259,15.5,76,9,12 137 | 28,238,6.3,77,9,13 138 | 9,24,10.9,71,9,14 139 | 13,112,11.5,71,9,15 140 | 46,237,6.9,78,9,16 141 | 18,224,13.8,67,9,17 142 | 13,27,10.3,76,9,18 143 | 24,238,10.3,68,9,19 144 | 16,201,8,82,9,20 145 | 13,238,12.6,64,9,21 146 | 23,14,9.2,71,9,22 147 | 36,139,10.3,81,9,23 148 | 7,49,10.3,69,9,24 149 | 14,20,16.6,63,9,25 150 | 30,193,6.9,70,9,26 151 | NA,145,13.2,77,9,27 152 | 14,191,14.3,75,9,28 153 | 18,131,8,76,9,29 154 | 20,223,11.5,68,9,30 155 | -------------------------------------------------------------------------------- /data/horseshoe_crab.csv: -------------------------------------------------------------------------------- 1 | C,S,W,Wt,Sa 2 | 2,3,28.3,3.05,8 3 | 3,3,26,2.6,4 4 | 3,3,25.6,2.15,0 5 | 4,2,21,1.85,0 6 | 2,3,29,3,1 7 | 1,2,25,2.3,3 8 | 4,3,26.2,1.3,0 9 | 2,3,24.9,2.1,0 10 | 2,1,25.7,2,8 11 | 2,3,27.5,3.15,6 12 | 1,1,26.1,2.8,5 13 | 3,3,28.9,2.8,4 14 | 2,1,30.3,3.6,3 15 | 2,3,22.9,1.6,4 16 | 3,3,26.2,2.3,3 17 | 3,3,24.5,2.05,5 18 | 2,3,30,3.05,8 19 | 2,3,26.2,2.4,3 20 | 2,3,25.4,2.25,6 21 | 2,3,25.4,2.25,4 22 | 4,3,27.5,2.9,0 23 | 4,3,27,2.25,3 24 | 2,2,24,1.7,0 25 | 2,1,28.7,3.2,0 26 | 3,3,26.5,1.97,1 27 | 2,3,24.5,1.6,1 28 | 3,3,27.3,2.9,1 29 | 2,3,26.5,2.3,4 30 | 2,3,25,2.1,2 31 | 3,3,22,1.4,0 32 | 1,1,30.2,3.28,2 33 | 2,2,25.4,2.3,0 34 | 2,1,24.9,2.3,6 35 | 4,3,25.8,2.25,10 36 | 3,3,27.2,2.4,5 37 | 2,3,30.5,3.32,3 38 | 4,3,25,2.1,8 39 | 2,3,30,3,9 40 | 2,1,22.9,1.6,0 41 | 2,3,23.9,1.85,2 42 | 2,3,26,2.28,3 43 | 2,3,25.8,2.2,0 44 | 3,3,29,3.28,4 45 | 1,1,26.5,2.35,0 46 | 3,3,22.5,1.55,0 47 | 2,3,23.8,2.1,0 48 | 3,3,24.3,2.15,0 49 | 2,1,26,2.3,14 50 | 4,3,24.7,2.2,0 51 | 2,1,22.5,1.6,1 52 | 2,3,28.7,3.15,3 53 | 1,1,29.3,3.2,4 54 | 2,1,26.7,2.7,5 55 | 4,3,23.4,1.9,0 56 | 1,1,27.7,2.5,6 57 | 2,3,28.2,2.6,6 58 | 4,3,24.7,2.1,5 59 | 2,1,25.7,2,5 60 | 2,1,27.8,2.75,0 61 | 
3,1,27,2.45,3 62 | 2,3,29,3.2,10 63 | 3,3,25.6,2.8,7 64 | 3,3,24.2,1.9,0 65 | 3,3,25.7,1.2,0 66 | 3,3,23.1,1.65,0 67 | 2,3,28.5,3.05,0 68 | 2,1,29.7,3.85,5 69 | 3,3,23.1,1.55,0 70 | 3,3,24.5,2.2,1 71 | 2,3,27.5,2.55,1 72 | 2,3,26.3,2.4,1 73 | 2,3,27.8,3.25,3 74 | 2,3,31.9,3.33,2 75 | 2,3,25,2.4,5 76 | 3,3,26.2,2.22,0 77 | 3,3,28.4,3.2,3 78 | 1,2,24.5,1.95,6 79 | 2,3,27.9,3.05,7 80 | 2,2,25,2.25,6 81 | 3,3,29,2.92,3 82 | 2,1,31.7,3.73,4 83 | 2,3,27.6,2.85,4 84 | 4,3,24.5,1.9,0 85 | 3,3,23.8,1.8,0 86 | 2,3,28.2,3.05,8 87 | 3,3,24.1,1.8,0 88 | 1,1,28,2.62,0 89 | 1,1,26,2.3,9 90 | 3,2,24.7,1.9,0 91 | 2,3,25.8,2.65,0 92 | 1,1,27.1,2.95,8 93 | 2,3,27.4,2.7,5 94 | 3,3,26.7,2.6,2 95 | 2,1,26.8,2.7,5 96 | 1,3,25.8,2.6,0 97 | 4,3,23.7,1.85,0 98 | 2,3,27.9,2.8,6 99 | 2,1,30,3.3,5 100 | 2,3,25,2.1,4 101 | 2,3,27.7,2.9,5 102 | 2,3,28.3,3,15 103 | 4,3,25.5,2.25,0 104 | 2,3,26,2.15,5 105 | 2,3,26.2,2.4,0 106 | 3,3,23,1.65,1 107 | 2,2,22.9,1.6,0 108 | 2,3,25.1,2.1,5 109 | 3,1,25.9,2.55,4 110 | 4,1,25.5,2.75,0 111 | 2,1,26.8,2.55,0 112 | 2,1,29,2.8,1 113 | 3,3,28.5,3,1 114 | 2,2,24.7,2.55,4 115 | 2,3,29,3.1,1 116 | 2,3,27,2.5,6 117 | 4,3,23.7,1.8,0 118 | 3,3,27,2.5,6 119 | 2,3,24.2,1.65,2 120 | 4,3,22.5,1.47,4 121 | 2,3,25.1,1.8,0 122 | 2,3,24.9,2.2,0 123 | 2,3,27.5,2.63,6 124 | 2,1,24.3,2,0 125 | 2,3,29.5,3.02,4 126 | 2,3,26.2,2.3,0 127 | 2,3,24.7,1.95,4 128 | 3,2,29.8,3.5,4 129 | 4,3,25.7,2.15,0 130 | 3,3,26.2,2.17,2 131 | 4,3,27,2.63,0 132 | 3,3,24.8,2.1,0 133 | 2,1,23.7,1.95,0 134 | 2,3,28.2,3.05,11 135 | 2,3,25.2,2,1 136 | 2,2,23.2,1.95,4 137 | 4,3,25.8,2,3 138 | 4,3,27.5,2.6,0 139 | 2,2,25.7,2,0 140 | 2,3,26.8,2.65,0 141 | 3,3,27.5,3.1,3 142 | 3,1,28.5,3.25,9 143 | 2,3,28.5,3,3 144 | 1,1,27.4,2.7,6 145 | 2,3,27.2,2.7,3 146 | 3,3,27.1,2.55,0 147 | 2,3,28,2.8,1 148 | 2,1,26.5,1.3,0 149 | 3,3,23,1.8,0 150 | 3,2,26,2.2,3 151 | 3,2,24.5,2.25,0 152 | 2,3,25.8,2.3,0 153 | 4,3,23.5,1.9,0 154 | 4,3,26.7,2.45,0 155 | 3,3,25.5,2.25,0 156 | 2,3,28.2,2.87,1 157 | 2,1,25.2,2,1 158 | 2,3,25.3,1.9,2 159 | 3,3,25.7,2.1,0 160 | 4,3,29.3,3.23,12 161 | 3,3,23.8,1.8,6 162 | 2,3,27.4,2.9,3 163 | 2,3,26.2,2.02,2 164 | 2,1,28,2.9,4 165 | 2,1,28.4,3.1,5 166 | 2,1,33.5,5.2,7 167 | 2,3,25.8,2.4,0 168 | 3,3,24,1.9,10 169 | 2,1,23.1,2,0 170 | 2,3,28.3,3.2,0 171 | 2,3,26.5,2.35,4 172 | 2,3,26.5,2.75,7 173 | 3,3,26.1,2.75,3 174 | 2,2,24.5,2,0 -------------------------------------------------------------------------------- /data/iris.csv: -------------------------------------------------------------------------------- 1 | sepal_length,sepal_width,petal_length,petal_width,species 2 | 5.1,3.5,1.4,0.2,setosa 3 | 4.9,3,1.4,0.2,setosa 4 | 4.7,3.2,1.3,0.2,setosa 5 | 4.6,3.1,1.5,0.2,setosa 6 | 5,3.6,1.4,0.2,setosa 7 | 5.4,3.9,1.7,0.4,setosa 8 | 4.6,3.4,1.4,0.3,setosa 9 | 5,3.4,1.5,0.2,setosa 10 | 4.4,2.9,1.4,0.2,setosa 11 | 4.9,3.1,1.5,0.1,setosa 12 | 5.4,3.7,1.5,0.2,setosa 13 | 4.8,3.4,1.6,0.2,setosa 14 | 4.8,3,1.4,0.1,setosa 15 | 4.3,3,1.1,0.1,setosa 16 | 5.8,4,1.2,0.2,setosa 17 | 5.7,4.4,1.5,0.4,setosa 18 | 5.4,3.9,1.3,0.4,setosa 19 | 5.1,3.5,1.4,0.3,setosa 20 | 5.7,3.8,1.7,0.3,setosa 21 | 5.1,3.8,1.5,0.3,setosa 22 | 5.4,3.4,1.7,0.2,setosa 23 | 5.1,3.7,1.5,0.4,setosa 24 | 4.6,3.6,1,0.2,setosa 25 | 5.1,3.3,1.7,0.5,setosa 26 | 4.8,3.4,1.9,0.2,setosa 27 | 5,3,1.6,0.2,setosa 28 | 5,3.4,1.6,0.4,setosa 29 | 5.2,3.5,1.5,0.2,setosa 30 | 5.2,3.4,1.4,0.2,setosa 31 | 4.7,3.2,1.6,0.2,setosa 32 | 4.8,3.1,1.6,0.2,setosa 33 | 5.4,3.4,1.5,0.4,setosa 34 | 5.2,4.1,1.5,0.1,setosa 35 | 5.5,4.2,1.4,0.2,setosa 36 | 4.9,3.1,1.5,0.1,setosa 37 | 
5,3.2,1.2,0.2,setosa 38 | 5.5,3.5,1.3,0.2,setosa 39 | 4.9,3.1,1.5,0.1,setosa 40 | 4.4,3,1.3,0.2,setosa 41 | 5.1,3.4,1.5,0.2,setosa 42 | 5,3.5,1.3,0.3,setosa 43 | 4.5,2.3,1.3,0.3,setosa 44 | 4.4,3.2,1.3,0.2,setosa 45 | 5,3.5,1.6,0.6,setosa 46 | 5.1,3.8,1.9,0.4,setosa 47 | 4.8,3,1.4,0.3,setosa 48 | 5.1,3.8,1.6,0.2,setosa 49 | 4.6,3.2,1.4,0.2,setosa 50 | 5.3,3.7,1.5,0.2,setosa 51 | 5,3.3,1.4,0.2,setosa 52 | 7,3.2,4.7,1.4,versicolor 53 | 6.4,3.2,4.5,1.5,versicolor 54 | 6.9,3.1,4.9,1.5,versicolor 55 | 5.5,2.3,4,1.3,versicolor 56 | 6.5,2.8,4.6,1.5,versicolor 57 | 5.7,2.8,4.5,1.3,versicolor 58 | 6.3,3.3,4.7,1.6,versicolor 59 | 4.9,2.4,3.3,1,versicolor 60 | 6.6,2.9,4.6,1.3,versicolor 61 | 5.2,2.7,3.9,1.4,versicolor 62 | 5,2,3.5,1,versicolor 63 | 5.9,3,4.2,1.5,versicolor 64 | 6,2.2,4,1,versicolor 65 | 6.1,2.9,4.7,1.4,versicolor 66 | 5.6,2.9,3.6,1.3,versicolor 67 | 6.7,3.1,4.4,1.4,versicolor 68 | 5.6,3,4.5,1.5,versicolor 69 | 5.8,2.7,4.1,1,versicolor 70 | 6.2,2.2,4.5,1.5,versicolor 71 | 5.6,2.5,3.9,1.1,versicolor 72 | 5.9,3.2,4.8,1.8,versicolor 73 | 6.1,2.8,4,1.3,versicolor 74 | 6.3,2.5,4.9,1.5,versicolor 75 | 6.1,2.8,4.7,1.2,versicolor 76 | 6.4,2.9,4.3,1.3,versicolor 77 | 6.6,3,4.4,1.4,versicolor 78 | 6.8,2.8,4.8,1.4,versicolor 79 | 6.7,3,5,1.7,versicolor 80 | 6,2.9,4.5,1.5,versicolor 81 | 5.7,2.6,3.5,1,versicolor 82 | 5.5,2.4,3.8,1.1,versicolor 83 | 5.5,2.4,3.7,1,versicolor 84 | 5.8,2.7,3.9,1.2,versicolor 85 | 6,2.7,5.1,1.6,versicolor 86 | 5.4,3,4.5,1.5,versicolor 87 | 6,3.4,4.5,1.6,versicolor 88 | 6.7,3.1,4.7,1.5,versicolor 89 | 6.3,2.3,4.4,1.3,versicolor 90 | 5.6,3,4.1,1.3,versicolor 91 | 5.5,2.5,4,1.3,versicolor 92 | 5.5,2.6,4.4,1.2,versicolor 93 | 6.1,3,4.6,1.4,versicolor 94 | 5.8,2.6,4,1.2,versicolor 95 | 5,2.3,3.3,1,versicolor 96 | 5.6,2.7,4.2,1.3,versicolor 97 | 5.7,3,4.2,1.2,versicolor 98 | 5.7,2.9,4.2,1.3,versicolor 99 | 6.2,2.9,4.3,1.3,versicolor 100 | 5.1,2.5,3,1.1,versicolor 101 | 5.7,2.8,4.1,1.3,versicolor 102 | 6.3,3.3,6,2.5,virginica 103 | 5.8,2.7,5.1,1.9,virginica 104 | 7.1,3,5.9,2.1,virginica 105 | 6.3,2.9,5.6,1.8,virginica 106 | 6.5,3,5.8,2.2,virginica 107 | 7.6,3,6.6,2.1,virginica 108 | 4.9,2.5,4.5,1.7,virginica 109 | 7.3,2.9,6.3,1.8,virginica 110 | 6.7,2.5,5.8,1.8,virginica 111 | 7.2,3.6,6.1,2.5,virginica 112 | 6.5,3.2,5.1,2,virginica 113 | 6.4,2.7,5.3,1.9,virginica 114 | 6.8,3,5.5,2.1,virginica 115 | 5.7,2.5,5,2,virginica 116 | 5.8,2.8,5.1,2.4,virginica 117 | 6.4,3.2,5.3,2.3,virginica 118 | 6.5,3,5.5,1.8,virginica 119 | 7.7,3.8,6.7,2.2,virginica 120 | 7.7,2.6,6.9,2.3,virginica 121 | 6,2.2,5,1.5,virginica 122 | 6.9,3.2,5.7,2.3,virginica 123 | 5.6,2.8,4.9,2,virginica 124 | 7.7,2.8,6.7,2,virginica 125 | 6.3,2.7,4.9,1.8,virginica 126 | 6.7,3.3,5.7,2.1,virginica 127 | 7.2,3.2,6,1.8,virginica 128 | 6.2,2.8,4.8,1.8,virginica 129 | 6.1,3,4.9,1.8,virginica 130 | 6.4,2.8,5.6,2.1,virginica 131 | 7.2,3,5.8,1.6,virginica 132 | 7.4,2.8,6.1,1.9,virginica 133 | 7.9,3.8,6.4,2,virginica 134 | 6.4,2.8,5.6,2.2,virginica 135 | 6.3,2.8,5.1,1.5,virginica 136 | 6.1,2.6,5.6,1.4,virginica 137 | 7.7,3,6.1,2.3,virginica 138 | 6.3,3.4,5.6,2.4,virginica 139 | 6.4,3.1,5.5,1.8,virginica 140 | 6,3,4.8,1.8,virginica 141 | 6.9,3.1,5.4,2.1,virginica 142 | 6.7,3.1,5.6,2.4,virginica 143 | 6.9,3.1,5.1,2.3,virginica 144 | 5.8,2.7,5.1,1.9,virginica 145 | 6.8,3.2,5.9,2.3,virginica 146 | 6.7,3.3,5.7,2.5,virginica 147 | 6.7,3,5.2,2.3,virginica 148 | 6.3,2.5,5,1.9,virginica 149 | 6.5,3,5.2,2,virginica 150 | 6.2,3.4,5.4,2.3,virginica 151 | 5.9,3,5.1,1.8,virginica 152 | 
-------------------------------------------------------------------------------- /data/kaggle-titanic-gender_submission.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,1 35 | 925,1 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,1 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,1 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,1 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,1 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,1 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 | 1103,0 214 | 1104,0 215 | 1105,1 216 | 1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 273 | 1163,0 
274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 | 1175,1 286 | 1176,1 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,1 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,1 368 | 1258,0 369 | 1259,1 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | -------------------------------------------------------------------------------- /data/mtcars.csv: -------------------------------------------------------------------------------- 1 | ,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb 2 | Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4 3 | Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4 4 | Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1 5 | Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1 6 | Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2 7 | Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1 8 | Duster 360,14.3,8,360,245,3.21,3.57,15.84,0,0,3,4 9 | Merc 240D,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2 10 | Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2 11 | Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4 12 | Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4 13 | Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3 14 | Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3 15 | Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18,0,0,3,3 16 | Cadillac Fleetwood,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4 17 | Lincoln Continental,10.4,8,460,215,3,5.424,17.82,0,0,3,4 18 | Chrysler Imperial,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4 19 | Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1 20 | Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2 21 | Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1 22 | Toyota Corona,21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1 23 | Dodge Challenger,15.5,8,318,150,2.76,3.52,16.87,0,0,3,2 24 | AMC Javelin,15.2,8,304,150,3.15,3.435,17.3,0,0,3,2 25 | Camaro Z28,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4 26 | Pontiac Firebird,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2 27 | 
Fiat X1-9,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1 28 | Porsche 914-2,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2 29 | Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2 30 | Ford Pantera L,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4 31 | Ferrari Dino,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6 32 | Maserati Bora,15,8,301,335,3.54,3.57,14.6,0,1,5,8 33 | Volvo 142E,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2 -------------------------------------------------------------------------------- /data/saved-mtcars/.part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/data/saved-mtcars/.part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv.crc -------------------------------------------------------------------------------- /data/saved-mtcars/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/data/saved-mtcars/_SUCCESS -------------------------------------------------------------------------------- /data/saved-mtcars/part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv: -------------------------------------------------------------------------------- 1 | x_rown_ames,x_mpg,x_cyl,x_disp,x_hp,x_drat,x_wt,x_qsec,x_vs,x_am,x_gear,x_carb 2 | Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4 3 | Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4 4 | Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1 5 | Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1 6 | Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2 7 | Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1 8 | Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4 9 | Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2 10 | Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2 11 | Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4 12 | Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4 13 | Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3 14 | Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3 15 | Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18.0,0,0,3,3 16 | Cadillac Fleetwood,10.4,8,472.0,205,2.93,5.25,17.98,0,0,3,4 17 | Lincoln Continental,10.4,8,460.0,215,3.0,5.424,17.82,0,0,3,4 18 | Chrysler Imperial,14.7,8,440.0,230,3.23,5.345,17.42,0,0,3,4 19 | Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1 20 | Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2 21 | Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1 22 | Toyota Corona,21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1 23 | Dodge Challenger,15.5,8,318.0,150,2.76,3.52,16.87,0,0,3,2 24 | AMC Javelin,15.2,8,304.0,150,3.15,3.435,17.3,0,0,3,2 25 | Camaro Z28,13.3,8,350.0,245,3.73,3.84,15.41,0,0,3,4 26 | Pontiac Firebird,19.2,8,400.0,175,3.08,3.845,17.05,0,0,3,2 27 | Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,1,1,4,1 28 | Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,0,1,5,2 29 | Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2 30 | Ford Pantera L,15.8,8,351.0,264,4.22,3.17,14.5,0,1,5,4 31 | Ferrari Dino,19.7,6,145.0,175,3.62,2.77,15.5,0,1,5,6 32 | Maserati Bora,15.0,8,301.0,335,3.54,3.57,14.6,0,1,5,8 33 | Volvo 142E,21.4,4,121.0,109,4.11,2.78,18.6,1,1,4,2 34 | -------------------------------------------------------------------------------- /data/saved-twitter/.part-00000.crc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/data/saved-twitter/.part-00000.crc -------------------------------------------------------------------------------- /data/saved-twitter/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/data/saved-twitter/_SUCCESS -------------------------------------------------------------------------------- /data/saved-twitter/part-00000: -------------------------------------------------------------------------------- 1 | Fresh install of XP on new computer. Sweet relief! fuck vista 1018769417 1.0 2 | Well. Now I know where to go when I want my knives. #ChiChevySXSW http://post.ly/RvDl 10284216536 1.0 3 | "Literally six weeks before I can take off ""SSC Chair"" off my email. Its like the torturous 4th mile before everything stops hurting." 10298589026 1.0 4 | Mitsubishi i MiEV - Wikipedia, the free encyclopedia - http://goo.gl/xipe Cutest car ever! 109017669432377344 1.0 5 | 'Cheap Eats in SLP' - http://t.co/4w8gRp7 109642968603963392 1.0 6 | Teenage Mutant Ninja Turtle art is never a bad thing... http://bit.ly/aDMHyW 10995492579 1.0 7 | New demographic survey of online video viewers: http://bit.ly/cx8b7I via @KellyOlexa 11713360136 1.0 8 | hi all - i'm going to be tweeting things lookstat at the @lookstat twitter account. please follow me there 1208319583 1.0 9 | Holy carp, no. That movie will seriously suffer for it. RT @MouseInfo: Anyone excited for The Little Mermaid in 3D? 121330835726155776 1.0 10 | "Did I really need to learn ""I bought a box and put in it things"" in arabic? This is the most random book ever." 
12358025545 1.0 11 | -------------------------------------------------------------------------------- /data/titanic/gender_submission.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,1 35 | 925,1 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,1 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,1 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,1 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,1 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,1 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 | 1103,0 214 | 1104,0 215 | 1105,1 216 | 1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 
273 | 1163,0 274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 | 1175,1 286 | 1176,1 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,1 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,1 368 | 1258,0 369 | 1259,1 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | -------------------------------------------------------------------------------- /data/twitter.txt: -------------------------------------------------------------------------------- 1 | Fresh install of XP on new computer. Sweet relief! fuck vista 1018769417 1.0 2 | Well. Now I know where to go when I want my knives. #ChiChevySXSW http://post.ly/RvDl 10284216536 1.0 3 | "Literally six weeks before I can take off ""SSC Chair"" off my email. Its like the torturous 4th mile before everything stops hurting." 10298589026 1.0 4 | Mitsubishi i MiEV - Wikipedia, the free encyclopedia - http://goo.gl/xipe Cutest car ever! 109017669432377344 1.0 5 | 'Cheap Eats in SLP' - http://t.co/4w8gRp7 109642968603963392 1.0 6 | Teenage Mutant Ninja Turtle art is never a bad thing... http://bit.ly/aDMHyW 10995492579 1.0 7 | New demographic survey of online video viewers: http://bit.ly/cx8b7I via @KellyOlexa 11713360136 1.0 8 | hi all - i'm going to be tweeting things lookstat at the @lookstat twitter account. please follow me there 1208319583 1.0 9 | Holy carp, no. That movie will seriously suffer for it. RT @MouseInfo: Anyone excited for The Little Mermaid in 3D? 121330835726155776 1.0 10 | "Did I really need to learn ""I bought a box and put in it things"" in arabic? This is the most random book ever." 
12358025545 1.0 11 | -------------------------------------------------------------------------------- /delete-readme.txt: -------------------------------------------------------------------------------- 1 | # Learning apache spark 2 | 3 | **[Ming Chen](https://github.com/MingChen0919) & [Wenqiang Feng](http://web.utk.edu/~wfeng1/)** 4 | 5 | ## Introduction 6 | 7 | This repository contains mainly notes from learning Apache Spark by [Ming Chen](https://github.com/MingChen0919) & [Wenqiang Feng](http://web.utk.edu/~wfeng1/). We try to use the detailed demo code and examples to show how to use pyspark for big data mining. **If you find your work wasn't cited in this note, please feel free to let us know.** 8 | 9 | ## Content 10 | 11 | * ***Cheat Sheets*** 12 | + [Python Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PythonForDataScience.pdf) 13 | + [Pandas Basics](http://datacamp-community.s3.amazonaws.com/3857975e-e12f-406a-b3e8-7d627217e952) 14 | + [Numpy Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Numpy_Python_Cheat_Sheet.pdf) 15 | + [Scikit-Learn](http://datacamp-community.s3.amazonaws.com/5433fa18-9f43-44cc-b228-74672efcd116) 16 | + [RDD Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_Cheat_Sheet_Python.pdf) 17 | + [DataFrame Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_SQL_Cheat_Sheet_Python.pdf) 18 | + [Apache Spark Cheat Sheet](https://hackr.io/tutorials/learn-apache-spark) 19 | 20 | * ***Data Manipulation*** 21 | + [Entry Points to Spark](entry-points-to-spark.ipynb) 22 | + [RDD Object](rdd-object.ipynb) 23 | + [DataFrame Object](dataframe-object.ipynb) 24 | + [RDD and DataFrame conversion](conversion-between-rdd-and-dataframe.ipynb) 25 | + [Categorical Data, `StringIndexer` and `OneHotEncoder`](stringindexer-and-onehotencoder.ipynb) 26 | + [Continuous variables to categorical variables](Continuous-variable-to-categorical-variable.ipynb) 27 | + [Import and export data](import-and-export-data.ipynb) 28 | + [Subset data](subset-data.ipynb): 29 | * select rows by index 30 | * select rows by logical criteria 31 | * select columns by index 32 | * select columns by names 33 | * select columns by regex pattern 34 | + [`udf()` function and SQL data types](udf-and-sql-types.ipynb): 35 | * use `udf()` function 36 | * difference between `ArrayType` and `StructType` 37 | + [Pipeline](pipeline.ipynb) 38 | + [Dense and sparse vectors](dense-vs-sparse-vectors.ipynb) 39 | + [Assemble feature columns into a `featuresCol` column with `VectorAssembler`](vector-assembler.ipynb) 40 | + [TF-IDF, HashingTF and CountVectorizer](TF-IDF.ipynb) 41 | + Feature processing: 42 | - [First data check](first-data-check.ipynb) 43 | + [SQL functions](sql-functions.ipynb) 44 | + [Add py Files to cluster](add-py-files-to-spark-cluster.ipynb) 45 | 46 | * ***Machine Learning*** 47 | + [Machine Learning Framework](machine-learning-framework.Rmd) 48 | + **Regression** 49 | 50 | - [Linear regression](linear-regression.ipynb) 51 | - [Logistic regression](logistic-regression.ipynb) 52 | 53 | + **Classification** 54 | 55 | - [Naive bayes classification](naive-bayes-classification.ipynb) 56 | - [Decision tree](decision-tree-classification.ipynb) 57 | - [Random forest classification](random-forest-classification.ipynb) 58 | - [Gradient boost tree classification](gradient-boost-tree-classification.ipynb) 59 | 60 | * **Model Tuning** 61 | + [Regularization](regularization.ipynb) 62 | + [Cross-validation](cross-validation.ipynb) 63 | 64 | * 
**Natural Language Processing** 65 | + [NLP and NLTK Basics](nlp-and-nltk-basics.ipynb) 66 | + [NLP Information Extraction](nlp-information-extraction.ipynb) 67 | 68 | ### Acknowledgement 69 | 70 | We would like to thank Jian Sun and Zhongbo Li at the University of Tennessee at Knoxville for the valuable discussion, and the generous anonymous authors who provided detailed solutions and source code on the internet. Without that help, this repository would not have been possible. Wenqiang also would like to thank the Institute for Mathematics and Its Applications (IMA) at the University of Minnesota, Twin Cities for support during his IMA Data Scientist Fellow visit. 71 | 72 | ### Feedback and suggestions 73 | 74 | Your comments and suggestions are highly appreciated. We are more than happy to receive corrections, suggestions, or feedback through email (Ming Chen: mchen33@utk.edu, Wenqiang Feng: wfeng1@utk.edu) for improvement. 75 | -------------------------------------------------------------------------------- /images/simple-nlp-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/images/simple-nlp-pipeline.png -------------------------------------------------------------------------------- /images/spark-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/images/spark-pipeline.png -------------------------------------------------------------------------------- /index.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: '[Learning Apache Spark](https://github.com/MingChen0919/learning-apache-spark)' 3 | output: 4 | html_document: 5 | highlight: pygments 6 | --- 7 | 8 | ```{r setup, include=FALSE, warning=FALSE, message=FALSE} 9 | knitr::opts_knit$set(progress = FALSE) 10 | knitr::opts_chunk$set(error = TRUE, echo = FALSE) 11 | library(htmltools) 12 | ``` 13 | 14 | ```{r, echo=FALSE} 15 | # for the css theme to work, tags cannot be added directly 16 | # as raw HTML tags as below; 17 | # they have to be added from a code chunk with the htmltools functions!
18 | css_link = tags$link() 19 | css_link$attribs = list(rel="stylesheet", href="vakata-jstree-3.3.5/dist/themes/default/style.min.css") 20 | css_link 21 | ``` 22 | 23 | ```{r, eval=FALSE, echo=FALSE} 24 | # this code chunk is purely for adding comments 25 | # the section below adds the jQuery and jstree JavaScript files 26 | ``` 27 | 28 | 29 | 30 | ```{r, eval=FALSE, echo=FALSE} 31 | # this code chunk is purely for adding comments 32 | # the JavaScript code below builds the file tree interface 33 | # see this post on how to open a leaf node's hyperlink on click: https://stackoverflow.com/questions/18611317/how-to-get-i-get-leaf-nodes-in-jstree-to-open-their-hyperlink-when-clicked-when 34 | ``` 35 | 43 | 44 | 45 | ```{r} 46 | file_tree = function(dir = '.'){ 47 | # # get the OUTPUT_DIR folder data: dataset_NUMBER_files 48 | # report_files_path = Sys.getenv('REPORT_FILES_PATH') 49 | # output_dir = tail(strsplit(report_files_path, '/')[[1]], 1) 50 | 51 | files = list.files(path = dir, recursive = FALSE, full.names = TRUE) 52 | # the listing also includes directories; remove them 53 | files = files[!dir.exists(files)] 54 | dirs = list.dirs(path = dir, recursive = FALSE, full.names = TRUE) 55 | # exclude the .ipynb_checkpoints folder 56 | # ipynb_checkpoints = grep(pattern = 'ipynb_checkpoints', x = dirs) 57 | # dirs = dirs[-ipynb_checkpoints] 58 | github_repo_url = 'https://github.com/MingChen0919/learning-apache-spark/blob/master/' 59 | tags$ul( 60 | { 61 | if (length(files) > 0) { 62 | lapply(files, function(x){ 63 | path_end = tail(strsplit(x, '/')[[1]],1) 64 | li_item = tags$li(tags$a(path_end, href=paste0(github_repo_url, x))) 65 | li_item$attribs = list('data-jstree'='{"icon":"jstree-file"}') 66 | li_item 67 | }) 68 | } 69 | }, 70 | { 71 | if (length(dirs) > 0) { 72 | lapply(dirs, function(x){ 73 | path_end = tail(strsplit(x, '/')[[1]],1) 74 | if (!(path_end %in% c('vakata-jstree-3.3.5', '.ipynb_checkpoints', 'spark-warehouse', 'ipynb'))) { 75 | li_item = tags$li(path_end, file_tree(x)) 76 | li_item$attribs = list('data-jstree' = '{"icon":"jstree-folder"}', class=list('jstree-open')) 77 | li_item 78 | } 79 | }) 80 | } 81 | } 82 | ) 83 | } 84 | ``` 85 | 86 | 87 | **[Ming Chen](https://github.com/MingChen0919) & [Wenqiang Feng](http://web.utk.edu/~wfeng1/)** 88 | 89 | ## Introduction 90 | 91 | This repository mainly contains notes from learning Apache Spark by [Ming Chen](https://github.com/MingChen0919) & [Wenqiang Feng](http://web.utk.edu/~wfeng1/). We use detailed demo code and examples to show how to use PySpark for big data mining. 
**If you find your work wasn't cited in this note, please feel free to let us know.** 92 | 93 | 94 | ## Cheat Sheets 95 | 96 | + [Python Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PythonForDataScience.pdf) 97 | + [Pandas Basics](http://datacamp-community.s3.amazonaws.com/3857975e-e12f-406a-b3e8-7d627217e952) 98 | + [Numpy Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Numpy_Python_Cheat_Sheet.pdf) 99 | + [Scikit-Learn](http://datacamp-community.s3.amazonaws.com/5433fa18-9f43-44cc-b228-74672efcd116) 100 | + [RDD Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_Cheat_Sheet_Python.pdf) 101 | + [DataFrame Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_SQL_Cheat_Sheet_Python.pdf) 102 | + [Apache Spark Cheat Sheet](https://hackr.io/tutorials/learn-apache-spark) 103 | 104 | ## Contents 105 | 106 | ```{r, echo=FALSE} 107 | # create a div container to hold the file tree interface 108 | tags$div( 109 | id="jstree", 110 | file_tree('notebooks') 111 | ) 112 | ``` 113 | 114 | 115 | ## Feedback and suggestions 116 | 117 | Your comments and suggestions are highly appreciated. We are more than happy to receive corrections, suggestions, or feedback for improvement. 118 | -------------------------------------------------------------------------------- /legacy/01_entry_points_to_spark.Rmd: -------------------------------------------------------------------------------- 1 | 2 | 3 | ```{r setup, include=FALSE} 4 | knitr::opts_chunk$set(echo = TRUE, eval=FALSE) 5 | ``` 6 | 7 | 8 | # Entry points to a Spark cluster 9 | 10 | There are two main entry points to a Spark cluster: 11 | 12 | * **SparkContext**: creates **RDDs** and broadcast variables on the cluster. 13 | * **SparkSession**: creates **DataFrames** (`pyspark.sql.dataframe.DataFrame`).
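In Spark 2.x these two entry points overlap: a `SparkSession` wraps a `SparkContext` and exposes it as `spark.sparkContext`. As a minimal sketch (assuming the `spark` session instance created in the next section), both kinds of objects are reachable from the session alone:

```{python eval=FALSE}
# The session exposes its underlying SparkContext, so both entry
# points are available from a single object.
rdd = spark.sparkContext.parallelize([1, 2, 3])                     # RDD via the wrapped SparkContext
df = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'letter'])  # DataFrame via the session
```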
14 | 15 | # Create entry point instances 16 | 17 | * Create a **SparkContext** instance: 18 | 19 | ```{python eval=FALSE} 20 | from pyspark import SparkContext 21 | sc = SparkContext(master = 'local') 22 | ``` 23 | 24 | * Create a **SparkSession** instance: 25 | 26 | ```{python eval=FALSE} 27 | from pyspark.sql import SparkSession 28 | spark = SparkSession.builder \ 29 | .appName("Python Spark SQL basic example") \ 30 | .config("spark.some.config.option", "some-value") \ 31 | .getOrCreate() 32 | ``` -------------------------------------------------------------------------------- /legacy/03_dataframe_object.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "DataFrame object" 3 | author: "Ming Chen" 4 | date: "6/4/2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE) 10 | ``` 11 | 12 | 13 | # Content 14 | 15 | * [Create a DataFrame object](#create-a-dataframe-object) 16 | * [Column instance](#column-instance) 17 | * [DataFrame column methods](#dataframe-column-methods) 18 | 19 | ## Create a DataFrame object 20 | 21 | ```{python} 22 | mtcars = spark.read.csv(path='data/mtcars.csv', 23 | sep=',', 24 | encoding='UTF-8', 25 | comment=None, 26 | header=True, 27 | inferSchema=True) 28 | ``` 29 | 30 | ```{python} 31 | mtcars.show(n=5, truncate=False) 32 | ``` 33 | 34 | ```{python} 35 | +-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+ 36 | |model |mpg |cyl|disp |hp |drat|wt |qsec |vs |am |gear|carb| 37 | +-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+ 38 | |Mazda RX4 |21.0|6 |160.0|110|3.9 |2.62 |16.46|0 |1 |4 |4 | 39 | |Mazda RX4 Wag |21.0|6 |160.0|110|3.9 |2.875|17.02|0 |1 |4 |4 | 40 | |Datsun 710 |22.8|4 |108.0|93 |3.85|2.32 |18.61|1 |1 |4 |1 | 41 | |Hornet 4 Drive |21.4|6 |258.0|110|3.08|3.215|19.44|1 |0 |3 |1 | 42 | |Hornet Sportabout|18.7|8 |360.0|175|3.15|3.44 |17.02|0 |0 |3 |2 | 43 | +-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+ 44 | only showing top 5 rows 45 | ``` 46 | 47 | 48 | ## Column instance 49 | 50 | Column instances can be created in two ways: 51 | 52 | 1. directly select a column out of a *DataFrame*: `df.colName` 53 | 2. create one from a column expression: `df.colName + 1` 54 | 55 | Technically, there is only one way to create a column instance, since a column expression itself starts from an existing column instance. 56 | 57 | **Remember how to create column instances: they are usually the starting point for operating on DataFrame columns.** 58 | 59 | The `Column` class comes with methods that operate on a column instance. ***In addition, almost all functions from the `pyspark.sql.functions` module take one or more column instances as arguments***. These functions are essential tools for data manipulation. 60 | 61 | ## DataFrame column methods 62 | 63 | ### Methods that take column names as arguments: 64 | 65 | * `corr(col1, col2)`: two column names. 66 | * `cov(col1, col2)`: two column names. 67 | * `crosstab(col1, col2)`: two column names. 68 | * `describe(*cols)`: ***`*cols` refers to column names (strings) only.*** 69 | 70 | ### Methods that take column names or column expressions or **both** as arguments: 71 | 72 | * `cube(*cols)`: column names (string) or column expressions or **both**. 73 | * `drop(*cols)`: ***a list of column names OR a single column expression.*** 74 | * `groupBy(*cols)`: column name (string) or column expression or **both**.
75 | * `rollup(*cols)`: column name (string) or column expression or **both**.
76 | * `select(*cols)`: column name (string) or column expression or **both**.
77 | * `sort(*cols, **kwargs)`: column name (string) or column expression or **both**.
78 | * `sortWithinPartitions(*cols, **kwargs)`: column name (string) or column expression or **both**.
79 | * `orderBy(*cols, **kwargs)`: column name (string) or column expression or **both**.
80 | * `sampleBy(col, fractions, seed=None)`: a column name.
81 | * `toDF(*cols)`: **a list of column names (string).**
82 | * `withColumn(colName, col)`: `colName` refers to a column name; `col` refers to a column expression.
83 | * `withColumnRenamed(existing, new)`: takes column names as arguments.
84 | * `filter(condition)`: **`condition`** refers to a column expression that returns `BooleanType` values.
85 | 
-------------------------------------------------------------------------------- /legacy/HashingTF-and-CountVectorizer.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "HashingTF and CountVectorizer"
3 | author: "Wenqiang & Ming Chen"
4 | date: "3/23/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 | 
12 | 
13 | ## HashingTF and CountVectorizer
14 | 
15 | ### Load data
16 | 
17 | ```{python}
18 | twitter = spark.createDataFrame([
19 |         ('Wenqiang is a spark expert', 'Wenqiang', 1.0),
20 |         ('Ming is learning spark', 'Ming', 0.0)],
21 |         ['text', 'id', 'label']
22 | )
23 | ```
24 | 
25 | ```{python}
26 | twitter.show()
27 | ```
28 | 
29 | ```{python}
30 | +--------------------+--------+-----+
31 | |                text|      id|label|
32 | +--------------------+--------+-----+
33 | |Wenqiang is a spa...|Wenqiang|  1.0|
34 | |Ming is learning ...|    Ming|  0.0|
35 | +--------------------+--------+-----+
36 | ```
37 | 
38 | 
39 | ### Tokenization
40 | 
41 | ```{python}
42 | from pyspark.ml.feature import Tokenizer
43 | ```
44 | 
45 | ```{python}
46 | tokenizer_mod = Tokenizer(inputCol='text', outputCol='tokens')
47 | twitter_tokens = tokenizer_mod.transform(twitter)
48 | twitter_tokens.show()
49 | ```
50 | 
51 | ```{python}
52 | +--------------------+--------+-----+--------------------+
53 | |                text|      id|label|              tokens|
54 | +--------------------+--------+-----+--------------------+
55 | |Wenqiang is a spa...|Wenqiang|  1.0|[wenqiang, is, a,...|
56 | |Ming is learning ...|    Ming|  0.0|[ming, is, learni...|
57 | +--------------------+--------+-----+--------------------+
58 | ```
59 | 
60 | 
61 | ### HashingTF
62 | 
63 | ```{python}
64 | from pyspark.ml.feature import HashingTF
65 | hashingTF_mod = HashingTF(numFeatures=pow(2,4), inputCol='tokens', outputCol='features')
66 | hashingTF_twitter = hashingTF_mod.transform(twitter_tokens)
67 | ```
68 | 
69 | ```{python}
70 | hashingTF_twitter.show(truncate=False)
71 | ```
72 | 
73 | ```{python}
74 | +--------------------------+--------+-----+--------------------------------+---------------------------------+
75 | |text                      |id      |label|tokens                          |features                         |
76 | +--------------------------+--------+-----+--------------------------------+---------------------------------+
77 | |Wenqiang is a spark expert|Wenqiang|1.0  |[wenqiang, is, a, spark, expert]|(16,[1,2,9,13],[2.0,1.0,1.0,1.0])|
78 | |Ming is learning spark    |Ming    |0.0  |[ming, is, learning, spark]     |(16,[0,1,14],[1.0,2.0,1.0])      |
79 | +--------------------------+--------+-----+--------------------------------+---------------------------------+
80 | ```
81 | 
82 | 
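83 | Note the 2.0 at index 1 in both feature vectors above, even though no token repeats within either document: with only `numFeatures = 16` buckets, distinct tokens can collide and their counts add up. This is consistent with the two shared tokens, 'is' and 'spark', hashing to the same bucket. A quick way to check which bucket each token lands in (a sketch; the `indexOf` method is an assumption that holds for `pyspark.ml.feature.HashingTF` in Spark 3.0+):
84 | 
85 | ```{python}
86 | # map each token to its hash bucket; colliding tokens share an index
87 | for token in ['wenqiang', 'is', 'a', 'spark', 'expert']:
88 |     print(token, hashingTF_mod.indexOf(token))
89 | ```
90 | 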
91 | ### CountVectorizer
92 | 
93 | ```{python}
94 | from pyspark.ml.feature import CountVectorizer
95 | count_vectorizer = CountVectorizer(vocabSize=pow(2,4), inputCol='tokens', outputCol='features')
96 | countVectorizer_mod = count_vectorizer.fit(twitter_tokens)
97 | countVectorizer_twitter = countVectorizer_mod.transform(twitter_tokens)
98 | ```
99 | 
100 | ```{python}
101 | countVectorizer_twitter.show(truncate=False)
102 | ```
103 | 
104 | ```{python}
105 | +--------------------------+--------+-----+--------------------------------+-------------------------------------+
106 | |text                      |id      |label|tokens                          |features                             |
107 | +--------------------------+--------+-----+--------------------------------+-------------------------------------+
108 | |Wenqiang is a spark expert|Wenqiang|1.0  |[wenqiang, is, a, spark, expert]|(7,[0,1,2,3,5],[1.0,1.0,1.0,1.0,1.0])|
109 | |Ming is learning spark    |Ming    |0.0  |[ming, is, learning, spark]     |(7,[0,1,4,6],[1.0,1.0,1.0,1.0])      |
110 | +--------------------------+--------+-----+--------------------------------+-------------------------------------+
111 | ```
112 | 
113 | 
114 | 
115 | 
116 | 
117 | 
-------------------------------------------------------------------------------- /legacy/continuous-to-categorical-variable.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Continuous to categorical data"
3 | author: "Ming Chen"
4 | date: "6/9/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 | 
12 | ## Convert continuous variables to categorical variables
13 | 
14 | There are two functions we can use to split a continuous variable into categories:
15 | 
16 | * `pyspark.ml.feature.Binarizer`: split a column of continuous features given a threshold.
17 | * `pyspark.ml.feature.Bucketizer`: split a column of continuous features into categories given several split points.
18 |     + with $n + 1$ split points, there are $n$ categories (buckets).
19 | 20 | **Create some data** 21 | 22 | ```{python} 23 | import numpy as np 24 | import pandas as pd 25 | np.random.seed(seed=1234) 26 | pdf = pd.DataFrame({ 27 | 'x1': np.random.randn(10), 28 | 'x2': np.random.rand(10)*10 29 | }) 30 | np.random.seed(seed=None) 31 | df = spark.createDataFrame(pdf) 32 | df.show() 33 | 34 | +--------------------+------------------+ 35 | | x1| x2| 36 | +--------------------+------------------+ 37 | | 0.47143516373249306| 6.834629351721363| 38 | | -1.1909756947064645| 7.127020269829002| 39 | | 1.4327069684260973|3.7025075479039495| 40 | | -0.3126518960917129| 5.611961860656249| 41 | | -0.7205887333650116| 5.030831653078097| 42 | | 0.8871629403077386|0.1376844959068224| 43 | | 0.8595884137174165| 7.728266216123741| 44 | | -0.6365235044173491| 8.826411906361166| 45 | |0.015696372114428918| 3.648859839013723| 46 | | -2.2426849541854055| 6.153961784334937| 47 | +--------------------+------------------+ 48 | ``` 49 | 50 | **`Binarize` the column `x1` and `Bucketize` the column `x2`** 51 | 52 | ```{python} 53 | from pyspark.ml.feature import Binarizer, Bucketizer 54 | # threshold = 0 for binarizer 55 | binarizer = Binarizer(threshold=0, inputCol='x1', outputCol='x1_new') 56 | # provide 5 split points to generate 4 buckets 57 | bucketizer = Bucketizer(splits=[0, 2.5, 5, 7.5, 10], inputCol='x2', outputCol='x2_new') 58 | 59 | # pipeline stages 60 | from pyspark.ml import Pipeline 61 | stages = [binarizer, bucketizer] 62 | pipeline = Pipeline(stages=stages) 63 | 64 | # fit the pipeline model and transform the data 65 | pipeline.fit(df).transform(df).show() 66 | 67 | +--------------------+------------------+------+------+ 68 | | x1| x2|x1_new|x2_new| 69 | +--------------------+------------------+------+------+ 70 | | 0.47143516373249306| 6.834629351721363| 1.0| 2.0| 71 | | -1.1909756947064645| 7.127020269829002| 0.0| 2.0| 72 | | 1.4327069684260973|3.7025075479039495| 1.0| 1.0| 73 | | -0.3126518960917129| 5.611961860656249| 0.0| 2.0| 74 | | -0.7205887333650116| 5.030831653078097| 0.0| 2.0| 75 | | 0.8871629403077386|0.1376844959068224| 1.0| 0.0| 76 | | 0.8595884137174165| 7.728266216123741| 1.0| 3.0| 77 | | -0.6365235044173491| 8.826411906361166| 0.0| 3.0| 78 | |0.015696372114428918| 3.648859839013723| 1.0| 1.0| 79 | | -2.2426849541854055| 6.153961784334937| 0.0| 2.0| 80 | +--------------------+------------------+------+------+ 81 | ``` 82 | 83 | 84 | -------------------------------------------------------------------------------- /legacy/cross-validation-in-r.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Cross-validation in R" 3 | author: "Ming Chen" 4 | date: "6/5/2017" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | library(boot) 11 | ``` 12 | 13 | 14 | ## Prepare data 15 | 16 | ```{r} 17 | horseshoe_crab = read.csv("data/horseshoe_crab.csv") 18 | horseshoe_crab$C = as.factor(horseshoe_crab$C) 19 | horseshoe_crab$S = as.factor(horseshoe_crab$S) 20 | y = numeric() 21 | y[horseshoe_crab$Sa != 0] = 1 22 | y[horseshoe_crab$Sa == 0] = 0 23 | horseshoe_crab$y = y 24 | ``` 25 | 26 | ## Split data into training and test datasets 27 | 28 | ```{r} 29 | training_index = sort(sample(nrow(horseshoe_crab), nrow(horseshoe_crab)*0.8)) 30 | training = horseshoe_crab[training_index, ] 31 | test = horseshoe_crab[-training_index, ] 32 | ``` 33 | 34 | 35 | ## Build cross validation model 36 | 37 | ```{r} 38 | glm_logit = glm(formula = y ~ C + S + W + Wt, data = 
training,
39 |                 family = 'binomial')
40 | # 4-fold cross-validation
41 | cv_glm_4 = cv.glm(data = training, glmfit = glm_logit, K = 4)
42 | ```
43 | 
-------------------------------------------------------------------------------- /legacy/index.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Learning Apache Spark"
3 | output: html_document
4 | ---
5 | 
6 | **All materials have been converted to notebooks (in ipynb format) and moved to the GitHub repository. Click here to go to the [repository](https://github.com/MingChen0919/learning-apache-spark/blob/master/README.md).**
-------------------------------------------------------------------------------- /legacy/install.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Installations"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/17/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | 
12 | ### **Caution**: Before you start the following steps, please make sure you have already installed
13 | 
14 | - [Java JDK](http://www.oracle.com/technetwork/java/javase/downloads/index-jsp-138363.html)
15 | - [IPython and Python](https://ipython.org/install.html)
16 | - If you use this method to install Spark, you can skip the [Spark on Jupyter section](pyspark-on-jupyter.html)
17 | 
18 | ### 1. Download Apache Spark from the official website
19 | Weblink: [Download Apache Spark™](http://spark.apache.org/downloads.html)
20 | 
21 | ### 2. Installation
22 | 
23 | The pre-built version doesn't need installation. You can
24 | use it as soon as you unpack it.
25 | 
26 | ### 3. Set path link
27 | 
28 | This is the most difficult step for beginners. However, it can be easily solved via Min RK's [`findspark`](https://github.com/minrk/findspark).
29 | 
30 | - install findspark
31 | ```{python eval=FALSE}
32 | pip install findspark
33 | ```
34 | - open `ipython` in terminal and import findspark
35 | ```{python eval=FALSE}
36 | import findspark
37 | findspark.init()
38 | ```
39 | - find the Spark path
40 | ```{python eval=FALSE}
41 | findspark.find()
42 | ```
43 | ```{python eval=FALSE}
44 | Out[3]: '/Users/wenqiangfeng/spark/'
45 | ```
46 | - open `ipython --profile=myprofile` in terminal then run the following code
47 | ```{python eval=FALSE}
48 | findspark.init('/Users/wenqiangfeng/spark/', edit_profile=True)
49 | ```
50 | ```{python eval=FALSE}
51 | findspark.init('/Users/wenqiangfeng/spark/', edit_rc=True)
52 | ```
53 | 
54 | ### Note:
55 | 
56 | - This will also help you to set up the `ipython notebook` or `Jupyter`.
You may run the following code in the terminal to double-check:
57 | ```{python eval=FALSE}
58 | jupyter notebook
59 | ```
60 | 
61 | * If your PySpark still doesn't work, you need to check your `.profile` or `.bash_profile` and add the following path to it
62 | 
63 |     + check `.profile` or `.bash_profile` at the terminal
64 |     + add the path to your `.profile` or `.bash_profile`
65 | ```{bash eval=FALSE}
66 | vim ~/.profile
67 | ```
68 | 
69 | 
70 | ```{bash eval=FALSE}
71 | # Added for Pyspark
72 | export SPARK_HOME=YOUR_PATH/apache-spark/libexec
73 | export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
74 | export PYSPARK_DRIVER_PYTHON="jupyter"
75 | export PYSPARK_DRIVER_PYTHON_OPTS="notebook"
76 | ```
77 | 
78 | 
79 | 
80 | 
-------------------------------------------------------------------------------- /legacy/k-folds-cross-validation.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "K-folds Cross Validation"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/20/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 | 
12 | ## Training/validation/test data sets
13 | 
14 | * **Training set**: the data set for training your models.
15 | * **Validation set**: the data set used for testing the performance of the models you have built with the training set. Based on the performance, you choose the best (final) model.
16 | * **Test set**: use this data set to test the performance of your final model.
17 | 
18 | ## K-folds cross-validation steps (k=4 as an example)
19 | 
20 | * step 1: split your data into a training set and a test set (for example 80% training and 20% test). The test set will never be used in model training and selection.
21 | * step 2: split the training set into k (k=4) equal subsets: 3 subsets for training + 1 subset for validation.
22 | * step 3: train your models with the 3 subsets and calculate a performance score with the remaining 1 subset.
23 | * step 4: choose a different subset for validation and then repeat step 3 until every subset has been used as a validation subset.
24 | * step 5: for a k=4 fold cross-validation, each trained model should have been validated by 4 subsets and therefore has 4 performance scores. Calculate the average of these 4 performance scores for each model. Use the average score to select the best, final model.
25 | * step 6: apply your final model to the **untouched** test data and see how it performs.
26 | 
27 | ## Example of k-folds cross-validation
28 | 
29 | * **Build parameter grids**
30 |     + parameter grid: a combination of all variable parameters in your model.
31 |     + example: If I want to train a logistic regression model on 4 different *regParam* and 3 different *elasticNetParam*, I will have 3 x 4 = 12 models to train and validate.
32 | 
33 | ```{python}
34 | from pyspark.ml.classification import LogisticRegression
35 | blor = LogisticRegression(featuresCol='indexed_features', labelCol='label', family='binomial')
36 | 
37 | from pyspark.ml.tuning import ParamGridBuilder
38 | param_grid = ParamGridBuilder().\
39 |     addGrid(blor.regParam, [0, 0.5, 1, 2]).\
40 |     addGrid(blor.elasticNetParam, [0, 0.5, 1]).\
41 |     build()
42 | ```
43 | 
44 | ```{python}
45 | # the first 2 elements in param_grid
46 | [{Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty.
For alpha = 1, it is an L1 penalty.'): 0,
47 |   Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='regParam', doc='regularization parameter (>= 0).'): 0},
48 |  {Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5,
49 |   Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='regParam', doc='regularization parameter (>= 0).'): 0}]
50 | ```
51 | 
52 | * **Split data into training and test sets**
53 |     + Refer to the [logistic regression page](logistic-regression.html) to see what data we used and how the training and test sets were generated.
54 | 
55 | * **Run k-fold (k=4) cross-validation**
56 | ```{python}
57 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
58 | evaluator = BinaryClassificationEvaluator()
59 | 
60 | from pyspark.ml.tuning import CrossValidator
61 | cv = CrossValidator(estimator=blor, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)
62 | 
63 | cvModel = cv.fit(training)
64 | ```
65 | 
66 | * **Find the best model**
67 |     + best model ID
68 | 
69 | ```{python}
70 | cvModel.bestModel
71 | ```
72 | 
73 | ```{python}
74 | LogisticRegression_41fe9f7454164180f433
75 | ```
76 | 
77 |     + average cross-validation metrics
78 |     + the 10th model has the highest score and is the best model
79 |     + *regParam* = 2 and *elasticNetParam* = 0, i.e., a ridge regularization method.
80 | 
81 | ```{python}
82 | cvModel.avgMetrics
83 | ```
84 | 
85 | ```{python}
86 | [0.8191225353777875,
87 |  0.8191225353777875,
88 |  0.8191225353777875,
89 |  0.8243105196624104,
90 |  0.5,
91 |  0.5,
92 |  0.8247709310997127,
93 |  0.5,
94 |  0.5,
95 |  0.8259072947360763,
96 |  0.5,
97 |  0.5]
98 | ```
99 | 
100 | 
101 | ```{python}
102 | param_grid[9]
103 | ```
104 | 
105 | ```{python}
106 | {Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty.
For alpha = 1, it is an L1 penalty.'): 0, Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='regParam', doc='regularization parameter (>= 0).'): 2}
107 | ```
108 | 
109 |     + Model comparison (not finished)
110 | 
111 | 
112 | ```{python}
113 | # new model
114 | blor = LogisticRegression(featuresCol='indexed_features', labelCol='label', family='binomial')
115 | model = blor.fit(training)
116 | evaluator.evaluate(model.transform(training))
117 | evaluator.evaluate(model.transform(test))
118 | 
119 | new_blor = LogisticRegression(featuresCol='indexed_features', labelCol='label', family='binomial', regParam=0.5, elasticNetParam=0)
120 | new_model = new_blor.fit(training)
121 | evaluator.evaluate(new_model.transform(training))
122 | evaluator.evaluate(new_model.transform(test))
123 | ```
124 | 
-------------------------------------------------------------------------------- /legacy/linear-regression.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Linear Regression"
3 | author: "Ming Chen"
4 | date: "6/5/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 | 
12 | # Linear regression
13 | 
14 | ## Linear regression without cross-validation
15 | 
16 | **Import data**
17 | 
18 | ```{python}
19 | ad = spark.read.csv('data/Advertising.csv', header=True, inferSchema=True)
20 | ad.show(5)
21 | 
22 | +-----+-----+---------+-----+
23 | |   TV|Radio|Newspaper|Sales|
24 | +-----+-----+---------+-----+
25 | |230.1| 37.8|     69.2| 22.1|
26 | | 44.5| 39.3|     45.1| 10.4|
27 | | 17.2| 45.9|     69.3|  9.3|
28 | |151.5| 41.3|     58.5| 18.5|
29 | |180.8| 10.8|     58.4| 12.9|
30 | +-----+-----+---------+-----+
31 | only showing top 5 rows
32 | ```
33 | 
34 | **Transform data structure**
35 | 
36 | ```{python}
37 | from pyspark.ml.linalg import Vectors
38 | ad_df = ad.rdd.map(lambda x: [Vectors.dense(x[0:3]), x[-1]]).toDF(['features', 'label'])
39 | ad_df.show(5)
40 | 
41 | +-----------------+-----+
42 | |         features|label|
43 | +-----------------+-----+
44 | |[230.1,37.8,69.2]| 22.1|
45 | | [44.5,39.3,45.1]| 10.4|
46 | | [17.2,45.9,69.3]|  9.3|
47 | |[151.5,41.3,58.5]| 18.5|
48 | |[180.8,10.8,58.4]| 12.9|
49 | +-----------------+-----+
50 | only showing top 5 rows
51 | ```
52 | 
53 | **Build linear regression model**
54 | 
55 | ```{python}
56 | from pyspark.ml.regression import LinearRegression
57 | lr = LinearRegression(featuresCol = 'features', labelCol = 'label')
58 | ```
59 | 
60 | **Fit the model**
61 | 
62 | ```{python}
63 | lr_model = lr.fit(ad_df)
64 | ```
65 | 
66 | **Model evaluation**
67 | 
68 | ```{python}
69 | from pyspark.ml.evaluation import RegressionEvaluator
70 | evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label')
71 | ad_pred = lr_model.transform(ad_df)  # generate predictions to evaluate
72 | evaluator.evaluate(ad_pred, {evaluator.metricName: "r2"})
73 | 
74 | 0.897210638178952
75 | ```
76 | 
77 | **Compare results with results from R**
78 | 
79 | The comparison below shows that the linear regression analyses from pyspark and R obtained very close results.
80 | 
81 | ```{python}
82 | # intercept and coefficients from R
83 | advertise = read.csv('data/Advertising.csv', header = TRUE)
84 | lr_ad = lm(Sales~., data = advertise)
85 | lr_ad$coefficients
86 | 
87 |  (Intercept)           TV        Radio    Newspaper 
88 |  2.938889369  0.045764645  0.188530017 -0.001037493 
89 | 
90 | # intercept and coefficients from pyspark
91 | lr_model.intercept
92 | 
93 | 2.9388893694594134
94 | 
95 | lr_model.coefficients
96 | 
97 | DenseVector([0.0458, 0.1885, -0.001])
98 | 
99 | # R squared from R
100 | summary(lr_ad)$r.squared
101 | 
102 | 0.8972106
103 | 
104 | # R squared from pyspark
105 | evaluator.evaluate(ad_pred, {evaluator.metricName: "r2"})
106 | 
107 | 0.897210638178952
108 | ```
109 | 
110 | 
111 | ## Linear regression with cross-validation
112 | 
113 | **Training and test datasets**
114 | 
115 | ```{python}
116 | ## split data into training and test datasets
117 | training, test = ad_df.randomSplit([0.8, 0.2], seed=123)
118 | ```
119 | 
120 | **Build cross-validation model**
121 | 
122 | ```{python}
123 | ##===== build cross-validation model =====
124 | 
125 | # estimator
126 | lr = LinearRegression(featuresCol = 'features', labelCol = 'label')
127 | 
128 | # parameter grid
129 | from pyspark.ml.tuning import ParamGridBuilder
130 | param_grid = ParamGridBuilder().\
131 |     addGrid(lr.regParam, [0, 0.5, 1]).\
132 |     addGrid(lr.elasticNetParam, [0, 0.5, 1]).\
133 |     build()
134 | 
135 | # evaluator
136 | evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label', metricName='r2')
137 | 
138 | # cross-validation model
139 | from pyspark.ml.tuning import CrossValidator
140 | cv = CrossValidator(estimator=lr, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)
141 | ```
142 | 
143 | **Fit cross-validation model**
144 | 
145 | ```{python}
146 | cv_model = cv.fit(training)
147 | ```
148 | 
149 | **Prediction**
150 | 
151 | ```{python}
152 | pred_training_cv = cv_model.transform(training)
153 | pred_test_cv = cv_model.transform(test)
154 | ```
155 | 
156 | **Evaluation**
157 | 
158 | ```{python}
159 | # performance on training data
160 | evaluator.evaluate(pred_training_cv)
161 | 
162 | 0.8982486958337326
163 | 
164 | # performance on test data
165 | evaluator.evaluate(pred_test_cv)
166 | 
167 | 0.8896562076565583
168 | ```
169 | 
170 | 
171 | **Intercept and coefficients**
172 | 
173 | ```{python}
174 | cv_model.bestModel.intercept
175 | 
176 | 3.075068686285647
177 | 
178 | cv_model.bestModel.coefficients
179 | 
180 | DenseVector([0.0465, 0.1809, -0.0011])
181 | ```
182 | 
183 | **Get parameter values from the best model**
184 | 
185 | Parameters can be extracted by calling the Java property.
186 | 
187 | ```{python}
188 | print('best regParam: ' + str(cv_model.bestModel._java_obj.getRegParam()) + "\n" +
189 |     'best ElasticNetParam:' + str(cv_model.bestModel._java_obj.getElasticNetParam()))
190 | 
191 | best regParam: 0.0
192 | best ElasticNetParam:0.0
193 | ```
194 | 
195 | 
196 | 
-------------------------------------------------------------------------------- /legacy/pyspark-on-jupyter.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Spark on Jupyter"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/5/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | 
12 | ### 1. Install jupyter with conda
13 | 
14 | ```{python eval=FALSE}
15 | conda install jupyter
16 | ```
17 | 
18 | ### 2.
Get the `jupyter` binary executable path
19 | 
20 | ```{python eval=FALSE}
21 | which jupyter
22 | ```
23 | ```{python eval=FALSE}
24 | # output
25 | /Users/mingchen/anaconda2/bin/jupyter
26 | ```
27 | 
28 | ### 3. Link spark with jupyter
29 | 
30 | ```{python eval=FALSE}
31 | export PYSPARK_DRIVER_PYTHON=/Users/mingchen/anaconda2/bin/jupyter
32 | export PYSPARK_DRIVER_PYTHON_OPTS="notebook --NotebookApp.open_browser=False --NotebookApp.ip='*' --NotebookApp.port=8880"
33 | ```
34 | 
35 | You can also add the two environment variables to the `~/.bash_profile` file to permanently link Spark with Jupyter.
36 | 
37 | ### 4. Run jupyter notebook
38 | 
39 | ```{python eval=FALSE}
40 | pyspark
41 | ```
42 | 
43 | Then go to [http://127.0.0.1:8880](http://127.0.0.1:8880)
44 | 
-------------------------------------------------------------------------------- /legacy/pyspark-on-rodeo.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Pyspark On Rodeo"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/5/2017"
5 | output: html_document
6 | ---
7 | 
8 | ### 1. Install Rodeo on Mac
9 | 
10 | * Download DMG file [https://www.yhat.com/products/rodeo/](https://www.yhat.com/products/rodeo/)
11 | 
12 | ### 2. Install `apache-spark` with homebrew
13 | 
14 | ```{bash eval=FALSE}
15 | brew install apache-spark
16 | ```
17 | 
18 | ### 3. Locate the `python` directory within the apache-spark root directory
19 | 
20 | ```{bash eval=FALSE}
21 | /usr/local/Cellar/apache-spark/2.1.0/libexec/python
22 | ```
23 | 
24 | 
25 | ### 4. Set environment variable
26 | 
27 | * Open Rodeo, go to **settings**->**ENVIRONMENT VARIABLES**
28 | * Add the path `/usr/local/Cellar/apache-spark/2.1.0/libexec/python` to `PYTHONPATH`
29 | 
30 | ### 5. Test pyspark on Rodeo
31 | 
32 | Run the following command
33 | 
34 | ```{python eval=FALSE}
35 | from pyspark import SparkConf, SparkContext
36 | 
37 | conf = SparkConf().setAppName("myAppName")
38 | sc = SparkContext(conf=conf)
39 | sc
40 | ```
41 | 
42 | You should get something like this
43 | 
44 | ```{python eval=FALSE}
45 | 
46 | ```
-------------------------------------------------------------------------------- /legacy/pyspark-vectors.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Pyspark Vectors"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/18/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | ## Remark:
12 | 
13 | - You can download the complete [ipython notebook](./ipynb/vector.ipynb) for this session.
14 | 
15 | 
16 | ## Dense vector vs. Sparse vector
17 | 
18 | * Both dense vectors and sparse vectors are homogeneous and can only hold numeric data.
19 | 
20 | * `DenseVector` takes a single argument and is much like an R vector.
21 | * `SparseVector` only stores non-zero values. It uses three pieces of information to achieve this: the vector size, the indices (positions) of the non-zero entries, and their corresponding values. With this information, you can figure out which entries in the vector are zero, and therefore reconstruct the complete vector.
22 | 
23 | ## Example:
24 | 
25 | + set up spark context and SparkSession
26 | 
27 | ```{python eval=FALSE}
28 | from pyspark import SparkConf, SparkContext
29 | ## set up spark context
30 | from pyspark.sql import SQLContext
31 | sc = SparkContext()
32 | sqlContext = SQLContext(sc)
33 | ## set up SparkSession
34 | from pyspark.sql import SparkSession
35 | 
36 | spark = SparkSession \
37 |     .builder \
38 |     .appName("Python Spark SQL basic example") \
39 |     .config("spark.some.config.option", "some-value") \
40 |     .getOrCreate()
41 | ```
42 | 
43 | + import `Vectors` from the pyspark library
44 | 
45 | ```{python eval=FALSE}
46 | from pyspark.ml.linalg import Vectors
47 | ```
48 | 
49 | + dense vector
50 | 
51 | ```{python eval=FALSE}
52 | densevector = Vectors.dense([1,3,4,2.5])
53 | densevector
54 | ```
55 | 
56 | ```{python eval=FALSE}
57 | # output
58 | DenseVector([1.0, 3.0, 4.0, 2.5])
59 | ```
60 | 
61 | ```{python eval=FALSE}
62 | densevector.toArray()
63 | ```
64 | 
65 | ```{python eval=FALSE}
66 | # output
67 | array([ 1. ,  3. ,  4. ,  2.5])
68 | ```
69 | + sparse vector
70 |     + The sparse vector below is a representation of the vector [ 0. ,  3. ,  0. ,  4.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ]
71 | 
72 | ```{python eval=FALSE}
73 | sparseVector = Vectors.sparse(10, [1, 3], [3.0, 4.5])
74 | sparseVector.toArray()
75 | ```
76 | 
77 | ```{python eval=FALSE}
78 | # output
79 | array([ 0. ,  3. ,  0. ,  4.5,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ])
80 | ```
-------------------------------------------------------------------------------- /legacy/pyspark.ml.feature-module.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "pyspark.ml.feature module"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/15/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | 
12 | ## Introduction
13 | 
14 | This module provides a set of functions, methods and classes which act on **features**. A feature is like a column from a data frame or table. You can see that most functions or classes take parameters like `inputCol`, `featuresCol`, `outputCol`, `labelCol`. These parameters specify the names of the columns (features) that you want to work on.
15 | 
16 | ## class pairs and `fit/transform` functions
17 | 
18 | I found that there are a lot of class pairs in this module. For example:
19 | 
20 | * `ChiSqSelector` and `ChiSqSelectorModel`
21 | * `CountVectorizer` and `CountVectorizerModel`
22 | * `IDF` and `IDFModel`
23 | * a lot of other pairs ...
24 | 
25 | The first class in a pair has functions to build a model (instructions about how you want to transform your data). The second class in a pair does the actual data transformation; see the sketch below.
26 | 
27 | * The `fit` function belongs to the first class; it fits the built model to your data and returns an instance of the second class.
28 | * The `transform` function belongs to the second class and does the actual data transformation.
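29 | 
30 | As a minimal sketch of this pattern, here is the `CountVectorizer`/`CountVectorizerModel` pair (the DataFrame `df` and its `tokens` column are assumed placeholders for illustration, not from the original text):
31 | 
32 | ```{python eval=FALSE}
33 | from pyspark.ml.feature import CountVectorizer
34 | # the first class holds the instructions (columns, vocabulary size, ...)
35 | count_vectorizer = CountVectorizer(inputCol='tokens', outputCol='features')
36 | # fit() learns a vocabulary from df and returns a CountVectorizerModel
37 | count_vectorizer_model = count_vectorizer.fit(df)
38 | # transform() on the model does the actual data transformation
39 | result = count_vectorizer_model.transform(df)
40 | ```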
-------------------------------------------------------------------------------- /legacy/r-markdown-header.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "r-markdown-header"
3 | author: "Ming Chen"
4 | output: html_document
5 | ---
6 | 
7 | 
8 | 
15 | 
16 | ```{r setup, include=FALSE}
17 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
18 | ```
19 | 
20 | 
21 | ## Create SparkContext & SparkSession
22 | 
23 | **SparkContext**
24 | 
25 | ```{python}
26 | from pyspark import SparkContext
27 | sc = SparkContext(master = 'local')
28 | ```
29 | 
30 | **SparkSession**
31 | 
32 | ```{python}
33 | from pyspark.sql import SparkSession
34 | spark = SparkSession.builder \
35 |     .appName("Learning Apache Spark") \
36 |     .config("spark.some.config.option", "some-value") \
37 |     .getOrCreate()
38 | ```
39 | 
-------------------------------------------------------------------------------- /legacy/randomforest.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Random Forest Regression"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "February 19, 2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | ### Remark:
12 | 
13 | - You can download the complete [ipython notebook](./ipynb/RandomForest.ipynb) for this tutorial session.
14 | 
15 | ### 1. Set up spark context and SparkSession
16 | 
17 | ```{python eval=FALSE}
18 | from pyspark.sql import SparkSession
19 | 
20 | spark = SparkSession \
21 |     .builder \
22 |     .appName("Python Spark Random Forest Regression") \
23 |     .config("spark.some.config.option", "some-value") \
24 |     .getOrCreate()
25 | ```
26 | 
27 | ### 2. Load dataset
28 | ```{python eval=FALSE}
29 | df = spark.read.format('com.databricks.spark.csv').\
30 |     options(header='true', \
31 |     inferschema='true').load("./data/WineData.csv",header=True);
32 | ```
33 | 
34 | ```{python eval=FALSE}
35 | df.printSchema()
36 | ```
37 | ```{python eval=FALSE}
38 | #output
39 | root
40 |  |-- fixed acidity: double (nullable = true)
41 |  |-- volatile acidity: double (nullable = true)
42 |  |-- citric acid: double (nullable = true)
43 |  |-- residual sugar: double (nullable = true)
44 |  |-- chlorides: double (nullable = true)
45 |  |-- free sulfur dioxide: double (nullable = true)
46 |  |-- total sulfur dioxide: double (nullable = true)
47 |  |-- density: double (nullable = true)
48 |  |-- pH: double (nullable = true)
49 |  |-- sulphates: double (nullable = true)
50 |  |-- alcohol: double (nullable = true)
51 |  |-- quality: integer (nullable = true)
52 | ```
53 | 
54 | 
55 | ### 3.
Convert the data to dense vectors
56 | ```{python eval=FALSE}
57 | from pyspark.sql import Row
58 | from pyspark.ml.linalg import Vectors
59 | ```
60 | ```{python eval=FALSE}
61 | def transData(data):
62 |     return data.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])
63 | ```
64 | 
65 | ```{python eval=FALSE}
66 | transformed = transData(df)
67 | transformed.show(6)
68 | ```
69 | 
70 | ```{python eval=FALSE}
71 | #output
72 | +--------------------+-----+
73 | |            features|label|
74 | +--------------------+-----+
75 | |[7.4,0.7,0.0,1.9,...|    5|
76 | |[7.8,0.88,0.0,2.6...|    5|
77 | |[7.8,0.76,0.04,2....|    5|
78 | |[11.2,0.28,0.56,1...|    6|
79 | |[7.4,0.7,0.0,1.9,...|    5|
80 | |[7.4,0.66,0.0,1.8...|    5|
81 | +--------------------+-----+
82 | only showing top 6 rows
83 | ```
84 | 
85 | ```{python eval=FALSE}
86 | from pyspark.ml import Pipeline
87 | from pyspark.ml.regression import RandomForestRegressor
88 | from pyspark.ml.feature import VectorIndexer
89 | from pyspark.ml.evaluation import RegressionEvaluator
90 | ```
91 | ### 4. Split the data into training and test sets (30% held out for testing)
92 | ```{python eval=FALSE}
93 | # Split the data into training and test sets (30% held out for testing)
94 | (trainingData, testData) = transformed.randomSplit([0.7, 0.3])
95 | ```
96 | ### 5. Train a RandomForest model
97 | 
98 | ```{python eval=FALSE}
99 | # Train a RandomForest model.
100 | rf = RandomForestRegressor()
101 | model = rf.fit(trainingData)
102 | ```
103 | ### 6. Make predictions
104 | 
105 | ```{python eval=FALSE}
106 | # Make predictions.
107 | predictions = model.transform(testData)
108 | ```
109 | ### 7. Show results
110 | ```{python eval=FALSE}
111 | # Select example rows to display.
112 | predictions.select("prediction", "label", "features").show(5)
113 | ```
114 | 
115 | ```{python eval=FALSE}
116 | #output
117 | +------------------+-----+--------------------+
118 | |        prediction|label|            features|
119 | +------------------+-----+--------------------+
120 | | 6.489667556875804|    7|[4.9,0.42,0.0,2.1...|
121 | | 6.267301910170284|    7|[5.1,0.42,0.0,1.8...|
122 | |6.0526786505470245|    7|[5.1,0.585,0.0,1....|
123 | | 5.257985010985523|    5|[5.2,0.32,0.25,1....|
124 | | 5.943264423589821|    7|[5.2,0.48,0.04,1....|
125 | +------------------+-----+--------------------+
126 | ```
127 | 
128 | ### 8. Model Evaluation
129 | ```{python eval=FALSE}
130 | # Select (prediction, true label) and compute test error
131 | evaluator = RegressionEvaluator(
132 |     labelCol="label", predictionCol="prediction", metricName="rmse")
133 | rmse = evaluator.evaluate(predictions)
134 | print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
135 | ```
136 | 
137 | ```{python eval=FALSE}
138 | Root Mean Squared Error (RMSE) on test data = 0.659148
139 | ```
140 | 
141 | 
142 | 
-------------------------------------------------------------------------------- /legacy/randomforestC.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Random Forest Regression"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "February 19, 2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | ### Remark:
12 | 
13 | - You can download the complete [ipython notebook](./ipynb/RandomForest.ipynb) for this tutorial session.
14 | 
15 | ### 1.
Set up spark context and SparkSession
16 | 
17 | ```{python eval=FALSE}
18 | from pyspark.sql import SparkSession
19 | 
20 | spark = SparkSession \
21 |     .builder \
22 |     .appName("Python Spark Random Forest Regression") \
23 |     .config("spark.some.config.option", "some-value") \
24 |     .getOrCreate()
25 | ```
26 | 
27 | ### 2. Load dataset
28 | ```{python eval=FALSE}
29 | df = spark.read.format('com.databricks.spark.csv').\
30 |     options(header='true', \
31 |     inferschema='true').load("./data/WineData.csv",header=True);
32 | ```
33 | 
34 | ```{python eval=FALSE}
35 | df.printSchema()
36 | ```
37 | ```{python eval=FALSE}
38 | #output
39 | root
40 |  |-- fixed acidity: double (nullable = true)
41 |  |-- volatile acidity: double (nullable = true)
42 |  |-- citric acid: double (nullable = true)
43 |  |-- residual sugar: double (nullable = true)
44 |  |-- chlorides: double (nullable = true)
45 |  |-- free sulfur dioxide: double (nullable = true)
46 |  |-- total sulfur dioxide: double (nullable = true)
47 |  |-- density: double (nullable = true)
48 |  |-- pH: double (nullable = true)
49 |  |-- sulphates: double (nullable = true)
50 |  |-- alcohol: double (nullable = true)
51 |  |-- quality: integer (nullable = true)
52 | ```
53 | 
54 | 
55 | ### 3. Convert the data to dense vectors
56 | ```{python eval=FALSE}
57 | from pyspark.sql import Row
58 | from pyspark.ml.linalg import Vectors
59 | ```
60 | ```{python eval=FALSE}
61 | def transData(data):
62 |     return data.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])
63 | ```
64 | 
65 | ```{python eval=FALSE}
66 | transformed = transData(df)
67 | transformed.show(6)
68 | ```
69 | 
70 | ```{python eval=FALSE}
71 | #output
72 | +--------------------+-----+
73 | |            features|label|
74 | +--------------------+-----+
75 | |[7.4,0.7,0.0,1.9,...|    5|
76 | |[7.8,0.88,0.0,2.6...|    5|
77 | |[7.8,0.76,0.04,2....|    5|
78 | |[11.2,0.28,0.56,1...|    6|
79 | |[7.4,0.7,0.0,1.9,...|    5|
80 | |[7.4,0.66,0.0,1.8...|    5|
81 | +--------------------+-----+
82 | only showing top 6 rows
83 | ```
84 | 
85 | ```{python eval=FALSE}
86 | from pyspark.ml import Pipeline
87 | from pyspark.ml.regression import RandomForestRegressor
88 | from pyspark.ml.feature import VectorIndexer
89 | from pyspark.ml.evaluation import RegressionEvaluator
90 | ```
91 | ### 4. Split the data into training and test sets (30% held out for testing)
92 | ```{python eval=FALSE}
93 | # Split the data into training and test sets (30% held out for testing)
94 | (trainingData, testData) = transformed.randomSplit([0.7, 0.3])
95 | ```
96 | ### 5. Train a RandomForest model
97 | 
98 | ```{python eval=FALSE}
99 | # Train a RandomForest model.
100 | rf = RandomForestRegressor()
101 | model = rf.fit(trainingData)
102 | ```
103 | ### 6. Make predictions
104 | 
105 | ```{python eval=FALSE}
106 | # Make predictions.
107 | predictions = model.transform(testData)
108 | ```
109 | ### 7. Show results
110 | ```{python eval=FALSE}
111 | # Select example rows to display.
112 | predictions.select("prediction", "label", "features").show(5)
113 | ```
114 | 
115 | ```{python eval=FALSE}
116 | #output
117 | +------------------+-----+--------------------+
118 | |        prediction|label|            features|
119 | +------------------+-----+--------------------+
120 | | 6.489667556875804|    7|[4.9,0.42,0.0,2.1...|
121 | | 6.267301910170284|    7|[5.1,0.42,0.0,1.8...|
122 | |6.0526786505470245|    7|[5.1,0.585,0.0,1....|
123 | | 5.257985010985523|    5|[5.2,0.32,0.25,1....|
124 | | 5.943264423589821|    7|[5.2,0.48,0.04,1....|
125 | +------------------+-----+--------------------+
126 | ```
127 | 
128 | ### 8. Model Evaluation
129 | ```{python eval=FALSE}
130 | # Select (prediction, true label) and compute test error
131 | evaluator = RegressionEvaluator(
132 |     labelCol="label", predictionCol="prediction", metricName="rmse")
133 | rmse = evaluator.evaluate(predictions)
134 | print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
135 | ```
136 | 
137 | ```{python eval=FALSE}
138 | Root Mean Squared Error (RMSE) on test data = 0.659148
139 | ```
140 | 
141 | 
142 | 
-------------------------------------------------------------------------------- /legacy/regularization.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Regularization"
3 | author: "Ming Chen"
4 | date: "6/5/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 | 
12 | ## Regularization
13 | 
14 | Regularization is the technique used to solve the overfitting problem. An overfitted model means that the model can predict very well with the training data, but performs poorly on independent validation data.
15 | 
16 | When we add more predictors to our model, we will almost necessarily decrease the **Residual Sum of Squares** (RSS; a smaller RSS indicates a better model). This increases the complexity of our model and makes our model only perform well on the training data (overfitting).
17 | 
18 | To balance the RSS and model overfitting, we introduce a penalty for adding new predictors (coefficients $\beta_j \neq 0$) to the model.
19 | 
20 | ## LASSO regularization and Ridge regularization
21 | 
22 | * **LASSO**: $\min \{RSS + \lambda\sum_{j=1}^{p}|\beta_j|\}$
23 | * **Ridge**: $\min \{RSS + \lambda\sum_{j=1}^{p}\beta_j^2\}$
24 | 
25 | ## Elastic Net regularization
26 | 
27 | Elastic net is a regularization method that linearly combines the penalties of the lasso and ridge methods.
28 | 
29 | * **elastic net**: $\min \{RSS + \lambda_1\sum_{j=1}^{p}|\beta_j| + \lambda_2\sum_{j=1}^{p}\beta_j^2\}$
30 | 
31 | ## *regParam* and *elasticNetParam* parameters in regression models
32 | 
33 | * **regParam**: regularization parameter $\lambda$
34 | * **elasticNetParam**: the ElasticNet mixing parameter $\alpha \in [0, 1]$, which balances the L1 (lasso) and L2 (ridge) penalties
35 | * **Scenarios**:
36 |     + *regParam* = $0$, *elasticNetParam* = $0$: no regularization applied, $\lambda = 0$
37 |     + *regParam* $\neq 0$, *elasticNetParam* = $1$: lasso regularization applied
38 |     + *regParam* $\neq 0$, *elasticNetParam* = $0$: ridge regularization applied
39 |     + *regParam* $\neq 0$, $0 <$ *elasticNetParam* $< 1$: elastic net regularization applied
-------------------------------------------------------------------------------- /legacy/sna.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Social Network Analysis"
3 | author: "Wenqiang Feng"
4 | date: "4/7/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | 
12 | ## R Markdown
13 | 
14 | This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
15 | 
16 | When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
17 | 
18 | ```{r cars}
19 | summary(cars)
20 | ```
21 | 
22 | ## Including Plots
23 | 
24 | You can also embed plots, for example:
25 | 
26 | ```{r pressure, echo=FALSE}
27 | plot(pressure)
28 | ```
29 | 
30 | Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
31 | 
-------------------------------------------------------------------------------- /legacy/spark-on-jetstream-cloud.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Spark on Jetstream Cloud"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "3/8/2017"
5 | output: html_document
6 | ---
7 | 
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 | 
12 | 
13 | ## Set up apache spark on jetstream
14 | 
15 | * Install linuxbrew and spark
16 | 
17 | ```{python}
18 | sudo apt-get install -y ruby
19 | ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Linuxbrew/install/master/install)"
20 | 
21 | echo 'export PATH="/home/mchen33/.linuxbrew/bin:$PATH"' >>~/.bash_profile
22 | echo 'export MANPATH="/home/mchen33/.linuxbrew/share/man:$MANPATH"' >>~/.bash_profile
23 | echo 'export INFOPATH="/home/mchen33/.linuxbrew/share/info:$INFOPATH"' >>~/.bash_profile
24 | 
25 | source ~/.bash_profile
26 | 
27 | sudo apt-get install build-essential
28 | 
29 | brew install apache-spark
30 | 
31 | ## install java
32 | sudo apt-get install -y default-jre
33 | ```
34 | 
35 | 
36 | ```{python}
37 | export SPARK_LOCAL_IP="127.0.0.1"
38 | ```
39 | 
40 | 
41 | 
-------------------------------------------------------------------------------- /link-spark-with-jupyter.md: --------------------------------------------------------------------------------
1 | ## Install jupyter with conda
2 | 
3 | ```
4 | conda install jupyter
5 | ```
6 | 
7 | ## Get the `jupyter` binary executable path
8 | 
9 | ```
10 | which jupyter
11 | ```
12 | 
13 | output
14 | 
15 | ```
16 | /Users/mingchen/anaconda2/bin/jupyter
17 | ```
18 | 
19 | ## Link spark with jupyter
20 | 
21 | ```
22 | export PYSPARK_DRIVER_PYTHON=/Users/mingchen/anaconda2/bin/jupyter
23 | export PYSPARK_DRIVER_PYTHON_OPTS="notebook --NotebookApp.open_browser=False --NotebookApp.ip='*' --NotebookApp.port=8880"
24 | ```
25 | 
26 | You can also add the two environment variables to the `~/.bash_profile` file to permanently link Spark with Jupyter.
27 | 
28 | ## Run jupyter notebook
29 | 
30 | ```
31 | pyspark
32 | ```
33 | 
34 | Then go to [http://127.0.0.1:8880](http://127.0.0.1:8880)
35 | 
-------------------------------------------------------------------------------- /logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/logo.jpg
-------------------------------------------------------------------------------- /notebooks/01-data-strcture/.gitignore: --------------------------------------------------------------------------------
1 | spark-warehouse
2 | 
-------------------------------------------------------------------------------- /notebooks/02-data-manipulation/.ipynb_checkpoints/2.3-continuous-variable-to-categorical-variable-checkpoint.ipynb: --------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 1,
6 |    "metadata": {},
7 |    "outputs": [],
8 |    "source": [
9 |     "# create entry points to spark\n",
10 |     "try:\n",
11 |     "    sc.stop()\n",
12 |     "except:\n",
13 |     "    pass\n",
14 |     "from pyspark import SparkContext, SparkConf\n",
15 |     "from pyspark.sql import SparkSession\n",
16 |     "sc=SparkContext()\n",
17 |     "spark = SparkSession(sparkContext=sc)"
18 |    ]
19 |   },
20 |   {
21 |    "cell_type": "markdown",
22 |    "metadata": {},
23 |    "source": [
24 |     "# Convert continuous 
variables to categorical variables" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "There are two functions we can use to split a continuous variable into categories:\n", 32 | "\n", 33 | "* `pyspark.ml.feature.Binarizer`: split a column of continuous features given a threshold\n", 34 | "* `pyspark.ml.feature.Bucktizer`: split a column of continuous features into categories given several breaking points.\n", 35 | " + with n+1 split points, there are n categories (buckets).\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Create some data" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "+--------------------+------------------+\n", 55 | "| x1| x2|\n", 56 | "+--------------------+------------------+\n", 57 | "| 0.47143516373249306| 6.834629351721363|\n", 58 | "| -1.1909756947064645| 7.127020269829002|\n", 59 | "| 1.4327069684260973|3.7025075479039495|\n", 60 | "| -0.3126518960917129| 5.611961860656249|\n", 61 | "| -0.7205887333650116| 5.030831653078097|\n", 62 | "| 0.8871629403077386|0.1376844959068224|\n", 63 | "| 0.8595884137174165| 7.728266216123741|\n", 64 | "| -0.6365235044173491| 8.826411906361166|\n", 65 | "|0.015696372114428918| 3.648859839013723|\n", 66 | "| -2.2426849541854055| 6.153961784334937|\n", 67 | "+--------------------+------------------+\n", 68 | "\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "import numpy as np\n", 74 | "import pandas as pd\n", 75 | "np.random.seed(seed=1234)\n", 76 | "pdf = pd.DataFrame({\n", 77 | " 'x1': np.random.randn(10),\n", 78 | " 'x2': np.random.rand(10)*10\n", 79 | " })\n", 80 | "np.random.seed(seed=None)\n", 81 | "df = spark.createDataFrame(pdf)\n", 82 | "df.show()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Binarize the column x1 and Bucketize the column x2" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 3, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "+--------------------+------------------+------+------+\n", 102 | "| x1| x2|x1_new|x2_new|\n", 103 | "+--------------------+------------------+------+------+\n", 104 | "| 0.47143516373249306| 6.834629351721363| 1.0| 2.0|\n", 105 | "| -1.1909756947064645| 7.127020269829002| 0.0| 2.0|\n", 106 | "| 1.4327069684260973|3.7025075479039495| 1.0| 1.0|\n", 107 | "| -0.3126518960917129| 5.611961860656249| 0.0| 2.0|\n", 108 | "| -0.7205887333650116| 5.030831653078097| 0.0| 2.0|\n", 109 | "| 0.8871629403077386|0.1376844959068224| 1.0| 0.0|\n", 110 | "| 0.8595884137174165| 7.728266216123741| 1.0| 3.0|\n", 111 | "| -0.6365235044173491| 8.826411906361166| 0.0| 3.0|\n", 112 | "|0.015696372114428918| 3.648859839013723| 1.0| 1.0|\n", 113 | "| -2.2426849541854055| 6.153961784334937| 0.0| 2.0|\n", 114 | "+--------------------+------------------+------+------+\n", 115 | "\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "from pyspark.ml.feature import Binarizer, Bucketizer\n", 121 | "# threshold = 0 for binarizer\n", 122 | "binarizer = Binarizer(threshold=0, inputCol='x1', outputCol='x1_new')\n", 123 | "# provide 5 split points to generate 4 buckets\n", 124 | "bucketizer = Bucketizer(splits=[0, 2.5, 5, 7.5, 10], inputCol='x2', outputCol='x2_new')\n", 125 | "\n", 126 | "# pipeline stages\n", 127 | "from pyspark.ml 
import Pipeline\n", 128 | "stages = [binarizer, bucketizer]\n", 129 | "pipeline = Pipeline(stages=stages)\n", 130 | "\n", 131 | "# fit the pipeline model and transform the data\n", 132 | "pipeline.fit(df).transform(df).show()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 3", 148 | "language": "python", 149 | "name": "python3" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.6.5" 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 2 166 | } 167 | -------------------------------------------------------------------------------- /notebooks/02-data-manipulation/2.3-continuous-variable-to-categorical-variable.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# create entry points to spark\n", 10 | "try:\n", 11 | " sc.stop()\n", 12 | "except:\n", 13 | " pass\n", 14 | "from pyspark import SparkContext, SparkConf\n", 15 | "from pyspark.sql import SparkSession\n", 16 | "sc=SparkContext()\n", 17 | "spark = SparkSession(sparkContext=sc)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Convert continuous variables to categorical variables" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "There are two functions we can use to split a continuous variable into categories:\n", 32 | "\n", 33 | "* `pyspark.ml.feature.Binarizer`: split a column of continuous features given a threshold\n", 34 | "* `pyspark.ml.feature.Bucktizer`: split a column of continuous features into categories given several breaking points.\n", 35 | " + with n+1 split points, there are n categories (buckets).\n" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "## Create some data" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "+--------------------+------------------+\n", 55 | "| x1| x2|\n", 56 | "+--------------------+------------------+\n", 57 | "| 0.47143516373249306| 6.834629351721363|\n", 58 | "| -1.1909756947064645| 7.127020269829002|\n", 59 | "| 1.4327069684260973|3.7025075479039495|\n", 60 | "| -0.3126518960917129| 5.611961860656249|\n", 61 | "| -0.7205887333650116| 5.030831653078097|\n", 62 | "| 0.8871629403077386|0.1376844959068224|\n", 63 | "| 0.8595884137174165| 7.728266216123741|\n", 64 | "| -0.6365235044173491| 8.826411906361166|\n", 65 | "|0.015696372114428918| 3.648859839013723|\n", 66 | "| -2.2426849541854055| 6.153961784334937|\n", 67 | "+--------------------+------------------+\n", 68 | "\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "import numpy as np\n", 74 | "import pandas as pd\n", 75 | "np.random.seed(seed=1234)\n", 76 | "pdf = pd.DataFrame({\n", 77 | " 'x1': np.random.randn(10),\n", 78 | " 'x2': np.random.rand(10)*10\n", 79 | " })\n", 80 | "np.random.seed(seed=None)\n", 81 | "df = 
spark.createDataFrame(pdf)\n", 82 | "df.show()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "## Binarize the column x1 and Bucketize the column x2" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 3, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "+--------------------+------------------+------+------+\n", 102 | "| x1| x2|x1_new|x2_new|\n", 103 | "+--------------------+------------------+------+------+\n", 104 | "| 0.47143516373249306| 6.834629351721363| 1.0| 2.0|\n", 105 | "| -1.1909756947064645| 7.127020269829002| 0.0| 2.0|\n", 106 | "| 1.4327069684260973|3.7025075479039495| 1.0| 1.0|\n", 107 | "| -0.3126518960917129| 5.611961860656249| 0.0| 2.0|\n", 108 | "| -0.7205887333650116| 5.030831653078097| 0.0| 2.0|\n", 109 | "| 0.8871629403077386|0.1376844959068224| 1.0| 0.0|\n", 110 | "| 0.8595884137174165| 7.728266216123741| 1.0| 3.0|\n", 111 | "| -0.6365235044173491| 8.826411906361166| 0.0| 3.0|\n", 112 | "|0.015696372114428918| 3.648859839013723| 1.0| 1.0|\n", 113 | "| -2.2426849541854055| 6.153961784334937| 0.0| 2.0|\n", 114 | "+--------------------+------------------+------+------+\n", 115 | "\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "from pyspark.ml.feature import Binarizer, Bucketizer\n", 121 | "# threshold = 0 for binarizer\n", 122 | "binarizer = Binarizer(threshold=0, inputCol='x1', outputCol='x1_new')\n", 123 | "# provide 5 split points to generate 4 buckets\n", 124 | "bucketizer = Bucketizer(splits=[0, 2.5, 5, 7.5, 10], inputCol='x2', outputCol='x2_new')\n", 125 | "\n", 126 | "# pipeline stages\n", 127 | "from pyspark.ml import Pipeline\n", 128 | "stages = [binarizer, bucketizer]\n", 129 | "pipeline = Pipeline(stages=stages)\n", 130 | "\n", 131 | "# fit the pipeline model and transform the data\n", 132 | "pipeline.fit(df).transform(df).show()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": { 139 | "collapsed": true 140 | }, 141 | "outputs": [], 142 | "source": [] 143 | } 144 | ], 145 | "metadata": { 146 | "kernelspec": { 147 | "display_name": "Python 3", 148 | "language": "python", 149 | "name": "python3" 150 | }, 151 | "language_info": { 152 | "codemirror_mode": { 153 | "name": "ipython", 154 | "version": 3 155 | }, 156 | "file_extension": ".py", 157 | "mimetype": "text/x-python", 158 | "name": "python", 159 | "nbconvert_exporter": "python", 160 | "pygments_lexer": "ipython3", 161 | "version": "3.6.5" 162 | } 163 | }, 164 | "nbformat": 4, 165 | "nbformat_minor": 2 166 | } 167 | -------------------------------------------------------------------------------- /notebooks/02-data-manipulation/2.7.1-column-expression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# create entry points to spark\n", 10 | "try:\n", 11 | " sc.stop()\n", 12 | "except:\n", 13 | " pass\n", 14 | "from pyspark import SparkContext, SparkConf\n", 15 | "from pyspark.sql import SparkSession\n", 16 | "sc=SparkContext()\n", 17 | "spark = SparkSession(sparkContext=sc)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Column expression\n", 25 | "\n", 26 | "A Spark **column instance** is **NOT a column of values** from the **DataFrame**: when you crate a column instance, it does not give 
you the actual values of that column in the DataFrame. I find it makes more sense to think of a **column instance as a column of expressions**. These expressions are evaluated by other methods (e.g., the **select()**, **groupBy()**, and **orderBy()** methods of **pyspark.sql.DataFrame**)." 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Example data" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n", 46 | "| model| mpg|cyl| disp| hp|drat| wt| qsec| vs| am|gear|carb|\n", 47 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n", 48 | "| Mazda RX4|21.0| 6|160.0|110| 3.9| 2.62|16.46| 0| 1| 4| 4|\n", 49 | "| Mazda RX4 Wag|21.0| 6|160.0|110| 3.9|2.875|17.02| 0| 1| 4| 4|\n", 50 | "| Datsun 710|22.8| 4|108.0| 93|3.85| 2.32|18.61| 1| 1| 4| 1|\n", 51 | "| Hornet 4 Drive|21.4| 6|258.0|110|3.08|3.215|19.44| 1| 0| 3| 1|\n", 52 | "|Hornet Sportabout|18.7| 8|360.0|175|3.15| 3.44|17.02| 0| 0| 3| 2|\n", 53 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n", 54 | "only showing top 5 rows\n", 55 | "\n" 56 | ] 57 | } 58 | ], 59 | "source": [ 60 | "mtcars = spark.read.csv('../../data/mtcars.csv', inferSchema=True, header=True)\n", 61 | "mtcars = mtcars.withColumnRenamed('_c0', 'model')\n", 62 | "mtcars.show(5)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Use dot (.) to select column from DataFrame" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "Column" 81 | ] 82 | }, 83 | "execution_count": 4, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "mpg_col = mtcars.mpg\n", 90 | "mpg_col" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "## Modify a column to generate a new column" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 5, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "Column" 109 | ] 110 | }, 111 | "execution_count": 5, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "mpg_col + 1" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "+-----------+\n", 130 | "|(mpg * 100)|\n", 131 | "+-----------+\n", 132 | "| 2100.0|\n", 133 | "| 2100.0|\n", 134 | "| 2280.0|\n", 135 | "| 2140.0|\n", 136 | "| 1870.0|\n", 137 | "+-----------+\n", 138 | "only showing top 5 rows\n", 139 | "\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "mtcars.select(mpg_col * 100).show(5)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "The **pyspark.sql.Column** class has many methods that act on a column and return a new column instance."
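A quick sketch of the idea (it reuses the `mtcars` DataFrame loaded above; the `hp_per_wt` name is purely illustrative): building an expression produces a `Column` object but computes nothing, and the work only happens when a DataFrame method such as `select()` or `orderBy()` evaluates it.

```python
# Building expressions yields Column objects, not data:
ratio = (mtcars.hp / mtcars.wt).alias('hp_per_wt')  # still just a Column
flag = mtcars.cyl.isin([4, 6])                      # also a Column

# The expressions are evaluated only when a DataFrame method consumes them:
mtcars.select('model', ratio, flag).show(5)
mtcars.orderBy(mtcars.mpg.desc()).show(5)
```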
152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 7, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "+----------------+\n", 164 | "|(gear IN (2, 3))|\n", 165 | "+----------------+\n", 166 | "| false|\n", 167 | "| false|\n", 168 | "| false|\n", 169 | "| true|\n", 170 | "| true|\n", 171 | "+----------------+\n", 172 | "only showing top 5 rows\n", 173 | "\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "mtcars.select(mtcars.gear.isin([2,3])).show(5)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 8, 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "Column" 190 | ] 191 | }, 192 | "execution_count": 8, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "mtcars.mpg.asc()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [] 209 | } 210 | ], 211 | "metadata": { 212 | "kernelspec": { 213 | "display_name": "Python 3", 214 | "language": "python", 215 | "name": "python3" 216 | }, 217 | "language_info": { 218 | "codemirror_mode": { 219 | "name": "ipython", 220 | "version": 3 221 | }, 222 | "file_extension": ".py", 223 | "mimetype": "text/x-python", 224 | "name": "python", 225 | "nbconvert_exporter": "python", 226 | "pygments_lexer": "ipython3", 227 | "version": "3.6.5" 228 | } 229 | }, 230 | "nbformat": 4, 231 | "nbformat_minor": 2 232 | } 233 | -------------------------------------------------------------------------------- /notebooks/02-data-manipulation/2.7.2-dot-column-expression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# create entry points to spark\n", 12 | "try:\n", 13 | " sc.stop()\n", 14 | "except:\n", 15 | " pass\n", 16 | "from pyspark import SparkContext, SparkConf\n", 17 | "from pyspark.sql import SparkSession\n", 18 | "sc=SparkContext()\n", 19 | "spark = SparkSession(sparkContext=sc)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "## Example data" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": {}, 33 | "outputs": [ 34 | { 35 | "name": "stdout", 36 | "output_type": "stream", 37 | "text": [ 38 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n", 39 | "| model| mpg|cyl| disp| hp|drat| wt| qsec| vs| am|gear|carb|\n", 40 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n", 41 | "| Mazda RX4|21.0| 6|160.0|110| 3.9| 2.62|16.46| 0| 1| 4| 4|\n", 42 | "| Mazda RX4 Wag|21.0| 6|160.0|110| 3.9|2.875|17.02| 0| 1| 4| 4|\n", 43 | "| Datsun 710|22.8| 4|108.0| 93|3.85| 2.32|18.61| 1| 1| 4| 1|\n", 44 | "| Hornet 4 Drive|21.4| 6|258.0|110|3.08|3.215|19.44| 1| 0| 3| 1|\n", 45 | "|Hornet Sportabout|18.7| 8|360.0|175|3.15| 3.44|17.02| 0| 0| 3| 2|\n", 46 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n", 47 | "only showing top 5 rows\n", 48 | "\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "mtcars = spark.read.csv('../../../data/mtcars.csv', inferSchema=True, header=True)\n", 54 | "mtcars = mtcars.withColumnRenamed('_c0', 'model')\n", 55 | "mtcars.show(5)" 56 | ] 57 | }, 58 
| { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Dot (.) column expression\n", 63 | "\n", 64 | "Create a column expression that will return the original column values." 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 3, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/plain": [ 75 | "Column" 76 | ] 77 | }, 78 | "execution_count": 3, 79 | "metadata": {}, 80 | "output_type": "execute_result" 81 | } 82 | ], 83 | "source": [ 84 | "mpg_col_exp = mtcars.mpg\n", 85 | "mpg_col_exp" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "name": "stdout", 95 | "output_type": "stream", 96 | "text": [ 97 | "+----+\n", 98 | "| mpg|\n", 99 | "+----+\n", 100 | "|21.0|\n", 101 | "|21.0|\n", 102 | "|22.8|\n", 103 | "|21.4|\n", 104 | "|18.7|\n", 105 | "+----+\n", 106 | "only showing top 5 rows\n", 107 | "\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "mtcars.select(mpg_col_exp).show(5)" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.5.0" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 2 146 | } 147 | -------------------------------------------------------------------------------- /notebooks/04-miscellaneous/add-python-files-to-spark-cluster.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "The `SparkContext.addPyFile()` function can be used to add Python (.py) files. We can define objects and variables in these files and make them available to the Spark cluster."
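As a minimal sketch of why this matters (the module and function names mirror the repo's `pyFiles/my_module.py`; the RDD is made up for illustration), a file shipped with `addPyFile()` becomes importable inside functions that run on the executors, not just on the driver:

```python
# assumes sc.addPyFile('pyFiles/my_module.py') has already been called
from my_module import sum_two_variables

rdd = sc.parallelize([(1, 2), (3, 4)])
# the lambda executes on the workers, where my_module is now importable
rdd.map(lambda pair: sum_two_variables(*pair)).collect()  # [3, 7]
```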
8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Create a SparkContext object" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from pyspark import SparkConf, SparkContext, SparkFiles\n", 26 | "from pyspark.sql import SparkSession" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "sc = SparkContext(conf=SparkConf())" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "# Add py files" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": true 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "sc.addPyFile('pyFiles/my_module.py')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "'/private/var/folders/2_/kb60z5_j0k91tyh740s1zhn40000gn/T/spark-4f959e9f-4af6-490e-afce-02e1582aae8d/userFiles-8b1c073b-4c82-467a-b9ff-021aa3067abe/my_module.py'" 67 | ] 68 | }, 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "SparkFiles.get('my_module.py')" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "# Use **my_module.py**\n", 83 | "We can import `my_module` as a python module" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": { 90 | "collapsed": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "from my_module import *" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 6, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "True" 106 | ] 107 | }, 108 | "execution_count": 6, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "addPyFiles_is_successfull()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 7, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "data": { 124 | "text/plain": [ 125 | "9" 126 | ] 127 | }, 128 | "execution_count": 7, 129 | "metadata": {}, 130 | "output_type": "execute_result" 131 | } 132 | ], 133 | "source": [ 134 | "sum_two_variables(4,5)" 135 | ] 136 | } 137 | ], 138 | "metadata": { 139 | "kernelspec": { 140 | "display_name": "Python 3", 141 | "language": "python", 142 | "name": "python3" 143 | }, 144 | "language_info": { 145 | "codemirror_mode": { 146 | "name": "ipython", 147 | "version": 3 148 | }, 149 | "file_extension": ".py", 150 | "mimetype": "text/x-python", 151 | "name": "python", 152 | "nbconvert_exporter": "python", 153 | "pygments_lexer": "ipython3", 154 | "version": "3.5.0" 155 | } 156 | }, 157 | "nbformat": 4, 158 | "nbformat_minor": 2 159 | } 160 | -------------------------------------------------------------------------------- /notebooks/04-miscellaneous/dense-vs-sparse-vectors.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from pyspark import SparkConf, SparkContext\n", 12 | "from pyspark.sql import SparkSession" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | 
"execution_count": 2, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "sc = SparkContext(conf=SparkConf())\n", 24 | "spark = SparkSession(sparkContext=sc)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "from pyspark.ml.linalg import Vector, DenseVector, SparseVector" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": {}, 41 | "source": [ 42 | "# Dense vector and sparse vector" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "A vector can be represented in dense and sparse formats. A dense vector is a regular vector that has each elements printed. A sparse vector use three components to represent a vector but with less memory." 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 22, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "DenseVector([1.0, 0.0, 0.0, 0.0, 4.5, 0.0])" 61 | ] 62 | }, 63 | "execution_count": 22, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "dv = DenseVector([1.0,0.,0.,0.,4.5,0])\n", 70 | "dv" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## Three components of a sparse vector\n", 78 | "\n", 79 | "* vector size\n", 80 | "* indices of active elements\n", 81 | "* values of active elements\n", 82 | "\n", 83 | "In the above dense vector:\n", 84 | "\n", 85 | "* vector size = 6\n", 86 | "* indices of active elements = [0, 4]\n", 87 | "* values of active elements = [1.0, 4.5]" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "We can use the `SparseVector()` function to create a sparse vector. The first argument is the vector size, the second\n", 95 | "argument is a dictionary. The keys are indices of active elements and the values are values of active elements." 
96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 23, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "SparseVector(6, {0: 1.0, 4: 4.5})" 107 | ] 108 | }, 109 | "execution_count": 23, 110 | "metadata": {}, 111 | "output_type": "execute_result" 112 | } 113 | ], 114 | "source": [ 115 | "sv = SparseVector(6, {0:1.0, 4:4.5})\n", 116 | "sv" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## Convert sparse vector to dense vector" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 30, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/plain": [ 134 | "DenseVector([1.0, 0.0, 0.0, 0.0, 4.5, 0.0])" 135 | ] 136 | }, 137 | "execution_count": 30, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "DenseVector(sv.toArray())" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Convert dense vector to sparse vector" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 33, 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "{0: 1.0, 4: 4.5}" 162 | ] 163 | }, 164 | "execution_count": 33, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "active_elements_dict = {index: value for index, value in enumerate(dv) if value != 0}\n", 171 | "active_elements_dict" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 34, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "SparseVector(6, {0: 1.0, 4: 4.5})" 183 | ] 184 | }, 185 | "execution_count": 34, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "SparseVector(len(dv), active_elements_dict)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [] 202 | } 203 | ], 204 | "metadata": { 205 | "kernelspec": { 206 | "display_name": "Python 3", 207 | "language": "python", 208 | "name": "python3" 209 | }, 210 | "language_info": { 211 | "codemirror_mode": { 212 | "name": "ipython", 213 | "version": 3 214 | }, 215 | "file_extension": ".py", 216 | "mimetype": "text/x-python", 217 | "name": "python", 218 | "nbconvert_exporter": "python", 219 | "pygments_lexer": "ipython3", 220 | "version": "3.5.0" 221 | } 222 | }, 223 | "nbformat": 4, 224 | "nbformat_minor": 2 225 | } 226 | -------------------------------------------------------------------------------- /notebooks/04-miscellaneous/issues-and-solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Issues and Solutions" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "**load pyspark environment permission denied**\n", 15 | "\n", 16 | "This issue might be caused by a recent macOS update to Sierra 10.12.5."
17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "```\n", 24 | "\n", 25 | "```" 26 | ] 27 | } 28 | ], 29 | "metadata": { 30 | "kernelspec": { 31 | "display_name": "Python 3", 32 | "language": "python", 33 | "name": "python3" 34 | }, 35 | "language_info": { 36 | "codemirror_mode": { 37 | "name": "ipython", 38 | "version": 3 39 | }, 40 | "file_extension": ".py", 41 | "mimetype": "text/x-python", 42 | "name": "python", 43 | "nbconvert_exporter": "python", 44 | "pygments_lexer": "ipython3", 45 | "version": "3.6.1" 46 | } 47 | }, 48 | "nbformat": 4, 49 | "nbformat_minor": 2 50 | } 51 | -------------------------------------------------------------------------------- /notebooks/05-module-turning/cross-validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Cross-validation\n", 8 | "---" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "## Training/validation/test data sets\n", 16 | "\n", 17 | "* **Training set**: the data set for training your models.\n", 18 | "* **Validation set**: the data set used for testing the performance of the models you have built with the training set. Based on the performance, you choose the best (final) model.\n", 19 | "* **Test set**: use this data set to test the performance of your final model.\n", 20 | "\n", 21 | "## K-folds cross validation steps (k=4 as an example).\n", 22 | "\n", 23 | "* step 1: split your data into training set and test set (for example 80% training and 20% test). The test set will never be used in model training and selection. \n", 24 | "* step 2: split the training set into k (k=4) equal subsets: 3 subsets for training + 1 subset for validation.\n", 25 | "* step 3: train your models with the 3 subsets and calculate a performance score with the remaining 1 subset.\n", 26 | "* step 4: choose a different subset for validation and then repeat step 3 until every subset has been used as a validation subset.\n", 27 | "* step 5: for a k=4 fold cross validation, each trained model should have been validated by 4 subsets and therefore has 4 performance scores. Calculate the average of these 4 performance scores for each model. 
Use the average score to select the best, final model.\n", 28 | "* step 6: apply your final model to the **untouched** test data and see how it performs.\n", 29 | "\n", 30 | "## Example of k-folds cross validation\n", 31 | "\n", 32 | "### Build parameter grids\n", 33 | "\n", 34 | "* parameter grid: all combinations of the tunable parameter values in your model.\n", 35 | "* example: If I want to train a logistic regression model on 4 different *regParam* and 3 different *elasticNetParam*, I will have 3 x 4 = 12 models to train and validate.\n", 36 | " " 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": { 43 | "collapsed": true 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "from pyspark.ml.classification import LogisticRegression\n", 48 | "blor = LogisticRegression(featuresCol='indexed_features', labelCol='label', family='binomial')\n", 49 | "\n", 50 | "from pyspark.ml.tuning import ParamGridBuilder\n", 51 | "param_grid = ParamGridBuilder().\\\n", 52 | " addGrid(blor.regParam, [0, 0.5, 1, 2]).\\\n", 53 | " addGrid(blor.elasticNetParam, [0, 0.5, 1]).\\\n", 54 | " build()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "### Split data into training and test sets\n", 62 | "* Refer to the [logistic regression page](logistic-regression.ipynb) to see what data we used and how the training and test sets were generated.\n", 63 | "\n", 64 | "### Run k (k=4) folds cross validation" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": true 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", 76 | "evaluator = BinaryClassificationEvaluator()\n", 77 | "\n", 78 | "from pyspark.ml.tuning import CrossValidator\n", 79 | "cv = CrossValidator(estimator=blor, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)\n", 80 | "\n", 81 | "cvModel = cv.fit(training)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [] 92 | } 93 | ], 94 | "metadata": { 95 | "kernelspec": { 96 | "display_name": "Python 3", 97 | "language": "python", 98 | "name": "python3" 99 | }, 100 | "language_info": { 101 | "codemirror_mode": { 102 | "name": "ipython", 103 | "version": 3 104 | }, 105 | "file_extension": ".py", 106 | "mimetype": "text/x-python", 107 | "name": "python", 108 | "nbconvert_exporter": "python", 109 | "pygments_lexer": "ipython3", 110 | "version": "3.6.1" 111 | } 112 | }, 113 | "nbformat": 4, 114 | "nbformat_minor": 2 115 | } 116 | -------------------------------------------------------------------------------- /notebooks/05-module-turning/regularization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Regularization\n", 8 | "---" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "## Regularization\n", 16 | "\n", 17 | "Regularization is a technique used to solve the overfitting problem. An overfitted model means that the model can predict very well with the training data, but performs poorly with independent validation data.\n", 18 | "\n", 19 | "When we add more predictors to our model, we will almost necessarily decrease the **Residual Sum of Squares** (RSS; smaller RSS indicates better model). 
This increases the complexity of our model and makes our model only perform well on the training data (overfitting).\n", 20 | "\n", 21 | "To balance the RSS and model overfitting, we introduce a penalty for adding new predictors (coefficients $\beta_j \neq 0$) to the model.\n", 22 | "\n", 23 | "\n", 24 | "\n", 25 | "\n", 26 | "\n", 27 | "\n", 28 | "\n", 29 | "\n", 30 | "## LASSO regularization and Ridge regularization\n", 31 | "\n", 32 | "* **LASSO**: $min \{RSS + \lambda\sum_{j=1}^{p}|\beta_j|\}$\n", 33 | "* **Ridge**: $min \{RSS + \lambda\sum_{j=1}^{p}\beta_j^2\}$\n", 34 | "\n", 35 | "## Elastic Net regularization\n", 36 | "\n", 37 | "Elastic net is a regularization method that linearly combines the penalties of the lasso and ridge methods.\n", 38 | "\n", 39 | "* **elastic net**: $min \{RSS + \lambda[\frac{1-\alpha}{2}\sum_{j=1}^{p}\beta_j^2 + \alpha\sum_{j=1}^{p}|\beta_j|]\}$\n", 40 | "\n", 41 | "Reference: https://spark.apache.org/docs/2.1.1/ml-classification-regression.html\n", 42 | "\n", 43 | "## *regParam* and *elasticNetParam* parameters in regression models\n", 44 | "\n", 45 | "* **regParam**: corresponds to $\lambda$\n", 46 | "* **elasticNetParam** corresponds to $\alpha$. When $\alpha = 0$, it is ridge regularization (L2 penalty). When $\alpha = 1$, it is lasso regularization (L1 penalty)." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "Python 3", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.6.1" 76 | } 77 | }, 78 | "nbformat": 4, 79 | "nbformat_minor": 2 80 | } 81 | -------------------------------------------------------------------------------- /notebooks/ipynb/.ipynb_checkpoints/HashingTF-and-CountVectorizer-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/ipynb/.ipynb_checkpoints/NaiveBayes-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/ipynb/.ipynb_checkpoints/RDD-manipulation-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/ipynb/derby.log: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------- 2 | Wed Mar 22 19:59:25 EDT 2017: 3 | Booting Derby version The Apache Software Foundation - Apache Derby - 10.12.1.1 - (1704137): instance a816c00e-015a-f875-fa3e-0000108dd888 4 | on database directory /Users/mingchen/GoogleDrive/R-projects/learning-apache-spark/ipynb/metastore_db with class
loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@3bcf99f7 5 | Loaded from file:/usr/local/Cellar/apache-spark/2.1.0/libexec/jars/derby-10.12.1.1.jar 6 | java.vendor=Oracle Corporation 7 | java.runtime.version=1.8.0_51-b16 8 | user.dir=/Users/mingchen/GoogleDrive/R-projects/learning-apache-spark/ipynb 9 | os.name=Mac OS X 10 | os.arch=x86_64 11 | os.version=10.12.3 12 | derby.system.home=null 13 | Database Class Loader started - derby.database.classpath='' 14 | -------------------------------------------------------------------------------- /notebooks/ipynb/preproc.py: -------------------------------------------------------------------------------- 1 | from nltk.stem.wordnet import WordNetLemmatizer 2 | from nltk.corpus import stopwords 3 | from nltk import pos_tag 4 | import string 5 | import re 6 | import langid 7 | 8 | # Convert to float format 9 | def string_to_float(x): 10 | return float(x) 11 | 12 | # Use langid module to classify the language to make sure we are applying the correct cleanup actions for English 13 | # https://github.com/saffsd/langid.py 14 | def check_lang(data_str): 15 | predict_lang = langid.classify(data_str) 16 | if predict_lang[1] >= .9: 17 | language = predict_lang[0] 18 | else: 19 | language = 'NA' 20 | return language 21 | 22 | 23 | # Stop words usually refer to the most common words in a language; there is no single universal list of stop words used 24 | # by all natural language processing tools. 25 | # Reduces Dimensionality 26 | # removes stop words of a single Tweet (cleaned_str/row/document) 27 | def remove_stops(data_str): 28 | # expects a string 29 | stops = set(stopwords.words("english")) 30 | list_pos = 0 31 | cleaned_str = '' 32 | text = data_str.split() 33 | for word in text: 34 | if word not in stops: 35 | # rebuild cleaned_str 36 | if list_pos == 0: 37 | cleaned_str = word 38 | else: 39 | cleaned_str = cleaned_str + ' ' + word 40 | list_pos += 1 41 | return cleaned_str 42 | 43 | 44 | # catch-all to remove other 'words' that I felt didn't add a lot of value 45 | # Reduces Dimensionality, gets rid of a lot of unique urls 46 | def remove_features(data_str): 47 | # compile regex 48 | url_re = re.compile('https?://(www.)?\w+\.\w+(/\w+)*/?') 49 | punc_re = re.compile('[%s]' % re.escape(string.punctuation)) 50 | num_re = re.compile('(\\d+)') 51 | mention_re = re.compile('@(\w+)') 52 | alpha_num_re = re.compile("^[a-z0-9_.]+$") 53 | # convert to lowercase 54 | data_str = data_str.lower() 55 | # remove hyperlinks 56 | data_str = url_re.sub(' ', data_str) 57 | # remove @mentions 58 | data_str = mention_re.sub(' ', data_str) 59 | # remove punctuation 60 | data_str = punc_re.sub(' ', data_str) 61 | # remove numeric 'words' 62 | data_str = num_re.sub(' ', data_str) 63 | # remove non a-z 0-9 characters and words shorter than 3 characters 64 | list_pos = 0 65 | cleaned_str = '' 66 | for word in data_str.split(): 67 | if list_pos == 0: 68 | if alpha_num_re.match(word) and len(word) > 2: 69 | cleaned_str = word 70 | else: 71 | cleaned_str = ' ' 72 | else: 73 | if alpha_num_re.match(word) and len(word) > 2: 74 | cleaned_str = cleaned_str + ' ' + word 75 | else: 76 | cleaned_str += ' ' 77 | list_pos += 1 78 | return cleaned_str 79 | 80 | 81 | # Process of classifying words into their parts of speech and labeling them accordingly is known as part-of-speech 82 | # tagging, POS-tagging, or simply tagging. Parts of speech are also known as word classes or lexical categories. 
The 83 | # collection of tags used for a particular task is known as a tagset. Our emphasis in this chapter is on exploiting 84 | # tags, and tagging text automatically. 85 | # http://www.nltk.org/book/ch05.html 86 | def tag_and_remove(data_str): 87 | cleaned_str = ' ' 88 | # noun tags 89 | nn_tags = ['NN', 'NNP', 'NNP', 'NNPS', 'NNS'] 90 | # adjectives 91 | jj_tags = ['JJ', 'JJR', 'JJS'] 92 | # verbs 93 | vb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'] 94 | nltk_tags = nn_tags + jj_tags + vb_tags 95 | 96 | # break string into 'words' 97 | text = data_str.split() 98 | 99 | # tag the text and keep only those with the right tags 100 | tagged_text = pos_tag(text) 101 | for tagged_word in tagged_text: 102 | if tagged_word[1] in nltk_tags: 103 | cleaned_str += tagged_word[0] + ' ' 104 | 105 | return cleaned_str 106 | 107 | 108 | # Tweets are going to use different forms of a word, such as organize, organizes, and 109 | # organizing. Additionally, there are families of derivationally related words with similar meanings, such as democracy, 110 | # democratic, and democratization. In many situations, it seems as if it would be useful for a search for one of these 111 | # words to return documents that contain another word in the set. 112 | # Reduces Dimensionality and boosts numerical measures like TFIDF 113 | 114 | # http://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html 115 | # lemmatization of a single Tweet (cleaned_str/row/document) 116 | def lemmatize(data_str): 117 | # expects a string 118 | list_pos = 0 119 | cleaned_str = '' 120 | lmtzr = WordNetLemmatizer() 121 | text = data_str.split() 122 | tagged_words = pos_tag(text) 123 | for word in tagged_words: 124 | if 'v' in word[1].lower(): 125 | lemma = lmtzr.lemmatize(word[0], pos='v') 126 | else: 127 | lemma = lmtzr.lemmatize(word[0], pos='n') 128 | if list_pos == 0: 129 | cleaned_str = lemma 130 | else: 131 | cleaned_str = cleaned_str + ' ' + lemma 132 | list_pos += 1 133 | return cleaned_str 134 | 135 | 136 | # check to see if a row only contains whitespace 137 | def check_blanks(data_str): 138 | is_blank = str(data_str.isspace()) 139 | return is_blank 140 | -------------------------------------------------------------------------------- /notebooks/ipynb/vector.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "from pyspark import SparkConf, SparkContext\n", 12 | "## set up spark context\n", 13 | "from pyspark.sql import SQLContext\n", 14 | "sc = SparkContext()\n", 15 | "sqlContext = SQLContext(sc)\n", 16 | "## set up SparkSession\n", 17 | "from pyspark.sql import SparkSession\n", 18 | "\n", 19 | "spark = SparkSession \\\n", 20 | " .builder \\\n", 21 | " .appName(\"Python Spark SQL basic example\") \\\n", 22 | " .config(\"spark.some.config.option\", \"some-value\") \\\n", 23 | " .getOrCreate()" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "from pyspark.ml.linalg import Vectors\n", 35 | "densevector = Vectors.dense([1,3,4,2.5])" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": { 42 | "collapsed": false 43 | }, 44 | "outputs": [ 45 | { 46 | "data": { 47 | "text/plain": [ 48 | "DenseVector([1.0, 3.0, 4.0, 2.5])" 49 | ] 50 | }, 51 | 
"execution_count": 3, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "densevector" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": { 64 | "collapsed": false 65 | }, 66 | "outputs": [ 67 | { 68 | "data": { 69 | "text/plain": [ 70 | "array([ 1. , 3. , 4. , 2.5])" 71 | ] 72 | }, 73 | "execution_count": 4, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "densevector.toArray()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 5, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "sparseVector = Vectors.sparse(10, [1, 3], [3.0, 4.5])" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 6, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "array([ 0. , 3. , 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. ])" 104 | ] 105 | }, 106 | "execution_count": 6, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "sparseVector.toArray()" 113 | ] 114 | } 115 | ], 116 | "metadata": { 117 | "kernelspec": { 118 | "display_name": "Python 2", 119 | "language": "python", 120 | "name": "python2" 121 | }, 122 | "language_info": { 123 | "codemirror_mode": { 124 | "name": "ipython", 125 | "version": 2 126 | }, 127 | "file_extension": ".py", 128 | "mimetype": "text/x-python", 129 | "name": "python", 130 | "nbconvert_exporter": "python", 131 | "pygments_lexer": "ipython2", 132 | "version": "2.7.6" 133 | } 134 | }, 135 | "nbformat": 4, 136 | "nbformat_minor": 1 137 | } 138 | -------------------------------------------------------------------------------- /pyFiles/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /pyFiles/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /pyFiles/.idea/pyFiles.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /pyFiles/my_module.py: -------------------------------------------------------------------------------- 1 | def addPyFiles_is_successfull(): 2 | return(True) 3 | 4 | def sum_two_variables(a, b): 5 | return(sum([a,b])) 6 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/.gitignore: -------------------------------------------------------------------------------- 1 | /debug 2 | /jstree.sublime-project 3 | /jstree.sublime-workspace 4 | /bower_components 5 | /node_modules 6 | /site 7 | /nuget 8 | /demo/filebrowser/data/root 9 | /npm.txt 10 | /libs 11 | /docs 12 | /dist/libs 13 | /.vscode 14 | /.idea -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 Ivan Bozhanov 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated documentation 5 | files (the "Software"), to deal in the Software without 6 | restriction, including 
without limitation the rights to use, 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the 9 | Software is furnished to do so, subject to the following 10 | conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 23 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "jstree", 3 | "license": "MIT", 4 | "version": "3.3.5", 5 | "main" : [ 6 | "./dist/jstree.js", 7 | "./dist/themes/default/style.css" 8 | ], 9 | "ignore": [ 10 | "**/.*", 11 | "docs", 12 | "demo", 13 | "libs", 14 | "node_modules", 15 | "test", 16 | "libs", 17 | "jstree.jquery.json", 18 | "gruntfile.js", 19 | "package.json", 20 | "bower.json", 21 | "component.json", 22 | "LICENCE-MIT", 23 | "README.md" 24 | ], 25 | "dependencies": { 26 | "jquery": ">=1.9.1" 27 | }, 28 | "keywords": [ 29 | "ui", 30 | "tree", 31 | "jstree" 32 | ] 33 | } 34 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/component.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "jstree", 3 | "repo": "vakata/jstree", 4 | "description": "jsTree is jquery plugin, that provides interactive trees.", 5 | "version": "3.3.5", 6 | "license": "MIT", 7 | "keywords": [ 8 | "ui", 9 | "tree", 10 | "jstree" 11 | ], 12 | "scripts": [ 13 | "dist/jstree.js", 14 | "dist/jstree.min.js" 15 | ], 16 | "images": [ 17 | "dist/themes/default/32px.png", 18 | "dist/themes/default/40px.png", 19 | "dist/themes/default/throbber.gif" 20 | ], 21 | "styles": [ 22 | "dist/themes/default/style.css", 23 | "dist/themes/default/style.min.css" 24 | ], 25 | "dependencies": { 26 | "components/jquery": ">=1.9.1" 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "vakata/jstree", 3 | "description": "jsTree is jquery plugin, that provides interactive trees.", 4 | "type": "component", 5 | "homepage": "http://jstree.com", 6 | "license": "MIT", 7 | "support": { 8 | "issues": "https://github.com/vakata/jstree/issues", 9 | "forum": "https://groups.google.com/forum/#!forum/jstree", 10 | "source": "https://github.com/vakata/jstree" 11 | }, 12 | "authors": [ 13 | { 14 | "name": "Ivan Bozhanov", 15 | "email": "jstree@jstree.com" 16 | } 17 | ], 18 | "require": { 19 | "components/jquery": ">=1.9.1" 20 | }, 21 | "suggest": { 22 | "robloach/component-installer": "Allows installation of Components via Composer" 23 | }, 24 | "extra": { 25 | "component": { 26 | "scripts": [ 27 | "dist/jstree.js" 28 | ], 29 | "styles": [ 30 | "dist/themes/default/style.css" 31 | ], 32 | "images": [ 33 | 
"dist/themes/default/32px.png", 34 | "dist/themes/default/40px.png", 35 | "dist/themes/default/throbber.gif" 36 | ], 37 | "files": [ 38 | "dist/jstree.min.js", 39 | "dist/themes/default/style.min.css", 40 | "dist/themes/default/32px.png", 41 | "dist/themes/default/40px.png", 42 | "dist/themes/default/throbber.gif" 43 | ] 44 | } 45 | } 46 | } -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/demo/README.md: -------------------------------------------------------------------------------- 1 | ## PHP demos moved to new repository 2 | https://github.com/vakata/jstree-php-demos -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/demo/basic/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | jstree basic demos 6 | 12 | 13 | 14 | 15 |

[demo page rendered as plain text during extraction; the markup and scripts of index.html were lost. Recoverable section headings: HTML demo (an inline markup tree: Root node, with Child node 1 and Child node 2), Inline data demo, Data format demo, AJAX demo, Lazy loading demo, Callback function data demo, and Interaction and events demo ("either click the button or a node in the tree").]
45 | 46 | 47 | 48 | 49 | 145 | 146 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/demo/basic/root.json: -------------------------------------------------------------------------------- 1 | [{"id":1,"text":"Root node","children":[{"id":2,"text":"Child node 1"},{"id":3,"text":"Child node 2"}]}] -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/dist/themes/default-dark/32px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default-dark/32px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/dist/themes/default-dark/40px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default-dark/40px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/dist/themes/default-dark/throbber.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default-dark/throbber.gif -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/dist/themes/default/32px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default/32px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/dist/themes/default/40px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default/40px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/dist/themes/default/throbber.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default/throbber.gif -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/jstree.jquery.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "jstree", 3 | "title": "jsTree", 4 | "description": "Tree view for jQuery", 5 | "version": "3.3.5", 6 | "homepage": "http://jstree.com", 7 | "keywords": [ 8 | "ui", 9 | "tree", 10 | "jstree" 11 | ], 12 | "author": { 13 | "name": "Ivan Bozhanov", 14 | "email": "jstree@jstree.com", 15 | "url": "http://vakata.com" 16 | }, 17 | "licenses": [ 18 | { 19 | "type": "MIT", 20 | "url": "https://github.com/vakata/jstree/blob/master/LICENSE-MIT" 21 | } 22 | ], 23 | "bugs": "https://github.com/vakata/jstree/issues", 24 | "demo": "http://jstree.com/demo", 25 | "dependencies": { 26 | "jquery": ">=1.9.1" 27 | } 28 | } -------------------------------------------------------------------------------- 
/vakata-jstree-3.3.5/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "jstree", 3 | "title": "jsTree", 4 | "description": "jQuery tree plugin", 5 | "version": "3.3.5", 6 | "homepage": "http://jstree.com", 7 | "main": "./dist/jstree.js", 8 | "author": { 9 | "name": "Ivan Bozhanov", 10 | "email": "jstree@jstree.com", 11 | "url": "http://vakata.com" 12 | }, 13 | "repository": { 14 | "type": "git", 15 | "url": "git://github.com/vakata/jstree.git" 16 | }, 17 | "bugs": { 18 | "url": "https://github.com/vakata/jstree/issues" 19 | }, 20 | "license": "MIT", 21 | "licenses": [ 22 | { 23 | "type": "MIT", 24 | "url": "https://github.com/vakata/jstree/blob/master/LICENSE-MIT" 25 | } 26 | ], 27 | "keywords": [], 28 | "devDependencies": { 29 | "dox": "~0.4.4", 30 | "grunt": "~0.4.0", 31 | "grunt-contrib-concat": "*", 32 | "grunt-contrib-copy": "*", 33 | "grunt-contrib-imagemin": "~0.4.0", 34 | "grunt-contrib-jshint": "*", 35 | "grunt-contrib-less": "~0.8.2", 36 | "grunt-contrib-qunit": "~v0.3.0", 37 | "grunt-contrib-uglify": "*", 38 | "grunt-contrib-watch": "~0.5.3", 39 | "grunt-phantomcss-gitdiff": "0.0.7", 40 | "grunt-resemble-cli": "0.0.8", 41 | "grunt-text-replace": "~0.3.11" 42 | }, 43 | "dependencies": { 44 | "jquery": ">=1.9.1" 45 | }, 46 | "npmName": "jstree", 47 | "npmFileMap": [ 48 | { 49 | "basePath": "/dist/", 50 | "files": [ 51 | "jstree.min.js", 52 | "themes/**/*.png", 53 | "themes/**/*.gif", 54 | "themes/**/*.min.css" 55 | ] 56 | } 57 | ] 58 | } 59 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/intro.js: -------------------------------------------------------------------------------- 1 | /*globals jQuery, define, module, exports, require, window, document, postMessage */ 2 | (function (factory) { 3 | "use strict"; 4 | if (typeof define === 'function' && define.amd) { 5 | define(['jquery'], factory); 6 | } 7 | else if(typeof module !== 'undefined' && module.exports) { 8 | module.exports = factory(require('jquery')); 9 | } 10 | else { 11 | factory(jQuery); 12 | } 13 | }(function ($, undefined) { 14 | "use strict"; 15 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/jstree.changed.js: -------------------------------------------------------------------------------- 1 | /** 2 | * ### Changed plugin 3 | * 4 | * This plugin adds more information to the `changed.jstree` event. The new data is contained in the `changed` event data property, and contains a lists of `selected` and `deselected` nodes. 
5 | */ 6 | /*globals jQuery, define, exports, require, document */ 7 | (function (factory) { 8 | "use strict"; 9 | if (typeof define === 'function' && define.amd) { 10 | define('jstree.changed', ['jquery','jstree'], factory); 11 | } 12 | else if(typeof exports === 'object') { 13 | factory(require('jquery'), require('jstree')); 14 | } 15 | else { 16 | factory(jQuery, jQuery.jstree); 17 | } 18 | }(function ($, jstree, undefined) { 19 | "use strict"; 20 | 21 | if($.jstree.plugins.changed) { return; } 22 | 23 | $.jstree.plugins.changed = function (options, parent) { 24 | var last = []; 25 | this.trigger = function (ev, data) { 26 | var i, j; 27 | if(!data) { 28 | data = {}; 29 | } 30 | if(ev.replace('.jstree','') === 'changed') { 31 | data.changed = { selected : [], deselected : [] }; 32 | var tmp = {}; 33 | for(i = 0, j = last.length; i < j; i++) { 34 | tmp[last[i]] = 1; 35 | } 36 | for(i = 0, j = data.selected.length; i < j; i++) { 37 | if(!tmp[data.selected[i]]) { 38 | data.changed.selected.push(data.selected[i]); 39 | } 40 | else { 41 | tmp[data.selected[i]] = 2; 42 | } 43 | } 44 | for(i = 0, j = last.length; i < j; i++) { 45 | if(tmp[last[i]] === 1) { 46 | data.changed.deselected.push(last[i]); 47 | } 48 | } 49 | last = data.selected.slice(); 50 | } 51 | /** 52 | * triggered when selection changes (the "changed" plugin enhances the original event with more data) 53 | * @event 54 | * @name changed.jstree 55 | * @param {Object} node 56 | * @param {Object} action the action that caused the selection to change 57 | * @param {Array} selected the current selection 58 | * @param {Object} changed an object containing two properties `selected` and `deselected` - both arrays of node IDs, which were selected or deselected since the last changed event 59 | * @param {Object} event the event (if any) that triggered this changed event 60 | * @plugin changed 61 | */ 62 | parent.trigger.call(this, ev, data); 63 | }; 64 | this.refresh = function (skip_loading, forget_state) { 65 | last = []; 66 | return parent.refresh.apply(this, arguments); 67 | }; 68 | }; 69 | })); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/jstree.conditionalselect.js: -------------------------------------------------------------------------------- 1 | /** 2 | * ### Conditionalselect plugin 3 | * 4 | * This plugin allows defining a callback to allow or deny node selection by user input (activate node method). 5 | */ 6 | /*globals jQuery, define, exports, require, document */ 7 | (function (factory) { 8 | "use strict"; 9 | if (typeof define === 'function' && define.amd) { 10 | define('jstree.conditionalselect', ['jquery','jstree'], factory); 11 | } 12 | else if(typeof exports === 'object') { 13 | factory(require('jquery'), require('jstree')); 14 | } 15 | else { 16 | factory(jQuery, jQuery.jstree); 17 | } 18 | }(function ($, jstree, undefined) { 19 | "use strict"; 20 | 21 | if($.jstree.plugins.conditionalselect) { return; } 22 | 23 | /** 24 | * a callback (function) which is invoked in the instance's scope and receives two arguments - the node and the event that triggered the `activate_node` call. Returning false prevents working with the node, returning true allows invoking activate_node. Defaults to returning `true`. 
25 | * @name $.jstree.defaults.checkbox.visible 26 | * @plugin checkbox 27 | */ 28 | $.jstree.defaults.conditionalselect = function () { return true; }; 29 | $.jstree.plugins.conditionalselect = function (options, parent) { 30 | // own function 31 | this.activate_node = function (obj, e) { 32 | if(this.settings.conditionalselect.call(this, this.get_node(obj), e)) { 33 | return parent.activate_node.call(this, obj, e); 34 | } 35 | }; 36 | }; 37 | 38 | })); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/jstree.massload.js: -------------------------------------------------------------------------------- 1 | /** 2 | * ### Massload plugin 3 | * 4 | * Adds massload functionality to jsTree, so that multiple nodes can be loaded in a single request (only useful with lazy loading). 5 | */ 6 | /*globals jQuery, define, exports, require, document */ 7 | (function (factory) { 8 | "use strict"; 9 | if (typeof define === 'function' && define.amd) { 10 | define('jstree.massload', ['jquery','jstree'], factory); 11 | } 12 | else if(typeof exports === 'object') { 13 | factory(require('jquery'), require('jstree')); 14 | } 15 | else { 16 | factory(jQuery, jQuery.jstree); 17 | } 18 | }(function ($, jstree, undefined) { 19 | "use strict"; 20 | 21 | if($.jstree.plugins.massload) { return; } 22 | 23 | /** 24 | * massload configuration 25 | * 26 | * It is possible to set this to a standard jQuery-like AJAX config. 27 | * In addition to the standard jQuery ajax options here you can supply functions for `data` and `url`, the functions will be run in the current instance's scope and a param will be passed indicating which node IDs need to be loaded, the return value of those functions will be used. 28 | * 29 | * You can also set this to a function, that function will receive the node IDs being loaded as argument and a second param which is a function (callback) which should be called with the result. 30 | * 31 | * Both the AJAX and the function approach rely on the same return value - an object where the keys are the node IDs, and the value is the children of that node as an array. 
32 | * 33 | * { 34 | * "id1" : [{ "text" : "Child of ID1", "id" : "c1" }, { "text" : "Another child of ID1", "id" : "c2" }], 35 | * "id2" : [{ "text" : "Child of ID2", "id" : "c3" }] 36 | * } 37 | * 38 | * @name $.jstree.defaults.massload 39 | * @plugin massload 40 | */ 41 | $.jstree.defaults.massload = null; 42 | $.jstree.plugins.massload = function (options, parent) { 43 | this.init = function (el, options) { 44 | this._data.massload = {}; 45 | parent.init.call(this, el, options); 46 | }; 47 | this._load_nodes = function (nodes, callback, is_callback, force_reload) { 48 | var s = this.settings.massload, 49 | nodesString = JSON.stringify(nodes), 50 | toLoad = [], 51 | m = this._model.data, 52 | i, j, dom; 53 | if (!is_callback) { 54 | for(i = 0, j = nodes.length; i < j; i++) { 55 | if(!m[nodes[i]] || ( (!m[nodes[i]].state.loaded && !m[nodes[i]].state.failed) || force_reload) ) { 56 | toLoad.push(nodes[i]); 57 | dom = this.get_node(nodes[i], true); 58 | if (dom && dom.length) { 59 | dom.addClass("jstree-loading").attr('aria-busy',true); 60 | } 61 | } 62 | } 63 | this._data.massload = {}; 64 | if (toLoad.length) { 65 | if($.isFunction(s)) { 66 | return s.call(this, toLoad, $.proxy(function (data) { 67 | var i, j; 68 | if(data) { 69 | for(i in data) { 70 | if(data.hasOwnProperty(i)) { 71 | this._data.massload[i] = data[i]; 72 | } 73 | } 74 | } 75 | for(i = 0, j = nodes.length; i < j; i++) { 76 | dom = this.get_node(nodes[i], true); 77 | if (dom && dom.length) { 78 | dom.removeClass("jstree-loading").attr('aria-busy',false); 79 | } 80 | } 81 | parent._load_nodes.call(this, nodes, callback, is_callback, force_reload); 82 | }, this)); 83 | } 84 | if(typeof s === 'object' && s && s.url) { 85 | s = $.extend(true, {}, s); 86 | if($.isFunction(s.url)) { 87 | s.url = s.url.call(this, toLoad); 88 | } 89 | if($.isFunction(s.data)) { 90 | s.data = s.data.call(this, toLoad); 91 | } 92 | return $.ajax(s) 93 | .done($.proxy(function (data,t,x) { 94 | var i, j; 95 | if(data) { 96 | for(i in data) { 97 | if(data.hasOwnProperty(i)) { 98 | this._data.massload[i] = data[i]; 99 | } 100 | } 101 | } 102 | for(i = 0, j = nodes.length; i < j; i++) { 103 | dom = this.get_node(nodes[i], true); 104 | if (dom && dom.length) { 105 | dom.removeClass("jstree-loading").attr('aria-busy',false); 106 | } 107 | } 108 | parent._load_nodes.call(this, nodes, callback, is_callback, force_reload); 109 | }, this)) 110 | .fail($.proxy(function (f) { 111 | parent._load_nodes.call(this, nodes, callback, is_callback, force_reload); 112 | }, this)); 113 | } 114 | } 115 | } 116 | return parent._load_nodes.call(this, nodes, callback, is_callback, force_reload); 117 | }; 118 | this._load_node = function (obj, callback) { 119 | var data = this._data.massload[obj.id], 120 | rslt = null, dom; 121 | if(data) { 122 | rslt = this[typeof data === 'string' ? '_append_html_data' : '_append_json_data']( 123 | obj, 124 | typeof data === 'string' ? 
$($.parseHTML(data)).filter(function () { return this.nodeType !== 3; }) : data, 125 | function (status) { callback.call(this, status); } 126 | ); 127 | dom = this.get_node(obj.id, true); 128 | if (dom && dom.length) { 129 | dom.removeClass("jstree-loading").attr('aria-busy',false); 130 | } 131 | delete this._data.massload[obj.id]; 132 | return rslt; 133 | } 134 | return parent._load_node.call(this, obj, callback); 135 | }; 136 | }; 137 | })); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/jstree.sort.js: -------------------------------------------------------------------------------- 1 | /** 2 | * ### Sort plugin 3 | * 4 | * Automatically sorts all siblings in the tree according to a sorting function. 5 | */ 6 | /*globals jQuery, define, exports, require */ 7 | (function (factory) { 8 | "use strict"; 9 | if (typeof define === 'function' && define.amd) { 10 | define('jstree.sort', ['jquery','jstree'], factory); 11 | } 12 | else if(typeof exports === 'object') { 13 | factory(require('jquery'), require('jstree')); 14 | } 15 | else { 16 | factory(jQuery, jQuery.jstree); 17 | } 18 | }(function ($, jstree, undefined) { 19 | "use strict"; 20 | 21 | if($.jstree.plugins.sort) { return; } 22 | 23 | /** 24 | * the settings function used to sort the nodes. 25 | * It is executed in the tree's context, accepts two nodes as arguments and should return `1` or `-1`. 26 | * @name $.jstree.defaults.sort 27 | * @plugin sort 28 | */ 29 | $.jstree.defaults.sort = function (a, b) { 30 | //return this.get_type(a) === this.get_type(b) ? (this.get_text(a) > this.get_text(b) ? 1 : -1) : this.get_type(a) >= this.get_type(b); 31 | return this.get_text(a) > this.get_text(b) ? 1 : -1; 32 | }; 33 | $.jstree.plugins.sort = function (options, parent) { 34 | this.bind = function () { 35 | parent.bind.call(this); 36 | this.element 37 | .on("model.jstree", $.proxy(function (e, data) { 38 | this.sort(data.parent, true); 39 | }, this)) 40 | .on("rename_node.jstree create_node.jstree", $.proxy(function (e, data) { 41 | this.sort(data.parent || data.node.parent, false); 42 | this.redraw_node(data.parent || data.node.parent, true); 43 | }, this)) 44 | .on("move_node.jstree copy_node.jstree", $.proxy(function (e, data) { 45 | this.sort(data.parent, false); 46 | this.redraw_node(data.parent, true); 47 | }, this)); 48 | }; 49 | /** 50 | * used to sort a node's children 51 | * @private 52 | * @name sort(obj [, deep]) 53 | * @param {mixed} obj the node 54 | * @param {Boolean} deep if set to `true` nodes are sorted recursively. 
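 * As a usage sketch, a custom comparator can be supplied through the plugin setting - e.g. a case-insensitive alphabetical ordering (illustrative only; `get_text` returns a node's text):
 *
 *	$.jstree.defaults.sort = function (a, b) {
 *		return this.get_text(a).toLowerCase() > this.get_text(b).toLowerCase() ? 1 : -1;
 *	};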
55 | * @plugin sort 56 | * 57 | */ 58 | this.sort = function (obj, deep) { 59 | var i, j; 60 | obj = this.get_node(obj); 61 | if(obj && obj.children && obj.children.length) { 62 | obj.children.sort($.proxy(this.settings.sort, this)); 63 | if(deep) { 64 | for(i = 0, j = obj.children_d.length; i < j; i++) { 65 | this.sort(obj.children_d[i], false); 66 | } 67 | } 68 | } 69 | }; 70 | }; 71 | 72 | // include the sort plugin by default 73 | // $.jstree.defaults.plugins.push("sort"); 74 | })); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/jstree.state.js: -------------------------------------------------------------------------------- 1 | /** 2 | * ### State plugin 3 | * 4 | * Saves the state of the tree (selected nodes, opened nodes) on the user's computer using available options (localStorage, cookies, etc.) 5 | */ 6 | /*globals jQuery, define, exports, require */ 7 | (function (factory) { 8 | "use strict"; 9 | if (typeof define === 'function' && define.amd) { 10 | define('jstree.state', ['jquery','jstree'], factory); 11 | } 12 | else if(typeof exports === 'object') { 13 | factory(require('jquery'), require('jstree')); 14 | } 15 | else { 16 | factory(jQuery, jQuery.jstree); 17 | } 18 | }(function ($, jstree, undefined) { 19 | "use strict"; 20 | 21 | if($.jstree.plugins.state) { return; } 22 | 23 | var to = false; 24 | /** 25 | * stores all defaults for the state plugin 26 | * @name $.jstree.defaults.state 27 | * @plugin state 28 | */ 29 | $.jstree.defaults.state = { 30 | /** 31 | * A string for the key to use when saving the current tree (change if using multiple trees in your project). Defaults to `jstree`. 32 | * @name $.jstree.defaults.state.key 33 | * @plugin state 34 | */ 35 | key : 'jstree', 36 | /** 37 | * A space separated list of events that trigger a state save. Defaults to `changed.jstree open_node.jstree close_node.jstree check_node.jstree uncheck_node.jstree`. 38 | * @name $.jstree.defaults.state.events 39 | * @plugin state 40 | */ 41 | events : 'changed.jstree open_node.jstree close_node.jstree check_node.jstree uncheck_node.jstree', 42 | /** 43 | * Time in milliseconds after which the state will expire. Defaults to `false`, meaning the saved state never expires. 44 | * @name $.jstree.defaults.state.ttl 45 | * @plugin state 46 | */ 47 | ttl : false, 48 | /** 49 | * A function that will be executed prior to restoring state with one argument - the state object. Can be used to clear unwanted parts of the state. 50 | * @name $.jstree.defaults.state.filter 51 | * @plugin state 52 | */ 53 | filter : false, 54 | /** 55 | * Should loaded nodes be restored? Setting this to `true` means the whole tree may end up loaded for some users - use with caution. Defaults to `false`. 56 | * @name $.jstree.defaults.state.preserve_loaded 57 | * @plugin state 58 | */ 59 | preserve_loaded : false 60 | }; 61 | $.jstree.plugins.state = function (options, parent) { 62 | this.bind = function () { 63 | parent.bind.call(this); 64 | var bind = $.proxy(function () { 65 | this.element.on(this.settings.state.events, $.proxy(function () { 66 | if(to) { clearTimeout(to); } 67 | to = setTimeout($.proxy(function () { this.save_state(); }, this), 100); 68 | }, this)); 69 | /** 70 | * triggered when the state plugin is finished restoring the state (and immediately after ready if there is no state to restore).
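 * A small usage sketch (the `#tree` selector is illustrative): code that should only run once the saved state has been applied can listen for this event:
 *
 *	$('#tree').on('state_ready.jstree', function (e, data) {
 *		// the selection and open nodes from the previous session are in place here
 *	});
 *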
71 | * @event 72 | * @name state_ready.jstree 73 | * @plugin state 74 | */ 75 | this.trigger('state_ready'); 76 | }, this); 77 | this.element 78 | .on("ready.jstree", $.proxy(function (e, data) { 79 | this.element.one("restore_state.jstree", bind); 80 | if(!this.restore_state()) { bind(); } 81 | }, this)); 82 | }; 83 | /** 84 | * save the state 85 | * @name save_state() 86 | * @plugin state 87 | */ 88 | this.save_state = function () { 89 | var tm = this.get_state(); 90 | if (!this.settings.state.preserve_loaded) { 91 | delete tm.core.loaded; 92 | } 93 | var st = { 'state' : tm, 'ttl' : this.settings.state.ttl, 'sec' : +(new Date()) }; 94 | $.vakata.storage.set(this.settings.state.key, JSON.stringify(st)); 95 | }; 96 | /** 97 | * restore the state from the user's computer 98 | * @name restore_state() 99 | * @plugin state 100 | */ 101 | this.restore_state = function () { 102 | var k = $.vakata.storage.get(this.settings.state.key); 103 | if(!!k) { try { k = JSON.parse(k); } catch(ex) { return false; } } 104 | if(!!k && k.ttl && k.sec && +(new Date()) - k.sec > k.ttl) { return false; } 105 | if(!!k && k.state) { k = k.state; } 106 | if(!!k && $.isFunction(this.settings.state.filter)) { k = this.settings.state.filter.call(this, k); } 107 | if(!!k) { 108 | if (!this.settings.state.preserve_loaded) { 109 | delete k.core.loaded; 110 | } 111 | this.element.one("set_state.jstree", function (e, data) { data.instance.trigger('restore_state', { 'state' : $.extend(true, {}, k) }); }); 112 | this.set_state(k); 113 | return true; 114 | } 115 | return false; 116 | }; 117 | /** 118 | * clear the state on the user's computer 119 | * @name clear_state() 120 | * @plugin state 121 | */ 122 | this.clear_state = function () { 123 | return $.vakata.storage.del(this.settings.state.key); 124 | }; 125 | }; 126 | 127 | (function ($, undefined) { 128 | $.vakata.storage = { 129 | // simply specifying the functions in FF throws an error 130 | set : function (key, val) { return window.localStorage.setItem(key, val); }, 131 | get : function (key) { return window.localStorage.getItem(key); }, 132 | del : function (key) { return window.localStorage.removeItem(key); } 133 | }; 134 | }($)); 135 | 136 | // include the state plugin by default 137 | // $.jstree.defaults.plugins.push("state"); 138 | })); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/jstree.wholerow.js: -------------------------------------------------------------------------------- 1 | /** 2 | * ### Wholerow plugin 3 | * 4 | * Makes each node appear block level, which makes selection easier. May cause slowdowns for large trees in old browsers.
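 * A minimal enabling sketch (the `#tree` selector is illustrative) - the plugin registers no defaults of its own and is switched on through the `plugins` array:
 *
 *	$('#tree').jstree({ 'plugins' : ['wholerow'] });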
5 | */ 6 | /*globals jQuery, define, exports, require */ 7 | (function (factory) { 8 | "use strict"; 9 | if (typeof define === 'function' && define.amd) { 10 | define('jstree.wholerow', ['jquery','jstree'], factory); 11 | } 12 | else if(typeof exports === 'object') { 13 | factory(require('jquery'), require('jstree')); 14 | } 15 | else { 16 | factory(jQuery, jQuery.jstree); 17 | } 18 | }(function ($, jstree, undefined) { 19 | "use strict"; 20 | 21 | if($.jstree.plugins.wholerow) { return; } 22 | 23 | var div = document.createElement('DIV'); 24 | div.setAttribute('unselectable','on'); 25 | div.setAttribute('role','presentation'); 26 | div.className = 'jstree-wholerow'; 27 | div.innerHTML = ' '; 28 | $.jstree.plugins.wholerow = function (options, parent) { 29 | this.bind = function () { 30 | parent.bind.call(this); 31 | 32 | this.element 33 | .on('ready.jstree set_state.jstree', $.proxy(function () { 34 | this.hide_dots(); 35 | }, this)) 36 | .on("init.jstree loading.jstree ready.jstree", $.proxy(function () { 37 | //div.style.height = this._data.core.li_height + 'px'; 38 | this.get_container_ul().addClass('jstree-wholerow-ul'); 39 | }, this)) 40 | .on("deselect_all.jstree", $.proxy(function (e, data) { 41 | this.element.find('.jstree-wholerow-clicked').removeClass('jstree-wholerow-clicked'); 42 | }, this)) 43 | .on("changed.jstree", $.proxy(function (e, data) { 44 | this.element.find('.jstree-wholerow-clicked').removeClass('jstree-wholerow-clicked'); 45 | var tmp = false, i, j; 46 | for(i = 0, j = data.selected.length; i < j; i++) { 47 | tmp = this.get_node(data.selected[i], true); 48 | if(tmp && tmp.length) { 49 | tmp.children('.jstree-wholerow').addClass('jstree-wholerow-clicked'); 50 | } 51 | } 52 | }, this)) 53 | .on("open_node.jstree", $.proxy(function (e, data) { 54 | this.get_node(data.node, true).find('.jstree-clicked').parent().children('.jstree-wholerow').addClass('jstree-wholerow-clicked'); 55 | }, this)) 56 | .on("hover_node.jstree dehover_node.jstree", $.proxy(function (e, data) { 57 | if(e.type === "hover_node" && this.is_disabled(data.node)) { return; } 58 | this.get_node(data.node, true).children('.jstree-wholerow')[e.type === "hover_node"?"addClass":"removeClass"]('jstree-wholerow-hovered'); 59 | }, this)) 60 | .on("contextmenu.jstree", ".jstree-wholerow", $.proxy(function (e) { 61 | if (this._data.contextmenu) { 62 | e.preventDefault(); 63 | var tmp = $.Event('contextmenu', { metaKey : e.metaKey, ctrlKey : e.ctrlKey, altKey : e.altKey, shiftKey : e.shiftKey, pageX : e.pageX, pageY : e.pageY }); 64 | $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor").first().trigger(tmp); 65 | } 66 | }, this)) 67 | /*! 
68 | .on("mousedown.jstree touchstart.jstree", ".jstree-wholerow", function (e) { 69 | if(e.target === e.currentTarget) { 70 | var a = $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor"); 71 | e.target = a[0]; 72 | a.trigger(e); 73 | } 74 | }) 75 | */ 76 | .on("click.jstree", ".jstree-wholerow", function (e) { 77 | e.stopImmediatePropagation(); 78 | var tmp = $.Event('click', { metaKey : e.metaKey, ctrlKey : e.ctrlKey, altKey : e.altKey, shiftKey : e.shiftKey }); 79 | $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor").first().trigger(tmp).focus(); 80 | }) 81 | .on("dblclick.jstree", ".jstree-wholerow", function (e) { 82 | e.stopImmediatePropagation(); 83 | var tmp = $.Event('dblclick', { metaKey : e.metaKey, ctrlKey : e.ctrlKey, altKey : e.altKey, shiftKey : e.shiftKey }); 84 | $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor").first().trigger(tmp).focus(); 85 | }) 86 | .on("click.jstree", ".jstree-leaf > .jstree-ocl", $.proxy(function (e) { 87 | e.stopImmediatePropagation(); 88 | var tmp = $.Event('click', { metaKey : e.metaKey, ctrlKey : e.ctrlKey, altKey : e.altKey, shiftKey : e.shiftKey }); 89 | $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor").first().trigger(tmp).focus(); 90 | }, this)) 91 | .on("mouseover.jstree", ".jstree-wholerow, .jstree-icon", $.proxy(function (e) { 92 | e.stopImmediatePropagation(); 93 | if(!this.is_disabled(e.currentTarget)) { 94 | this.hover_node(e.currentTarget); 95 | } 96 | return false; 97 | }, this)) 98 | .on("mouseleave.jstree", ".jstree-node", $.proxy(function (e) { 99 | this.dehover_node(e.currentTarget); 100 | }, this)); 101 | }; 102 | this.teardown = function () { 103 | if(this.settings.wholerow) { 104 | this.element.find(".jstree-wholerow").remove(); 105 | } 106 | parent.teardown.call(this); 107 | }; 108 | this.redraw_node = function(obj, deep, callback, force_render) { 109 | obj = parent.redraw_node.apply(this, arguments); 110 | if(obj) { 111 | var tmp = div.cloneNode(true); 112 | //tmp.style.height = this._data.core.li_height + 'px'; 113 | if($.inArray(obj.id, this._data.core.selected) !== -1) { tmp.className += ' jstree-wholerow-clicked'; } 114 | if(this._data.core.focused && this._data.core.focused === obj.id) { tmp.className += ' jstree-wholerow-hovered'; } 115 | obj.insertBefore(tmp, obj.childNodes[0]); 116 | } 117 | return obj; 118 | }; 119 | }; 120 | // include the wholerow plugin by default 121 | // $.jstree.defaults.plugins.push("wholerow"); 122 | })); 123 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/outro.js: -------------------------------------------------------------------------------- 1 | })); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/sample.js: -------------------------------------------------------------------------------- 1 | /*global jQuery */ 2 | // wrap in IIFE and pass jQuery as $ 3 | (function ($, undefined) { 4 | "use strict"; 5 | 6 | // some private plugin stuff if needed 7 | var private_var = null; 8 | 9 | // extending the defaults 10 | $.jstree.defaults.sample = { 11 | sample_option : 'sample_val' 12 | }; 13 | 14 | // the actual plugin code 15 | $.jstree.plugins.sample = function (options, parent) { 16 | // own function 17 | this.sample_function = function (arg) { 18 | // you can chain this method if needed and available 19 | if(parent.sample_function) { parent.sample_function.call(this, arg); } 20 | }; 21 | 22 | // 
*SPECIAL* FUNCTIONS 23 | this.init = function (el, options) { 24 | // do not forget parent 25 | parent.init.call(this, el, options); 26 | }; 27 | // bind events if needed 28 | this.bind = function () { 29 | // call parent function first 30 | parent.bind.call(this); 31 | // do(stuff); 32 | }; 33 | // unbind events if needed (all in jquery namespace are taken care of by the core) 34 | this.unbind = function () { 35 | // do(stuff); 36 | // call parent function last 37 | parent.unbind.call(this); 38 | }; 39 | this.teardown = function () { 40 | // do not forget parent 41 | parent.teardown.call(this); 42 | }; 43 | // state management - get and restore 44 | this.get_state = function () { 45 | // always get state from parent first 46 | var state = parent.get_state.call(this); 47 | // add own stuff to state 48 | state.sample = { 'var' : 'val' }; 49 | return state; 50 | }; 51 | this.set_state = function (state, callback) { 52 | // only process your part if parent returns true 53 | // there will be multiple times with false 54 | if(parent.set_state.call(this, state, callback)) { 55 | // check the key you set above 56 | if(state.sample) { 57 | // do(stuff); // like calling this.sample_function(state.sample.var); 58 | // remove your part of the state, call again and RETURN FALSE, the next cycle will be TRUE 59 | delete state.sample; 60 | this.set_state(state, callback); 61 | return false; 62 | } 63 | // return true if your state is gone (cleared in the previous step) 64 | return true; 65 | } 66 | // parent was false - return false too 67 | return false; 68 | }; 69 | // node transportation 70 | this.get_json = function (obj, options, flat) { 71 | // get the node from the parent 72 | var tmp = parent.get_json.call(this, obj, options, flat), i, j; 73 | if($.isArray(tmp)) { 74 | for(i = 0, j = tmp.length; i < j; i++) { 75 | tmp[i].sample = 'value'; 76 | } 77 | } 78 | else { 79 | tmp.sample = 'value'; 80 | } 81 | // return the original / modified node 82 | return tmp; 83 | }; 84 | }; 85 | 86 | // attach to document ready if needed 87 | $(function () { 88 | // do(stuff); 89 | }); 90 | 91 | // you can include the sample plugin in all instances by default 92 | $.jstree.defaults.plugins.push("sample"); 93 | })(jQuery); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/base.less: -------------------------------------------------------------------------------- 1 | // base jstree 2 | .jstree-node, .jstree-children, .jstree-container-ul { display:block; margin:0; padding:0; list-style-type:none; list-style-image:none; } 3 | .jstree-node { white-space:nowrap; } 4 | .jstree-anchor { display:inline-block; color:black; white-space:nowrap; padding:0 4px 0 1px; margin:0; vertical-align:top; } 5 | .jstree-anchor:focus { outline:0; } 6 | .jstree-anchor, .jstree-anchor:link, .jstree-anchor:visited, .jstree-anchor:hover, .jstree-anchor:active { text-decoration:none; color:inherit; } 7 | .jstree-icon { display:inline-block; text-decoration:none; margin:0; padding:0; vertical-align:top; text-align:center; } 8 | .jstree-icon:empty { display:inline-block; text-decoration:none; margin:0; padding:0; vertical-align:top; text-align:center; } 9 | .jstree-ocl { cursor:pointer; } 10 | .jstree-leaf > .jstree-ocl { cursor:default; } 11 | .jstree .jstree-open > .jstree-children { display:block; } 12 | .jstree .jstree-closed > .jstree-children, 13 | .jstree .jstree-leaf > .jstree-children { display:none; } 14 | .jstree-anchor > .jstree-themeicon { margin-right:2px; } 15 | 
.jstree-no-icons .jstree-themeicon, 16 | .jstree-anchor > .jstree-themeicon-hidden { display:none; } 17 | .jstree-hidden, .jstree-node.jstree-hidden { display:none; } 18 | 19 | // base jstree rtl 20 | .jstree-rtl { 21 | .jstree-anchor { padding:0 1px 0 4px; } 22 | .jstree-anchor > .jstree-themeicon { margin-left:2px; margin-right:0; } 23 | .jstree-node { margin-left:0; } 24 | .jstree-container-ul > .jstree-node { margin-right:0; } 25 | } 26 | 27 | // base jstree wholerow 28 | .jstree-wholerow-ul { 29 | position:relative; 30 | display:inline-block; 31 | min-width:100%; 32 | .jstree-leaf > .jstree-ocl { cursor:pointer; } 33 | .jstree-anchor, .jstree-icon { position:relative; } 34 | .jstree-wholerow { width:100%; cursor:pointer; position:absolute; left:0; -webkit-user-select:none; -moz-user-select:none; -ms-user-select:none; user-select:none; } 35 | } 36 | 37 | // base contextmenu 38 | .jstree-contextmenu .jstree-anchor { 39 | -webkit-user-select: none; /* disable selection/Copy of UIWebView */ 40 | -webkit-touch-callout: none; /* disable the IOS popup when long-press on a link */ 41 | } 42 | .vakata-context { 43 | display:none; 44 | &, ul { margin:0; padding:2px; position:absolute; background:#f5f5f5; border:1px solid #979797; box-shadow:2px 2px 2px #999999; } 45 | ul { list-style:none; left:100%; margin-top:-2.7em; margin-left:-4px; } 46 | .vakata-context-right ul { left:auto; right:100%; margin-left:auto; margin-right:-4px; } 47 | li { 48 | list-style:none; 49 | > a { 50 | display:block; padding:0 2em 0 2em; text-decoration:none; width:auto; color:black; white-space:nowrap; line-height:2.4em; text-shadow:1px 1px 0 white; border-radius:1px; 51 | &:hover { position:relative; background-color:#e8eff7; box-shadow:0 0 2px #0a6aa1; } 52 | &.vakata-context-parent { background-image:url("data:image/gif;base64,R0lGODlhCwAHAIAAACgoKP///yH5BAEAAAEALAAAAAALAAcAAAIORI4JlrqN1oMSnmmZDQUAOw=="); background-position:right center; background-repeat:no-repeat; } 53 | } 54 | > a:focus { outline:0; } 55 | } 56 | .vakata-context-hover > a { position:relative; background-color:#e8eff7; box-shadow:0 0 2px #0a6aa1; } 57 | .vakata-context-separator { 58 | > a, > a:hover { background:white; border:0; border-top:1px solid #e2e3e3; height:1px; min-height:1px; max-height:1px; padding:0; margin:0 0 0 2.4em; border-left:1px solid #e0e0e0; text-shadow:0 0 0 transparent; box-shadow:0 0 0 transparent; border-radius:0; } 59 | } 60 | .vakata-contextmenu-disabled { 61 | a, a:hover { color:silver; background-color:transparent; border:0; box-shadow:0 0 0; } 62 | } 63 | li > a { 64 | > i { text-decoration:none; display:inline-block; width:2.4em; height:2.4em; background:transparent; margin:0 0 0 -2em; vertical-align:top; text-align:center; line-height:2.4em; } 65 | > i:empty { width:2.4em; line-height:2.4em; } 66 | .vakata-contextmenu-sep { display:inline-block; width:1px; height:2.4em; background:white; margin:0 0.5em 0 0; border-left:1px solid #e2e3e3; } 67 | } 68 | .vakata-contextmenu-shortcut { font-size:0.8em; color:silver; opacity:0.5; display:none; } 69 | } 70 | .vakata-context-rtl { 71 | ul { left:auto; right:100%; margin-left:auto; margin-right:-4px; } 72 | li > a.vakata-context-parent { background-image:url("data:image/gif;base64,R0lGODlhCwAHAIAAACgoKP///yH5BAEAAAEALAAAAAALAAcAAAINjI+AC7rWHIsPtmoxLAA7"); background-position:left center; background-repeat:no-repeat; } 73 | .vakata-context-separator > a { margin:0 2.4em 0 0; border-left:0; border-right:1px solid #e2e3e3;} 74 | .vakata-context-left ul { right:auto; 
left:100%; margin-left:-4px; margin-right:auto; } 75 | li > a { 76 | > i { margin:0 -2em 0 0; } 77 | .vakata-contextmenu-sep { margin:0 0 0 0.5em; border-left-color:white; background:#e2e3e3; } 78 | } 79 | } 80 | 81 | // base drag'n'drop 82 | #jstree-marker { position: absolute; top:0; left:0; margin:-5px 0 0 0; padding:0; border-right:0; border-top:5px solid transparent; border-bottom:5px solid transparent; border-left:5px solid; width:0; height:0; font-size:0; line-height:0; } 83 | #jstree-dnd { 84 | line-height:16px; 85 | margin:0; 86 | padding:4px; 87 | .jstree-icon, 88 | .jstree-copy { display:inline-block; text-decoration:none; margin:0 2px 0 0; padding:0; width:16px; height:16px; } 89 | .jstree-ok { background:green; } 90 | .jstree-er { background:red; } 91 | .jstree-copy { margin:0 2px 0 2px; } 92 | } 93 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default-dark/32px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default-dark/32px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default-dark/40px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default-dark/40px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default-dark/style.less: -------------------------------------------------------------------------------- 1 | /* jsTree default dark theme */ 2 | @theme-name: default-dark; 3 | @hovered-bg-color: #555; 4 | @hovered-shadow-color: #555; 5 | @disabled-color: #666666; 6 | @disabled-bg-color: #333333; 7 | @clicked-bg-color: #5fa2db; 8 | @clicked-shadow-color: #666666; 9 | @clicked-gradient-color-1: #5fa2db; 10 | @clicked-gradient-color-2: #5fa2db; 11 | @search-result-color: #ffffff; 12 | @mobile-wholerow-bg-color: #333333; 13 | @mobile-wholerow-shadow: #111111; 14 | @mobile-wholerow-bordert: #666; 15 | @mobile-wholerow-borderb: #000; 16 | @responsive: true; 17 | @image-path: ""; 18 | @base-height: 40px; 19 | 20 | @import "../mixins.less"; 21 | @import "../base.less"; 22 | @import "../main.less"; 23 | 24 | .jstree-@{theme-name} { 25 | background:#333; 26 | .jstree-anchor { color:#999; text-shadow:1px 1px 0 rgba(0,0,0,0.5); } 27 | .jstree-clicked, .jstree-checked { color:white; } 28 | .jstree-hovered { color:white; } 29 | #jstree-marker& { 30 | border-left-color:#999; 31 | background:transparent; 32 | } 33 | .jstree-anchor > .jstree-icon { opacity:0.75; } 34 | .jstree-clicked > .jstree-icon, 35 | .jstree-hovered > .jstree-icon, 36 | .jstree-checked > .jstree-icon { opacity:1; } 37 | } 38 | // theme variants 39 | .jstree-@{theme-name} { 40 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAACAQMAAAB49I5GAAAABlBMVEUAAACZmZl+9SADAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMOBgAAGAAJMwQHdQAAAABJRU5ErkJggg=="); } 41 | &.jstree-rtl .jstree-last { background:transparent; } 42 | } 43 | .jstree-@{theme-name}-small { 44 | &.jstree-rtl .jstree-node { 
background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABIAAAACAQMAAABv1h6PAAAABlBMVEUAAACZmZl+9SADAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMHBgAAiABBI4gz9AAAAABJRU5ErkJggg=="); } 45 | &.jstree-rtl .jstree-last { background:transparent; } 46 | } 47 | .jstree-@{theme-name}-large { 48 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAACAQMAAAAD0EyKAAAABlBMVEUAAACZmZl+9SADAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjgIIGBgABCgCBvVLXcAAAAABJRU5ErkJggg=="); } 49 | &.jstree-rtl .jstree-last { background:transparent; } 50 | } -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default-dark/throbber.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default-dark/throbber.gif -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default/32px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default/32px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default/40px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default/40px.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default/style.less: -------------------------------------------------------------------------------- 1 | /* jsTree default theme */ 2 | @theme-name: default; 3 | @hovered-bg-color: #e7f4f9; 4 | @hovered-shadow-color: #cccccc; 5 | @disabled-color: #666666; 6 | @disabled-bg-color: #efefef; 7 | @clicked-bg-color: #beebff; 8 | @clicked-shadow-color: #999999; 9 | @clicked-gradient-color-1: #beebff; 10 | @clicked-gradient-color-2: #a8e4ff; 11 | @search-result-color: #8b0000; 12 | @mobile-wholerow-bg-color: #ebebeb; 13 | @mobile-wholerow-shadow: #666666; 14 | @mobile-wholerow-bordert: rgba(255,255,255,0.7); 15 | @mobile-wholerow-borderb: rgba(64,64,64,0.2); 16 | @responsive: true; 17 | @image-path: ""; 18 | @base-height: 40px; 19 | 20 | @import "../mixins.less"; 21 | @import "../base.less"; 22 | @import "../main.less"; -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/default/throbber.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default/throbber.gif -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/main.less: -------------------------------------------------------------------------------- 1 | .jstree-@{theme-name} { 2 | .jstree-node, 3 | .jstree-icon { background-repeat:no-repeat; background-color:transparent; } 4 | .jstree-anchor, 5 | .jstree-animated, 6 | .jstree-wholerow { transition:background-color 0.15s, box-shadow 0.15s; } 7 | .jstree-hovered { background:@hovered-bg-color; border-radius:2px; box-shadow:inset 
0 0 1px @hovered-shadow-color; } 8 | .jstree-context { background:@hovered-bg-color; border-radius:2px; box-shadow:inset 0 0 1px @hovered-shadow-color; } 9 | .jstree-clicked { background:@clicked-bg-color; border-radius:2px; box-shadow:inset 0 0 1px @clicked-shadow-color; } 10 | .jstree-no-icons .jstree-anchor > .jstree-themeicon { display:none; } 11 | .jstree-disabled { 12 | background:transparent; color:@disabled-color; 13 | &.jstree-hovered { background:transparent; box-shadow:none; } 14 | &.jstree-clicked { background:@disabled-bg-color; } 15 | > .jstree-icon { opacity:0.8; filter: url("data:image/svg+xml;utf8,#jstree-grayscale"); /* Firefox 10+ */ filter: gray; /* IE6-9 */ -webkit-filter: grayscale(100%); /* Chrome 19+ & Safari 6+ */ } 16 | } 17 | // search 18 | .jstree-search { font-style:italic; color:@search-result-color; font-weight:bold; } 19 | // checkboxes 20 | .jstree-no-checkboxes .jstree-checkbox { display:none !important; } 21 | &.jstree-checkbox-no-clicked { 22 | .jstree-clicked { 23 | background:transparent; 24 | box-shadow:none; 25 | &.jstree-hovered { background:@hovered-bg-color; } 26 | } 27 | > .jstree-wholerow-ul .jstree-wholerow-clicked { 28 | background:transparent; 29 | &.jstree-wholerow-hovered { background:@hovered-bg-color; } 30 | } 31 | } 32 | // stripes 33 | > .jstree-striped { min-width:100%; display:inline-block; background:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAkCAMAAAB/qqA+AAAABlBMVEUAAAAAAAClZ7nPAAAAAnRSTlMNAMM9s3UAAAAXSURBVHjajcEBAQAAAIKg/H/aCQZ70AUBjAATb6YPDgAAAABJRU5ErkJggg==") left top repeat; } 34 | // wholerow 35 | > .jstree-wholerow-ul .jstree-hovered, 36 | > .jstree-wholerow-ul .jstree-clicked { background:transparent; box-shadow:none; border-radius:0; } 37 | .jstree-wholerow { -moz-box-sizing:border-box; -webkit-box-sizing:border-box; box-sizing:border-box; } 38 | .jstree-wholerow-hovered { background:@hovered-bg-color; } 39 | .jstree-wholerow-clicked { .gradient(@clicked-gradient-color-1, @clicked-gradient-color-2); } 40 | } 41 | 42 | // theme variants 43 | .jstree-@{theme-name} { 44 | .jstree-theme(24px, "@{image-path}32px.png", 32px); 45 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAACAQMAAAB49I5GAAAABlBMVEUAAAAdHRvEkCwcAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMOBgAAGAAJMwQHdQAAAABJRU5ErkJggg=="); } 46 | &.jstree-rtl .jstree-last { background:transparent; } 47 | } 48 | .jstree-@{theme-name}-small { 49 | .jstree-theme(18px, "@{image-path}32px.png", 32px); 50 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABIAAAACAQMAAABv1h6PAAAABlBMVEUAAAAdHRvEkCwcAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMHBgAAiABBI4gz9AAAAABJRU5ErkJggg=="); } 51 | &.jstree-rtl .jstree-last { background:transparent; } 52 | } 53 | .jstree-@{theme-name}-large { 54 | .jstree-theme(32px, "@{image-path}32px.png", 32px); 55 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAACAQMAAAAD0EyKAAAABlBMVEUAAAAdHRvEkCwcAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjgIIGBgABCgCBvVLXcAAAAABJRU5ErkJggg=="); } 56 | &.jstree-rtl .jstree-last { background:transparent; } 57 | } 58 | 59 | // mobile theme attempt 60 | @media (max-width: 768px) { 61 | #jstree-dnd.jstree-dnd-responsive when (@responsive = true) { 62 | line-height:@base-height; font-weight:bold; font-size:1.1em; text-shadow:1px 1px white; 63 | > i { background:transparent; width:@base-height; height:@base-height; } 64 | > .jstree-ok { 
background-image:url("@{image-path}@{base-height}.png"); background-position:0 -(@base-height * 5); background-size:(@base-height * 3) (@base-height * 6); } 65 | > .jstree-er { background-image:url("@{image-path}@{base-height}.png"); background-position:-(@base-height * 1) -(@base-height * 5); background-size:(@base-height * 3) (@base-height * 6); } 66 | } 67 | #jstree-marker.jstree-dnd-responsive when (@responsive = true) { 68 | border-left-width:10px; 69 | border-top-width:10px; 70 | border-bottom-width:10px; 71 | margin-top:-10px; 72 | } 73 | } 74 | 75 | .jstree-@{theme-name}-responsive when (@responsive = true) { 76 | @import "responsive.less"; 77 | } 78 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/mixins.less: -------------------------------------------------------------------------------- 1 | .gradient (@color1; @color2) { 2 | background:@color1; 3 | background: -webkit-linear-gradient(top, @color1 0%,@color2 100%); 4 | background: linear-gradient(to bottom, @color1 0%,@color2 100%); 5 | } 6 | 7 | .jstree-theme (@base-height, @image, @image-height) { 8 | @correction: (@image-height - @base-height) / 2; 9 | 10 | .jstree-node { min-height:@base-height; line-height:@base-height; margin-left:@base-height; min-width:@base-height; } 11 | .jstree-anchor { line-height:@base-height; height:@base-height; } 12 | .jstree-icon { width:@base-height; height:@base-height; line-height:@base-height; } 13 | .jstree-icon:empty { width:@base-height; height:@base-height; line-height:@base-height; } 14 | &.jstree-rtl .jstree-node { margin-right:@base-height; } 15 | .jstree-wholerow { height:@base-height; } 16 | 17 | .jstree-node, 18 | .jstree-icon { background-image:url("@{image}"); } 19 | .jstree-node { background-position:-(@image-height * 9 + @correction) -@correction; background-repeat:repeat-y; } 20 | .jstree-last { background:transparent; } 21 | 22 | .jstree-open > .jstree-ocl { background-position:-(@image-height * 4 + @correction) -@correction; } 23 | .jstree-closed > .jstree-ocl { background-position:-(@image-height * 3 + @correction) -@correction; } 24 | .jstree-leaf > .jstree-ocl { background-position:-(@image-height * 2 + @correction) -@correction; } 25 | 26 | .jstree-themeicon { background-position:-(@image-height * 8 + @correction) -@correction; } 27 | 28 | > .jstree-no-dots { 29 | .jstree-node, 30 | .jstree-leaf > .jstree-ocl { background:transparent; } 31 | .jstree-open > .jstree-ocl { background-position:-(@image-height * 1 + @correction) -@correction; } 32 | .jstree-closed > .jstree-ocl { background-position:-@correction -@correction; } 33 | } 34 | 35 | .jstree-disabled { 36 | background:transparent; 37 | &.jstree-hovered { 38 | background:transparent; 39 | } 40 | &.jstree-clicked { 41 | background:#efefef; 42 | } 43 | } 44 | 45 | .jstree-checkbox { 46 | background-position:-(@image-height * 5 + @correction) -@correction; 47 | &:hover { background-position:-(@image-height * 5 + @correction) -(@image-height * 1 + @correction); } 48 | } 49 | 50 | &.jstree-checkbox-selection .jstree-clicked, .jstree-checked { 51 | > .jstree-checkbox { 52 | background-position:-(@image-height * 7 + @correction) -@correction; 53 | &:hover { background-position:-(@image-height * 7 + @correction) -(@image-height * 1 + @correction); } 54 | } 55 | } 56 | .jstree-anchor { 57 | > .jstree-undetermined { 58 | background-position:-(@image-height * 6 + @correction) -@correction; 59 | &:hover { 60 | background-position:-(@image-height * 6 + 
@correction) -(@image-height * 1 + @correction); 61 | } 62 | } 63 | } 64 | .jstree-checkbox-disabled { opacity:0.8; filter: url("data:image/svg+xml;utf8,#jstree-grayscale"); /* Firefox 10+ */ filter: gray; /* IE6-9 */ -webkit-filter: grayscale(100%); /* Chrome 19+ & Safari 6+ */ } 65 | 66 | > .jstree-striped { background-size:auto (@base-height * 2); } 67 | 68 | &.jstree-rtl { 69 | .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAACAQMAAAB49I5GAAAABlBMVEUAAAAdHRvEkCwcAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMOBgAAGAAJMwQHdQAAAABJRU5ErkJggg=="); background-position: 100% 1px; background-repeat:repeat-y; } 70 | .jstree-last { background:transparent; } 71 | .jstree-open > .jstree-ocl { background-position:-(@image-height * 4 + @correction) -(@image-height * 1 + @correction); } 72 | .jstree-closed > .jstree-ocl { background-position:-(@image-height * 3 + @correction) -(@image-height * 1 + @correction); } 73 | .jstree-leaf > .jstree-ocl { background-position:-(@image-height * 2 + @correction) -(@image-height * 1 + @correction); } 74 | > .jstree-no-dots { 75 | .jstree-node, 76 | .jstree-leaf > .jstree-ocl { background:transparent; } 77 | .jstree-open > .jstree-ocl { background-position:-(@image-height * 1 + @correction) -(@image-height * 1 + @correction); } 78 | .jstree-closed > .jstree-ocl { background-position:-@correction -(@image-height * 1 + @correction); } 79 | } 80 | } 81 | .jstree-themeicon-custom { background-color:transparent; background-image:none; background-position:0 0; } 82 | 83 | > .jstree-container-ul .jstree-loading > .jstree-ocl { background:url("@{image-path}throbber.gif") center center no-repeat; } 84 | 85 | .jstree-file { background:url("@{image}") -(@image-height * 3 + @correction) -(@image-height * 2 + @correction) no-repeat; } 86 | .jstree-folder { background:url("@{image}") -(@image-height * 8 + @correction) -(@correction) no-repeat; } 87 | 88 | > .jstree-container-ul > .jstree-node { margin-left:0; margin-right:0; } 89 | 90 | // drag'n'drop 91 | #jstree-dnd& { 92 | line-height:@base-height; padding:0 4px; 93 | .jstree-ok, 94 | .jstree-er { background-image:url("@{image-path}32px.png"); background-repeat:no-repeat; background-color:transparent; } 95 | i { background:transparent; width:@base-height; height:@base-height; line-height:@base-height; } 96 | .jstree-ok { background-position: -(@correction) -(@image-height * 2 + @correction); } 97 | .jstree-er { background-position: -(@image-height * 1 + @correction) -(@image-height * 2 + @correction); } 98 | } 99 | 100 | // ellipsis 101 | .jstree-ellipsis { overflow: hidden; } 102 | // base height + PADDINGS! 
103 | .jstree-ellipsis .jstree-anchor { width: calc(100% ~"-" (@base-height + 5px)); text-overflow: ellipsis; overflow: hidden; } 104 | .jstree-ellipsis.jstree-no-icons .jstree-anchor { width: calc(100% ~"-" 5px); } 105 | } 106 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/themes/responsive.less: -------------------------------------------------------------------------------- 1 | @media (max-width: 768px) { 2 | // background image 3 | .jstree-icon { background-image:url("@{image-path}@{base-height}.png"); } 4 | 5 | .jstree-node, 6 | .jstree-leaf > .jstree-ocl { background:transparent; } 7 | 8 | .jstree-node { min-height:@base-height; line-height:@base-height; margin-left:@base-height; min-width:@base-height; white-space:nowrap; } 9 | .jstree-anchor { line-height:@base-height; height:@base-height; } 10 | .jstree-icon, .jstree-icon:empty { width:@base-height; height:@base-height; line-height:@base-height; } 11 | 12 | > .jstree-container-ul > .jstree-node { margin-left:0; } 13 | &.jstree-rtl .jstree-node { margin-left:0; margin-right:@base-height; background:transparent; } 14 | &.jstree-rtl .jstree-container-ul > .jstree-node { margin-right:0; } 15 | 16 | .jstree-ocl, 17 | .jstree-themeicon, 18 | .jstree-checkbox { background-size:(@base-height * 3) (@base-height * 6); } 19 | .jstree-leaf > .jstree-ocl, 20 | &.jstree-rtl .jstree-leaf > .jstree-ocl { background:transparent; } 21 | .jstree-open > .jstree-ocl { background-position:0 0px !important; } 22 | .jstree-closed > .jstree-ocl { background-position:0 -(@base-height * 1) !important; } 23 | &.jstree-rtl .jstree-closed > .jstree-ocl { background-position:-(@base-height * 1) 0px !important; } 24 | 25 | .jstree-themeicon { background-position:-(@base-height * 1) -(@base-height * 1); } 26 | 27 | .jstree-checkbox, .jstree-checkbox:hover { background-position:-(@base-height * 1) -(@base-height * 2); } 28 | &.jstree-checkbox-selection .jstree-clicked > .jstree-checkbox, 29 | &.jstree-checkbox-selection .jstree-clicked > .jstree-checkbox:hover, 30 | .jstree-checked > .jstree-checkbox, 31 | .jstree-checked > .jstree-checkbox:hover { background-position:0 -(@base-height * 2); } 32 | .jstree-anchor > .jstree-undetermined, .jstree-anchor > .jstree-undetermined:hover { background-position:0 -(@base-height * 3); } 33 | 34 | .jstree-anchor { font-weight:bold; font-size:1.1em; text-shadow:1px 1px white; } 35 | 36 | > .jstree-striped { background:transparent; } 37 | .jstree-wholerow { border-top:1px solid @mobile-wholerow-bordert; border-bottom:1px solid @mobile-wholerow-borderb; background:@mobile-wholerow-bg-color; height:@base-height; } 38 | .jstree-wholerow-hovered { background:@hovered-bg-color; } 39 | .jstree-wholerow-clicked { background:@clicked-bg-color; } 40 | 41 | // thanks to PHOTONUI 42 | .jstree-children .jstree-last > .jstree-wholerow { box-shadow: inset 0 -6px 3px -5px @mobile-wholerow-shadow; } 43 | .jstree-children .jstree-open > .jstree-wholerow { box-shadow: inset 0 6px 3px -5px @mobile-wholerow-shadow; border-top:0; } 44 | .jstree-children .jstree-open + .jstree-open { box-shadow:none; } 45 | 46 | // experiment 47 | .jstree-node, 48 | .jstree-icon, 49 | .jstree-node > .jstree-ocl, 50 | .jstree-themeicon, 51 | .jstree-checkbox { background-image:url("@{image-path}@{base-height}.png"); background-size:(@base-height * 3) (@base-height * 6); } 52 | 53 | .jstree-node { background-position:-(@base-height * 2) 0; background-repeat:repeat-y; } 54 | .jstree-last { 
background:transparent; } 55 | .jstree-leaf > .jstree-ocl { background-position:-(@base-height * 1) -(@base-height * 3); } 56 | .jstree-last > .jstree-ocl { background-position:-(@base-height * 1) -(@base-height * 4); } 57 | /* 58 | .jstree-open > .jstree-ocl, 59 | .jstree-closed > .jstree-ocl { border-radius:20px; background-color:white; } 60 | */ 61 | 62 | .jstree-themeicon-custom { background-color:transparent; background-image:none; background-position:0 0; } 63 | .jstree-file { background:url("@{image-path}@{base-height}.png") 0 -(@base-height * 4) no-repeat; background-size:(@base-height * 3) (@base-height * 6); } 64 | .jstree-folder { background:url("@{image-path}@{base-height}.png") -(@base-height * 1) -(@base-height * 1) no-repeat; background-size:(@base-height * 3) (@base-height * 6); } 65 | 66 | > .jstree-container-ul > .jstree-node { margin-left:0; margin-right:0; } 67 | } -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/src/vakata-jstree.js: -------------------------------------------------------------------------------- 1 | (function (factory) { 2 | "use strict"; 3 | if (typeof define === 'function' && define.amd) { 4 | define('jstree.checkbox', ['jquery','jstree'], factory); 5 | } 6 | else if(typeof exports === 'object') { 7 | factory(require('jquery'), require('jstree')); 8 | } 9 | else { 10 | factory(jQuery); 11 | } 12 | }(function ($, undefined) { 13 | "use strict"; 14 | if(document.registerElement && Object && Object.create) { 15 | var proto = Object.create(HTMLElement.prototype); 16 | proto.createdCallback = function () { 17 | var c = { core : {}, plugins : [] }, i; 18 | for(i in $.jstree.plugins) { 19 | if($.jstree.plugins.hasOwnProperty(i) && this.attributes[i]) { 20 | c.plugins.push(i); 21 | if(this.getAttribute(i) && JSON.parse(this.getAttribute(i))) { 22 | c[i] = JSON.parse(this.getAttribute(i)); 23 | } 24 | } 25 | } 26 | for(i in $.jstree.defaults.core) { 27 | if($.jstree.defaults.core.hasOwnProperty(i) && this.attributes[i]) { 28 | c.core[i] = JSON.parse(this.getAttribute(i)) || this.getAttribute(i); 29 | } 30 | } 31 | $(this).jstree(c); 32 | }; 33 | // proto.attributeChangedCallback = function (name, previous, value) { }; 34 | try { 35 | document.registerElement("vakata-jstree", { prototype: proto }); 36 | } catch(ignore) { } 37 | } 38 | })); 39 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/unit/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Basic Test Suite 6 | 7 | 8 | 9 | 10 | 11 |
12 | this had better work. <!-- #qunit-fixture text, asserted by test.js; the rest of the page's markup was stripped during extraction -->
13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/unit/libs/qunit.css: -------------------------------------------------------------------------------- 1 | /** 2 | * QUnit v1.12.0 - A JavaScript Unit Testing Framework 3 | * 4 | * http://qunitjs.com 5 | * 6 | * Copyright 2012 jQuery Foundation and other contributors 7 | * Released under the MIT license. 8 | * http://jquery.org/license 9 | */ 10 | 11 | /** Font Family and Sizes */ 12 | 13 | #qunit-tests, #qunit-header, #qunit-banner, #qunit-testrunner-toolbar, #qunit-userAgent, #qunit-testresult { 14 | font-family: "Helvetica Neue Light", "HelveticaNeue-Light", "Helvetica Neue", Calibri, Helvetica, Arial, sans-serif; 15 | } 16 | 17 | #qunit-testrunner-toolbar, #qunit-userAgent, #qunit-testresult, #qunit-tests li { font-size: small; } 18 | #qunit-tests { font-size: smaller; } 19 | 20 | 21 | /** Resets */ 22 | 23 | #qunit-tests, #qunit-header, #qunit-banner, #qunit-userAgent, #qunit-testresult, #qunit-modulefilter { 24 | margin: 0; 25 | padding: 0; 26 | } 27 | 28 | 29 | /** Header */ 30 | 31 | #qunit-header { 32 | padding: 0.5em 0 0.5em 1em; 33 | 34 | color: #8699a4; 35 | background-color: #0d3349; 36 | 37 | font-size: 1.5em; 38 | line-height: 1em; 39 | font-weight: normal; 40 | 41 | border-radius: 5px 5px 0 0; 42 | -moz-border-radius: 5px 5px 0 0; 43 | -webkit-border-top-right-radius: 5px; 44 | -webkit-border-top-left-radius: 5px; 45 | } 46 | 47 | #qunit-header a { 48 | text-decoration: none; 49 | color: #c2ccd1; 50 | } 51 | 52 | #qunit-header a:hover, 53 | #qunit-header a:focus { 54 | color: #fff; 55 | } 56 | 57 | #qunit-testrunner-toolbar label { 58 | display: inline-block; 59 | padding: 0 .5em 0 .1em; 60 | } 61 | 62 | #qunit-banner { 63 | height: 5px; 64 | } 65 | 66 | #qunit-testrunner-toolbar { 67 | padding: 0.5em 0 0.5em 2em; 68 | color: #5E740B; 69 | background-color: #eee; 70 | overflow: hidden; 71 | } 72 | 73 | #qunit-userAgent { 74 | padding: 0.5em 0 0.5em 2.5em; 75 | background-color: #2b81af; 76 | color: #fff; 77 | text-shadow: rgba(0, 0, 0, 0.5) 2px 2px 1px; 78 | } 79 | 80 | #qunit-modulefilter-container { 81 | float: right; 82 | } 83 | 84 | /** Tests: Pass/Fail */ 85 | 86 | #qunit-tests { 87 | list-style-position: inside; 88 | } 89 | 90 | #qunit-tests li { 91 | padding: 0.4em 0.5em 0.4em 2.5em; 92 | border-bottom: 1px solid #fff; 93 | list-style-position: inside; 94 | } 95 | 96 | #qunit-tests.hidepass li.pass, #qunit-tests.hidepass li.running { 97 | display: none; 98 | } 99 | 100 | #qunit-tests li strong { 101 | cursor: pointer; 102 | } 103 | 104 | #qunit-tests li a { 105 | padding: 0.5em; 106 | color: #c2ccd1; 107 | text-decoration: none; 108 | } 109 | #qunit-tests li a:hover, 110 | #qunit-tests li a:focus { 111 | color: #000; 112 | } 113 | 114 | #qunit-tests li .runtime { 115 | float: right; 116 | font-size: smaller; 117 | } 118 | 119 | .qunit-assert-list { 120 | margin-top: 0.5em; 121 | padding: 0.5em; 122 | 123 | background-color: #fff; 124 | 125 | border-radius: 5px; 126 | -moz-border-radius: 5px; 127 | -webkit-border-radius: 5px; 128 | } 129 | 130 | .qunit-collapsed { 131 | display: none; 132 | } 133 | 134 | #qunit-tests table { 135 | border-collapse: collapse; 136 | margin-top: .2em; 137 | } 138 | 139 | #qunit-tests th { 140 | text-align: right; 141 | vertical-align: top; 142 | padding: 0 .5em 0 0; 143 | } 144 | 145 | #qunit-tests td { 146 | vertical-align: top; 147 | } 148 | 149 | #qunit-tests pre { 150 | margin: 0; 151 | white-space: pre-wrap; 
152 | word-wrap: break-word; 153 | } 154 | 155 | #qunit-tests del { 156 | background-color: #e0f2be; 157 | color: #374e0c; 158 | text-decoration: none; 159 | } 160 | 161 | #qunit-tests ins { 162 | background-color: #ffcaca; 163 | color: #500; 164 | text-decoration: none; 165 | } 166 | 167 | /*** Test Counts */ 168 | 169 | #qunit-tests b.counts { color: black; } 170 | #qunit-tests b.passed { color: #5E740B; } 171 | #qunit-tests b.failed { color: #710909; } 172 | 173 | #qunit-tests li li { 174 | padding: 5px; 175 | background-color: #fff; 176 | border-bottom: none; 177 | list-style-position: inside; 178 | } 179 | 180 | /*** Passing Styles */ 181 | 182 | #qunit-tests li li.pass { 183 | color: #3c510c; 184 | background-color: #fff; 185 | border-left: 10px solid #C6E746; 186 | } 187 | 188 | #qunit-tests .pass { color: #528CE0; background-color: #D2E0E6; } 189 | #qunit-tests .pass .test-name { color: #366097; } 190 | 191 | #qunit-tests .pass .test-actual, 192 | #qunit-tests .pass .test-expected { color: #999999; } 193 | 194 | #qunit-banner.qunit-pass { background-color: #C6E746; } 195 | 196 | /*** Failing Styles */ 197 | 198 | #qunit-tests li li.fail { 199 | color: #710909; 200 | background-color: #fff; 201 | border-left: 10px solid #EE5757; 202 | white-space: pre; 203 | } 204 | 205 | #qunit-tests > li:last-child { 206 | border-radius: 0 0 5px 5px; 207 | -moz-border-radius: 0 0 5px 5px; 208 | -webkit-border-bottom-right-radius: 5px; 209 | -webkit-border-bottom-left-radius: 5px; 210 | } 211 | 212 | #qunit-tests .fail { color: #000000; background-color: #EE5757; } 213 | #qunit-tests .fail .test-name, 214 | #qunit-tests .fail .module-name { color: #000000; } 215 | 216 | #qunit-tests .fail .test-actual { color: #EE5757; } 217 | #qunit-tests .fail .test-expected { color: green; } 218 | 219 | #qunit-banner.qunit-fail { background-color: #EE5757; } 220 | 221 | 222 | /** Result */ 223 | 224 | #qunit-testresult { 225 | padding: 0.5em 0.5em 0.5em 2.5em; 226 | 227 | color: #2b81af; 228 | background-color: #D2E0E6; 229 | 230 | border-bottom: 1px solid white; 231 | } 232 | #qunit-testresult .module-name { 233 | font-weight: bold; 234 | } 235 | 236 | /** Fixture */ 237 | 238 | #qunit-fixture { 239 | position: absolute; 240 | top: -10000px; 241 | left: -10000px; 242 | width: 1000px; 243 | height: 1000px; 244 | } -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/unit/test.js: -------------------------------------------------------------------------------- 1 | test('basic test', function() { 2 | expect(1); 3 | ok(true, 'this had better work.'); 4 | }); 5 | 6 | 7 | test('can access the DOM', function() { 8 | expect(1); 9 | var fixture = document.getElementById('qunit-fixture'); 10 | equal(fixture.innerText || fixture.textContent, 'this had better work.', 'should be able to access the DOM.'); 11 | }); -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/desktop/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Light theme visual tests 6 | 7 | 8 | 9 | 10 | 11 |
12 | <!-- lines 12-33: markup stripped during extraction; the rendered text shows an "asdf" demo tree, a tree with "Node 01" through "Node 05" (open nodes each holding two "Node" children), and two small "full" / "asdf" trees -->
34 | 35 | 36 | 37 | 43 | 44 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/mobile/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Mobile theme visual tests 6 | 7 | 8 | 9 | 10 | 11 |
12 | <!-- lines 12-31: markup stripped during extraction; the rendered text shows the same demo content as the desktop test - a tree with "Node 01" through "Node 05" (open nodes each holding two "Node" children) and two small "full" / "asdf" trees -->
32 | 33 | 34 | 35 | 41 | 42 | -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/screenshots/desktop/.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/desktop/.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/screenshots/desktop/desktop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/desktop/desktop.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/screenshots/desktop/home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/desktop/home.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/screenshots/mobile/.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/mobile/.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/screenshots/mobile/home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/mobile/home.png -------------------------------------------------------------------------------- /vakata-jstree-3.3.5/test/visual/screenshots/mobile/mobile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/mobile/mobile.png --------------------------------------------------------------------------------