├── .gitignore
├── LICENSE
├── README.md
├── data
│   ├── Advertising.csv
│   ├── Credit.csv
│   ├── WineData.csv
│   ├── airquality.csv
│   ├── churn-bigml-20.csv
│   ├── churn-bigml-80.csv
│   ├── cuse_binary.csv
│   ├── horseshoe_crab.csv
│   ├── hsb2.csv
│   ├── hsb2_modified.csv
│   ├── iris.csv
│   ├── kaggle-titanic-gender_submission.csv
│   ├── kaggle-titanic-test.csv
│   ├── kaggle-titanic-train.csv
│   ├── mtcars.csv
│   ├── prostate.csv
│   ├── saved-mtcars
│   │   ├── .part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv.crc
│   │   ├── _SUCCESS
│   │   └── part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv
│   ├── saved-twitter
│   │   ├── .part-00000.crc
│   │   ├── _SUCCESS
│   │   └── part-00000
│   ├── titanic
│   │   ├── gender_submission.csv
│   │   ├── test.csv
│   │   └── train.csv
│   └── twitter.txt
├── delete-readme.txt
├── images
│   ├── simple-nlp-pipeline.png
│   └── spark-pipeline.png
├── index.Rmd
├── index.html
├── legacy
│   ├── 01_entry_points_to_spark.Rmd
│   ├── 02_rdd_object.Rmd
│   ├── 03_dataframe_object.Rmd
│   ├── HashingTF-and-CountVectorizer.Rmd
│   ├── categorical-data.Rmd
│   ├── continuous-to-categorical-variable.Rmd
│   ├── conversion-between-rdd-and-dataframe.Rmd
│   ├── cross-validation-in-r.Rmd
│   ├── decision-tree-classification.Rmd
│   ├── dttreeC.Rmd
│   ├── dttreeC.html
│   ├── dttreeR.Rmd
│   ├── fnn.Rmd
│   ├── index.Rmd
│   ├── information-extraction.Rmd
│   ├── install.Rmd
│   ├── k-folds-cross-validation.Rmd
│   ├── kmeans.Rmd
│   ├── linear-regression.Rmd
│   ├── linearRegression.Rmd
│   ├── logistic-regression.Rmd
│   ├── machine-learning-framework.Rmd
│   ├── nlp-and-nltk-basics.Rmd
│   ├── nlpC.Rmd
│   ├── nlpLDA.Rmd
│   ├── pyspark-on-jupyter.Rmd
│   ├── pyspark-on-rodeo.Rmd
│   ├── pyspark-vectors.Rmd
│   ├── pyspark.ml.feature-module.Rmd
│   ├── r-markdown-header.Rmd
│   ├── randomforest.Rmd
│   ├── randomforestC.Rmd
│   ├── regularization.Rmd
│   ├── sna.Rmd
│   ├── spark-on-jetstream-cloud.Rmd
│   └── tf-idf.Rmd
├── link-spark-with-jupyter.md
├── logo.jpg
├── notebooks
│   ├── 01-data-strcture
│   │   ├── .gitignore
│   │   ├── .ipynb_checkpoints
│   │   │   ├── 1.1-rdd-checkpoint.ipynb
│   │   │   ├── 1.2-dataframe-checkpoint.ipynb
│   │   │   ├── 1.3-conversion-between-rdd-and-dataframe-checkpoint.ipynb
│   │   │   └── 1.4-merge-and-split-columns-checkpoint.ipynb
│   │   ├── 1.1-rdd.ipynb
│   │   ├── 1.2-dataframe.ipynb
│   │   ├── 1.3-conversion-between-rdd-and-dataframe.ipynb
│   │   └── 1.4-merge-and-split-columns.ipynb
│   ├── 02-data-manipulation
│   │   ├── .ipynb_checkpoints
│   │   │   ├── 2.1-map-functions-checkpoint.ipynb
│   │   │   ├── 2.2-aggregate-functions-checkpoint.ipynb
│   │   │   ├── 2.3-continuous-variable-to-categorical-variable-checkpoint.ipynb
│   │   │   ├── 2.4-first-data-check-checkpoint.ipynb
│   │   │   ├── 2.7.1-column-expression-checkpoint.ipynb
│   │   │   ├── 2.7.3-boolean-column-expression-checkpoint.ipynb
│   │   │   ├── 2.8-sql-functions-to-extend-column-expressions-checkpoint.ipynb
│   │   │   └── 2.9-user-defined-sql-function (udf)-checkpoint.ipynb
│   │   ├── 2.1-map-functions.ipynb
│   │   ├── 2.2-aggregate-functions.ipynb
│   │   ├── 2.3-continuous-variable-to-categorical-variable.ipynb
│   │   ├── 2.4-first-data-check.ipynb
│   │   ├── 2.5-subset-dataframe-by-row.ipynb
│   │   ├── 2.6-subset-dataframe-by-column.ipynb
│   │   ├── 2.7.1-column-expression.ipynb
│   │   ├── 2.7.2-dot-column-expression.ipynb
│   │   ├── 2.7.3-boolean-column-expression.ipynb
│   │   ├── 2.8-sql-functions-to-extend-column-expressions.ipynb
│   │   ├── 2.9-user-defined-sql-function (udf).ipynb
│   │   └── import-and-export-data.ipynb
│   ├── 03-data-preparation
│   │   ├── stringindexer-and-onehotencoder.ipynb
│   │   └── vector-assembler.ipynb
│   ├── 04-miscellaneous
│   │   ├── .ipynb_checkpoints
│   │   │   └── user-defined-sql-function (udf)-checkpoint.ipynb
│   │   ├── TF-IDF.ipynb
│   │   ├── add-python-files-to-spark-cluster.ipynb
│   │   ├── dense-vs-sparse-vectors.ipynb
│   │   ├── issues-and-solutions.ipynb
│   │   ├── pipeline.ipynb
│   │   └── sql-functions.ipynb
│   ├── 05-module-turning
│   │   ├── cross-validation.ipynb
│   │   └── regularization.ipynb
│   ├── 06-machine-learning
│   │   ├── classification
│   │   │   ├── binary-classification.ipynb
│   │   │   ├── decision-tree-classification.ipynb
│   │   │   ├── gradient-boost-tree-classification.ipynb
│   │   │   ├── logistic-regression.ipynb
│   │   │   ├── naive-bayes-classification.ipynb
│   │   │   └── random-forest-classification.ipynb
│   │   └── regression
│   │       ├── generalized-linear-regression.ipynb
│   │       └── linear-regression.ipynb
│   ├── 07-natural-language-processing
│   │   ├── nlp-and-nltk-basics.ipynb
│   │   ├── nlp-information-extraction.ipynb
│   │   └── skills-needed-for-nlp-jobs.ipynb
│   └── ipynb
│       ├── .ipynb_checkpoints
│       │   ├── DecisionTree-checkpoint.ipynb
│       │   ├── Feedforward neural network(1)-checkpoint.ipynb
│       │   ├── HashingTF-and-CountVectorizer-checkpoint.ipynb
│       │   ├── NaiveBayes-checkpoint.ipynb
│       │   └── RDD-manipulation-checkpoint.ipynb
│       ├── Categoricaldata.ipynb
│       ├── DataWrangling.ipynb
│       ├── DecisionTree.ipynb
│       ├── DecisionTreeC3.ipynb
│       ├── DecisionTreeC7.ipynb
│       ├── DecisionTreeR.ipynb
│       ├── Feedforward neural network(1).ipynb
│       ├── Feedforward neural network.ipynb
│       ├── HashingTF-and-CountVectorizer.ipynb
│       ├── LinearRegression.ipynb
│       ├── NaiveBayes.ipynb
│       ├── Natural Language Processing nb.ipynb
│       ├── PysparkCluster.ipynb
│       ├── RandomForest.ipynb
│       ├── Regression.ipynb
│       ├── derby.log
│       ├── preproc.py
│       └── vector.ipynb
├── pyFiles
│   ├── .idea
│   │   ├── misc.xml
│   │   ├── modules.xml
│   │   ├── pyFiles.iml
│   │   └── workspace.xml
│   └── my_module.py
└── vakata-jstree-3.3.5
    ├── .gitignore
    ├── LICENSE-MIT
    ├── README.md
    ├── bower.json
    ├── component.json
    ├── composer.json
    ├── demo
    │   ├── README.md
    │   └── basic
    │       ├── index.html
    │       └── root.json
    ├── dist
    │   ├── jstree.js
    │   ├── jstree.min.js
    │   └── themes
    │       ├── default-dark
    │       │   ├── 32px.png
    │       │   ├── 40px.png
    │       │   ├── style.css
    │       │   ├── style.min.css
    │       │   └── throbber.gif
    │       └── default
    │           ├── 32px.png
    │           ├── 40px.png
    │           ├── style.css
    │           ├── style.min.css
    │           └── throbber.gif
    ├── gruntfile.js
    ├── jstree.jquery.json
    ├── package.json
    ├── src
    │   ├── intro.js
    │   ├── jstree.changed.js
    │   ├── jstree.checkbox.js
    │   ├── jstree.conditionalselect.js
    │   ├── jstree.contextmenu.js
    │   ├── jstree.dnd.js
    │   ├── jstree.js
    │   ├── jstree.massload.js
    │   ├── jstree.search.js
    │   ├── jstree.sort.js
    │   ├── jstree.state.js
    │   ├── jstree.types.js
    │   ├── jstree.unique.js
    │   ├── jstree.wholerow.js
    │   ├── misc.js
    │   ├── outro.js
    │   ├── sample.js
    │   ├── themes
    │   │   ├── base.less
    │   │   ├── default-dark
    │   │   │   ├── 32px.png
    │   │   │   ├── 40px.png
    │   │   │   ├── style.css
    │   │   │   ├── style.less
    │   │   │   └── throbber.gif
    │   │   ├── default
    │   │   │   ├── 32px.png
    │   │   │   ├── 40px.png
    │   │   │   ├── style.css
    │   │   │   ├── style.less
    │   │   │   └── throbber.gif
    │   │   ├── main.less
    │   │   ├── mixins.less
    │   │   └── responsive.less
    │   └── vakata-jstree.js
    └── test
        ├── unit
        │   ├── index.html
        │   ├── libs
        │   │   ├── qunit.css
        │   │   └── qunit.js
        │   └── test.js
        └── visual
            ├── desktop
            │   └── index.html
            ├── mobile
            │   └── index.html
            └── screenshots
                ├── desktop
                │   ├── .png
                │   ├── desktop.png
                │   └── home.png
                └── mobile
                    ├── .png
                    ├── home.png
                    └── mobile.png
/.gitignore:
--------------------------------------------------------------------------------
1 | /.Rproj.user
2 | /.Rhistory
3 | .RData
4 | .Ruserdata
5 | .DS_Store
6 | *.ipynb
7 | /.ipynb_checkpoints
8 | /.idea
9 | .Rproj.user
10 | metastore_db
11 | *_cache
12 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Ming Chen
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # [Learning Apache Spark](https://mingchen0919.github.io/learning-apache-spark/)
2 |
3 |
4 | Website: https://mingchen0919.github.io/learning-apache-spark/index.html
5 |
--------------------------------------------------------------------------------
/data/Advertising.csv:
--------------------------------------------------------------------------------
1 | TV,Radio,Newspaper,Sales
2 | 230.1,37.8,69.2,22.1
3 | 44.5,39.3,45.1,10.4
4 | 17.2,45.9,69.3,9.3
5 | 151.5,41.3,58.5,18.5
6 | 180.8,10.8,58.4,12.9
7 | 8.7,48.9,75,7.2
8 | 57.5,32.8,23.5,11.8
9 | 120.2,19.6,11.6,13.2
10 | 8.6,2.1,1,4.8
11 | 199.8,2.6,21.2,10.6
12 | 66.1,5.8,24.2,8.6
13 | 214.7,24,4,17.4
14 | 23.8,35.1,65.9,9.2
15 | 97.5,7.6,7.2,9.7
16 | 204.1,32.9,46,19
17 | 195.4,47.7,52.9,22.4
18 | 67.8,36.6,114,12.5
19 | 281.4,39.6,55.8,24.4
20 | 69.2,20.5,18.3,11.3
21 | 147.3,23.9,19.1,14.6
22 | 218.4,27.7,53.4,18
23 | 237.4,5.1,23.5,12.5
24 | 13.2,15.9,49.6,5.6
25 | 228.3,16.9,26.2,15.5
26 | 62.3,12.6,18.3,9.7
27 | 262.9,3.5,19.5,12
28 | 142.9,29.3,12.6,15
29 | 240.1,16.7,22.9,15.9
30 | 248.8,27.1,22.9,18.9
31 | 70.6,16,40.8,10.5
32 | 292.9,28.3,43.2,21.4
33 | 112.9,17.4,38.6,11.9
34 | 97.2,1.5,30,9.6
35 | 265.6,20,0.3,17.4
36 | 95.7,1.4,7.4,9.5
37 | 290.7,4.1,8.5,12.8
38 | 266.9,43.8,5,25.4
39 | 74.7,49.4,45.7,14.7
40 | 43.1,26.7,35.1,10.1
41 | 228,37.7,32,21.5
42 | 202.5,22.3,31.6,16.6
43 | 177,33.4,38.7,17.1
44 | 293.6,27.7,1.8,20.7
45 | 206.9,8.4,26.4,12.9
46 | 25.1,25.7,43.3,8.5
47 | 175.1,22.5,31.5,14.9
48 | 89.7,9.9,35.7,10.6
49 | 239.9,41.5,18.5,23.2
50 | 227.2,15.8,49.9,14.8
51 | 66.9,11.7,36.8,9.7
52 | 199.8,3.1,34.6,11.4
53 | 100.4,9.6,3.6,10.7
54 | 216.4,41.7,39.6,22.6
55 | 182.6,46.2,58.7,21.2
56 | 262.7,28.8,15.9,20.2
57 | 198.9,49.4,60,23.7
58 | 7.3,28.1,41.4,5.5
59 | 136.2,19.2,16.6,13.2
60 | 210.8,49.6,37.7,23.8
61 | 210.7,29.5,9.3,18.4
62 | 53.5,2,21.4,8.1
63 | 261.3,42.7,54.7,24.2
64 | 239.3,15.5,27.3,15.7
65 | 102.7,29.6,8.4,14
66 | 131.1,42.8,28.9,18
67 | 69,9.3,0.9,9.3
68 | 31.5,24.6,2.2,9.5
69 | 139.3,14.5,10.2,13.4
70 | 237.4,27.5,11,18.9
71 | 216.8,43.9,27.2,22.3
72 | 199.1,30.6,38.7,18.3
73 | 109.8,14.3,31.7,12.4
74 | 26.8,33,19.3,8.8
75 | 129.4,5.7,31.3,11
76 | 213.4,24.6,13.1,17
77 | 16.9,43.7,89.4,8.7
78 | 27.5,1.6,20.7,6.9
79 | 120.5,28.5,14.2,14.2
80 | 5.4,29.9,9.4,5.3
81 | 116,7.7,23.1,11
82 | 76.4,26.7,22.3,11.8
83 | 239.8,4.1,36.9,12.3
84 | 75.3,20.3,32.5,11.3
85 | 68.4,44.5,35.6,13.6
86 | 213.5,43,33.8,21.7
87 | 193.2,18.4,65.7,15.2
88 | 76.3,27.5,16,12
89 | 110.7,40.6,63.2,16
90 | 88.3,25.5,73.4,12.9
91 | 109.8,47.8,51.4,16.7
92 | 134.3,4.9,9.3,11.2
93 | 28.6,1.5,33,7.3
94 | 217.7,33.5,59,19.4
95 | 250.9,36.5,72.3,22.2
96 | 107.4,14,10.9,11.5
97 | 163.3,31.6,52.9,16.9
98 | 197.6,3.5,5.9,11.7
99 | 184.9,21,22,15.5
100 | 289.7,42.3,51.2,25.4
101 | 135.2,41.7,45.9,17.2
102 | 222.4,4.3,49.8,11.7
103 | 296.4,36.3,100.9,23.8
104 | 280.2,10.1,21.4,14.8
105 | 187.9,17.2,17.9,14.7
106 | 238.2,34.3,5.3,20.7
107 | 137.9,46.4,59,19.2
108 | 25,11,29.7,7.2
109 | 90.4,0.3,23.2,8.7
110 | 13.1,0.4,25.6,5.3
111 | 255.4,26.9,5.5,19.8
112 | 225.8,8.2,56.5,13.4
113 | 241.7,38,23.2,21.8
114 | 175.7,15.4,2.4,14.1
115 | 209.6,20.6,10.7,15.9
116 | 78.2,46.8,34.5,14.6
117 | 75.1,35,52.7,12.6
118 | 139.2,14.3,25.6,12.2
119 | 76.4,0.8,14.8,9.4
120 | 125.7,36.9,79.2,15.9
121 | 19.4,16,22.3,6.6
122 | 141.3,26.8,46.2,15.5
123 | 18.8,21.7,50.4,7
124 | 224,2.4,15.6,11.6
125 | 123.1,34.6,12.4,15.2
126 | 229.5,32.3,74.2,19.7
127 | 87.2,11.8,25.9,10.6
128 | 7.8,38.9,50.6,6.6
129 | 80.2,0,9.2,8.8
130 | 220.3,49,3.2,24.7
131 | 59.6,12,43.1,9.7
132 | 0.7,39.6,8.7,1.6
133 | 265.2,2.9,43,12.7
134 | 8.4,27.2,2.1,5.7
135 | 219.8,33.5,45.1,19.6
136 | 36.9,38.6,65.6,10.8
137 | 48.3,47,8.5,11.6
138 | 25.6,39,9.3,9.5
139 | 273.7,28.9,59.7,20.8
140 | 43,25.9,20.5,9.6
141 | 184.9,43.9,1.7,20.7
142 | 73.4,17,12.9,10.9
143 | 193.7,35.4,75.6,19.2
144 | 220.5,33.2,37.9,20.1
145 | 104.6,5.7,34.4,10.4
146 | 96.2,14.8,38.9,11.4
147 | 140.3,1.9,9,10.3
148 | 240.1,7.3,8.7,13.2
149 | 243.2,49,44.3,25.4
150 | 38,40.3,11.9,10.9
151 | 44.7,25.8,20.6,10.1
152 | 280.7,13.9,37,16.1
153 | 121,8.4,48.7,11.6
154 | 197.6,23.3,14.2,16.6
155 | 171.3,39.7,37.7,19
156 | 187.8,21.1,9.5,15.6
157 | 4.1,11.6,5.7,3.2
158 | 93.9,43.5,50.5,15.3
159 | 149.8,1.3,24.3,10.1
160 | 11.7,36.9,45.2,7.3
161 | 131.7,18.4,34.6,12.9
162 | 172.5,18.1,30.7,14.4
163 | 85.7,35.8,49.3,13.3
164 | 188.4,18.1,25.6,14.9
165 | 163.5,36.8,7.4,18
166 | 117.2,14.7,5.4,11.9
167 | 234.5,3.4,84.8,11.9
168 | 17.9,37.6,21.6,8
169 | 206.8,5.2,19.4,12.2
170 | 215.4,23.6,57.6,17.1
171 | 284.3,10.6,6.4,15
172 | 50,11.6,18.4,8.4
173 | 164.5,20.9,47.4,14.5
174 | 19.6,20.1,17,7.6
175 | 168.4,7.1,12.8,11.7
176 | 222.4,3.4,13.1,11.5
177 | 276.9,48.9,41.8,27
178 | 248.4,30.2,20.3,20.2
179 | 170.2,7.8,35.2,11.7
180 | 276.7,2.3,23.7,11.8
181 | 165.6,10,17.6,12.6
182 | 156.6,2.6,8.3,10.5
183 | 218.5,5.4,27.4,12.2
184 | 56.2,5.7,29.7,8.7
185 | 287.6,43,71.8,26.2
186 | 253.8,21.3,30,17.6
187 | 205,45.1,19.6,22.6
188 | 139.5,2.1,26.6,10.3
189 | 191.1,28.7,18.2,17.3
190 | 286,13.9,3.7,15.9
191 | 18.7,12.1,23.4,6.7
192 | 39.5,41.1,5.8,10.8
193 | 75.5,10.8,6,9.9
194 | 17.2,4.1,31.6,5.9
195 | 166.8,42,3.6,19.6
196 | 149.7,35.6,6,17.3
197 | 38.2,3.7,13.8,7.6
198 | 94.2,4.9,8.1,9.7
199 | 177,9.3,6.4,12.8
200 | 283.6,42,66.2,25.5
201 | 232.1,8.6,8.7,13.4
--------------------------------------------------------------------------------
/data/airquality.csv:
--------------------------------------------------------------------------------
1 | "ozone","solar.r","wind","temp","month","day"
2 | 41,190,7.4,67,5,1
3 | 36,118,8,72,5,2
4 | 12,149,12.6,74,5,3
5 | 18,313,11.5,62,5,4
6 | NA,NA,14.3,56,5,5
7 | 28,NA,14.9,66,5,6
8 | 23,299,8.6,65,5,7
9 | 19,99,13.8,59,5,8
10 | 8,19,20.1,61,5,9
11 | NA,194,8.6,69,5,10
12 | 7,NA,6.9,74,5,11
13 | 16,256,9.7,69,5,12
14 | 11,290,9.2,66,5,13
15 | 14,274,10.9,68,5,14
16 | 18,65,13.2,58,5,15
17 | 14,334,11.5,64,5,16
18 | 34,307,12,66,5,17
19 | 6,78,18.4,57,5,18
20 | 30,322,11.5,68,5,19
21 | 11,44,9.7,62,5,20
22 | 1,8,9.7,59,5,21
23 | 11,320,16.6,73,5,22
24 | 4,25,9.7,61,5,23
25 | 32,92,12,61,5,24
26 | NA,66,16.6,57,5,25
27 | NA,266,14.9,58,5,26
28 | NA,NA,8,57,5,27
29 | 23,13,12,67,5,28
30 | 45,252,14.9,81,5,29
31 | 115,223,5.7,79,5,30
32 | 37,279,7.4,76,5,31
33 | NA,286,8.6,78,6,1
34 | NA,287,9.7,74,6,2
35 | NA,242,16.1,67,6,3
36 | NA,186,9.2,84,6,4
37 | NA,220,8.6,85,6,5
38 | NA,264,14.3,79,6,6
39 | 29,127,9.7,82,6,7
40 | NA,273,6.9,87,6,8
41 | 71,291,13.8,90,6,9
42 | 39,323,11.5,87,6,10
43 | NA,259,10.9,93,6,11
44 | NA,250,9.2,92,6,12
45 | 23,148,8,82,6,13
46 | NA,332,13.8,80,6,14
47 | NA,322,11.5,79,6,15
48 | 21,191,14.9,77,6,16
49 | 37,284,20.7,72,6,17
50 | 20,37,9.2,65,6,18
51 | 12,120,11.5,73,6,19
52 | 13,137,10.3,76,6,20
53 | NA,150,6.3,77,6,21
54 | NA,59,1.7,76,6,22
55 | NA,91,4.6,76,6,23
56 | NA,250,6.3,76,6,24
57 | NA,135,8,75,6,25
58 | NA,127,8,78,6,26
59 | NA,47,10.3,73,6,27
60 | NA,98,11.5,80,6,28
61 | NA,31,14.9,77,6,29
62 | NA,138,8,83,6,30
63 | 135,269,4.1,84,7,1
64 | 49,248,9.2,85,7,2
65 | 32,236,9.2,81,7,3
66 | NA,101,10.9,84,7,4
67 | 64,175,4.6,83,7,5
68 | 40,314,10.9,83,7,6
69 | 77,276,5.1,88,7,7
70 | 97,267,6.3,92,7,8
71 | 97,272,5.7,92,7,9
72 | 85,175,7.4,89,7,10
73 | NA,139,8.6,82,7,11
74 | 10,264,14.3,73,7,12
75 | 27,175,14.9,81,7,13
76 | NA,291,14.9,91,7,14
77 | 7,48,14.3,80,7,15
78 | 48,260,6.9,81,7,16
79 | 35,274,10.3,82,7,17
80 | 61,285,6.3,84,7,18
81 | 79,187,5.1,87,7,19
82 | 63,220,11.5,85,7,20
83 | 16,7,6.9,74,7,21
84 | NA,258,9.7,81,7,22
85 | NA,295,11.5,82,7,23
86 | 80,294,8.6,86,7,24
87 | 108,223,8,85,7,25
88 | 20,81,8.6,82,7,26
89 | 52,82,12,86,7,27
90 | 82,213,7.4,88,7,28
91 | 50,275,7.4,86,7,29
92 | 64,253,7.4,83,7,30
93 | 59,254,9.2,81,7,31
94 | 39,83,6.9,81,8,1
95 | 9,24,13.8,81,8,2
96 | 16,77,7.4,82,8,3
97 | 78,NA,6.9,86,8,4
98 | 35,NA,7.4,85,8,5
99 | 66,NA,4.6,87,8,6
100 | 122,255,4,89,8,7
101 | 89,229,10.3,90,8,8
102 | 110,207,8,90,8,9
103 | NA,222,8.6,92,8,10
104 | NA,137,11.5,86,8,11
105 | 44,192,11.5,86,8,12
106 | 28,273,11.5,82,8,13
107 | 65,157,9.7,80,8,14
108 | NA,64,11.5,79,8,15
109 | 22,71,10.3,77,8,16
110 | 59,51,6.3,79,8,17
111 | 23,115,7.4,76,8,18
112 | 31,244,10.9,78,8,19
113 | 44,190,10.3,78,8,20
114 | 21,259,15.5,77,8,21
115 | 9,36,14.3,72,8,22
116 | NA,255,12.6,75,8,23
117 | 45,212,9.7,79,8,24
118 | 168,238,3.4,81,8,25
119 | 73,215,8,86,8,26
120 | NA,153,5.7,88,8,27
121 | 76,203,9.7,97,8,28
122 | 118,225,2.3,94,8,29
123 | 84,237,6.3,96,8,30
124 | 85,188,6.3,94,8,31
125 | 96,167,6.9,91,9,1
126 | 78,197,5.1,92,9,2
127 | 73,183,2.8,93,9,3
128 | 91,189,4.6,93,9,4
129 | 47,95,7.4,87,9,5
130 | 32,92,15.5,84,9,6
131 | 20,252,10.9,80,9,7
132 | 23,220,10.3,78,9,8
133 | 21,230,10.9,75,9,9
134 | 24,259,9.7,73,9,10
135 | 44,236,14.9,81,9,11
136 | 21,259,15.5,76,9,12
137 | 28,238,6.3,77,9,13
138 | 9,24,10.9,71,9,14
139 | 13,112,11.5,71,9,15
140 | 46,237,6.9,78,9,16
141 | 18,224,13.8,67,9,17
142 | 13,27,10.3,76,9,18
143 | 24,238,10.3,68,9,19
144 | 16,201,8,82,9,20
145 | 13,238,12.6,64,9,21
146 | 23,14,9.2,71,9,22
147 | 36,139,10.3,81,9,23
148 | 7,49,10.3,69,9,24
149 | 14,20,16.6,63,9,25
150 | 30,193,6.9,70,9,26
151 | NA,145,13.2,77,9,27
152 | 14,191,14.3,75,9,28
153 | 18,131,8,76,9,29
154 | 20,223,11.5,68,9,30
155 |
--------------------------------------------------------------------------------
/data/horseshoe_crab.csv:
--------------------------------------------------------------------------------
1 | C,S,W,Wt,Sa
2 | 2,3,28.3,3.05,8
3 | 3,3,26,2.6,4
4 | 3,3,25.6,2.15,0
5 | 4,2,21,1.85,0
6 | 2,3,29,3,1
7 | 1,2,25,2.3,3
8 | 4,3,26.2,1.3,0
9 | 2,3,24.9,2.1,0
10 | 2,1,25.7,2,8
11 | 2,3,27.5,3.15,6
12 | 1,1,26.1,2.8,5
13 | 3,3,28.9,2.8,4
14 | 2,1,30.3,3.6,3
15 | 2,3,22.9,1.6,4
16 | 3,3,26.2,2.3,3
17 | 3,3,24.5,2.05,5
18 | 2,3,30,3.05,8
19 | 2,3,26.2,2.4,3
20 | 2,3,25.4,2.25,6
21 | 2,3,25.4,2.25,4
22 | 4,3,27.5,2.9,0
23 | 4,3,27,2.25,3
24 | 2,2,24,1.7,0
25 | 2,1,28.7,3.2,0
26 | 3,3,26.5,1.97,1
27 | 2,3,24.5,1.6,1
28 | 3,3,27.3,2.9,1
29 | 2,3,26.5,2.3,4
30 | 2,3,25,2.1,2
31 | 3,3,22,1.4,0
32 | 1,1,30.2,3.28,2
33 | 2,2,25.4,2.3,0
34 | 2,1,24.9,2.3,6
35 | 4,3,25.8,2.25,10
36 | 3,3,27.2,2.4,5
37 | 2,3,30.5,3.32,3
38 | 4,3,25,2.1,8
39 | 2,3,30,3,9
40 | 2,1,22.9,1.6,0
41 | 2,3,23.9,1.85,2
42 | 2,3,26,2.28,3
43 | 2,3,25.8,2.2,0
44 | 3,3,29,3.28,4
45 | 1,1,26.5,2.35,0
46 | 3,3,22.5,1.55,0
47 | 2,3,23.8,2.1,0
48 | 3,3,24.3,2.15,0
49 | 2,1,26,2.3,14
50 | 4,3,24.7,2.2,0
51 | 2,1,22.5,1.6,1
52 | 2,3,28.7,3.15,3
53 | 1,1,29.3,3.2,4
54 | 2,1,26.7,2.7,5
55 | 4,3,23.4,1.9,0
56 | 1,1,27.7,2.5,6
57 | 2,3,28.2,2.6,6
58 | 4,3,24.7,2.1,5
59 | 2,1,25.7,2,5
60 | 2,1,27.8,2.75,0
61 | 3,1,27,2.45,3
62 | 2,3,29,3.2,10
63 | 3,3,25.6,2.8,7
64 | 3,3,24.2,1.9,0
65 | 3,3,25.7,1.2,0
66 | 3,3,23.1,1.65,0
67 | 2,3,28.5,3.05,0
68 | 2,1,29.7,3.85,5
69 | 3,3,23.1,1.55,0
70 | 3,3,24.5,2.2,1
71 | 2,3,27.5,2.55,1
72 | 2,3,26.3,2.4,1
73 | 2,3,27.8,3.25,3
74 | 2,3,31.9,3.33,2
75 | 2,3,25,2.4,5
76 | 3,3,26.2,2.22,0
77 | 3,3,28.4,3.2,3
78 | 1,2,24.5,1.95,6
79 | 2,3,27.9,3.05,7
80 | 2,2,25,2.25,6
81 | 3,3,29,2.92,3
82 | 2,1,31.7,3.73,4
83 | 2,3,27.6,2.85,4
84 | 4,3,24.5,1.9,0
85 | 3,3,23.8,1.8,0
86 | 2,3,28.2,3.05,8
87 | 3,3,24.1,1.8,0
88 | 1,1,28,2.62,0
89 | 1,1,26,2.3,9
90 | 3,2,24.7,1.9,0
91 | 2,3,25.8,2.65,0
92 | 1,1,27.1,2.95,8
93 | 2,3,27.4,2.7,5
94 | 3,3,26.7,2.6,2
95 | 2,1,26.8,2.7,5
96 | 1,3,25.8,2.6,0
97 | 4,3,23.7,1.85,0
98 | 2,3,27.9,2.8,6
99 | 2,1,30,3.3,5
100 | 2,3,25,2.1,4
101 | 2,3,27.7,2.9,5
102 | 2,3,28.3,3,15
103 | 4,3,25.5,2.25,0
104 | 2,3,26,2.15,5
105 | 2,3,26.2,2.4,0
106 | 3,3,23,1.65,1
107 | 2,2,22.9,1.6,0
108 | 2,3,25.1,2.1,5
109 | 3,1,25.9,2.55,4
110 | 4,1,25.5,2.75,0
111 | 2,1,26.8,2.55,0
112 | 2,1,29,2.8,1
113 | 3,3,28.5,3,1
114 | 2,2,24.7,2.55,4
115 | 2,3,29,3.1,1
116 | 2,3,27,2.5,6
117 | 4,3,23.7,1.8,0
118 | 3,3,27,2.5,6
119 | 2,3,24.2,1.65,2
120 | 4,3,22.5,1.47,4
121 | 2,3,25.1,1.8,0
122 | 2,3,24.9,2.2,0
123 | 2,3,27.5,2.63,6
124 | 2,1,24.3,2,0
125 | 2,3,29.5,3.02,4
126 | 2,3,26.2,2.3,0
127 | 2,3,24.7,1.95,4
128 | 3,2,29.8,3.5,4
129 | 4,3,25.7,2.15,0
130 | 3,3,26.2,2.17,2
131 | 4,3,27,2.63,0
132 | 3,3,24.8,2.1,0
133 | 2,1,23.7,1.95,0
134 | 2,3,28.2,3.05,11
135 | 2,3,25.2,2,1
136 | 2,2,23.2,1.95,4
137 | 4,3,25.8,2,3
138 | 4,3,27.5,2.6,0
139 | 2,2,25.7,2,0
140 | 2,3,26.8,2.65,0
141 | 3,3,27.5,3.1,3
142 | 3,1,28.5,3.25,9
143 | 2,3,28.5,3,3
144 | 1,1,27.4,2.7,6
145 | 2,3,27.2,2.7,3
146 | 3,3,27.1,2.55,0
147 | 2,3,28,2.8,1
148 | 2,1,26.5,1.3,0
149 | 3,3,23,1.8,0
150 | 3,2,26,2.2,3
151 | 3,2,24.5,2.25,0
152 | 2,3,25.8,2.3,0
153 | 4,3,23.5,1.9,0
154 | 4,3,26.7,2.45,0
155 | 3,3,25.5,2.25,0
156 | 2,3,28.2,2.87,1
157 | 2,1,25.2,2,1
158 | 2,3,25.3,1.9,2
159 | 3,3,25.7,2.1,0
160 | 4,3,29.3,3.23,12
161 | 3,3,23.8,1.8,6
162 | 2,3,27.4,2.9,3
163 | 2,3,26.2,2.02,2
164 | 2,1,28,2.9,4
165 | 2,1,28.4,3.1,5
166 | 2,1,33.5,5.2,7
167 | 2,3,25.8,2.4,0
168 | 3,3,24,1.9,10
169 | 2,1,23.1,2,0
170 | 2,3,28.3,3.2,0
171 | 2,3,26.5,2.35,4
172 | 2,3,26.5,2.75,7
173 | 3,3,26.1,2.75,3
174 | 2,2,24.5,2,0
--------------------------------------------------------------------------------
/data/iris.csv:
--------------------------------------------------------------------------------
1 | sepal_length,sepal_width,petal_length,petal_width,species
2 | 5.1,3.5,1.4,0.2,setosa
3 | 4.9,3,1.4,0.2,setosa
4 | 4.7,3.2,1.3,0.2,setosa
5 | 4.6,3.1,1.5,0.2,setosa
6 | 5,3.6,1.4,0.2,setosa
7 | 5.4,3.9,1.7,0.4,setosa
8 | 4.6,3.4,1.4,0.3,setosa
9 | 5,3.4,1.5,0.2,setosa
10 | 4.4,2.9,1.4,0.2,setosa
11 | 4.9,3.1,1.5,0.1,setosa
12 | 5.4,3.7,1.5,0.2,setosa
13 | 4.8,3.4,1.6,0.2,setosa
14 | 4.8,3,1.4,0.1,setosa
15 | 4.3,3,1.1,0.1,setosa
16 | 5.8,4,1.2,0.2,setosa
17 | 5.7,4.4,1.5,0.4,setosa
18 | 5.4,3.9,1.3,0.4,setosa
19 | 5.1,3.5,1.4,0.3,setosa
20 | 5.7,3.8,1.7,0.3,setosa
21 | 5.1,3.8,1.5,0.3,setosa
22 | 5.4,3.4,1.7,0.2,setosa
23 | 5.1,3.7,1.5,0.4,setosa
24 | 4.6,3.6,1,0.2,setosa
25 | 5.1,3.3,1.7,0.5,setosa
26 | 4.8,3.4,1.9,0.2,setosa
27 | 5,3,1.6,0.2,setosa
28 | 5,3.4,1.6,0.4,setosa
29 | 5.2,3.5,1.5,0.2,setosa
30 | 5.2,3.4,1.4,0.2,setosa
31 | 4.7,3.2,1.6,0.2,setosa
32 | 4.8,3.1,1.6,0.2,setosa
33 | 5.4,3.4,1.5,0.4,setosa
34 | 5.2,4.1,1.5,0.1,setosa
35 | 5.5,4.2,1.4,0.2,setosa
36 | 4.9,3.1,1.5,0.1,setosa
37 | 5,3.2,1.2,0.2,setosa
38 | 5.5,3.5,1.3,0.2,setosa
39 | 4.9,3.1,1.5,0.1,setosa
40 | 4.4,3,1.3,0.2,setosa
41 | 5.1,3.4,1.5,0.2,setosa
42 | 5,3.5,1.3,0.3,setosa
43 | 4.5,2.3,1.3,0.3,setosa
44 | 4.4,3.2,1.3,0.2,setosa
45 | 5,3.5,1.6,0.6,setosa
46 | 5.1,3.8,1.9,0.4,setosa
47 | 4.8,3,1.4,0.3,setosa
48 | 5.1,3.8,1.6,0.2,setosa
49 | 4.6,3.2,1.4,0.2,setosa
50 | 5.3,3.7,1.5,0.2,setosa
51 | 5,3.3,1.4,0.2,setosa
52 | 7,3.2,4.7,1.4,versicolor
53 | 6.4,3.2,4.5,1.5,versicolor
54 | 6.9,3.1,4.9,1.5,versicolor
55 | 5.5,2.3,4,1.3,versicolor
56 | 6.5,2.8,4.6,1.5,versicolor
57 | 5.7,2.8,4.5,1.3,versicolor
58 | 6.3,3.3,4.7,1.6,versicolor
59 | 4.9,2.4,3.3,1,versicolor
60 | 6.6,2.9,4.6,1.3,versicolor
61 | 5.2,2.7,3.9,1.4,versicolor
62 | 5,2,3.5,1,versicolor
63 | 5.9,3,4.2,1.5,versicolor
64 | 6,2.2,4,1,versicolor
65 | 6.1,2.9,4.7,1.4,versicolor
66 | 5.6,2.9,3.6,1.3,versicolor
67 | 6.7,3.1,4.4,1.4,versicolor
68 | 5.6,3,4.5,1.5,versicolor
69 | 5.8,2.7,4.1,1,versicolor
70 | 6.2,2.2,4.5,1.5,versicolor
71 | 5.6,2.5,3.9,1.1,versicolor
72 | 5.9,3.2,4.8,1.8,versicolor
73 | 6.1,2.8,4,1.3,versicolor
74 | 6.3,2.5,4.9,1.5,versicolor
75 | 6.1,2.8,4.7,1.2,versicolor
76 | 6.4,2.9,4.3,1.3,versicolor
77 | 6.6,3,4.4,1.4,versicolor
78 | 6.8,2.8,4.8,1.4,versicolor
79 | 6.7,3,5,1.7,versicolor
80 | 6,2.9,4.5,1.5,versicolor
81 | 5.7,2.6,3.5,1,versicolor
82 | 5.5,2.4,3.8,1.1,versicolor
83 | 5.5,2.4,3.7,1,versicolor
84 | 5.8,2.7,3.9,1.2,versicolor
85 | 6,2.7,5.1,1.6,versicolor
86 | 5.4,3,4.5,1.5,versicolor
87 | 6,3.4,4.5,1.6,versicolor
88 | 6.7,3.1,4.7,1.5,versicolor
89 | 6.3,2.3,4.4,1.3,versicolor
90 | 5.6,3,4.1,1.3,versicolor
91 | 5.5,2.5,4,1.3,versicolor
92 | 5.5,2.6,4.4,1.2,versicolor
93 | 6.1,3,4.6,1.4,versicolor
94 | 5.8,2.6,4,1.2,versicolor
95 | 5,2.3,3.3,1,versicolor
96 | 5.6,2.7,4.2,1.3,versicolor
97 | 5.7,3,4.2,1.2,versicolor
98 | 5.7,2.9,4.2,1.3,versicolor
99 | 6.2,2.9,4.3,1.3,versicolor
100 | 5.1,2.5,3,1.1,versicolor
101 | 5.7,2.8,4.1,1.3,versicolor
102 | 6.3,3.3,6,2.5,virginica
103 | 5.8,2.7,5.1,1.9,virginica
104 | 7.1,3,5.9,2.1,virginica
105 | 6.3,2.9,5.6,1.8,virginica
106 | 6.5,3,5.8,2.2,virginica
107 | 7.6,3,6.6,2.1,virginica
108 | 4.9,2.5,4.5,1.7,virginica
109 | 7.3,2.9,6.3,1.8,virginica
110 | 6.7,2.5,5.8,1.8,virginica
111 | 7.2,3.6,6.1,2.5,virginica
112 | 6.5,3.2,5.1,2,virginica
113 | 6.4,2.7,5.3,1.9,virginica
114 | 6.8,3,5.5,2.1,virginica
115 | 5.7,2.5,5,2,virginica
116 | 5.8,2.8,5.1,2.4,virginica
117 | 6.4,3.2,5.3,2.3,virginica
118 | 6.5,3,5.5,1.8,virginica
119 | 7.7,3.8,6.7,2.2,virginica
120 | 7.7,2.6,6.9,2.3,virginica
121 | 6,2.2,5,1.5,virginica
122 | 6.9,3.2,5.7,2.3,virginica
123 | 5.6,2.8,4.9,2,virginica
124 | 7.7,2.8,6.7,2,virginica
125 | 6.3,2.7,4.9,1.8,virginica
126 | 6.7,3.3,5.7,2.1,virginica
127 | 7.2,3.2,6,1.8,virginica
128 | 6.2,2.8,4.8,1.8,virginica
129 | 6.1,3,4.9,1.8,virginica
130 | 6.4,2.8,5.6,2.1,virginica
131 | 7.2,3,5.8,1.6,virginica
132 | 7.4,2.8,6.1,1.9,virginica
133 | 7.9,3.8,6.4,2,virginica
134 | 6.4,2.8,5.6,2.2,virginica
135 | 6.3,2.8,5.1,1.5,virginica
136 | 6.1,2.6,5.6,1.4,virginica
137 | 7.7,3,6.1,2.3,virginica
138 | 6.3,3.4,5.6,2.4,virginica
139 | 6.4,3.1,5.5,1.8,virginica
140 | 6,3,4.8,1.8,virginica
141 | 6.9,3.1,5.4,2.1,virginica
142 | 6.7,3.1,5.6,2.4,virginica
143 | 6.9,3.1,5.1,2.3,virginica
144 | 5.8,2.7,5.1,1.9,virginica
145 | 6.8,3.2,5.9,2.3,virginica
146 | 6.7,3.3,5.7,2.5,virginica
147 | 6.7,3,5.2,2.3,virginica
148 | 6.3,2.5,5,1.9,virginica
149 | 6.5,3,5.2,2,virginica
150 | 6.2,3.4,5.4,2.3,virginica
151 | 5.9,3,5.1,1.8,virginica
152 |
--------------------------------------------------------------------------------
/data/kaggle-titanic-gender_submission.csv:
--------------------------------------------------------------------------------
1 | PassengerId,Survived
2 | 892,0
3 | 893,1
4 | 894,0
5 | 895,0
6 | 896,1
7 | 897,0
8 | 898,1
9 | 899,0
10 | 900,1
11 | 901,0
12 | 902,0
13 | 903,0
14 | 904,1
15 | 905,0
16 | 906,1
17 | 907,1
18 | 908,0
19 | 909,0
20 | 910,1
21 | 911,1
22 | 912,0
23 | 913,0
24 | 914,1
25 | 915,0
26 | 916,1
27 | 917,0
28 | 918,1
29 | 919,0
30 | 920,0
31 | 921,0
32 | 922,0
33 | 923,0
34 | 924,1
35 | 925,1
36 | 926,0
37 | 927,0
38 | 928,1
39 | 929,1
40 | 930,0
41 | 931,0
42 | 932,0
43 | 933,0
44 | 934,0
45 | 935,1
46 | 936,1
47 | 937,0
48 | 938,0
49 | 939,0
50 | 940,1
51 | 941,1
52 | 942,0
53 | 943,0
54 | 944,1
55 | 945,1
56 | 946,0
57 | 947,0
58 | 948,0
59 | 949,0
60 | 950,0
61 | 951,1
62 | 952,0
63 | 953,0
64 | 954,0
65 | 955,1
66 | 956,0
67 | 957,1
68 | 958,1
69 | 959,0
70 | 960,0
71 | 961,1
72 | 962,1
73 | 963,0
74 | 964,1
75 | 965,0
76 | 966,1
77 | 967,0
78 | 968,0
79 | 969,1
80 | 970,0
81 | 971,1
82 | 972,0
83 | 973,0
84 | 974,0
85 | 975,0
86 | 976,0
87 | 977,0
88 | 978,1
89 | 979,1
90 | 980,1
91 | 981,0
92 | 982,1
93 | 983,0
94 | 984,1
95 | 985,0
96 | 986,0
97 | 987,0
98 | 988,1
99 | 989,0
100 | 990,1
101 | 991,0
102 | 992,1
103 | 993,0
104 | 994,0
105 | 995,0
106 | 996,1
107 | 997,0
108 | 998,0
109 | 999,0
110 | 1000,0
111 | 1001,0
112 | 1002,0
113 | 1003,1
114 | 1004,1
115 | 1005,1
116 | 1006,1
117 | 1007,0
118 | 1008,0
119 | 1009,1
120 | 1010,0
121 | 1011,1
122 | 1012,1
123 | 1013,0
124 | 1014,1
125 | 1015,0
126 | 1016,0
127 | 1017,1
128 | 1018,0
129 | 1019,1
130 | 1020,0
131 | 1021,0
132 | 1022,0
133 | 1023,0
134 | 1024,1
135 | 1025,0
136 | 1026,0
137 | 1027,0
138 | 1028,0
139 | 1029,0
140 | 1030,1
141 | 1031,0
142 | 1032,1
143 | 1033,1
144 | 1034,0
145 | 1035,0
146 | 1036,0
147 | 1037,0
148 | 1038,0
149 | 1039,0
150 | 1040,0
151 | 1041,0
152 | 1042,1
153 | 1043,0
154 | 1044,0
155 | 1045,1
156 | 1046,0
157 | 1047,0
158 | 1048,1
159 | 1049,1
160 | 1050,0
161 | 1051,1
162 | 1052,1
163 | 1053,0
164 | 1054,1
165 | 1055,0
166 | 1056,0
167 | 1057,1
168 | 1058,0
169 | 1059,0
170 | 1060,1
171 | 1061,1
172 | 1062,0
173 | 1063,0
174 | 1064,0
175 | 1065,0
176 | 1066,0
177 | 1067,1
178 | 1068,1
179 | 1069,0
180 | 1070,1
181 | 1071,1
182 | 1072,0
183 | 1073,0
184 | 1074,1
185 | 1075,0
186 | 1076,1
187 | 1077,0
188 | 1078,1
189 | 1079,0
190 | 1080,1
191 | 1081,0
192 | 1082,0
193 | 1083,0
194 | 1084,0
195 | 1085,0
196 | 1086,0
197 | 1087,0
198 | 1088,0
199 | 1089,1
200 | 1090,0
201 | 1091,1
202 | 1092,1
203 | 1093,0
204 | 1094,0
205 | 1095,1
206 | 1096,0
207 | 1097,0
208 | 1098,1
209 | 1099,0
210 | 1100,1
211 | 1101,0
212 | 1102,0
213 | 1103,0
214 | 1104,0
215 | 1105,1
216 | 1106,1
217 | 1107,0
218 | 1108,1
219 | 1109,0
220 | 1110,1
221 | 1111,0
222 | 1112,1
223 | 1113,0
224 | 1114,1
225 | 1115,0
226 | 1116,1
227 | 1117,1
228 | 1118,0
229 | 1119,1
230 | 1120,0
231 | 1121,0
232 | 1122,0
233 | 1123,1
234 | 1124,0
235 | 1125,0
236 | 1126,0
237 | 1127,0
238 | 1128,0
239 | 1129,0
240 | 1130,1
241 | 1131,1
242 | 1132,1
243 | 1133,1
244 | 1134,0
245 | 1135,0
246 | 1136,0
247 | 1137,0
248 | 1138,1
249 | 1139,0
250 | 1140,1
251 | 1141,1
252 | 1142,1
253 | 1143,0
254 | 1144,0
255 | 1145,0
256 | 1146,0
257 | 1147,0
258 | 1148,0
259 | 1149,0
260 | 1150,1
261 | 1151,0
262 | 1152,0
263 | 1153,0
264 | 1154,1
265 | 1155,1
266 | 1156,0
267 | 1157,0
268 | 1158,0
269 | 1159,0
270 | 1160,1
271 | 1161,0
272 | 1162,0
273 | 1163,0
274 | 1164,1
275 | 1165,1
276 | 1166,0
277 | 1167,1
278 | 1168,0
279 | 1169,0
280 | 1170,0
281 | 1171,0
282 | 1172,1
283 | 1173,0
284 | 1174,1
285 | 1175,1
286 | 1176,1
287 | 1177,0
288 | 1178,0
289 | 1179,0
290 | 1180,0
291 | 1181,0
292 | 1182,0
293 | 1183,1
294 | 1184,0
295 | 1185,0
296 | 1186,0
297 | 1187,0
298 | 1188,1
299 | 1189,0
300 | 1190,0
301 | 1191,0
302 | 1192,0
303 | 1193,0
304 | 1194,0
305 | 1195,0
306 | 1196,1
307 | 1197,1
308 | 1198,0
309 | 1199,0
310 | 1200,0
311 | 1201,1
312 | 1202,0
313 | 1203,0
314 | 1204,0
315 | 1205,1
316 | 1206,1
317 | 1207,1
318 | 1208,0
319 | 1209,0
320 | 1210,0
321 | 1211,0
322 | 1212,0
323 | 1213,0
324 | 1214,0
325 | 1215,0
326 | 1216,1
327 | 1217,0
328 | 1218,1
329 | 1219,0
330 | 1220,0
331 | 1221,0
332 | 1222,1
333 | 1223,0
334 | 1224,0
335 | 1225,1
336 | 1226,0
337 | 1227,0
338 | 1228,0
339 | 1229,0
340 | 1230,0
341 | 1231,0
342 | 1232,0
343 | 1233,0
344 | 1234,0
345 | 1235,1
346 | 1236,0
347 | 1237,1
348 | 1238,0
349 | 1239,1
350 | 1240,0
351 | 1241,1
352 | 1242,1
353 | 1243,0
354 | 1244,0
355 | 1245,0
356 | 1246,1
357 | 1247,0
358 | 1248,1
359 | 1249,0
360 | 1250,0
361 | 1251,1
362 | 1252,0
363 | 1253,1
364 | 1254,1
365 | 1255,0
366 | 1256,1
367 | 1257,1
368 | 1258,0
369 | 1259,1
370 | 1260,1
371 | 1261,0
372 | 1262,0
373 | 1263,1
374 | 1264,0
375 | 1265,0
376 | 1266,1
377 | 1267,1
378 | 1268,1
379 | 1269,0
380 | 1270,0
381 | 1271,0
382 | 1272,0
383 | 1273,0
384 | 1274,1
385 | 1275,1
386 | 1276,0
387 | 1277,1
388 | 1278,0
389 | 1279,0
390 | 1280,0
391 | 1281,0
392 | 1282,0
393 | 1283,1
394 | 1284,0
395 | 1285,0
396 | 1286,0
397 | 1287,1
398 | 1288,0
399 | 1289,1
400 | 1290,0
401 | 1291,0
402 | 1292,1
403 | 1293,0
404 | 1294,1
405 | 1295,0
406 | 1296,0
407 | 1297,0
408 | 1298,0
409 | 1299,0
410 | 1300,1
411 | 1301,1
412 | 1302,1
413 | 1303,1
414 | 1304,1
415 | 1305,0
416 | 1306,1
417 | 1307,0
418 | 1308,0
419 | 1309,0
420 |
--------------------------------------------------------------------------------
/data/mtcars.csv:
--------------------------------------------------------------------------------
1 | ,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
2 | Mazda RX4,21,6,160,110,3.9,2.62,16.46,0,1,4,4
3 | Mazda RX4 Wag,21,6,160,110,3.9,2.875,17.02,0,1,4,4
4 | Datsun 710,22.8,4,108,93,3.85,2.32,18.61,1,1,4,1
5 | Hornet 4 Drive,21.4,6,258,110,3.08,3.215,19.44,1,0,3,1
6 | Hornet Sportabout,18.7,8,360,175,3.15,3.44,17.02,0,0,3,2
7 | Valiant,18.1,6,225,105,2.76,3.46,20.22,1,0,3,1
8 | Duster 360,14.3,8,360,245,3.21,3.57,15.84,0,0,3,4
9 | Merc 240D,24.4,4,146.7,62,3.69,3.19,20,1,0,4,2
10 | Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
11 | Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4
12 | Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
13 | Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3
14 | Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3
15 | Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18,0,0,3,3
16 | Cadillac Fleetwood,10.4,8,472,205,2.93,5.25,17.98,0,0,3,4
17 | Lincoln Continental,10.4,8,460,215,3,5.424,17.82,0,0,3,4
18 | Chrysler Imperial,14.7,8,440,230,3.23,5.345,17.42,0,0,3,4
19 | Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
20 | Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2
21 | Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1
22 | Toyota Corona,21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1
23 | Dodge Challenger,15.5,8,318,150,2.76,3.52,16.87,0,0,3,2
24 | AMC Javelin,15.2,8,304,150,3.15,3.435,17.3,0,0,3,2
25 | Camaro Z28,13.3,8,350,245,3.73,3.84,15.41,0,0,3,4
26 | Pontiac Firebird,19.2,8,400,175,3.08,3.845,17.05,0,0,3,2
27 | Fiat X1-9,27.3,4,79,66,4.08,1.935,18.9,1,1,4,1
28 | Porsche 914-2,26,4,120.3,91,4.43,2.14,16.7,0,1,5,2
29 | Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
30 | Ford Pantera L,15.8,8,351,264,4.22,3.17,14.5,0,1,5,4
31 | Ferrari Dino,19.7,6,145,175,3.62,2.77,15.5,0,1,5,6
32 | Maserati Bora,15,8,301,335,3.54,3.57,14.6,0,1,5,8
33 | Volvo 142E,21.4,4,121,109,4.11,2.78,18.6,1,1,4,2
--------------------------------------------------------------------------------
/data/saved-mtcars/.part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/data/saved-mtcars/.part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv.crc
--------------------------------------------------------------------------------
/data/saved-mtcars/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/data/saved-mtcars/_SUCCESS
--------------------------------------------------------------------------------
/data/saved-mtcars/part-00000-1bbfe035-9f3f-4242-b1f2-be740ac7b5fb-c000.csv:
--------------------------------------------------------------------------------
1 | x_rown_ames,x_mpg,x_cyl,x_disp,x_hp,x_drat,x_wt,x_qsec,x_vs,x_am,x_gear,x_carb
2 | Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
3 | Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
4 | Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
5 | Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
6 | Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
7 | Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
8 | Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
9 | Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
10 | Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
11 | Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4
12 | Merc 280C,17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4
13 | Merc 450SE,16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3
14 | Merc 450SL,17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3
15 | Merc 450SLC,15.2,8,275.8,180,3.07,3.78,18.0,0,0,3,3
16 | Cadillac Fleetwood,10.4,8,472.0,205,2.93,5.25,17.98,0,0,3,4
17 | Lincoln Continental,10.4,8,460.0,215,3.0,5.424,17.82,0,0,3,4
18 | Chrysler Imperial,14.7,8,440.0,230,3.23,5.345,17.42,0,0,3,4
19 | Fiat 128,32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1
20 | Honda Civic,30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2
21 | Toyota Corolla,33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1
22 | Toyota Corona,21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1
23 | Dodge Challenger,15.5,8,318.0,150,2.76,3.52,16.87,0,0,3,2
24 | AMC Javelin,15.2,8,304.0,150,3.15,3.435,17.3,0,0,3,2
25 | Camaro Z28,13.3,8,350.0,245,3.73,3.84,15.41,0,0,3,4
26 | Pontiac Firebird,19.2,8,400.0,175,3.08,3.845,17.05,0,0,3,2
27 | Fiat X1-9,27.3,4,79.0,66,4.08,1.935,18.9,1,1,4,1
28 | Porsche 914-2,26.0,4,120.3,91,4.43,2.14,16.7,0,1,5,2
29 | Lotus Europa,30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2
30 | Ford Pantera L,15.8,8,351.0,264,4.22,3.17,14.5,0,1,5,4
31 | Ferrari Dino,19.7,6,145.0,175,3.62,2.77,15.5,0,1,5,6
32 | Maserati Bora,15.0,8,301.0,335,3.54,3.57,14.6,0,1,5,8
33 | Volvo 142E,21.4,4,121.0,109,4.11,2.78,18.6,1,1,4,2
34 |
--------------------------------------------------------------------------------
/data/saved-twitter/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/data/saved-twitter/.part-00000.crc
--------------------------------------------------------------------------------
/data/saved-twitter/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/data/saved-twitter/_SUCCESS
--------------------------------------------------------------------------------
/data/saved-twitter/part-00000:
--------------------------------------------------------------------------------
1 | Fresh install of XP on new computer. Sweet relief! fuck vista 1018769417 1.0
2 | Well. Now I know where to go when I want my knives. #ChiChevySXSW http://post.ly/RvDl 10284216536 1.0
3 | "Literally six weeks before I can take off ""SSC Chair"" off my email. Its like the torturous 4th mile before everything stops hurting." 10298589026 1.0
4 | Mitsubishi i MiEV - Wikipedia, the free encyclopedia - http://goo.gl/xipe Cutest car ever! 109017669432377344 1.0
5 | 'Cheap Eats in SLP' - http://t.co/4w8gRp7 109642968603963392 1.0
6 | Teenage Mutant Ninja Turtle art is never a bad thing... http://bit.ly/aDMHyW 10995492579 1.0
7 | New demographic survey of online video viewers: http://bit.ly/cx8b7I via @KellyOlexa 11713360136 1.0
8 | hi all - i'm going to be tweeting things lookstat at the @lookstat twitter account. please follow me there 1208319583 1.0
9 | Holy carp, no. That movie will seriously suffer for it. RT @MouseInfo: Anyone excited for The Little Mermaid in 3D? 121330835726155776 1.0
10 | "Did I really need to learn ""I bought a box and put in it things"" in arabic? This is the most random book ever." 12358025545 1.0
11 |
--------------------------------------------------------------------------------
/data/titanic/gender_submission.csv:
--------------------------------------------------------------------------------
1 | PassengerId,Survived
2 | 892,0
3 | 893,1
4 | 894,0
5 | 895,0
6 | 896,1
7 | 897,0
8 | 898,1
9 | 899,0
10 | 900,1
11 | 901,0
12 | 902,0
13 | 903,0
14 | 904,1
15 | 905,0
16 | 906,1
17 | 907,1
18 | 908,0
19 | 909,0
20 | 910,1
21 | 911,1
22 | 912,0
23 | 913,0
24 | 914,1
25 | 915,0
26 | 916,1
27 | 917,0
28 | 918,1
29 | 919,0
30 | 920,0
31 | 921,0
32 | 922,0
33 | 923,0
34 | 924,1
35 | 925,1
36 | 926,0
37 | 927,0
38 | 928,1
39 | 929,1
40 | 930,0
41 | 931,0
42 | 932,0
43 | 933,0
44 | 934,0
45 | 935,1
46 | 936,1
47 | 937,0
48 | 938,0
49 | 939,0
50 | 940,1
51 | 941,1
52 | 942,0
53 | 943,0
54 | 944,1
55 | 945,1
56 | 946,0
57 | 947,0
58 | 948,0
59 | 949,0
60 | 950,0
61 | 951,1
62 | 952,0
63 | 953,0
64 | 954,0
65 | 955,1
66 | 956,0
67 | 957,1
68 | 958,1
69 | 959,0
70 | 960,0
71 | 961,1
72 | 962,1
73 | 963,0
74 | 964,1
75 | 965,0
76 | 966,1
77 | 967,0
78 | 968,0
79 | 969,1
80 | 970,0
81 | 971,1
82 | 972,0
83 | 973,0
84 | 974,0
85 | 975,0
86 | 976,0
87 | 977,0
88 | 978,1
89 | 979,1
90 | 980,1
91 | 981,0
92 | 982,1
93 | 983,0
94 | 984,1
95 | 985,0
96 | 986,0
97 | 987,0
98 | 988,1
99 | 989,0
100 | 990,1
101 | 991,0
102 | 992,1
103 | 993,0
104 | 994,0
105 | 995,0
106 | 996,1
107 | 997,0
108 | 998,0
109 | 999,0
110 | 1000,0
111 | 1001,0
112 | 1002,0
113 | 1003,1
114 | 1004,1
115 | 1005,1
116 | 1006,1
117 | 1007,0
118 | 1008,0
119 | 1009,1
120 | 1010,0
121 | 1011,1
122 | 1012,1
123 | 1013,0
124 | 1014,1
125 | 1015,0
126 | 1016,0
127 | 1017,1
128 | 1018,0
129 | 1019,1
130 | 1020,0
131 | 1021,0
132 | 1022,0
133 | 1023,0
134 | 1024,1
135 | 1025,0
136 | 1026,0
137 | 1027,0
138 | 1028,0
139 | 1029,0
140 | 1030,1
141 | 1031,0
142 | 1032,1
143 | 1033,1
144 | 1034,0
145 | 1035,0
146 | 1036,0
147 | 1037,0
148 | 1038,0
149 | 1039,0
150 | 1040,0
151 | 1041,0
152 | 1042,1
153 | 1043,0
154 | 1044,0
155 | 1045,1
156 | 1046,0
157 | 1047,0
158 | 1048,1
159 | 1049,1
160 | 1050,0
161 | 1051,1
162 | 1052,1
163 | 1053,0
164 | 1054,1
165 | 1055,0
166 | 1056,0
167 | 1057,1
168 | 1058,0
169 | 1059,0
170 | 1060,1
171 | 1061,1
172 | 1062,0
173 | 1063,0
174 | 1064,0
175 | 1065,0
176 | 1066,0
177 | 1067,1
178 | 1068,1
179 | 1069,0
180 | 1070,1
181 | 1071,1
182 | 1072,0
183 | 1073,0
184 | 1074,1
185 | 1075,0
186 | 1076,1
187 | 1077,0
188 | 1078,1
189 | 1079,0
190 | 1080,1
191 | 1081,0
192 | 1082,0
193 | 1083,0
194 | 1084,0
195 | 1085,0
196 | 1086,0
197 | 1087,0
198 | 1088,0
199 | 1089,1
200 | 1090,0
201 | 1091,1
202 | 1092,1
203 | 1093,0
204 | 1094,0
205 | 1095,1
206 | 1096,0
207 | 1097,0
208 | 1098,1
209 | 1099,0
210 | 1100,1
211 | 1101,0
212 | 1102,0
213 | 1103,0
214 | 1104,0
215 | 1105,1
216 | 1106,1
217 | 1107,0
218 | 1108,1
219 | 1109,0
220 | 1110,1
221 | 1111,0
222 | 1112,1
223 | 1113,0
224 | 1114,1
225 | 1115,0
226 | 1116,1
227 | 1117,1
228 | 1118,0
229 | 1119,1
230 | 1120,0
231 | 1121,0
232 | 1122,0
233 | 1123,1
234 | 1124,0
235 | 1125,0
236 | 1126,0
237 | 1127,0
238 | 1128,0
239 | 1129,0
240 | 1130,1
241 | 1131,1
242 | 1132,1
243 | 1133,1
244 | 1134,0
245 | 1135,0
246 | 1136,0
247 | 1137,0
248 | 1138,1
249 | 1139,0
250 | 1140,1
251 | 1141,1
252 | 1142,1
253 | 1143,0
254 | 1144,0
255 | 1145,0
256 | 1146,0
257 | 1147,0
258 | 1148,0
259 | 1149,0
260 | 1150,1
261 | 1151,0
262 | 1152,0
263 | 1153,0
264 | 1154,1
265 | 1155,1
266 | 1156,0
267 | 1157,0
268 | 1158,0
269 | 1159,0
270 | 1160,1
271 | 1161,0
272 | 1162,0
273 | 1163,0
274 | 1164,1
275 | 1165,1
276 | 1166,0
277 | 1167,1
278 | 1168,0
279 | 1169,0
280 | 1170,0
281 | 1171,0
282 | 1172,1
283 | 1173,0
284 | 1174,1
285 | 1175,1
286 | 1176,1
287 | 1177,0
288 | 1178,0
289 | 1179,0
290 | 1180,0
291 | 1181,0
292 | 1182,0
293 | 1183,1
294 | 1184,0
295 | 1185,0
296 | 1186,0
297 | 1187,0
298 | 1188,1
299 | 1189,0
300 | 1190,0
301 | 1191,0
302 | 1192,0
303 | 1193,0
304 | 1194,0
305 | 1195,0
306 | 1196,1
307 | 1197,1
308 | 1198,0
309 | 1199,0
310 | 1200,0
311 | 1201,1
312 | 1202,0
313 | 1203,0
314 | 1204,0
315 | 1205,1
316 | 1206,1
317 | 1207,1
318 | 1208,0
319 | 1209,0
320 | 1210,0
321 | 1211,0
322 | 1212,0
323 | 1213,0
324 | 1214,0
325 | 1215,0
326 | 1216,1
327 | 1217,0
328 | 1218,1
329 | 1219,0
330 | 1220,0
331 | 1221,0
332 | 1222,1
333 | 1223,0
334 | 1224,0
335 | 1225,1
336 | 1226,0
337 | 1227,0
338 | 1228,0
339 | 1229,0
340 | 1230,0
341 | 1231,0
342 | 1232,0
343 | 1233,0
344 | 1234,0
345 | 1235,1
346 | 1236,0
347 | 1237,1
348 | 1238,0
349 | 1239,1
350 | 1240,0
351 | 1241,1
352 | 1242,1
353 | 1243,0
354 | 1244,0
355 | 1245,0
356 | 1246,1
357 | 1247,0
358 | 1248,1
359 | 1249,0
360 | 1250,0
361 | 1251,1
362 | 1252,0
363 | 1253,1
364 | 1254,1
365 | 1255,0
366 | 1256,1
367 | 1257,1
368 | 1258,0
369 | 1259,1
370 | 1260,1
371 | 1261,0
372 | 1262,0
373 | 1263,1
374 | 1264,0
375 | 1265,0
376 | 1266,1
377 | 1267,1
378 | 1268,1
379 | 1269,0
380 | 1270,0
381 | 1271,0
382 | 1272,0
383 | 1273,0
384 | 1274,1
385 | 1275,1
386 | 1276,0
387 | 1277,1
388 | 1278,0
389 | 1279,0
390 | 1280,0
391 | 1281,0
392 | 1282,0
393 | 1283,1
394 | 1284,0
395 | 1285,0
396 | 1286,0
397 | 1287,1
398 | 1288,0
399 | 1289,1
400 | 1290,0
401 | 1291,0
402 | 1292,1
403 | 1293,0
404 | 1294,1
405 | 1295,0
406 | 1296,0
407 | 1297,0
408 | 1298,0
409 | 1299,0
410 | 1300,1
411 | 1301,1
412 | 1302,1
413 | 1303,1
414 | 1304,1
415 | 1305,0
416 | 1306,1
417 | 1307,0
418 | 1308,0
419 | 1309,0
420 |
--------------------------------------------------------------------------------
/data/twitter.txt:
--------------------------------------------------------------------------------
1 | Fresh install of XP on new computer. Sweet relief! fuck vista 1018769417 1.0
2 | Well. Now I know where to go when I want my knives. #ChiChevySXSW http://post.ly/RvDl 10284216536 1.0
3 | "Literally six weeks before I can take off ""SSC Chair"" off my email. Its like the torturous 4th mile before everything stops hurting." 10298589026 1.0
4 | Mitsubishi i MiEV - Wikipedia, the free encyclopedia - http://goo.gl/xipe Cutest car ever! 109017669432377344 1.0
5 | 'Cheap Eats in SLP' - http://t.co/4w8gRp7 109642968603963392 1.0
6 | Teenage Mutant Ninja Turtle art is never a bad thing... http://bit.ly/aDMHyW 10995492579 1.0
7 | New demographic survey of online video viewers: http://bit.ly/cx8b7I via @KellyOlexa 11713360136 1.0
8 | hi all - i'm going to be tweeting things lookstat at the @lookstat twitter account. please follow me there 1208319583 1.0
9 | Holy carp, no. That movie will seriously suffer for it. RT @MouseInfo: Anyone excited for The Little Mermaid in 3D? 121330835726155776 1.0
10 | "Did I really need to learn ""I bought a box and put in it things"" in arabic? This is the most random book ever." 12358025545 1.0
11 |
--------------------------------------------------------------------------------
/delete-readme.txt:
--------------------------------------------------------------------------------
1 | # Learning Apache Spark
2 |
3 | **[Ming Chen](https://github.com/MingChen0919) & [Wenqiang Feng](http://web.utk.edu/~wfeng1/)**
4 |
5 | ## Introduction
6 |
7 | This repository mainly contains notes from learning Apache Spark, written by [Ming Chen](https://github.com/MingChen0919) & [Wenqiang Feng](http://web.utk.edu/~wfeng1/). We use detailed demo code and examples to show how to use PySpark for big data mining. **If you find that your work isn't cited in these notes, please feel free to let us know.**
8 |
9 | ## Content
10 |
11 | * ***Cheat Sheets***
12 | + [Python Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PythonForDataScience.pdf)
13 | + [Pandas Basics](http://datacamp-community.s3.amazonaws.com/3857975e-e12f-406a-b3e8-7d627217e952)
14 | + [Numpy Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Numpy_Python_Cheat_Sheet.pdf)
15 | + [Scikit-Learn](http://datacamp-community.s3.amazonaws.com/5433fa18-9f43-44cc-b228-74672efcd116)
16 | + [RDD Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_Cheat_Sheet_Python.pdf)
17 | + [DataFrame Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_SQL_Cheat_Sheet_Python.pdf)
18 | + [Apache Spark Cheat Sheet](https://hackr.io/tutorials/learn-apache-spark)
19 |
20 | * ***Data Manipulation***
21 | + [Entry Points to Spark](entry-points-to-spark.ipynb)
22 | + [RDD Object](rdd-object.ipynb)
23 | + [DataFrame Object](dataframe-object.ipynb)
24 | + [RDD and DataFrame conversion](conversion-between-rdd-and-dataframe.ipynb)
25 | + [Categorical Data, `StringIndexer` and `OneHotEncoder`](stringindexer-and-onehotencoder.ipynb)
26 | + [Continuous variables to categorical variables](Continuous-variable-to-categorical-variable.ipynb)
27 | + [Import and export data](import-and-export-data.ipynb)
28 | + [Subset data](subset-data.ipynb):
29 | * select rows by index
30 | * select rows by logical criteria
31 | * select columns by index
32 | * select columns by names
33 | * select columns by regex pattern
34 | + [`udf()` function and SQL data types](udf-and-sql-types.ipynb):
35 | * use `udf()` function
36 | * difference between `ArrayType` and `StructType`
37 | + [Pipeline](pipeline.ipynb)
38 | + [Dense and sparse vectors](dense-vs-sparse-vectors.ipynb)
39 | + [Assemble feature columns into a `featuresCol` column with `VectorAssembler`](vector-assembler.ipynb)
40 | + [TF-IDF, HashingTF and CountVectorizer](TF-IDF.ipynb)
41 | + Feature processing:
42 | - [First data check](first-data-check.ipynb)
43 | + [SQL functions](sql-functions.ipynb)
44 | + [Add .py files to a Spark cluster](add-py-files-to-spark-cluster.ipynb)
45 |
46 | * ***Machine Learning***
47 | + [Machine Learning Framework](machine-learning-framework.Rmd)
48 | + **Regression**
49 |
50 | - [Linear regression](linear-regression.ipynb)
51 | - [Logistic regression](logistic-regression.ipynb)
52 |
53 | + **Classification**
54 |
55 | - [Naive bayes classification](naive-bayes-classification.ipynb)
56 | - [Decision tree](decision-tree-classification.ipynb)
57 | - [Random forest classification](random-forest-classification.ipynb)
58 | - [Gradient boost tree classification](gradient-boost-tree-classification.ipynb)
59 |
60 | * **Model Tuning**
61 | + [Regularization](regularization.ipynb)
62 | + [Cross-validation](cross-validation.ipynb)
63 |
64 | * **Natural Language Processing**
65 | + [NLP and NLTK Basics](nlp-and-nltk-basics.ipynb)
66 | + [NLP Information Extraction](nlp-information-extraction.ipynb)
67 |
68 | ### Acknowledgement
69 |
70 | We would like to thank Jian Sun and Zhongbo Li at the University of Tennessee, Knoxville, for valuable discussions, and the generous anonymous authors for providing detailed solutions and source code on the internet. Without their help, this repository would not have been possible. Wenqiang would also like to thank the Institute for Mathematics and Its Applications (IMA) at the University of Minnesota, Twin Cities, for its support during his IMA Data Scientist Fellow visit.
71 |
72 | ### Feedback and suggestions
73 |
74 | Your comments and suggestions are highly appreciated. We are more than happy to receive corrections, suggestions, or other feedback for improvement through email (Ming Chen: mchen33@utk.edu; Wenqiang Feng: wfeng1@utk.edu).
75 |
--------------------------------------------------------------------------------
/images/simple-nlp-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/images/simple-nlp-pipeline.png
--------------------------------------------------------------------------------
/images/spark-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/images/spark-pipeline.png
--------------------------------------------------------------------------------
/index.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: '[Learning Apache Spark](https://github.com/MingChen0919/learning-apache-spark)'
3 | output:
4 | html_document:
5 | highlight: pygments
6 | ---
7 |
8 | ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
9 | knitr::opts_knit$set(progress = FALSE)
10 | knitr::opts_chunk$set(error = TRUE, echo = FALSE)
11 | library(htmltools)
12 | ```
13 |
14 | ```{r, echo=FALSE}
15 | # to make the CSS theme work, the stylesheet <link> tag cannot be added
16 | # directly as raw HTML;
17 | # it has to be added from a code chunk with the htmltools functions!
18 | css_link = tags$link()
19 | css_link$attribs = list(rel="stylesheet", href="vakata-jstree-3.3.5/dist/themes/default/style.min.css")
20 | css_link
21 | ```
22 |
23 | ```{r, eval=FALSE, echo=FALSE}
24 | # this code chunk is purely for adding comments
25 | # the lines below add the jQuery and jstree JavaScript files
26 | ```
27 |
28 |
29 |
30 | ```{r, eval=FALSE, echo=FALSE}
31 | # this code chunk is purely for adding comments
32 | # the JavaScript code below builds the file tree interface
33 | # see this for how to make jstree leaf nodes open their hyperlinks when clicked: https://stackoverflow.com/questions/18611317/how-to-get-i-get-leaf-nodes-in-jstree-to-open-their-hyperlink-when-clicked-when
34 | ```
35 |
43 |
44 |
45 | ```{r}
46 | file_tree = function(dir = '.'){
47 |   # # get the OUTPUT_DIR folder data: dataset_NUMBER_files
48 |   # report_files_path = Sys.getenv('REPORT_FILES_PATH')
49 |   # output_dir = tail(strsplit(report_files_path, '/')[[1]], 1)
50 |
51 |   files = list.files(path = dir, recursive = FALSE, full.names = TRUE)
52 |   # 'files' also includes directories; remove the directories
53 |   files = files[!dir.exists(files)]
54 |   dirs = list.dirs(path = dir, recursive = FALSE, full.names = TRUE)
55 |   # exclude .ipynb_checkpoints folder
56 |   # ipynb_checkpoints = grep(pattern = 'ipynb_checkpoints', x = dirs)
57 |   # dirs = dirs[-ipynb_checkpoints]
58 |   github_repo_url = 'https://github.com/MingChen0919/learning-apache-spark/blob/master/'
59 |   tags$ul(
60 |     {
61 |       if (length(files) > 0) {
62 |         lapply(files, function(x){
63 |           path_end = tail(strsplit(x, '/')[[1]], 1)
64 |           li_item = tags$li(tags$a(path_end, href = paste0(github_repo_url, x)))
65 |           li_item$attribs = list('data-jstree' = '{"icon":"jstree-file"}')
66 |           li_item
67 |         })
68 |       }
69 |     },
70 |     {
71 |       if (length(dirs) > 0) {
72 |         lapply(dirs, function(x){
73 |           path_end = tail(strsplit(x, '/')[[1]], 1)
74 |           if (!(path_end %in% c('vakata-jstree-3.3.5', '.ipynb_checkpoints', 'spark-warehouse', 'ipynb'))) {
75 |             li_item = tags$li(path_end, file_tree(x))
76 |             li_item$attribs = list('data-jstree' = '{"icon":"jstree-folder"}', class = list('jstree-open'))
77 |             li_item
78 |           }
79 |         })
80 |       }
81 |     }
82 |   )
83 | }
84 | ```
85 |
86 |
87 | **[Ming Chen](https://github.com/MingChen0919) & [Wenqiang Feng](http://web.utk.edu/~wfeng1/)**
88 |
89 | ## Introduction
90 |
91 | This repository mainly contains notes from learning Apache Spark, written by [Ming Chen](https://github.com/MingChen0919) & [Wenqiang Feng](http://web.utk.edu/~wfeng1/). We use detailed demo code and examples to show how to use PySpark for big data mining. **If you find that your work isn't cited in these notes, please feel free to let us know.**
92 |
93 |
94 | ## Cheat Sheets
95 |
96 | + [Python Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PythonForDataScience.pdf)
97 | + [Pandas Basics](http://datacamp-community.s3.amazonaws.com/3857975e-e12f-406a-b3e8-7d627217e952)
98 | + [Numpy Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Numpy_Python_Cheat_Sheet.pdf)
99 | + [Scikit-Learn](http://datacamp-community.s3.amazonaws.com/5433fa18-9f43-44cc-b228-74672efcd116)
100 | + [RDD Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_Cheat_Sheet_Python.pdf)
101 | + [DataFrame Basics](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_SQL_Cheat_Sheet_Python.pdf)
102 | + [Apache Spark Cheat Sheet](https://hackr.io/tutorials/learn-apache-spark)
103 |
104 | ## Contents
105 |
106 | ```{r, echo=FALSE}
107 | # create a div container to store the file tree interface
108 | tags$div(
109 | id="jstree",
110 | file_tree('notebooks')
111 | )
112 | ```
113 |
114 |
115 | ## Feedback and suggestions
116 |
117 | Your comments and suggestions are highly appreciated. We are more than happy to receive corrections, suggestions, or other feedback for improvement.
118 |
--------------------------------------------------------------------------------
/legacy/01_entry_points_to_spark.Rmd:
--------------------------------------------------------------------------------
1 |
2 |
3 | ```{r setup, include=FALSE}
4 | knitr::opts_chunk$set(echo = TRUE, eval=FALSE)
5 | ```
6 |
7 |
8 | # Entry points to a Spark cluster
9 |
10 | There are two main entry points to a Spark cluster:
11 |
12 | * **SparkContext**: creates **RDDs** and broadcast variables on the cluster.
13 | * **SparkSession**: creates **DataFrames** (`pyspark.sql.dataframe.DataFrame`).
14 |
15 | # Create entry point instances
16 |
17 | * Create a **SparkContext** instance:
18 |
19 | ```{python eval=FALSE}
20 | from pyspark import SparkContext
21 | sc = SparkContext(master = 'local')
22 | ```
23 |
24 | * Create a **SparkSession** instance
25 |
26 | ```{python eval=FALSE}
27 | from pyspark.sql import SparkSession
28 | spark = SparkSession.builder \
29 | .appName("Python Spark SQL basic example") \
30 | .config("spark.some.config.option", "some-value") \
31 | .getOrCreate()
32 | ```
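33 |
34 | * A quick sanity check of both entry points (a minimal sketch of our own; the toy data below is not from the original notes):
35 |
36 | ```{python eval=FALSE}
37 | # create an RDD with the SparkContext
38 | rdd = sc.parallelize([1, 2, 3])
39 | rdd.collect()
40 | # [1, 2, 3]
41 |
42 | # create a DataFrame with the SparkSession
43 | df = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'letter'])
44 | df.show()
45 | ```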
--------------------------------------------------------------------------------
/legacy/03_dataframe_object.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "DataFrame object"
3 | author: "Ming Chen"
4 | date: "6/4/2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 |
12 |
13 | # Content
14 |
15 | * [Create a DataFrame object](#create-a-dataframe-object)
16 | * [Column instance](#column-instance)
17 | * [DataFrame column methods](#dataframe-column-methods)
18 |
19 | ## Create a DataFrame object
20 |
21 | ```{python}
22 | mtcars = spark.read.csv(path='data/mtcars.csv',
23 |                         sep=',',
24 |                         encoding='UTF-8',
25 |                         comment=None,
26 |                         header=True,
27 |                         inferSchema=True)
28 | ```
29 |
30 | ```{python}
31 | mtcars.show(n=5, truncate=False)
32 | ```
33 |
34 | ```{python}
35 | +-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
36 | |model |mpg |cyl|disp |hp |drat|wt |qsec |vs |am |gear|carb|
37 | +-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
38 | |Mazda RX4 |21.0|6 |160.0|110|3.9 |2.62 |16.46|0 |1 |4 |4 |
39 | |Mazda RX4 Wag |21.0|6 |160.0|110|3.9 |2.875|17.02|0 |1 |4 |4 |
40 | |Datsun 710 |22.8|4 |108.0|93 |3.85|2.32 |18.61|1 |1 |4 |1 |
41 | |Hornet 4 Drive |21.4|6 |258.0|110|3.08|3.215|19.44|1 |0 |3 |1 |
42 | |Hornet Sportabout|18.7|8 |360.0|175|3.15|3.44 |17.02|0 |0 |3 |2 |
43 | +-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+
44 | only showing top 5 rows
45 | ```
46 |
47 |
48 | ## Column instance
49 |
50 | Column instances can be created in two ways:
51 |
52 | 1. directly select a column out of a *DataFrame*: `df.colName`
53 | 2. create from a column expression: `df.colName + 1`
54 |
55 | Technically, these amount to the same thing: every column expression starts from a column instance, so selecting a column is just the simplest column expression (see the minimal example at the end of this page).
56 |
57 | **Remember how to create column instances, because this is usually the starting point if we want to operate DataFrame columns.**
58 |
59 | The column classes come with some methods that can operate on a column instance. ***However, almost all functions from the `pyspark.sql.functions` module take one or more column instances as argument(s)***. These functions are important for data manipulation tools.
60 |
61 | ## DataFrame column methods
62 |
63 | ### Methods that take column names as arguments:
64 |
65 | * `corr(col1, col2)`: two column names.
66 | * `cov(col1, col2)`: two column names.
67 | * `crosstab(col1, col2)`: two column names.
68 | * `describe(*cols)`: ***`*cols` refers to only column names (strings).***
69 |
70 | ### Methods that take column names or column expressions or **both** as arguments:
71 |
72 | * `cube(*cols)`: column names (string) or column expressions or **both**.
73 | * `drop(*cols)`: ***a list of column names OR a single column expression.***
74 | * `groupBy(*cols)`: column name (string) or column expression or **both**.
75 | * `rollup(*cols)`: column name (string) or column expression or **both**.
76 | * `select(*cols)`: column name (string) or column expression or **both**.
77 | * `sort(*cols, **kwargs)`: column name (string) or column expression or **both**.
78 | * `sortWithinPartitions(*cols, **kwargs)`: column name (string) or column expression or **both**.
79 | * `orderBy(*cols, **kwargs)`: column name (string) or column expression or **both**.
80 | * `sampleBy(col, fractions, seed=None)`: a column name.
81 | * `toDF(*cols)`: **a list of column names (string).**
82 | * `withColumn(colName, col)`: `colName` refers to column name; `col` refers to a column expression.
83 | * `withColumnRenamed(existing, new)`: takes column names as arguments.
84 | * `filter(condition)`: **`condition`** refers to a column expression that evaluates to `BooleanType` values.
85 |
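86 | A short sketch tying these together, assuming the `mtcars` DataFrame loaded above:
87 |
88 | ```{python}
89 | from pyspark.sql import functions as F
90 |
91 | mpg_col = mtcars.mpg             # a column instance
92 | kpl_col = mtcars.mpg * 0.425     # a column expression (miles/gallon -> km/litre)
93 |
94 | # select(), filter() and groupBy() accept column names, column expressions, or both
95 | mtcars.select('model', mpg_col, F.round(kpl_col, 2).alias('kpl')).show(5)
96 | mtcars.filter(mtcars.cyl == 6).show(5)
97 | mtcars.groupBy('cyl').count().show()
98 | ```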
--------------------------------------------------------------------------------
/legacy/HashingTF-and-CountVectorizer.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "HashingTF and CountVectorizer"
3 | author: "Wenqiang & Ming Chen"
4 | date: "3/23/2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 |
12 |
13 | ## HashingTF and CountVectorizer
14 |
15 | Both transformers turn token lists into term-frequency vectors: `HashingTF` hashes each token into a fixed number of buckets (no fitting step needed, but hash collisions are possible), while `CountVectorizer` learns an explicit vocabulary from the data via `fit()` before transforming.
16 |
15 | ### Load data
16 |
17 | ```{python}
18 | twitter = spark.createDataFrame([
19 | ('Wenqiang is a spark expert', 'Wenqiang', 1.0),
20 | ('Ming is learning spark', 'Ming', 0.0)],
21 | ['text', 'id', 'label']
22 | )
23 | ```
24 |
25 | ```{python}
26 | twitter.show()
27 | ```
28 |
29 | ```{python}
30 | +--------------------+--------+-----+
31 | | text| id|label|
32 | +--------------------+--------+-----+
33 | |Wenqiang is a spa...|Wenqiang| 1.0|
34 | |Ming is learning ...| Ming| 0.0|
35 | +--------------------+--------+-----+
36 | ```
37 |
38 |
39 | ### Tokenization
40 |
41 | ```{python}
42 | from pyspark.ml.feature import Tokenizer
43 | ```
44 |
45 | ```{python}
46 | tokenizer_mod = Tokenizer(inputCol='text', outputCol='tokens')
47 | twitter_tokens = tokenizer_mod.transform(twitter)
48 | twitter_tokens.show()
49 | ```
50 |
51 | ```{python}
52 | +--------------------+--------+-----+--------------------+
53 | | text| id|label| tokens|
54 | +--------------------+--------+-----+--------------------+
55 | |Wenqiang is a spa...|Wenqiang| 1.0|[wenqiang, is, a,...|
56 | |Ming is learning ...| Ming| 0.0|[ming, is, learni...|
57 | +--------------------+--------+-----+--------------------+
58 | ```
59 |
60 |
61 | ### HashingTF
62 |
63 | ```{python}
64 | from pyspark.ml.feature import HashingTF
65 | hashingTF_mod = HashingTF(numFeatures=pow(2,4), inputCol='tokens', outputCol='features')
66 | hashingTF_twitter = hashingTF_mod.transform(twitter_tokens)
67 | ```
68 |
69 | ```{python}
70 | hashingTF_twitter.show(truncate=False)
71 | ```
72 |
73 | ```{python}
74 | +--------------------------+--------+-----+--------------------------------+---------------------------------+
75 | |text |id |label|tokens |features |
76 | +--------------------------+--------+-----+--------------------------------+---------------------------------+
77 | |Wenqiang is a spark expert|Wenqiang|1.0 |[wenqiang, is, a, spark, expert]|(16,[1,2,9,13],[2.0,1.0,1.0,1.0])|
78 | |Ming is learning spark |Ming |0.0 |[ming, is, learning, spark] |(16,[0,1,14],[1.0,2.0,1.0]) |
79 | +--------------------------+--------+-----+--------------------------------+---------------------------------+
80 | ```
81 |
82 |
83 | ### CountVectorizer
84 |
85 | ```{python}
86 | from pyspark.ml.feature import CountVectorizer
87 | count_vectorizer = CountVectorizer(vocabSize=pow(2,4), inputCol='tokens', outputCol='features')
88 | countVectorizer_mod = count_vectorizer.fit(twitter_tokens)
89 | countVectorizer_twitter = countVectorizer_mod.transform(twitter_tokens)
90 | ```
91 |
92 | ```{python}
93 | countVectorizer_twitter.show(truncate=False)
94 | ```
95 |
96 | ```{python}
97 | +--------------------------+--------+-----+--------------------------------+-------------------------------------+
98 | |text |id |label|tokens |features |
99 | +--------------------------+--------+-----+--------------------------------+-------------------------------------+
100 | |Wenqiang is a spark expert|Wenqiang|1.0 |[wenqiang, is, a, spark, expert]|(7,[0,1,2,3,5],[1.0,1.0,1.0,1.0,1.0])|
101 | |Ming is learning spark |Ming |0.0 |[ming, is, learning, spark] |(7,[0,1,4,6],[1.0,1.0,1.0,1.0]) |
102 | +--------------------------+--------+-----+--------------------------------+-------------------------------------+
103 | ```
104 |
105 |
106 |
107 |
108 |
109 |
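110 | ### Inspect the learned vocabulary
111 |
112 | Unlike `HashingTF`, the fitted `CountVectorizer` model keeps its vocabulary, so each feature index can be mapped back to a token (a quick check; the ordering depends on the fit above):
113 |
114 | ```{python}
115 | countVectorizer_mod.vocabulary
116 | ```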
--------------------------------------------------------------------------------
/legacy/continuous-to-categorical-variable.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Continuous to categorical data"
3 | author: "Ming Chen"
4 | date: "6/9/2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 |
12 | ## Convert continuous variables to categorical variables
13 |
14 | There are two functions we can use to split a continuous variable into categories:
15 |
16 | * `pyspark.ml.feature.Binarizer`: split a column of continuous features given a threshold.
17 | * `pyspark.ml.feature.Bucketizer`: split a column of continuous features into categories given several split points.
18 |     + with $n + 1$ split points, there are $n$ categories (buckets).
19 |
20 | **Create some data**
21 |
22 | ```{python}
23 | import numpy as np
24 | import pandas as pd
25 | np.random.seed(seed=1234)
26 | pdf = pd.DataFrame({
27 | 'x1': np.random.randn(10),
28 | 'x2': np.random.rand(10)*10
29 | })
30 | np.random.seed(seed=None)
31 | df = spark.createDataFrame(pdf)
32 | df.show()
33 |
34 | +--------------------+------------------+
35 | | x1| x2|
36 | +--------------------+------------------+
37 | | 0.47143516373249306| 6.834629351721363|
38 | | -1.1909756947064645| 7.127020269829002|
39 | | 1.4327069684260973|3.7025075479039495|
40 | | -0.3126518960917129| 5.611961860656249|
41 | | -0.7205887333650116| 5.030831653078097|
42 | | 0.8871629403077386|0.1376844959068224|
43 | | 0.8595884137174165| 7.728266216123741|
44 | | -0.6365235044173491| 8.826411906361166|
45 | |0.015696372114428918| 3.648859839013723|
46 | | -2.2426849541854055| 6.153961784334937|
47 | +--------------------+------------------+
48 | ```
49 |
50 | **`Binarize` the column `x1` and `Bucketize` the column `x2`**
51 |
52 | ```{python}
53 | from pyspark.ml.feature import Binarizer, Bucketizer
54 | # threshold = 0 for binarizer
55 | binarizer = Binarizer(threshold=0, inputCol='x1', outputCol='x1_new')
56 | # provide 5 split points to generate 4 buckets
57 | bucketizer = Bucketizer(splits=[0, 2.5, 5, 7.5, 10], inputCol='x2', outputCol='x2_new')
58 |
59 | # pipeline stages
60 | from pyspark.ml import Pipeline
61 | stages = [binarizer, bucketizer]
62 | pipeline = Pipeline(stages=stages)
63 |
64 | # fit the pipeline model and transform the data
65 | pipeline.fit(df).transform(df).show()
66 |
67 | +--------------------+------------------+------+------+
68 | | x1| x2|x1_new|x2_new|
69 | +--------------------+------------------+------+------+
70 | | 0.47143516373249306| 6.834629351721363| 1.0| 2.0|
71 | | -1.1909756947064645| 7.127020269829002| 0.0| 2.0|
72 | | 1.4327069684260973|3.7025075479039495| 1.0| 1.0|
73 | | -0.3126518960917129| 5.611961860656249| 0.0| 2.0|
74 | | -0.7205887333650116| 5.030831653078097| 0.0| 2.0|
75 | | 0.8871629403077386|0.1376844959068224| 1.0| 0.0|
76 | | 0.8595884137174165| 7.728266216123741| 1.0| 3.0|
77 | | -0.6365235044173491| 8.826411906361166| 0.0| 3.0|
78 | |0.015696372114428918| 3.648859839013723| 1.0| 1.0|
79 | | -2.2426849541854055| 6.153961784334937| 0.0| 2.0|
80 | +--------------------+------------------+------+------+
81 | ```
82 |
83 |
84 |
--------------------------------------------------------------------------------
/legacy/cross-validation-in-r.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Cross-validation in R"
3 | author: "Ming Chen"
4 | date: "6/5/2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | library(boot)
11 | ```
12 |
13 |
14 | ## Prepare data
15 |
16 | ```{r}
17 | horseshoe_crab = read.csv("data/horseshoe_crab.csv")
18 | horseshoe_crab$C = as.factor(horseshoe_crab$C)
19 | horseshoe_crab$S = as.factor(horseshoe_crab$S)
20 | y = numeric()
21 | y[horseshoe_crab$Sa != 0] = 1
22 | y[horseshoe_crab$Sa == 0] = 0
23 | horseshoe_crab$y = y
24 | ```
25 |
26 | ## Split data into training and test datasets
27 |
28 | ```{r}
29 | training_index = sort(sample(nrow(horseshoe_crab), nrow(horseshoe_crab)*0.8))
30 | training = horseshoe_crab[training_index, ]
31 | test = horseshoe_crab[-training_index, ]
32 | ```
33 |
34 |
35 | ## Build cross validation model
36 |
37 | ```{r}
38 | # fit the logistic regression model on the training data
39 | glm_logit = glm(formula = y ~ C + S + W + Wt, data = training,
40 |                 family = 'binomial')
41 | # 4-fold cross-validation
42 | cv_glm_4 = cv.glm(data = training, glmfit = glm_logit, K = 4)
43 | # cv_glm_4$delta holds the raw and adjusted cross-validation estimates of prediction error
44 | ```
45 |
46 |
--------------------------------------------------------------------------------
/legacy/index.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Learning Apache Spark"
3 | output: html_document
4 | ---
5 |
6 | **All materials have been converted to notebooks (ipynb format) and moved to the GitHub repository. Click here to go to the [repository](https://github.com/MingChen0919/learning-apache-spark/blob/master/README.md).**
--------------------------------------------------------------------------------
/legacy/install.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Installations"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/17/2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
12 | ### **Caution**: Before you start the following steps, please make sure you have already installed:
13 |
14 | - [Java JDK](http://www.oracle.com/technetwork/java/javase/downloads/index-jsp-138363.html)
15 | - [IPython and Python](https://ipython.org/install.html)
16 | - If you use this method to install Spark, you can skip the [Spark on Jupyter section](pyspark-on-jupyter.html)
17 |
18 | ### 1. Download Apache Spark from the official website
19 | Weblink: [Download Apache Spark™](http://spark.apache.org/downloads.html)
20 |
21 | ### 2. Installation
22 |
23 | The pre-built version doesn't need installation. It is ready to
24 | use as soon as you unpack it.
25 |
26 | ### 3. Set path link
27 |
28 | This is the most difficult step for beginners. Fortunately, it can be handled easily with Min RK's [`findspark`](https://github.com/minrk/findspark) package.
29 |
30 | - install findspark
31 | ```{python eval=FALSE}
32 | pip install findspark
33 | ```
34 | - open `ipython` in terminal and import findspark
35 | ```{python eval=FALSE}
36 | import findspark
37 | findspark.init()
38 | ```
39 | - find the spark path
40 | ```{python eval=FALSE}
41 | findspark.find()
42 | ```
43 | ```{python eval=FALSE}
44 | Out[3]: '/Users/wenqiangfeng/spark/'
45 | ```
46 | - open `ipython --profile=myprofile` in the terminal, then run one of the following to persist the setting
47 | ```{python eval=FALSE}
48 | findspark.init('/Users/wenqiangfeng/spark/', edit_profile=True)
49 | ```
50 | ```{python eval=FALSE}
51 | findspark.init('/Users/wenqiangfeng/spark/', edit_rc=True)
52 | ```
53 |
54 | ### Note:
55 |
56 | - This will also help you set up the `ipython notebook` (Jupyter). You can run the following command in the terminal to double-check:
57 | ```{python eval=FALSE}
58 | jupyter notebook
59 | ```
60 |
61 | * If PySpark still doesn't work, check your `.profile` or `.bash_profile` and add the following paths to it
62 |
63 | + open `.profile` or `.bash_profile` in the terminal
64 | + add the paths to your `.profile` or `.bash_profile`
65 | ```{bash eval=FALSE}
66 | vim ~/.profile
67 | ```
68 |
69 |
70 | ```{bash eval=FALSE}
71 | # Added for Pyspark
72 | export SPARK_HOME=YOUR_PATH/apache-spark/libexec
73 | export PATH=$PATH:$SPARK_HOME/bin:$SPARK_HOME/sbin
74 | export PYSPARK_DRIVER_PYTHON="jupyter"
75 | export PYSPARK_DRIVER_PYTHON_OPTS="notebook"
76 | ```
77 |
78 |
79 |
80 |
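81 | ### Sanity check
82 |
83 | Once everything is set up, the following minimal sketch (run inside `ipython`) verifies that `findspark` can locate Spark and that a context starts cleanly:
84 |
85 | ```{python eval=FALSE}
86 | import findspark
87 | findspark.init()
88 |
89 | from pyspark import SparkContext
90 | sc = SparkContext(master='local')
91 | print(sc.version)
92 | sc.stop()
93 | ```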
--------------------------------------------------------------------------------
/legacy/k-folds-cross-validation.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "K-folds Cross Validation"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/20/2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 |
12 | ## Training/validation/test data sets
13 |
14 | * **Training set**: the data set for training your models.
15 | * **Validation set**: the data set used to evaluate the models you built on the training set; based on this performance, you choose the best (final) model.
16 | * **Test set**: use this data set to test the performance of your final model.
17 |
18 | ## K-folds cross-validation steps (k=4 as an example)
19 |
20 | * step 1: split your data into a training set and a test set (for example 80% training and 20% test). The test set will never be used in model training and selection.
21 | * step 2: split the training set into k (k=4) equal subsets: 3 subsets for training + 1 subset for validation.
22 | * step 3: train your models with the 3 training subsets and calculate a performance score with the remaining subset.
23 | * step 4: choose a different subset for validation and repeat step 3 until every subset has been used as the validation subset.
24 | * step 5: for a k=4 fold cross-validation, each trained model has been validated on 4 subsets and therefore has 4 performance scores. Calculate the average of these 4 performance scores for each model and use the average score to select the best, final model.
25 | * step 6: apply your final model to the **untouched** test data and see how it performs.
26 |
27 | ## Example of k-folds cross validation
28 |
29 | * **Build parameter grids**
30 | + parameter grid: a combination of all variable parameters in your model.
31 | + example: if I want to train a logistic regression model with 4 different *regParam* values and 3 different *elasticNetParam* values, I will have 3 x 4 = 12 models to train and validate.
32 |
33 | ```{python}
34 | from pyspark.ml.classification import LogisticRegression
35 | blor = LogisticRegression(featuresCol='indexed_features', labelCol='label', family='binomial')
36 |
37 | from pyspark.ml.tuning import ParamGridBuilder
38 | param_grid = ParamGridBuilder().\
39 | addGrid(blor.regParam, [0, 0.5, 1, 2]).\
40 | addGrid(blor.elasticNetParam, [0, 0.5, 1]).\
41 | build()
42 | ```
43 |
44 | ```{python}
45 | # the first 2 elements in param_grid
46 | [{Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0,
47 | Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='regParam', doc='regularization parameter (>= 0).'): 0},
48 | {Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5,
49 | Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='regParam', doc='regularization parameter (>= 0).'): 0}]
50 | ```
51 |
52 | * **Split data into training and test sets**
53 | + Refer to the [logistic regression page](logistic-regression.html) to see what data we used and how the training and test sets were generated.
54 |
55 | * **Run k (k=4) folds cross validation**
56 | ```{python}
57 | from pyspark.ml.evaluation import BinaryClassificationEvaluator
58 | evaluator = BinaryClassificationEvaluator()
59 |
60 | from pyspark.ml.tuning import CrossValidator
61 | cv = CrossValidator(estimator=blor, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)
62 |
63 | cvModel = cv.fit(training)
64 | ```
65 |
66 | * **Find the best model**
67 | + best model ID
68 |
69 | ```{python}
70 | cvModel.bestModel
71 | ```
72 |
73 | ```{python}
74 | LogisticRegression_41fe9f7454164180f433
75 | ```
76 |
77 | + average cross-validation metrics
78 | + the 10th model (index 9) has the highest score and is therefore the best model
79 | + *regParam* = 2 and *elasticNetParam* = 0, i.e. ridge regularization; see the sketch at the end of this page for locating this index programmatically
80 |
81 | ```{python}
82 | cvModel.avgMetrics
83 | ```
84 |
85 | ```{python}
86 | [0.8191225353777875,
87 | 0.8191225353777875,
88 | 0.8191225353777875,
89 | 0.8243105196624104,
90 | 0.5,
91 | 0.5,
92 | 0.8247709310997127,
93 | 0.5,
94 | 0.5,
95 | 0.8259072947360763,
96 | 0.5,
97 | 0.5]
98 | ```
99 |
100 |
101 | ```{python}
102 | param_grid[9]
103 | ```
104 |
105 | ```{python}
106 | {Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0, Param(parent=u'LogisticRegression_41fe9f7454164180f433', name='regParam', doc='regularization parameter (>= 0).'): 2}
107 | ```
108 |
109 | + Model comparison (not finished)
110 |
111 |
112 | ```{python}
113 | # new model
114 | blor = LogisticRegression(featuresCol='indexed_features', labelCol='label', family='binomial')
115 | model = blor.fit(training)
116 | evaluator.evaluate(model.transform(training))
117 | evaluator.evaluate(model.transform(test))
118 |
119 | new_blor = LogisticRegression(featuresCol='indexed_features', labelCol='label', family='binomial', regParam=0.5, elasticNetParam=0)
120 | new_model = new_blor.fit(training)
121 | evaluator.evaluate(new_model.transform(training))
122 | evaluator.evaluate(new_model.transform(test))
123 | ```
124 |
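125 | * **Locate the best parameter combination programmatically**
126 | + a sketch, assuming `numpy` is available; `avgMetrics` is ordered like `param_grid`
127 |
128 | ```{python}
129 | import numpy as np
130 | best_index = int(np.argmax(cvModel.avgMetrics))
131 | param_grid[best_index]   # index 9 here, i.e. the 10th model
132 | ```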
--------------------------------------------------------------------------------
/legacy/linear-regression.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Linear Regression"
3 | author: "Ming Chen"
4 | date: "6/5/2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 |
12 | # Linear regression
13 |
14 | ## Linear regression without cross-validation
15 |
16 | **Import data**
17 |
18 | ```{python}
19 | ad = spark.read.csv('data/Advertising.csv', header=True, inferSchema=True)
20 | ad.show(5)
21 |
22 | +-----+-----+---------+-----+
23 | | TV|Radio|Newspaper|Sales|
24 | +-----+-----+---------+-----+
25 | |230.1| 37.8| 69.2| 22.1|
26 | | 44.5| 39.3| 45.1| 10.4|
27 | | 17.2| 45.9| 69.3| 9.3|
28 | |151.5| 41.3| 58.5| 18.5|
29 | |180.8| 10.8| 58.4| 12.9|
30 | +-----+-----+---------+-----+
31 | only showing top 5 rows
32 | ```
33 |
34 | **Transform data structure**
35 |
36 | ```{python}
37 | from pyspark.ml.linalg import Vectors
38 | ad_df = ad.rdd.map(lambda x: [Vectors.dense(x[0:3]), x[-1]]).toDF(['features', 'label'])
39 | ad_df.show(5)
40 |
41 | +-----------------+-----+
42 | | features|label|
43 | +-----------------+-----+
44 | |[230.1,37.8,69.2]| 22.1|
45 | | [44.5,39.3,45.1]| 10.4|
46 | | [17.2,45.9,69.3]| 9.3|
47 | |[151.5,41.3,58.5]| 18.5|
48 | |[180.8,10.8,58.4]| 12.9|
49 | +-----------------+-----+
50 | only showing top 5 rows
51 | ```
52 |
53 | **Build linear regression model**
54 |
55 | ```{python}
56 | from pyspark.ml.regression import LinearRegression
57 | lr = LinearRegression(featuresCol = 'features', labelCol = 'label')
58 | ```
59 |
60 | **Fit the model**
61 |
62 | ```{python}
63 | lr_model = lr.fit(ad_df)
64 | ```
65 |
66 | **Model evaluation**
67 |
68 | ```{python}
69 | from pyspark.ml.evaluation import RegressionEvaluator
70 | ad_pred = lr_model.transform(ad_df)  # predictions are needed before evaluation
71 | evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label')
72 | evaluator.evaluate(ad_pred, {evaluator.metricName: "r2"})
73 |
74 | 0.897210638178952
75 | ```
75 |
76 | **Compare results with results from R**
77 |
78 | The comparison below shows that the linear regression analyses from pyspark and R obtained very close results.
79 |
80 | ```{python}
81 | # intercept and coefficients from R
82 | advertise = read.csv('data/Advertising.csv', header = TRUE)
83 | lr_ad = lm(Sales~., data = advertise)
84 | lr_ad$coefficients
85 |
86 | (Intercept) TV Radio Newspaper
87 | 2.938889369 0.045764645 0.188530017 -0.001037493
88 |
89 | # intercept and coefficents from pyspark
90 | lr_model.intercept
91 |
92 | 2.9388893694594134
93 |
94 | lr_model.coefficients
95 |
96 | DenseVector([0.0458, 0.1885, -0.001])
97 |
98 | # R squared from R
99 | summary(lr_ad)$r.squared
100 |
101 | 0.8972106
102 |
103 | # R squared from pyspark
104 | evaluator.evaluate(ad_pred, {evaluator.metricName: "r2"})
105 |
106 | 0.897210638178952
107 | ```
108 |
109 |
110 | ## Linear regression with cross-validation
111 |
112 | **Training and test datasets**
113 |
114 | ```{python}
115 | ## split data into training and test datasets
116 | training, test = ad_df.randomSplit([0.8, 0.2], seed=123)
117 | ```
118 |
119 | **Build cross-validation model**
120 |
121 | ```{python}
122 | ##===== build cross-validation model =====
123 |
124 | # estimator
125 | lr = LinearRegression(featuresCol = 'features', labelCol = 'label')
126 |
127 | # parameter grid
128 | from pyspark.ml.tuning import ParamGridBuilder
129 | param_grid = ParamGridBuilder().\
130 | addGrid(lr.regParam, [0, 0.5, 1]).\
131 | addGrid(lr.elasticNetParam, [0, 0.5, 1]).\
132 | build()
133 |
134 | # evaluator
135 | evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='label', metricName='r2')
136 |
137 | # cross-validation model
138 | from pyspark.ml.tuning import CrossValidator
139 | cv = CrossValidator(estimator=lr, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)
140 | ```
141 |
142 | **Fit cross-validation model**
143 |
144 | ```{python}
145 | cv_model = cv.fit(training)
146 | ```
147 |
148 | **Prediction**
149 |
150 | ```{python}
151 | pred_training_cv = cv_model.transform(training)
152 | pred_test_cv = cv_model.transform(test)
153 | ```
154 |
155 | **Evaluation**
156 |
157 | ```{python}
158 | # performance on training data
159 | evaluator.evaluate(pred_training_cv)
160 |
161 | 0.8982486958337326
162 |
163 | # performance on test data
164 | evaluator.evaluate(pred_test_cv)
165 |
166 | 0.8896562076565583
167 | ```
168 |
169 |
170 | **Intercept and coefficients**
171 |
172 | ```{python}
173 | cv_model.bestModel.intercept
174 |
175 | 3.075068686285647
176 |
177 | cv_model.bestModel.coefficients
178 |
179 | DenseVector([0.0465, 0.1809, -0.0011])
180 | ```
181 |
182 | **Get parameter values from the best model**
183 |
184 | Parameters can be extracted by calling the underlying Java object (the `_java_obj` property).
185 |
186 | ```{python}
187 | print('best regParam: ' + str(cv_model.bestModel._java_obj.getRegParam()) + "\n" +
188 | 'best ElasticNetParam:' + str(cv_model.bestModel._java_obj.getElasticNetParam()))
189 |
190 | best regParam: 0.0
191 | best ElasticNetParam:0.0
192 | ```
193 |
194 |
195 |
196 |
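197 | As an alternative (a sketch; behaviour can vary across Spark versions), the generic `extractParamMap()` method from the `Params` base class lists parameter values without reaching into the Java object:
198 |
199 | ```{python}
200 | cv_model.bestModel.extractParamMap()
201 | ```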
--------------------------------------------------------------------------------
/legacy/pyspark-on-jupyter.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Spark on Jupyter"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/5/2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
12 | ### 1. Install jupyter with conda
13 |
14 | ```{python eval=FALSE}
15 | conda install jupyter
16 | ```
17 |
18 | ### 2. Get the `jupyter` binary executable path
19 |
20 | ```{python eval=FALSE}
21 | which jupyter
22 | ```
23 | ```{python eval=FALSE}
24 | # output
25 | /Users/mingchen/anaconda2/bin/jupyter
26 | ```
27 |
28 | ### 3. Link spark with jupyter
29 |
30 | ```{python eval=FALSE}
31 | export PYSPARK_DRIVER_PYTHON=/Users/mingchen/anaconda2/bin/jupyter
32 | export PYSPARK_DRIVER_PYTHON_OPTS="notebook --NotebookApp.open_browser=False --NotebookApp.ip='*' --NotebookApp.port=8880"
33 | ```
34 |
35 | You can also add the two environment variables to the `~/.bash_profile` file to permanently link Spark with Jupyter.
36 |
37 | ### 4. Run jupyter notebook
38 |
39 | ```{python eval=FALSE}
40 | pyspark
41 | ```
42 |
43 | Then go to [http://127.0.0.1:8880](http://127.0.0.1:8880)
44 |
45 |
--------------------------------------------------------------------------------
/legacy/pyspark-on-rodeo.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Pyspark On Rodeo"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/5/2017"
5 | output: html_document
6 | ---
7 |
8 | ### 1. Install Rodeo on Mac
9 |
10 | * Download DMG file [https://www.yhat.com/products/rodeo/](https://www.yhat.com/products/rodeo/)
11 |
12 | ### 2. Install `apache-spark` with homebrew
13 |
14 | ```{bash eval=FALSE}
15 | brew install apache-spark
16 | ```
17 |
18 | ### 3. Locate the `python` directory within the apache-spark root directory
19 |
20 | ```{bash eval=FALSE}
21 | /usr/local/Cellar/apache-spark/2.1.0/libexec/python
22 | ```
23 |
24 |
25 | ### 4. Set environment variable
26 |
27 | * Open Rodeo, go to **settings**->**ENVIRONMENT VARIABLES**
28 | * Add the path `/usr/local/Cellar/apache-spark/2.1.0/libexec/python` to `PYTHONPATH`
29 |
30 | ### 5. Test pyspark on Rodeo
31 |
32 | Run the following command
33 |
34 | ```{python eval=FALSE}
35 | from pyspark import SparkConf, SparkContext
36 |
37 | conf = SparkConf().setAppName("myAppName")
38 | sc = SparkContext(conf=conf)
39 | sc
40 | ```
41 |
42 | If the setup works, this prints a `SparkContext` object, which confirms that PySpark is available inside Rodeo.
--------------------------------------------------------------------------------
/legacy/pyspark-vectors.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Pyspark Vectors"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/18/2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | ## Remark:
12 |
13 | - You can download the complete [ipython notebook](./ipynb/vector.ipynb) for this session.
14 |
15 |
16 | ## Dense vector vs. Sparse vector
17 |
18 | * Both dense and sparse vectors are homogeneous and can only hold numeric data.
19 |
20 | * `DenseVector` takes a single argument (a list of values) and behaves much like an R vector.
21 | * `SparseVector` stores only the non-zero values. It does this with three pieces of information: the vector size, the indices (positions) of the non-zero entries, and the corresponding values. From these you can tell which entries are zero and reconstruct the complete vector.
22 |
23 | ## Example:
24 |
25 | + set up spark context and SparkSession
26 |
27 | ```{python eval=FALSE}
28 | from pyspark import SparkConf, SparkContext
29 | ## set up spark context
30 | from pyspark.sql import SQLContext
31 | sc = SparkContext()
32 | sqlContext = SQLContext(sc)
33 | ## set up SparkSession
34 | from pyspark.sql import SparkSession
35 |
36 | spark = SparkSession \
37 | .builder \
38 | .appName("Python Spark SQL basic example") \
39 | .config("spark.some.config.option", "some-value") \
40 | .getOrCreate()
41 | ```
42 |
43 | + import `Vectors` from pyspark library
44 |
45 | ```{python eval=FALSE}
46 | from pyspark.ml.linalg import Vectors
47 | ```
48 |
49 | + dense vector
50 |
51 | ```{python eval=FALSE}
52 | densevector = Vectors.dense([1,3,4,2.5])
53 | densevector
54 | ```
55 |
56 | ```{python eval=FALSE}
57 | # output
58 | DenseVector([1.0, 3.0, 4.0, 2.5])
59 | ```
60 |
61 | ```{python eval=FALSE}
62 | densevector.toArray()
63 | ```
64 |
65 | ```{python eval=FALSE}
66 | # output
67 | array([ 1. , 3. , 4. , 2.5])
68 | ```
69 | + sparse vector
70 |     + the sparse vector below is a representation of the vector [ 0. , 3. , 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. ]
71 |
72 | ```{python eval=FALSE}
73 | sparseVector = Vectors.sparse(10, [1, 3], [3.0, 4.5])
74 | sparseVector.toArray()
75 | ```
76 |
77 | ```{python eval=FALSE}
78 | # output
79 | array([ 0. , 3. , 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. ])
80 | ```
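81 | + the three components that define a sparse vector can be inspected directly:
82 |
83 | ```{python eval=FALSE}
84 | sparseVector.size     # 10
85 | sparseVector.indices  # array([1, 3], dtype=int32)
86 | sparseVector.values   # array([ 3. ,  4.5])
87 | ```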
--------------------------------------------------------------------------------
/legacy/pyspark.ml.feature-module.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "pyspark.ml.feature module"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "2/15/2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
12 | ## Introduction
13 |
14 | This module provides a set of functions, methods, and classes that act on **features**. A feature is like a column of a data frame or table. Most of the functions and classes take parameters such as `inputCol`, `featuresCol`, `outputCol`, and `labelCol`; these parameters specify the names of the columns (features) you want to work on.
15 |
16 | ## class pairs and `fit/transform` functions
17 |
18 | I found that there are a lot of class pairs in this module. For example:
19 |
20 | * `ChiSqSelector` and `ChiSqSelectorModel`
21 | * `CountVectorizer` and `CountVectorizerModel`
22 | * `IDF` and `IDFModel`
23 | * a lot of other pairs ...
24 |
25 | The first class in a pair builds the model (the instructions for how you want to transform your data); the second class in the pair does the actual data transformation.
26 |
27 | * The `fit` method belongs to the first class; calling it fits the model to your data and returns an instance of the second class.
28 | * The `transform` method belongs to the second class and performs the actual data transformation, as the sketch below shows.
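29 |
30 | A minimal sketch of this pattern, assuming a `spark` session exists:
31 |
32 | ```{python eval=FALSE}
33 | from pyspark.ml.feature import CountVectorizer
34 |
35 | df = spark.createDataFrame([(['a', 'b', 'a'],), (['b', 'c'],)], ['tokens'])
36 | cv = CountVectorizer(inputCol='tokens', outputCol='features')  # first class: model instructions
37 | cv_model = cv.fit(df)            # fit() returns a CountVectorizerModel
38 | cv_model.transform(df).show()    # the second class does the transformation
39 | ```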
--------------------------------------------------------------------------------
/legacy/r-markdown-header.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "r-markdown-hearder"
3 | author: "Ming Chen"
4 | output: html_document
5 | ---
6 |
7 |
15 |
16 | ```{r setup, include=FALSE}
17 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
18 | ```
19 |
20 |
21 | ## Create SparkContext & SparkSession
22 |
23 | **SparkContext**
24 |
25 | ```{python}
26 | from pyspark import SparkContext
27 | sc = SparkContext(master = 'local')
28 | ```
29 |
30 | **SparkSession**
31 |
32 | ```{python}
33 | from pyspark.sql import SparkSession
34 | spark = SparkSession.builder \
35 |     .appName("Learning Apache Spark") \
36 | .config("spark.some.config.option", "some-value") \
37 | .getOrCreate()
38 | ```
39 |
--------------------------------------------------------------------------------
/legacy/randomforest.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Random Forest Regression"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "February 19, 2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | ### Remark:
12 |
13 | - You can download the complete [ipython notebook](./ipynb/RandomForest.ipynb) for this tutorial session.
14 |
15 | ### 1. Set up spark context and SparkSession
16 |
17 | ```{python eval=FALSE}
18 | from pyspark.sql import SparkSession
19 |
20 | spark = SparkSession \
21 | .builder \
22 | .appName("Python Spark Random Forest Regression") \
23 | .config("spark.some.config.option", "some-value") \
24 | .getOrCreate()
25 | ```
26 |
27 | ### 2. Load dataset
28 | ```{python eval=FALSE}
29 | df = spark.read.format('com.databricks.spark.csv').\
30 |     options(header='true', \
31 |     inferschema='true').load("./data/WineData.csv")
32 | ```
33 |
34 | ```{python eval=FALSE}
35 | df.printSchema()
36 | ```
37 | ```{python eval=FALSE}
38 | #output
39 | root
40 | |-- fixed acidity: double (nullable = true)
41 | |-- volatile acidity: double (nullable = true)
42 | |-- citric acid: double (nullable = true)
43 | |-- residual sugar: double (nullable = true)
44 | |-- chlorides: double (nullable = true)
45 | |-- free sulfur dioxide: double (nullable = true)
46 | |-- total sulfur dioxide: double (nullable = true)
47 | |-- density: double (nullable = true)
48 | |-- pH: double (nullable = true)
49 | |-- sulphates: double (nullable = true)
50 | |-- alcohol: double (nullable = true)
51 | |-- quality: integer (nullable = true)
52 | ```
53 |
54 |
55 | ### 3. Convert the data to dense vector
56 | ```{python eval=FALSE}
57 | from pyspark.sql import Row
58 | from pyspark.ml.linalg import Vectors
59 | ```
60 | ```{python eval=FALSE}
61 | def transData(data):
62 | return data.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])
63 | ```
64 |
65 | ```{python eval=FALSE}
66 | transformed= transData(df)
67 | transformed.show(6)
68 | ```
69 |
70 | ```{python eval=FALSE}
71 | #output
72 | +--------------------+-----+
73 | | features|label|
74 | +--------------------+-----+
75 | |[7.4,0.7,0.0,1.9,...| 5|
76 | |[7.8,0.88,0.0,2.6...| 5|
77 | |[7.8,0.76,0.04,2....| 5|
78 | |[11.2,0.28,0.56,1...| 6|
79 | |[7.4,0.7,0.0,1.9,...| 5|
80 | |[7.4,0.66,0.0,1.8...| 5|
81 | +--------------------+-----+
82 | only showing top 6 rows
83 | ```
84 |
85 | ```{python eval=FALSE}
86 | from pyspark.ml import Pipeline
87 | from pyspark.ml.regression import RandomForestRegressor
88 | from pyspark.ml.feature import VectorIndexer
89 | from pyspark.ml.evaluation import RegressionEvaluator
90 | ```
91 | ### 4. Split the data into training and test sets (30% held out for testing)
92 | ```{python eval=FALSE}
93 | # Split the data into training and test sets (30% held out for testing)
94 | (trainingData, testData) = transformed.randomSplit([0.7, 0.3])
95 | ```
96 | ### 5. Train a RandomForest model.
97 |
98 | ```{python eval=FALSE}
99 | # Train a RandomForest model.
100 | rf = RandomForestRegressor()
101 | model = rf.fit(trainingData)
102 | ```
103 | ### 6. Make predictions.
104 |
105 | ```{python eval=FALSE}
106 | # Make predictions.
107 | predictions = model.transform(testData)
108 | ```
109 | ### 7. Show results
110 | ```{python eval=FALSE}
111 | # Select example rows to display.
112 | predictions.select("prediction", "label", "features").show(5)
113 | ```
114 |
115 | ```{python eval=FALSE}
116 | #output
117 | +------------------+-----+--------------------+
118 | | prediction|label| features|
119 | +------------------+-----+--------------------+
120 | | 6.489667556875804| 7|[4.9,0.42,0.0,2.1...|
121 | | 6.267301910170284| 7|[5.1,0.42,0.0,1.8...|
122 | |6.0526786505470245| 7|[5.1,0.585,0.0,1....|
123 | | 5.257985010985523| 5|[5.2,0.32,0.25,1....|
124 | | 5.943264423589821| 7|[5.2,0.48,0.04,1....|
125 | +------------------+-----+--------------------+
126 | ```
127 |
128 | ### 8. Model Evaluation
129 | ```{python eval=FALSE}
130 | # Select (prediction, true label) and compute test error
131 | evaluator = RegressionEvaluator(
132 | labelCol="label", predictionCol="prediction", metricName="rmse")
133 | rmse = evaluator.evaluate(predictions)
134 | print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
135 | ```
136 |
137 | ```{python eval=FALSE}
138 | Root Mean Squared Error (RMSE) on test data = 0.659148
139 | ```
140 |
141 |
142 |
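143 | ### 9. Inspect feature importances
144 |
145 | The fitted model also exposes per-feature importances, which help interpret the forest (a quick check; exact values depend on the random split above):
146 |
147 | ```{python eval=FALSE}
148 | model.featureImportances
149 | ```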
--------------------------------------------------------------------------------
/legacy/randomforestC.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Random Forest Regression"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "February 19, 2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 | ### Remark:
12 |
13 | - You can download the complete [ipython notebook](./ipynb/RandomForest.ipynb) for this tutorial session.
14 |
15 | ### 1. Set up spark context and SparkSession
16 |
17 | ```{python eval=FALSE}
18 | from pyspark.sql import SparkSession
19 |
20 | spark = SparkSession \
21 | .builder \
22 | .appName("Python Spark Random Forest Regression") \
23 | .config("spark.some.config.option", "some-value") \
24 | .getOrCreate()
25 | ```
26 |
27 | ### 2. Load dataset
28 | ```{python eval=FALSE}
29 | df = spark.read.format('com.databricks.spark.csv').\
30 |     options(header='true', \
31 |     inferschema='true').load("./data/WineData.csv")
32 | ```
33 |
34 | ```{python eval=FALSE}
35 | df.printSchema()
36 | ```
37 | ```{python eval=FALSE}
38 | #output
39 | root
40 | |-- fixed acidity: double (nullable = true)
41 | |-- volatile acidity: double (nullable = true)
42 | |-- citric acid: double (nullable = true)
43 | |-- residual sugar: double (nullable = true)
44 | |-- chlorides: double (nullable = true)
45 | |-- free sulfur dioxide: double (nullable = true)
46 | |-- total sulfur dioxide: double (nullable = true)
47 | |-- density: double (nullable = true)
48 | |-- pH: double (nullable = true)
49 | |-- sulphates: double (nullable = true)
50 | |-- alcohol: double (nullable = true)
51 | |-- quality: integer (nullable = true)
52 | ```
53 |
54 |
55 | ### 3. Convert the data to dense vector
56 | ```{python eval=FALSE}
57 | from pyspark.sql import Row
58 | from pyspark.ml.linalg import Vectors
59 | ```
60 | ```{python eval=FALSE}
61 | def transData(data):
62 | return data.rdd.map(lambda r: [Vectors.dense(r[:-1]),r[-1]]).toDF(['features','label'])
63 | ```
64 |
65 | ```{python eval=FALSE}
66 | transformed= transData(df)
67 | transformed.show(6)
68 | ```
69 |
70 | ```{python eval=FALSE}
71 | #output
72 | +--------------------+-----+
73 | | features|label|
74 | +--------------------+-----+
75 | |[7.4,0.7,0.0,1.9,...| 5|
76 | |[7.8,0.88,0.0,2.6...| 5|
77 | |[7.8,0.76,0.04,2....| 5|
78 | |[11.2,0.28,0.56,1...| 6|
79 | |[7.4,0.7,0.0,1.9,...| 5|
80 | |[7.4,0.66,0.0,1.8...| 5|
81 | +--------------------+-----+
82 | only showing top 6 rows
83 | ```
84 |
85 | ```{python eval=FALSE}
86 | from pyspark.ml import Pipeline
87 | from pyspark.ml.regression import RandomForestRegressor
88 | from pyspark.ml.feature import VectorIndexer
89 | from pyspark.ml.evaluation import RegressionEvaluator
90 | ```
91 | ### 4. Split the data into training and test sets (30% held out for testing)
92 | ```{python eval=FALSE}
93 | # Split the data into training and test sets (30% held out for testing)
94 | (trainingData, testData) = transformed.randomSplit([0.7, 0.3])
95 | ```
96 | ### 5. Train a RandomForest model.
97 |
98 | ```{python eval=FALSE}
99 | # Train a RandomForest model.
100 | rf = RandomForestRegressor()
101 | model = rf.fit(trainingData)
102 | ```
103 | ### 6. Make predictions.
104 |
105 | ```{python eval=FALSE}
106 | # Make predictions.
107 | predictions = model.transform(testData)
108 | ```
109 | ### 7. Show results
110 | ```{python eval=FALSE}
111 | # Select example rows to display.
112 | predictions.select("prediction", "label", "features").show(5)
113 | ```
114 |
115 | ```{python eval=FALSE}
116 | #output
117 | +------------------+-----+--------------------+
118 | | prediction|label| features|
119 | +------------------+-----+--------------------+
120 | | 6.489667556875804| 7|[4.9,0.42,0.0,2.1...|
121 | | 6.267301910170284| 7|[5.1,0.42,0.0,1.8...|
122 | |6.0526786505470245| 7|[5.1,0.585,0.0,1....|
123 | | 5.257985010985523| 5|[5.2,0.32,0.25,1....|
124 | | 5.943264423589821| 7|[5.2,0.48,0.04,1....|
125 | +------------------+-----+--------------------+
126 | ```
127 |
128 | ### 8. Model Evaluation
129 | ```{python eval=FALSE}
130 | # Select (prediction, true label) and compute test error
131 | evaluator = RegressionEvaluator(
132 | labelCol="label", predictionCol="prediction", metricName="rmse")
133 | rmse = evaluator.evaluate(predictions)
134 | print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)
135 | ```
136 |
137 | ```{python eval=FALSE}
138 | Root Mean Squared Error (RMSE) on test data = 0.659148
139 | ```
140 |
141 |
142 |
--------------------------------------------------------------------------------
/legacy/regularization.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Regularization"
3 | author: "Ming Chen"
4 | date: "6/5/2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 |
12 | ## Regularization
13 |
14 | Regularization is a technique for dealing with the overfitting problem. An overfitted model predicts very well on the training data but performs poorly on independent validation data.
15 |
16 | When we add more predictors to our model, we will almost necessarily decrease the **Residual Sum of Squares** (RSS; a smaller RSS indicates a better fit). This increases the complexity of our model and makes it perform well only on the training data (overfitting).
17 |
18 | To balance the RSS against overfitting, we introduce a penalty for adding new predictors (coefficients $\beta_j \neq 0$) to the model.
19 |
20 | ## LASSO regularization and Ridge regularization
21 |
22 | * **LASSO**: $min \{RSS + \lambda\sum_{j=1}^{p}|\beta_j|\}$
23 | * **Ridge**: $min \{RSS + \lambda\sum_{j=1}^{p}\beta^2_j\}$
24 |
25 | ## Elastic Net regularization
26 |
27 | Elastic net is a regularized regression method that linearly combines the penalties of the lasso and ridge methods.
28 |
29 | * **elastic net**: $min \{RSS + \lambda_1\sum_{j=1}^{p}|\beta_j| + \lambda_2\sum_{j=1}^{p}\beta^2_j\}$
30 |
31 | ## *regParam* and *elasticNetParam* parameters in regression models
32 |
33 | * **regParam**: the regularization strength $\lambda$
34 | * **elasticNetParam**: the ElasticNet mixing parameter $\alpha \in [0, 1]$; $\alpha = 1$ gives the lasso penalty, $\alpha = 0$ the ridge penalty
35 | * **Scenarios** (see the sketch below):
36 | + *regParam* = $0$: no regularization applied ($\lambda = 0$)
37 | + *regParam* $\neq 0$, *elasticNetParam* = $1$: lasso regularization applied
38 | + *regParam* $\neq 0$, *elasticNetParam* = $0$: ridge regularization applied
39 | + *regParam* $\neq 0$, $0 <$ *elasticNetParam* $< 1$: elastic net regularization applied
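40 |
41 | A minimal sketch of these scenarios (hypothetical parameter values):
42 |
43 | ```{python}
44 | from pyspark.ml.regression import LinearRegression
45 |
46 | ols = LinearRegression(regParam=0.0)                        # no regularization
47 | lasso = LinearRegression(regParam=0.5, elasticNetParam=1.0) # lasso
48 | ridge = LinearRegression(regParam=0.5, elasticNetParam=0.0) # ridge
49 | enet = LinearRegression(regParam=0.5, elasticNetParam=0.5)  # elastic net
50 | ```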
--------------------------------------------------------------------------------
/legacy/sna.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Social Network Analysis"
3 | author: "Wenqiang Feng"
4 | date: "4/7/2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
12 | ## R Markdown
13 |
14 | This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
15 |
16 | When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
17 |
18 | ```{r cars}
19 | summary(cars)
20 | ```
21 |
22 | ## Including Plots
23 |
24 | You can also embed plots, for example:
25 |
26 | ```{r pressure, echo=FALSE}
27 | plot(pressure)
28 | ```
29 |
30 | Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.
31 |
--------------------------------------------------------------------------------
/legacy/spark-on-jetstream-cloud.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Spark on Jetstream Cloud"
3 | author: "Wenqiang Feng & Ming Chen"
4 | date: "3/8/2017"
5 | output: html_document
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE, eval = FALSE)
10 | ```
11 |
12 |
13 | ## Set up apache spark on jetstream
14 |
15 | * Install linuxbrew and spark
16 |
17 | ```{bash}
18 | sudo apt-get install -y ruby
19 | ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Linuxbrew/install/master/install)"
20 |
21 | echo 'export PATH="/home/mchen33/.linuxbrew/bin:$PATH"' >>~/.bash_profile
22 | echo 'export MANPATH="/home/mchen33/.linuxbrew/share/man:$MANPATH"' >>~/.bash_profile
23 | echo 'export INFOPATH="/home/mchen33/.linuxbrew/share/info:$INFOPATH"' >>~/.bash_profile
24 |
25 | source ~/.bash_profile
26 |
27 | sudo apt-get install build-essential
28 |
29 | brew install apache-spark
30 |
31 | ## install java
32 | sudo apt-get install -y default-jre
33 | ```
34 |
35 |
36 | ```{bash}
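37 | # bind Spark to the loopback interface (avoids hostname-resolution issues on cloud instances)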
37 | export SPARK_LOCAL_IP="127.0.0.1"
38 | ```
39 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/link-spark-with-jupyter.md:
--------------------------------------------------------------------------------
1 | ## Install jupyter with conda
2 |
3 | ```
4 | conda install jupyter
5 | ```
6 |
7 | ## Get the `jupyter` binary executable path
8 |
9 | ```
10 | which jupyter
11 | ```
12 |
13 | output
14 |
15 | ```
16 | /Users/mingchen/anaconda2/bin/jupyter
17 | ```
18 |
19 | ## Link spark with jupyter
20 |
21 | ```
22 | export PYSPARK_DRIVER_PYTHON=/Users/mingchen/anaconda2/bin/jupyter
23 | export PYSPARK_DRIVER_PYTHON_OPTS="notebook --NotebookApp.open_browser=False --NotebookApp.ip='*' --NotebookApp.port=8880"
24 | ```
25 |
26 | You can also add the two environment variables to the `~/.bash_profile` file to permanently link Spark with Jupyter.
27 |
28 | ## Run jupyter notebook
29 |
30 | ```
31 | pyspark
32 | ```
33 |
34 | Then go to [http://127.0.0.1:8880](http://127.0.0.1:8880)
35 |
36 |
--------------------------------------------------------------------------------
/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/logo.jpg
--------------------------------------------------------------------------------
/notebooks/01-data-strcture/.gitignore:
--------------------------------------------------------------------------------
1 | spark-warehouse
2 |
--------------------------------------------------------------------------------
/notebooks/02-data-manipulation/.ipynb_checkpoints/2.3-continuous-variable-to-categorical-variable-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# create entry points to spark\n",
10 | "try:\n",
11 | " sc.stop()\n",
12 | "except:\n",
13 | " pass\n",
14 | "from pyspark import SparkContext, SparkConf\n",
15 | "from pyspark.sql import SparkSession\n",
16 | "sc=SparkContext()\n",
17 | "spark = SparkSession(sparkContext=sc)"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "# Convert continuous variables to categorical variables"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "There are two functions we can use to split a continuous variable into categories:\n",
32 | "\n",
33 | "* `pyspark.ml.feature.Binarizer`: split a column of continuous features given a threshold\n",
34 | "* `pyspark.ml.feature.Bucktizer`: split a column of continuous features into categories given several breaking points.\n",
35 | " + with n+1 split points, there are n categories (buckets).\n"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "## Create some data"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "name": "stdout",
52 | "output_type": "stream",
53 | "text": [
54 | "+--------------------+------------------+\n",
55 | "| x1| x2|\n",
56 | "+--------------------+------------------+\n",
57 | "| 0.47143516373249306| 6.834629351721363|\n",
58 | "| -1.1909756947064645| 7.127020269829002|\n",
59 | "| 1.4327069684260973|3.7025075479039495|\n",
60 | "| -0.3126518960917129| 5.611961860656249|\n",
61 | "| -0.7205887333650116| 5.030831653078097|\n",
62 | "| 0.8871629403077386|0.1376844959068224|\n",
63 | "| 0.8595884137174165| 7.728266216123741|\n",
64 | "| -0.6365235044173491| 8.826411906361166|\n",
65 | "|0.015696372114428918| 3.648859839013723|\n",
66 | "| -2.2426849541854055| 6.153961784334937|\n",
67 | "+--------------------+------------------+\n",
68 | "\n"
69 | ]
70 | }
71 | ],
72 | "source": [
73 | "import numpy as np\n",
74 | "import pandas as pd\n",
75 | "np.random.seed(seed=1234)\n",
76 | "pdf = pd.DataFrame({\n",
77 | " 'x1': np.random.randn(10),\n",
78 | " 'x2': np.random.rand(10)*10\n",
79 | " })\n",
80 | "np.random.seed(seed=None)\n",
81 | "df = spark.createDataFrame(pdf)\n",
82 | "df.show()"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "## Binarize the column x1 and Bucketize the column x2"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 3,
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "+--------------------+------------------+------+------+\n",
102 | "| x1| x2|x1_new|x2_new|\n",
103 | "+--------------------+------------------+------+------+\n",
104 | "| 0.47143516373249306| 6.834629351721363| 1.0| 2.0|\n",
105 | "| -1.1909756947064645| 7.127020269829002| 0.0| 2.0|\n",
106 | "| 1.4327069684260973|3.7025075479039495| 1.0| 1.0|\n",
107 | "| -0.3126518960917129| 5.611961860656249| 0.0| 2.0|\n",
108 | "| -0.7205887333650116| 5.030831653078097| 0.0| 2.0|\n",
109 | "| 0.8871629403077386|0.1376844959068224| 1.0| 0.0|\n",
110 | "| 0.8595884137174165| 7.728266216123741| 1.0| 3.0|\n",
111 | "| -0.6365235044173491| 8.826411906361166| 0.0| 3.0|\n",
112 | "|0.015696372114428918| 3.648859839013723| 1.0| 1.0|\n",
113 | "| -2.2426849541854055| 6.153961784334937| 0.0| 2.0|\n",
114 | "+--------------------+------------------+------+------+\n",
115 | "\n"
116 | ]
117 | }
118 | ],
119 | "source": [
120 | "from pyspark.ml.feature import Binarizer, Bucketizer\n",
121 | "# threshold = 0 for binarizer\n",
122 | "binarizer = Binarizer(threshold=0, inputCol='x1', outputCol='x1_new')\n",
123 | "# provide 5 split points to generate 4 buckets\n",
124 | "bucketizer = Bucketizer(splits=[0, 2.5, 5, 7.5, 10], inputCol='x2', outputCol='x2_new')\n",
125 | "\n",
126 | "# pipeline stages\n",
127 | "from pyspark.ml import Pipeline\n",
128 | "stages = [binarizer, bucketizer]\n",
129 | "pipeline = Pipeline(stages=stages)\n",
130 | "\n",
131 | "# fit the pipeline model and transform the data\n",
132 | "pipeline.fit(df).transform(df).show()"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {
139 | "collapsed": true
140 | },
141 | "outputs": [],
142 | "source": []
143 | }
144 | ],
145 | "metadata": {
146 | "kernelspec": {
147 | "display_name": "Python 3",
148 | "language": "python",
149 | "name": "python3"
150 | },
151 | "language_info": {
152 | "codemirror_mode": {
153 | "name": "ipython",
154 | "version": 3
155 | },
156 | "file_extension": ".py",
157 | "mimetype": "text/x-python",
158 | "name": "python",
159 | "nbconvert_exporter": "python",
160 | "pygments_lexer": "ipython3",
161 | "version": "3.6.5"
162 | }
163 | },
164 | "nbformat": 4,
165 | "nbformat_minor": 2
166 | }
167 |
--------------------------------------------------------------------------------
/notebooks/02-data-manipulation/2.3-continuous-variable-to-categorical-variable.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# create entry points to spark\n",
10 | "try:\n",
11 | " sc.stop()\n",
12 | "except:\n",
13 | " pass\n",
14 | "from pyspark import SparkContext, SparkConf\n",
15 | "from pyspark.sql import SparkSession\n",
16 | "sc=SparkContext()\n",
17 | "spark = SparkSession(sparkContext=sc)"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "# Convert continuous variables to categorical variables"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "There are two functions we can use to split a continuous variable into categories:\n",
32 | "\n",
33 | "* `pyspark.ml.feature.Binarizer`: split a column of continuous features given a threshold\n",
34 | "* `pyspark.ml.feature.Bucktizer`: split a column of continuous features into categories given several breaking points.\n",
35 | " + with n+1 split points, there are n categories (buckets).\n"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "## Create some data"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "metadata": {},
49 | "outputs": [
50 | {
51 | "name": "stdout",
52 | "output_type": "stream",
53 | "text": [
54 | "+--------------------+------------------+\n",
55 | "| x1| x2|\n",
56 | "+--------------------+------------------+\n",
57 | "| 0.47143516373249306| 6.834629351721363|\n",
58 | "| -1.1909756947064645| 7.127020269829002|\n",
59 | "| 1.4327069684260973|3.7025075479039495|\n",
60 | "| -0.3126518960917129| 5.611961860656249|\n",
61 | "| -0.7205887333650116| 5.030831653078097|\n",
62 | "| 0.8871629403077386|0.1376844959068224|\n",
63 | "| 0.8595884137174165| 7.728266216123741|\n",
64 | "| -0.6365235044173491| 8.826411906361166|\n",
65 | "|0.015696372114428918| 3.648859839013723|\n",
66 | "| -2.2426849541854055| 6.153961784334937|\n",
67 | "+--------------------+------------------+\n",
68 | "\n"
69 | ]
70 | }
71 | ],
72 | "source": [
73 | "import numpy as np\n",
74 | "import pandas as pd\n",
75 | "np.random.seed(seed=1234)\n",
76 | "pdf = pd.DataFrame({\n",
77 | " 'x1': np.random.randn(10),\n",
78 | " 'x2': np.random.rand(10)*10\n",
79 | " })\n",
80 | "np.random.seed(seed=None)\n",
81 | "df = spark.createDataFrame(pdf)\n",
82 | "df.show()"
83 | ]
84 | },
85 | {
86 | "cell_type": "markdown",
87 | "metadata": {},
88 | "source": [
89 | "## Binarize the column x1 and Bucketize the column x2"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 3,
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "+--------------------+------------------+------+------+\n",
102 | "| x1| x2|x1_new|x2_new|\n",
103 | "+--------------------+------------------+------+------+\n",
104 | "| 0.47143516373249306| 6.834629351721363| 1.0| 2.0|\n",
105 | "| -1.1909756947064645| 7.127020269829002| 0.0| 2.0|\n",
106 | "| 1.4327069684260973|3.7025075479039495| 1.0| 1.0|\n",
107 | "| -0.3126518960917129| 5.611961860656249| 0.0| 2.0|\n",
108 | "| -0.7205887333650116| 5.030831653078097| 0.0| 2.0|\n",
109 | "| 0.8871629403077386|0.1376844959068224| 1.0| 0.0|\n",
110 | "| 0.8595884137174165| 7.728266216123741| 1.0| 3.0|\n",
111 | "| -0.6365235044173491| 8.826411906361166| 0.0| 3.0|\n",
112 | "|0.015696372114428918| 3.648859839013723| 1.0| 1.0|\n",
113 | "| -2.2426849541854055| 6.153961784334937| 0.0| 2.0|\n",
114 | "+--------------------+------------------+------+------+\n",
115 | "\n"
116 | ]
117 | }
118 | ],
119 | "source": [
120 | "from pyspark.ml.feature import Binarizer, Bucketizer\n",
121 | "# threshold = 0 for binarizer\n",
122 | "binarizer = Binarizer(threshold=0, inputCol='x1', outputCol='x1_new')\n",
123 | "# provide 5 split points to generate 4 buckets\n",
124 | "bucketizer = Bucketizer(splits=[0, 2.5, 5, 7.5, 10], inputCol='x2', outputCol='x2_new')\n",
125 | "\n",
126 | "# pipeline stages\n",
127 | "from pyspark.ml import Pipeline\n",
128 | "stages = [binarizer, bucketizer]\n",
129 | "pipeline = Pipeline(stages=stages)\n",
130 | "\n",
131 | "# fit the pipeline model and transform the data\n",
132 | "pipeline.fit(df).transform(df).show()"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {
139 | "collapsed": true
140 | },
141 | "outputs": [],
142 | "source": []
143 | }
144 | ],
145 | "metadata": {
146 | "kernelspec": {
147 | "display_name": "Python 3",
148 | "language": "python",
149 | "name": "python3"
150 | },
151 | "language_info": {
152 | "codemirror_mode": {
153 | "name": "ipython",
154 | "version": 3
155 | },
156 | "file_extension": ".py",
157 | "mimetype": "text/x-python",
158 | "name": "python",
159 | "nbconvert_exporter": "python",
160 | "pygments_lexer": "ipython3",
161 | "version": "3.6.5"
162 | }
163 | },
164 | "nbformat": 4,
165 | "nbformat_minor": 2
166 | }
167 |
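One detail worth noting about `Bucketizer`: values that fall outside the given splits raise an error at transform time. A minimal sketch, assuming the `df` DataFrame created above, that uses infinite outer split points so every value lands in a bucket:

```python
from pyspark.ml.feature import Bucketizer

# open-ended outer splits: every value falls into one of the 3 buckets
bucketizer = Bucketizer(splits=[-float('inf'), 0, 5, float('inf')],
                        inputCol='x2', outputCol='x2_bucket')
# Bucketizer is a pure transformer, so no fit() step is needed
bucketizer.transform(df).show(5)
```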
--------------------------------------------------------------------------------
/notebooks/02-data-manipulation/2.7.1-column-expression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "# create entry points to spark\n",
10 | "try:\n",
11 | " sc.stop()\n",
12 | "except:\n",
13 | " pass\n",
14 | "from pyspark import SparkContext, SparkConf\n",
15 | "from pyspark.sql import SparkSession\n",
16 | "sc=SparkContext()\n",
17 | "spark = SparkSession(sparkContext=sc)"
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "## Column expression\n",
25 | "\n",
26 | "A Spark **column instance** is **NOT a column of values** from the **DataFrame**: when you crate a column instance, it does not give you the actual values of that column in the DataFrame. I found it makes more sense to me if I consider a **column instance as a column of expressions**. These expressions are evaluated by other methods (e.g., the **select()**, **groupby()**, and **orderby()** from **pyspark.sql.DataFrame**)"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "## Example data"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 3,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "name": "stdout",
43 | "output_type": "stream",
44 | "text": [
45 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
46 | "| model| mpg|cyl| disp| hp|drat| wt| qsec| vs| am|gear|carb|\n",
47 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
48 | "| Mazda RX4|21.0| 6|160.0|110| 3.9| 2.62|16.46| 0| 1| 4| 4|\n",
49 | "| Mazda RX4 Wag|21.0| 6|160.0|110| 3.9|2.875|17.02| 0| 1| 4| 4|\n",
50 | "| Datsun 710|22.8| 4|108.0| 93|3.85| 2.32|18.61| 1| 1| 4| 1|\n",
51 | "| Hornet 4 Drive|21.4| 6|258.0|110|3.08|3.215|19.44| 1| 0| 3| 1|\n",
52 | "|Hornet Sportabout|18.7| 8|360.0|175|3.15| 3.44|17.02| 0| 0| 3| 2|\n",
53 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
54 | "only showing top 5 rows\n",
55 | "\n"
56 | ]
57 | }
58 | ],
59 | "source": [
60 | "mtcars = spark.read.csv('../../data/mtcars.csv', inferSchema=True, header=True)\n",
61 | "mtcars = mtcars.withColumnRenamed('_c0', 'model')\n",
62 | "mtcars.show(5)"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "## Use dot (.) to select column from DataFrame"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 4,
75 | "metadata": {},
76 | "outputs": [
77 | {
78 | "data": {
79 | "text/plain": [
80 | "Column"
81 | ]
82 | },
83 | "execution_count": 4,
84 | "metadata": {},
85 | "output_type": "execute_result"
86 | }
87 | ],
88 | "source": [
89 | "mpg_col = mtcars.mpg\n",
90 | "mpg_col"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "## Modify a column to generate a new column"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 5,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "data": {
107 | "text/plain": [
108 | "Column"
109 | ]
110 | },
111 | "execution_count": 5,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "mpg_col + 1"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 6,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "name": "stdout",
127 | "output_type": "stream",
128 | "text": [
129 | "+-----------+\n",
130 | "|(mpg * 100)|\n",
131 | "+-----------+\n",
132 | "| 2100.0|\n",
133 | "| 2100.0|\n",
134 | "| 2280.0|\n",
135 | "| 2140.0|\n",
136 | "| 1870.0|\n",
137 | "+-----------+\n",
138 | "only showing top 5 rows\n",
139 | "\n"
140 | ]
141 | }
142 | ],
143 | "source": [
144 | "mtcars.select(mpg_col * 100).show(5)"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "The **pyspark.sql.Column** has many methods that acts on a column and returns a column instance."
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 7,
157 | "metadata": {},
158 | "outputs": [
159 | {
160 | "name": "stdout",
161 | "output_type": "stream",
162 | "text": [
163 | "+----------------+\n",
164 | "|(gear IN (2, 3))|\n",
165 | "+----------------+\n",
166 | "| false|\n",
167 | "| false|\n",
168 | "| false|\n",
169 | "| true|\n",
170 | "| true|\n",
171 | "+----------------+\n",
172 | "only showing top 5 rows\n",
173 | "\n"
174 | ]
175 | }
176 | ],
177 | "source": [
178 | "mtcars.select(mtcars.gear.isin([2,3])).show(5)"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 8,
184 | "metadata": {},
185 | "outputs": [
186 | {
187 | "data": {
188 | "text/plain": [
189 | "Column"
190 | ]
191 | },
192 | "execution_count": 8,
193 | "metadata": {},
194 | "output_type": "execute_result"
195 | }
196 | ],
197 | "source": [
198 | "mtcars.mpg.asc()"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": null,
204 | "metadata": {
205 | "collapsed": true
206 | },
207 | "outputs": [],
208 | "source": []
209 | }
210 | ],
211 | "metadata": {
212 | "kernelspec": {
213 | "display_name": "Python 3",
214 | "language": "python",
215 | "name": "python3"
216 | },
217 | "language_info": {
218 | "codemirror_mode": {
219 | "name": "ipython",
220 | "version": 3
221 | },
222 | "file_extension": ".py",
223 | "mimetype": "text/x-python",
224 | "name": "python",
225 | "nbconvert_exporter": "python",
226 | "pygments_lexer": "ipython3",
227 | "version": "3.6.5"
228 | }
229 | },
230 | "nbformat": 4,
231 | "nbformat_minor": 2
232 | }
233 |
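The `asc()` call above only builds an expression; it is evaluated when handed to a DataFrame method. A short sketch, assuming the `mtcars` DataFrame loaded above:

```python
# the sort expression is evaluated by orderBy()
mtcars.orderBy(mtcars.mpg.asc()).select('model', 'mpg').show(5)

# a boolean column expression is evaluated by filter()
mtcars.filter(mtcars.cyl == 6).select('model', 'cyl').show(5)
```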
--------------------------------------------------------------------------------
/notebooks/02-data-manipulation/2.7.2-dot-column-expression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "# create entry points to spark\n",
12 | "try:\n",
13 | " sc.stop()\n",
14 | "except:\n",
15 | " pass\n",
16 | "from pyspark import SparkContext, SparkConf\n",
17 | "from pyspark.sql import SparkSession\n",
18 | "sc=SparkContext()\n",
19 | "spark = SparkSession(sparkContext=sc)"
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {},
25 | "source": [
26 | "## Example data"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {},
33 | "outputs": [
34 | {
35 | "name": "stdout",
36 | "output_type": "stream",
37 | "text": [
38 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
39 | "| model| mpg|cyl| disp| hp|drat| wt| qsec| vs| am|gear|carb|\n",
40 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
41 | "| Mazda RX4|21.0| 6|160.0|110| 3.9| 2.62|16.46| 0| 1| 4| 4|\n",
42 | "| Mazda RX4 Wag|21.0| 6|160.0|110| 3.9|2.875|17.02| 0| 1| 4| 4|\n",
43 | "| Datsun 710|22.8| 4|108.0| 93|3.85| 2.32|18.61| 1| 1| 4| 1|\n",
44 | "| Hornet 4 Drive|21.4| 6|258.0|110|3.08|3.215|19.44| 1| 0| 3| 1|\n",
45 | "|Hornet Sportabout|18.7| 8|360.0|175|3.15| 3.44|17.02| 0| 0| 3| 2|\n",
46 | "+-----------------+----+---+-----+---+----+-----+-----+---+---+----+----+\n",
47 | "only showing top 5 rows\n",
48 | "\n"
49 | ]
50 | }
51 | ],
52 | "source": [
53 | "mtcars = spark.read.csv('../../../data/mtcars.csv', inferSchema=True, header=True)\n",
54 | "mtcars = mtcars.withColumnRenamed('_c0', 'model')\n",
55 | "mtcars.show(5)"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "## Dot (.) column expression\n",
63 | "\n",
64 | "Create a column expression that will return the original column values."
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 3,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "data": {
74 | "text/plain": [
75 | "Column"
76 | ]
77 | },
78 | "execution_count": 3,
79 | "metadata": {},
80 | "output_type": "execute_result"
81 | }
82 | ],
83 | "source": [
84 | "mpg_col_exp = mtcars.mpg\n",
85 | "mpg_col_exp"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 5,
91 | "metadata": {},
92 | "outputs": [
93 | {
94 | "name": "stdout",
95 | "output_type": "stream",
96 | "text": [
97 | "+----+\n",
98 | "| mpg|\n",
99 | "+----+\n",
100 | "|21.0|\n",
101 | "|21.0|\n",
102 | "|22.8|\n",
103 | "|21.4|\n",
104 | "|18.7|\n",
105 | "+----+\n",
106 | "only showing top 5 rows\n",
107 | "\n"
108 | ]
109 | }
110 | ],
111 | "source": [
112 | "mtcars.select(mpg_col_exp).show(5)"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {
119 | "collapsed": true
120 | },
121 | "outputs": [],
122 | "source": []
123 | }
124 | ],
125 | "metadata": {
126 | "kernelspec": {
127 | "display_name": "Python 3",
128 | "language": "python",
129 | "name": "python3"
130 | },
131 | "language_info": {
132 | "codemirror_mode": {
133 | "name": "ipython",
134 | "version": 3
135 | },
136 | "file_extension": ".py",
137 | "mimetype": "text/x-python",
138 | "name": "python",
139 | "nbconvert_exporter": "python",
140 | "pygments_lexer": "ipython3",
141 | "version": "3.5.0"
142 | }
143 | },
144 | "nbformat": 4,
145 | "nbformat_minor": 2
146 | }
147 |
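The dot expression is one of several equivalent ways to refer to a column. A brief sketch of two alternatives, which are safer when a column name clashes with a DataFrame attribute (e.g., `count`) or contains special characters; this assumes the `mtcars` DataFrame above:

```python
from pyspark.sql.functions import col

mtcars.select(mtcars['mpg']).show(5)  # bracket expression
mtcars.select(col('mpg')).show(5)     # col() expression
```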
--------------------------------------------------------------------------------
/notebooks/04-miscellaneous/add-python-files-to-spark-cluster.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "The `SparkContext.addPyFiles()` function can be used to add py files. We can define objects and variables in these files and make them available to the Spark cluster."
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "# Create a SparkContext object"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "from pyspark import SparkConf, SparkContext, SparkFiles\n",
26 | "from pyspark.sql import SparkSession"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {
33 | "collapsed": true
34 | },
35 | "outputs": [],
36 | "source": [
37 | "sc = SparkContext(conf=SparkConf())"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "# Add py files"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {
51 | "collapsed": true
52 | },
53 | "outputs": [],
54 | "source": [
55 | "sc.addPyFile('pyFiles/my_module.py')"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "metadata": {},
62 | "outputs": [
63 | {
64 | "data": {
65 | "text/plain": [
66 | "'/private/var/folders/2_/kb60z5_j0k91tyh740s1zhn40000gn/T/spark-4f959e9f-4af6-490e-afce-02e1582aae8d/userFiles-8b1c073b-4c82-467a-b9ff-021aa3067abe/my_module.py'"
67 | ]
68 | },
69 | "execution_count": 4,
70 | "metadata": {},
71 | "output_type": "execute_result"
72 | }
73 | ],
74 | "source": [
75 | "SparkFiles.get('my_module.py')"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "# Use **my_module.py**\n",
83 | "We can import `my_module` as a python module"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 5,
89 | "metadata": {
90 | "collapsed": true
91 | },
92 | "outputs": [],
93 | "source": [
94 | "from my_module import *"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 6,
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "data": {
104 | "text/plain": [
105 | "True"
106 | ]
107 | },
108 | "execution_count": 6,
109 | "metadata": {},
110 | "output_type": "execute_result"
111 | }
112 | ],
113 | "source": [
114 | "addPyFiles_is_successfull()"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 7,
120 | "metadata": {},
121 | "outputs": [
122 | {
123 | "data": {
124 | "text/plain": [
125 | "9"
126 | ]
127 | },
128 | "execution_count": 7,
129 | "metadata": {},
130 | "output_type": "execute_result"
131 | }
132 | ],
133 | "source": [
134 | "sum_two_variables(4,5)"
135 | ]
136 | }
137 | ],
138 | "metadata": {
139 | "kernelspec": {
140 | "display_name": "Python 3",
141 | "language": "python",
142 | "name": "python3"
143 | },
144 | "language_info": {
145 | "codemirror_mode": {
146 | "name": "ipython",
147 | "version": 3
148 | },
149 | "file_extension": ".py",
150 | "mimetype": "text/x-python",
151 | "name": "python",
152 | "nbconvert_exporter": "python",
153 | "pygments_lexer": "ipython3",
154 | "version": "3.5.0"
155 | }
156 | },
157 | "nbformat": 4,
158 | "nbformat_minor": 2
159 | }
160 |
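Since `addPyFile()` ships the module to every executor, the imported functions can also be used inside distributed transformations, not just on the driver. A minimal sketch using `sum_two_variables` from `my_module.py`:

```python
# the shipped module is importable on the executors, so the
# function can be called inside RDD transformations
rdd = sc.parallelize([1, 2, 3, 4])
rdd.map(lambda x: sum_two_variables(x, 10)).collect()  # -> [11, 12, 13, 14]
```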
--------------------------------------------------------------------------------
/notebooks/04-miscellaneous/dense-vs-sparse-vectors.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from pyspark import SparkConf, SparkContext\n",
12 | "from pyspark.sql import SparkSession"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 2,
18 | "metadata": {
19 | "collapsed": true
20 | },
21 | "outputs": [],
22 | "source": [
23 | "sc = SparkContext(conf=SparkConf())\n",
24 | "spark = SparkSession(sparkContext=sc)"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 3,
30 | "metadata": {
31 | "collapsed": true
32 | },
33 | "outputs": [],
34 | "source": [
35 | "from pyspark.ml.linalg import Vector, DenseVector, SparseVector"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "metadata": {},
41 | "source": [
42 | "# Dense vector and sparse vector"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "A vector can be represented in dense and sparse formats. A dense vector is a regular vector that has each elements printed. A sparse vector use three components to represent a vector but with less memory."
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 22,
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "data": {
59 | "text/plain": [
60 | "DenseVector([1.0, 0.0, 0.0, 0.0, 4.5, 0.0])"
61 | ]
62 | },
63 | "execution_count": 22,
64 | "metadata": {},
65 | "output_type": "execute_result"
66 | }
67 | ],
68 | "source": [
69 | "dv = DenseVector([1.0,0.,0.,0.,4.5,0])\n",
70 | "dv"
71 | ]
72 | },
73 | {
74 | "cell_type": "markdown",
75 | "metadata": {},
76 | "source": [
77 | "## Three components of a sparse vector\n",
78 | "\n",
79 | "* vector size\n",
80 | "* indices of active elements\n",
81 | "* values of active elements\n",
82 | "\n",
83 | "In the above dense vector:\n",
84 | "\n",
85 | "* vector size = 6\n",
86 | "* indices of active elements = [0, 4]\n",
87 | "* values of active elements = [1.0, 4.5]"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "We can use the `SparseVector()` function to create a sparse vector. The first argument is the vector size, the second\n",
95 | "argument is a dictionary. The keys are indices of active elements and the values are values of active elements."
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 23,
101 | "metadata": {},
102 | "outputs": [
103 | {
104 | "data": {
105 | "text/plain": [
106 | "SparseVector(6, {0: 1.0, 4: 4.5})"
107 | ]
108 | },
109 | "execution_count": 23,
110 | "metadata": {},
111 | "output_type": "execute_result"
112 | }
113 | ],
114 | "source": [
115 | "sv = SparseVector(6, {0:1.0, 4:4.5})\n",
116 | "sv"
117 | ]
118 | },
119 | {
120 | "cell_type": "markdown",
121 | "metadata": {},
122 | "source": [
123 | "## Convert sparse vector to dense vector"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 30,
129 | "metadata": {},
130 | "outputs": [
131 | {
132 | "data": {
133 | "text/plain": [
134 | "DenseVector([1.0, 0.0, 0.0, 0.0, 4.5, 0.0])"
135 | ]
136 | },
137 | "execution_count": 30,
138 | "metadata": {},
139 | "output_type": "execute_result"
140 | }
141 | ],
142 | "source": [
143 | "DenseVector(sv.toArray())"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "## Convert dense vector to sparse vector"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 33,
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "data": {
160 | "text/plain": [
161 | "{0: 1.0, 4: 4.5}"
162 | ]
163 | },
164 | "execution_count": 33,
165 | "metadata": {},
166 | "output_type": "execute_result"
167 | }
168 | ],
169 | "source": [
170 | "active_elements_dict = {index: value for index, value in enumerate(dv) if value != 0}\n",
171 | "active_elements_dict"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 34,
177 | "metadata": {},
178 | "outputs": [
179 | {
180 | "data": {
181 | "text/plain": [
182 | "SparseVector(6, {0: 1.0, 4: 4.5})"
183 | ]
184 | },
185 | "execution_count": 34,
186 | "metadata": {},
187 | "output_type": "execute_result"
188 | }
189 | ],
190 | "source": [
191 | "SparseVector(len(dv), active_elements_dict)"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": null,
197 | "metadata": {
198 | "collapsed": true
199 | },
200 | "outputs": [],
201 | "source": []
202 | }
203 | ],
204 | "metadata": {
205 | "kernelspec": {
206 | "display_name": "Python 3",
207 | "language": "python",
208 | "name": "python3"
209 | },
210 | "language_info": {
211 | "codemirror_mode": {
212 | "name": "ipython",
213 | "version": 3
214 | },
215 | "file_extension": ".py",
216 | "mimetype": "text/x-python",
217 | "name": "python",
218 | "nbconvert_exporter": "python",
219 | "pygments_lexer": "ipython3",
220 | "version": "3.5.0"
221 | }
222 | },
223 | "nbformat": 4,
224 | "nbformat_minor": 2
225 | }
226 |
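The three components can also be read directly off a `SparseVector` instance, which gives a more direct dense-to-sparse conversion than the dictionary comprehension above. A small sketch, assuming the `dv` and `sv` vectors created above:

```python
import numpy as np
from pyspark.ml.linalg import SparseVector

# the three components are attributes of the sparse vector
sv.size     # -> 6
sv.indices  # -> array([0, 4], dtype=int32)
sv.values   # -> array([1.0, 4.5])

# dense-to-sparse via numpy: nonzero positions become the indices
arr = dv.toArray()
nz = np.nonzero(arr)[0]
SparseVector(len(arr), nz, arr[nz])  # -> SparseVector(6, {0: 1.0, 4: 4.5})
```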
--------------------------------------------------------------------------------
/notebooks/04-miscellaneous/issues-and-solutions.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## Issues and Solutions"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "**load pyspark environment permission denied**\n",
15 | "\n",
16 | "This issue might be caused by a recently Mac OS updating to Sierra 10.12.5."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "```\n",
24 | "\n",
25 | "```"
26 | ]
27 | }
28 | ],
29 | "metadata": {
30 | "kernelspec": {
31 | "display_name": "Python 3",
32 | "language": "python",
33 | "name": "python3"
34 | },
35 | "language_info": {
36 | "codemirror_mode": {
37 | "name": "ipython",
38 | "version": 3
39 | },
40 | "file_extension": ".py",
41 | "mimetype": "text/x-python",
42 | "name": "python",
43 | "nbconvert_exporter": "python",
44 | "pygments_lexer": "ipython3",
45 | "version": "3.6.1"
46 | }
47 | },
48 | "nbformat": 4,
49 | "nbformat_minor": 2
50 | }
51 |
--------------------------------------------------------------------------------
/notebooks/05-module-turning/cross-validation.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Cross-validation\n",
8 | "---"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "## Training/validation/test data sets\n",
16 | "\n",
17 | "* **Training set**: the data set for training your models.\n",
18 | "* **Validation set**: The data set used for testing the performance of your models you have built with training sets. Based on the performance, you choose the best model (final).\n",
19 | "* **Test set**: use this data set to test the performance of your final model.\n",
20 | "\n",
21 | "## K-folds cross validation steps (k=4 as an example).\n",
22 | "\n",
23 | "* step 1: split your data into training set and test set (for example 80% training and 20% test). Test set will never be used in model training and selection. \n",
24 | "* step 2: split training set into k (k=4) eqaul subsets: 3 subsets for traing + 1 subset for validation.\n",
25 | "* step 3: training your models with the 3 subsets and calculate a performance score with the remaining 1 subset.\n",
26 | "* step 4: choose a different subset for validation and then repeat step 3 until every subset has been used as a validation subset.\n",
27 | "* step 5: for a k=4 fold cross validation, each trained model should have been validated by 4 subsets and therefore has 4 performance scores. Calculate the average of these 4 perfermance scores for each model. Use the average score to select the best, final model.\n",
28 | "* step 6: apply your final model to the **untouched** test data and see how it performs.\n",
29 | "\n",
30 | "## Example of k-folds cross validation\n",
31 | "\n",
32 | "### Build parameter grids\n",
33 | "\n",
34 | "* parameter grid: a combination of all variable parameters in your model.\n",
35 | "* example: If I want to train a logistic regression model on 4 different *regParam* and 3 different *elasticNetParam*, I will have 3 x 4 = 12 models to train and validate.\n",
36 | " "
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": null,
42 | "metadata": {
43 | "collapsed": true
44 | },
45 | "outputs": [],
46 | "source": [
47 | "from pyspark.ml.classification import LogisticRegression\n",
48 | "blor = LogisticRegression(featuresCol='indexed_features', labelCol='label', family='binomial')\n",
49 | "\n",
50 | "from pyspark.ml.tuning import ParamGridBuilder\n",
51 | "param_grid = ParamGridBuilder().\\\n",
52 | " addGrid(blor.regParam, [0, 0.5, 1, 2]).\\\n",
53 | " addGrid(blor.elasticNetParam, [0, 0.5, 1]).\\\n",
54 | " build()"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "### Split data into training and test sets\n",
62 | "* Refer to the [logistic regression page](logistic-regression.ipynb) to see what data we used and how the training and test sets were generated.\n",
63 | "\n",
64 | "### Run k (k=4) folds cross validation"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {
71 | "collapsed": true
72 | },
73 | "outputs": [],
74 | "source": [
75 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
76 | "evaluator = BinaryClassificationEvaluator()\n",
77 | "\n",
78 | "from pyspark.ml.tuning import CrossValidator\n",
79 | "cv = CrossValidator(estimator=blor, estimatorParamMaps=param_grid, evaluator=evaluator, numFolds=4)\n",
80 | "\n",
81 | "cvModel = cv.fit(training)"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": null,
87 | "metadata": {
88 | "collapsed": true
89 | },
90 | "outputs": [],
91 | "source": []
92 | }
93 | ],
94 | "metadata": {
95 | "kernelspec": {
96 | "display_name": "Python 3",
97 | "language": "python",
98 | "name": "python3"
99 | },
100 | "language_info": {
101 | "codemirror_mode": {
102 | "name": "ipython",
103 | "version": 3
104 | },
105 | "file_extension": ".py",
106 | "mimetype": "text/x-python",
107 | "name": "python",
108 | "nbconvert_exporter": "python",
109 | "pygments_lexer": "ipython3",
110 | "version": "3.6.1"
111 | }
112 | },
113 | "nbformat": 4,
114 | "nbformat_minor": 2
115 | }
116 |
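After fitting, `cvModel` already holds the best of the 12 models, so steps 5 and 6 reduce to a couple of calls. A sketch, assuming the `test` DataFrame from the logistic regression page:

```python
# average cross-validation score for each of the 12 parameter combinations
print(cvModel.avgMetrics)

# cvModel.transform() applies the best model found by the grid search,
# so step 6 is simply evaluating it on the untouched test set
predictions = cvModel.transform(test)
print(evaluator.evaluate(predictions))
```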
--------------------------------------------------------------------------------
/notebooks/05-module-turning/regularization.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Regularization\n",
8 | "---"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "## Regularization\n",
16 | "\n",
17 | "Regularization is the technique used to solve the overfitting problem. An overfitted model means that the model can predict very well with the training data, but perform poorly with independent validation data.\n",
18 | "\n",
19 | "When we add more predictors to our model, we will almost neccessarily decrease the **Residual Sum of Squares** (RSS; smaller RSS indicates better model). This increases the complexity of our model and makes our model only perform well on the training data (overfitting).\n",
20 | "\n",
21 | "To balance the RSS and model overfitting, we introduce penalty for adding new predictors (coefficient $\\beta \\neq 0$) to the model.\n",
22 | "\n",
23 | "\n",
24 | "\n",
25 | "\n",
26 | "\n",
27 | "\n",
28 | "\n",
29 | "\n",
30 | "## LASSO regularization and Ridge regularization\n",
31 | "\n",
32 | "* **LASSO**: $min \\{RSS + \\lambda\\sum_{j=1}^{p}|\\beta_1|\\}$\n",
33 | "* **Ridge**: $min \\{RSS + \\lambda\\sum_{j=1}^{p}\\beta^2_2\\}$\n",
34 | "\n",
35 | "## Elastic Net regularization\n",
36 | "\n",
37 | "Elastic net is a regularized method that linearly combines penalities of the lasso and ridge methods.\n",
38 | "\n",
39 | "* **elastic net**: $min \\{RSS + \\lambda[\\sum_{j=1}^{p}\\frac{1}{2}(1-\\alpha)|\\beta^2_2| + \\alpha\\sum_{j=1}^{p}\\beta_1]\\}$\n",
40 | "\n",
41 | "Reference: https://spark.apache.org/docs/2.1.1/ml-classification-regression.html\n",
42 | "\n",
43 | "## *regParam* and *elasticNetParam* parameters in regression models\n",
44 | "\n",
45 | "* **regParam**: corresponds to $\\lambda$\n",
46 | "* **elasticNetParam** corresponds to $\\alpha$. When $\\alpha = 0$, it is ridge regularization (L2 penalty). When $\\alpha = 1$, it is lasso regularization (L1 penalty)."
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {
53 | "collapsed": true
54 | },
55 | "outputs": [],
56 | "source": []
57 | }
58 | ],
59 | "metadata": {
60 | "kernelspec": {
61 | "display_name": "Python 3",
62 | "language": "python",
63 | "name": "python3"
64 | },
65 | "language_info": {
66 | "codemirror_mode": {
67 | "name": "ipython",
68 | "version": 3
69 | },
70 | "file_extension": ".py",
71 | "mimetype": "text/x-python",
72 | "name": "python",
73 | "nbconvert_exporter": "python",
74 | "pygments_lexer": "ipython3",
75 | "version": "3.6.1"
76 | }
77 | },
78 | "nbformat": 4,
79 | "nbformat_minor": 2
80 | }
81 |
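A minimal sketch of how these two parameters map onto a Spark estimator; the column names `features` and `label` are placeholders:

```python
from pyspark.ml.regression import LinearRegression

# regParam is lambda, elasticNetParam is alpha
ridge = LinearRegression(featuresCol='features', labelCol='label',
                         regParam=0.5, elasticNetParam=0.0)  # L2 penalty
lasso = LinearRegression(featuresCol='features', labelCol='label',
                         regParam=0.5, elasticNetParam=1.0)  # L1 penalty
enet = LinearRegression(featuresCol='features', labelCol='label',
                        regParam=0.5, elasticNetParam=0.5)   # mixture of both
```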
--------------------------------------------------------------------------------
/notebooks/ipynb/.ipynb_checkpoints/HashingTF-and-CountVectorizer-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 0
6 | }
7 |
--------------------------------------------------------------------------------
/notebooks/ipynb/.ipynb_checkpoints/NaiveBayes-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 0
6 | }
7 |
--------------------------------------------------------------------------------
/notebooks/ipynb/.ipynb_checkpoints/RDD-manipulation-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 0
6 | }
7 |
--------------------------------------------------------------------------------
/notebooks/ipynb/derby.log:
--------------------------------------------------------------------------------
1 | ----------------------------------------------------------------
2 | Wed Mar 22 19:59:25 EDT 2017:
3 | Booting Derby version The Apache Software Foundation - Apache Derby - 10.12.1.1 - (1704137): instance a816c00e-015a-f875-fa3e-0000108dd888
4 | on database directory /Users/mingchen/GoogleDrive/R-projects/learning-apache-spark/ipynb/metastore_db with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@3bcf99f7
5 | Loaded from file:/usr/local/Cellar/apache-spark/2.1.0/libexec/jars/derby-10.12.1.1.jar
6 | java.vendor=Oracle Corporation
7 | java.runtime.version=1.8.0_51-b16
8 | user.dir=/Users/mingchen/GoogleDrive/R-projects/learning-apache-spark/ipynb
9 | os.name=Mac OS X
10 | os.arch=x86_64
11 | os.version=10.12.3
12 | derby.system.home=null
13 | Database Class Loader started - derby.database.classpath=''
14 |
--------------------------------------------------------------------------------
/notebooks/ipynb/preproc.py:
--------------------------------------------------------------------------------
1 | from nltk.stem.wordnet import WordNetLemmatizer
2 | from nltk.corpus import stopwords
3 | from nltk import pos_tag
4 | import string
5 | import re
6 | import langid
7 |
8 | # Convert to float format
9 | def string_to_float(x):
10 | return float(x)
11 |
12 | # Use langid module to classify the language to make sure we are applying the correct cleanup actions for English
13 | # https://github.com/saffsd/langid.py
14 | def check_lang(data_str):
15 | predict_lang = langid.classify(data_str)
16 | if predict_lang[1] >= .9:
17 | language = predict_lang[0]
18 | else:
19 | language = 'NA'
20 | return language
21 |
22 |
23 | # Stop words usually refer to the most common words in a language; there is no single universal list of stop words used
24 | # by all natural language processing tools.
25 | # Reduces dimensionality.
26 | # Removes stop words from a single tweet (cleaned_str/row/document).
27 | def remove_stops(data_str):
28 | # expects a string
29 | stops = set(stopwords.words("english"))
30 | list_pos = 0
31 | cleaned_str = ''
32 | text = data_str.split()
33 | for word in text:
34 | if word not in stops:
35 | # rebuild cleaned_str
36 | if list_pos == 0:
37 | cleaned_str = word
38 | else:
39 | cleaned_str = cleaned_str + ' ' + word
40 | list_pos += 1
41 | return cleaned_str
42 |
43 |
44 | # catch-all to remove other 'words' that I felt didn't add a lot of value
45 | # Reduces Dimensionality, gets rid of a lot of unique urls
46 | def remove_features(data_str):
47 | # compile regex
48 | url_re = re.compile(r'https?://(www\.)?\w+\.\w+(/\w+)*/?')
49 | punc_re = re.compile('[%s]' % re.escape(string.punctuation))
50 | num_re = re.compile(r'(\d+)')
51 | mention_re = re.compile(r'@(\w+)')
52 | alpha_num_re = re.compile(r'^[a-z0-9_.]+$')
53 | # convert to lowercase
54 | data_str = data_str.lower()
55 | # remove hyperlinks
56 | data_str = url_re.sub(' ', data_str)
57 | # remove @mentions
58 | data_str = mention_re.sub(' ', data_str)
59 | # remove punctuation
60 | data_str = punc_re.sub(' ', data_str)
61 | # remove numeric 'words'
62 | data_str = num_re.sub(' ', data_str)
63 | # remove non a-z 0-9 characters and words shorter than 3 characters
64 | list_pos = 0
65 | cleaned_str = ''
66 | for word in data_str.split():
67 | if list_pos == 0:
68 | if alpha_num_re.match(word) and len(word) > 2:
69 | cleaned_str = word
70 | else:
71 | cleaned_str = ' '
72 | else:
73 | if alpha_num_re.match(word) and len(word) > 2:
74 | cleaned_str = cleaned_str + ' ' + word
75 | else:
76 | cleaned_str += ' '
77 | list_pos += 1
78 | return cleaned_str
79 |
80 |
81 | # Process of classifying words into their parts of speech and labeling them accordingly is known as part-of-speech
82 | # tagging, POS-tagging, or simply tagging. Parts of speech are also known as word classes or lexical categories. The
83 | # collection of tags used for a particular task is known as a tagset. Our emphasis in this chapter is on exploiting
84 | # tags, and tagging text automatically.
85 | # http://www.nltk.org/book/ch05.html
86 | def tag_and_remove(data_str):
87 | cleaned_str = ' '
88 | # noun tags
89 | nn_tags = ['NN', 'NNP', 'NNPS', 'NNS']
90 | # adjectives
91 | jj_tags = ['JJ', 'JJR', 'JJS']
92 | # verbs
93 | vb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
94 | nltk_tags = nn_tags + jj_tags + vb_tags
95 |
96 | # break string into 'words'
97 | text = data_str.split()
98 |
99 | # tag the text and keep only those with the right tags
100 | tagged_text = pos_tag(text)
101 | for tagged_word in tagged_text:
102 | if tagged_word[1] in nltk_tags:
103 | cleaned_str += tagged_word[0] + ' '
104 |
105 | return cleaned_str
106 |
107 |
108 | # Tweets are going to use different forms of a word, such as organize, organizes, and
109 | # organizing. Additionally, there are families of derivationally related words with similar meanings, such as democracy,
110 | # democratic, and democratization. In many situations, it seems as if it would be useful for a search for one of these
111 | # words to return documents that contain another word in the set.
112 | # Reduces Dimensionality and boosts numerical measures like TFIDF
113 |
114 | # http://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html
115 | # lemmatization of a single tweet (cleaned_str/row/document)
116 | def lemmatize(data_str):
117 | # expects a string
118 | list_pos = 0
119 | cleaned_str = ''
120 | lmtzr = WordNetLemmatizer()
121 | text = data_str.split()
122 | tagged_words = pos_tag(text)
123 | for word in tagged_words:
124 | if 'v' in word[1].lower():
125 | lemma = lmtzr.lemmatize(word[0], pos='v')
126 | else:
127 | lemma = lmtzr.lemmatize(word[0], pos='n')
128 | if list_pos == 0:
129 | cleaned_str = lemma
130 | else:
131 | cleaned_str = cleaned_str + ' ' + lemma
132 | list_pos += 1
133 | return cleaned_str
134 |
135 |
136 | # check to see if a row only contains whitespace
137 | def check_blanks(data_str):
138 | is_blank = str(data_str.isspace())
139 | return is_blank
140 |
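These helpers operate on single strings, so in a Spark job they would typically be wrapped as UDFs. A hedged sketch, assuming a SparkSession plus a DataFrame `df` with a string column `text` (both names are hypothetical):

```python
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

# wrap the row-level helpers as Spark SQL UDFs
# (assumes this file's functions are in scope, e.g. from preproc import *)
remove_stops_udf = udf(remove_stops, StringType())
lemmatize_udf = udf(lemmatize, StringType())

cleaned = df.withColumn('no_stops', remove_stops_udf(df['text'])) \
            .withColumn('lemmas', lemmatize_udf('no_stops'))
```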
--------------------------------------------------------------------------------
/notebooks/ipynb/vector.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "from pyspark import SparkConf, SparkContext\n",
12 | "## set up spark context\n",
13 | "from pyspark.sql import SQLContext\n",
14 | "sc = SparkContext()\n",
15 | "sqlContext = SQLContext(sc)\n",
16 | "## set up SparkSession\n",
17 | "from pyspark.sql import SparkSession\n",
18 | "\n",
19 | "spark = SparkSession \\\n",
20 | " .builder \\\n",
21 | " .appName(\"Python Spark SQL basic example\") \\\n",
22 | " .config(\"spark.some.config.option\", \"some-value\") \\\n",
23 | " .getOrCreate()"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "metadata": {
30 | "collapsed": false
31 | },
32 | "outputs": [],
33 | "source": [
34 | "from pyspark.ml.linalg import Vectors\n",
35 | "densevector = Vectors.dense([1,3,4,2.5])"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 3,
41 | "metadata": {
42 | "collapsed": false
43 | },
44 | "outputs": [
45 | {
46 | "data": {
47 | "text/plain": [
48 | "DenseVector([1.0, 3.0, 4.0, 2.5])"
49 | ]
50 | },
51 | "execution_count": 3,
52 | "metadata": {},
53 | "output_type": "execute_result"
54 | }
55 | ],
56 | "source": [
57 | "densevector"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 4,
63 | "metadata": {
64 | "collapsed": false
65 | },
66 | "outputs": [
67 | {
68 | "data": {
69 | "text/plain": [
70 | "array([ 1. , 3. , 4. , 2.5])"
71 | ]
72 | },
73 | "execution_count": 4,
74 | "metadata": {},
75 | "output_type": "execute_result"
76 | }
77 | ],
78 | "source": [
79 | "densevector.toArray()"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 5,
85 | "metadata": {
86 | "collapsed": true
87 | },
88 | "outputs": [],
89 | "source": [
90 | "sparseVector = Vectors.sparse(10, [1, 3], [3.0, 4.5])"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 6,
96 | "metadata": {
97 | "collapsed": false
98 | },
99 | "outputs": [
100 | {
101 | "data": {
102 | "text/plain": [
103 | "array([ 0. , 3. , 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. ])"
104 | ]
105 | },
106 | "execution_count": 6,
107 | "metadata": {},
108 | "output_type": "execute_result"
109 | }
110 | ],
111 | "source": [
112 | "sparseVector.toArray()"
113 | ]
114 | }
115 | ],
116 | "metadata": {
117 | "kernelspec": {
118 | "display_name": "Python 2",
119 | "language": "python",
120 | "name": "python2"
121 | },
122 | "language_info": {
123 | "codemirror_mode": {
124 | "name": "ipython",
125 | "version": 2
126 | },
127 | "file_extension": ".py",
128 | "mimetype": "text/x-python",
129 | "name": "python",
130 | "nbconvert_exporter": "python",
131 | "pygments_lexer": "ipython2",
132 | "version": "2.7.6"
133 | }
134 | },
135 | "nbformat": 4,
136 | "nbformat_minor": 1
137 | }
138 |
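`Vectors.sparse()` accepts the active entries in more than one form; the two most common are parallel index/value lists (as above) and an {index: value} dictionary. A small sketch:

```python
from pyspark.ml.linalg import Vectors

v1 = Vectors.sparse(10, [1, 3], [3.0, 4.5])  # indices list + values list
v2 = Vectors.sparse(10, {1: 3.0, 3: 4.5})    # {index: value} dictionary
(v1.toArray() == v2.toArray()).all()         # -> True
```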
--------------------------------------------------------------------------------
/pyFiles/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/pyFiles/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/pyFiles/.idea/pyFiles.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/pyFiles/my_module.py:
--------------------------------------------------------------------------------
1 | def addPyFiles_is_successfull():
2 | return True
3 | 
4 | def sum_two_variables(a, b):
5 | return a + b
6 |
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/.gitignore:
--------------------------------------------------------------------------------
1 | /debug
2 | /jstree.sublime-project
3 | /jstree.sublime-workspace
4 | /bower_components
5 | /node_modules
6 | /site
7 | /nuget
8 | /demo/filebrowser/data/root
9 | /npm.txt
10 | /libs
11 | /docs
12 | /dist/libs
13 | /.vscode
14 | /.idea
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/LICENSE-MIT:
--------------------------------------------------------------------------------
1 | Copyright (c) 2014 Ivan Bozhanov
2 |
3 | Permission is hereby granted, free of charge, to any person
4 | obtaining a copy of this software and associated documentation
5 | files (the "Software"), to deal in the Software without
6 | restriction, including without limitation the rights to use,
7 | copy, modify, merge, publish, distribute, sublicense, and/or sell
8 | copies of the Software, and to permit persons to whom the
9 | Software is furnished to do so, subject to the following
10 | conditions:
11 |
12 | The above copyright notice and this permission notice shall be
13 | included in all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 |
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/bower.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "jstree",
3 | "license": "MIT",
4 | "version": "3.3.5",
5 | "main" : [
6 | "./dist/jstree.js",
7 | "./dist/themes/default/style.css"
8 | ],
9 | "ignore": [
10 | "**/.*",
11 | "docs",
12 | "demo",
13 | "libs",
14 | "node_modules",
15 | "test",
16 | "libs",
17 | "jstree.jquery.json",
18 | "gruntfile.js",
19 | "package.json",
20 | "bower.json",
21 | "component.json",
22 | "LICENCE-MIT",
23 | "README.md"
24 | ],
25 | "dependencies": {
26 | "jquery": ">=1.9.1"
27 | },
28 | "keywords": [
29 | "ui",
30 | "tree",
31 | "jstree"
32 | ]
33 | }
34 |
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/component.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "jstree",
3 | "repo": "vakata/jstree",
4 | "description": "jsTree is jquery plugin, that provides interactive trees.",
5 | "version": "3.3.5",
6 | "license": "MIT",
7 | "keywords": [
8 | "ui",
9 | "tree",
10 | "jstree"
11 | ],
12 | "scripts": [
13 | "dist/jstree.js",
14 | "dist/jstree.min.js"
15 | ],
16 | "images": [
17 | "dist/themes/default/32px.png",
18 | "dist/themes/default/40px.png",
19 | "dist/themes/default/throbber.gif"
20 | ],
21 | "styles": [
22 | "dist/themes/default/style.css",
23 | "dist/themes/default/style.min.css"
24 | ],
25 | "dependencies": {
26 | "components/jquery": ">=1.9.1"
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "vakata/jstree",
3 | "description": "jsTree is jquery plugin, that provides interactive trees.",
4 | "type": "component",
5 | "homepage": "http://jstree.com",
6 | "license": "MIT",
7 | "support": {
8 | "issues": "https://github.com/vakata/jstree/issues",
9 | "forum": "https://groups.google.com/forum/#!forum/jstree",
10 | "source": "https://github.com/vakata/jstree"
11 | },
12 | "authors": [
13 | {
14 | "name": "Ivan Bozhanov",
15 | "email": "jstree@jstree.com"
16 | }
17 | ],
18 | "require": {
19 | "components/jquery": ">=1.9.1"
20 | },
21 | "suggest": {
22 | "robloach/component-installer": "Allows installation of Components via Composer"
23 | },
24 | "extra": {
25 | "component": {
26 | "scripts": [
27 | "dist/jstree.js"
28 | ],
29 | "styles": [
30 | "dist/themes/default/style.css"
31 | ],
32 | "images": [
33 | "dist/themes/default/32px.png",
34 | "dist/themes/default/40px.png",
35 | "dist/themes/default/throbber.gif"
36 | ],
37 | "files": [
38 | "dist/jstree.min.js",
39 | "dist/themes/default/style.min.css",
40 | "dist/themes/default/32px.png",
41 | "dist/themes/default/40px.png",
42 | "dist/themes/default/throbber.gif"
43 | ]
44 | }
45 | }
46 | }
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/demo/README.md:
--------------------------------------------------------------------------------
1 | ## PHP demos moved to new repository
2 | https://github.com/vakata/jstree-php-demos
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/demo/basic/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | jstree basic demos
6 |
12 |
13 |
14 |
15 | HTML demo
16 |
17 |
18 | - Root node
19 |
20 | - Child node 1
21 | - Child node 2
22 |
23 |
24 |
25 |
26 |
27 | Inline data demo
28 |
29 |
30 | Data format demo
31 |
32 |
33 | AJAX demo
34 |
35 |
36 | Lazy loading demo
37 |
38 |
39 | Callback function data demo
40 |
41 |
42 | Interaction and events demo
43 | either click the button or a node in the tree
44 |
45 |
46 |
47 |
48 |
49 |
145 |
146 |
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/demo/basic/root.json:
--------------------------------------------------------------------------------
1 | [{"id":1,"text":"Root node","children":[{"id":2,"text":"Child node 1"},{"id":3,"text":"Child node 2"}]}]
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/dist/themes/default-dark/32px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default-dark/32px.png
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/dist/themes/default-dark/40px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default-dark/40px.png
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/dist/themes/default-dark/throbber.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default-dark/throbber.gif
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/dist/themes/default/32px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default/32px.png
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/dist/themes/default/40px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default/40px.png
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/dist/themes/default/throbber.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/dist/themes/default/throbber.gif
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/jstree.jquery.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "jstree",
3 | "title": "jsTree",
4 | "description": "Tree view for jQuery",
5 | "version": "3.3.5",
6 | "homepage": "http://jstree.com",
7 | "keywords": [
8 | "ui",
9 | "tree",
10 | "jstree"
11 | ],
12 | "author": {
13 | "name": "Ivan Bozhanov",
14 | "email": "jstree@jstree.com",
15 | "url": "http://vakata.com"
16 | },
17 | "licenses": [
18 | {
19 | "type": "MIT",
20 | "url": "https://github.com/vakata/jstree/blob/master/LICENSE-MIT"
21 | }
22 | ],
23 | "bugs": "https://github.com/vakata/jstree/issues",
24 | "demo": "http://jstree.com/demo",
25 | "dependencies": {
26 | "jquery": ">=1.9.1"
27 | }
28 | }
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "jstree",
3 | "title": "jsTree",
4 | "description": "jQuery tree plugin",
5 | "version": "3.3.5",
6 | "homepage": "http://jstree.com",
7 | "main": "./dist/jstree.js",
8 | "author": {
9 | "name": "Ivan Bozhanov",
10 | "email": "jstree@jstree.com",
11 | "url": "http://vakata.com"
12 | },
13 | "repository": {
14 | "type": "git",
15 | "url": "git://github.com/vakata/jstree.git"
16 | },
17 | "bugs": {
18 | "url": "https://github.com/vakata/jstree/issues"
19 | },
20 | "license": "MIT",
21 | "licenses": [
22 | {
23 | "type": "MIT",
24 | "url": "https://github.com/vakata/jstree/blob/master/LICENSE-MIT"
25 | }
26 | ],
27 | "keywords": [],
28 | "devDependencies": {
29 | "dox": "~0.4.4",
30 | "grunt": "~0.4.0",
31 | "grunt-contrib-concat": "*",
32 | "grunt-contrib-copy": "*",
33 | "grunt-contrib-imagemin": "~0.4.0",
34 | "grunt-contrib-jshint": "*",
35 | "grunt-contrib-less": "~0.8.2",
36 | "grunt-contrib-qunit": "~v0.3.0",
37 | "grunt-contrib-uglify": "*",
38 | "grunt-contrib-watch": "~0.5.3",
39 | "grunt-phantomcss-gitdiff": "0.0.7",
40 | "grunt-resemble-cli": "0.0.8",
41 | "grunt-text-replace": "~0.3.11"
42 | },
43 | "dependencies": {
44 | "jquery": ">=1.9.1"
45 | },
46 | "npmName": "jstree",
47 | "npmFileMap": [
48 | {
49 | "basePath": "/dist/",
50 | "files": [
51 | "jstree.min.js",
52 | "themes/**/*.png",
53 | "themes/**/*.gif",
54 | "themes/**/*.min.css"
55 | ]
56 | }
57 | ]
58 | }
59 |
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/intro.js:
--------------------------------------------------------------------------------
1 | /*globals jQuery, define, module, exports, require, window, document, postMessage */
2 | (function (factory) {
3 | "use strict";
4 | if (typeof define === 'function' && define.amd) {
5 | define(['jquery'], factory);
6 | }
7 | else if(typeof module !== 'undefined' && module.exports) {
8 | module.exports = factory(require('jquery'));
9 | }
10 | else {
11 | factory(jQuery);
12 | }
13 | }(function ($, undefined) {
14 | "use strict";
15 |
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/jstree.changed.js:
--------------------------------------------------------------------------------
1 | /**
2 | * ### Changed plugin
3 | *
4 | * This plugin adds more information to the `changed.jstree` event. The new data is contained in the `changed` event data property, and contains a lists of `selected` and `deselected` nodes.
5 | */
6 | /*globals jQuery, define, exports, require, document */
7 | (function (factory) {
8 | "use strict";
9 | if (typeof define === 'function' && define.amd) {
10 | define('jstree.changed', ['jquery','jstree'], factory);
11 | }
12 | else if(typeof exports === 'object') {
13 | factory(require('jquery'), require('jstree'));
14 | }
15 | else {
16 | factory(jQuery, jQuery.jstree);
17 | }
18 | }(function ($, jstree, undefined) {
19 | "use strict";
20 |
21 | if($.jstree.plugins.changed) { return; }
22 |
23 | $.jstree.plugins.changed = function (options, parent) {
24 | var last = [];
25 | this.trigger = function (ev, data) {
26 | var i, j;
27 | if(!data) {
28 | data = {};
29 | }
30 | if(ev.replace('.jstree','') === 'changed') {
31 | data.changed = { selected : [], deselected : [] };
32 | var tmp = {};
33 | for(i = 0, j = last.length; i < j; i++) {
34 | tmp[last[i]] = 1;
35 | }
36 | for(i = 0, j = data.selected.length; i < j; i++) {
37 | if(!tmp[data.selected[i]]) {
38 | data.changed.selected.push(data.selected[i]);
39 | }
40 | else {
41 | tmp[data.selected[i]] = 2;
42 | }
43 | }
44 | for(i = 0, j = last.length; i < j; i++) {
45 | if(tmp[last[i]] === 1) {
46 | data.changed.deselected.push(last[i]);
47 | }
48 | }
49 | last = data.selected.slice();
50 | }
51 | /**
52 | * triggered when selection changes (the "changed" plugin enhances the original event with more data)
53 | * @event
54 | * @name changed.jstree
55 | * @param {Object} node
56 | * @param {Object} action the action that caused the selection to change
57 | * @param {Array} selected the current selection
58 | * @param {Object} changed an object containing two properties `selected` and `deselected` - both arrays of node IDs, which were selected or deselected since the last changed event
59 | * @param {Object} event the event (if any) that triggered this changed event
60 | * @plugin changed
61 | */
62 | parent.trigger.call(this, ev, data);
63 | };
64 | this.refresh = function (skip_loading, forget_state) {
65 | last = [];
66 | return parent.refresh.apply(this, arguments);
67 | };
68 | };
69 | }));
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/jstree.conditionalselect.js:
--------------------------------------------------------------------------------
1 | /**
2 | * ### Conditionalselect plugin
3 | *
4 | * This plugin allows defining a callback to allow or deny node selection by user input (activate node method).
5 | */
6 | /*globals jQuery, define, exports, require, document */
7 | (function (factory) {
8 | "use strict";
9 | if (typeof define === 'function' && define.amd) {
10 | define('jstree.conditionalselect', ['jquery','jstree'], factory);
11 | }
12 | else if(typeof exports === 'object') {
13 | factory(require('jquery'), require('jstree'));
14 | }
15 | else {
16 | factory(jQuery, jQuery.jstree);
17 | }
18 | }(function ($, jstree, undefined) {
19 | "use strict";
20 |
21 | if($.jstree.plugins.conditionalselect) { return; }
22 |
23 | /**
24 | * a callback (function) which is invoked in the instance's scope and receives two arguments - the node and the event that triggered the `activate_node` call. Returning false prevents working with the node, returning true allows invoking activate_node. Defaults to returning `true`.
25 | * @name $.jstree.defaults.checkbox.visible
26 | * @plugin checkbox
27 | */
28 | $.jstree.defaults.conditionalselect = function () { return true; };
29 | $.jstree.plugins.conditionalselect = function (options, parent) {
30 | // own function
31 | this.activate_node = function (obj, e) {
32 | if(this.settings.conditionalselect.call(this, this.get_node(obj), e)) {
33 | return parent.activate_node.call(this, obj, e);
34 | }
35 | };
36 | };
37 |
38 | }));
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/jstree.massload.js:
--------------------------------------------------------------------------------
1 | /**
2 | * ### Massload plugin
3 | *
4 | * Adds massload functionality to jsTree, so that multiple nodes can be loaded in a single request (only useful with lazy loading).
5 | */
6 | /*globals jQuery, define, exports, require, document */
7 | (function (factory) {
8 | "use strict";
9 | if (typeof define === 'function' && define.amd) {
10 | define('jstree.massload', ['jquery','jstree'], factory);
11 | }
12 | else if(typeof exports === 'object') {
13 | factory(require('jquery'), require('jstree'));
14 | }
15 | else {
16 | factory(jQuery, jQuery.jstree);
17 | }
18 | }(function ($, jstree, undefined) {
19 | "use strict";
20 |
21 | if($.jstree.plugins.massload) { return; }
22 |
23 | /**
24 | * massload configuration
25 | *
26 | * It is possible to set this to a standard jQuery-like AJAX config.
27 | * In addition to the standard jQuery ajax options here you can supply functions for `data` and `url`, the functions will be run in the current instance's scope and a param will be passed indicating which node IDs need to be loaded, the return value of those functions will be used.
28 | *
29 | * You can also set this to a function, that function will receive the node IDs being loaded as argument and a second param which is a function (callback) which should be called with the result.
30 | *
31 | * Both the AJAX and the function approach rely on the same return value - an object where the keys are the node IDs, and the value is the children of that node as an array.
32 | *
33 | * {
34 | * "id1" : [{ "text" : "Child of ID1", "id" : "c1" }, { "text" : "Another child of ID1", "id" : "c2" }],
35 | * "id2" : [{ "text" : "Child of ID2", "id" : "c3" }]
36 | * }
37 | *
38 | * @name $.jstree.defaults.massload
39 | * @plugin massload
40 | */
41 | $.jstree.defaults.massload = null;
42 | $.jstree.plugins.massload = function (options, parent) {
43 | this.init = function (el, options) {
44 | this._data.massload = {};
45 | parent.init.call(this, el, options);
46 | };
47 | this._load_nodes = function (nodes, callback, is_callback, force_reload) {
48 | var s = this.settings.massload,
49 | nodesString = JSON.stringify(nodes),
50 | toLoad = [],
51 | m = this._model.data,
52 | i, j, dom;
53 | if (!is_callback) {
54 | for(i = 0, j = nodes.length; i < j; i++) {
55 | if(!m[nodes[i]] || ( (!m[nodes[i]].state.loaded && !m[nodes[i]].state.failed) || force_reload) ) {
56 | toLoad.push(nodes[i]);
57 | dom = this.get_node(nodes[i], true);
58 | if (dom && dom.length) {
59 | dom.addClass("jstree-loading").attr('aria-busy',true);
60 | }
61 | }
62 | }
63 | this._data.massload = {};
64 | if (toLoad.length) {
65 | if($.isFunction(s)) {
66 | return s.call(this, toLoad, $.proxy(function (data) {
67 | var i, j;
68 | if(data) {
69 | for(i in data) {
70 | if(data.hasOwnProperty(i)) {
71 | this._data.massload[i] = data[i];
72 | }
73 | }
74 | }
75 | for(i = 0, j = nodes.length; i < j; i++) {
76 | dom = this.get_node(nodes[i], true);
77 | if (dom && dom.length) {
78 | dom.removeClass("jstree-loading").attr('aria-busy',false);
79 | }
80 | }
81 | parent._load_nodes.call(this, nodes, callback, is_callback, force_reload);
82 | }, this));
83 | }
84 | if(typeof s === 'object' && s && s.url) {
85 | s = $.extend(true, {}, s);
86 | if($.isFunction(s.url)) {
87 | s.url = s.url.call(this, toLoad);
88 | }
89 | if($.isFunction(s.data)) {
90 | s.data = s.data.call(this, toLoad);
91 | }
92 | return $.ajax(s)
93 | .done($.proxy(function (data,t,x) {
94 | var i, j;
95 | if(data) {
96 | for(i in data) {
97 | if(data.hasOwnProperty(i)) {
98 | this._data.massload[i] = data[i];
99 | }
100 | }
101 | }
102 | for(i = 0, j = nodes.length; i < j; i++) {
103 | dom = this.get_node(nodes[i], true);
104 | if (dom && dom.length) {
105 | dom.removeClass("jstree-loading").attr('aria-busy',false);
106 | }
107 | }
108 | parent._load_nodes.call(this, nodes, callback, is_callback, force_reload);
109 | }, this))
110 | .fail($.proxy(function (f) {
111 | parent._load_nodes.call(this, nodes, callback, is_callback, force_reload);
112 | }, this));
113 | }
114 | }
115 | }
116 | return parent._load_nodes.call(this, nodes, callback, is_callback, force_reload);
117 | };
118 | this._load_node = function (obj, callback) {
119 | var data = this._data.massload[obj.id],
120 | rslt = null, dom;
121 | if(data) {
122 | rslt = this[typeof data === 'string' ? '_append_html_data' : '_append_json_data'](
123 | obj,
124 | typeof data === 'string' ? $($.parseHTML(data)).filter(function () { return this.nodeType !== 3; }) : data,
125 | function (status) { callback.call(this, status); }
126 | );
127 | dom = this.get_node(obj.id, true);
128 | if (dom && dom.length) {
129 | dom.removeClass("jstree-loading").attr('aria-busy',false);
130 | }
131 | delete this._data.massload[obj.id];
132 | return rslt;
133 | }
134 | return parent._load_node.call(this, obj, callback);
135 | };
136 | };
137 | }));
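A hedged usage sketch of the configuration described above (not part of the upstream file); the `/nodes` and `/massload` endpoints and the `ids` parameter are illustrative assumptions. The server is expected to answer `/massload` with the `{ "id1" : [...], "id2" : [...] }` shape shown in the doc comment:

    $('#tree').jstree({
        core : {
            // lazy loading: children are normally fetched one node at a time
            data : { url : '/nodes', data : function (node) { return { id : node.id }; } }
        },
        plugins : [ 'massload' ],
        massload : {
            url  : '/massload',  // could also be a function receiving the node IDs
            data : function (ids) { return { ids : ids.join(',') }; }
        }
    });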
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/jstree.sort.js:
--------------------------------------------------------------------------------
1 | /**
2 | * ### Sort plugin
3 | *
4 | * Automatically sorts all siblings in the tree according to a sorting function.
5 | */
6 | /*globals jQuery, define, exports, require */
7 | (function (factory) {
8 | "use strict";
9 | if (typeof define === 'function' && define.amd) {
10 | define('jstree.sort', ['jquery','jstree'], factory);
11 | }
12 | else if(typeof exports === 'object') {
13 | factory(require('jquery'), require('jstree'));
14 | }
15 | else {
16 | factory(jQuery, jQuery.jstree);
17 | }
18 | }(function ($, jstree, undefined) {
19 | "use strict";
20 |
21 | if($.jstree.plugins.sort) { return; }
22 |
23 | /**
24 | * the settings function used to sort the nodes.
25 | * It is executed in the tree's context, receives two node IDs as arguments, and should return `1` or `-1`.
26 | * @name $.jstree.defaults.sort
27 | * @plugin sort
28 | */
29 | $.jstree.defaults.sort = function (a, b) {
30 | //return this.get_type(a) === this.get_type(b) ? (this.get_text(a) > this.get_text(b) ? 1 : -1) : this.get_type(a) >= this.get_type(b);
31 | return this.get_text(a) > this.get_text(b) ? 1 : -1;
32 | };
33 | $.jstree.plugins.sort = function (options, parent) {
34 | this.bind = function () {
35 | parent.bind.call(this);
36 | this.element
37 | .on("model.jstree", $.proxy(function (e, data) {
38 | this.sort(data.parent, true);
39 | }, this))
40 | .on("rename_node.jstree create_node.jstree", $.proxy(function (e, data) {
41 | this.sort(data.parent || data.node.parent, false);
42 | this.redraw_node(data.parent || data.node.parent, true);
43 | }, this))
44 | .on("move_node.jstree copy_node.jstree", $.proxy(function (e, data) {
45 | this.sort(data.parent, false);
46 | this.redraw_node(data.parent, true);
47 | }, this));
48 | };
49 | /**
50 | * used to sort a node's children
51 | * @private
52 | * @name sort(obj [, deep])
53 | * @param {mixed} obj the node
54 | * @param {Boolean} deep if set to `true` nodes are sorted recursively.
55 | * @plugin sort
56 | *
57 | */
58 | this.sort = function (obj, deep) {
59 | var i, j;
60 | obj = this.get_node(obj);
61 | if(obj && obj.children && obj.children.length) {
62 | obj.children.sort($.proxy(this.settings.sort, this));
63 | if(deep) {
64 | for(i = 0, j = obj.children_d.length; i < j; i++) {
65 | this.sort(obj.children_d[i], false);
66 | }
67 | }
68 | }
69 | };
70 | };
71 |
72 | // include the sort plugin by default
73 | // $.jstree.defaults.plugins.push("sort");
74 | }));
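A short usage sketch (not part of the upstream file) with a custom comparator; the `#tree` selector is an illustrative assumption:

    $('#tree').jstree({
        plugins : [ 'sort' ],
        sort : function (a, b) {
            // runs in the instance's scope; a and b are node IDs
            return this.get_text(a).toLowerCase() > this.get_text(b).toLowerCase() ? 1 : -1;
        }
    });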
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/jstree.state.js:
--------------------------------------------------------------------------------
1 | /**
2 | * ### State plugin
3 | *
4 | * Saves the state of the tree (selected nodes, opened nodes) on the user's computer using the browser's localStorage
5 | */
6 | /*globals jQuery, define, exports, require */
7 | (function (factory) {
8 | "use strict";
9 | if (typeof define === 'function' && define.amd) {
10 | define('jstree.state', ['jquery','jstree'], factory);
11 | }
12 | else if(typeof exports === 'object') {
13 | factory(require('jquery'), require('jstree'));
14 | }
15 | else {
16 | factory(jQuery, jQuery.jstree);
17 | }
18 | }(function ($, jstree, undefined) {
19 | "use strict";
20 |
21 | if($.jstree.plugins.state) { return; }
22 |
23 | var to = false;
24 | /**
25 | * stores all defaults for the state plugin
26 | * @name $.jstree.defaults.state
27 | * @plugin state
28 | */
29 | $.jstree.defaults.state = {
30 | /**
31 | * A string for the key to use when saving the current tree (change if using multiple trees in your project). Defaults to `jstree`.
32 | * @name $.jstree.defaults.state.key
33 | * @plugin state
34 | */
35 | key : 'jstree',
36 | /**
37 | * A space separated list of events that trigger a state save. Defaults to `changed.jstree open_node.jstree close_node.jstree check_node.jstree uncheck_node.jstree`.
38 | * @name $.jstree.defaults.state.events
39 | * @plugin state
40 | */
41 | events : 'changed.jstree open_node.jstree close_node.jstree check_node.jstree uncheck_node.jstree',
42 | /**
43 | * Time in milliseconds after which the saved state expires. Defaults to `false`, meaning the state never expires.
44 | * @name $.jstree.defaults.state.ttl
45 | * @plugin state
46 | */
47 | ttl : false,
48 | /**
49 | * A function that will be executed prior to restoring state with one argument - the state object. Can be used to clear unwanted parts of the state.
50 | * @name $.jstree.defaults.state.filter
51 | * @plugin state
52 | */
53 | filter : false,
54 | /**
55 | * Should loaded nodes be restored (setting this to true means that it is possible that the whole tree will be loaded for some users - use with caution). Defaults to `false`
56 | * @name $.jstree.defaults.state.preserve_loaded
57 | * @plugin state
58 | */
59 | preserve_loaded : false
60 | };
61 | $.jstree.plugins.state = function (options, parent) {
62 | this.bind = function () {
63 | parent.bind.call(this);
64 | var bind = $.proxy(function () {
65 | this.element.on(this.settings.state.events, $.proxy(function () {
66 | if(to) { clearTimeout(to); }
67 | to = setTimeout($.proxy(function () { this.save_state(); }, this), 100);
68 | }, this));
69 | /**
70 | * triggered when the state plugin is finished restoring the state (and immediately after ready if there is no state to restore).
71 | * @event
72 | * @name state_ready.jstree
73 | * @plugin state
74 | */
75 | this.trigger('state_ready');
76 | }, this);
77 | this.element
78 | .on("ready.jstree", $.proxy(function (e, data) {
79 | this.element.one("restore_state.jstree", bind);
80 | if(!this.restore_state()) { bind(); }
81 | }, this));
82 | };
83 | /**
84 | * save the state
85 | * @name save_state()
86 | * @plugin state
87 | */
88 | this.save_state = function () {
89 | var tm = this.get_state();
90 | if (!this.settings.state.preserve_loaded) {
91 | delete tm.core.loaded;
92 | }
93 | var st = { 'state' : tm, 'ttl' : this.settings.state.ttl, 'sec' : +(new Date()) };
94 | $.vakata.storage.set(this.settings.state.key, JSON.stringify(st));
95 | };
96 | /**
97 | * restore the state from the user's computer
98 | * @name restore_state()
99 | * @plugin state
100 | */
101 | this.restore_state = function () {
102 | var k = $.vakata.storage.get(this.settings.state.key);
103 | if(!!k) { try { k = JSON.parse(k); } catch(ex) { return false; } }
104 | if(!!k && k.ttl && k.sec && +(new Date()) - k.sec > k.ttl) { return false; }
105 | if(!!k && k.state) { k = k.state; }
106 | if(!!k && $.isFunction(this.settings.state.filter)) { k = this.settings.state.filter.call(this, k); }
107 | if(!!k) {
108 | if (!this.settings.state.preserve_loaded) {
109 | delete k.core.loaded;
110 | }
111 | this.element.one("set_state.jstree", function (e, data) { data.instance.trigger('restore_state', { 'state' : $.extend(true, {}, k) }); });
112 | this.set_state(k);
113 | return true;
114 | }
115 | return false;
116 | };
117 | /**
118 | * clear the state on the user's computer
119 | * @name clear_state()
120 | * @plugin state
121 | */
122 | this.clear_state = function () {
123 | return $.vakata.storage.del(this.settings.state.key);
124 | };
125 | };
126 |
127 | (function ($, undefined) {
128 | $.vakata.storage = {
129 | // assigning the localStorage methods directly throws an error in Firefox, so wrap them in functions
130 | set : function (key, val) { return window.localStorage.setItem(key, val); },
131 | get : function (key) { return window.localStorage.getItem(key); },
132 | del : function (key) { return window.localStorage.removeItem(key); }
133 | };
134 | }($));
135 |
136 | // include the state plugin by default
137 | // $.jstree.defaults.plugins.push("state");
138 | }));
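A hedged usage sketch (not part of the upstream file); the key name and one-week ttl are illustrative, and the filter assumes the saved state carries a `core.selected` array as produced by `get_state()`:

    $('#tree').jstree({
        plugins : [ 'state' ],
        state : {
            key : 'left-nav-tree',          // unique key per tree on the page
            ttl : 7 * 24 * 60 * 60 * 1000,  // saved state expires after one week
            filter : function (state) {
                delete state.core.selected; // restore open nodes, but not the selection
                return state;
            }
        }
    });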
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/jstree.wholerow.js:
--------------------------------------------------------------------------------
1 | /**
2 | * ### Wholerow plugin
3 | *
4 | * Makes each node appear block level, which makes selection easier. May slow down large trees in old browsers.
5 | */
6 | /*globals jQuery, define, exports, require */
7 | (function (factory) {
8 | "use strict";
9 | if (typeof define === 'function' && define.amd) {
10 | define('jstree.wholerow', ['jquery','jstree'], factory);
11 | }
12 | else if(typeof exports === 'object') {
13 | factory(require('jquery'), require('jstree'));
14 | }
15 | else {
16 | factory(jQuery, jQuery.jstree);
17 | }
18 | }(function ($, jstree, undefined) {
19 | "use strict";
20 |
21 | if($.jstree.plugins.wholerow) { return; }
22 |
23 | var div = document.createElement('DIV');
24 | div.setAttribute('unselectable','on');
25 | div.setAttribute('role','presentation');
26 | div.className = 'jstree-wholerow';
27 | div.innerHTML = ' ';
28 | $.jstree.plugins.wholerow = function (options, parent) {
29 | this.bind = function () {
30 | parent.bind.call(this);
31 |
32 | this.element
33 | .on('ready.jstree set_state.jstree', $.proxy(function () {
34 | this.hide_dots();
35 | }, this))
36 | .on("init.jstree loading.jstree ready.jstree", $.proxy(function () {
37 | //div.style.height = this._data.core.li_height + 'px';
38 | this.get_container_ul().addClass('jstree-wholerow-ul');
39 | }, this))
40 | .on("deselect_all.jstree", $.proxy(function (e, data) {
41 | this.element.find('.jstree-wholerow-clicked').removeClass('jstree-wholerow-clicked');
42 | }, this))
43 | .on("changed.jstree", $.proxy(function (e, data) {
44 | this.element.find('.jstree-wholerow-clicked').removeClass('jstree-wholerow-clicked');
45 | var tmp = false, i, j;
46 | for(i = 0, j = data.selected.length; i < j; i++) {
47 | tmp = this.get_node(data.selected[i], true);
48 | if(tmp && tmp.length) {
49 | tmp.children('.jstree-wholerow').addClass('jstree-wholerow-clicked');
50 | }
51 | }
52 | }, this))
53 | .on("open_node.jstree", $.proxy(function (e, data) {
54 | this.get_node(data.node, true).find('.jstree-clicked').parent().children('.jstree-wholerow').addClass('jstree-wholerow-clicked');
55 | }, this))
56 | .on("hover_node.jstree dehover_node.jstree", $.proxy(function (e, data) {
57 | if(e.type === "hover_node" && this.is_disabled(data.node)) { return; }
58 | this.get_node(data.node, true).children('.jstree-wholerow')[e.type === "hover_node"?"addClass":"removeClass"]('jstree-wholerow-hovered');
59 | }, this))
60 | .on("contextmenu.jstree", ".jstree-wholerow", $.proxy(function (e) {
61 | if (this._data.contextmenu) {
62 | e.preventDefault();
63 | var tmp = $.Event('contextmenu', { metaKey : e.metaKey, ctrlKey : e.ctrlKey, altKey : e.altKey, shiftKey : e.shiftKey, pageX : e.pageX, pageY : e.pageY });
64 | $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor").first().trigger(tmp);
65 | }
66 | }, this))
67 | /*!
68 | .on("mousedown.jstree touchstart.jstree", ".jstree-wholerow", function (e) {
69 | if(e.target === e.currentTarget) {
70 | var a = $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor");
71 | e.target = a[0];
72 | a.trigger(e);
73 | }
74 | })
75 | */
76 | .on("click.jstree", ".jstree-wholerow", function (e) {
77 | e.stopImmediatePropagation();
78 | var tmp = $.Event('click', { metaKey : e.metaKey, ctrlKey : e.ctrlKey, altKey : e.altKey, shiftKey : e.shiftKey });
79 | $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor").first().trigger(tmp).focus();
80 | })
81 | .on("dblclick.jstree", ".jstree-wholerow", function (e) {
82 | e.stopImmediatePropagation();
83 | var tmp = $.Event('dblclick', { metaKey : e.metaKey, ctrlKey : e.ctrlKey, altKey : e.altKey, shiftKey : e.shiftKey });
84 | $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor").first().trigger(tmp).focus();
85 | })
86 | .on("click.jstree", ".jstree-leaf > .jstree-ocl", $.proxy(function (e) {
87 | e.stopImmediatePropagation();
88 | var tmp = $.Event('click', { metaKey : e.metaKey, ctrlKey : e.ctrlKey, altKey : e.altKey, shiftKey : e.shiftKey });
89 | $(e.currentTarget).closest(".jstree-node").children(".jstree-anchor").first().trigger(tmp).focus();
90 | }, this))
91 | .on("mouseover.jstree", ".jstree-wholerow, .jstree-icon", $.proxy(function (e) {
92 | e.stopImmediatePropagation();
93 | if(!this.is_disabled(e.currentTarget)) {
94 | this.hover_node(e.currentTarget);
95 | }
96 | return false;
97 | }, this))
98 | .on("mouseleave.jstree", ".jstree-node", $.proxy(function (e) {
99 | this.dehover_node(e.currentTarget);
100 | }, this));
101 | };
102 | this.teardown = function () {
103 | if(this.settings.wholerow) {
104 | this.element.find(".jstree-wholerow").remove();
105 | }
106 | parent.teardown.call(this);
107 | };
108 | this.redraw_node = function(obj, deep, callback, force_render) {
109 | obj = parent.redraw_node.apply(this, arguments);
110 | if(obj) {
111 | var tmp = div.cloneNode(true);
112 | //tmp.style.height = this._data.core.li_height + 'px';
113 | if($.inArray(obj.id, this._data.core.selected) !== -1) { tmp.className += ' jstree-wholerow-clicked'; }
114 | if(this._data.core.focused && this._data.core.focused === obj.id) { tmp.className += ' jstree-wholerow-hovered'; }
115 | obj.insertBefore(tmp, obj.childNodes[0]);
116 | }
117 | return obj;
118 | };
119 | };
120 | // include the wholerow plugin by default
121 | // $.jstree.defaults.plugins.push("wholerow");
122 | }));
123 |
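The plugin takes no settings, so enabling it is a one-liner (sketch; the `#tree` selector is an illustrative assumption):

    $('#tree').jstree({ plugins : [ 'wholerow' ] });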
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/outro.js:
--------------------------------------------------------------------------------
1 | }));
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/sample.js:
--------------------------------------------------------------------------------
1 | /*global jQuery */
2 | // wrap in IIFE and pass jQuery as $
3 | (function ($, undefined) {
4 | "use strict";
5 |
6 | // some private plugin stuff if needed
7 | var private_var = null;
8 |
9 | // extending the defaults
10 | $.jstree.defaults.sample = {
11 | sample_option : 'sample_val'
12 | };
13 |
14 | // the actual plugin code
15 | $.jstree.plugins.sample = function (options, parent) {
16 | // own function
17 | this.sample_function = function (arg) {
18 | // you can chain this method if needed and available
19 | if(parent.sample_function) { parent.sample_function.call(this, arg); }
20 | };
21 |
22 | // *SPECIAL* FUNCTIONS
23 | this.init = function (el, options) {
24 | // do not forget parent
25 | parent.init.call(this, el, options);
26 | };
27 | // bind events if needed
28 | this.bind = function () {
29 | // call parent function first
30 | parent.bind.call(this);
31 | // do(stuff);
32 | };
33 | // unbind events if needed (all in jquery namespace are taken care of by the core)
34 | this.unbind = function () {
35 | // do(stuff);
36 | // call parent function last
37 | parent.unbind.call(this);
38 | };
39 | this.teardown = function () {
40 | // do not forget parent
41 | parent.teardown.call(this);
42 | };
43 | // state management - get and restore
44 | this.get_state = function () {
45 | // always get state from parent first
46 | var state = parent.get_state.call(this);
47 | // add own stuff to state
48 | state.sample = { 'var' : 'val' };
49 | return state;
50 | };
51 | this.set_state = function (state, callback) {
52 | // only process your part if parent returns true
53 | // (set_state runs repeatedly and may return false several times before it returns true)
54 | if(parent.set_state.call(this, state, callback)) {
55 | // check the key you set above
56 | if(state.sample) {
57 | // do(stuff); // like calling this.sample_function(state.sample.var);
58 | // remove your part of the state, call set_state again and return false - the next pass (with your key gone) will return true
59 | delete state.sample;
60 | this.set_state(state, callback);
61 | return false;
62 | }
63 | // return true if your state is gone (cleared in the previous step)
64 | return true;
65 | }
66 | // parent was false - return false too
67 | return false;
68 | };
69 | // node transportation
70 | this.get_json = function (obj, options, flat) {
71 | // get the node from the parent
72 | var tmp = parent.get_json.call(this, obj, options, flat), i, j;
73 | if($.isArray(tmp)) {
74 | for(i = 0, j = tmp.length; i < j; i++) {
75 | tmp[i].sample = 'value';
76 | }
77 | }
78 | else {
79 | tmp.sample = 'value';
80 | }
81 | // return the original / modified node
82 | return tmp;
83 | };
84 | };
85 |
86 | // attach to document ready if needed
87 | $(function () {
88 | // do(stuff);
89 | });
90 |
91 | // you can include the sample plugin in all instances by default
92 | $.jstree.defaults.plugins.push("sample");
93 | })(jQuery);
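A hedged sketch of the skeleton in action (not part of the upstream file): a hypothetical `logger` plugin that logs every activated node, registered globally and enabled per instance:

    $.jstree.plugins.logger = function (options, parent) {
        this.activate_node = function (obj, e) {
            console.log('activating:', this.get_node(obj).text);
            // always delegate to the parent implementation
            return parent.activate_node.call(this, obj, e);
        };
    };
    $('#tree').jstree({ plugins : [ 'logger' ] });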
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/themes/base.less:
--------------------------------------------------------------------------------
1 | // base jstree
2 | .jstree-node, .jstree-children, .jstree-container-ul { display:block; margin:0; padding:0; list-style-type:none; list-style-image:none; }
3 | .jstree-node { white-space:nowrap; }
4 | .jstree-anchor { display:inline-block; color:black; white-space:nowrap; padding:0 4px 0 1px; margin:0; vertical-align:top; }
5 | .jstree-anchor:focus { outline:0; }
6 | .jstree-anchor, .jstree-anchor:link, .jstree-anchor:visited, .jstree-anchor:hover, .jstree-anchor:active { text-decoration:none; color:inherit; }
7 | .jstree-icon { display:inline-block; text-decoration:none; margin:0; padding:0; vertical-align:top; text-align:center; }
8 | .jstree-icon:empty { display:inline-block; text-decoration:none; margin:0; padding:0; vertical-align:top; text-align:center; }
9 | .jstree-ocl { cursor:pointer; }
10 | .jstree-leaf > .jstree-ocl { cursor:default; }
11 | .jstree .jstree-open > .jstree-children { display:block; }
12 | .jstree .jstree-closed > .jstree-children,
13 | .jstree .jstree-leaf > .jstree-children { display:none; }
14 | .jstree-anchor > .jstree-themeicon { margin-right:2px; }
15 | .jstree-no-icons .jstree-themeicon,
16 | .jstree-anchor > .jstree-themeicon-hidden { display:none; }
17 | .jstree-hidden, .jstree-node.jstree-hidden { display:none; }
18 |
19 | // base jstree rtl
20 | .jstree-rtl {
21 | .jstree-anchor { padding:0 1px 0 4px; }
22 | .jstree-anchor > .jstree-themeicon { margin-left:2px; margin-right:0; }
23 | .jstree-node { margin-left:0; }
24 | .jstree-container-ul > .jstree-node { margin-right:0; }
25 | }
26 |
27 | // base jstree wholerow
28 | .jstree-wholerow-ul {
29 | position:relative;
30 | display:inline-block;
31 | min-width:100%;
32 | .jstree-leaf > .jstree-ocl { cursor:pointer; }
33 | .jstree-anchor, .jstree-icon { position:relative; }
34 | .jstree-wholerow { width:100%; cursor:pointer; position:absolute; left:0; -webkit-user-select:none; -moz-user-select:none; -ms-user-select:none; user-select:none; }
35 | }
36 |
37 | // base contextmenu
38 | .jstree-contextmenu .jstree-anchor {
39 | -webkit-user-select: none; /* disable selection/Copy of UIWebView */
40 | -webkit-touch-callout: none; /* disable the IOS popup when long-press on a link */
41 | }
42 | .vakata-context {
43 | display:none;
44 | &, ul { margin:0; padding:2px; position:absolute; background:#f5f5f5; border:1px solid #979797; box-shadow:2px 2px 2px #999999; }
45 | ul { list-style:none; left:100%; margin-top:-2.7em; margin-left:-4px; }
46 | .vakata-context-right ul { left:auto; right:100%; margin-left:auto; margin-right:-4px; }
47 | li {
48 | list-style:none;
49 | > a {
50 | display:block; padding:0 2em 0 2em; text-decoration:none; width:auto; color:black; white-space:nowrap; line-height:2.4em; text-shadow:1px 1px 0 white; border-radius:1px;
51 | &:hover { position:relative; background-color:#e8eff7; box-shadow:0 0 2px #0a6aa1; }
52 | &.vakata-context-parent { background-image:url("data:image/gif;base64,R0lGODlhCwAHAIAAACgoKP///yH5BAEAAAEALAAAAAALAAcAAAIORI4JlrqN1oMSnmmZDQUAOw=="); background-position:right center; background-repeat:no-repeat; }
53 | }
54 | > a:focus { outline:0; }
55 | }
56 | .vakata-context-hover > a { position:relative; background-color:#e8eff7; box-shadow:0 0 2px #0a6aa1; }
57 | .vakata-context-separator {
58 | > a, > a:hover { background:white; border:0; border-top:1px solid #e2e3e3; height:1px; min-height:1px; max-height:1px; padding:0; margin:0 0 0 2.4em; border-left:1px solid #e0e0e0; text-shadow:0 0 0 transparent; box-shadow:0 0 0 transparent; border-radius:0; }
59 | }
60 | .vakata-contextmenu-disabled {
61 | a, a:hover { color:silver; background-color:transparent; border:0; box-shadow:0 0 0; }
62 | }
63 | li > a {
64 | > i { text-decoration:none; display:inline-block; width:2.4em; height:2.4em; background:transparent; margin:0 0 0 -2em; vertical-align:top; text-align:center; line-height:2.4em; }
65 | > i:empty { width:2.4em; line-height:2.4em; }
66 | .vakata-contextmenu-sep { display:inline-block; width:1px; height:2.4em; background:white; margin:0 0.5em 0 0; border-left:1px solid #e2e3e3; }
67 | }
68 | .vakata-contextmenu-shortcut { font-size:0.8em; color:silver; opacity:0.5; display:none; }
69 | }
70 | .vakata-context-rtl {
71 | ul { left:auto; right:100%; margin-left:auto; margin-right:-4px; }
72 | li > a.vakata-context-parent { background-image:url("data:image/gif;base64,R0lGODlhCwAHAIAAACgoKP///yH5BAEAAAEALAAAAAALAAcAAAINjI+AC7rWHIsPtmoxLAA7"); background-position:left center; background-repeat:no-repeat; }
73 | .vakata-context-separator > a { margin:0 2.4em 0 0; border-left:0; border-right:1px solid #e2e3e3;}
74 | .vakata-context-left ul { right:auto; left:100%; margin-left:-4px; margin-right:auto; }
75 | li > a {
76 | > i { margin:0 -2em 0 0; }
77 | .vakata-contextmenu-sep { margin:0 0 0 0.5em; border-left-color:white; background:#e2e3e3; }
78 | }
79 | }
80 |
81 | // base drag'n'drop
82 | #jstree-marker { position: absolute; top:0; left:0; margin:-5px 0 0 0; padding:0; border-right:0; border-top:5px solid transparent; border-bottom:5px solid transparent; border-left:5px solid; width:0; height:0; font-size:0; line-height:0; }
83 | #jstree-dnd {
84 | line-height:16px;
85 | margin:0;
86 | padding:4px;
87 | .jstree-icon,
88 | .jstree-copy { display:inline-block; text-decoration:none; margin:0 2px 0 0; padding:0; width:16px; height:16px; }
89 | .jstree-ok { background:green; }
90 | .jstree-er { background:red; }
91 | .jstree-copy { margin:0 2px 0 2px; }
92 | }
93 |
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/themes/default-dark/32px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default-dark/32px.png
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/themes/default-dark/40px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default-dark/40px.png
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/themes/default-dark/style.less:
--------------------------------------------------------------------------------
1 | /* jsTree default dark theme */
2 | @theme-name: default-dark;
3 | @hovered-bg-color: #555;
4 | @hovered-shadow-color: #555;
5 | @disabled-color: #666666;
6 | @disabled-bg-color: #333333;
7 | @clicked-bg-color: #5fa2db;
8 | @clicked-shadow-color: #666666;
9 | @clicked-gradient-color-1: #5fa2db;
10 | @clicked-gradient-color-2: #5fa2db;
11 | @search-result-color: #ffffff;
12 | @mobile-wholerow-bg-color: #333333;
13 | @mobile-wholerow-shadow: #111111;
14 | @mobile-wholerow-bordert: #666;
15 | @mobile-wholerow-borderb: #000;
16 | @responsive: true;
17 | @image-path: "";
18 | @base-height: 40px;
19 |
20 | @import "../mixins.less";
21 | @import "../base.less";
22 | @import "../main.less";
23 |
24 | .jstree-@{theme-name} {
25 | background:#333;
26 | .jstree-anchor { color:#999; text-shadow:1px 1px 0 rgba(0,0,0,0.5); }
27 | .jstree-clicked, .jstree-checked { color:white; }
28 | .jstree-hovered { color:white; }
29 | #jstree-marker& {
30 | border-left-color:#999;
31 | background:transparent;
32 | }
33 | .jstree-anchor > .jstree-icon { opacity:0.75; }
34 | .jstree-clicked > .jstree-icon,
35 | .jstree-hovered > .jstree-icon,
36 | .jstree-checked > .jstree-icon { opacity:1; }
37 | }
38 | // theme variants
39 | .jstree-@{theme-name} {
40 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAACAQMAAAB49I5GAAAABlBMVEUAAACZmZl+9SADAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMOBgAAGAAJMwQHdQAAAABJRU5ErkJggg=="); }
41 | &.jstree-rtl .jstree-last { background:transparent; }
42 | }
43 | .jstree-@{theme-name}-small {
44 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABIAAAACAQMAAABv1h6PAAAABlBMVEUAAACZmZl+9SADAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMHBgAAiABBI4gz9AAAAABJRU5ErkJggg=="); }
45 | &.jstree-rtl .jstree-last { background:transparent; }
46 | }
47 | .jstree-@{theme-name}-large {
48 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAACAQMAAAAD0EyKAAAABlBMVEUAAACZmZl+9SADAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjgIIGBgABCgCBvVLXcAAAAABJRU5ErkJggg=="); }
49 | &.jstree-rtl .jstree-last { background:transparent; }
50 | }
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/themes/default-dark/throbber.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default-dark/throbber.gif
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/themes/default/32px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default/32px.png
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/themes/default/40px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default/40px.png
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/themes/default/style.less:
--------------------------------------------------------------------------------
1 | /* jsTree default theme */
2 | @theme-name: default;
3 | @hovered-bg-color: #e7f4f9;
4 | @hovered-shadow-color: #cccccc;
5 | @disabled-color: #666666;
6 | @disabled-bg-color: #efefef;
7 | @clicked-bg-color: #beebff;
8 | @clicked-shadow-color: #999999;
9 | @clicked-gradient-color-1: #beebff;
10 | @clicked-gradient-color-2: #a8e4ff;
11 | @search-result-color: #8b0000;
12 | @mobile-wholerow-bg-color: #ebebeb;
13 | @mobile-wholerow-shadow: #666666;
14 | @mobile-wholerow-bordert: rgba(255,255,255,0.7);
15 | @mobile-wholerow-borderb: rgba(64,64,64,0.2);
16 | @responsive: true;
17 | @image-path: "";
18 | @base-height: 40px;
19 |
20 | @import "../mixins.less";
21 | @import "../base.less";
22 | @import "../main.less";
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/themes/default/throbber.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/src/themes/default/throbber.gif
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/themes/main.less:
--------------------------------------------------------------------------------
1 | .jstree-@{theme-name} {
2 | .jstree-node,
3 | .jstree-icon { background-repeat:no-repeat; background-color:transparent; }
4 | .jstree-anchor,
5 | .jstree-animated,
6 | .jstree-wholerow { transition:background-color 0.15s, box-shadow 0.15s; }
7 | .jstree-hovered { background:@hovered-bg-color; border-radius:2px; box-shadow:inset 0 0 1px @hovered-shadow-color; }
8 | .jstree-context { background:@hovered-bg-color; border-radius:2px; box-shadow:inset 0 0 1px @hovered-shadow-color; }
9 | .jstree-clicked { background:@clicked-bg-color; border-radius:2px; box-shadow:inset 0 0 1px @clicked-shadow-color; }
10 | .jstree-no-icons .jstree-anchor > .jstree-themeicon { display:none; }
11 | .jstree-disabled {
12 | background:transparent; color:@disabled-color;
13 | &.jstree-hovered { background:transparent; box-shadow:none; }
14 | &.jstree-clicked { background:@disabled-bg-color; }
15 | > .jstree-icon { opacity:0.8; filter: url("data:image/svg+xml;utf8,#jstree-grayscale"); /* Firefox 10+ */ filter: gray; /* IE6-9 */ -webkit-filter: grayscale(100%); /* Chrome 19+ & Safari 6+ */ }
16 | }
17 | // search
18 | .jstree-search { font-style:italic; color:@search-result-color; font-weight:bold; }
19 | // checkboxes
20 | .jstree-no-checkboxes .jstree-checkbox { display:none !important; }
21 | &.jstree-checkbox-no-clicked {
22 | .jstree-clicked {
23 | background:transparent;
24 | box-shadow:none;
25 | &.jstree-hovered { background:@hovered-bg-color; }
26 | }
27 | > .jstree-wholerow-ul .jstree-wholerow-clicked {
28 | background:transparent;
29 | &.jstree-wholerow-hovered { background:@hovered-bg-color; }
30 | }
31 | }
32 | // stripes
33 | > .jstree-striped { min-width:100%; display:inline-block; background:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAkCAMAAAB/qqA+AAAABlBMVEUAAAAAAAClZ7nPAAAAAnRSTlMNAMM9s3UAAAAXSURBVHjajcEBAQAAAIKg/H/aCQZ70AUBjAATb6YPDgAAAABJRU5ErkJggg==") left top repeat; }
34 | // wholerow
35 | > .jstree-wholerow-ul .jstree-hovered,
36 | > .jstree-wholerow-ul .jstree-clicked { background:transparent; box-shadow:none; border-radius:0; }
37 | .jstree-wholerow { -moz-box-sizing:border-box; -webkit-box-sizing:border-box; box-sizing:border-box; }
38 | .jstree-wholerow-hovered { background:@hovered-bg-color; }
39 | .jstree-wholerow-clicked { .gradient(@clicked-gradient-color-1, @clicked-gradient-color-2); }
40 | }
41 |
42 | // theme variants
43 | .jstree-@{theme-name} {
44 | .jstree-theme(24px, "@{image-path}32px.png", 32px);
45 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAACAQMAAAB49I5GAAAABlBMVEUAAAAdHRvEkCwcAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMOBgAAGAAJMwQHdQAAAABJRU5ErkJggg=="); }
46 | &.jstree-rtl .jstree-last { background:transparent; }
47 | }
48 | .jstree-@{theme-name}-small {
49 | .jstree-theme(18px, "@{image-path}32px.png", 32px);
50 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABIAAAACAQMAAABv1h6PAAAABlBMVEUAAAAdHRvEkCwcAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMHBgAAiABBI4gz9AAAAABJRU5ErkJggg=="); }
51 | &.jstree-rtl .jstree-last { background:transparent; }
52 | }
53 | .jstree-@{theme-name}-large {
54 | .jstree-theme(32px, "@{image-path}32px.png", 32px);
55 | &.jstree-rtl .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAACAAAAACAQMAAAAD0EyKAAAABlBMVEUAAAAdHRvEkCwcAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjgIIGBgABCgCBvVLXcAAAAABJRU5ErkJggg=="); }
56 | &.jstree-rtl .jstree-last { background:transparent; }
57 | }
58 |
59 | // mobile theme attempt
60 | @media (max-width: 768px) {
61 | #jstree-dnd.jstree-dnd-responsive when (@responsive = true) {
62 | line-height:@base-height; font-weight:bold; font-size:1.1em; text-shadow:1px 1px white;
63 | > i { background:transparent; width:@base-height; height:@base-height; }
64 | > .jstree-ok { background-image:url("@{image-path}@{base-height}.png"); background-position:0 -(@base-height * 5); background-size:(@base-height * 3) (@base-height * 6); }
65 | > .jstree-er { background-image:url("@{image-path}@{base-height}.png"); background-position:-(@base-height * 1) -(@base-height * 5); background-size:(@base-height * 3) (@base-height * 6); }
66 | }
67 | #jstree-marker.jstree-dnd-responsive when (@responsive = true) {
68 | border-left-width:10px;
69 | border-top-width:10px;
70 | border-bottom-width:10px;
71 | margin-top:-10px;
72 | }
73 | }
74 |
75 | .jstree-@{theme-name}-responsive when (@responsive = true) {
76 | @import "responsive.less";
77 | }
78 |
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/themes/mixins.less:
--------------------------------------------------------------------------------
1 | .gradient (@color1; @color2) {
2 | background:@color1;
3 | background: -webkit-linear-gradient(top, @color1 0%,@color2 100%);
4 | background: linear-gradient(to bottom, @color1 0%,@color2 100%);
5 | }
6 |
7 | .jstree-theme (@base-height, @image, @image-height) {
8 | @correction: (@image-height - @base-height) / 2;
9 |
10 | .jstree-node { min-height:@base-height; line-height:@base-height; margin-left:@base-height; min-width:@base-height; }
11 | .jstree-anchor { line-height:@base-height; height:@base-height; }
12 | .jstree-icon { width:@base-height; height:@base-height; line-height:@base-height; }
13 | .jstree-icon:empty { width:@base-height; height:@base-height; line-height:@base-height; }
14 | &.jstree-rtl .jstree-node { margin-right:@base-height; }
15 | .jstree-wholerow { height:@base-height; }
16 |
17 | .jstree-node,
18 | .jstree-icon { background-image:url("@{image}"); }
19 | .jstree-node { background-position:-(@image-height * 9 + @correction) -@correction; background-repeat:repeat-y; }
20 | .jstree-last { background:transparent; }
21 |
22 | .jstree-open > .jstree-ocl { background-position:-(@image-height * 4 + @correction) -@correction; }
23 | .jstree-closed > .jstree-ocl { background-position:-(@image-height * 3 + @correction) -@correction; }
24 | .jstree-leaf > .jstree-ocl { background-position:-(@image-height * 2 + @correction) -@correction; }
25 |
26 | .jstree-themeicon { background-position:-(@image-height * 8 + @correction) -@correction; }
27 |
28 | > .jstree-no-dots {
29 | .jstree-node,
30 | .jstree-leaf > .jstree-ocl { background:transparent; }
31 | .jstree-open > .jstree-ocl { background-position:-(@image-height * 1 + @correction) -@correction; }
32 | .jstree-closed > .jstree-ocl { background-position:-@correction -@correction; }
33 | }
34 |
35 | .jstree-disabled {
36 | background:transparent;
37 | &.jstree-hovered {
38 | background:transparent;
39 | }
40 | &.jstree-clicked {
41 | background:#efefef;
42 | }
43 | }
44 |
45 | .jstree-checkbox {
46 | background-position:-(@image-height * 5 + @correction) -@correction;
47 | &:hover { background-position:-(@image-height * 5 + @correction) -(@image-height * 1 + @correction); }
48 | }
49 |
50 | &.jstree-checkbox-selection .jstree-clicked, .jstree-checked {
51 | > .jstree-checkbox {
52 | background-position:-(@image-height * 7 + @correction) -@correction;
53 | &:hover { background-position:-(@image-height * 7 + @correction) -(@image-height * 1 + @correction); }
54 | }
55 | }
56 | .jstree-anchor {
57 | > .jstree-undetermined {
58 | background-position:-(@image-height * 6 + @correction) -@correction;
59 | &:hover {
60 | background-position:-(@image-height * 6 + @correction) -(@image-height * 1 + @correction);
61 | }
62 | }
63 | }
64 | .jstree-checkbox-disabled { opacity:0.8; filter: url("data:image/svg+xml;utf8,#jstree-grayscale"); /* Firefox 10+ */ filter: gray; /* IE6-9 */ -webkit-filter: grayscale(100%); /* Chrome 19+ & Safari 6+ */ }
65 |
66 | > .jstree-striped { background-size:auto (@base-height * 2); }
67 |
68 | &.jstree-rtl {
69 | .jstree-node { background-image:url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABgAAAACAQMAAAB49I5GAAAABlBMVEUAAAAdHRvEkCwcAAAAAXRSTlMAQObYZgAAAAxJREFUCNdjAAMOBgAAGAAJMwQHdQAAAABJRU5ErkJggg=="); background-position: 100% 1px; background-repeat:repeat-y; }
70 | .jstree-last { background:transparent; }
71 | .jstree-open > .jstree-ocl { background-position:-(@image-height * 4 + @correction) -(@image-height * 1 + @correction); }
72 | .jstree-closed > .jstree-ocl { background-position:-(@image-height * 3 + @correction) -(@image-height * 1 + @correction); }
73 | .jstree-leaf > .jstree-ocl { background-position:-(@image-height * 2 + @correction) -(@image-height * 1 + @correction); }
74 | > .jstree-no-dots {
75 | .jstree-node,
76 | .jstree-leaf > .jstree-ocl { background:transparent; }
77 | .jstree-open > .jstree-ocl { background-position:-(@image-height * 1 + @correction) -(@image-height * 1 + @correction); }
78 | .jstree-closed > .jstree-ocl { background-position:-@correction -(@image-height * 1 + @correction); }
79 | }
80 | }
81 | .jstree-themeicon-custom { background-color:transparent; background-image:none; background-position:0 0; }
82 |
83 | > .jstree-container-ul .jstree-loading > .jstree-ocl { background:url("@{image-path}throbber.gif") center center no-repeat; }
84 |
85 | .jstree-file { background:url("@{image}") -(@image-height * 3 + @correction) -(@image-height * 2 + @correction) no-repeat; }
86 | .jstree-folder { background:url("@{image}") -(@image-height * 8 + @correction) -(@correction) no-repeat; }
87 |
88 | > .jstree-container-ul > .jstree-node { margin-left:0; margin-right:0; }
89 |
90 | // drag'n'drop
91 | #jstree-dnd& {
92 | line-height:@base-height; padding:0 4px;
93 | .jstree-ok,
94 | .jstree-er { background-image:url("@{image-path}32px.png"); background-repeat:no-repeat; background-color:transparent; }
95 | i { background:transparent; width:@base-height; height:@base-height; line-height:@base-height; }
96 | .jstree-ok { background-position: -(@correction) -(@image-height * 2 + @correction); }
97 | .jstree-er { background-position: -(@image-height * 1 + @correction) -(@image-height * 2 + @correction); }
98 | }
99 |
100 | // ellipsis
101 | .jstree-ellipsis { overflow: hidden; }
102 | // base height + PADDINGS!
103 | .jstree-ellipsis .jstree-anchor { width: calc(100% ~"-" (@base-height + 5px)); text-overflow: ellipsis; overflow: hidden; }
104 | .jstree-ellipsis.jstree-no-icons .jstree-anchor { width: calc(100% ~"-" 5px); }
105 | }
106 |
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/themes/responsive.less:
--------------------------------------------------------------------------------
1 | @media (max-width: 768px) {
2 | // background image
3 | .jstree-icon { background-image:url("@{image-path}@{base-height}.png"); }
4 |
5 | .jstree-node,
6 | .jstree-leaf > .jstree-ocl { background:transparent; }
7 |
8 | .jstree-node { min-height:@base-height; line-height:@base-height; margin-left:@base-height; min-width:@base-height; white-space:nowrap; }
9 | .jstree-anchor { line-height:@base-height; height:@base-height; }
10 | .jstree-icon, .jstree-icon:empty { width:@base-height; height:@base-height; line-height:@base-height; }
11 |
12 | > .jstree-container-ul > .jstree-node { margin-left:0; }
13 | &.jstree-rtl .jstree-node { margin-left:0; margin-right:@base-height; background:transparent; }
14 | &.jstree-rtl .jstree-container-ul > .jstree-node { margin-right:0; }
15 |
16 | .jstree-ocl,
17 | .jstree-themeicon,
18 | .jstree-checkbox { background-size:(@base-height * 3) (@base-height * 6); }
19 | .jstree-leaf > .jstree-ocl,
20 | &.jstree-rtl .jstree-leaf > .jstree-ocl { background:transparent; }
21 | .jstree-open > .jstree-ocl { background-position:0 0px !important; }
22 | .jstree-closed > .jstree-ocl { background-position:0 -(@base-height * 1) !important; }
23 | &.jstree-rtl .jstree-closed > .jstree-ocl { background-position:-(@base-height * 1) 0px !important; }
24 |
25 | .jstree-themeicon { background-position:-(@base-height * 1) -(@base-height * 1); }
26 |
27 | .jstree-checkbox, .jstree-checkbox:hover { background-position:-(@base-height * 1) -(@base-height * 2); }
28 | &.jstree-checkbox-selection .jstree-clicked > .jstree-checkbox,
29 | &.jstree-checkbox-selection .jstree-clicked > .jstree-checkbox:hover,
30 | .jstree-checked > .jstree-checkbox,
31 | .jstree-checked > .jstree-checkbox:hover { background-position:0 -(@base-height * 2); }
32 | .jstree-anchor > .jstree-undetermined, .jstree-anchor > .jstree-undetermined:hover { background-position:0 -(@base-height * 3); }
33 |
34 | .jstree-anchor { font-weight:bold; font-size:1.1em; text-shadow:1px 1px white; }
35 |
36 | > .jstree-striped { background:transparent; }
37 | .jstree-wholerow { border-top:1px solid @mobile-wholerow-bordert; border-bottom:1px solid @mobile-wholerow-borderb; background:@mobile-wholerow-bg-color; height:@base-height; }
38 | .jstree-wholerow-hovered { background:@hovered-bg-color; }
39 | .jstree-wholerow-clicked { background:@clicked-bg-color; }
40 |
41 | // thanks to PHOTONUI
42 | .jstree-children .jstree-last > .jstree-wholerow { box-shadow: inset 0 -6px 3px -5px @mobile-wholerow-shadow; }
43 | .jstree-children .jstree-open > .jstree-wholerow { box-shadow: inset 0 6px 3px -5px @mobile-wholerow-shadow; border-top:0; }
44 | .jstree-children .jstree-open + .jstree-open { box-shadow:none; }
45 |
46 | // experiment
47 | .jstree-node,
48 | .jstree-icon,
49 | .jstree-node > .jstree-ocl,
50 | .jstree-themeicon,
51 | .jstree-checkbox { background-image:url("@{image-path}@{base-height}.png"); background-size:(@base-height * 3) (@base-height * 6); }
52 |
53 | .jstree-node { background-position:-(@base-height * 2) 0; background-repeat:repeat-y; }
54 | .jstree-last { background:transparent; }
55 | .jstree-leaf > .jstree-ocl { background-position:-(@base-height * 1) -(@base-height * 3); }
56 | .jstree-last > .jstree-ocl { background-position:-(@base-height * 1) -(@base-height * 4); }
57 | /*
58 | .jstree-open > .jstree-ocl,
59 | .jstree-closed > .jstree-ocl { border-radius:20px; background-color:white; }
60 | */
61 |
62 | .jstree-themeicon-custom { background-color:transparent; background-image:none; background-position:0 0; }
63 | .jstree-file { background:url("@{image-path}@{base-height}.png") 0 -(@base-height * 4) no-repeat; background-size:(@base-height * 3) (@base-height * 6); }
64 | .jstree-folder { background:url("@{image-path}@{base-height}.png") -(@base-height * 1) -(@base-height * 1) no-repeat; background-size:(@base-height * 3) (@base-height * 6); }
65 |
66 | > .jstree-container-ul > .jstree-node { margin-left:0; margin-right:0; }
67 | }
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/src/vakata-jstree.js:
--------------------------------------------------------------------------------
1 | (function (factory) {
2 | "use strict";
3 | if (typeof define === 'function' && define.amd) {
4 | define('jstree.checkbox', ['jquery','jstree'], factory);
5 | }
6 | else if(typeof exports === 'object') {
7 | factory(require('jquery'), require('jstree'));
8 | }
9 | else {
10 | factory(jQuery);
11 | }
12 | }(function ($, undefined) {
13 | "use strict";
14 | if(document.registerElement && Object && Object.create) {
15 | var proto = Object.create(HTMLElement.prototype);
16 | proto.createdCallback = function () {
17 | var c = { core : {}, plugins : [] }, i;
18 | for(i in $.jstree.plugins) {
19 | if($.jstree.plugins.hasOwnProperty(i) && this.attributes[i]) {
20 | c.plugins.push(i);
21 | if(this.getAttribute(i) && JSON.parse(this.getAttribute(i))) {
22 | c[i] = JSON.parse(this.getAttribute(i));
23 | }
24 | }
25 | }
26 | for(i in $.jstree.defaults.core) {
27 | if($.jstree.defaults.core.hasOwnProperty(i) && this.attributes[i]) {
28 | c.core[i] = JSON.parse(this.getAttribute(i)) || this.getAttribute(i);
29 | }
30 | }
31 | $(this).jstree(c);
32 | };
33 | // proto.attributeChangedCallback = function (name, previous, value) { };
34 | try {
35 | document.registerElement("vakata-jstree", { prototype: proto });
36 | } catch(ignore) { }
37 | }
38 | }));
39 |
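A heavily hedged sketch: `document.registerElement` is the deprecated custom-elements-v0 API, so this only works in browsers that still expose it, and it assumes the wholerow plugin script is loaded. Attribute names map onto `$.jstree.defaults.core` keys and registered plugin names:

    // insert markup so the element is upgraded with its attributes already set
    document.body.insertAdjacentHTML(
        'beforeend',
        '<vakata-jstree data=\'["Node 1","Node 2"]\' wholerow="true"></vakata-jstree>'
    );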
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/test/unit/index.html:
--------------------------------------------------------------------------------
[HTML markup stripped during extraction - a QUnit test page titled "Basic Test Suite"; its #qunit-fixture contains the text "this had better work."]
/vakata-jstree-3.3.5/test/unit/libs/qunit.css:
--------------------------------------------------------------------------------
1 | /**
2 | * QUnit v1.12.0 - A JavaScript Unit Testing Framework
3 | *
4 | * http://qunitjs.com
5 | *
6 | * Copyright 2012 jQuery Foundation and other contributors
7 | * Released under the MIT license.
8 | * http://jquery.org/license
9 | */
10 |
11 | /** Font Family and Sizes */
12 |
13 | #qunit-tests, #qunit-header, #qunit-banner, #qunit-testrunner-toolbar, #qunit-userAgent, #qunit-testresult {
14 | font-family: "Helvetica Neue Light", "HelveticaNeue-Light", "Helvetica Neue", Calibri, Helvetica, Arial, sans-serif;
15 | }
16 |
17 | #qunit-testrunner-toolbar, #qunit-userAgent, #qunit-testresult, #qunit-tests li { font-size: small; }
18 | #qunit-tests { font-size: smaller; }
19 |
20 |
21 | /** Resets */
22 |
23 | #qunit-tests, #qunit-header, #qunit-banner, #qunit-userAgent, #qunit-testresult, #qunit-modulefilter {
24 | margin: 0;
25 | padding: 0;
26 | }
27 |
28 |
29 | /** Header */
30 |
31 | #qunit-header {
32 | padding: 0.5em 0 0.5em 1em;
33 |
34 | color: #8699a4;
35 | background-color: #0d3349;
36 |
37 | font-size: 1.5em;
38 | line-height: 1em;
39 | font-weight: normal;
40 |
41 | border-radius: 5px 5px 0 0;
42 | -moz-border-radius: 5px 5px 0 0;
43 | -webkit-border-top-right-radius: 5px;
44 | -webkit-border-top-left-radius: 5px;
45 | }
46 |
47 | #qunit-header a {
48 | text-decoration: none;
49 | color: #c2ccd1;
50 | }
51 |
52 | #qunit-header a:hover,
53 | #qunit-header a:focus {
54 | color: #fff;
55 | }
56 |
57 | #qunit-testrunner-toolbar label {
58 | display: inline-block;
59 | padding: 0 .5em 0 .1em;
60 | }
61 |
62 | #qunit-banner {
63 | height: 5px;
64 | }
65 |
66 | #qunit-testrunner-toolbar {
67 | padding: 0.5em 0 0.5em 2em;
68 | color: #5E740B;
69 | background-color: #eee;
70 | overflow: hidden;
71 | }
72 |
73 | #qunit-userAgent {
74 | padding: 0.5em 0 0.5em 2.5em;
75 | background-color: #2b81af;
76 | color: #fff;
77 | text-shadow: rgba(0, 0, 0, 0.5) 2px 2px 1px;
78 | }
79 |
80 | #qunit-modulefilter-container {
81 | float: right;
82 | }
83 |
84 | /** Tests: Pass/Fail */
85 |
86 | #qunit-tests {
87 | list-style-position: inside;
88 | }
89 |
90 | #qunit-tests li {
91 | padding: 0.4em 0.5em 0.4em 2.5em;
92 | border-bottom: 1px solid #fff;
93 | list-style-position: inside;
94 | }
95 |
96 | #qunit-tests.hidepass li.pass, #qunit-tests.hidepass li.running {
97 | display: none;
98 | }
99 |
100 | #qunit-tests li strong {
101 | cursor: pointer;
102 | }
103 |
104 | #qunit-tests li a {
105 | padding: 0.5em;
106 | color: #c2ccd1;
107 | text-decoration: none;
108 | }
109 | #qunit-tests li a:hover,
110 | #qunit-tests li a:focus {
111 | color: #000;
112 | }
113 |
114 | #qunit-tests li .runtime {
115 | float: right;
116 | font-size: smaller;
117 | }
118 |
119 | .qunit-assert-list {
120 | margin-top: 0.5em;
121 | padding: 0.5em;
122 |
123 | background-color: #fff;
124 |
125 | border-radius: 5px;
126 | -moz-border-radius: 5px;
127 | -webkit-border-radius: 5px;
128 | }
129 |
130 | .qunit-collapsed {
131 | display: none;
132 | }
133 |
134 | #qunit-tests table {
135 | border-collapse: collapse;
136 | margin-top: .2em;
137 | }
138 |
139 | #qunit-tests th {
140 | text-align: right;
141 | vertical-align: top;
142 | padding: 0 .5em 0 0;
143 | }
144 |
145 | #qunit-tests td {
146 | vertical-align: top;
147 | }
148 |
149 | #qunit-tests pre {
150 | margin: 0;
151 | white-space: pre-wrap;
152 | word-wrap: break-word;
153 | }
154 |
155 | #qunit-tests del {
156 | background-color: #e0f2be;
157 | color: #374e0c;
158 | text-decoration: none;
159 | }
160 |
161 | #qunit-tests ins {
162 | background-color: #ffcaca;
163 | color: #500;
164 | text-decoration: none;
165 | }
166 |
167 | /*** Test Counts */
168 |
169 | #qunit-tests b.counts { color: black; }
170 | #qunit-tests b.passed { color: #5E740B; }
171 | #qunit-tests b.failed { color: #710909; }
172 |
173 | #qunit-tests li li {
174 | padding: 5px;
175 | background-color: #fff;
176 | border-bottom: none;
177 | list-style-position: inside;
178 | }
179 |
180 | /*** Passing Styles */
181 |
182 | #qunit-tests li li.pass {
183 | color: #3c510c;
184 | background-color: #fff;
185 | border-left: 10px solid #C6E746;
186 | }
187 |
188 | #qunit-tests .pass { color: #528CE0; background-color: #D2E0E6; }
189 | #qunit-tests .pass .test-name { color: #366097; }
190 |
191 | #qunit-tests .pass .test-actual,
192 | #qunit-tests .pass .test-expected { color: #999999; }
193 |
194 | #qunit-banner.qunit-pass { background-color: #C6E746; }
195 |
196 | /*** Failing Styles */
197 |
198 | #qunit-tests li li.fail {
199 | color: #710909;
200 | background-color: #fff;
201 | border-left: 10px solid #EE5757;
202 | white-space: pre;
203 | }
204 |
205 | #qunit-tests > li:last-child {
206 | border-radius: 0 0 5px 5px;
207 | -moz-border-radius: 0 0 5px 5px;
208 | -webkit-border-bottom-right-radius: 5px;
209 | -webkit-border-bottom-left-radius: 5px;
210 | }
211 |
212 | #qunit-tests .fail { color: #000000; background-color: #EE5757; }
213 | #qunit-tests .fail .test-name,
214 | #qunit-tests .fail .module-name { color: #000000; }
215 |
216 | #qunit-tests .fail .test-actual { color: #EE5757; }
217 | #qunit-tests .fail .test-expected { color: green; }
218 |
219 | #qunit-banner.qunit-fail { background-color: #EE5757; }
220 |
221 |
222 | /** Result */
223 |
224 | #qunit-testresult {
225 | padding: 0.5em 0.5em 0.5em 2.5em;
226 |
227 | color: #2b81af;
228 | background-color: #D2E0E6;
229 |
230 | border-bottom: 1px solid white;
231 | }
232 | #qunit-testresult .module-name {
233 | font-weight: bold;
234 | }
235 |
236 | /** Fixture */
237 |
238 | #qunit-fixture {
239 | position: absolute;
240 | top: -10000px;
241 | left: -10000px;
242 | width: 1000px;
243 | height: 1000px;
244 | }
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/test/unit/test.js:
--------------------------------------------------------------------------------
1 | test('basic test', function() {
2 | expect(1);
3 | ok(true, 'this had better work.');
4 | });
5 |
6 |
7 | test('can access the DOM', function() {
8 | expect(1);
9 | var fixture = document.getElementById('qunit-fixture');
10 | equal(fixture.innerText || fixture.textContent, 'this had better work.', 'should be able to access the DOM.');
11 | });
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/test/visual/desktop/index.html:
--------------------------------------------------------------------------------
[HTML markup stripped during extraction - a visual test page titled "Light theme visual tests" with a nested node list (Node 01 through Node 05) used to instantiate jsTree.]
/vakata-jstree-3.3.5/test/visual/mobile/index.html:
--------------------------------------------------------------------------------
[HTML markup stripped during extraction - a visual test page titled "Mobile theme visual tests" with a nested node list (Node 01 through Node 05) used to instantiate jsTree.]
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/test/visual/screenshots/desktop/.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/desktop/.png
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/test/visual/screenshots/desktop/desktop.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/desktop/desktop.png
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/test/visual/screenshots/desktop/home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/desktop/home.png
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/test/visual/screenshots/mobile/.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/mobile/.png
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/test/visual/screenshots/mobile/home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/mobile/home.png
--------------------------------------------------------------------------------
/vakata-jstree-3.3.5/test/visual/screenshots/mobile/mobile.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MingChen0919/learning-apache-spark/675210da79cdb99b728deac0e5c1b0f9057a4357/vakata-jstree-3.3.5/test/visual/screenshots/mobile/mobile.png
--------------------------------------------------------------------------------