├── .ipynb_checkpoints
│ ├── Exercise - RDD-checkpoint.ipynb
│ ├── Module 1 - Foundations & RDD-checkpoint.ipynb
│ ├── Module 2 - PySpark DataFrames-checkpoint.ipynb
│ ├── PySpark-RDD-checkpoint.ipynb
│ ├── Untitled-checkpoint.ipynb
│ └── Untitled1-checkpoint.ipynb
├── 1.png
├── 1800.csv
├── 2.png
├── 25. PySpark ML.ipynb
├── Allstate insurance Amount Prediction - Regression.ipynb
├── Allstate+insurance+Amount+Prediction+-+Regression.ipynb
├── Allstate-Project.zip
├── Baby_Names__Beginning_2007.csv
├── Broadcast Join.ipynb
├── Exercise - RDD.ipynb
├── GraphFrame Application.ipynb
├── HR_comma_sep.csv
├── Module 1 - Foundations & RDD.ipynb
├── Module 2 - PySpark DataFrames.ipynb
├── Pandas UDFs Benchmark.ipynb
├── Payment.ipynb
├── Predict+Employee+Exit+-+Classification (1).ipynb
├── Predict+Employee+Exit+-+Classification+-2.ipynb
├── Preprocessing.ipynb
├── Project-PySpark.zip
├── PySpark-DataFrames.ipynb
├── PySpark-ML.ipynb
├── PySpark-MlLib.ipynb
├── PySpark-RDD.ipynb
├── PySpark-Structured+Streaming.ipynb
├── README.md
├── Recommendation+Engine+ (1).ipynb
├── Recommendation+Engine+.ipynb
├── SCIO-PySpark-DF-Day1.ipynb
├── SCIO-PySpark-DF-Day2.ipynb
├── Spark Architecture.pptx
├── Spark RDD.pptx
├── Spark Storage Data Formats.pptx
├── Spark-Introduction.pptx
├── Spark-Partitioning.pptx
├── Spark-Performance Tuning.pptx
├── Spark_SQL
├── TSP.ipynb
├── Tensorframes.ipynb
├── Titanic Data Analysis using DataFrames.html
├── Titanic Data Analysis using DataFrames.ipynb
├── Uber-Jan-Feb-FOIL.csv
├── Untitled.ipynb
├── Untitled1.ipynb
├── abc.txt
│ ├── ._SUCCESS.crc
│ ├── .part-00000.crc
│ ├── _SUCCESS
│ └── part-00000
├── all-world-cup-players.json
├── allstate_test.csv.zip
├── allstate_train.csv.zip
├── births_train.csv.gz
├── births_transformed.csv.gz
├── cass_code.py
├── cigna1-rdd.ipynb
├── cigna1.ipynb
├── customer-orders.csv
├── data
│ ├── 2010-12-01.csv
│ ├── 2015-summary.csv.txt
│ ├── 2015-summary.json.txt
│ ├── HR_comma_sep.csv
│ ├── graphx
│ │ ├── followers.txt
│ │ └── users.txt
│ ├── mllib
│ │ ├── als
│ │ │ ├── sample_movielens_ratings.txt
│ │ │ └── test.data
│ │ ├── gmm_data.txt
│ │ ├── kmeans_data.txt
│ │ ├── pagerank_data.txt
│ │ ├── pic_data.txt
│ │ ├── ridge-data
│ │ │ └── lpsa.data
│ │ ├── sample_binary_classification_data.txt
│ │ ├── sample_fpgrowth.txt
│ │ ├── sample_isotonic_regression_libsvm_data.txt
│ │ ├── sample_kmeans_data.txt
│ │ ├── sample_lda_data.txt
│ │ ├── sample_lda_libsvm_data.txt
│ │ ├── sample_libsvm_data.txt
│ │ ├── sample_linear_regression_data.txt
│ │ ├── sample_movielens_data.txt
│ │ ├── sample_multiclass_classification_data.txt
│ │ ├── sample_svm_data.txt
│ │ └── streaming_kmeans_data_test.txt
│ ├── payments-data.csv
│ ├── sales-data.csv
│ ├── sales-funnel.xlsx
│ ├── sales-of-shampoo-over-a-three-ye.csv
│ ├── sp500.csv
│ ├── store.csv
│ └── streaming
│   └── AFINN-111.txt
├── derby.log
├── entry.py
├── fakefriends.csv
├── hadoop-2.7.1.zip
├── hr_data_analysis.py
├── income-adult.data
├── kddcup.data_10_percent.gz
├── metastore_db
│ ├── README_DO_NOT_TOUCH_FILES.txt
│ ├── db.lck
│ ├── dbex.lck
│ ├── log
│ │ ├── README_DO_NOT_TOUCH_FILES.txt
│ │ ├── log.ctrl
│ │ ├── log1.dat
│ │ └── logmirror.ctrl
│ ├── seg0
│ │ ├── README_DO_NOT_TOUCH_FILES.txt
│ │ ├── c10.dat
│ │ ├── c101.dat
│ │ ├── c111.dat
│ │ ├── c121.dat
│ │ ├── c130.dat
│ │ ├── c141.dat
│ │ ├── c150.dat
│ │ ├── c161.dat
│ │ ├── c171.dat
│ │ ├── c180.dat
│ │ ├── c191.dat
│ │ ├── c1a1.dat
│ │ ├── c1b1.dat
│ │ ├── c1c0.dat
│ │ ├── c1d1.dat
│ │ ├── c1e0.dat
│ │ ├── c1f1.dat
│ │ ├── c20.dat
│ │ ├── c200.dat
│ │ ├── c211.dat
│ │ ├── c221.dat
│ │ ├── c230.dat
│ │ ├── c241.dat
│ │ ├── c251.dat
│ │ ├── c260.dat
│ │ ├── c271.dat
│ │ ├── c281.dat
│ │ ├── c290.dat
│ │ ├── c2a1.dat
│ │ ├── c2b1.dat
│ │ ├── c2c1.dat
│ │ ├── c2d0.dat
│ │ ├── c2e1.dat
│ │ ├── c2f0.dat
│ │ ├── c300.dat
│ │ ├── c31.dat
│ │ ├── c311.dat
│ │ ├── c321.dat
│ │ ├── c331.dat
│ │ ├── c340.dat
│ │ ├── c351.dat
│ │ ├── c361.dat
│ │ ├── c371.dat
│ │ ├── c380.dat
│ │ ├── c391.dat
│ │ ├── c3a1.dat
│ │ ├── c3b1.dat
│ │ ├── c3c0.dat
│ │ ├── c3d1.dat
│ │ ├── c3e1.dat
│ │ ├── c3f1.dat
│ │ ├── c400.dat
│ │ ├── c41.dat
│ │ ├── c411.dat
│ │ ├── c421.dat
│ │ ├── c430.dat
│ │ ├── c441.dat
│ │ ├── c451.dat
│ │ ├── c461.dat
│ │ ├── c470.dat
│ │ ├── c481.dat
│ │ ├── c490.dat
│ │ ├── c4a1.dat
│ │ ├── c4b0.dat
│ │ ├── c4c1.dat
│ │ ├── c4d1.dat
│ │ ├── c4e1.dat
│ │ ├── c4f0.dat
│ │ ├── c501.dat
│ │ ├── c51.dat
│ │ ├── c510.dat
│ │ ├── c521.dat
│ │ ├── c530.dat
│ │ ├── c541.dat
│ │ ├── c550.dat
│ │ ├── c561.dat
│ │ ├── c570.dat
│ │ ├── c581.dat
│ │ ├── c590.dat
│ │ ├── c5a1.dat
│ │ ├── c5b0.dat
│ │ ├── c5c1.dat
│ │ ├── c5d0.dat
│ │ ├── c5e1.dat
│ │ ├── c5f0.dat
│ │ ├── c60.dat
│ │ ├── c601.dat
│ │ ├── c610.dat
│ │ ├── c621.dat
│ │ ├── c630.dat
│ │ ├── c641.dat
│ │ ├── c650.dat
│ │ ├── c661.dat
│ │ ├── c670.dat
│ │ ├── c681.dat
│ │ ├── c690.dat
│ │ ├── c6a1.dat
│ │ ├── c6b0.dat
│ │ ├── c6c1.dat
│ │ ├── c6d0.dat
│ │ ├── c6e1.dat
│ │ ├── c6f0.dat
│ │ ├── c701.dat
│ │ ├── c71.dat
│ │ ├── c711.dat
│ │ ├── c721.dat
│ │ ├── c731.dat
│ │ ├── c741.dat
│ │ ├── c751.dat
│ │ ├── c761.dat
│ │ ├── c771.dat
│ │ ├── c781.dat
│ │ ├── c791.dat
│ │ ├── c7a1.dat
│ │ ├── c7b1.dat
│ │ ├── c7c1.dat
│ │ ├── c7d1.dat
│ │ ├── c7e1.dat
│ │ ├── c7f1.dat
│ │ ├── c801.dat
│ │ ├── c81.dat
│ │ ├── c811.dat
│ │ ├── c821.dat
│ │ ├── c831.dat
│ │ ├── c840.dat
│ │ ├── c851.dat
│ │ ├── c860.dat
│ │ ├── c871.dat
│ │ ├── c880.dat
│ │ ├── c891.dat
│ │ ├── c8a0.dat
│ │ ├── c8b1.dat
│ │ ├── c8c1.dat
│ │ ├── c8d1.dat
│ │ ├── c8e1.dat
│ │ ├── c8f1.dat
│ │ ├── c90.dat
│ │ ├── c901.dat
│ │ ├── c911.dat
│ │ ├── c920.dat
│ │ ├── c931.dat
│ │ ├── c940.dat
│ │ ├── c951.dat
│ │ ├── c960.dat
│ │ ├── c971.dat
│ │ ├── c981.dat
│ │ ├── c990.dat
│ │ ├── c9a1.dat
│ │ ├── c9b1.dat
│ │ ├── c9c0.dat
│ │ ├── c9d1.dat
│ │ ├── c9e0.dat
│ │ ├── c9f1.dat
│ │ ├── ca01.dat
│ │ ├── ca1.dat
│ │ ├── ca11.dat
│ │ ├── ca21.dat
│ │ ├── cb1.dat
│ │ ├── cc0.dat
│ │ ├── cd1.dat
│ │ ├── ce1.dat
│ │ └── cf0.dat
│ └── service.properties
├── nifi_script.py
├── om.py
├── pyspark1.png
├── pyspark2.png
├── resources
│ ├── employees.json
│ ├── full_user.avsc
│ ├── kv1.txt
│ ├── people.json
│ ├── people.txt
│ ├── user.avsc
│ ├── users.avro
│ └── users.parquet
├── setup.py
├── spark_hive.py
├── test_file.py
├── test_hr_data.csv
├── test_hr_data_analysis.py
├── titanic-survival-project.tar
├── titanic-train.csv
├── titanic.py
└── udt.py
/.ipynb_checkpoints/Exercise - RDD-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 2
6 | }
7 |
--------------------------------------------------------------------------------
/.ipynb_checkpoints/Untitled-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 2
6 | }
7 |
--------------------------------------------------------------------------------
/.ipynb_checkpoints/Untitled1-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [],
3 | "metadata": {},
4 | "nbformat": 4,
5 | "nbformat_minor": 2
6 | }
7 |
--------------------------------------------------------------------------------
/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/1.png
--------------------------------------------------------------------------------
/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/2.png
--------------------------------------------------------------------------------
/Allstate-Project.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Allstate-Project.zip
--------------------------------------------------------------------------------
/Broadcast Join.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"code","source":["from pyspark.sql.functions import broadcast"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":1},{"cell_type":"code","source":["df1 = spark.createDataFrame([('a',2),('b',3),('c',4),('c',44)],['A','B'])"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":2},{"cell_type":"code","source":["df1.show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n+---+---+\n A| B|\n+---+---+\n a| 2|\n b| 3|\n c| 4|\n c| 44|\n+---+---+\n\n
"]}}],"execution_count":3},{"cell_type":"code","source":["df2 = spark.createDataFrame([('a','aaaa'),('b','bbbb'),('c','cccc')],('A','C'))"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":4},{"cell_type":"code","source":["df1.join(df2, df1.A == df2.A,'inner').show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n+---+---+---+----+\n A| B| A| C|\n+---+---+---+----+\n c| 4| c|cccc|\n c| 44| c|cccc|\n b| 3| b|bbbb|\n a| 2| a|aaaa|\n+---+---+---+----+\n\n
"]}}],"execution_count":5},{"cell_type":"code","source":["df1.join(broadcast(df2),'A').show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n+---+---+----+\n A| B| C|\n+---+---+----+\n a| 2|aaaa|\n b| 3|bbbb|\n c| 4|cccc|\n c| 44|cccc|\n+---+---+----+\n\n
"]}}],"execution_count":6},{"cell_type":"code","source":["spark.catalog.listDatabases()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\nOut[7]: [Database(name=u'default', description=u'Default Hive database', locationUri=u'dbfs:/user/hive/warehouse')]\n
"]}}],"execution_count":7},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":8}],"metadata":{"name":"Broadcast Join","notebookId":3788746436117310},"nbformat":4,"nbformat_minor":0}
2 |
--------------------------------------------------------------------------------
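The Broadcast Join notebook above was exported from Databricks, where a `spark` session is pre-created. A minimal standalone sketch of the same pattern, assuming only a local PySpark installation (the session is built explicitly here):

from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

spark = SparkSession.builder.appName("broadcast-join-sketch").getOrCreate()

# Small dimension-style table (df2) joined onto a larger fact-style table (df1).
df1 = spark.createDataFrame([('a', 2), ('b', 3), ('c', 4), ('c', 44)], ['A', 'B'])
df2 = spark.createDataFrame([('a', 'aaaa'), ('b', 'bbbb'), ('c', 'cccc')], ['A', 'C'])

# broadcast() hints Spark to ship the small table whole to every executor,
# replacing the shuffle join with a map-side broadcast hash join.
df1.join(broadcast(df2), 'A').show()

spark.stop()

Joining on the column name 'A' (rather than `df1.A == df2.A`) also avoids the duplicate `A` column visible in the notebook's third output.
--------------------------------------------------------------------------------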
/Exercise - RDD.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "lines = sc.textFile('fakefriends.csv')"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 4,
17 | "metadata": {
18 | "collapsed": true
19 | },
20 | "outputs": [],
21 | "source": [
22 | "def parseLine(line):\n",
23 | " fields = line.split(',')\n",
24 | " age = int(fields[2])\n",
25 | " numFriends = int(fields[3])\n",
26 | " return (age, numFriends)"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 5,
32 | "metadata": {
33 | "collapsed": true
34 | },
35 | "outputs": [],
36 | "source": [
37 | "rdd = lines.map(parseLine)\n",
38 | "totalsByAge = rdd.mapValues(lambda x: (x, 1)).reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))\n",
39 | "averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1])\n",
40 | "results = averagesByAge.collect()"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 6,
46 | "metadata": {
47 | "collapsed": false
48 | },
49 | "outputs": [
50 | {
51 | "name": "stdout",
52 | "output_type": "stream",
53 | "text": [
54 | "(18, 343)\n",
55 | "(19, 213)\n",
56 | "(20, 165)\n",
57 | "(21, 350)\n",
58 | "(22, 206)\n",
59 | "(23, 246)\n",
60 | "(24, 233)\n",
61 | "(25, 197)\n",
62 | "(26, 242)\n",
63 | "(27, 228)\n",
64 | "(28, 209)\n",
65 | "(29, 215)\n",
66 | "(30, 235)\n",
67 | "(31, 267)\n",
68 | "(32, 207)\n",
69 | "(33, 325)\n",
70 | "(34, 245)\n",
71 | "(35, 211)\n",
72 | "(36, 246)\n",
73 | "(37, 249)\n",
74 | "(38, 193)\n",
75 | "(39, 169)\n",
76 | "(40, 250)\n",
77 | "(41, 268)\n",
78 | "(42, 303)\n",
79 | "(43, 230)\n",
80 | "(44, 282)\n",
81 | "(45, 309)\n",
82 | "(46, 223)\n",
83 | "(47, 233)\n",
84 | "(48, 281)\n",
85 | "(49, 184)\n",
86 | "(50, 254)\n",
87 | "(51, 302)\n",
88 | "(52, 340)\n",
89 | "(53, 222)\n",
90 | "(54, 278)\n",
91 | "(55, 295)\n",
92 | "(56, 306)\n",
93 | "(57, 258)\n",
94 | "(58, 116)\n",
95 | "(59, 220)\n",
96 | "(60, 202)\n",
97 | "(61, 256)\n",
98 | "(62, 220)\n",
99 | "(63, 384)\n",
100 | "(64, 281)\n",
101 | "(65, 298)\n",
102 | "(66, 276)\n",
103 | "(67, 214)\n",
104 | "(68, 269)\n",
105 | "(69, 235)\n"
106 | ]
107 | }
108 | ],
109 | "source": [
110 | "for result in results:\n",
111 | " print(result)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {
118 | "collapsed": true
119 | },
120 | "outputs": [],
121 | "source": []
122 | }
123 | ],
124 | "metadata": {
125 | "kernelspec": {
126 | "display_name": "Python 2",
127 | "language": "python",
128 | "name": "python2"
129 | },
130 | "language_info": {
131 | "codemirror_mode": {
132 | "name": "ipython",
133 | "version": 2
134 | },
135 | "file_extension": ".py",
136 | "mimetype": "text/x-python",
137 | "name": "python",
138 | "nbconvert_exporter": "python",
139 | "pygments_lexer": "ipython2",
140 | "version": "2.7.13"
141 | }
142 | },
143 | "nbformat": 4,
144 | "nbformat_minor": 2
145 | }
146 |
--------------------------------------------------------------------------------
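Pulled out of the notebook, the same average-friends-by-age job can run as a standalone script. A sketch, assuming the repository's `fakefriends.csv` sits in the working directory (fields 2 and 3 are age and number of friends, as in the notebook's `parseLine`); unlike the Python 2 notebook, it uses float division so the averages are not truncated:

from pyspark import SparkContext

sc = SparkContext(appName="friends-by-age-sketch")

def parse_line(line):
    # fields[2] = age, fields[3] = number of friends
    fields = line.split(',')
    return (int(fields[2]), int(fields[3]))

lines = sc.textFile('fakefriends.csv')

# (age, numFriends) -> (age, (sum, count)) -> (age, average)
totals_by_age = (lines.map(parse_line)
                      .mapValues(lambda n: (n, 1))
                      .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])))
averages_by_age = totals_by_age.mapValues(lambda t: t[0] / float(t[1]))

for age, avg in sorted(averages_by_age.collect()):
    print(age, avg)

sc.stop()
--------------------------------------------------------------------------------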
/GraphFrame Application.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"code","source":["# Set File Paths\ntripdelaysFilePath = \"/databricks-datasets/flights/departuredelays.csv\"\nairportsnaFilePath = \"/databricks-datasets/flights/airport-codes-na.txt\"\n\n# Obtain airports dataset\n# Note, this dataset is tab-delimited with a header\nairportsna = spark.read.csv(airportsnaFilePath, header='true', inferSchema='true', sep='\\t')\nairportsna.createOrReplaceTempView(\"airports_na\")\n\n# Obtain departure Delays data\n# Note, this dataset is comma-delimited with a header\ndepartureDelays = spark.read.csv(tripdelaysFilePath, header='true')\ndepartureDelays.createOrReplaceTempView(\"departureDelays\")\ndepartureDelays.cache()"],"metadata":{},"outputs":[],"execution_count":1},{"cell_type":"code","source":["departureDelays.show()"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":["airportsna.show()"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["#\n# Available IATA codes from the departuredelays sample dataset\ntripIATA = spark.sql(\"select distinct iata from (select distinct origin as iata from departureDelays union all select distinct destination as iata from departureDelays) a\")\ntripIATA.createOrReplaceTempView(\"tripIATA\")\ntripIATA.show()"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"code","source":["\n# Only include airports with atleast one trip from the \n# `departureDelays` dataset\nairports = spark.sql(\"select f.IATA, f.City, f.State, f.Country from airports_na f join tripIATA t on t.IATA = f.IATA\")\nairports.createOrReplaceTempView(\"airports\")\nairports.cache()\nairports.show()"],"metadata":{},"outputs":[],"execution_count":5},{"cell_type":"code","source":["# Build `departureDelays_geo` DataFrame\n# Obtain key attributes such as Date of flight, delays, distance, \n# and airport information (Origin, Destination) \ndepartureDelays_geo = spark.sql(\"select cast(f.date as int) as tripid, cast(concat(concat(concat(concat(concat(concat('2014-', concat(concat(substr(cast(f.date as string), 1, 2), '-')), substr(cast(f.date as string), 3, 2)), ''), substr(cast(f.date as string), 5, 2)), ':'), substr(cast(f.date as string), 7, 2)), ':00') as timestamp) as `localdate`, cast(f.delay as int), cast(f.distance as int), f.origin as src, f.destination as dst, o.city as city_src, d.city as city_dst, o.state as state_src, d.state as state_dst from departuredelays f join airports o on o.iata = f.origin join airports d on d.iata = f.destination\") \n\n# Create Temporary View and cache\ndepartureDelays_geo.createOrReplaceTempView(\"departureDelays_geo\")\ndepartureDelays_geo.cache()\ndepartureDelays.printSchema()"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"code","source":["# Review the top 10 rows of the `departureDelays_geo` DataFrame\ndepartureDelays_geo.show(10)"],"metadata":{},"outputs":[],"execution_count":7},{"cell_type":"code","source":["# Note, ensure you have already installed \n# the GraphFrames spark-package\nfrom pyspark.sql.functions import *\nfrom graphframes import *\n\n# Create Vertices (airports) and Edges (flights)\ntripVertices = airports.withColumnRenamed(\"IATA\", \"id\").distinct()\ntripEdges = departureDelays_geo.select(\"tripid\", \"delay\", \"src\", \"dst\", \"city_dst\", \"state_dst\")\n\n# Cache Vertices and 
Edges\ntripEdges.cache()\ntripVertices.cache()"],"metadata":{},"outputs":[],"execution_count":8},{"cell_type":"code","source":["display(tripEdges)"],"metadata":{},"outputs":[],"execution_count":9},{"cell_type":"code","source":["tripGraph = GraphFrame(tripVertices, tripEdges)"],"metadata":{},"outputs":[],"execution_count":10},{"cell_type":"code","source":["print \"Airports: %d\" % tripGraph.vertices.count()\nprint \"Trips: %d\" % tripGraph.edges.count()"],"metadata":{},"outputs":[],"execution_count":11},{"cell_type":"code","source":["tripGraph.edges.groupBy().max(\"delay\").collect()"],"metadata":{},"outputs":[],"execution_count":12},{"cell_type":"code","source":["print \"On-time / Early Flights: %d\" % tripGraph.edges.filter(\"delay <= 0\").count()\nprint \"Delayed Flights: %d\" % tripGraph.edges.filter(\"delay > 0\").count()"],"metadata":{},"outputs":[],"execution_count":13},{"cell_type":"markdown","source":["Which flights departing Seattle are most likely to have significant delay ?
"],"metadata":{}},{"cell_type":"code","source":["tripGraph.edges\\\n .filter(\"src = 'SEA' and delay > 0\")\\\n .groupBy(\"src\", \"dst\")\\\n .avg(\"delay\")\\\n .sort(desc(\"avg(delay)\"))\\\n .show(5)"],"metadata":{},"outputs":[],"execution_count":15},{"cell_type":"markdown","source":["Which states tend to have significant delays departing from Seattle
"],"metadata":{}},{"cell_type":"code","source":["# States with the longest cumulative delays (with individual\n# delays > 100 minutes) (origin: Seattle)\ndisplay(tripGraph.edges.filter(\"src = 'SEA' and delay > 100\"))"],"metadata":{},"outputs":[],"execution_count":17},{"cell_type":"code","source":["display(tripGraph.degrees.sort(desc(\"degree\")).limit(20))"],"metadata":{},"outputs":[],"execution_count":18},{"cell_type":"markdown","source":["Determining the top transfer airports
"],"metadata":{}},{"cell_type":"code","source":["inDeg = tripGraph.inDegrees #flights coming to airport\noutDeg = tripGraph.outDegrees #flights leaving airport"],"metadata":{},"outputs":[],"execution_count":20},{"cell_type":"code","source":["degreeRatio = inDeg.join(outDeg, inDeg.id == outDeg.id).drop(outDeg.id).selectExpr(\"id\", \"double(inDegree)/double(outDegree) as degreeRatio\").cache()\ndegreeRatio.show(10)"],"metadata":{},"outputs":[],"execution_count":21},{"cell_type":"code","source":["degreeRatio.join(airports, degreeRatio.id == airports.IATA).show(10)"],"metadata":{},"outputs":[],"execution_count":22},{"cell_type":"code","source":["transferAirports = degreeRatio.join(airports, degreeRatio.id == airports.IATA).selectExpr(\"id\",\"city\",\"degreeRatio\").filter(\"degreeRatio between 0.9 and 1.1\")\ndisplay(transferAirports.orderBy(\"degreeRatio\").limit(10))"],"metadata":{},"outputs":[],"execution_count":23},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":24},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":25}],"metadata":{"name":"GraphFrame Application","notebookId":3438443876603409},"nbformat":4,"nbformat_minor":0}
2 |
--------------------------------------------------------------------------------
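The GraphFrame notebook depends on Databricks-hosted flight data and the `graphframes` package. A toy sketch of the same vertex/edge construction and the degree-ratio idea, assuming graphframes is on the classpath (e.g. started with `pyspark --packages graphframes:graphframes:0.8.2-spark3.2-s_2.12`; the version here is illustrative):

from pyspark.sql import SparkSession
from graphframes import GraphFrame

spark = SparkSession.builder.appName("graphframe-sketch").getOrCreate()

# Vertices are airports (the id column must be named 'id'); edges are flights
# (the endpoint columns must be named 'src' and 'dst').
tripVertices = spark.createDataFrame(
    [('SEA', 'Seattle'), ('SFO', 'San Francisco'), ('JFK', 'New York')],
    ['id', 'city'])
tripEdges = spark.createDataFrame(
    [('SEA', 'SFO', 31), ('SFO', 'JFK', -2), ('JFK', 'SEA', 5), ('SFO', 'SEA', 0)],
    ['src', 'dst', 'delay'])

tripGraph = GraphFrame(tripVertices, tripEdges)

# Transfer-airport heuristic from the notebook: an inDegree/outDegree ratio
# close to 1 means roughly as many arrivals as departures.
degreeRatio = (tripGraph.inDegrees.join(tripGraph.outDegrees, 'id')
               .selectExpr('id', 'double(inDegree)/double(outDegree) as degreeRatio'))
degreeRatio.show()
--------------------------------------------------------------------------------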
/Pandas UDFs Benchmark.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"code","source":["from pyspark.sql.types import *\nfrom pyspark.sql.functions import col, count, rand, collect_list, explode, struct, count, lit\nfrom pyspark.sql.functions import pandas_udf, PandasUDFType"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":1},{"cell_type":"code","source":["df = spark.range(0, 10 * 1000 * 1000).withColumn('id', (col('id') / 10000).cast('integer')).withColumn('v', rand())\ndf.cache()\ndf.count()\n\ndf.show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n+---+--------------------+\n id| v|\n+---+--------------------+\n 0| 0.2606134722164434|\n 0| 0.8339772953748603|\n 0| 0.07305638728029662|\n 0|0.029602261658446816|\n 0| 0.7621764639789104|\n 0| 0.5869532091424473|\n 0| 0.5820613668108897|\n 0| 0.6558386724790438|\n 0| 0.9755782526778792|\n 0| 0.9562032763319009|\n 0| 0.2117948756600163|\n 0|0.025825721817323677|\n 0| 0.6758571301001655|\n 0| 0.3368090652216287|\n 0| 0.17677799850453058|\n 0| 0.42655675271888405|\n 0| 0.9738368781982333|\n 0| 0.22303982349690687|\n 0| 0.7831450268748825|\n 0| 0.08998811176838517|\n+---+--------------------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":2},{"cell_type":"code","source":["from pyspark.sql.types import IntegerType\nslen = udf(lambda s: col(s) + 1, IntegerType())"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["@udf('double')\ndef plus_one(v):\n return v + 1\n\n%timeit df.withColumn('v', plus_one(df.v)).agg(count(col('v'))).show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n1 loops, best of 3: 4.47 s per loop\n
"]}}],"execution_count":4},{"cell_type":"code","source":["@pandas_udf(\"double\", PandasUDFType.SCALAR)\ndef pandas_plus_one(v):\n return v + 1\n\n%timeit df.withColumn('v', pandas_plus_one(df.v)).agg(count(col('v'))).show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n1 loops, best of 3: 1.26 s per loop\n
"]}}],"execution_count":5},{"cell_type":"code","source":["import pandas as pd\nfrom scipy import stats\n\n@udf('double')\ndef cdf(v):\n return float(stats.norm.cdf(v))\n\n%timeit df.withColumn('cumulative_probability', cdf(df.v)).agg(count(col('cumulative_probability'))).show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":6},{"cell_type":"code","source":["import pandas as pd\nfrom scipy import stats\n\n@pandas_udf('double', PandasUDFType.SCALAR)\ndef pandas_cdf(v):\n return pd.Series(stats.norm.cdf(v))\n\n%timeit df.withColumn('cumulative_probability', pandas_cdf(df.v)).agg(count(col('cumulative_probability'))).show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n+-----------------------------+\ncount(cumulative_probability)|\n+-----------------------------+\n 10000000|\n+-----------------------------+\n\n+-----------------------------+\ncount(cumulative_probability)|\n+-----------------------------+\n 10000000|\n+-----------------------------+\n\n+-----------------------------+\ncount(cumulative_probability)|\n+-----------------------------+\n 10000000|\n+-----------------------------+\n\n+-----------------------------+\ncount(cumulative_probability)|\n+-----------------------------+\n 10000000|\n+-----------------------------+\n\n1 loops, best of 3: 1.24 s per loop\n
"]}}],"execution_count":7},{"cell_type":"code","source":["from pyspark.sql import Row\n@udf(ArrayType(df.schema))\ndef substract_mean(rows):\n vs = pd.Series([r.v for r in rows])\n vs = vs - vs.mean()\n return [Row(id=rows[i]['id'], v=float(vs[i])) for i in range(len(rows))]\n \n%timeit df.groupby('id').agg(collect_list(struct(df['id'], df['v'])).alias('rows')).withColumn('new_rows', substract_mean(col('rows'))).withColumn('new_row', explode(col('new_rows'))).withColumn('id', col('new_row.id')).withColumn('v', col('new_row.v')).agg(count(col('v'))).show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n1 loops, best of 3: 2min 4s per loop\n
"]}}],"execution_count":8},{"cell_type":"code","source":["@pandas_udf(df.schema, PandasUDFType.GROUPED_MAP)\n# Input/output are both a pandas.DataFrame\ndef pandas_subtract_mean(pdf):\n\treturn pdf.assign(v=pdf.v - pdf.v.mean())\n\n%timeit df.groupby('id').apply(pandas_subtract_mean).agg(count(col('v'))).show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n+--------+\ncount(v)|\n+--------+\n10000000|\n+--------+\n\n1 loops, best of 3: 4.65 s per loop\n
"]}}],"execution_count":9},{"cell_type":"code","source":["df2 = df.withColumn('y', rand()).withColumn('x1', rand()).withColumn('x2', rand()).select('id', 'y', 'x1', 'x2')\ndf2.show() "],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n+---+-------------------+--------------------+-------------------+\n id| y| x1| x2|\n+---+-------------------+--------------------+-------------------+\n 0| 0.681296060112836| 0.1288311079969241| 0.8181088445104816|\n 0| 0.8888773955549102| 0.25520608131769806| 0.8371196660049978|\n 0|0.12865336389189674| 0.05949353129319879|0.31240880781369607|\n 0|0.33102706063173315| 0.3184970944017924| 0.9934788617057889|\n 0|0.08530551734633984| 0.849098348411309|0.25958206625946156|\n 0| 0.1142436882234027| 0.09221618780441287|0.06660852847156451|\n 0| 0.3734801477601759| 0.16175735111155454|0.23741551784520665|\n 0| 0.4626832884602221| 0.4090520759820342|0.21143786407406573|\n 0| 0.3089074870133427| 0.7875508394004922|0.20291186344825263|\n 0| 0.6963359144225203| 0.24446551311290765|0.09530396721263867|\n 0|0.18601574521309183| 0.18283878773443607| 0.9049305072841698|\n 0| 0.9986921036424282| 0.5744991393169917| 0.4377204256577574|\n 0|0.47413548244645665| 0.8647990390377169| 0.6145253333423468|\n 0| 0.8678090740409161| 0.9349286905893688| 0.897022900084491|\n 0| 0.6752577347437083| 0.20625908730646103|0.10315736062362346|\n 0|0.22669523505013633| 0.6099324032866738| 0.8357508819755833|\n 0| 0.6880907870618188| 0.8392228400945341| 0.7226505258273653|\n 0|0.30101130104653884| 0.9651274666079585| 0.5422836657606281|\n 0| 0.7195503022011948| 0.9288544640693567|0.03643847265357025|\n 0|0.27410622722360234|0.051428600469085706| 0.7646588630569261|\n+---+-------------------+--------------------+-------------------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":10},{"cell_type":"code","source":["import pandas as pd\nimport statsmodels.api as sm\n# df has four columns: id, y, x1, x2\n\ngroup_column = 'id'\ny_column = 'y'\nx_columns = ['x1', 'x2']\nschema = df2.select(group_column, *x_columns).schema\n\n@pandas_udf(schema, PandasUDFType.GROUPED_MAP)\n# Input/output are both a pandas.DataFrame\ndef ols(pdf):\n group_key = pdf[group_column].iloc[0]\n y = pdf[y_column]\n X = pdf[x_columns]\n X = sm.add_constant(X)\n model = sm.OLS(y, X).fit()\n return pd.DataFrame([[group_key] + [model.params[i] for i in x_columns]], columns=[group_column] + x_columns)\n\nbeta = df2.groupby(group_column).apply(ols)\nbeta.show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n+---+--------------------+--------------------+\n id| x1| x2|\n+---+--------------------+--------------------+\n148|-0.00917654654646...|-0.01005665429668...|\n463|-0.01500674870033...|0.016025010035176222|\n471|-0.00832321162854...|-0.00404915919899...|\n496|-0.01122055554723648|-0.00280962655140...|\n833|-0.01629553352114...|0.002779410391406...|\n243|0.005259499685366535|0.011281848381048665|\n392|0.005589240115972826|-0.00950385069041...|\n540|5.918574070326934...|0.012159354453070217|\n623|0.020442434869455878|0.004083702101312427|\n737|0.006226657113389954|0.003961770851249408|\n858|0.001940560121997...|0.006720865070135...|\n897|-0.00142535705654...|0.004045227546180374|\n 31|0.005465606169062085|0.008832031597331093|\n516|-0.00531332000715...|0.001981946321763...|\n 85|0.012725673978444558|-0.02828045053679735|\n137|-0.00131062800808...|-5.30640018178707...|\n251|0.006229489454833485|0.002962616001996...|\n451|0.003804104279762211|-0.00447206880074...|\n580|0.026962287867315624|3.293459638984281E-4|\n808|-0.01025147566168...|0.004950671582079154|\n+---+--------------------+--------------------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":11}],"metadata":{"name":"Pandas UDFs Benchmark","notebookId":328232164696745},"nbformat":4,"nbformat_minor":0}
2 |
--------------------------------------------------------------------------------
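The benchmark's point is the gap between row-at-a-time Python UDFs and Arrow-vectorized pandas UDFs. A small sketch of the same comparison in the Spark 3.x type-hint style (the notebook uses the older `PandasUDFType.SCALAR` form), assuming pandas and pyarrow are installed:

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, rand, udf, pandas_udf

spark = SparkSession.builder.appName("pandas-udf-sketch").getOrCreate()
df = spark.range(0, 1000 * 1000).withColumn('v', rand())

# Row-at-a-time UDF: one Python call (plus serialization) per row.
@udf('double')
def plus_one(v):
    return v + 1.0

# Vectorized pandas UDF: one call per Arrow batch, a whole pandas.Series at a time.
@pandas_udf('double')
def pandas_plus_one(v: pd.Series) -> pd.Series:
    return v + 1.0

df.withColumn('v', plus_one(df.v)).agg(count(col('v'))).show()
df.withColumn('v', pandas_plus_one(df.v)).agg(count(col('v'))).show()

spark.stop()

On the notebook's 10M-row input the vectorized scalar UDF was roughly 3.5x faster (1.26 s vs 4.47 s per loop), and the grouped-map version beat the collect_list/explode workaround by well over an order of magnitude.
--------------------------------------------------------------------------------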
/Predict+Employee+Exit+-+Classification (1).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "hr_data = spark.read.csv('HR_comma_sep.csv',inferSchema=True,header=True)"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {
18 | "collapsed": false
19 | },
20 | "outputs": [
21 | {
22 | "data": {
23 | "text/plain": [
24 | "Row(satisfaction_level=0.38, last_evaluation=0.53, number_project=2, average_montly_hours=157, time_spend_company=3, Work_accident=0, left=1, promotion_last_5years=0, sales=u'sales', salary=u'low')"
25 | ]
26 | },
27 | "execution_count": 2,
28 | "metadata": {},
29 | "output_type": "execute_result"
30 | }
31 | ],
32 | "source": [
33 | "hr_data.head()"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 3,
39 | "metadata": {
40 | "collapsed": false
41 | },
42 | "outputs": [
43 | {
44 | "name": "stdout",
45 | "output_type": "stream",
46 | "text": [
47 | "root\n",
48 | " |-- satisfaction_level: double (nullable = true)\n",
49 | " |-- last_evaluation: double (nullable = true)\n",
50 | " |-- number_project: integer (nullable = true)\n",
51 | " |-- average_montly_hours: integer (nullable = true)\n",
52 | " |-- time_spend_company: integer (nullable = true)\n",
53 | " |-- Work_accident: integer (nullable = true)\n",
54 | " |-- left: integer (nullable = true)\n",
55 | " |-- promotion_last_5years: integer (nullable = true)\n",
56 | " |-- sales: string (nullable = true)\n",
57 | " |-- salary: string (nullable = true)\n",
58 | "\n"
59 | ]
60 | }
61 | ],
62 | "source": [
63 | "hr_data.printSchema()"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 4,
69 | "metadata": {
70 | "collapsed": false
71 | },
72 | "outputs": [
73 | {
74 | "data": {
75 | "text/plain": [
76 | "[Row(sales=u'management'),\n",
77 | " Row(sales=u'product_mng'),\n",
78 | " Row(sales=u'marketing'),\n",
79 | " Row(sales=u'sales'),\n",
80 | " Row(sales=u'hr'),\n",
81 | " Row(sales=u'accounting'),\n",
82 | " Row(sales=u'support'),\n",
83 | " Row(sales=u'IT'),\n",
84 | " Row(sales=u'technical'),\n",
85 | " Row(sales=u'RandD')]"
86 | ]
87 | },
88 | "execution_count": 4,
89 | "metadata": {},
90 | "output_type": "execute_result"
91 | }
92 | ],
93 | "source": [
94 | "#Get unique data of sales col\n",
95 | "hr_data[['sales']].distinct().collect()"
96 | ]
97 | },
98 | {
99 | "cell_type": "code",
100 | "execution_count": 5,
101 | "metadata": {
102 | "collapsed": true
103 | },
104 | "outputs": [],
105 | "source": [
106 | "#Rename col from sales to dept\n",
107 | "hr_data = hr_data.withColumnRenamed('sales','dept')"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 6,
113 | "metadata": {
114 | "collapsed": false
115 | },
116 | "outputs": [
117 | {
118 | "name": "stdout",
119 | "output_type": "stream",
120 | "text": [
121 | "root\n",
122 | " |-- satisfaction_level: double (nullable = true)\n",
123 | " |-- last_evaluation: double (nullable = true)\n",
124 | " |-- number_project: integer (nullable = true)\n",
125 | " |-- average_montly_hours: integer (nullable = true)\n",
126 | " |-- time_spend_company: integer (nullable = true)\n",
127 | " |-- Work_accident: integer (nullable = true)\n",
128 | " |-- left: integer (nullable = true)\n",
129 | " |-- promotion_last_5years: integer (nullable = true)\n",
130 | " |-- dept: string (nullable = true)\n",
131 | " |-- salary: string (nullable = true)\n",
132 | "\n"
133 | ]
134 | }
135 | ],
136 | "source": [
137 | "hr_data.printSchema()"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 7,
143 | "metadata": {
144 | "collapsed": false
145 | },
146 | "outputs": [
147 | {
148 | "data": {
149 | "text/plain": [
150 | "[Row(salary=u'low'), Row(salary=u'high'), Row(salary=u'medium')]"
151 | ]
152 | },
153 | "execution_count": 7,
154 | "metadata": {},
155 | "output_type": "execute_result"
156 | }
157 | ],
158 | "source": [
159 | "#Get unique data of sales col\n",
160 | "\n",
161 | "#hr_data[['salary']].distinct().collect()\n",
162 | "\n",
163 | "hr_data.select('salary').distinct().collect()"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 8,
169 | "metadata": {
170 | "collapsed": false
171 | },
172 | "outputs": [
173 | {
174 | "name": "stdout",
175 | "output_type": "stream",
176 | "text": [
177 | "+-------+-------------------+-------------------+------------------+\n",
178 | "|summary| satisfaction_level| left| number_project|\n",
179 | "+-------+-------------------+-------------------+------------------+\n",
180 | "| count| 14999| 14999| 14999|\n",
181 | "| mean| 0.6128335222348166| 0.2380825388359224| 3.80305353690246|\n",
182 | "| stddev|0.24863065106114257|0.42592409938029885|1.2325923553183513|\n",
183 | "| min| 0.09| 0| 2|\n",
184 | "| max| 1.0| 1| 7|\n",
185 | "+-------+-------------------+-------------------+------------------+\n",
186 | "\n"
187 | ]
188 | }
189 | ],
190 | "source": [
191 | "hr_data.describe()['summary','satisfaction_level','left','number_project'].show()"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "metadata": {},
197 | "source": [
198 | "### Featurization - Convert string data to numbers\n",
199 | "* dept & salary are categorical information\n",
200 | "* Need to convert them to number"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 9,
206 | "metadata": {
207 | "collapsed": true
208 | },
209 | "outputs": [],
210 | "source": [
211 | "import pyspark.ml.feature as ft\n",
212 | "\n",
213 | "#StringIndexer - converts string data to numbers\n",
214 | "#input cols are dept 7 salary.\n",
215 | "#output are *_en\n",
216 | "\n",
217 | "transformer_dept = ft.StringIndexer(inputCol='dept', outputCol='dept_en')\n",
218 | "transformer_salary = ft.StringIndexer(inputCol='salary', outputCol='salary_en')"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 11,
224 | "metadata": {
225 | "collapsed": true
226 | },
227 | "outputs": [],
228 | "source": [
229 | "# Convert numerical data into vector\n",
230 | "# VectorAssembler for creating vector"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 13,
236 | "metadata": {
237 | "collapsed": false
238 | },
239 | "outputs": [
240 | {
241 | "data": {
242 | "text/plain": [
243 | "['satisfaction_level',\n",
244 | " 'last_evaluation',\n",
245 | " 'number_project',\n",
246 | " 'average_montly_hours',\n",
247 | " 'time_spend_company',\n",
248 | " 'Work_accident',\n",
249 | " 'left',\n",
250 | " 'promotion_last_5years',\n",
251 | " 'dept',\n",
252 | " 'salary']"
253 | ]
254 | },
255 | "execution_count": 13,
256 | "metadata": {},
257 | "output_type": "execute_result"
258 | }
259 | ],
260 | "source": [
261 | "hr_data.columns"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 17,
267 | "metadata": {
268 | "collapsed": false
269 | },
270 | "outputs": [],
271 | "source": [
272 | "#Convert all numerical data to vector\n",
273 | "featurescreator = ft.VectorAssembler(inputCols=['satisfaction_level',\n",
274 | " 'last_evaluation',\n",
275 | " 'number_project',\n",
276 | " 'average_montly_hours',\n",
277 | " 'time_spend_company',\n",
278 | " 'Work_accident',\n",
279 | " 'promotion_last_5years','dept_en','salary_en'], outputCol='features')"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 18,
285 | "metadata": {
286 | "collapsed": true
287 | },
288 | "outputs": [],
289 | "source": [
290 | "import pyspark.ml.classification as cl"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 19,
296 | "metadata": {
297 | "collapsed": true
298 | },
299 | "outputs": [],
300 | "source": [
301 | "#creating estimator\n",
302 | "logistic = cl.LogisticRegression(maxIter=10, regParam=0.01, labelCol='left')"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 20,
308 | "metadata": {
309 | "collapsed": true
310 | },
311 | "outputs": [],
312 | "source": [
313 | "from pyspark.ml import Pipeline"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 21,
319 | "metadata": {
320 | "collapsed": true
321 | },
322 | "outputs": [],
323 | "source": [
324 | "#create pipeline connecting 3 transformers & one estimator\n",
325 | "pipeline = Pipeline(stages=[transformer_dept, \n",
326 | " transformer_salary, \n",
327 | " featurescreator,\n",
328 | " logistic])"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 22,
334 | "metadata": {
335 | "collapsed": true
336 | },
337 | "outputs": [],
338 | "source": [
339 | "#Split data for test & train\n",
340 | "#seed controls the random data generated\n",
341 | "hr_data_train, hr_data_test = hr_data.randomSplit([0.7,0.3],seed=100)"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": 24,
347 | "metadata": {
348 | "collapsed": true
349 | },
350 | "outputs": [],
351 | "source": [
352 | "#Training the pipeline\n",
353 | "model = pipeline.fit(hr_data_train)"
354 | ]
355 | },
356 | {
357 | "cell_type": "code",
358 | "execution_count": 25,
359 | "metadata": {
360 | "collapsed": true
361 | },
362 | "outputs": [],
363 | "source": [
364 | "test_out = model.transform(hr_data_test)"
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": 26,
370 | "metadata": {
371 | "collapsed": false
372 | },
373 | "outputs": [
374 | {
375 | "data": {
376 | "text/plain": [
377 | "DataFrame[satisfaction_level: double, last_evaluation: double, number_project: int, average_montly_hours: int, time_spend_company: int, Work_accident: int, left: int, promotion_last_5years: int, dept: string, salary: string, dept_en: double, salary_en: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]"
378 | ]
379 | },
380 | "execution_count": 26,
381 | "metadata": {},
382 | "output_type": "execute_result"
383 | }
384 | ],
385 | "source": [
386 | "test_out"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 37,
392 | "metadata": {
393 | "collapsed": false
394 | },
395 | "outputs": [
396 | {
397 | "data": {
398 | "text/plain": [
399 | "[Row(probability=DenseVector([0.3824, 0.6176])),\n",
400 | " Row(probability=DenseVector([0.327, 0.673])),\n",
401 | " Row(probability=DenseVector([0.3909, 0.6091])),\n",
402 | " Row(probability=DenseVector([0.359, 0.641])),\n",
403 | " Row(probability=DenseVector([0.3142, 0.6858]))]"
404 | ]
405 | },
406 | "execution_count": 37,
407 | "metadata": {},
408 | "output_type": "execute_result"
409 | }
410 | ],
411 | "source": [
412 | "test_out[['probability']].take(5)"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": 31,
418 | "metadata": {
419 | "collapsed": true
420 | },
421 | "outputs": [],
422 | "source": [
423 | "#To find accuracy of the algo under processing\n",
424 | "import pyspark.ml.evaluation as ev"
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": 33,
430 | "metadata": {
431 | "collapsed": true
432 | },
433 | "outputs": [],
434 | "source": [
435 | "#BinaryClassification\n",
436 | "evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability', \n",
437 | " labelCol='left')"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": 34,
443 | "metadata": {
444 | "collapsed": false
445 | },
446 | "outputs": [
447 | {
448 | "data": {
449 | "text/plain": [
450 | "0.822542214798108"
451 | ]
452 | },
453 | "execution_count": 34,
454 | "metadata": {},
455 | "output_type": "execute_result"
456 | }
457 | ],
458 | "source": [
459 | "evaluator.evaluate(test_out)"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 45,
465 | "metadata": {
466 | "collapsed": false
467 | },
468 | "outputs": [
469 | {
470 | "data": {
471 | "text/plain": [
472 | "[Row(prediction=1.0),\n",
473 | " Row(prediction=1.0),\n",
474 | " Row(prediction=1.0),\n",
475 | " Row(prediction=1.0),\n",
476 | " Row(prediction=1.0)]"
477 | ]
478 | },
479 | "execution_count": 45,
480 | "metadata": {},
481 | "output_type": "execute_result"
482 | }
483 | ],
484 | "source": [
485 | "test_out[['prediction']].take(5)"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": 48,
491 | "metadata": {
492 | "collapsed": false
493 | },
494 | "outputs": [
495 | {
496 | "name": "stdout",
497 | "output_type": "stream",
498 | "text": [
499 | "+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----------+------+\n",
500 | "|satisfaction_level|last_evaluation|number_project|average_montly_hours|time_spend_company|Work_accident|left|promotion_last_5years| dept|salary|\n",
501 | "+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----------+------+\n",
502 | "| 0.09| 0.77| 5| 275| 4| 0| 1| 0|product_mng|medium|\n",
503 | "| 0.09| 0.77| 6| 244| 4| 0| 1| 0|product_mng| low|\n",
504 | "| 0.09| 0.77| 6| 256| 5| 0| 1| 0| support|medium|\n",
505 | "| 0.09| 0.77| 6| 282| 5| 0| 1| 0| sales|medium|\n",
506 | "| 0.09| 0.78| 6| 254| 4| 0| 1| 0| support| low|\n",
507 | "+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----------+------+\n",
508 | "only showing top 5 rows\n",
509 | "\n"
510 | ]
511 | }
512 | ],
513 | "source": [
514 | "test_out[hr_data.columns].show(5)"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": 51,
520 | "metadata": {
521 | "collapsed": false
522 | },
523 | "outputs": [
524 | {
525 | "data": {
526 | "text/plain": [
527 | "-0.3883749834241161"
528 | ]
529 | },
530 | "execution_count": 51,
531 | "metadata": {},
532 | "output_type": "execute_result"
533 | }
534 | ],
535 | "source": [
536 | "hr_data.corr(col1='satisfaction_level', col2='left')"
537 | ]
538 | },
539 | {
540 | "cell_type": "code",
541 | "execution_count": null,
542 | "metadata": {
543 | "collapsed": true
544 | },
545 | "outputs": [],
546 | "source": []
547 | }
548 | ],
549 | "metadata": {
550 | "kernelspec": {
551 | "display_name": "Python 2",
552 | "language": "python",
553 | "name": "python2"
554 | },
555 | "language_info": {
556 | "codemirror_mode": {
557 | "name": "ipython",
558 | "version": 2
559 | },
560 | "file_extension": ".py",
561 | "mimetype": "text/x-python",
562 | "name": "python",
563 | "nbconvert_exporter": "python",
564 | "pygments_lexer": "ipython2",
565 | "version": "2.7.13"
566 | }
567 | },
568 | "nbformat": 4,
569 | "nbformat_minor": 2
570 | }
571 |
--------------------------------------------------------------------------------
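Condensed from the notebook above, here is the full featurize-train-evaluate flow as one standalone script; a sketch assuming `HR_comma_sep.csv` (included in this repo) is in the working directory:

from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.appName("employee-exit-sketch").getOrCreate()

hr_data = (spark.read.csv('HR_comma_sep.csv', inferSchema=True, header=True)
                .withColumnRenamed('sales', 'dept'))

# StringIndexer encodes the two categorical columns; VectorAssembler then packs
# all numeric features (including the encoded ones) into a single vector.
indexers = [StringIndexer(inputCol=c, outputCol=c + '_en') for c in ('dept', 'salary')]
assembler = VectorAssembler(
    inputCols=['satisfaction_level', 'last_evaluation', 'number_project',
               'average_montly_hours',  # the dataset's own spelling
               'time_spend_company', 'Work_accident',
               'promotion_last_5years', 'dept_en', 'salary_en'],
    outputCol='features')
logistic = LogisticRegression(maxIter=10, regParam=0.01, labelCol='left')

pipeline = Pipeline(stages=indexers + [assembler, logistic])

train, test = hr_data.randomSplit([0.7, 0.3], seed=100)
model = pipeline.fit(train)

evaluator = BinaryClassificationEvaluator(rawPredictionCol='probability', labelCol='left')
print(evaluator.evaluate(model.transform(test)))  # area under ROC, ~0.82 in the notebook
--------------------------------------------------------------------------------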
/Project-PySpark.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Project-PySpark.zip
--------------------------------------------------------------------------------
/PySpark-Structured+Streaming.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Structured Streaming using Python DataFrames API\n",
8 | "\n",
9 | "Apache Spark 2.0 adds the first version of a new higher-level stream processing API, Structured Streaming. In this notebook we are going to take a quick look at how to use DataFrame API to build Structured Streaming applications. We want to compute real-time metrics like running counts and windowed counts on a stream of timestamped actions (e.g. Open, Close, etc)."
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "## Sample Data\n",
17 | "We have some sample action data as files in `/databricks-datasets/structured-streaming/events/` which we are going to use to build this appication. Let's take a look at the contents of this directory."
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 3,
23 | "metadata": {
24 | "collapsed": true
25 | },
26 | "outputs": [],
27 | "source": [
28 | "%fs ls /databricks-datasets/structured-streaming/"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "There are about 50 JSON files in the directory. Let's see what each JSON file contains."
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 5,
41 | "metadata": {
42 | "collapsed": true
43 | },
44 | "outputs": [],
45 | "source": [
46 | "%fs head /databricks-datasets/structured-streaming/events/file-0.json"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "Each line in the file contains JSON record with two fields - `time` and `action`. Let's try to analyze these files interactively."
54 | ]
55 | },
56 | {
57 | "cell_type": "markdown",
58 | "metadata": {},
59 | "source": [
60 | "## Batch/Interactive Processing\n",
61 | "The usual first step in attempting to process the data is to interactively query the data. Let's define a static DataFrame on the files, and give it a table name."
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 8,
67 | "metadata": {
68 | "collapsed": true
69 | },
70 | "outputs": [],
71 | "source": [
72 | "from pyspark.sql.types import *\n",
73 | "\n",
74 | "inputPath = \"/databricks-datasets/structured-streaming/events/\"\n",
75 | "\n",
76 | "# Since we know the data format already, let's define the schema to speed up processing (no need for Spark to infer schema)\n",
77 | "jsonSchema = StructType([ StructField(\"time\", TimestampType(), True), StructField(\"action\", StringType(), True) ])\n",
78 | "\n",
79 | "# Static DataFrame representing data in the JSON files\n",
80 | "staticInputDF = (\n",
81 | " spark\n",
82 | " .read\n",
83 | " .schema(jsonSchema)\n",
84 | " .json(inputPath)\n",
85 | ")\n",
86 | "\n",
87 | "display(staticInputDF)"
88 | ]
89 | },
90 | {
91 | "cell_type": "markdown",
92 | "metadata": {},
93 | "source": [
94 | "Now we can compute the number of \"open\" and \"close\" actions with one hour windows. To do this, we will group by the `action` column and 1 hour windows over the `time` column."
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 10,
100 | "metadata": {
101 | "collapsed": true
102 | },
103 | "outputs": [],
104 | "source": [
105 | "from pyspark.sql.functions import * # for window() function\n",
106 | "\n",
107 | "staticCountsDF = (\n",
108 | " staticInputDF\n",
109 | " .groupBy(\n",
110 | " staticInputDF.action, \n",
111 | " window(staticInputDF.time, \"1 hour\")) \n",
112 | " .count()\n",
113 | ")\n",
114 | "staticCountsDF.cache()\n",
115 | "\n",
116 | "# Register the DataFrame as table 'static_counts'\n",
117 | "staticCountsDF.createOrReplaceTempView(\"static_counts\")"
118 | ]
119 | },
120 | {
121 | "cell_type": "markdown",
122 | "metadata": {},
123 | "source": [
124 | "Now we can directly use SQL to query the table. For example, here are the total counts across all the hours."
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 12,
130 | "metadata": {
131 | "collapsed": true
132 | },
133 | "outputs": [],
134 | "source": [
135 | "%sql select action, sum(count) as total_count from static_counts group by action"
136 | ]
137 | },
138 | {
139 | "cell_type": "markdown",
140 | "metadata": {},
141 | "source": [
142 | "How about a timeline of windowed counts?"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 14,
148 | "metadata": {
149 | "collapsed": true
150 | },
151 | "outputs": [],
152 | "source": [
153 | "%sql select action, date_format(window.end, \"MMM-dd HH:mm\") as time, count from static_counts order by time, action"
154 | ]
155 | },
156 | {
157 | "cell_type": "markdown",
158 | "metadata": {},
159 | "source": [
160 | "Note the two ends of the graph. The close actions are generated such that they are after the corresponding open actions, so there are more \"opens\" in the beginning and more \"closes\" in the end."
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {},
166 | "source": [
167 | "## Stream Processing \n",
168 | "Now that we have analyzed the data interactively, let's convert this to a streaming query that continuously updates as data comes. Since we just have a static set of files, we are going to emulate a stream from them by reading one file at a time, in the chronological order they were created. The query we have to write is pretty much the same as the interactive query above."
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 17,
174 | "metadata": {
175 | "collapsed": true
176 | },
177 | "outputs": [],
178 | "source": [
179 | "from pyspark.sql.functions import *\n",
180 | "\n",
181 | "# Similar to definition of staticInputDF above, just using `readStream` instead of `read`\n",
182 | "streamingInputDF = (\n",
183 | " spark\n",
184 | " .readStream \n",
185 | " .schema(jsonSchema) # Set the schema of the JSON data\n",
186 | " .option(\"maxFilesPerTrigger\", 1) # Treat a sequence of files as a stream by picking one file at a time\n",
187 | " .json(inputPath)\n",
188 | ")\n",
189 | "\n",
190 | "# Same query as staticInputDF\n",
191 | "streamingCountsDF = ( \n",
192 | " streamingInputDF\n",
193 | " .groupBy(\n",
194 | " streamingInputDF.action, \n",
195 | " window(streamingInputDF.time, \"2 hours\"))\n",
196 | " .count()\n",
197 | ")\n",
198 | "\n",
199 | "# Is this DF actually a streaming DF?\n",
200 | "streamingCountsDF.isStreaming"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 | "As you can see, `streamingCountsDF` is a streaming Dataframe (`streamingCountsDF.isStreaming` was `true`). You can start streaming computation, by defining the sink and starting it. \n",
208 | "In our case, we want to interactively query the counts (same queries as above), so we will set the complete set of 1 hour counts to be in a in-memory table (note that this for testing purpose only in Spark 2.0)."
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 19,
214 | "metadata": {
215 | "collapsed": true
216 | },
217 | "outputs": [],
218 | "source": [
219 | "spark.conf.set(\"spark.sql.shuffle.partitions\", \"2\") # keep the size of shuffles small\n",
220 | "\n",
221 | "query = (\n",
222 | " streamingCountsDF\n",
223 | " .writeStream\n",
224 | " .format(\"memory\") # memory = store in-memory table (for testing only in Spark 2.0)\n",
225 | " .queryName(\"counts\") # counts = name of the in-memory table\n",
226 | " .outputMode(\"complete\") # complete = all the counts should be in the table\n",
227 | " .start()\n",
228 | ")"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "`query` is a handle to the streaming query that is running in the background. This query is continuously picking up files and updating the windowed counts. \n",
236 | "\n",
237 | "Note the status of query in the above cell. Both the `Status: ACTIVE` and the progress bar shows that the query is active. \n",
238 | "Furthermore, if you expand the `>Details` above, you will find the number of files they have already processed. \n",
239 | "\n",
240 | "Let's wait a bit for a few files to be processed and then interactively query the in-memory `counts` table."
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 21,
246 | "metadata": {
247 | "collapsed": true
248 | },
249 | "outputs": [],
250 | "source": [
251 | "from time import sleep\n",
252 | "sleep(5) # wait a bit for computation to start"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 22,
258 | "metadata": {
259 | "collapsed": true
260 | },
261 | "outputs": [],
262 | "source": [
263 | "%sql select action, date_format(window.end, \"MMM-dd HH:mm\") as time, count from counts order by time, action"
264 | ]
265 | },
266 | {
267 | "cell_type": "markdown",
268 | "metadata": {},
269 | "source": [
270 | "We see the timeline of windowed counts (similar to the static one ealrier) building up. If we keep running this interactive query repeatedly, we will see the latest updated counts which the streaming query is updating in the background."
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 24,
276 | "metadata": {
277 | "collapsed": true
278 | },
279 | "outputs": [],
280 | "source": [
281 | "sleep(5) # wait a bit more for more data to be computed"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 25,
287 | "metadata": {
288 | "collapsed": true
289 | },
290 | "outputs": [],
291 | "source": [
292 | "%sql select action, date_format(window.end, \"MMM-dd HH:mm\") as time, count from counts order by time, action"
293 | ]
294 | },
295 | {
296 | "cell_type": "code",
297 | "execution_count": 26,
298 | "metadata": {
299 | "collapsed": true
300 | },
301 | "outputs": [],
302 | "source": [
303 | "sleep(5) # wait a bit more for more data to be computed"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 27,
309 | "metadata": {
310 | "collapsed": true
311 | },
312 | "outputs": [],
313 | "source": [
314 | "%sql select action, date_format(window.end, \"MMM-dd HH:mm\") as time, count from counts order by time, action"
315 | ]
316 | },
317 | {
318 | "cell_type": "markdown",
319 | "metadata": {},
320 | "source": [
321 | "Also, let's see the total number of \"opens\" and \"closes\"."
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 29,
327 | "metadata": {
328 | "collapsed": true
329 | },
330 | "outputs": [],
331 | "source": [
332 | "%sql select action, sum(count) as total_count from counts group by action order by action"
333 | ]
334 | },
335 | {
336 | "cell_type": "markdown",
337 | "metadata": {},
338 | "source": []
339 | }
340 | ],
341 | "metadata": {
342 | "kernelspec": {
343 | "display_name": "Python 2",
344 | "language": "python",
345 | "name": "python2"
346 | },
347 | "language_info": {
348 | "codemirror_mode": {
349 | "name": "ipython",
350 | "version": 2
351 | },
352 | "file_extension": ".py",
353 | "mimetype": "text/x-python",
354 | "name": "python",
355 | "nbconvert_exporter": "python",
356 | "pygments_lexer": "ipython2",
357 | "version": "2.7.13"
358 | },
359 | "name": "Introduction to Structured Streaming",
360 | "notebookId": 2229079593072345
361 | },
362 | "nbformat": 4,
363 | "nbformat_minor": 0
364 | }
365 |
--------------------------------------------------------------------------------
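Outside Databricks there is no `%fs`/`%sql` magic and no `/databricks-datasets` mount. A standalone sketch of the same file-source streaming query, assuming a local directory `events/` holding the same kind of JSON files (the path is a stand-in):

import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import window
from pyspark.sql.types import StructType, StructField, TimestampType, StringType

spark = SparkSession.builder.appName("structured-streaming-sketch").getOrCreate()
spark.conf.set("spark.sql.shuffle.partitions", "2")  # keep shuffles small

jsonSchema = StructType([StructField("time", TimestampType(), True),
                         StructField("action", StringType(), True)])

# Emulate a stream from static files: pick up one file per trigger.
streamingInputDF = (spark.readStream
                         .schema(jsonSchema)
                         .option("maxFilesPerTrigger", 1)
                         .json("events/"))

streamingCountsDF = (streamingInputDF
                     .groupBy(streamingInputDF.action,
                              window(streamingInputDF.time, "1 hour"))
                     .count())

query = (streamingCountsDF.writeStream
         .format("memory")          # in-memory sink, for testing only
         .queryName("counts")       # name of the in-memory table
         .outputMode("complete")    # keep all counts in the table
         .start())

time.sleep(5)  # let a few files be processed before querying
spark.sql("select action, sum(count) as total_count from counts group by action").show()
query.stop()
--------------------------------------------------------------------------------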
/README.md:
--------------------------------------------------------------------------------
1 | # pyspark
2 |
--------------------------------------------------------------------------------
/Recommendation+Engine+.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 10,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#data in textfile separated by ::\n",
12 | "#so loading as textFile\n",
13 | "lines = spark.read.text('data/mllib/als/sample_movielens_ratings.txt').rdd"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 11,
19 | "metadata": {
20 | "collapsed": false
21 | },
22 | "outputs": [
23 | {
24 | "data": {
25 | "text/plain": [
26 | "[Row(value=u'0::2::3::1424380312'),\n",
27 | " Row(value=u'0::3::1::1424380312'),\n",
28 | " Row(value=u'0::5::2::1424380312'),\n",
29 | " Row(value=u'0::9::4::1424380312'),\n",
30 | " Row(value=u'0::11::1::1424380312')]"
31 | ]
32 | },
33 | "execution_count": 11,
34 | "metadata": {},
35 | "output_type": "execute_result"
36 | }
37 | ],
38 | "source": [
39 | "lines.take(5)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 19,
45 | "metadata": {
46 | "collapsed": false
47 | },
48 | "outputs": [],
49 | "source": [
50 | "#Split based on ::\n",
51 | "data = lines.map(lambda row:row.split('::'))"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 13,
57 | "metadata": {
58 | "collapsed": false
59 | },
60 | "outputs": [
61 | {
62 | "data": {
63 | "text/plain": [
64 | "[[u'0', u'2', u'3', u'1424380312'],\n",
65 | " [u'0', u'3', u'1', u'1424380312'],\n",
66 | " [u'0', u'5', u'2', u'1424380312'],\n",
67 | " [u'0', u'9', u'4', u'1424380312'],\n",
68 | " [u'0', u'11', u'1', u'1424380312']]"
69 | ]
70 | },
71 | "execution_count": 13,
72 | "metadata": {},
73 | "output_type": "execute_result"
74 | }
75 | ],
76 | "source": [
77 | "data.take(5)"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 17,
83 | "metadata": {
84 | "collapsed": false
85 | },
86 | "outputs": [],
87 | "source": [
88 | "from pyspark.sql import Row\n",
89 | "\n",
90 | "rdd = data.map(lambda d: Row(userId= int(d[0]), \n",
91 | " movieId=int(d[1]), \n",
92 | " rating=int(d[2]), \n",
93 | " timestamp=long(d[3]) ))"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 18,
99 | "metadata": {
100 | "collapsed": false
101 | },
102 | "outputs": [
103 | {
104 | "data": {
105 | "text/plain": [
106 | "[Row(movieId=2, rating=3, timestamp=1424380312L, userId=0),\n",
107 | " Row(movieId=3, rating=1, timestamp=1424380312L, userId=0)]"
108 | ]
109 | },
110 | "execution_count": 18,
111 | "metadata": {},
112 | "output_type": "execute_result"
113 | }
114 | ],
115 | "source": [
116 | "rdd.take(2)"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 20,
122 | "metadata": {
123 | "collapsed": true
124 | },
125 | "outputs": [],
126 | "source": [
127 | "df = spark.createDataFrame(rdd)"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 21,
133 | "metadata": {
134 | "collapsed": false
135 | },
136 | "outputs": [
137 | {
138 | "name": "stdout",
139 | "output_type": "stream",
140 | "text": [
141 | "+-------+------+----------+------+\n",
142 | "|movieId|rating| timestamp|userId|\n",
143 | "+-------+------+----------+------+\n",
144 | "| 2| 3|1424380312| 0|\n",
145 | "| 3| 1|1424380312| 0|\n",
146 | "| 5| 2|1424380312| 0|\n",
147 | "| 9| 4|1424380312| 0|\n",
148 | "| 11| 1|1424380312| 0|\n",
149 | "| 12| 2|1424380312| 0|\n",
150 | "| 15| 1|1424380312| 0|\n",
151 | "| 17| 1|1424380312| 0|\n",
152 | "| 19| 1|1424380312| 0|\n",
153 | "| 21| 1|1424380312| 0|\n",
154 | "| 23| 1|1424380312| 0|\n",
155 | "| 26| 3|1424380312| 0|\n",
156 | "| 27| 1|1424380312| 0|\n",
157 | "| 28| 1|1424380312| 0|\n",
158 | "| 29| 1|1424380312| 0|\n",
159 | "| 30| 1|1424380312| 0|\n",
160 | "| 31| 1|1424380312| 0|\n",
161 | "| 34| 1|1424380312| 0|\n",
162 | "| 37| 1|1424380312| 0|\n",
163 | "| 41| 2|1424380312| 0|\n",
164 | "+-------+------+----------+------+\n",
165 | "only showing top 20 rows\n",
166 | "\n"
167 | ]
168 | }
169 | ],
170 | "source": [
171 | "df.show()"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 22,
177 | "metadata": {
178 | "collapsed": false
179 | },
180 | "outputs": [
181 | {
182 | "name": "stdout",
183 | "output_type": "stream",
184 | "text": [
185 | "+-------+------+----------+------+\n",
186 | "|movieId|rating| timestamp|userId|\n",
187 | "+-------+------+----------+------+\n",
188 | "| 2| 3|1424380312| 0|\n",
189 | "| 3| 1|1424380312| 0|\n",
190 | "| 5| 2|1424380312| 0|\n",
191 | "| 9| 4|1424380312| 0|\n",
192 | "| 11| 1|1424380312| 0|\n",
193 | "| 12| 2|1424380312| 0|\n",
194 | "| 15| 1|1424380312| 0|\n",
195 | "| 17| 1|1424380312| 0|\n",
196 | "| 19| 1|1424380312| 0|\n",
197 | "| 21| 1|1424380312| 0|\n",
198 | "| 23| 1|1424380312| 0|\n",
199 | "| 26| 3|1424380312| 0|\n",
200 | "| 27| 1|1424380312| 0|\n",
201 | "| 28| 1|1424380312| 0|\n",
202 | "| 29| 1|1424380312| 0|\n",
203 | "| 30| 1|1424380312| 0|\n",
204 | "| 31| 1|1424380312| 0|\n",
205 | "| 34| 1|1424380312| 0|\n",
206 | "| 37| 1|1424380312| 0|\n",
207 | "| 41| 2|1424380312| 0|\n",
208 | "+-------+------+----------+------+\n",
209 | "only showing top 20 rows\n",
210 | "\n"
211 | ]
212 | }
213 | ],
214 | "source": [
215 | "rdd.toDF().show()"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 23,
221 | "metadata": {
222 | "collapsed": true
223 | },
224 | "outputs": [],
225 | "source": [
226 | "train, test = df.randomSplit([0.8,0.2])"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 24,
232 | "metadata": {
233 | "collapsed": true
234 | },
235 | "outputs": [],
236 | "source": [
237 | "from pyspark.ml.recommendation import ALS"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {
244 | "collapsed": true
245 | },
246 | "outputs": [],
247 | "source": [
248 | "als = ALS(maxIter=5, regParam=0.01, userCol=\"userId\", itemCol=\"movieId\")"
249 | ]
250 | }
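,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The notebook stops after constructing the estimator. Below is a minimal sketch of the remaining steps: fit on `train`, predict on `test`, and score with RMSE. Since `coldStartStrategy` is left at its default, predictions for users/items unseen in training can be NaN, so those rows are dropped before evaluating."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from pyspark.ml.evaluation import RegressionEvaluator\n",
"\n",
"model = als.fit(train)\n",
"# Drop NaN predictions produced for cold-start users/items\n",
"predictions = model.transform(test).dropna()\n",
"\n",
"evaluator = RegressionEvaluator(metricName=\"rmse\", labelCol=\"rating\",\n",
"                                predictionCol=\"prediction\")\n",
"print(evaluator.evaluate(predictions))"
]
}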
251 | ],
252 | "metadata": {
253 | "kernelspec": {
254 | "display_name": "Python 2",
255 | "language": "python",
256 | "name": "python2"
257 | },
258 | "language_info": {
259 | "codemirror_mode": {
260 | "name": "ipython",
261 | "version": 2
262 | },
263 | "file_extension": ".py",
264 | "mimetype": "text/x-python",
265 | "name": "python",
266 | "nbconvert_exporter": "python",
267 | "pygments_lexer": "ipython2",
268 | "version": "2.7.13"
269 | }
270 | },
271 | "nbformat": 4,
272 | "nbformat_minor": 2
273 | }
274 |
--------------------------------------------------------------------------------
/Spark Architecture.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Spark Architecture.pptx
--------------------------------------------------------------------------------
/Spark RDD.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Spark RDD.pptx
--------------------------------------------------------------------------------
/Spark Storage Data Formats.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Spark Storage Data Formats.pptx
--------------------------------------------------------------------------------
/Spark-Introduction.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Spark-Introduction.pptx
--------------------------------------------------------------------------------
/Spark-Partitioning.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Spark-Partitioning.pptx
--------------------------------------------------------------------------------
/Spark-Performance Tuning.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Spark-Performance Tuning.pptx
--------------------------------------------------------------------------------
/Spark_SQL:
--------------------------------------------------------------------------------
1 | spark.read.json("/FileStore/tables/2015_flight_data-f8a15.json")\
2 | .createOrReplaceTempView("some_sql_view") # DF => SQL
3 |
4 | spark.sql("""
5 | SELECT DEST_COUNTRY_NAME, sum(count)
6 | FROM some_sql_view GROUP BY DEST_COUNTRY_NAME
7 | """)\
8 | .where("DEST_COUNTRY_NAME like 'S%'").where("`sum(count)` > 10")\
9 | .count() # SQL => DF
10 |
11 |
12 | CREATE TABLE flights (
13 | DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count LONG)
14 | USING JSON OPTIONS (path '/FileStore/tables/2015_flight_data-f8a15.json')
15 |
16 |
17 | ----
18 |
19 | CREATE TABLE flights_csv (
20 | DEST_COUNTRY_NAME STRING,
21 | ORIGIN_COUNTRY_NAME STRING COMMENT "remember, the US will be most prevalent",
22 | count LONG)
23 | USING csv OPTIONS (header true, path '/data/flight-data/csv/2015-summary.csv')
24 |
25 |
26 | ----
27 |
28 | CREATE TABLE flights_from_select USING parquet AS SELECT * FROM flights
29 |
30 |
31 | ----
32 |
33 | CREATE TABLE IF NOT EXISTS flights_from_select
34 | AS SELECT * FROM flights
35 |
36 |
37 | ----
38 |
39 | CREATE TABLE partitioned_flights USING parquet PARTITIONED BY (DEST_COUNTRY_NAME)
40 | AS SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count FROM flights LIMIT 5
41 |
42 |
43 | ----
44 |
45 | CREATE EXTERNAL TABLE hive_flights (
46 | DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count LONG)
47 | ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/data/flight-data-hive/'
48 |
49 |
50 | ----
51 |
52 | CREATE EXTERNAL TABLE hive_flights_2
53 | ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
54 | LOCATION '/data/flight-data-hive/' AS SELECT * FROM flights
55 |
56 |
57 | ----
58 |
59 | INSERT INTO flights_from_select
60 | SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count FROM flights LIMIT 20
61 |
62 |
63 | ----
64 |
65 | INSERT INTO partitioned_flights
66 | PARTITION (DEST_COUNTRY_NAME="UNITED STATES")
67 | SELECT ORIGIN_COUNTRY_NAME, count FROM flights -- match the table's column order
68 | WHERE DEST_COUNTRY_NAME='UNITED STATES' LIMIT 12
69 |
70 |
71 | ----
72 |
73 | DESCRIBE TABLE flights_csv
74 |
75 |
76 | ----
77 |
78 | SHOW PARTITIONS partitioned_flights
79 |
80 |
81 | ----
82 |
83 | REFRESH table partitioned_flights
84 |
85 |
86 | ----
87 |
88 | MSCK REPAIR TABLE partitioned_flights
89 |
90 |
91 | ----
92 |
93 | DROP TABLE flights_csv;
94 |
95 |
96 | ----
97 |
98 | DROP TABLE IF EXISTS flights_csv;
99 |
100 |
101 | ----
102 |
103 | CACHE TABLE flights
104 |
105 |
106 | ----
107 |
108 | UNCACHE TABLE flights
109 |
110 |
111 | ----
112 |
113 | CREATE VIEW just_usa_view AS
114 | SELECT * FROM flights WHERE dest_country_name = 'United States'
115 |
116 |
117 | ----
118 |
119 | CREATE TEMP VIEW just_usa_view_temp AS
120 | SELECT * FROM flights WHERE dest_country_name = 'United States'
121 |
122 |
123 | ----
124 |
125 | CREATE GLOBAL TEMP VIEW just_usa_global_view_temp AS
126 | SELECT * FROM flights WHERE dest_country_name = 'United States'
127 |
128 |
129 | ----
130 |
131 | SHOW TABLES
132 |
133 |
134 | ----
135 |
136 | CREATE OR REPLACE TEMP VIEW just_usa_view_temp AS
137 | SELECT * FROM flights WHERE dest_country_name = 'United States'
138 |
139 |
140 | ----
141 |
142 | SELECT * FROM just_usa_view_temp
143 |
144 |
145 | ----
146 |
147 | EXPLAIN SELECT * FROM just_usa_view
148 |
149 |
150 | ----
151 |
152 | EXPLAIN SELECT * FROM flights WHERE dest_country_name = 'United States'
153 |
154 |
155 | ----
156 |
157 | DROP VIEW IF EXISTS just_usa_view;
158 |
159 |
160 | ----
161 |
162 | SHOW DATABASES
163 |
164 |
165 | ----
166 |
167 | CREATE DATABASE some_db
168 |
169 |
170 | ----
171 |
172 | USE some_db
173 |
174 |
175 | ----
176 |
177 | SHOW tables
178 |
179 | SELECT * FROM flights -- fails with table/view not found
180 |
181 |
182 | ----
183 |
184 | SELECT * FROM default.flights
185 |
186 |
187 | ----
188 |
189 | SELECT current_database()
190 |
191 |
192 | ----
193 |
194 | USE default;
195 |
196 |
197 | ----
198 |
199 | DROP DATABASE IF EXISTS some_db;
200 |
201 |
202 | ----
203 |
204 | SELECT [ALL|DISTINCT] named_expression[, named_expression, ...]
205 | FROM relation[, relation, ...]
206 | [lateral_view[, lateral_view, ...]]
207 | [WHERE boolean_expression]
208 | [aggregation [HAVING boolean_expression]]
209 | [ORDER BY sort_expressions]
210 | [CLUSTER BY expressions]
211 | [DISTRIBUTE BY expressions]
212 | [SORT BY sort_expressions]
213 | [WINDOW named_window[, WINDOW named_window, ...]]
214 | [LIMIT num_rows]
215 |
216 | named_expression:
217 | : expression [AS alias]
218 |
219 | relation:
220 | | join_relation
221 | | (table_name|query|relation) [sample] [AS alias]
222 | : VALUES (expressions)[, (expressions), ...]
223 | [AS (column_name[, column_name, ...])]
224 |
225 | expressions:
226 | : expression[, expression, ...]
227 |
228 | sort_expressions:
229 | : expression [ASC|DESC][, expression [ASC|DESC], ...]
230 |
231 |
232 | ----
233 |
234 | SELECT
235 | CASE WHEN DEST_COUNTRY_NAME = 'UNITED STATES' THEN 1
236 | WHEN DEST_COUNTRY_NAME = 'Egypt' THEN 0
237 | ELSE -1 END
238 | FROM partitioned_flights
239 |
240 |
241 | ----
242 |
243 | CREATE VIEW IF NOT EXISTS nested_data AS
244 | SELECT (DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME) as country, count FROM flights
245 |
246 |
247 | ----
248 |
249 | SELECT * FROM nested_data
250 |
251 |
252 | ----
253 |
254 | SELECT country.DEST_COUNTRY_NAME, count FROM nested_data
255 |
256 |
257 | ----
258 |
259 | SELECT country.*, count FROM nested_data
260 |
261 |
262 | ----
263 |
264 | SELECT DEST_COUNTRY_NAME as new_name, collect_list(count) as flight_counts,
265 | collect_set(ORIGIN_COUNTRY_NAME) as origin_set
266 | FROM flights GROUP BY DEST_COUNTRY_NAME
267 |
268 |
269 | ----
270 |
271 | SELECT DEST_COUNTRY_NAME, ARRAY(1, 2, 3) FROM flights
272 |
273 |
274 | ----
275 |
276 | SELECT DEST_COUNTRY_NAME as new_name, collect_list(count)[0]
277 | FROM flights GROUP BY DEST_COUNTRY_NAME
278 |
279 |
280 | ----
281 |
282 | CREATE OR REPLACE TEMP VIEW flights_agg AS
283 | SELECT DEST_COUNTRY_NAME, collect_list(count) as collected_counts
284 | FROM flights GROUP BY DEST_COUNTRY_NAME
285 |
286 |
287 | ----
288 |
289 | SELECT explode(collected_counts), DEST_COUNTRY_NAME FROM flights_agg
290 |
291 |
292 | ----
293 |
294 | SHOW FUNCTIONS
295 |
296 |
297 | ----
298 |
299 | SHOW SYSTEM FUNCTIONS
300 |
301 |
302 | ----
303 |
304 | SHOW USER FUNCTIONS
305 |
306 |
307 | ----
308 |
309 | SHOW FUNCTIONS "s*";
310 |
311 |
312 | ----
313 |
314 | SHOW FUNCTIONS LIKE "collect*";
315 |
316 |
317 | ----
318 |
319 | SELECT count, power3(count) FROM flights
320 |
321 |
322 | ----
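
# power3 is not a Spark built-in; the query above assumes a UDF was
# registered from PySpark beforehand. A minimal sketch (the function
# body here is assumed):

from pyspark.sql.types import LongType

def power3(value):
    return value ** 3

spark.udf.register("power3", power3, LongType())


----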
323 |
324 | SELECT dest_country_name FROM flights
325 | GROUP BY dest_country_name ORDER BY sum(count) DESC LIMIT 5
326 |
327 |
328 | ----
329 |
330 | SELECT * FROM flights
331 | WHERE origin_country_name IN (SELECT dest_country_name FROM flights
332 | GROUP BY dest_country_name ORDER BY sum(count) DESC LIMIT 5)
333 |
334 |
335 | ----
336 |
337 | SELECT * FROM flights f1
338 | WHERE EXISTS (SELECT 1 FROM flights f2
339 | WHERE f1.dest_country_name = f2.origin_country_name)
340 | AND EXISTS (SELECT 1 FROM flights f2
341 | WHERE f2.dest_country_name = f1.origin_country_name)
342 |
343 |
344 | ----
345 |
346 | SELECT *, (SELECT max(count) FROM flights) AS maximum FROM flights
347 |
348 |
349 | ----
350 |
351 | SET spark.sql.shuffle.partitions=20
352 |
353 |
354 | ----
355 |
356 |
357 |
358 |
--------------------------------------------------------------------------------
/Tensorframes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import tensorflow as tf\n",
12 | "import tensorframes as tfs\n",
13 | "from pyspark.sql import Row"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 3,
19 | "metadata": {
20 | "collapsed": false
21 | },
22 | "outputs": [
23 | {
24 | "data": {
25 | "text/plain": [
26 | "[Row(z=3.0, x=0.0),\n",
27 | " Row(z=4.0, x=1.0),\n",
28 | " Row(z=5.0, x=2.0),\n",
29 | " Row(z=6.0, x=3.0),\n",
30 | " Row(z=7.0, x=4.0),\n",
31 | " Row(z=8.0, x=5.0),\n",
32 | " Row(z=9.0, x=6.0),\n",
33 | " Row(z=10.0, x=7.0),\n",
34 | " Row(z=11.0, x=8.0),\n",
35 | " Row(z=12.0, x=9.0)]"
36 | ]
37 | },
38 | "execution_count": 3,
39 | "metadata": {},
40 | "output_type": "execute_result"
41 | }
42 | ],
43 | "source": [
44 | "data = [Row(x=float(x)) for x in range(10)]\n",
45 | "df = spark.createDataFrame(data)\n",
46 | "\n",
47 | "\n",
48 | "with tf.Graph().as_default() as g:\n",
49 | " # The TensorFlow placeholder that corresponds to column 'x'.\n",
50 | " # The shape of the placeholder is automatically inferred from the DataFrame.\n",
51 | " \n",
52 | " x = tfs.block(df, \"x\")\n",
53 | " \n",
54 | " # The output that adds 3 to x\n",
55 | " z = tf.add(x, 3, name='z')\n",
56 | " \n",
57 | " # The resulting dataframe\n",
58 | " df2 = tfs.map_blocks(z, df)\n",
59 | " tf.summary.FileWriter('tensorboard/logs',g)\n",
60 | "\n",
61 | "# The transform is lazy as for most DataFrame operations. This will trigger it:\n",
62 | "df2.collect()"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 4,
68 | "metadata": {
69 | "collapsed": false
70 | },
71 | "outputs": [
72 | {
73 | "name": "stdout",
74 | "output_type": "stream",
75 | "text": [
76 | "root\n",
77 | " |-- x: double (nullable = true) double[?]\n",
78 | "\n"
79 | ]
80 | }
81 | ],
82 | "source": [
83 | "tfs.print_schema(df)"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 5,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [
93 | {
94 | "data": {
95 | "text/plain": [
96 | "DataFrame[x: double]"
97 | ]
98 | },
99 | "execution_count": 5,
100 | "metadata": {},
101 | "output_type": "execute_result"
102 | }
103 | ],
104 | "source": [
105 | "tfs.analyze(df)"
106 | ]
107 | },
108 | {
109 | "cell_type": "code",
110 | "execution_count": 6,
111 | "metadata": {
112 | "collapsed": false
113 | },
114 | "outputs": [
115 | {
116 | "name": "stdout",
117 | "output_type": "stream",
118 | "text": [
119 | "root\n",
120 | " |-- z: double (nullable = false) double[?]\n",
121 | " |-- x: double (nullable = true) double[?]\n",
122 | "\n"
123 | ]
124 | }
125 | ],
126 | "source": [
127 | "tfs.print_schema(df2)"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 17,
133 | "metadata": {
134 | "collapsed": true
135 | },
136 | "outputs": [],
137 | "source": [
138 | "df3 = df2.select(df2.z.alias(\"y\"))"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 18,
144 | "metadata": {
145 | "collapsed": false
146 | },
147 | "outputs": [
148 | {
149 | "name": "stdout",
150 | "output_type": "stream",
151 | "text": [
152 | "+----+\n",
153 | "| y|\n",
154 | "+----+\n",
155 | "| 3.0|\n",
156 | "| 4.0|\n",
157 | "| 5.0|\n",
158 | "| 6.0|\n",
159 | "| 7.0|\n",
160 | "| 8.0|\n",
161 | "| 9.0|\n",
162 | "|10.0|\n",
163 | "|11.0|\n",
164 | "|12.0|\n",
165 | "+----+\n",
166 | "\n"
167 | ]
168 | }
169 | ],
170 | "source": [
171 | "df3.show()"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 21,
177 | "metadata": {
178 | "collapsed": false
179 | },
180 | "outputs": [
181 | {
182 | "ename": "Exception",
183 | "evalue": "Could not find column with name {col_name}",
184 | "output_type": "error",
185 | "traceback": [
186 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
187 | "\u001b[0;31mException\u001b[0m Traceback (most recent call last)",
188 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGraph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_default\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mg\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mz\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtfs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mblock\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"z\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mzz\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmultiply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mz\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mdf4\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtfs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzz\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mdf3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
189 | "\u001b[0;32m/tmp/spark-f9646918-662f-4ce3-8f31-c7b56277d5a5/userFiles-35f6b4e1-fa0e-4de1-b002-84d380644b39/databricks_tensorframes-0.2.9-s_2.11.jar/tensorframes/core.py\u001b[0m in \u001b[0;36mblock\u001b[0;34m(df, col_name, tf_name)\u001b[0m\n\u001b[1;32m 411\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0ma\u001b[0m \u001b[0mTensorFlow\u001b[0m \u001b[0mplaceholder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 412\u001b[0m \"\"\"\n\u001b[0;32m--> 413\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_auto_placeholder\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcol_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtf_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mblock\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 414\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 415\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcol_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtf_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
190 | "\u001b[0;32m/tmp/spark-f9646918-662f-4ce3-8f31-c7b56277d5a5/userFiles-35f6b4e1-fa0e-4de1-b002-84d380644b39/databricks_tensorframes-0.2.9-s_2.11.jar/tensorframes/core.py\u001b[0m in \u001b[0;36m_auto_placeholder\u001b[0;34m(df, col_name, tf_name, block)\u001b[0m\n\u001b[1;32m 432\u001b[0m \u001b[0mcol_shape\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0minfo\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfieldName\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mcol_name\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 433\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol_shape\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 434\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Could not find column with name {col_name}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 435\u001b[0m \u001b[0mcol_shape\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcol_shape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[0mcol_struct\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mschema\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfields\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mcol_name\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
191 | "\u001b[0;31mException\u001b[0m: Could not find column with name {col_name}"
192 | ]
193 | }
194 | ],
195 | "source": [
196 | "with tf.Graph().as_default() as g:\n",
197 | " z = tfs.block(df3,\"z\")\n",
198 | " zz = tf.multiply(z,3)\n",
199 | " df4 = tfs.map_blocks(zz,df3)"
200 | ]
201 | },
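{
"cell_type": "markdown",
"metadata": {},
"source": [
"The failure above is expected: `df3` only contains the renamed column `y`, so `tfs.block(df3, \"z\")` cannot find `z`. A minimal sketch of the fix is to reference the column by its new name (the op name `yy` below is just a chosen label):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"with tf.Graph().as_default() as g:\n",
"    # Reference the renamed column 'y' instead of the old name 'z'\n",
"    y = tfs.block(df3, \"y\")\n",
"    yy = tf.multiply(y, 3, name='yy')\n",
"    df4 = tfs.map_blocks(yy, df3)"
]
},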
202 | {
203 | "cell_type": "code",
204 | "execution_count": 22,
205 | "metadata": {
206 | "collapsed": true
207 | },
208 | "outputs": [],
209 | "source": [
210 | "g = tf.Graph()"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 23,
216 | "metadata": {
217 | "collapsed": true
218 | },
219 | "outputs": [],
220 | "source": [
221 | "with g.as_default():\n",
222 | " b = tf.constant(7)"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 24,
228 | "metadata": {
229 | "collapsed": false
230 | },
231 | "outputs": [
232 | {
233 | "data": {
234 | "text/plain": [
235 | ""
236 | ]
237 | },
238 | "execution_count": 24,
239 | "metadata": {},
240 | "output_type": "execute_result"
241 | }
242 | ],
243 | "source": [
244 | "b.graph"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 25,
250 | "metadata": {
251 | "collapsed": false
252 | },
253 | "outputs": [
254 | {
255 | "data": {
256 | "text/plain": [
257 | ""
258 | ]
259 | },
260 | "execution_count": 25,
261 | "metadata": {},
262 | "output_type": "execute_result"
263 | }
264 | ],
265 | "source": [
266 | "g"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": null,
272 | "metadata": {
273 | "collapsed": true
274 | },
275 | "outputs": [],
276 | "source": []
277 | }
278 | ],
279 | "metadata": {
280 | "kernelspec": {
281 | "display_name": "Python 2",
282 | "language": "python",
283 | "name": "python2"
284 | },
285 | "language_info": {
286 | "codemirror_mode": {
287 | "name": "ipython",
288 | "version": 2
289 | },
290 | "file_extension": ".py",
291 | "mimetype": "text/x-python",
292 | "name": "python",
293 | "nbconvert_exporter": "python",
294 | "pygments_lexer": "ipython2",
295 | "version": "2.7.13"
296 | }
297 | },
298 | "nbformat": 4,
299 | "nbformat_minor": 2
300 | }
301 |
--------------------------------------------------------------------------------
/Titanic Data Analysis using DataFrames.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"code","source":["titanic_data = spark.read.csv('/FileStore/tables/titanic/titanic_train-ac800.csv', header=True, inferSchema=True)"],"metadata":{},"outputs":[],"execution_count":1},{"cell_type":"code","source":["display(titanic_data)"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":["titanic_data.printSchema()"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["#Finding maximum fare\ntitanic_data.agg({\"Fare\":\"max\"}).collect()"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"code","source":["import pyspark.sql.functions as F"],"metadata":{},"outputs":[],"execution_count":5},{"cell_type":"code","source":["#Finding maximum fare - another way\ntitanic_data.agg(F.max(titanic_data.Fare)).collect()"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"code","source":["display(titanic_data.groupBy('Pclass').count())"],"metadata":{},"outputs":[],"execution_count":7},{"cell_type":"code","source":["titanic_data.groupBy('Pclass').avg('Age').collect()"],"metadata":{},"outputs":[],"execution_count":8},{"cell_type":"code","source":["#Descending order of age\ndisplay(titanic_data.orderBy(titanic_data.Age.desc()))"],"metadata":{},"outputs":[],"execution_count":9},{"cell_type":"code","source":["#Person with longest name\ntitanic_data"],"metadata":{},"outputs":[],"execution_count":10},{"cell_type":"code","source":["### UDF in dataframes"],"metadata":{},"outputs":[],"execution_count":11},{"cell_type":"code","source":["from pyspark.sql.functions import udf"],"metadata":{},"outputs":[],"execution_count":12},{"cell_type":"code","source":["def getLen(word):\n return len(word)"],"metadata":{},"outputs":[],"execution_count":13},{"cell_type":"code","source":["from pyspark.sql.types import IntegerType\nlen_udf = udf(getLen, IntegerType())"],"metadata":{},"outputs":[],"execution_count":14},{"cell_type":"code","source":["titanic_data = titanic_data.select(\"*\", len_udf(\"Name\").alias(\"len_name\"))"],"metadata":{},"outputs":[],"execution_count":15},{"cell_type":"code","source":["titanic_data.agg({'len_name':'max'}).collect()"],"metadata":{},"outputs":[],"execution_count":16},{"cell_type":"code","source":["display(titanic_data[titanic_data.len_name == 82])"],"metadata":{},"outputs":[],"execution_count":17},{"cell_type":"code","source":["\n"],"metadata":{},"outputs":[],"execution_count":18},{"cell_type":"code","source":["titanic_data.select(len_udf(\"Name\").alias(\"len_name\")).len_name"],"metadata":{},"outputs":[],"execution_count":19},{"cell_type":"code","source":["##Remove all the rows in which age is missing"],"metadata":{},"outputs":[],"execution_count":20},{"cell_type":"code","source":["help(titanic_data.dropna)"],"metadata":{},"outputs":[],"execution_count":21},{"cell_type":"code","source":["display(titanic_data.dropna(subset=['Age']))"],"metadata":{},"outputs":[],"execution_count":22},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":23}],"metadata":{"name":"Titanic Data Analysis using DataFrames","notebookId":3739718737890552},"nbformat":4,"nbformat_minor":0}
2 |
--------------------------------------------------------------------------------
/Uber-Jan-Feb-FOIL.csv:
--------------------------------------------------------------------------------
1 | dispatching_base_number,date,active_vehicles,trips
2 | B02512,1/1/2015,190,1132
3 | B02765,1/1/2015,225,1765
4 | B02764,1/1/2015,3427,29421
5 | B02682,1/1/2015,945,7679
6 | B02617,1/1/2015,1228,9537
7 | B02598,1/1/2015,870,6903
8 | B02598,1/2/2015,785,4768
9 | B02617,1/2/2015,1137,7065
10 | B02512,1/2/2015,175,875
11 | B02682,1/2/2015,890,5506
12 | B02765,1/2/2015,196,1001
13 | B02764,1/2/2015,3147,19974
14 | B02765,1/3/2015,201,1526
15 | B02617,1/3/2015,1188,10664
16 | B02598,1/3/2015,818,7432
17 | B02682,1/3/2015,915,8010
18 | B02512,1/3/2015,173,1088
19 | B02764,1/3/2015,3215,29729
20 | B02512,1/4/2015,147,791
21 | B02682,1/4/2015,812,5621
22 | B02598,1/4/2015,746,5223
23 | B02765,1/4/2015,183,993
24 | B02617,1/4/2015,1088,7729
25 | B02764,1/4/2015,2862,20441
26 | B02512,1/5/2015,194,984
27 | B02682,1/5/2015,951,6012
28 | B02617,1/5/2015,1218,7899
29 | B02764,1/5/2015,3387,20926
30 | B02598,1/5/2015,907,5798
31 | B02765,1/5/2015,227,1133
32 | B02764,1/6/2015,3473,25301
33 | B02682,1/6/2015,1022,7491
34 | B02617,1/6/2015,1336,10128
35 | B02765,1/6/2015,234,1376
36 | B02512,1/6/2015,218,1314
37 | B02598,1/6/2015,933,6816
38 | B02617,1/7/2015,1363,11528
39 | B02682,1/7/2015,1039,9078
40 | B02764,1/7/2015,3603,29949
41 | B02765,1/7/2015,248,1704
42 | B02512,1/7/2015,217,1446
43 | B02598,1/7/2015,974,8397
44 | B02765,1/8/2015,262,1911
45 | B02598,1/8/2015,1070,10050
46 | B02512,1/8/2015,238,1772
47 | B02682,1/8/2015,1135,10416
48 | B02764,1/8/2015,3831,33802
49 | B02617,1/8/2015,1463,13462
50 | B02617,1/9/2015,1455,13165
51 | B02512,1/9/2015,224,1560
52 | B02764,1/9/2015,3820,33517
53 | B02682,1/9/2015,1140,10477
54 | B02598,1/9/2015,1070,9538
55 | B02765,1/9/2015,280,2039
56 | B02682,1/10/2015,1057,11629
57 | B02617,1/10/2015,1331,13856
58 | B02598,1/10/2015,949,10287
59 | B02512,1/10/2015,206,1646
60 | B02764,1/10/2015,3558,38864
61 | B02765,1/10/2015,245,2202
62 | B02765,1/11/2015,220,1672
63 | B02598,1/11/2015,832,7176
64 | B02682,1/11/2015,943,8461
65 | B02764,1/11/2015,3186,27681
66 | B02617,1/11/2015,1228,10932
67 | B02512,1/11/2015,162,1104
68 | B02764,1/12/2015,3499,26852
69 | B02765,1/12/2015,279,1711
70 | B02512,1/12/2015,217,1399
71 | B02598,1/12/2015,964,7915
72 | B02682,1/12/2015,1082,9107
73 | B02617,1/12/2015,1323,10662
74 | B02765,1/13/2015,258,1697
75 | B02598,1/13/2015,975,8713
76 | B02617,1/13/2015,1342,11825
77 | B02512,1/13/2015,234,1652
78 | B02764,1/13/2015,3658,29983
79 | B02682,1/13/2015,1092,9629
80 | B02764,1/14/2015,3736,29550
81 | B02765,1/14/2015,271,1600
82 | B02598,1/14/2015,1030,8870
83 | B02512,1/14/2015,233,1582
84 | B02617,1/14/2015,1405,11965
85 | B02682,1/14/2015,1174,9762
86 | B02512,1/15/2015,237,1636
87 | B02682,1/15/2015,1208,10391
88 | B02617,1/15/2015,1457,12539
89 | B02765,1/15/2015,270,1797
90 | B02764,1/15/2015,3840,31214
91 | B02598,1/15/2015,1068,9152
92 | B02617,1/16/2015,1445,12977
93 | B02765,1/16/2015,290,2082
94 | B02764,1/16/2015,3975,34822
95 | B02682,1/16/2015,1250,11280
96 | B02512,1/16/2015,234,1481
97 | B02598,1/16/2015,1079,9838
98 | B02598,1/17/2015,974,9546
99 | B02512,1/17/2015,201,1281
100 | B02682,1/17/2015,1137,11382
101 | B02765,1/17/2015,252,2160
102 | B02617,1/17/2015,1306,12676
103 | B02764,1/17/2015,3657,36318
104 | B02512,1/18/2015,177,1521
105 | B02598,1/18/2015,869,9443
106 | B02765,1/18/2015,248,2287
107 | B02764,1/18/2015,3290,35182
108 | B02682,1/18/2015,1056,11161
109 | B02617,1/18/2015,1223,12879
110 | B02682,1/19/2015,883,7028
111 | B02617,1/19/2015,992,7775
112 | B02765,1/19/2015,238,1568
113 | B02764,1/19/2015,2958,22750
114 | B02512,1/19/2015,168,1025
115 | B02598,1/19/2015,706,5609
116 | B02598,1/20/2015,944,7206
117 | B02682,1/20/2015,1151,8496
118 | B02512,1/20/2015,221,1310
119 | B02764,1/20/2015,3654,26137
120 | B02765,1/20/2015,272,1608
121 | B02617,1/20/2015,1350,10015
122 | B02764,1/21/2015,3718,27344
123 | B02512,1/21/2015,242,1519
124 | B02682,1/21/2015,1228,9472
125 | B02598,1/21/2015,1035,8041
126 | B02765,1/21/2015,296,1774
127 | B02617,1/21/2015,1429,10997
128 | B02617,1/22/2015,1471,12143
129 | B02764,1/22/2015,3889,30091
130 | B02512,1/22/2015,246,1551
131 | B02598,1/22/2015,1071,9080
132 | B02682,1/22/2015,1295,10699
133 | B02765,1/22/2015,295,2038
134 | B02598,1/23/2015,1093,9343
135 | B02512,1/23/2015,246,1670
136 | B02765,1/23/2015,299,2162
137 | B02764,1/23/2015,4040,33756
138 | B02617,1/23/2015,1482,13121
139 | B02682,1/23/2015,1330,11767
140 | B02598,1/24/2015,945,10040
141 | B02764,1/24/2015,3652,39187
142 | B02512,1/24/2015,211,1608
143 | B02617,1/24/2015,1367,14143
144 | B02682,1/24/2015,1223,13355
145 | B02765,1/24/2015,245,2376
146 | B02512,1/25/2015,183,1190
147 | B02764,1/25/2015,3300,28066
148 | B02765,1/25/2015,226,1755
149 | B02598,1/25/2015,829,7219
150 | B02682,1/25/2015,1046,9303
151 | B02617,1/25/2015,1203,10362
152 | B02617,1/26/2015,1150,7608
153 | B02598,1/26/2015,860,5919
154 | B02765,1/26/2015,230,1363
155 | B02764,1/26/2015,3012,19940
156 | B02682,1/26/2015,1084,7565
157 | B02512,1/26/2015,197,1000
158 | B02682,1/27/2015,600,4414
159 | B02765,1/27/2015,135,921
160 | B02617,1/27/2015,596,4325
161 | B02598,1/27/2015,434,2957
162 | B02512,1/27/2015,112,629
163 | B02764,1/27/2015,1619,11998
164 | B02764,1/28/2015,3692,28137
165 | B02682,1/28/2015,1235,10025
166 | B02765,1/28/2015,286,1913
167 | B02617,1/28/2015,1356,10862
168 | B02598,1/28/2015,1011,8071
169 | B02512,1/28/2015,235,1438
170 | B02617,1/29/2015,1474,12600
171 | B02764,1/29/2015,3959,31637
172 | B02682,1/29/2015,1316,11485
173 | B02765,1/29/2015,295,2086
174 | B02512,1/29/2015,250,1687
175 | B02598,1/29/2015,1082,9499
176 | B02512,1/30/2015,256,2016
177 | B02617,1/30/2015,1501,14793
178 | B02682,1/30/2015,1384,13852
179 | B02764,1/30/2015,4124,39110
180 | B02765,1/30/2015,322,2785
181 | B02598,1/30/2015,1106,11167
182 | B02765,1/31/2015,309,3282
183 | B02512,1/31/2015,225,1892
184 | B02617,1/31/2015,1394,15756
185 | B02682,1/31/2015,1321,15388
186 | B02764,1/31/2015,3947,44297
187 | B02598,1/31/2015,1027,11642
188 | B02598,2/1/2015,961,9499
189 | B02682,2/1/2015,1214,12436
190 | B02512,2/1/2015,193,1377
191 | B02765,2/1/2015,289,2672
192 | B02617,2/1/2015,1355,13458
193 | B02764,2/1/2015,3740,37468
194 | B02617,2/2/2015,1217,12216
195 | B02682,2/2/2015,1152,11981
196 | B02765,2/2/2015,275,2607
197 | B02598,2/2/2015,939,9511
198 | B02764,2/2/2015,3270,30761
199 | B02512,2/2/2015,227,1904
200 | B02765,2/3/2015,299,2410
201 | B02598,2/3/2015,991,9602
202 | B02512,2/3/2015,257,1915
203 | B02764,2/3/2015,3674,31641
204 | B02617,2/3/2015,1350,12665
205 | B02682,2/3/2015,1269,11955
206 | B02764,2/4/2015,3856,29994
207 | B02765,2/4/2015,309,2334
208 | B02512,2/4/2015,244,1639
209 | B02682,2/4/2015,1311,11309
210 | B02617,2/4/2015,1393,11959
211 | B02598,2/4/2015,1072,9600
212 | B02617,2/5/2015,1524,14499
213 | B02682,2/5/2015,1418,13782
214 | B02598,2/5/2015,1179,11609
215 | B02512,2/5/2015,264,2022
216 | B02765,2/5/2015,355,3011
217 | B02764,2/5/2015,4093,35990
218 | B02617,2/6/2015,1526,15417
219 | B02765,2/6/2015,385,3569
220 | B02598,2/6/2015,1181,11897
221 | B02512,2/6/2015,261,1989
222 | B02764,2/6/2015,4170,38693
223 | B02682,2/6/2015,1414,14375
224 | B02598,2/7/2015,1031,10512
225 | B02512,2/7/2015,211,1504
226 | B02617,2/7/2015,1383,13688
227 | B02682,2/7/2015,1300,13450
228 | B02764,2/7/2015,3849,38530
229 | B02765,2/7/2015,345,3473
230 | B02764,2/8/2015,3422,29692
231 | B02765,2/8/2015,313,2623
232 | B02598,2/8/2015,923,8129
233 | B02617,2/8/2015,1256,11004
234 | B02682,2/8/2015,1136,10356
235 | B02512,2/8/2015,176,1196
236 | B02617,2/9/2015,1312,10887
237 | B02682,2/9/2015,1241,10209
238 | B02598,2/9/2015,976,8135
239 | B02764,2/9/2015,3543,28266
240 | B02512,2/9/2015,228,1565
241 | B02765,2/9/2015,388,2894
242 | B02764,2/10/2015,3700,29124
243 | B02512,2/10/2015,233,1555
244 | B02617,2/10/2015,1364,11401
245 | B02765,2/10/2015,422,3432
246 | B02682,2/10/2015,1281,10536
247 | B02598,2/10/2015,1029,8718
248 | B02617,2/11/2015,1450,12749
249 | B02764,2/11/2015,3849,31889
250 | B02512,2/11/2015,255,1831
251 | B02598,2/11/2015,1115,10034
252 | B02765,2/11/2015,450,3778
253 | B02682,2/11/2015,1396,12189
254 | B02617,2/12/2015,1532,14263
255 | B02512,2/12/2015,269,2092
256 | B02682,2/12/2015,1468,13786
257 | B02765,2/12/2015,536,4609
258 | B02598,2/12/2015,1181,11640
259 | B02764,2/12/2015,4137,36844
260 | B02617,2/13/2015,1590,16996
261 | B02682,2/13/2015,1523,16088
262 | B02764,2/13/2015,4395,43561
263 | B02765,2/13/2015,599,5909
264 | B02512,2/13/2015,281,2408
265 | B02598,2/13/2015,1216,13062
266 | B02764,2/14/2015,4129,45858
267 | B02512,2/14/2015,236,2055
268 | B02598,2/14/2015,1111,12678
269 | B02765,2/14/2015,583,6307
270 | B02617,2/14/2015,1486,16999
271 | B02682,2/14/2015,1428,16448
272 | B02682,2/15/2015,1261,14517
273 | B02764,2/15/2015,3651,41209
274 | B02617,2/15/2015,1293,14662
275 | B02765,2/15/2015,521,5500
276 | B02512,2/15/2015,210,1996
277 | B02598,2/15/2015,1003,11517
278 | B02598,2/16/2015,934,9052
279 | B02512,2/16/2015,207,1576
280 | B02617,2/16/2015,1214,11824
281 | B02764,2/16/2015,3524,33448
282 | B02682,2/16/2015,1164,11323
283 | B02765,2/16/2015,508,4875
284 | B02764,2/17/2015,3826,32473
285 | B02512,2/17/2015,241,1797
286 | B02682,2/17/2015,1314,11887
287 | B02617,2/17/2015,1378,12524
288 | B02598,2/17/2015,1066,9463
289 | B02765,2/17/2015,578,4907
290 | B02598,2/18/2015,1078,9538
291 | B02682,2/18/2015,1314,11724
292 | B02617,2/18/2015,1394,12016
293 | B02765,2/18/2015,586,5059
294 | B02764,2/18/2015,3842,32317
295 | B02512,2/18/2015,228,1589
296 | B02598,2/19/2015,1127,11739
297 | B02512,2/19/2015,250,2120
298 | B02682,2/19/2015,1428,14591
299 | B02764,2/19/2015,4110,39110
300 | B02765,2/19/2015,663,6447
301 | B02617,2/19/2015,1452,14750
302 | B02764,2/20/2015,4384,44755
303 | B02617,2/20/2015,1574,16856
304 | B02598,2/20/2015,1186,12758
305 | B02682,2/20/2015,1497,16342
306 | B02765,2/20/2015,736,7824
307 | B02512,2/20/2015,272,2380
308 | B02598,2/21/2015,1044,12132
309 | B02682,2/21/2015,1374,16149
310 | B02765,2/21/2015,685,7658
311 | B02617,2/21/2015,1443,16098
312 | B02512,2/21/2015,238,2149
313 | B02764,2/21/2015,3981,44194
314 | B02512,2/22/2015,199,1312
315 | B02617,2/22/2015,1248,10696
316 | B02682,2/22/2015,1220,10970
317 | B02764,2/22/2015,3478,30157
318 | B02598,2/22/2015,909,8271
319 | B02765,2/22/2015,566,5034
320 | B02598,2/23/2015,966,8943
321 | B02617,2/23/2015,1332,11720
322 | B02764,2/23/2015,3734,31173
323 | B02682,2/23/2015,1262,11714
324 | B02765,2/23/2015,665,5823
325 | B02512,2/23/2015,238,1844
326 | B02764,2/24/2015,3965,34686
327 | B02512,2/24/2015,247,1869
328 | B02598,2/24/2015,1061,9954
329 | B02682,2/24/2015,1346,12497
330 | B02617,2/24/2015,1456,13719
331 | B02765,2/24/2015,698,6390
332 | B02512,2/25/2015,246,1647
333 | B02598,2/25/2015,1076,9405
334 | B02765,2/25/2015,706,6178
335 | B02682,2/25/2015,1395,12693
336 | B02617,2/25/2015,1473,12811
337 | B02764,2/25/2015,3934,31957
338 | B02598,2/26/2015,1134,10661
339 | B02617,2/26/2015,1539,14461
340 | B02682,2/26/2015,1465,13814
341 | B02512,2/26/2015,243,1797
342 | B02765,2/26/2015,745,6744
343 | B02764,2/26/2015,4101,36091
344 | B02765,2/27/2015,786,7563
345 | B02617,2/27/2015,1551,14677
346 | B02598,2/27/2015,1114,10755
347 | B02512,2/27/2015,272,2056
348 | B02764,2/27/2015,4253,38780
349 | B02682,2/27/2015,1510,14975
350 | B02598,2/28/2015,994,10319
351 | B02764,2/28/2015,3952,39812
352 | B02617,2/28/2015,1372,14022
353 | B02682,2/28/2015,1386,14472
354 | B02512,2/28/2015,230,1803
355 | B02765,2/28/2015,747,7753
356 |
--------------------------------------------------------------------------------
/Untitled.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "sc"
12 | ]
13 | }
14 | ],
15 | "metadata": {
16 | "kernelspec": {
17 | "display_name": "Python 2",
18 | "language": "python",
19 | "name": "python2"
20 | }
21 | },
22 | "nbformat": 4,
23 | "nbformat_minor": 2
24 | }
25 |
--------------------------------------------------------------------------------
/Untitled1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": []
11 | }
12 | ],
13 | "metadata": {
14 | "kernelspec": {
15 | "display_name": "Python 2",
16 | "language": "python",
17 | "name": "python2"
18 | },
19 | "language_info": {
20 | "codemirror_mode": {
21 | "name": "ipython",
22 | "version": 2
23 | },
24 | "file_extension": ".py",
25 | "mimetype": "text/x-python",
26 | "name": "python",
27 | "nbconvert_exporter": "python",
28 | "pygments_lexer": "ipython2",
29 | "version": "2.7.13"
30 | }
31 | },
32 | "nbformat": 4,
33 | "nbformat_minor": 2
34 | }
35 |
--------------------------------------------------------------------------------
/abc.txt/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/abc.txt/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/abc.txt/.part-00000.crc
--------------------------------------------------------------------------------
/abc.txt/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/abc.txt/_SUCCESS
--------------------------------------------------------------------------------
/abc.txt/part-00000:
--------------------------------------------------------------------------------
1 | 0
2 | 1
3 | 2
4 | 3
5 | 4
6 | 5
7 | 6
8 | 7
9 | 8
10 | 9
11 |
--------------------------------------------------------------------------------
/allstate_test.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/allstate_test.csv.zip
--------------------------------------------------------------------------------
/allstate_train.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/allstate_train.csv.zip
--------------------------------------------------------------------------------
/births_train.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/births_train.csv.gz
--------------------------------------------------------------------------------
/births_transformed.csv.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/births_transformed.csv.gz
--------------------------------------------------------------------------------
/cass_code.py:
--------------------------------------------------------------------------------
1 | from cassandra.cluster import Cluster
2 | 
3 | cluster = Cluster()  # connect to a Cassandra cluster (127.0.0.1 by default)
4 | session = cluster.connect()  # open a session for executing CQL statements
5 | 
6 | session.set_keyspace('office')  # assumes the 'office' keyspace already exists
7 | 
8 | session.execute('CREATE TABLE user (id int PRIMARY KEY, location text)')
9 | session.execute("INSERT INTO user (id, location) VALUES (%s, %s)", (11, 'abc'))
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/data/2015-summary.csv.txt:
--------------------------------------------------------------------------------
1 | DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count
2 | United States,Romania,15
3 | United States,Croatia,1
4 | United States,Ireland,344
5 | Egypt,United States,15
6 | United States,India,62
7 | United States,Singapore,1
8 | United States,Grenada,62
9 | Costa Rica,United States,588
10 | Senegal,United States,40
11 | Moldova,United States,1
12 | United States,Sint Maarten,325
13 | United States,Marshall Islands,39
14 | Guyana,United States,64
15 | Malta,United States,1
16 | Anguilla,United States,41
17 | Bolivia,United States,30
18 | United States,Paraguay,6
19 | Algeria,United States,4
20 | Turks and Caicos Islands,United States,230
21 | United States,Gibraltar,1
22 | Saint Vincent and the Grenadines,United States,1
23 | Italy,United States,382
24 | United States,Federated States of Micronesia,69
25 | United States,Russia,161
26 | Pakistan,United States,12
27 | United States,Netherlands,660
28 | Iceland,United States,181
29 | Marshall Islands,United States,42
30 | Luxembourg,United States,155
31 | Honduras,United States,362
32 | The Bahamas,United States,955
33 | United States,Senegal,42
34 | El Salvador,United States,561
35 | Samoa,United States,25
36 | United States,Angola,13
37 | Switzerland,United States,294
38 | United States,Anguilla,38
39 | Sint Maarten,United States,325
40 | Hong Kong,United States,332
41 | Trinidad and Tobago,United States,211
42 | Latvia,United States,19
43 | United States,Ecuador,300
44 | Suriname,United States,1
45 | Mexico,United States,7140
46 | United States,Cyprus,1
47 | Ecuador,United States,268
48 | United States,Portugal,134
49 | United States,Costa Rica,608
50 | United States,Guatemala,318
51 | United States,Suriname,34
52 | Colombia,United States,873
53 | United States,Cape Verde,14
54 | United States,Jamaica,712
55 | Norway,United States,121
56 | United States,Malaysia,3
57 | United States,Morocco,19
58 | Thailand,United States,3
59 | United States,Samoa,25
60 | Venezuela,United States,290
61 | United States,Palau,31
62 | United States,Venezuela,246
63 | Panama,United States,510
64 | Antigua and Barbuda,United States,126
65 | United States,Chile,185
66 | Morocco,United States,15
67 | United States,Finland,28
68 | Azerbaijan,United States,21
69 | United States,Greece,23
70 | United States,The Bahamas,986
71 | New Zealand,United States,111
72 | Liberia,United States,2
73 | United States,Hong Kong,414
74 | Hungary,United States,2
75 | United States,China,920
76 | United States,Vietnam,2
77 | Burkina Faso,United States,1
78 | Sweden,United States,118
79 | United States,Kuwait,28
80 | United States,Dominican Republic,1420
81 | United States,Egypt,12
82 | Israel,United States,134
83 | United States,United States,370002
84 | Ethiopia,United States,13
85 | United States,Luxembourg,134
86 | United States,Poland,33
87 | Martinique,United States,44
88 | United States,Saint Barthelemy,41
89 | Saint Barthelemy,United States,39
90 | Barbados,United States,154
91 | United States,Turkey,129
92 | Djibouti,United States,1
93 | United States,Azerbaijan,21
94 | United States,Estonia,1
95 | Germany,United States,1468
96 | United States,South Korea,827
97 | United States,El Salvador,508
98 | Ireland,United States,335
99 | United States,Hungary,3
100 | Zambia,United States,1
101 | Malaysia,United States,2
102 | United States,Ethiopia,12
103 | United States,Panama,465
104 | United States,Aruba,342
105 | United States,Thailand,4
106 | United States,Turks and Caicos Islands,236
107 | Croatia,United States,2
108 | United States,Pakistan,12
109 | Cyprus,United States,1
110 | United States,Honduras,407
111 | Fiji,United States,24
112 | Qatar,United States,108
113 | Saint Kitts and Nevis,United States,139
114 | Kuwait,United States,32
115 | Taiwan,United States,266
116 | Haiti,United States,226
117 | Canada,United States,8399
118 | Federated States of Micronesia,United States,69
119 | United States,Liberia,2
120 | Jamaica,United States,666
121 | United States,Malta,2
122 | Dominican Republic,United States,1353
123 | Japan,United States,1548
124 | United States,Lithuania,1
125 | Finland,United States,26
126 | United States,Guadeloupe,59
127 | United States,Ukraine,13
128 | United States,France,952
129 | United States,Norway,115
130 | Aruba,United States,346
131 | French Guiana,United States,5
132 | United States,Kiribati,35
133 | India,United States,61
134 | British Virgin Islands,United States,107
135 | Brazil,United States,853
136 | United States,Germany,1336
137 | United States,New Zealand,74
138 | French Polynesia,United States,43
139 | United Arab Emirates,United States,320
140 | Singapore,United States,3
141 | United States,Mexico,7187
142 | United States,Sweden,119
143 | Netherlands,United States,776
144 | United States,Martinique,43
145 | United States,United Arab Emirates,313
146 | United States,Bulgaria,1
147 | Denmark,United States,153
148 | China,United States,772
149 | United States,Nicaragua,201
150 | United States,Philippines,126
151 | United States,Georgia,1
152 | United States,Belgium,228
153 | Cayman Islands,United States,314
154 | Argentina,United States,180
155 | Peru,United States,279
156 | South Africa,United States,36
157 | United States,Iceland,202
158 | United States,Argentina,141
159 | Spain,United States,420
160 | Bermuda,United States,183
161 | United States,Nigeria,50
162 | United States,Austria,63
163 | United States,"Bonaire, Sint Eustatius, and Saba",59
164 | Kiribati,United States,26
165 | Saudi Arabia,United States,83
166 | Czech Republic,United States,13
167 | United States,Israel,127
168 | Belgium,United States,259
169 | United States,Saint Lucia,136
170 | United States,Bahrain,1
171 | United States,British Virgin Islands,80
172 | Curacao,United States,90
173 | Georgia,United States,2
174 | United States,Denmark,152
175 | United States,Guyana,63
176 | Philippines,United States,134
177 | Grenada,United States,53
178 | Cape Verde,United States,20
179 | Cote d'Ivoire,United States,1
180 | Ukraine,United States,14
181 | United States,Papua New Guinea,1
182 | Russia,United States,176
183 | United States,Saudi Arabia,70
184 | Guatemala,United States,397
185 | Saint Lucia,United States,123
186 | Paraguay,United States,60
187 | United States,Curacao,83
188 | Kosovo,United States,1
189 | United States,Taiwan,235
190 | Tunisia,United States,3
191 | United States,South Africa,40
192 | Niger,United States,2
193 | Turkey,United States,138
194 | United Kingdom,United States,2025
195 | Romania,United States,14
196 | United States,Greenland,4
197 | Papua New Guinea,United States,3
198 | United States,Spain,442
199 | Iraq,United States,1
200 | United States,Italy,438
201 | Cuba,United States,466
202 | United States,Switzerland,305
203 | Dominica,United States,20
204 | United States,Japan,1496
205 | Portugal,United States,127
206 | United States,Brazil,619
207 | Bahrain,United States,19
208 | United States,Peru,337
209 | Indonesia,United States,1
210 | United States,Belize,193
211 | United States,United Kingdom,1970
212 | Belize,United States,188
213 | United States,Ghana,20
214 | United States,Indonesia,2
215 | United States,Fiji,25
216 | United States,Canada,8483
217 | United States,Antigua and Barbuda,117
218 | United States,French Polynesia,40
219 | Nicaragua,United States,179
220 | United States,Latvia,15
221 | United States,Dominica,27
222 | United States,Czech Republic,12
223 | United States,Australia,258
224 | United States,Cook Islands,13
225 | Austria,United States,62
226 | Jordan,United States,44
227 | Palau,United States,30
228 | South Korea,United States,1048
229 | Angola,United States,15
230 | Ghana,United States,18
231 | New Caledonia,United States,1
232 | Guadeloupe,United States,56
233 | France,United States,935
234 | Poland,United States,32
235 | Nigeria,United States,59
236 | United States,Uruguay,13
237 | Greenland,United States,2
238 | United States,Bermuda,193
239 | Chile,United States,174
240 | United States,Cuba,478
241 | United States,Montenegro,1
242 | United States,Colombia,867
243 | United States,Barbados,130
244 | United States,Qatar,109
245 | Australia,United States,329
246 | United States,Cayman Islands,310
247 | United States,Jordan,44
248 | United States,Namibia,1
249 | United States,Trinidad and Tobago,217
250 | United States,Bolivia,13
251 | Cook Islands,United States,13
252 | Bulgaria,United States,3
253 | United States,Saint Kitts and Nevis,145
254 | Uruguay,United States,43
255 | United States,Haiti,225
256 | "Bonaire, Sint Eustatius, and Saba",United States,58
257 | Greece,United States,30
258 |
--------------------------------------------------------------------------------
/data/graphx/followers.txt:
--------------------------------------------------------------------------------
1 | 2 1
2 | 4 1
3 | 1 2
4 | 6 3
5 | 7 3
6 | 7 6
7 | 6 7
8 | 3 7
9 |
--------------------------------------------------------------------------------
/data/graphx/users.txt:
--------------------------------------------------------------------------------
1 | 1,BarackObama,Barack Obama
2 | 2,ladygaga,Goddess of Love
3 | 3,jeresig,John Resig
4 | 4,justinbieber,Justin Bieber
5 | 6,matei_zaharia,Matei Zaharia
6 | 7,odersky,Martin Odersky
7 | 8,anonsys
8 |
--------------------------------------------------------------------------------
/data/mllib/als/test.data:
--------------------------------------------------------------------------------
1 | 1,1,5.0
2 | 1,2,1.0
3 | 1,3,5.0
4 | 1,4,1.0
5 | 2,1,5.0
6 | 2,2,1.0
7 | 2,3,5.0
8 | 2,4,1.0
9 | 3,1,1.0
10 | 3,2,5.0
11 | 3,3,1.0
12 | 3,4,5.0
13 | 4,1,1.0
14 | 4,2,5.0
15 | 4,3,1.0
16 | 4,4,5.0
17 |
--------------------------------------------------------------------------------
/data/mllib/kmeans_data.txt:
--------------------------------------------------------------------------------
1 | 0.0 0.0 0.0
2 | 0.1 0.1 0.1
3 | 0.2 0.2 0.2
4 | 9.0 9.0 9.0
5 | 9.1 9.1 9.1
6 | 9.2 9.2 9.2
7 |
--------------------------------------------------------------------------------
/data/mllib/pagerank_data.txt:
--------------------------------------------------------------------------------
1 | 1 2
2 | 1 3
3 | 1 4
4 | 2 1
5 | 3 1
6 | 4 1
7 |
--------------------------------------------------------------------------------
/data/mllib/pic_data.txt:
--------------------------------------------------------------------------------
1 | 0 1 1.0
2 | 0 2 1.0
3 | 0 3 1.0
4 | 1 2 1.0
5 | 1 3 1.0
6 | 2 3 1.0
7 | 3 4 0.1
8 | 4 5 1.0
9 | 4 15 1.0
10 | 5 6 1.0
11 | 6 7 1.0
12 | 7 8 1.0
13 | 8 9 1.0
14 | 9 10 1.0
15 | 10 11 1.0
16 | 11 12 1.0
17 | 12 13 1.0
18 | 13 14 1.0
19 | 14 15 1.0
20 |
--------------------------------------------------------------------------------
/data/mllib/ridge-data/lpsa.data:
--------------------------------------------------------------------------------
1 | -0.4307829,-1.63735562648104 -2.00621178480549 -1.86242597251066 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
2 | -0.1625189,-1.98898046126935 -0.722008756122123 -0.787896192088153 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
3 | -0.1625189,-1.57881887548545 -2.1887840293994 1.36116336875686 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.155348103855541
4 | -0.1625189,-2.16691708463163 -0.807993896938655 -0.787896192088153 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
5 | 0.3715636,-0.507874475300631 -0.458834049396776 -0.250631301876899 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
6 | 0.7654678,-2.03612849966376 -0.933954647105133 -1.86242597251066 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
7 | 0.8544153,-0.557312518810673 -0.208756571683607 -0.787896192088153 0.990146852537193 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
8 | 1.2669476,-0.929360463147704 -0.0578991819441687 0.152317365781542 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
9 | 1.2669476,-2.28833047634983 -0.0706369432557794 -0.116315079324086 0.80409888772376 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
10 | 1.2669476,0.223498042876113 -1.41471935455355 -0.116315079324086 -1.02470580167082 -0.522940888712441 -0.29928234305568 0.342627053981254 0.199211097885341
11 | 1.3480731,0.107785900236813 -1.47221551299731 0.420949810887169 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.687186906466865
12 | 1.446919,0.162180092313795 -1.32557369901905 0.286633588334355 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
13 | 1.4701758,-1.49795329918548 -0.263601072284232 0.823898478545609 0.788388310173035 -0.522940888712441 -0.29928234305568 0.342627053981254 0.199211097885341
14 | 1.4929041,0.796247055396743 0.0476559407005752 0.286633588334355 -1.02470580167082 -0.522940888712441 0.394013435896129 -1.04215728919298 -0.864466507337306
15 | 1.5581446,-1.62233848461465 -0.843294091975396 -3.07127197548598 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
16 | 1.5993876,-0.990720665490831 0.458513517212311 0.823898478545609 1.07379746308195 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
17 | 1.6389967,-0.171901281967138 -0.489197399065355 -0.65357996953534 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
18 | 1.6956156,-1.60758252338831 -0.590700340358265 -0.65357996953534 -0.619561070667254 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
19 | 1.7137979,0.366273918511144 -0.414014962912583 -0.116315079324086 0.232904453212813 -0.522940888712441 0.971228997418125 0.342627053981254 1.26288870310799
20 | 1.8000583,-0.710307384579833 0.211731938156277 0.152317365781542 -1.02470580167082 -0.522940888712441 -0.442797990776478 0.342627053981254 1.61744790484887
21 | 1.8484548,-0.262791728113881 -1.16708345615721 0.420949810887169 0.0846342590816532 -0.522940888712441 0.163172393491611 0.342627053981254 1.97200710658975
22 | 1.8946169,0.899043117369237 -0.590700340358265 0.152317365781542 -1.02470580167082 -0.522940888712441 1.28643254437683 -1.04215728919298 -0.864466507337306
23 | 1.9242487,-0.903451690500615 1.07659722048274 0.152317365781542 1.28380453408541 -0.522940888712441 -0.442797990776478 -1.04215728919298 -0.864466507337306
24 | 2.008214,-0.0633337899773081 -1.38088970920094 0.958214701098423 0.80409888772376 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
25 | 2.0476928,-1.15393789990757 -0.961853075398404 -0.116315079324086 -1.02470580167082 -0.522940888712441 -0.442797990776478 -1.04215728919298 -0.864466507337306
26 | 2.1575593,0.0620203721138446 0.0657973885499142 1.22684714620405 -0.468824786336838 -0.522940888712441 1.31421001659859 1.72741139715549 -0.332627704725983
27 | 2.1916535,-0.75731027755674 -2.92717970468456 0.018001143228728 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983
28 | 2.2137539,1.11226993252773 1.06484916245061 0.555266033439982 0.877691038550889 1.89254797819741 1.43890404648442 0.342627053981254 0.376490698755783
29 | 2.2772673,-0.468768642850639 -1.43754788774533 -1.05652863719378 0.576050411655607 -0.522940888712441 0.0120483832567209 0.342627053981254 -0.687186906466865
30 | 2.2975726,-0.618884859896728 -1.1366360750781 -0.519263746982526 -1.02470580167082 -0.522940888712441 -0.863171185425945 3.11219574032972 1.97200710658975
31 | 2.3272777,-0.651431999123483 0.55329161145762 -0.250631301876899 1.11210019001038 -0.522940888712441 -0.179808625688859 -1.04215728919298 -0.864466507337306
32 | 2.5217206,0.115499102435224 -0.512233676577595 0.286633588334355 1.13650173283446 -0.522940888712441 -0.179808625688859 0.342627053981254 -0.155348103855541
33 | 2.5533438,0.266341329949937 -0.551137885443386 -0.384947524429713 0.354857790686005 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983
34 | 2.5687881,1.16902610257751 0.855491905752846 2.03274448152093 1.22628985326088 1.89254797819741 2.02833774827712 3.11219574032972 2.68112551007152
35 | 2.6567569,-0.218972367124187 0.851192298581141 0.555266033439982 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 0.908329501367106
36 | 2.677591,0.263121415733908 1.4142681068416 0.018001143228728 1.35980653053822 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
37 | 2.7180005,-0.0704736333296423 1.52000996595417 0.286633588334355 1.39364261119802 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983
38 | 2.7942279,-0.751957286017338 0.316843561689933 -1.99674219506348 0.911736065044475 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
39 | 2.8063861,-0.685277652430997 1.28214038482516 0.823898478545609 0.232904453212813 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.155348103855541
40 | 2.8124102,-0.244991501432929 0.51882005949686 -0.384947524429713 0.823246560137838 -0.522940888712441 -0.863171185425945 0.342627053981254 0.553770299626224
41 | 2.8419982,-0.75731027755674 2.09041984898851 1.22684714620405 1.53428167116843 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
42 | 2.8535925,1.20962937075363 -0.242882661178889 1.09253092365124 -1.02470580167082 -0.522940888712441 1.24263233939889 3.11219574032972 2.50384590920108
43 | 2.9204698,0.570886990493502 0.58243883987948 0.555266033439982 1.16006887775962 -0.522940888712441 1.07357183940747 0.342627053981254 1.61744790484887
44 | 2.9626924,0.719758684343624 0.984970304132004 1.09253092365124 1.52137230773457 -0.522940888712441 -0.179808625688859 0.342627053981254 -0.509907305596424
45 | 2.9626924,-1.52406140158064 1.81975700990333 0.689582255992796 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
46 | 2.9729753,-0.132431544081234 2.68769877553723 1.09253092365124 1.53428167116843 -0.522940888712441 -0.442797990776478 0.342627053981254 -0.687186906466865
47 | 3.0130809,0.436161292804989 -0.0834447307428255 -0.519263746982526 -1.02470580167082 1.89254797819741 1.07357183940747 0.342627053981254 1.26288870310799
48 | 3.0373539,-0.161195191984091 -0.671900359186746 1.7641120364153 1.13650173283446 -0.522940888712441 -0.863171185425945 0.342627053981254 0.0219314970149
49 | 3.2752562,1.39927182372944 0.513852869452676 0.689582255992796 -1.02470580167082 1.89254797819741 1.49394503405693 0.342627053981254 -0.155348103855541
50 | 3.3375474,1.51967002306341 -0.852203755696565 0.555266033439982 -0.104527297798983 1.89254797819741 1.85927724828569 0.342627053981254 0.908329501367106
51 | 3.3928291,0.560725834706224 1.87867703391426 1.09253092365124 1.39364261119802 -0.522940888712441 0.486423065822545 0.342627053981254 1.26288870310799
52 | 3.4355988,1.00765532502814 1.69426310090641 1.89842825896812 1.53428167116843 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.509907305596424
53 | 3.4578927,1.10152996153577 -0.10927271844907 0.689582255992796 -1.02470580167082 1.89254797819741 1.97630171771485 0.342627053981254 1.61744790484887
54 | 3.5160131,0.100001934217311 -1.30380956369388 0.286633588334355 0.316555063757567 -0.522940888712441 0.28786643052924 0.342627053981254 0.553770299626224
55 | 3.5307626,0.987291634724086 -0.36279314978779 -0.922212414640967 0.232904453212813 -0.522940888712441 1.79270085261407 0.342627053981254 1.26288870310799
56 | 3.5652984,1.07158528137575 0.606453149641961 1.7641120364153 -0.432854616994416 1.89254797819741 0.528504607720369 0.342627053981254 0.199211097885341
57 | 3.5876769,0.180156323255198 0.188987436375017 -0.519263746982526 1.09956763075594 -0.522940888712441 0.708239632330506 0.342627053981254 0.199211097885341
58 | 3.6309855,1.65687973755377 -0.256675483533719 0.018001143228728 -1.02470580167082 1.89254797819741 1.79270085261407 0.342627053981254 1.26288870310799
59 | 3.6800909,0.5720085322365 0.239854450210939 -0.787896192088153 1.0605418233138 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
60 | 3.7123518,0.323806133438225 -0.606717660886078 -0.250631301876899 -1.02470580167082 1.89254797819741 0.342907418101747 0.342627053981254 0.199211097885341
61 | 3.9843437,1.23668206715898 2.54220539083611 0.152317365781542 -1.02470580167082 1.89254797819741 1.89037692416194 0.342627053981254 1.26288870310799
62 | 3.993603,0.180156323255198 0.154448192444669 1.62979581386249 0.576050411655607 1.89254797819741 0.708239632330506 0.342627053981254 1.79472750571931
63 | 4.029806,1.60906277046565 1.10378605019827 0.555266033439982 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306
64 | 4.1295508,1.0036214996026 0.113496885050331 -0.384947524429713 0.860016436332751 1.89254797819741 -0.863171185425945 0.342627053981254 -0.332627704725983
65 | 4.3851468,1.25591974271076 0.577607033774471 0.555266033439982 -1.02470580167082 1.89254797819741 1.07357183940747 0.342627053981254 1.26288870310799
66 | 4.6844434,2.09650591351268 0.625488598331018 -2.66832330782754 -1.02470580167082 1.89254797819741 1.67954222367555 0.342627053981254 0.553770299626224
67 | 5.477509,1.30028987435881 0.338383613253713 0.555266033439982 1.00481276295349 1.89254797819741 1.24263233939889 0.342627053981254 1.97200710658975
68 |
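Note: the lpsa prostate data uses "label,f1 f2 ... f8", a comma between the label and the space-separated features. A parsing-and-fit sketch; RidgeRegressionWithSGD is the old RDD API (deprecated since Spark 2.0) but matches this layout directly:

from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint, RidgeRegressionWithSGD

spark = SparkSession.builder.appName('ridge-demo').getOrCreate()
sc = spark.sparkContext

def parse(line):
    # Split label from features, then features from each other
    label, features = line.split(',')
    return LabeledPoint(float(label), [float(x) for x in features.split(' ')])

data = sc.textFile('data/mllib/ridge-data/lpsa.data').map(parse)
model = RidgeRegressionWithSGD.train(data, iterations=100, step=0.1)
print(model.weights)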
--------------------------------------------------------------------------------
/data/mllib/sample_fpgrowth.txt:
--------------------------------------------------------------------------------
1 | r z h k p
2 | z y x w v u t s
3 | s x o n r
4 | x z y m t s q e
5 | z
6 | x z y r q t p
7 |
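Note: each line is one transaction of space-separated items. A frequent-itemset sketch after the standard FP-growth example; minSupport=0.5 is an illustrative threshold:

from pyspark.sql import SparkSession
from pyspark.mllib.fpm import FPGrowth

spark = SparkSession.builder.appName('fpgrowth-demo').getOrCreate()
sc = spark.sparkContext
transactions = (sc.textFile('data/mllib/sample_fpgrowth.txt')
                .map(lambda line: line.strip().split(' ')))
model = FPGrowth.train(transactions, minSupport=0.5, numPartitions=2)
for itemset in model.freqItemsets().collect():
    print(itemset)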
--------------------------------------------------------------------------------
/data/mllib/sample_isotonic_regression_libsvm_data.txt:
--------------------------------------------------------------------------------
1 | 0.24579296 1:0.01
2 | 0.28505864 1:0.02
3 | 0.31208567 1:0.03
4 | 0.35900051 1:0.04
5 | 0.35747068 1:0.05
6 | 0.16675166 1:0.06
7 | 0.17491076 1:0.07
8 | 0.04181540 1:0.08
9 | 0.04793473 1:0.09
10 | 0.03926568 1:0.10
11 | 0.12952575 1:0.11
12 | 0.00000000 1:0.12
13 | 0.01376849 1:0.13
14 | 0.13105558 1:0.14
15 | 0.08873024 1:0.15
16 | 0.12595614 1:0.16
17 | 0.15247323 1:0.17
18 | 0.25956145 1:0.18
19 | 0.20040796 1:0.19
20 | 0.19581846 1:0.20
21 | 0.15757267 1:0.21
22 | 0.13717491 1:0.22
23 | 0.19020908 1:0.23
24 | 0.19581846 1:0.24
25 | 0.20091790 1:0.25
26 | 0.16879143 1:0.26
27 | 0.18510964 1:0.27
28 | 0.20040796 1:0.28
29 | 0.29576747 1:0.29
30 | 0.43396226 1:0.30
31 | 0.53391127 1:0.31
32 | 0.52116267 1:0.32
33 | 0.48546660 1:0.33
34 | 0.49209587 1:0.34
35 | 0.54156043 1:0.35
36 | 0.59765426 1:0.36
37 | 0.56144824 1:0.37
38 | 0.58592555 1:0.38
39 | 0.52983172 1:0.39
40 | 0.50178480 1:0.40
41 | 0.52626211 1:0.41
42 | 0.58286588 1:0.42
43 | 0.64660887 1:0.43
44 | 0.68077511 1:0.44
45 | 0.74298827 1:0.45
46 | 0.64864865 1:0.46
47 | 0.67261601 1:0.47
48 | 0.65782764 1:0.48
49 | 0.69811321 1:0.49
50 | 0.63029067 1:0.50
51 | 0.61601224 1:0.51
52 | 0.63233044 1:0.52
53 | 0.65323814 1:0.53
54 | 0.65323814 1:0.54
55 | 0.67363590 1:0.55
56 | 0.67006629 1:0.56
57 | 0.51555329 1:0.57
58 | 0.50892402 1:0.58
59 | 0.33299337 1:0.59
60 | 0.36206017 1:0.60
61 | 0.43090260 1:0.61
62 | 0.45996940 1:0.62
63 | 0.56348802 1:0.63
64 | 0.54920959 1:0.64
65 | 0.48393677 1:0.65
66 | 0.48495665 1:0.66
67 | 0.46965834 1:0.67
68 | 0.45181030 1:0.68
69 | 0.45843957 1:0.69
70 | 0.47118817 1:0.70
71 | 0.51555329 1:0.71
72 | 0.58031617 1:0.72
73 | 0.55481897 1:0.73
74 | 0.56297807 1:0.74
75 | 0.56603774 1:0.75
76 | 0.57929628 1:0.76
77 | 0.64762876 1:0.77
78 | 0.66241713 1:0.78
79 | 0.69301377 1:0.79
80 | 0.65119837 1:0.80
81 | 0.68332483 1:0.81
82 | 0.66598674 1:0.82
83 | 0.73890872 1:0.83
84 | 0.73992861 1:0.84
85 | 0.84242733 1:0.85
86 | 0.91330954 1:0.86
87 | 0.88016318 1:0.87
88 | 0.90719021 1:0.88
89 | 0.93115757 1:0.89
90 | 0.93115757 1:0.90
91 | 0.91942886 1:0.91
92 | 0.92911780 1:0.92
93 | 0.95665477 1:0.93
94 | 0.95002550 1:0.94
95 | 0.96940337 1:0.95
96 | 1.00000000 1:0.96
97 | 0.89801122 1:0.97
98 | 0.90311066 1:0.98
99 | 0.90362060 1:0.99
100 | 0.83477817 1:1.0
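Note: libsvm format (label index:value) with a single feature; isotonic regression fits a monotone step function to it. A short DataFrame-API sketch:

from pyspark.sql import SparkSession
from pyspark.ml.regression import IsotonicRegression

spark = SparkSession.builder.appName('isotonic-demo').getOrCreate()
data = spark.read.format('libsvm').load('data/mllib/sample_isotonic_regression_libsvm_data.txt')
model = IsotonicRegression().fit(data)
model.transform(data).show(5)  # predictions are piecewise-constant in the feature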
--------------------------------------------------------------------------------
/data/mllib/sample_kmeans_data.txt:
--------------------------------------------------------------------------------
1 | 0 1:0.0 2:0.0 3:0.0
2 | 1 1:0.1 2:0.1 3:0.1
3 | 2 1:0.2 2:0.2 3:0.2
4 | 3 1:9.0 2:9.0 3:9.0
5 | 4 1:9.1 2:9.1 3:9.1
6 | 5 1:9.2 2:9.2 3:9.2
7 |
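Note: the same two-cluster points as kmeans_data.txt above, but in libsvm format, which loads straight into the DataFrame-based KMeans:

from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans

spark = SparkSession.builder.appName('kmeans-df').getOrCreate()
dataset = spark.read.format('libsvm').load('data/mllib/sample_kmeans_data.txt')
model = KMeans(k=2, seed=1).fit(dataset)
for center in model.clusterCenters():
    print(center)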
--------------------------------------------------------------------------------
/data/mllib/sample_lda_data.txt:
--------------------------------------------------------------------------------
1 | 1 2 6 0 2 3 1 1 0 0 3
2 | 1 3 0 1 3 0 0 2 0 0 1
3 | 1 4 1 0 0 4 9 0 1 2 0
4 | 2 1 0 3 0 0 5 0 2 3 9
5 | 3 1 1 9 3 0 2 0 0 1 3
6 | 4 2 0 3 4 5 1 1 1 4 0
7 | 2 1 0 3 0 0 5 0 2 2 9
8 | 1 1 1 9 2 1 2 0 0 1 3
9 | 4 4 0 3 4 2 1 3 0 0 0
10 | 2 8 2 0 3 0 2 0 2 7 2
11 | 1 1 1 9 0 2 2 0 0 3 3
12 | 4 1 0 0 4 5 1 3 0 1 0
13 |
--------------------------------------------------------------------------------
/data/mllib/sample_lda_libsvm_data.txt:
--------------------------------------------------------------------------------
1 | 0 1:1 2:2 3:6 4:0 5:2 6:3 7:1 8:1 9:0 10:0 11:3
2 | 1 1:1 2:3 3:0 4:1 5:3 6:0 7:0 8:2 9:0 10:0 11:1
3 | 2 1:1 2:4 3:1 4:0 5:0 6:4 7:9 8:0 9:1 10:2 11:0
4 | 3 1:2 2:1 3:0 4:3 5:0 6:0 7:5 8:0 9:2 10:3 11:9
5 | 4 1:3 2:1 3:1 4:9 5:3 6:0 7:2 8:0 9:0 10:1 11:3
6 | 5 1:4 2:2 3:0 4:3 5:4 6:5 7:1 8:1 9:1 10:4 11:0
7 | 6 1:2 2:1 3:0 4:3 5:0 6:0 7:5 8:0 9:2 10:2 11:9
8 | 7 1:1 2:1 3:1 4:9 5:2 6:1 7:2 8:0 9:0 10:1 11:3
9 | 8 1:4 2:4 3:0 4:3 5:4 6:2 7:1 8:3 9:0 10:0 11:0
10 | 9 1:2 2:8 3:2 4:0 5:3 6:0 7:2 8:0 9:2 10:7 11:2
11 | 10 1:1 2:1 3:1 4:9 5:0 6:2 7:2 8:0 9:0 10:3 11:3
12 | 11 1:4 2:1 3:0 4:0 5:4 6:5 7:1 8:3 9:0 10:1 11:0
13 |
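Note: the two LDA files above hold the same 12 documents, as raw token counts and in libsvm form. A topic-modelling sketch on the libsvm variant, after the standard example:

from pyspark.sql import SparkSession
from pyspark.ml.clustering import LDA

spark = SparkSession.builder.appName('lda-demo').getOrCreate()
dataset = spark.read.format('libsvm').load('data/mllib/sample_lda_libsvm_data.txt')
model = LDA(k=10, maxIter=10).fit(dataset)
model.describeTopics(3).show(truncate=False)  # top-3 term indices per topic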
--------------------------------------------------------------------------------
/data/mllib/sample_multiclass_classification_data.txt:
--------------------------------------------------------------------------------
1 | 1 1:-0.222222 2:0.5 3:-0.762712 4:-0.833333
2 | 1 1:-0.555556 2:0.25 3:-0.864407 4:-0.916667
3 | 1 1:-0.722222 2:-0.166667 3:-0.864407 4:-0.833333
4 | 1 1:-0.722222 2:0.166667 3:-0.694915 4:-0.916667
5 | 0 1:0.166667 2:-0.416667 3:0.457627 4:0.5
6 | 1 1:-0.833333 3:-0.864407 4:-0.916667
7 | 2 1:-1.32455e-07 2:-0.166667 3:0.220339 4:0.0833333
8 | 2 1:-1.32455e-07 2:-0.333333 3:0.0169491 4:-4.03573e-08
9 | 1 1:-0.5 2:0.75 3:-0.830508 4:-1
10 | 0 1:0.611111 3:0.694915 4:0.416667
11 | 0 1:0.222222 2:-0.166667 3:0.423729 4:0.583333
12 | 1 1:-0.722222 2:-0.166667 3:-0.864407 4:-1
13 | 1 1:-0.5 2:0.166667 3:-0.864407 4:-0.916667
14 | 2 1:-0.222222 2:-0.333333 3:0.0508474 4:-4.03573e-08
15 | 2 1:-0.0555556 2:-0.833333 3:0.0169491 4:-0.25
16 | 2 1:-0.166667 2:-0.416667 3:-0.0169491 4:-0.0833333
17 | 1 1:-0.944444 3:-0.898305 4:-0.916667
18 | 2 1:-0.277778 2:-0.583333 3:-0.0169491 4:-0.166667
19 | 0 1:0.111111 2:-0.333333 3:0.38983 4:0.166667
20 | 2 1:-0.222222 2:-0.166667 3:0.0847457 4:-0.0833333
21 | 0 1:0.166667 2:-0.333333 3:0.559322 4:0.666667
22 | 1 1:-0.611111 2:0.0833333 3:-0.864407 4:-0.916667
23 | 2 1:-0.333333 2:-0.583333 3:0.0169491 4:-4.03573e-08
24 | 0 1:0.555555 2:-0.166667 3:0.661017 4:0.666667
25 | 2 1:0.166667 3:0.186441 4:0.166667
26 | 2 1:0.111111 2:-0.75 3:0.152542 4:-4.03573e-08
27 | 2 1:0.166667 2:-0.25 3:0.118644 4:-4.03573e-08
28 | 0 1:-0.0555556 2:-0.833333 3:0.355932 4:0.166667
29 | 0 1:-0.277778 2:-0.333333 3:0.322034 4:0.583333
30 | 2 1:-0.222222 2:-0.5 3:-0.152542 4:-0.25
31 | 2 1:-0.111111 3:0.288136 4:0.416667
32 | 2 1:-0.0555556 2:-0.25 3:0.186441 4:0.166667
33 | 2 1:0.333333 2:-0.166667 3:0.355932 4:0.333333
34 | 1 1:-0.611111 2:0.25 3:-0.898305 4:-0.833333
35 | 0 1:0.166667 2:-0.333333 3:0.559322 4:0.75
36 | 0 1:0.111111 2:-0.25 3:0.559322 4:0.416667
37 | 0 1:0.833333 2:-0.166667 3:0.898305 4:0.666667
38 | 2 1:-0.277778 2:-0.166667 3:0.186441 4:0.166667
39 | 0 1:-0.666667 2:-0.583333 3:0.186441 4:0.333333
40 | 1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1
41 | 1 1:-0.166667 2:0.666667 3:-0.932203 4:-0.916667
42 | 0 1:0.0555554 2:-0.333333 3:0.288136 4:0.416667
43 | 1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1
44 | 1 1:-0.833333 2:0.166667 3:-0.864407 4:-0.833333
45 | 0 1:0.0555554 2:0.166667 3:0.491525 4:0.833333
46 | 0 1:0.722222 2:-0.333333 3:0.728813 4:0.5
47 | 2 1:-0.166667 2:-0.416667 3:0.0508474 4:-0.25
48 | 2 1:0.5 3:0.254237 4:0.0833333
49 | 0 1:0.111111 2:-0.583333 3:0.355932 4:0.5
50 | 1 1:-0.944444 2:-0.166667 3:-0.898305 4:-0.916667
51 | 2 1:0.277778 2:-0.25 3:0.220339 4:-4.03573e-08
52 | 0 1:0.666667 2:-0.25 3:0.79661 4:0.416667
53 | 0 1:0.111111 2:0.0833333 3:0.694915 4:1
54 | 0 1:0.444444 3:0.59322 4:0.833333
55 | 2 1:-0.0555556 2:0.166667 3:0.186441 4:0.25
56 | 1 1:-0.833333 2:0.333333 3:-1 4:-0.916667
57 | 1 1:-0.555556 2:0.416667 3:-0.830508 4:-0.75
58 | 2 1:-0.333333 2:-0.5 3:0.152542 4:-0.0833333
59 | 1 1:-1 2:-0.166667 3:-0.966102 4:-1
60 | 1 1:-0.333333 2:0.25 3:-0.898305 4:-0.916667
61 | 2 1:0.388889 2:-0.333333 3:0.288136 4:0.0833333
62 | 2 1:0.277778 2:-0.166667 3:0.152542 4:0.0833333
63 | 0 1:0.333333 2:0.0833333 3:0.59322 4:0.666667
64 | 1 1:-0.777778 3:-0.79661 4:-0.916667
65 | 1 1:-0.444444 2:0.416667 3:-0.830508 4:-0.916667
66 | 0 1:0.222222 2:-0.166667 3:0.627119 4:0.75
67 | 1 1:-0.555556 2:0.5 3:-0.79661 4:-0.916667
68 | 1 1:-0.555556 2:0.5 3:-0.694915 4:-0.75
69 | 2 1:-1.32455e-07 2:-0.25 3:0.254237 4:0.0833333
70 | 1 1:-0.5 2:0.25 3:-0.830508 4:-0.916667
71 | 0 1:0.166667 3:0.457627 4:0.833333
72 | 2 1:0.444444 2:-0.0833334 3:0.322034 4:0.166667
73 | 0 1:0.111111 2:0.166667 3:0.559322 4:0.916667
74 | 1 1:-0.611111 2:0.25 3:-0.79661 4:-0.583333
75 | 0 1:0.388889 3:0.661017 4:0.833333
76 | 1 1:-0.722222 2:0.166667 3:-0.79661 4:-0.916667
77 | 1 1:-0.722222 2:-0.0833334 3:-0.79661 4:-0.916667
78 | 1 1:-0.555556 2:0.166667 3:-0.830508 4:-0.916667
79 | 2 1:-0.666667 2:-0.666667 3:-0.220339 4:-0.25
80 | 2 1:-0.611111 2:-0.75 3:-0.220339 4:-0.25
81 | 2 1:0.0555554 2:-0.833333 3:0.186441 4:0.166667
82 | 0 1:-0.166667 2:-0.416667 3:0.38983 4:0.5
83 | 0 1:0.611111 2:0.333333 3:0.728813 4:1
84 | 2 1:0.0555554 2:-0.25 3:0.118644 4:-4.03573e-08
85 | 1 1:-0.666667 2:-0.166667 3:-0.864407 4:-0.916667
86 | 1 1:-0.833333 2:-0.0833334 3:-0.830508 4:-0.916667
87 | 0 1:0.611111 2:-0.166667 3:0.627119 4:0.25
88 | 0 1:0.888889 2:0.5 3:0.932203 4:0.75
89 | 2 1:0.222222 2:-0.333333 3:0.220339 4:0.166667
90 | 1 1:-0.555556 2:0.25 3:-0.864407 4:-0.833333
91 | 0 1:-1.32455e-07 2:-0.166667 3:0.322034 4:0.416667
92 | 0 1:-1.32455e-07 2:-0.5 3:0.559322 4:0.0833333
93 | 1 1:-0.611111 3:-0.932203 4:-0.916667
94 | 1 1:-0.333333 2:0.833333 3:-0.864407 4:-0.916667
95 | 0 1:-0.166667 2:-0.333333 3:0.38983 4:0.916667
96 | 2 1:-0.333333 2:-0.666667 3:-0.0847458 4:-0.25
97 | 2 1:-0.0555556 2:-0.416667 3:0.38983 4:0.25
98 | 1 1:-0.388889 2:0.416667 3:-0.830508 4:-0.916667
99 | 0 1:0.444444 2:-0.0833334 3:0.38983 4:0.833333
100 | 1 1:-0.611111 2:0.333333 3:-0.864407 4:-0.916667
101 | 0 1:0.111111 2:-0.416667 3:0.322034 4:0.416667
102 | 0 1:0.166667 2:-0.0833334 3:0.525424 4:0.416667
103 | 2 1:0.333333 2:-0.0833334 3:0.152542 4:0.0833333
104 | 0 1:-0.0555556 2:-0.166667 3:0.288136 4:0.416667
105 | 0 1:-0.166667 2:-0.416667 3:0.38983 4:0.5
106 | 1 1:-0.611111 2:0.166667 3:-0.830508 4:-0.916667
107 | 0 1:0.888889 2:-0.166667 3:0.728813 4:0.833333
108 | 2 1:-0.277778 2:-0.25 3:-0.118644 4:-4.03573e-08
109 | 2 1:-0.222222 2:-0.333333 3:0.186441 4:-4.03573e-08
110 | 0 1:0.333333 2:-0.583333 3:0.627119 4:0.416667
111 | 0 1:0.444444 2:-0.0833334 3:0.491525 4:0.666667
112 | 2 1:-0.222222 2:-0.25 3:0.0847457 4:-4.03573e-08
113 | 1 1:-0.611111 2:0.166667 3:-0.79661 4:-0.75
114 | 2 1:-0.277778 2:-0.166667 3:0.0508474 4:-4.03573e-08
115 | 0 1:1 2:0.5 3:0.830508 4:0.583333
116 | 2 1:-0.333333 2:-0.666667 3:-0.0508475 4:-0.166667
117 | 2 1:-0.277778 2:-0.416667 3:0.0847457 4:-4.03573e-08
118 | 0 1:0.888889 2:-0.333333 3:0.932203 4:0.583333
119 | 2 1:-0.111111 2:-0.166667 3:0.0847457 4:0.166667
120 | 2 1:0.111111 2:-0.583333 3:0.322034 4:0.166667
121 | 0 1:0.333333 2:0.0833333 3:0.59322 4:1
122 | 0 1:0.222222 2:-0.166667 3:0.525424 4:0.416667
123 | 1 1:-0.555556 2:0.5 3:-0.830508 4:-0.833333
124 | 0 1:-0.111111 2:-0.166667 3:0.38983 4:0.416667
125 | 0 1:0.888889 2:-0.5 3:1 4:0.833333
126 | 1 1:-0.388889 2:0.583333 3:-0.898305 4:-0.75
127 | 2 1:0.111111 2:0.0833333 3:0.254237 4:0.25
128 | 0 1:0.333333 2:-0.166667 3:0.423729 4:0.833333
129 | 1 1:-0.388889 2:0.166667 3:-0.762712 4:-0.916667
130 | 0 1:0.333333 2:-0.0833334 3:0.559322 4:0.916667
131 | 2 1:-0.333333 2:-0.75 3:0.0169491 4:-4.03573e-08
132 | 1 1:-0.222222 2:1 3:-0.830508 4:-0.75
133 | 1 1:-0.388889 2:0.583333 3:-0.762712 4:-0.75
134 | 2 1:-0.611111 2:-1 3:-0.152542 4:-0.25
135 | 2 1:-1.32455e-07 2:-0.333333 3:0.254237 4:-0.0833333
136 | 2 1:-0.5 2:-0.416667 3:-0.0169491 4:0.0833333
137 | 1 1:-0.888889 2:-0.75 3:-0.898305 4:-0.833333
138 | 1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1
139 | 2 1:-0.555556 2:-0.583333 3:-0.322034 4:-0.166667
140 | 2 1:-0.166667 2:-0.5 3:0.0169491 4:-0.0833333
141 | 1 1:-0.555556 2:0.0833333 3:-0.762712 4:-0.666667
142 | 1 1:-0.777778 3:-0.898305 4:-0.916667
143 | 0 1:0.388889 2:-0.166667 3:0.525424 4:0.666667
144 | 0 1:0.222222 3:0.38983 4:0.583333
145 | 2 1:0.333333 2:-0.0833334 3:0.254237 4:0.166667
146 | 2 1:-0.388889 2:-0.166667 3:0.186441 4:0.166667
147 | 0 1:-0.222222 2:-0.583333 3:0.355932 4:0.583333
148 | 1 1:-0.611111 2:-0.166667 3:-0.79661 4:-0.916667
149 | 1 1:-0.944444 2:-0.25 3:-0.864407 4:-0.916667
150 | 1 1:-0.388889 2:0.166667 3:-0.830508 4:-0.75
151 |
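Note: a three-class (labels 0/1/2) libsvm dataset with four rescaled features. A multinomial logistic regression sketch:

from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.appName('multiclass-lr').getOrCreate()
training = spark.read.format('libsvm').load('data/mllib/sample_multiclass_classification_data.txt')
model = LogisticRegression(maxIter=10, family='multinomial').fit(training)
print(model.coefficientMatrix)  # one coefficient row per class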
--------------------------------------------------------------------------------
/data/mllib/streaming_kmeans_data_test.txt:
--------------------------------------------------------------------------------
1 | (1.0), [1.7, 0.4, 0.9]
2 | (2.0), [2.2, 1.8, 0.0]
3 |
--------------------------------------------------------------------------------
/data/sales-data.csv:
--------------------------------------------------------------------------------
1 | Month,Sales
2 | 1-01,266
3 | 1-02,145.9
4 | 1-03,183.1
5 | 1-04,119.3
6 | 1-05,180.3
7 | 1-06,168.5
8 | 1-07,231.8
9 | 1-08,224.5
10 | 1-09,192.8
11 | 1-10,122.9
12 | 1-11,336.5
13 | 1-12,185.9
14 | 2-01,194.3
15 | 2-02,149.5
16 | 2-03,210.1
17 | 2-04,273.3
18 | 2-05,191.4
19 | 2-06,287
20 | 2-07,226
21 | 2-08,303.6
22 | 2-09,289.9
23 | 2-10,421.6
24 | 2-11,264.5
25 | 2-12,342.3
26 | 3-01,339.7
27 | 3-02,440.4
28 | 3-03,315.9
29 | 3-04,439.3
30 | 3-05,401.3
31 | 3-06,437.4
32 | 3-07,575.5
33 | 3-08,407.6
34 | 3-09,682
35 | 3-10,475.3
36 | 3-11,581.3
37 | 3-12,646.9
38 |
--------------------------------------------------------------------------------
/data/sales-funnel.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/data/sales-funnel.xlsx
--------------------------------------------------------------------------------
/data/sales-of-shampoo-over-a-three-ye.csv:
--------------------------------------------------------------------------------
1 | "Month","Sales of shampoo over a three year period"
2 | "1-01",266.0
3 | "1-02",145.9
4 | "1-03",183.1
5 | "1-04",119.3
6 | "1-05",180.3
7 | "1-06",168.5
8 | "1-07",231.8
9 | "1-08",224.5
10 | "1-09",192.8
11 | "1-10",122.9
12 | "1-11",336.5
13 | "1-12",185.9
14 | "2-01",194.3
15 | "2-02",149.5
16 | "2-03",210.1
17 | "2-04",273.3
18 | "2-05",191.4
19 | "2-06",287.0
20 | "2-07",226.0
21 | "2-08",303.6
22 | "2-09",289.9
23 | "2-10",421.6
24 | "2-11",264.5
25 | "2-12",342.3
26 | "3-01",339.7
27 | "3-02",440.4
28 | "3-03",315.9
29 | "3-04",439.3
30 | "3-05",401.3
31 | "3-06",437.4
32 | "3-07",575.5
33 | "3-08",407.6
34 | "3-09",682.0
35 | "3-10",475.3
36 | "3-11",581.3
37 | "3-12",646.9
38 |
39 | Sales of shampoo over a three year period
40 |
41 |
--------------------------------------------------------------------------------
/entry.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 |
3 |
4 | if __name__ == '__main__':
5 |     # Smoke test: build a session and count a tiny in-memory DataFrame
6 |     spark = SparkSession.builder.appName('PySpark-App').getOrCreate()
7 |     print('Session created')
8 |     emp_data = spark.createDataFrame([(1, 2), (3, 4), (5, 6)], ['a', 'b'])
9 |     print(emp_data.count())
10 |
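Note: a typical way to run this smoke test locally, assuming spark-submit is on the PATH:

spark-submit --master local[*] entry.py

It should print 'Session created' followed by 3, the row count of the three-row DataFrame.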
--------------------------------------------------------------------------------
/fakefriends.csv:
--------------------------------------------------------------------------------
1 | 0,Will,33,385
2 | 1,Jean-Luc,26,2
3 | 2,Hugh,55,221
4 | 3,Deanna,40,465
5 | 4,Quark,68,21
6 | 5,Weyoun,59,318
7 | 6,Gowron,37,220
8 | 7,Will,54,307
9 | 8,Jadzia,38,380
10 | 9,Hugh,27,181
11 | 10,Odo,53,191
12 | 11,Ben,57,372
13 | 12,Keiko,54,253
14 | 13,Jean-Luc,56,444
15 | 14,Hugh,43,49
16 | 15,Rom,36,49
17 | 16,Weyoun,22,323
18 | 17,Odo,35,13
19 | 18,Jean-Luc,45,455
20 | 19,Geordi,60,246
21 | 20,Odo,67,220
22 | 21,Miles,19,268
23 | 22,Quark,30,72
24 | 23,Keiko,51,271
25 | 24,Julian,25,1
26 | 25,Ben,21,445
27 | 26,Julian,22,100
28 | 27,Leeta,42,363
29 | 28,Martok,49,476
30 | 29,Nog,48,364
31 | 30,Keiko,50,175
32 | 31,Miles,39,161
33 | 32,Nog,26,281
34 | 33,Dukat,53,197
35 | 34,Jean-Luc,43,249
36 | 35,Beverly,27,305
37 | 36,Kasidy,32,81
38 | 37,Geordi,58,21
39 | 38,Deanna,64,65
40 | 39,Morn,31,192
41 | 40,Odo,52,413
42 | 41,Hugh,67,167
43 | 42,Brunt,54,75
44 | 43,Guinan,58,345
45 | 44,Nerys,35,244
46 | 45,Dukat,52,77
47 | 46,Morn,25,96
48 | 47,Brunt,24,49
49 | 48,Nog,20,1
50 | 49,Ezri,40,254
51 | 50,Quark,51,283
52 | 51,Lwaxana,36,212
53 | 52,Beverly,19,269
54 | 53,Geordi,62,31
55 | 54,Brunt,19,5
56 | 55,Keiko,41,278
57 | 56,Gowron,44,194
58 | 57,Odo,57,294
59 | 58,Hugh,59,158
60 | 59,Morn,59,284
61 | 60,Geordi,20,100
62 | 61,Kasidy,62,442
63 | 62,Keiko,69,9
64 | 63,Jean-Luc,58,54
65 | 64,Elim,31,15
66 | 65,Guinan,52,169
67 | 66,Geordi,21,477
68 | 67,Jadzia,48,135
69 | 68,Guinan,33,74
70 | 69,Jean-Luc,30,204
71 | 70,Brunt,52,393
72 | 71,Geordi,45,184
73 | 72,Kasidy,22,179
74 | 73,Brunt,20,384
75 | 74,Leeta,65,208
76 | 75,Morn,40,459
77 | 76,Will,62,201
78 | 77,Weyoun,40,407
79 | 78,Data,61,337
80 | 79,Leeta,58,348
81 | 80,Dukat,67,445
82 | 81,Jadzia,54,440
83 | 82,Hugh,57,465
84 | 83,Geordi,32,308
85 | 84,Ben,28,311
86 | 85,Quark,66,383
87 | 86,Hugh,55,257
88 | 87,Ezri,31,481
89 | 88,Ben,66,188
90 | 89,Worf,24,492
91 | 90,Kasidy,33,471
92 | 91,Rom,46,88
93 | 92,Gowron,54,7
94 | 93,Elim,46,63
95 | 94,Morn,62,133
96 | 95,Odo,29,173
97 | 96,Ezri,25,233
98 | 97,Nerys,69,361
99 | 98,Will,44,178
100 | 99,Keiko,69,491
101 | 100,Jean-Luc,61,460
102 | 101,Morn,67,123
103 | 102,Dukat,40,18
104 | 103,Ezri,61,2
105 | 104,Dukat,32,142
106 | 105,Morn,50,417
107 | 106,Beverly,18,499
108 | 107,Will,64,419
109 | 108,Leeta,25,274
110 | 109,Quark,53,417
111 | 110,Nog,64,137
112 | 111,Nerys,37,46
113 | 112,Morn,25,13
114 | 113,Quark,41,244
115 | 114,Worf,33,275
116 | 115,Dukat,18,397
117 | 116,Ben,69,75
118 | 117,Rom,52,487
119 | 118,Ben,28,304
120 | 119,Worf,29,344
121 | 120,Jean-Luc,68,264
122 | 121,Deanna,35,355
123 | 122,Data,45,400
124 | 123,Jadzia,45,439
125 | 124,Data,47,429
126 | 125,Rom,66,284
127 | 126,Brunt,26,84
128 | 127,Miles,40,284
129 | 128,Julian,34,221
130 | 129,Kasidy,45,252
131 | 130,Gowron,67,350
132 | 131,Hugh,65,309
133 | 132,Odo,46,462
134 | 133,Quark,19,265
135 | 134,Ben,45,340
136 | 135,Rom,42,427
137 | 136,Will,19,335
138 | 137,Martok,28,32
139 | 138,Dukat,32,384
140 | 139,Nog,36,193
141 | 140,Elim,64,234
142 | 141,Miles,36,424
143 | 142,Guinan,59,335
144 | 143,Data,60,124
145 | 144,Miles,22,93
146 | 145,Leeta,45,470
147 | 146,Nerys,58,174
148 | 147,Quark,61,373
149 | 148,Nerys,39,248
150 | 149,Beverly,49,340
151 | 150,Nerys,55,313
152 | 151,Keiko,54,441
153 | 152,Kasidy,54,235
154 | 153,Morn,63,342
155 | 154,Geordi,40,389
156 | 155,Beverly,50,126
157 | 156,Deanna,44,360
158 | 157,Dukat,34,319
159 | 158,Odo,31,340
160 | 159,Kasidy,67,438
161 | 160,Beverly,58,112
162 | 161,Odo,39,207
163 | 162,Ezri,59,14
164 | 163,Nerys,67,204
165 | 164,Will,31,172
166 | 165,Leeta,26,282
167 | 166,Lwaxana,25,10
168 | 167,Quark,48,57
169 | 168,Martok,68,112
170 | 169,Beverly,53,92
171 | 170,Jean-Luc,68,490
172 | 171,Weyoun,29,126
173 | 172,Kasidy,55,204
174 | 173,Leeta,23,129
175 | 174,Deanna,47,87
176 | 175,Will,38,459
177 | 176,Worf,55,474
178 | 177,Brunt,67,316
179 | 178,Kasidy,26,381
180 | 179,Elim,37,426
181 | 180,Kasidy,30,108
182 | 181,Rom,43,404
183 | 182,Weyoun,26,145
184 | 183,Ben,47,488
185 | 184,Julian,44,84
186 | 185,Weyoun,48,287
187 | 186,Miles,31,109
188 | 187,Nerys,47,225
189 | 188,Keiko,54,369
190 | 189,Quark,62,23
191 | 190,Geordi,60,294
192 | 191,Nog,40,349
193 | 192,Jadzia,45,497
194 | 193,Nerys,60,125
195 | 194,Kasidy,38,2
196 | 195,Ben,30,376
197 | 196,Data,38,173
198 | 197,Leeta,38,76
199 | 198,Brunt,48,381
200 | 199,Hugh,38,180
201 | 200,Kasidy,21,472
202 | 201,Ezri,23,174
203 | 202,Lwaxana,63,469
204 | 203,Ezri,46,125
205 | 204,Deanna,64,164
206 | 205,Morn,69,236
207 | 206,Will,21,491
208 | 207,Lwaxana,41,206
209 | 208,Nog,37,271
210 | 209,Brunt,27,174
211 | 210,Data,33,245
212 | 211,Ben,61,73
213 | 212,Geordi,55,284
214 | 213,Worf,28,312
215 | 214,Miles,32,182
216 | 215,Will,22,6
217 | 216,Brunt,34,116
218 | 217,Keiko,29,260
219 | 218,Gowron,66,350
220 | 219,Lwaxana,26,345
221 | 220,Jean-Luc,41,394
222 | 221,Dukat,27,150
223 | 222,Rom,34,346
224 | 223,Odo,40,406
225 | 224,Keiko,44,277
226 | 225,Elim,19,106
227 | 226,Lwaxana,37,207
228 | 227,Ezri,40,198
229 | 228,Martok,26,293
230 | 229,Gowron,24,150
231 | 230,Beverly,54,397
232 | 231,Ezri,59,42
233 | 232,Worf,68,481
234 | 233,Gowron,67,70
235 | 234,Deanna,49,22
236 | 235,Elim,57,8
237 | 236,Brunt,62,442
238 | 237,Nerys,61,469
239 | 238,Deanna,25,305
240 | 239,Nog,48,345
241 | 240,Deanna,46,154
242 | 241,Quark,45,332
243 | 242,Data,25,101
244 | 243,Martok,61,68
245 | 244,Dukat,21,471
246 | 245,Jean-Luc,28,174
247 | 246,Leeta,41,260
248 | 247,Ezri,52,338
249 | 248,Dukat,21,138
250 | 249,Nerys,66,41
251 | 250,Hugh,36,342
252 | 251,Rom,55,57
253 | 252,Will,36,174
254 | 253,Leeta,69,116
255 | 254,Ezri,67,79
256 | 255,Deanna,60,324
257 | 256,Worf,32,412
258 | 257,Data,51,161
259 | 258,Worf,68,217
260 | 259,Kasidy,29,11
261 | 260,Brunt,38,96
262 | 261,Jadzia,40,172
263 | 262,Will,51,334
264 | 263,Martok,40,33
265 | 264,Julian,29,228
266 | 265,Gowron,27,471
267 | 266,Jean-Luc,66,496
268 | 267,Dukat,49,106
269 | 268,Ezri,26,298
270 | 269,Beverly,55,289
271 | 270,Data,44,353
272 | 271,Morn,25,446
273 | 272,Quark,29,367
274 | 273,Data,51,493
275 | 274,Julian,64,244
276 | 275,Will,47,13
277 | 276,Dukat,54,462
278 | 277,Hugh,46,300
279 | 278,Data,44,499
280 | 279,Beverly,23,133
281 | 280,Nerys,26,492
282 | 281,Worf,21,89
283 | 282,Geordi,32,404
284 | 283,Dukat,65,443
285 | 284,Nog,26,269
286 | 285,Data,43,101
287 | 286,Lwaxana,30,384
288 | 287,Beverly,64,396
289 | 288,Hugh,56,354
290 | 289,Ezri,30,221
291 | 290,Beverly,62,290
292 | 291,Dukat,23,373
293 | 292,Nog,63,380
294 | 293,Deanna,23,65
295 | 294,Leeta,38,410
296 | 295,Nerys,40,56
297 | 296,Data,38,454
298 | 297,Ben,45,395
299 | 298,Guinan,57,207
300 | 299,Rom,57,311
301 | 300,Beverly,49,147
302 | 301,Weyoun,28,108
303 | 302,Beverly,37,263
304 | 303,Deanna,46,319
305 | 304,Will,19,404
306 | 305,Quark,29,182
307 | 306,Beverly,23,323
308 | 307,Keiko,41,340
309 | 308,Morn,45,59
310 | 309,Geordi,67,153
311 | 310,Odo,68,189
312 | 311,Martok,43,48
313 | 312,Jadzia,61,421
314 | 313,Dukat,59,169
315 | 314,Geordi,36,168
316 | 315,Weyoun,25,208
317 | 316,Hugh,64,391
318 | 317,Guinan,59,439
319 | 318,Deanna,35,251
320 | 319,Leeta,30,476
321 | 320,Worf,62,450
322 | 321,Data,44,61
323 | 322,Rom,58,92
324 | 323,Nog,29,236
325 | 324,Miles,56,343
326 | 325,Keiko,51,492
327 | 326,Beverly,46,407
328 | 327,Julian,20,63
329 | 328,Deanna,62,41
330 | 329,Dukat,67,35
331 | 330,Ezri,33,356
332 | 331,Martok,30,17
333 | 332,Julian,55,362
334 | 333,Ben,29,207
335 | 334,Leeta,40,7
336 | 335,Odo,27,337
337 | 336,Gowron,47,4
338 | 337,Miles,58,10
339 | 338,Will,28,180
340 | 339,Morn,66,305
341 | 340,Nerys,57,275
342 | 341,Data,18,326
343 | 342,Guinan,46,151
344 | 343,Odo,26,254
345 | 344,Data,30,487
346 | 345,Ezri,31,394
347 | 346,Hugh,29,329
348 | 347,Geordi,32,24
349 | 348,Weyoun,33,460
350 | 349,Kasidy,20,277
351 | 350,Nog,55,464
352 | 351,Keiko,54,72
353 | 352,Deanna,27,53
354 | 353,Julian,64,499
355 | 354,Kasidy,69,15
356 | 355,Keiko,46,352
357 | 356,Weyoun,67,149
358 | 357,Brunt,26,7
359 | 358,Will,52,276
360 | 359,Nog,54,442
361 | 360,Nerys,39,68
362 | 361,Worf,68,206
363 | 362,Ezri,39,120
364 | 363,Dukat,41,397
365 | 364,Lwaxana,54,115
366 | 365,Brunt,65,430
367 | 366,Keiko,19,119
368 | 367,Data,39,106
369 | 368,Elim,26,383
370 | 369,Quark,48,266
371 | 370,Jadzia,53,86
372 | 371,Guinan,31,435
373 | 372,Brunt,62,273
374 | 373,Quark,19,272
375 | 374,Nog,68,293
376 | 375,Hugh,66,201
377 | 376,Gowron,23,392
378 | 377,Beverly,18,418
379 | 378,Guinan,47,97
380 | 379,Data,60,304
381 | 380,Brunt,35,65
382 | 381,Nog,38,95
383 | 382,Worf,66,240
384 | 383,Data,69,148
385 | 384,Martok,67,355
386 | 385,Beverly,57,436
387 | 386,Data,35,428
388 | 387,Will,43,335
389 | 388,Nog,30,184
390 | 389,Weyoun,38,38
391 | 390,Martok,22,266
392 | 391,Ben,64,309
393 | 392,Data,64,343
394 | 393,Quark,50,436
395 | 394,Keiko,23,230
396 | 395,Jean-Luc,56,15
397 | 396,Keiko,67,38
398 | 397,Quark,69,470
399 | 398,Lwaxana,26,124
400 | 399,Beverly,24,401
401 | 400,Data,29,128
402 | 401,Jean-Luc,42,467
403 | 402,Hugh,58,98
404 | 403,Weyoun,21,224
405 | 404,Kasidy,18,24
406 | 405,Nog,56,371
407 | 406,Ben,57,121
408 | 407,Miles,36,68
409 | 408,Dukat,62,496
410 | 409,Nog,19,267
411 | 410,Odo,35,299
412 | 411,Lwaxana,58,22
413 | 412,Jadzia,53,451
414 | 413,Hugh,45,147
415 | 414,Martok,56,313
416 | 415,Quark,30,65
417 | 416,Nerys,33,294
418 | 417,Julian,37,106
419 | 418,Guinan,32,212
420 | 419,Kasidy,55,176
421 | 420,Jadzia,26,391
422 | 421,Will,40,261
423 | 422,Ben,67,292
424 | 423,Will,44,388
425 | 424,Keiko,55,470
426 | 425,Quark,33,243
427 | 426,Worf,24,77
428 | 427,Brunt,28,258
429 | 428,Lwaxana,68,423
430 | 429,Jean-Luc,63,345
431 | 430,Geordi,36,493
432 | 431,Quark,36,343
433 | 432,Brunt,45,54
434 | 433,Ezri,38,203
435 | 434,Deanna,57,289
436 | 435,Guinan,42,275
437 | 436,Geordi,57,229
438 | 437,Morn,59,221
439 | 438,Nog,42,95
440 | 439,Data,18,417
441 | 440,Elim,48,394
442 | 441,Jadzia,38,143
443 | 442,Nog,46,105
444 | 443,Geordi,64,175
445 | 444,Keiko,18,472
446 | 445,Guinan,40,286
447 | 446,Quark,32,41
448 | 447,Julian,38,34
449 | 448,Nerys,48,439
450 | 449,Data,52,419
451 | 450,Weyoun,37,234
452 | 451,Martok,28,34
453 | 452,Ezri,58,6
454 | 453,Julian,44,337
455 | 454,Weyoun,52,456
456 | 455,Elim,33,463
457 | 456,Ezri,37,471
458 | 457,Worf,51,81
459 | 458,Elim,44,335
460 | 459,Geordi,26,84
461 | 460,Hugh,47,400
462 | 461,Geordi,41,236
463 | 462,Nerys,23,287
464 | 463,Keiko,40,220
465 | 464,Beverly,25,485
466 | 465,Morn,53,126
467 | 466,Brunt,33,228
468 | 467,Weyoun,42,194
469 | 468,Ezri,46,227
470 | 469,Brunt,55,271
471 | 470,Deanna,38,160
472 | 471,Brunt,52,273
473 | 472,Nog,27,154
474 | 473,Morn,35,38
475 | 474,Keiko,34,48
476 | 475,Ben,52,446
477 | 476,Jean-Luc,28,378
478 | 477,Gowron,50,119
479 | 478,Dukat,41,62
480 | 479,Kasidy,44,320
481 | 480,Geordi,43,428
482 | 481,Elim,32,97
483 | 482,Ben,48,146
484 | 483,Hugh,57,99
485 | 484,Leeta,22,478
486 | 485,Rom,47,356
487 | 486,Elim,49,17
488 | 487,Brunt,69,431
489 | 488,Nog,61,103
490 | 489,Odo,33,410
491 | 490,Nerys,65,101
492 | 491,Rom,60,2
493 | 492,Dukat,19,36
494 | 493,Hugh,23,357
495 | 494,Kasidy,18,194
496 | 495,Data,46,155
497 | 496,Gowron,39,275
498 | 497,Lwaxana,34,423
499 | 498,Jadzia,62,36
500 | 499,Leeta,62,12
501 |
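Note: rows are id,name,age,numFriends with no header (the column names below are our own labels). The classic exercise on this file is the average friend count by age:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('fakefriends').getOrCreate()
df = (spark.read.csv('fakefriends.csv', inferSchema=True)
      .toDF('id', 'name', 'age', 'friends'))
df.groupBy('age').avg('friends').orderBy('age').show()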
--------------------------------------------------------------------------------
/hadoop-2.7.1.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/hadoop-2.7.1.zip
--------------------------------------------------------------------------------
/hr_data_analysis.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql import functions as F
3 |
4 | def getCountHardWorkingLessPaid(hr_data):
5 |     # Highly satisfied employees stuck in the lowest salary band
6 |     return hr_data[(hr_data.satisfaction_level > .9) & (hr_data.salary == "low")].count()
7 |
8 | def increaseSalary(hr_data):
9 |     # DataFrames are immutable: withColumn returns a new DataFrame,
10 |     # so the rebuilt frame must be returned to the caller.
11 |     hr_data = hr_data.withColumn('ActualSalary', hr_data.last_evaluation * 10000)
12 |     hr_data = hr_data.withColumn('multifactor',
13 |                                  F.when(hr_data.salary == "low", 1)
14 |                                  .when(hr_data.salary == "medium", 2)
15 |                                  .otherwise(3))
16 |     hr_data = hr_data.withColumn('ActualSalary', hr_data.ActualSalary * hr_data.multifactor)
17 |     hr_data = hr_data.drop('multifactor')
18 |     return hr_data
19 |
20 |
21 | if __name__ == '__main__':
22 |
23 |     spark = SparkSession.builder.appName('HR-Data-Analysis').getOrCreate()
24 |     print('Session created')
25 |
26 |     hr_data = spark.read.csv('hr_data.csv', inferSchema=True, header=True)
27 |
28 |     hr_data = hr_data.withColumnRenamed("sales", "department")
29 |
30 |     # Cache the DataFrame: it is reused by several actions below
31 |     hr_data = hr_data.cache()
32 |     count = getCountHardWorkingLessPaid(hr_data)
33 |     print('Count of hardworking & less paid folks:', count)
34 |
35 |     # Reassign: increaseSalary returns a new DataFrame rather than mutating in place
36 |     hr_data = increaseSalary(hr_data)
37 |     hr_data.show()  # show() prints the rows itself and returns None
38 |
--------------------------------------------------------------------------------
/kddcup.data_10_percent.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/kddcup.data_10_percent.gz
--------------------------------------------------------------------------------
/metastore_db/README_DO_NOT_TOUCH_FILES.txt:
--------------------------------------------------------------------------------
1 |
2 | # *************************************************************************
3 | # *** DO NOT TOUCH FILES IN THIS DIRECTORY! ***
4 | # *** FILES IN THIS DIRECTORY AND SUBDIRECTORIES CONSTITUTE A DERBY ***
5 | # *** DATABASE, WHICH INCLUDES THE DATA (USER AND SYSTEM) AND THE ***
6 | # *** FILES NECESSARY FOR DATABASE RECOVERY. ***
7 | # *** EDITING, ADDING, OR DELETING ANY OF THESE FILES MAY CAUSE DATA ***
8 | # *** CORRUPTION AND LEAVE THE DATABASE IN A NON-RECOVERABLE STATE. ***
9 | # *************************************************************************
--------------------------------------------------------------------------------
/metastore_db/db.lck:
--------------------------------------------------------------------------------
1 | $40348015-015f-c31d-a9cd-000000fc8a88
--------------------------------------------------------------------------------
/metastore_db/dbex.lck:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/metastore_db/log/README_DO_NOT_TOUCH_FILES.txt:
--------------------------------------------------------------------------------
1 |
2 | # *************************************************************************
3 | # *** DO NOT TOUCH FILES IN THIS DIRECTORY! ***
4 | # *** FILES IN THIS DIRECTORY ARE USED BY THE DERBY DATABASE RECOVERY ***
5 | # *** SYSTEM. EDITING, ADDING, OR DELETING FILES IN THIS DIRECTORY ***
6 | # *** WILL CAUSE THE DERBY RECOVERY SYSTEM TO FAIL, LEADING TO ***
7 | # *** NON-RECOVERABLE CORRUPT DATABASES. ***
8 | # *************************************************************************
--------------------------------------------------------------------------------
/metastore_db/log/log.ctrl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/log/log.ctrl
--------------------------------------------------------------------------------
/metastore_db/log/log1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/log/log1.dat
--------------------------------------------------------------------------------
/metastore_db/log/logmirror.ctrl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/log/logmirror.ctrl
--------------------------------------------------------------------------------
/metastore_db/seg0/README_DO_NOT_TOUCH_FILES.txt:
--------------------------------------------------------------------------------
1 |
2 | # *************************************************************************
3 | # *** DO NOT TOUCH FILES IN THIS DIRECTORY! ***
4 | # *** FILES IN THIS DIRECTORY ARE USED BY THE DERBY DATABASE TO STORE ***
5 | # *** USER AND SYSTEM DATA. EDITING, ADDING, OR DELETING FILES IN THIS ***
6 | # *** DIRECTORY WILL CORRUPT THE ASSOCIATED DERBY DATABASE AND MAKE ***
7 | # *** IT NON-RECOVERABLE. ***
8 | # *************************************************************************
--------------------------------------------------------------------------------
/metastore_db/seg0/c10.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c10.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c101.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c101.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c111.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c111.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c121.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c121.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c130.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c130.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c141.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c141.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c150.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c150.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c161.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c161.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c171.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c171.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c180.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c180.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c191.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c191.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c1a1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c1a1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c1b1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c1b1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c1c0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c1c0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c1d1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c1d1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c1e0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c1e0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c1f1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c1f1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c20.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c20.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c200.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c200.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c211.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c211.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c221.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c221.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c230.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c230.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c241.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c241.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c251.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c251.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c260.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c260.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c271.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c271.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c281.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c281.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c290.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c290.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c2a1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c2a1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c2b1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c2b1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c2c1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c2c1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c2d0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c2d0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c2e1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c2e1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c2f0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c2f0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c300.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c300.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c31.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c31.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c311.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c311.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c321.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c321.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c331.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c331.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c340.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c340.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c351.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c351.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c361.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c361.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c371.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c371.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c380.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c380.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c391.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c391.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c3a1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c3a1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c3b1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c3b1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c3c0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c3c0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c3d1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c3d1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c3e1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c3e1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c3f1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c3f1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c400.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c400.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c41.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c41.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c411.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c411.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c421.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c421.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c430.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c430.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c441.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c441.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c451.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c451.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c461.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c461.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c470.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c470.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c481.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c481.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c490.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c490.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c4a1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c4a1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c4b0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c4b0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c4c1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c4c1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c4d1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c4d1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c4e1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c4e1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c4f0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c4f0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c501.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c501.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c51.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c51.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c510.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c510.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c521.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c521.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c530.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c530.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c541.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c541.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c550.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c550.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c561.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c561.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c570.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c570.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c581.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c581.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c590.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c590.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c5a1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c5a1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c5b0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c5b0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c5c1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c5c1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c5d0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c5d0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c5e1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c5e1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c5f0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c5f0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c60.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c60.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c601.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c601.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c610.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c610.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c621.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c621.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c630.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c630.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c641.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c641.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c650.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c650.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c661.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c661.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c670.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c670.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c681.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c681.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c690.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c690.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c6a1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c6a1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c6b0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c6b0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c6c1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c6c1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c6d0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c6d0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c6e1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c6e1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c6f0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c6f0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c701.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c701.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c71.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c71.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c711.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c711.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c721.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c721.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c731.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c731.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c741.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c741.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c751.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c751.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c761.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c761.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c771.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c771.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c781.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c781.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c791.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c791.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c7a1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c7a1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c7b1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c7b1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c7c1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c7c1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c7d1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c7d1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c7e1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c7e1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c7f1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c7f1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c801.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c801.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c81.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c81.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c811.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c811.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c821.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c821.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c831.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c831.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c840.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c840.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c851.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c851.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c860.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c860.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c871.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c871.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c880.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c880.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c891.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c891.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c8a0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c8a0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c8b1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c8b1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c8c1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c8c1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c8d1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c8d1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c8e1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c8e1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c8f1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c8f1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c90.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c90.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c901.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c901.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c911.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c911.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c920.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c920.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c931.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c931.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c940.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c940.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c951.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c951.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c960.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c960.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c971.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c971.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c981.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c981.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c990.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c990.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c9a1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c9a1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c9b1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c9b1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c9c0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c9c0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c9d1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c9d1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c9e0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c9e0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/c9f1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c9f1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/ca01.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/ca01.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/ca1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/ca1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/ca11.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/ca11.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/ca21.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/ca21.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/cb1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/cb1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/cc0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/cc0.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/cd1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/cd1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/ce1.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/ce1.dat
--------------------------------------------------------------------------------
/metastore_db/seg0/cf0.dat:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/cf0.dat
--------------------------------------------------------------------------------
/metastore_db/service.properties:
--------------------------------------------------------------------------------
1 | #/tensorFolder/pyspark/metastore_db
2 | # ********************************************************************
3 | # *** Please do NOT edit this file. ***
4 | # *** CHANGING THE CONTENT OF THIS FILE MAY CAUSE DATA CORRUPTION. ***
5 | # ********************************************************************
6 | #Mon Nov 13 05:00:30 UTC 2017
7 | SysschemasIndex2Identifier=225
8 | SyscolumnsIdentifier=144
9 | SysconglomeratesIndex1Identifier=49
10 | SysconglomeratesIdentifier=32
11 | SyscolumnsIndex2Identifier=177
12 | SysschemasIndex1Identifier=209
13 | SysconglomeratesIndex3Identifier=81
14 | SystablesIndex2Identifier=129
15 | SyscolumnsIndex1Identifier=161
16 | derby.serviceProtocol=org.apache.derby.database.Database
17 | SysschemasIdentifier=192
18 | derby.storage.propertiesId=16
19 | SysconglomeratesIndex2Identifier=65
20 | derby.serviceLocale=en_US
21 | SystablesIdentifier=96
22 | SystablesIndex1Identifier=113
23 | #--- last line, don't put anything after this line ---
24 |
--------------------------------------------------------------------------------
/nifi_script.py:
--------------------------------------------------------------------------------
1 | import json
2 | from org.apache.commons.io import IOUtils
3 | from java.nio.charset import StandardCharsets
4 | from org.apache.nifi.processor.io import StreamCallback
5 |
6 | # StreamCallback that replaces the FlowFile's JSON content with a new object.
7 | class ModJSON(StreamCallback):
8 |     def __init__(self):
9 |         pass
10 |
11 |     def process(self, inputStream, outputStream):
12 |         # Read and parse the incoming payload (parsing validates the JSON).
13 |         text = IOUtils.toString(inputStream, StandardCharsets.UTF_8)
14 |         obj = json.loads(text)
15 |         newObj = {
16 |             "Source": "NiFi",
17 |             "Dest": "Stuff",
18 |         }
19 |         outputStream.write(bytearray(json.dumps(newObj, indent=4).encode('utf-8')))
20 |
21 | # 'session' and 'REL_SUCCESS' are injected by NiFi's ExecuteScript processor.
22 | flowFile = session.get()
23 | if flowFile is not None:
24 |     flowFile = session.write(flowFile, ModJSON())
25 |     flowFile = session.putAttribute(flowFile, "filename", flowFile.getAttribute('filename').split('.')[0] + '_translated.json')
26 |     session.transfer(flowFile, REL_SUCCESS)
27 | session.commit()
28 |
--------------------------------------------------------------------------------
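Note: the script above runs inside NiFi's ExecuteScript processor under Jython, where session and REL_SUCCESS are injected by the framework, so it cannot be executed directly. A minimal pure-Python sketch of the same transformation (the function name and stream handling are stand-ins, not NiFi APIs) that lets the JSON rewrite be sanity-checked locally:

    import io
    import json

    def mod_json(input_stream, output_stream):
        text = input_stream.read().decode('utf-8')   # stands in for IOUtils.toString
        json.loads(text)                             # validate the incoming payload
        new_obj = {"Source": "NiFi", "Dest": "Stuff"}
        output_stream.write(json.dumps(new_obj, indent=4).encode('utf-8'))

    inp = io.BytesIO(b'{"hello": "world"}')
    out = io.BytesIO()
    mod_json(inp, out)
    print(out.getvalue().decode('utf-8'))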
/om.py:
--------------------------------------------------------------------------------
1 | import uuid
2 | from cassandra.cqlengine import columns
3 | from cassandra.cqlengine import connection
4 | from datetime import datetime
5 | from cassandra.cqlengine.management import sync_table
6 | from cassandra.cqlengine.models import Model
7 |
8 | #first, define a model
9 | class ExampleModel(Model):
10 |     example_id = columns.UUID(primary_key=True, default=uuid.uuid4)
11 |     example_type = columns.Integer(index=True)
12 |     created_at = columns.DateTime()
13 |     description = columns.Text(required=False)
14 |
15 |
16 | connection.setup(['127.0.0.1'], "cqlengine", protocol_version=3)
17 |
18 | sync_table(ExampleModel)
19 |
20 | em1 = ExampleModel.create(example_type=0, description="example1", created_at=datetime.now())
21 | em2 = ExampleModel.create(example_type=0, description="example2", created_at=datetime.now())
22 | em3 = ExampleModel.create(example_type=0, description="example3", created_at=datetime.now())
23 | em4 = ExampleModel.create(example_type=0, description="example4", created_at=datetime.now())
24 | em5 = ExampleModel.create(example_type=1, description="example5", created_at=datetime.now())
25 | em6 = ExampleModel.create(example_type=1, description="example6", created_at=datetime.now())
26 | em7 = ExampleModel.create(example_type=1, description="example7", created_at=datetime.now())
27 | em8 = ExampleModel.create(example_type=1, description="example8", created_at=datetime.now())
28 |
29 | print(ExampleModel.objects.count())
30 |
--------------------------------------------------------------------------------
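A short follow-on sketch, assuming the same running Cassandra instance and the model defined in om.py above: because example_type is declared with index=True, cqlengine can filter on it directly, and a single row can be fetched back by its primary key.

    # Query the secondary-indexed column and iterate over the results.
    for em in ExampleModel.objects.filter(example_type=1):
        print(em.example_id, em.description)

    # Fetch one row back by primary key.
    first = ExampleModel.objects.filter(example_type=0).first()
    print(ExampleModel.objects.get(example_id=first.example_id).description)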
/pyspark1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/pyspark1.png
--------------------------------------------------------------------------------
/pyspark2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/pyspark2.png
--------------------------------------------------------------------------------
/resources/employees.json:
--------------------------------------------------------------------------------
1 | {"name":"Michael", "salary":3000}
2 | {"name":"Andy", "salary":4500}
3 | {"name":"Justin", "salary":3500}
4 | {"name":"Berta", "salary":4000}
5 |
--------------------------------------------------------------------------------
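Each line of employees.json is a single JSON object, which is exactly the line-delimited layout spark.read.json expects. A minimal loading sketch (the relative path assumes the script runs from the repo root):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("read-employees").getOrCreate()
    df = spark.read.json("resources/employees.json")
    df.filter(df.salary > 3500).show()   # Andy and Berta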
/resources/full_user.avsc:
--------------------------------------------------------------------------------
1 | {"type": "record", "namespace": "example.avro", "name": "User", "fields": [{"type": "string", "name": "name"}, {"type": ["string", "null"], "name": "favorite_color"}, {"type": {"items": "int", "type": "array"}, "name": "favorite_numbers"}]}
--------------------------------------------------------------------------------
/resources/kv1.txt:
--------------------------------------------------------------------------------
1 | 238val_238
2 | 86val_86
3 | 311val_311
4 | 27val_27
5 | 165val_165
6 | 409val_409
7 | 255val_255
8 | 278val_278
9 | 98val_98
10 | 484val_484
11 | 265val_265
12 | 193val_193
13 | 401val_401
14 | 150val_150
15 | 273val_273
16 | 224val_224
17 | 369val_369
18 | 66val_66
19 | 128val_128
20 | 213val_213
21 | 146val_146
22 | 406val_406
23 | 429val_429
24 | 374val_374
25 | 152val_152
26 | 469val_469
27 | 145val_145
28 | 495val_495
29 | 37val_37
30 | 327val_327
31 | 281val_281
32 | 277val_277
33 | 209val_209
34 | 15val_15
35 | 82val_82
36 | 403val_403
37 | 166val_166
38 | 417val_417
39 | 430val_430
40 | 252val_252
41 | 292val_292
42 | 219val_219
43 | 287val_287
44 | 153val_153
45 | 193val_193
46 | 338val_338
47 | 446val_446
48 | 459val_459
49 | 394val_394
50 | 237val_237
51 | 482val_482
52 | 174val_174
53 | 413val_413
54 | 494val_494
55 | 207val_207
56 | 199val_199
57 | 466val_466
58 | 208val_208
59 | 174val_174
60 | 399val_399
61 | 396val_396
62 | 247val_247
63 | 417val_417
64 | 489val_489
65 | 162val_162
66 | 377val_377
67 | 397val_397
68 | 309val_309
69 | 365val_365
70 | 266val_266
71 | 439val_439
72 | 342val_342
73 | 367val_367
74 | 325val_325
75 | 167val_167
76 | 195val_195
77 | 475val_475
78 | 17val_17
79 | 113val_113
80 | 155val_155
81 | 203val_203
82 | 339val_339
83 | 0val_0
84 | 455val_455
85 | 128val_128
86 | 311val_311
87 | 316val_316
88 | 57val_57
89 | 302val_302
90 | 205val_205
91 | 149val_149
92 | 438val_438
93 | 345val_345
94 | 129val_129
95 | 170val_170
96 | 20val_20
97 | 489val_489
98 | 157val_157
99 | 378val_378
100 | 221val_221
101 | 92val_92
102 | 111val_111
103 | 47val_47
104 | 72val_72
105 | 4val_4
106 | 280val_280
107 | 35val_35
108 | 427val_427
109 | 277val_277
110 | 208val_208
111 | 356val_356
112 | 399val_399
113 | 169val_169
114 | 382val_382
115 | 498val_498
116 | 125val_125
117 | 386val_386
118 | 437val_437
119 | 469val_469
120 | 192val_192
121 | 286val_286
122 | 187val_187
123 | 176val_176
124 | 54val_54
125 | 459val_459
126 | 51val_51
127 | 138val_138
128 | 103val_103
129 | 239val_239
130 | 213val_213
131 | 216val_216
132 | 430val_430
133 | 278val_278
134 | 176val_176
135 | 289val_289
136 | 221val_221
137 | 65val_65
138 | 318val_318
139 | 332val_332
140 | 311val_311
141 | 275val_275
142 | 137val_137
143 | 241val_241
144 | 83val_83
145 | 333val_333
146 | 180val_180
147 | 284val_284
148 | 12val_12
149 | 230val_230
150 | 181val_181
151 | 67val_67
152 | 260val_260
153 | 404val_404
154 | 384val_384
155 | 489val_489
156 | 353val_353
157 | 373val_373
158 | 272val_272
159 | 138val_138
160 | 217val_217
161 | 84val_84
162 | 348val_348
163 | 466val_466
164 | 58val_58
165 | 8val_8
166 | 411val_411
167 | 230val_230
168 | 208val_208
169 | 348val_348
170 | 24val_24
171 | 463val_463
172 | 431val_431
173 | 179val_179
174 | 172val_172
175 | 42val_42
176 | 129val_129
177 | 158val_158
178 | 119val_119
179 | 496val_496
180 | 0val_0
181 | 322val_322
182 | 197val_197
183 | 468val_468
184 | 393val_393
185 | 454val_454
186 | 100val_100
187 | 298val_298
188 | 199val_199
189 | 191val_191
190 | 418val_418
191 | 96val_96
192 | 26val_26
193 | 165val_165
194 | 327val_327
195 | 230val_230
196 | 205val_205
197 | 120val_120
198 | 131val_131
199 | 51val_51
200 | 404val_404
201 | 43val_43
202 | 436val_436
203 | 156val_156
204 | 469val_469
205 | 468val_468
206 | 308val_308
207 | 95val_95
208 | 196val_196
209 | 288val_288
210 | 481val_481
211 | 457val_457
212 | 98val_98
213 | 282val_282
214 | 197val_197
215 | 187val_187
216 | 318val_318
217 | 318val_318
218 | 409val_409
219 | 470val_470
220 | 137val_137
221 | 369val_369
222 | 316val_316
223 | 169val_169
224 | 413val_413
225 | 85val_85
226 | 77val_77
227 | 0val_0
228 | 490val_490
229 | 87val_87
230 | 364val_364
231 | 179val_179
232 | 118val_118
233 | 134val_134
234 | 395val_395
235 | 282val_282
236 | 138val_138
237 | 238val_238
238 | 419val_419
239 | 15val_15
240 | 118val_118
241 | 72val_72
242 | 90val_90
243 | 307val_307
244 | 19val_19
245 | 435val_435
246 | 10val_10
247 | 277val_277
248 | 273val_273
249 | 306val_306
250 | 224val_224
251 | 309val_309
252 | 389val_389
253 | 327val_327
254 | 242val_242
255 | 369val_369
256 | 392val_392
257 | 272val_272
258 | 331val_331
259 | 401val_401
260 | 242val_242
261 | 452val_452
262 | 177val_177
263 | 226val_226
264 | 5val_5
265 | 497val_497
266 | 402val_402
267 | 396val_396
268 | 317val_317
269 | 395val_395
270 | 58val_58
271 | 35val_35
272 | 336val_336
273 | 95val_95
274 | 11val_11
275 | 168val_168
276 | 34val_34
277 | 229val_229
278 | 233val_233
279 | 143val_143
280 | 472val_472
281 | 322val_322
282 | 498val_498
283 | 160val_160
284 | 195val_195
285 | 42val_42
286 | 321val_321
287 | 430val_430
288 | 119val_119
289 | 489val_489
290 | 458val_458
291 | 78val_78
292 | 76val_76
293 | 41val_41
294 | 223val_223
295 | 492val_492
296 | 149val_149
297 | 449val_449
298 | 218val_218
299 | 228val_228
300 | 138val_138
301 | 453val_453
302 | 30val_30
303 | 209val_209
304 | 64val_64
305 | 468val_468
306 | 76val_76
307 | 74val_74
308 | 342val_342
309 | 69val_69
310 | 230val_230
311 | 33val_33
312 | 368val_368
313 | 103val_103
314 | 296val_296
315 | 113val_113
316 | 216val_216
317 | 367val_367
318 | 344val_344
319 | 167val_167
320 | 274val_274
321 | 219val_219
322 | 239val_239
323 | 485val_485
324 | 116val_116
325 | 223val_223
326 | 256val_256
327 | 263val_263
328 | 70val_70
329 | 487val_487
330 | 480val_480
331 | 401val_401
332 | 288val_288
333 | 191val_191
334 | 5val_5
335 | 244val_244
336 | 438val_438
337 | 128val_128
338 | 467val_467
339 | 432val_432
340 | 202val_202
341 | 316val_316
342 | 229val_229
343 | 469val_469
344 | 463val_463
345 | 280val_280
346 | 2val_2
347 | 35val_35
348 | 283val_283
349 | 331val_331
350 | 235val_235
351 | 80val_80
352 | 44val_44
353 | 193val_193
354 | 321val_321
355 | 335val_335
356 | 104val_104
357 | 466val_466
358 | 366val_366
359 | 175val_175
360 | 403val_403
361 | 483val_483
362 | 53val_53
363 | 105val_105
364 | 257val_257
365 | 406val_406
366 | 409val_409
367 | 190val_190
368 | 406val_406
369 | 401val_401
370 | 114val_114
371 | 258val_258
372 | 90val_90
373 | 203val_203
374 | 262val_262
375 | 348val_348
376 | 424val_424
377 | 12val_12
378 | 396val_396
379 | 201val_201
380 | 217val_217
381 | 164val_164
382 | 431val_431
383 | 454val_454
384 | 478val_478
385 | 298val_298
386 | 125val_125
387 | 431val_431
388 | 164val_164
389 | 424val_424
390 | 187val_187
391 | 382val_382
392 | 5val_5
393 | 70val_70
394 | 397val_397
395 | 480val_480
396 | 291val_291
397 | 24val_24
398 | 351val_351
399 | 255val_255
400 | 104val_104
401 | 70val_70
402 | 163val_163
403 | 438val_438
404 | 119val_119
405 | 414val_414
406 | 200val_200
407 | 491val_491
408 | 237val_237
409 | 439val_439
410 | 360val_360
411 | 248val_248
412 | 479val_479
413 | 305val_305
414 | 417val_417
415 | 199val_199
416 | 444val_444
417 | 120val_120
418 | 429val_429
419 | 169val_169
420 | 443val_443
421 | 323val_323
422 | 325val_325
423 | 277val_277
424 | 230val_230
425 | 478val_478
426 | 178val_178
427 | 468val_468
428 | 310val_310
429 | 317val_317
430 | 333val_333
431 | 493val_493
432 | 460val_460
433 | 207val_207
434 | 249val_249
435 | 265val_265
436 | 480val_480
437 | 83val_83
438 | 136val_136
439 | 353val_353
440 | 172val_172
441 | 214val_214
442 | 462val_462
443 | 233val_233
444 | 406val_406
445 | 133val_133
446 | 175val_175
447 | 189val_189
448 | 454val_454
449 | 375val_375
450 | 401val_401
451 | 421val_421
452 | 407val_407
453 | 384val_384
454 | 256val_256
455 | 26val_26
456 | 134val_134
457 | 67val_67
458 | 384val_384
459 | 379val_379
460 | 18val_18
461 | 462val_462
462 | 492val_492
463 | 100val_100
464 | 298val_298
465 | 9val_9
466 | 341val_341
467 | 498val_498
468 | 146val_146
469 | 458val_458
470 | 362val_362
471 | 186val_186
472 | 285val_285
473 | 348val_348
474 | 167val_167
475 | 18val_18
476 | 273val_273
477 | 183val_183
478 | 281val_281
479 | 344val_344
480 | 97val_97
481 | 469val_469
482 | 315val_315
483 | 84val_84
484 | 28val_28
485 | 37val_37
486 | 448val_448
487 | 152val_152
488 | 348val_348
489 | 307val_307
490 | 194val_194
491 | 414val_414
492 | 477val_477
493 | 222val_222
494 | 126val_126
495 | 90val_90
496 | 169val_169
497 | 403val_403
498 | 400val_400
499 | 200val_200
500 | 97val_97
501 |
--------------------------------------------------------------------------------
/resources/people.json:
--------------------------------------------------------------------------------
1 | {"name":"Michael"}
2 | {"name":"Andy", "age":30}
3 | {"name":"Justin", "age":19}
4 |
--------------------------------------------------------------------------------
/resources/people.txt:
--------------------------------------------------------------------------------
1 | Michael, 29
2 | Andy, 30
3 | Justin, 19
4 |
--------------------------------------------------------------------------------
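people.txt is the comma-separated companion to people.json; a common way to turn it into a DataFrame is to parse each line into a Row and let Spark infer the schema. A minimal sketch, assuming the same relative resources/ path:

    from pyspark.sql import Row, SparkSession

    spark = SparkSession.builder.appName("people-txt").getOrCreate()
    lines = spark.sparkContext.textFile("resources/people.txt")
    # Each line is "name, age"; int() tolerates the space after the comma.
    people = lines.map(lambda l: l.split(",")) \
                  .map(lambda p: Row(name=p[0], age=int(p[1])))
    spark.createDataFrame(people).show()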
/resources/user.avsc:
--------------------------------------------------------------------------------
1 | {"namespace": "example.avro",
2 | "type": "record",
3 | "name": "User",
4 | "fields": [
5 | {"name": "name", "type": "string"},
6 | {"name": "favorite_color", "type": ["string", "null"]}
7 | ]
8 | }
9 |
--------------------------------------------------------------------------------
/resources/users.avro:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/resources/users.avro
--------------------------------------------------------------------------------
/resources/users.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/resources/users.parquet
--------------------------------------------------------------------------------
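users.parquet and users.avro hold the same example records described by user.avsc. A minimal reading sketch: Parquet support is built into Spark, while the Avro reader needs the spark-avro package on the classpath with a version matching the Spark build (e.g. --packages org.apache.spark:spark-avro_2.11:2.4.3 for Spark 2.4.3):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("read-users").getOrCreate()
    # Parquet files are self-describing, so no schema is supplied.
    spark.read.parquet("resources/users.parquet").select("name", "favorite_color").show()
    # Requires the spark-avro package noted above.
    spark.read.format("avro").load("resources/users.avro").show()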
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | setup(
4 |     name='PySparkUtilities',
5 |     version='0.1dev',
6 |     packages=['utilities'],
7 |     license='''
8 |     Creative Commons
9 |     Attribution-Noncommercial-Share Alike license''',
10 |     long_description='''
11 |     An example of how to package code for PySpark'''
12 | )
--------------------------------------------------------------------------------
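A hedged usage sketch for this package: build a distributable archive, then ship it to the executors so the utilities package resolves inside RDD functions and UDFs. The egg filename below is an assumption derived from the name and version in setup.py:

    # Build once on the driver machine:
    #   python setup.py bdist_egg
    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("use-utilities").getOrCreate()
    # Distribute the archive to every executor (filename is assumed).
    spark.sparkContext.addPyFile("dist/PySparkUtilities-0.1dev-py3.7.egg")

    import utilities  # now importable on the driver and inside executor tasks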
/spark_hive.py:
--------------------------------------------------------------------------------
1 | from os.path import expanduser, join, abspath
2 |
3 | from pyspark.sql import SparkSession
4 | from pyspark.sql import Row
5 |
6 | # warehouse_location points to the default location for managed databases and tables
7 | warehouse_location = '/home/awantik/spark-warehouse'
8 |
9 | spark = SparkSession \
10 |     .builder \
11 |     .appName("Python Spark SQL Hive integration example") \
12 |     .config("spark.sql.warehouse.dir", warehouse_location) \
13 |     .enableHiveSupport() \
14 |     .getOrCreate()
15 |
16 | # spark is an existing SparkSession
17 | spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive")
18 | spark.sql("LOAD DATA LOCAL INPATH '/home/awantik/packages/spark-2.4.3-bin-hadoop2.7/examples/src/main/resources/kv1.txt' INTO TABLE src")
19 | df = spark.sql("SELECT * FROM src")
20 | df.show()
21 |
22 | # Escape the backslash so Hive receives the two-character sequence '\n'
23 | # rather than a literal newline embedded in the statement.
24 | spark.sql("CREATE TABLE IF NOT EXISTS newsrc (key INT, value STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\\n'")
25 | spark.sql("LOAD DATA LOCAL INPATH '/home/awantik/emp.txt' INTO TABLE newsrc")
26 | df2 = spark.sql("SELECT * FROM newsrc")
27 | df2.show()
28 |
29 | # unionAll is a deprecated alias for union since Spark 2.0.
30 | df = df2.union(df)
31 | df.show()
32 |
--------------------------------------------------------------------------------
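A short follow-on sketch using the same SparkSession: the combined DataFrame can be written back into the Hive warehouse as a managed table, after which it is visible to other Hive-aware clients.

    # Persist the union as a managed Hive table under warehouse_location.
    df.write.mode("overwrite").saveAsTable("src_combined")
    spark.sql("SHOW TABLES").show()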
/test_file.py:
--------------------------------------------------------------------------------
1 | import pyspark
2 | import pyspark.sql
3 | import pytest
4 | from spark_utilities import df_count
5 |
6 |
7 | # Each entry is one parametrized case: a list of [a, b] rows.
8 | testdata = [
9 |     [[1, 3], [2, 4]],
10 |     [[1, 3], [2, 4], [3, 3]]
11 | ]
12 |
13 | @pytest.mark.parametrize("a", testdata)
14 | def test_spark_session_dataframe(spark_session, a):
15 |     test_df = spark_session.createDataFrame(a, "a: int, b: int")
16 |     assert isinstance(test_df, pyspark.sql.DataFrame)
17 |     # Compare against the size of the input so both cases pass.
18 |     assert df_count(test_df) == len(a)
19 |
20 |
21 | @pytest.mark.new
22 | def test_spark_session_sql(spark_session):
23 |     test_df = spark_session.createDataFrame([[1, 3], [2, 4]], "a: int, b: int")
24 |     test_df.createOrReplaceTempView('test')  # registerTempTable is deprecated
25 |
26 |     test_filtered_df = spark_session.sql('SELECT a, b FROM test WHERE a > 1')
27 |     assert test_filtered_df.count() == 1
28 |
29 |
30 | @pytest.mark.webtest
31 | def test_spark_session_s(spark_session):
32 |     test_df = spark_session.createDataFrame([[1, 3], [2, 4]], "a: int, b: int")
33 |     test_df.createOrReplaceTempView('test')
34 |
35 |     test_filtered_df = spark_session.sql('SELECT a, b FROM test WHERE a > 1')
36 |     assert test_filtered_df.count() == 1
37 |
--------------------------------------------------------------------------------
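Two assumptions behind this test file: the spark_session fixture comes from a pytest plugin such as pytest-spark, and the spark_utilities module it imports is not reproduced in this dump. An implementation consistent with how df_count is used here could be as small as:

    # spark_utilities.py (hypothetical; the real module is not shown)
    def df_count(df):
        """Return the number of rows in a DataFrame."""
        return df.count()

Markers like new and webtest can then be selected at run time, e.g. pytest -m webtest.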
/test_hr_data.csv:
--------------------------------------------------------------------------------
1 | satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
2 | 0.37,0.5,2,154,3,0,1,0,hr,medium
3 | 0.11,0.93,5,140,5,0,1,0,hr,medium
4 | 0.45,0.46,2,159,3,0,1,0,hr,high
5 | 0.44,0.48,2,158,3,0,1,0,technical,low
6 | 0.44,0.56,2,133,3,0,1,0,technical,medium
7 | 0.11,0.77,6,247,4,0,1,0,technical,medium
8 | 0.79,0.93,5,268,5,0,1,0,technical,medium
9 | 0.8,0.9,5,267,5,0,1,0,technical,medium
10 | 0.1,0.87,7,251,5,0,1,0,technical,low
11 | 0.09,0.93,6,279,4,0,1,0,technical,low
12 | 0.7,0.84,6,161,4,0,1,0,technical,low
13 | 0.72,0.84,4,256,5,0,1,0,technical,low
14 | 0.11,0.8,6,304,4,0,1,0,technical,low
15 | 0.39,0.51,2,137,3,0,1,0,technical,low
16 | 0.4,0.49,2,144,3,0,1,0,support,low
17 | 0.43,0.54,2,142,3,0,1,0,support,low
18 | 0.76,0.87,5,262,5,0,1,0,support,low
19 | 0.4,0.48,2,142,3,0,1,0,support,low
20 | 0.09,0.89,6,282,4,0,1,0,support,low
21 | 0.37,0.54,2,157,3,0,1,0,support,low
22 | 0.87,0.91,5,228,5,0,1,0,support,low
23 | 0.1,0.86,6,283,4,0,1,0,support,low
24 | 0.11,0.86,6,286,4,0,1,0,support,low
25 | 0.43,0.5,2,148,3,0,1,0,support,low
26 | 0.1,0.81,6,245,4,0,1,0,support,low
27 | 0.11,0.95,6,279,4,0,1,0,technical,low
28 | 0.85,0.87,5,245,5,0,1,0,technical,low
29 | 0.37,0.49,2,138,3,0,1,0,technical,low
30 | 0.44,0.52,2,141,3,0,1,0,management,low
31 | 0.1,0.83,7,302,5,0,1,0,IT,medium
32 | 0.11,0.89,6,268,4,0,1,0,IT,medium
33 | 0.87,0.88,5,240,5,0,1,0,IT,medium
34 | 0.39,0.49,2,127,3,0,1,0,IT,medium
35 | 0.1,0.94,7,264,4,0,1,0,IT,medium
36 | 0.44,0.53,2,155,3,0,1,0,product_mng,medium
37 | 0.4,0.49,2,143,3,0,1,0,product_mng,medium
38 | 0.76,0.98,5,217,6,0,1,0,product_mng,medium
39 | 0.46,0.55,2,147,3,0,1,0,product_mng,medium
40 | 0.9,0.92,4,271,5,0,1,0,IT,medium
41 | 0.85,0.87,4,273,5,0,1,0,RandD,medium
42 | 0.1,0.78,5,285,4,1,1,0,RandD,medium
43 | 0.43,0.49,2,131,3,0,1,0,RandD,high
44 | 0.2,0.5,5,135,6,0,1,0,RandD,low
45 | 0.81,0.92,5,239,5,0,1,0,RandD,medium
46 | 0.83,0.85,5,237,5,0,1,0,marketing,medium
47 | 0.14,0.75,4,277,5,1,1,0,sales,medium
48 | 0.1,0.84,5,303,5,0,1,0,accounting,medium
49 | 0.91,0.98,4,242,6,0,1,0,support,low
50 | 0.37,0.57,2,158,3,0,1,0,technical,low
51 | 0.42,0.57,2,147,3,1,1,0,management,low
52 | 0.39,0.68,2,282,5,0,1,0,marketing,low
53 | 0.39,0.54,2,154,3,0,1,0,marketing,low
54 | 0.44,0.52,2,149,3,0,1,0,marketing,low
55 | 0.37,0.45,2,149,3,0,1,0,sales,low
56 | 0.39,0.53,2,146,3,0,1,0,sales,low
57 | 0.72,0.94,4,258,5,0,1,0,sales,low
58 | 0.37,0.49,2,148,3,0,1,0,sales,low
59 | 0.82,0.94,5,236,5,0,1,0,sales,low
60 | 0.42,0.52,2,134,3,0,1,0,sales,low
61 | 0.59,1,2,155,5,0,1,0,sales,low
62 | 0.82,0.86,5,257,5,0,1,0,sales,low
63 | 0.73,0.97,6,189,2,0,1,0,sales,low
64 | 0.78,0.66,3,164,3,0,1,0,sales,low
65 | 0.09,0.95,6,271,4,0,1,0,sales,low
66 | 0.1,0.97,6,280,4,0,1,0,sales,low
67 | 0.45,0.46,2,149,3,0,1,0,sales,low
68 | 0.83,0.81,5,219,5,0,1,0,sales,low
69 | 0.43,0.51,2,128,3,0,1,0,sales,low
70 | 0.4,0.47,2,128,3,0,1,0,sales,medium
71 | 0.43,0.46,2,157,3,0,1,0,sales,medium
72 | 0.78,0.93,4,225,5,0,1,0,sales,medium
73 | 0.39,0.45,2,140,3,0,1,0,sales,medium
74 | 0.11,0.97,6,310,4,0,1,0,accounting,medium
75 | 0.36,0.52,2,143,3,0,1,0,accounting,medium
76 | 0.36,0.54,2,153,3,0,1,0,accounting,medium
77 | 0.1,0.79,7,310,4,0,1,0,hr,medium
78 | 0.4,0.47,2,136,3,0,1,0,hr,medium
79 | 0.81,0.85,4,251,6,0,1,0,hr,medium
80 | 0.4,0.47,2,144,3,0,1,0,hr,medium
81 | 0.09,0.93,6,296,4,0,1,0,technical,medium
82 | 0.76,0.89,5,238,5,0,1,0,technical,high
83 | 0.73,0.93,5,162,4,0,1,0,technical,low
84 | 0.38,0.49,2,137,3,0,1,0,technical,medium
85 | 0.72,0.84,5,257,5,0,1,0,technical,medium
86 | 0.4,0.56,2,148,3,0,1,0,technical,medium
87 | 0.91,0.99,5,254,5,0,1,0,technical,medium
88 | 0.85,0.85,4,247,6,0,1,0,technical,low
89 | 0.9,0.7,5,206,4,0,1,0,technical,low
90 | 0.46,0.55,2,145,3,0,1,0,technical,low
91 | 0.43,0.57,2,159,3,1,1,0,technical,low
92 | 0.89,0.88,5,228,5,1,1,0,support,low
93 | 0.09,0.81,6,257,4,0,1,0,support,low
94 | 0.4,0.48,2,155,3,0,1,0,support,low
95 | 0.76,0.83,6,293,6,0,1,0,support,low
96 | 0.4,0.57,2,151,3,0,1,0,support,low
97 | 0.37,0.48,2,160,3,0,1,0,support,low
98 | 0.37,0.53,2,143,3,0,1,0,support,low
99 | 0.11,0.96,6,280,4,0,1,0,support,low
100 | 0.37,0.52,2,158,3,0,1,0,support,low
101 |
--------------------------------------------------------------------------------
/test_hr_data_analysis.py:
--------------------------------------------------------------------------------
1 | import pyspark
2 | import pyspark.sql
3 | import pytest
4 | import hr_data_analysis
5 |
6 | @pytest.mark.old
7 | def test_spark_session_sql0(spark_session):
8 |     test_df = spark_session.read.csv('test_hr_data.csv', inferSchema=True, header=True)
9 | assert hr_data_analysis.getCountHardWorkingLessPaid(test_df) == 1
10 |
11 |
12 | @pytest.mark.new
13 | def test_spark_session_sql(spark_session):
14 | test_df = spark_session.createDataFrame([[1, 3], [2, 4]], "a: int, b: int")
15 |     test_df.createOrReplaceTempView('test')
16 |
17 | test_filtered_df = spark_session.sql('SELECT a, b from test where a > 1')
18 | assert test_filtered_df.count() == 1
19 |
20 | @pytest.mark.old
21 | def test_spark_session_sql2(spark_session):
22 | test_df = spark_session.createDataFrame([[1, 3], [2, 4]], "a: int, b: int")
23 |     test_df.createOrReplaceTempView('test')
24 |
25 | test_filtered_df = spark_session.sql('SELECT a, b from test where a > 1')
26 | assert test_filtered_df.count() == 1
27 |
28 |
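29 | # The spark_session fixture comes from the pytest-spark plugin. A hedged
30 | # example of running these tests by marker from the command line:
31 | #
32 | #   pytest test_hr_data_analysis.py -m old        # only @pytest.mark.old tests
33 | #   pytest test_hr_data_analysis.py -m "not old"  # everything else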
--------------------------------------------------------------------------------
/titanic-survival-project.tar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/titanic-survival-project.tar
--------------------------------------------------------------------------------
/titanic.py:
--------------------------------------------------------------------------------
1 | import pyspark.ml.classification as cl
2 | from pyspark.ml.feature import PCA
3 | from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
4 | from pyspark.ml import Pipeline
5 |
6 | class Titanic:
7 | def __init__(self,spark,input_data,output_data):
8 | self.spark = spark
9 | self.input = input_data
10 | self.output = output_data
11 |
12 | def load(self):
13 | self.data_df = self.spark.read.csv(self.input,inferSchema=True,header=True)
14 | self.data_df.cache()
15 |
16 | def clean(self):
17 |         self.data_df = self.data_df.fillna('S', ['Embarked'])  # 'S' is the most frequent embarkation port
18 |         self.data_df = self.data_df.fillna(29, ['Age'])        # roughly the median passenger age
19 |
20 | def create_preprocessors(self):
21 | self.stages = []
22 |
23 | cat_cols = ['Sex','Embarked']
24 |
25 | st_list = []
26 | for col in cat_cols:
27 | st = StringIndexer(inputCol=col, outputCol=col+'_si')
28 | st_list.append(st)
29 |
30 | self.stages.extend(st_list)
31 |
32 | ohe = OneHotEncoderEstimator(inputCols=['Sex_si','Embarked_si'], \
33 | outputCols=['Sex_en','Embarked_en'])
34 |
35 | self.stages.append(ohe)
36 |
37 | num_cols = ['Pclass','Age','Fare']
38 |
39 | feature_cols = num_cols + ['Sex_en','Embarked_en']
40 |
41 | va = VectorAssembler(inputCols=feature_cols, outputCol='feature_vec')
42 |
43 | self.stages.append(va)
44 |
45 |     def dimensionality_reduction(self):
46 |
47 |         pca = PCA(k=3, inputCol='feature_vec', outputCol='feature_data')  # project assembled features onto 3 principal components
48 | self.stages.append(pca)
49 |
50 | def create_estimators(self):
51 |
52 | logistic = cl.LogisticRegression(maxIter=10, regParam=0.01, labelCol='Survived',featuresCol='feature_data')
53 | self.stages.append(logistic)
54 |
55 | def create_pipeline(self):
56 |
57 | self.pipeline = Pipeline(stages=self.stages)
58 |
59 | def split_data(self):
60 | return self.data_df.randomSplit([0.7,0.3])
61 |
62 | def fit(self,train):
63 |
64 | self.pipeline_model = self.pipeline.fit(train)
65 |
66 | def predict(self,test):
67 |
68 | return self.pipeline_model.transform(test)
69 |
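70 | # A minimal usage sketch (assumed driver code; the real entry point ships in
71 | # the project tarball above, so the paths here are illustrative):
72 | #
73 | # from pyspark.sql import SparkSession
74 | # spark = SparkSession.builder.appName('titanic').getOrCreate()
75 | # t = Titanic(spark, 'train.csv', 'predictions')
76 | # t.load()
77 | # t.clean()
78 | # t.create_preprocessors()
79 | # t.dimensionality_reduction()
80 | # t.create_estimators()
81 | # t.create_pipeline()
82 | # train, test = t.split_data()
83 | # t.fit(train)
84 | # t.predict(test).select('Survived', 'prediction').show()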
--------------------------------------------------------------------------------
/udt.py:
--------------------------------------------------------------------------------
1 | from cassandra.cluster import Cluster
2 |
3 | cluster = Cluster()
4 | session = cluster.connect()
5 | session.set_keyspace('university')
6 |
7 | session.execute("CREATE TYPE address (street text, zipcode int)")
8 | session.execute("CREATE TABLE user (id int PRIMARY KEY, location frozen)")
9 |
10 |
11 |
12 | # create a class to map to the "address" UDT
13 | class Address(object):
14 |
15 | def __init__(self, street, zipcode):
16 | self.street = street
17 | self.zipcode = zipcode
18 |
19 | cluster.register_user_type('university', 'address', Address)  # required so the driver can map Address to the UDT
20 |
21 | data = [Address("123 Main St.", 78723), Address("123 Main St.", 78723)]
22 |
23 | # insert a row using an instance of Address
24 | for idx, d in enumerate(data):
25 | session.execute("INSERT INTO user (id, location) VALUES (%s, %s)",
26 | (idx, d))
27 |
28 | # results will include Address instances
29 | results = session.execute("SELECT * FROM user")
30 | row = results.one()
31 | print(row.id, row.location.street, row.location.zipcode)
32 |
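33 | # Optional cleanup (a hedged addition, not in the original script): release
34 | # the driver's connection pool when finished.
35 | # cluster.shutdown()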
--------------------------------------------------------------------------------