├── .ipynb_checkpoints
│   ├── Exercise - RDD-checkpoint.ipynb
│   ├── Module 1 - Foundations & RDD-checkpoint.ipynb
│   ├── Module 2 - PySpark DataFrames-checkpoint.ipynb
│   ├── PySpark-RDD-checkpoint.ipynb
│   ├── Untitled-checkpoint.ipynb
│   └── Untitled1-checkpoint.ipynb
├── 1.png
├── 1800.csv
├── 2.png
├── 25. PySpark ML.ipynb
├── Allstate insurance Amount Prediction - Regression.ipynb
├── Allstate+insurance+Amount+Prediction+-+Regression.ipynb
├── Allstate-Project.zip
├── Baby_Names__Beginning_2007.csv
├── Broadcast Join.ipynb
├── Exercise - RDD.ipynb
├── GraphFrame Application.ipynb
├── HR_comma_sep.csv
├── Module 1 - Foundations & RDD.ipynb
├── Module 2 - PySpark DataFrames.ipynb
├── Pandas UDFs Benchmark.ipynb
├── Payment.ipynb
├── Predict+Employee+Exit+-+Classification (1).ipynb
├── Predict+Employee+Exit+-+Classification+-2.ipynb
├── Preprocessing.ipynb
├── Project-PySpark.zip
├── PySpark-DataFrames.ipynb
├── PySpark-ML.ipynb
├── PySpark-MlLib.ipynb
├── PySpark-RDD.ipynb
├── PySpark-Structured+Streaming.ipynb
├── README.md
├── Recommendation+Engine+ (1).ipynb
├── Recommendation+Engine+.ipynb
├── SCIO-PySpark-DF-Day1.ipynb
├── SCIO-PySpark-DF-Day2.ipynb
├── Spark Architecture.pptx
├── Spark RDD.pptx
├── Spark Storage Data Formats.pptx
├── Spark-Introduction.pptx
├── Spark-Partitioning.pptx
├── Spark-Performance Tuning.pptx
├── Spark_SQL
├── TSP.ipynb
├── Tensorframes.ipynb
├── Titanic Data Analysis using DataFrames.html
├── Titanic Data Analysis using DataFrames.ipynb
├── Uber-Jan-Feb-FOIL.csv
├── Untitled.ipynb
├── Untitled1.ipynb
├── abc.txt
│   ├── ._SUCCESS.crc
│   ├── .part-00000.crc
│   ├── _SUCCESS
│   └── part-00000
├── all-world-cup-players.json
├── allstate_test.csv.zip
├── allstate_train.csv.zip
├── births_train.csv.gz
├── births_transformed.csv.gz
├── cass_code.py
├── cigna1-rdd.ipynb
├── cigna1.ipynb
├── customer-orders.csv
├── data
│   ├── 2010-12-01.csv
│   ├── 2015-summary.csv.txt
│   ├── 2015-summary.json.txt
│   ├── HR_comma_sep.csv
│   ├── graphx
│   │   ├── followers.txt
│   │   └── users.txt
│   ├── mllib
│   │   ├── als
│   │   │   ├── sample_movielens_ratings.txt
│   │   │   └── test.data
│   │   ├── gmm_data.txt
│   │   ├── kmeans_data.txt
│   │   ├── pagerank_data.txt
│   │   ├── pic_data.txt
│   │   ├── ridge-data
│   │   │   └── lpsa.data
│   │   ├── sample_binary_classification_data.txt
│   │   ├── sample_fpgrowth.txt
│   │   ├── sample_isotonic_regression_libsvm_data.txt
│   │   ├── sample_kmeans_data.txt
│   │   ├── sample_lda_data.txt
│   │   ├── sample_lda_libsvm_data.txt
│   │   ├── sample_libsvm_data.txt
│   │   ├── sample_linear_regression_data.txt
│   │   ├── sample_movielens_data.txt
│   │   ├── sample_multiclass_classification_data.txt
│   │   ├── sample_svm_data.txt
│   │   └── streaming_kmeans_data_test.txt
│   ├── payments-data.csv
│   ├── sales-data.csv
│   ├── sales-funnel.xlsx
│   ├── sales-of-shampoo-over-a-three-ye.csv
│   ├── sp500.csv
│   ├── store.csv
│   └── streaming
│       └── AFINN-111.txt
├── derby.log
├── entry.py
├── fakefriends.csv
├── hadoop-2.7.1.zip
├── hr_data_analysis.py
├── income-adult.data
├── kddcup.data_10_percent.gz
├── metastore_db
│   ├── README_DO_NOT_TOUCH_FILES.txt
│   ├── db.lck
│   ├── dbex.lck
│   ├── log
│   │   ├── README_DO_NOT_TOUCH_FILES.txt
│   │   ├── log.ctrl
│   │   ├── log1.dat
│   │   └── logmirror.ctrl
│   ├── seg0
│   │   ├── README_DO_NOT_TOUCH_FILES.txt
│   │   ├── c10.dat
│   │   ├── c101.dat
│   │   ├── c111.dat
│   │   ├── c121.dat
│   │   ├── c130.dat
│   │   ├── c141.dat
│   │   ├── c150.dat
│   │   ├── c161.dat
│   │   ├── c171.dat
│   │   ├── c180.dat
│   │   ├── c191.dat
│   │   ├── c1a1.dat
│   │   ├── c1b1.dat
│   │   ├── c1c0.dat
│   │   ├── c1d1.dat
│   │   ├── c1e0.dat
│   │   ├── c1f1.dat
│   │   ├── c20.dat
│   │   ├── c200.dat
│   │   ├── c211.dat
│   │   ├── c221.dat
│   │   ├── c230.dat
│   │   ├── c241.dat
│   │   ├── c251.dat
│   │   ├── c260.dat
│   │   ├── c271.dat
│   │   ├── c281.dat
│   │   ├── c290.dat
│   │   ├── c2a1.dat
│   │   ├── c2b1.dat
│   │   ├── c2c1.dat
│   │   ├── c2d0.dat
│   │   ├── c2e1.dat
│   │   ├── c2f0.dat
│   │   ├── c300.dat
│   │   ├── c31.dat
│   │   ├── c311.dat
│   │   ├── c321.dat
│   │   ├── c331.dat
│   │   ├── c340.dat
│   │   ├── c351.dat
│   │   ├── c361.dat
│   │   ├── c371.dat
│   │   ├── c380.dat
│   │   ├── c391.dat
│   │   ├── c3a1.dat
│   │   ├── c3b1.dat
│   │   ├── c3c0.dat
│   │   ├── c3d1.dat
│   │   ├── c3e1.dat
│   │   ├── c3f1.dat
│   │   ├── c400.dat
│   │   ├── c41.dat
│   │   ├── c411.dat
│   │   ├── c421.dat
│   │   ├── c430.dat
│   │   ├── c441.dat
│   │   ├── c451.dat
│   │   ├── c461.dat
│   │   ├── c470.dat
│   │   ├── c481.dat
│   │   ├── c490.dat
│   │   ├── c4a1.dat
│   │   ├── c4b0.dat
│   │   ├── c4c1.dat
│   │   ├── c4d1.dat
│   │   ├── c4e1.dat
│   │   ├── c4f0.dat
│   │   ├── c501.dat
│   │   ├── c51.dat
│   │   ├── c510.dat
│   │   ├── c521.dat
│   │   ├── c530.dat
│   │   ├── c541.dat
│   │   ├── c550.dat
│   │   ├── c561.dat
│   │   ├── c570.dat
│   │   ├── c581.dat
│   │   ├── c590.dat
│   │   ├── c5a1.dat
│   │   ├── c5b0.dat
│   │   ├── c5c1.dat
│   │   ├── c5d0.dat
│   │   ├── c5e1.dat
│   │   ├── c5f0.dat
│   │   ├── c60.dat
│   │   ├── c601.dat
│   │   ├── c610.dat
│   │   ├── c621.dat
│   │   ├── c630.dat
│   │   ├── c641.dat
│   │   ├── c650.dat
│   │   ├── c661.dat
│   │   ├── c670.dat
│   │   ├── c681.dat
│   │   ├── c690.dat
│   │   ├── c6a1.dat
│   │   ├── c6b0.dat
│   │   ├── c6c1.dat
│   │   ├── c6d0.dat
│   │   ├── c6e1.dat
│   │   ├── c6f0.dat
│   │   ├── c701.dat
│   │   ├── c71.dat
│   │   ├── c711.dat
│   │   ├── c721.dat
│   │   ├── c731.dat
│   │   ├── c741.dat
│   │   ├── c751.dat
│   │   ├── c761.dat
│   │   ├── c771.dat
│   │   ├── c781.dat
│   │   ├── c791.dat
│   │   ├── c7a1.dat
│   │   ├── c7b1.dat
│   │   ├── c7c1.dat
│   │   ├── c7d1.dat
│   │   ├── c7e1.dat
│   │   ├── c7f1.dat
│   │   ├── c801.dat
│   │   ├── c81.dat
│   │   ├── c811.dat
│   │   ├── c821.dat
│   │   ├── c831.dat
│   │   ├── c840.dat
│   │   ├── c851.dat
│   │   ├── c860.dat
│   │   ├── c871.dat
│   │   ├── c880.dat
│   │   ├── c891.dat
│   │   ├── c8a0.dat
│   │   ├── c8b1.dat
│   │   ├── c8c1.dat
│   │   ├── c8d1.dat
│   │   ├── c8e1.dat
│   │   ├── c8f1.dat
│   │   ├── c90.dat
│   │   ├── c901.dat
│   │   ├── c911.dat
│   │   ├── c920.dat
│   │   ├── c931.dat
│   │   ├── c940.dat
│   │   ├── c951.dat
│   │   ├── c960.dat
│   │   ├── c971.dat
│   │   ├── c981.dat
│   │   ├── c990.dat
│   │   ├── c9a1.dat
│   │   ├── c9b1.dat
│   │   ├── c9c0.dat
│   │   ├── c9d1.dat
│   │   ├── c9e0.dat
│   │   ├── c9f1.dat
│   │   ├── ca01.dat
│   │   ├── ca1.dat
│   │   ├── ca11.dat
│   │   ├── ca21.dat
│   │   ├── cb1.dat
│   │   ├── cc0.dat
│   │   ├── cd1.dat
│   │   ├── ce1.dat
│   │   └── cf0.dat
│   └── service.properties
├── nifi_script.py
├── om.py
├── pyspark1.png
├── pyspark2.png
├── resources
│   ├── employees.json
│   ├── full_user.avsc
│   ├── kv1.txt
│   ├── people.json
│   ├── people.txt
│   ├── user.avsc
│   ├── users.avro
│   └── users.parquet
├── setup.py
├── spark_hive.py
├── test_file.py
├── test_hr_data.csv
├── test_hr_data_analysis.py
├── titanic-survival-project.tar
├── titanic-train.csv
├── titanic.py
└── udt.py

/.ipynb_checkpoints/Exercise - RDD-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 | 
--------------------------------------------------------------------------------
/.ipynb_checkpoints/Untitled-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 | 
--------------------------------------------------------------------------------
/.ipynb_checkpoints/Untitled1-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [],
3 |  "metadata": {},
4 |  "nbformat": 4,
5 |  "nbformat_minor": 2
6 | }
7 | 
--------------------------------------------------------------------------------
/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/1.png
-------------------------------------------------------------------------------- /2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/2.png -------------------------------------------------------------------------------- /Allstate-Project.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Allstate-Project.zip -------------------------------------------------------------------------------- /Broadcast Join.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":["from pyspark.sql.functions import broadcast"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":1},{"cell_type":"code","source":["df1 = spark.createDataFrame([('a',2),('b',3),('c',4),('c',44)],['A','B'])"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":2},{"cell_type":"code","source":["df1.show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
+---+---+\n|  A|  B|\n+---+---+\n|  a|  2|\n|  b|  3|\n|  c|  4|\n|  c| 44|\n+---+---+\n\n
"]}}],"execution_count":3},{"cell_type":"code","source":["df2 = spark.createDataFrame([('a','aaaa'),('b','bbbb'),('c','cccc')],('A','C'))"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":4},{"cell_type":"code","source":["df1.join(df2, df1.A == df2.A,'inner').show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
+---+---+---+----+\n|  A|  B|  A|   C|\n+---+---+---+----+\n|  c|  4|  c|cccc|\n|  c| 44|  c|cccc|\n|  b|  3|  b|bbbb|\n|  a|  2|  a|aaaa|\n+---+---+---+----+\n\n
"]}}],"execution_count":5},{"cell_type":"code","source":["df1.join(broadcast(df2),'A').show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
+---+---+----+\n A| B| C|\n+---+---+----+\n a| 2|aaaa|\n b| 3|bbbb|\n c| 4|cccc|\n c| 44|cccc|\n+---+---+----+\n\n
"]}}],"execution_count":6},{"cell_type":"code","source":["spark.catalog.listDatabases()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
Out[7]: [Database(name=u'default', description=u'Default Hive database', locationUri=u'dbfs:/user/hive/warehouse')]\n
"]}}],"execution_count":7},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":8}],"metadata":{"name":"Broadcast Join","notebookId":3788746436117310},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Exercise - RDD.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "lines = sc.textFile('fakefriends.csv')" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 4, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "def parseLine(line):\n", 23 | " fields = line.split(',')\n", 24 | " age = int(fields[2])\n", 25 | " numFriends = int(fields[3])\n", 26 | " return (age, numFriends)" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 5, 32 | "metadata": { 33 | "collapsed": true 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "rdd = lines.map(parseLine)\n", 38 | "totalsByAge = rdd.mapValues(lambda x: (x, 1)).reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))\n", 39 | "averagesByAge = totalsByAge.mapValues(lambda x: x[0] / x[1])\n", 40 | "results = averagesByAge.collect()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 6, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "(18, 343)\n", 55 | "(19, 213)\n", 56 | "(20, 165)\n", 57 | "(21, 350)\n", 58 | "(22, 206)\n", 59 | "(23, 246)\n", 60 | "(24, 233)\n", 61 | "(25, 197)\n", 62 | "(26, 242)\n", 63 | "(27, 228)\n", 64 | "(28, 209)\n", 65 | "(29, 215)\n", 66 | "(30, 235)\n", 67 | "(31, 267)\n", 68 | "(32, 207)\n", 69 | "(33, 325)\n", 70 | "(34, 245)\n", 71 | "(35, 211)\n", 72 | "(36, 246)\n", 73 | "(37, 249)\n", 74 | "(38, 193)\n", 75 | "(39, 169)\n", 76 | "(40, 250)\n", 77 | "(41, 268)\n", 78 | "(42, 303)\n", 79 | "(43, 230)\n", 80 | "(44, 282)\n", 81 | "(45, 309)\n", 82 | "(46, 223)\n", 83 | "(47, 233)\n", 84 | "(48, 281)\n", 85 | "(49, 184)\n", 86 | "(50, 254)\n", 87 | "(51, 302)\n", 88 | "(52, 340)\n", 89 | "(53, 222)\n", 90 | "(54, 278)\n", 91 | "(55, 295)\n", 92 | "(56, 306)\n", 93 | "(57, 258)\n", 94 | "(58, 116)\n", 95 | "(59, 220)\n", 96 | "(60, 202)\n", 97 | "(61, 256)\n", 98 | "(62, 220)\n", 99 | "(63, 384)\n", 100 | "(64, 281)\n", 101 | "(65, 298)\n", 102 | "(66, 276)\n", 103 | "(67, 214)\n", 104 | "(68, 269)\n", 105 | "(69, 235)\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "for result in results:\n", 111 | " print(result)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [] 122 | } 123 | ], 124 | "metadata": { 125 | "kernelspec": { 126 | "display_name": "Python 2", 127 | "language": "python", 128 | "name": "python2" 129 | }, 130 | "language_info": { 131 | "codemirror_mode": { 132 | "name": "ipython", 133 | "version": 2 134 | }, 135 | "file_extension": ".py", 136 | "mimetype": "text/x-python", 137 | "name": "python", 138 | "nbconvert_exporter": "python", 139 | "pygments_lexer": "ipython2", 140 | "version": "2.7.13" 141 | } 142 | }, 143 | "nbformat": 4, 144 | "nbformat_minor": 2 145 | } 146 | -------------------------------------------------------------------------------- /GraphFrame Application.ipynb: 
-------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":["# Set File Paths\ntripdelaysFilePath = \"/databricks-datasets/flights/departuredelays.csv\"\nairportsnaFilePath = \"/databricks-datasets/flights/airport-codes-na.txt\"\n\n# Obtain airports dataset\n# Note, this dataset is tab-delimited with a header\nairportsna = spark.read.csv(airportsnaFilePath, header='true', inferSchema='true', sep='\\t')\nairportsna.createOrReplaceTempView(\"airports_na\")\n\n# Obtain departure Delays data\n# Note, this dataset is comma-delimited with a header\ndepartureDelays = spark.read.csv(tripdelaysFilePath, header='true')\ndepartureDelays.createOrReplaceTempView(\"departureDelays\")\ndepartureDelays.cache()"],"metadata":{},"outputs":[],"execution_count":1},{"cell_type":"code","source":["departureDelays.show()"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":["airportsna.show()"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["#\n# Available IATA codes from the departuredelays sample dataset\ntripIATA = spark.sql(\"select distinct iata from (select distinct origin as iata from departureDelays union all select distinct destination as iata from departureDelays) a\")\ntripIATA.createOrReplaceTempView(\"tripIATA\")\ntripIATA.show()"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"code","source":["\n# Only include airports with atleast one trip from the \n# `departureDelays` dataset\nairports = spark.sql(\"select f.IATA, f.City, f.State, f.Country from airports_na f join tripIATA t on t.IATA = f.IATA\")\nairports.createOrReplaceTempView(\"airports\")\nairports.cache()\nairports.show()"],"metadata":{},"outputs":[],"execution_count":5},{"cell_type":"code","source":["# Build `departureDelays_geo` DataFrame\n# Obtain key attributes such as Date of flight, delays, distance, \n# and airport information (Origin, Destination) \ndepartureDelays_geo = spark.sql(\"select cast(f.date as int) as tripid, cast(concat(concat(concat(concat(concat(concat('2014-', concat(concat(substr(cast(f.date as string), 1, 2), '-')), substr(cast(f.date as string), 3, 2)), ''), substr(cast(f.date as string), 5, 2)), ':'), substr(cast(f.date as string), 7, 2)), ':00') as timestamp) as `localdate`, cast(f.delay as int), cast(f.distance as int), f.origin as src, f.destination as dst, o.city as city_src, d.city as city_dst, o.state as state_src, d.state as state_dst from departuredelays f join airports o on o.iata = f.origin join airports d on d.iata = f.destination\") \n\n# Create Temporary View and cache\ndepartureDelays_geo.createOrReplaceTempView(\"departureDelays_geo\")\ndepartureDelays_geo.cache()\ndepartureDelays.printSchema()"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"code","source":["# Review the top 10 rows of the `departureDelays_geo` DataFrame\ndepartureDelays_geo.show(10)"],"metadata":{},"outputs":[],"execution_count":7},{"cell_type":"code","source":["# Note, ensure you have already installed \n# the GraphFrames spark-package\nfrom pyspark.sql.functions import *\nfrom graphframes import *\n\n# Create Vertices (airports) and Edges (flights)\ntripVertices = airports.withColumnRenamed(\"IATA\", \"id\").distinct()\ntripEdges = departureDelays_geo.select(\"tripid\", \"delay\", \"src\", \"dst\", \"city_dst\", \"state_dst\")\n\n# Cache Vertices and 
Edges\ntripEdges.cache()\ntripVertices.cache()"],"metadata":{},"outputs":[],"execution_count":8},{"cell_type":"code","source":["display(tripEdges)"],"metadata":{},"outputs":[],"execution_count":9},{"cell_type":"code","source":["tripGraph = GraphFrame(tripVertices, tripEdges)"],"metadata":{},"outputs":[],"execution_count":10},{"cell_type":"code","source":["print \"Airports: %d\" % tripGraph.vertices.count()\nprint \"Trips: %d\" % tripGraph.edges.count()"],"metadata":{},"outputs":[],"execution_count":11},{"cell_type":"code","source":["tripGraph.edges.groupBy().max(\"delay\").collect()"],"metadata":{},"outputs":[],"execution_count":12},{"cell_type":"code","source":["print \"On-time / Early Flights: %d\" % tripGraph.edges.filter(\"delay <= 0\").count()\nprint \"Delayed Flights: %d\" % tripGraph.edges.filter(\"delay > 0\").count()"],"metadata":{},"outputs":[],"execution_count":13},{"cell_type":"markdown","source":["

Which flights departing Seattle are most likely to have a significant delay?

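The next cell answers this with the DataFrame API (filter on `src = 'SEA'`, group by route, average the delay). As a point of comparison, a roughly equivalent query in Spark SQL (a sketch against the `departureDelays_geo` temporary view registered earlier, not part of the original notebook) could look like this:

```python
# Sketch: SQL formulation of the DataFrame query in the next cell, assuming
# the `departureDelays_geo` temp view created above is still registered.
spark.sql("""
    SELECT src, dst, AVG(delay) AS avg_delay
    FROM departureDelays_geo
    WHERE src = 'SEA' AND delay > 0
    GROUP BY src, dst
    ORDER BY avg_delay DESC
""").show(5)
```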
"],"metadata":{}},{"cell_type":"code","source":["tripGraph.edges\\\n .filter(\"src = 'SEA' and delay > 0\")\\\n .groupBy(\"src\", \"dst\")\\\n .avg(\"delay\")\\\n .sort(desc(\"avg(delay)\"))\\\n .show(5)"],"metadata":{},"outputs":[],"execution_count":15},{"cell_type":"markdown","source":["

Which states tend to have significant delays for flights departing from Seattle?

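The next cell displays the individual long-delayed flights; as the sketch below shows (an addition, not in the original notebook), the same edges can also be aggregated per destination state to rank the states themselves:

```python
# Sketch: rank destination states by total delay minutes, counting only
# flights out of Seattle that were delayed more than 100 minutes.
(tripGraph.edges
    .filter("src = 'SEA' and delay > 100")
    .groupBy("state_dst")
    .sum("delay")
    .withColumnRenamed("sum(delay)", "total_delay")
    .orderBy("total_delay", ascending=False)
    .show(10))
```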
"],"metadata":{}},{"cell_type":"code","source":["# States with the longest cumulative delays (with individual\n# delays > 100 minutes) (origin: Seattle)\ndisplay(tripGraph.edges.filter(\"src = 'SEA' and delay > 100\"))"],"metadata":{},"outputs":[],"execution_count":17},{"cell_type":"code","source":["display(tripGraph.degrees.sort(desc(\"degree\")).limit(20))"],"metadata":{},"outputs":[],"execution_count":18},{"cell_type":"markdown","source":["

Determining the top transfer airports

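A transfer hub should see roughly as many arriving flights as departing ones, so its inDegree/outDegree ratio stays close to 1 even when total traffic is large; the cells below compute exactly that ratio. A complementary measure (a hedged sketch only, not part of the original notebook) is GraphFrames' built-in PageRank, which ranks airports by how often other well-connected airports feed into them:

```python
# Sketch: rank airports with PageRank as an alternative importance measure.
# `tripGraph` is the GraphFrame constructed above.
ranks = tripGraph.pageRank(resetProbability=0.15, maxIter=5)
(ranks.vertices
    .select("id", "City", "pagerank")
    .orderBy("pagerank", ascending=False)
    .show(10))
```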
"],"metadata":{}},{"cell_type":"code","source":["inDeg = tripGraph.inDegrees #flights coming to airport\noutDeg = tripGraph.outDegrees #flights leaving airport"],"metadata":{},"outputs":[],"execution_count":20},{"cell_type":"code","source":["degreeRatio = inDeg.join(outDeg, inDeg.id == outDeg.id).drop(outDeg.id).selectExpr(\"id\", \"double(inDegree)/double(outDegree) as degreeRatio\").cache()\ndegreeRatio.show(10)"],"metadata":{},"outputs":[],"execution_count":21},{"cell_type":"code","source":["degreeRatio.join(airports, degreeRatio.id == airports.IATA).show(10)"],"metadata":{},"outputs":[],"execution_count":22},{"cell_type":"code","source":["transferAirports = degreeRatio.join(airports, degreeRatio.id == airports.IATA).selectExpr(\"id\",\"city\",\"degreeRatio\").filter(\"degreeRatio between 0.9 and 1.1\")\ndisplay(transferAirports.orderBy(\"degreeRatio\").limit(10))"],"metadata":{},"outputs":[],"execution_count":23},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":24},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":25}],"metadata":{"name":"GraphFrame Application","notebookId":3438443876603409},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Pandas UDFs Benchmark.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":["from pyspark.sql.types import *\nfrom pyspark.sql.functions import col, count, rand, collect_list, explode, struct, count, lit\nfrom pyspark.sql.functions import pandas_udf, PandasUDFType"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":1},{"cell_type":"code","source":["df = spark.range(0, 10 * 1000 * 1000).withColumn('id', (col('id') / 10000).cast('integer')).withColumn('v', rand())\ndf.cache()\ndf.count()\n\ndf.show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
+---+--------------------+\n id| v|\n+---+--------------------+\n 0| 0.2606134722164434|\n 0| 0.8339772953748603|\n 0| 0.07305638728029662|\n 0|0.029602261658446816|\n 0| 0.7621764639789104|\n 0| 0.5869532091424473|\n 0| 0.5820613668108897|\n 0| 0.6558386724790438|\n 0| 0.9755782526778792|\n 0| 0.9562032763319009|\n 0| 0.2117948756600163|\n 0|0.025825721817323677|\n 0| 0.6758571301001655|\n 0| 0.3368090652216287|\n 0| 0.17677799850453058|\n 0| 0.42655675271888405|\n 0| 0.9738368781982333|\n 0| 0.22303982349690687|\n 0| 0.7831450268748825|\n 0| 0.08998811176838517|\n+---+--------------------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":2},{"cell_type":"code","source":["from pyspark.sql.types import IntegerType\nslen = udf(lambda s: col(s) + 1, IntegerType())"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["@udf('double')\ndef plus_one(v):\n return v + 1\n\n%timeit df.withColumn('v', plus_one(df.v)).agg(count(col('v'))).show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n1 loops, best of 3: 4.47 s per loop\n
"]}}],"execution_count":4},{"cell_type":"code","source":["@pandas_udf(\"double\", PandasUDFType.SCALAR)\ndef pandas_plus_one(v):\n return v + 1\n\n%timeit df.withColumn('v', pandas_plus_one(df.v)).agg(count(col('v'))).show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n1 loops, best of 3: 1.26 s per loop\n
"]}}],"execution_count":5},{"cell_type":"code","source":["import pandas as pd\nfrom scipy import stats\n\n@udf('double')\ndef cdf(v):\n return float(stats.norm.cdf(v))\n\n%timeit df.withColumn('cumulative_probability', cdf(df.v)).agg(count(col('cumulative_probability'))).show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
"]}}],"execution_count":6},{"cell_type":"code","source":["import pandas as pd\nfrom scipy import stats\n\n@pandas_udf('double', PandasUDFType.SCALAR)\ndef pandas_cdf(v):\n return pd.Series(stats.norm.cdf(v))\n\n%timeit df.withColumn('cumulative_probability', pandas_cdf(df.v)).agg(count(col('cumulative_probability'))).show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
+-----------------------------+\n|count(cumulative_probability)|\n+-----------------------------+\n|                     10000000|\n+-----------------------------+\n\n+-----------------------------+\n|count(cumulative_probability)|\n+-----------------------------+\n|                     10000000|\n+-----------------------------+\n\n+-----------------------------+\n|count(cumulative_probability)|\n+-----------------------------+\n|                     10000000|\n+-----------------------------+\n\n+-----------------------------+\n|count(cumulative_probability)|\n+-----------------------------+\n|                     10000000|\n+-----------------------------+\n\n1 loops, best of 3: 1.24 s per loop\n
"]}}],"execution_count":7},{"cell_type":"code","source":["from pyspark.sql import Row\n@udf(ArrayType(df.schema))\ndef substract_mean(rows):\n vs = pd.Series([r.v for r in rows])\n vs = vs - vs.mean()\n return [Row(id=rows[i]['id'], v=float(vs[i])) for i in range(len(rows))]\n \n%timeit df.groupby('id').agg(collect_list(struct(df['id'], df['v'])).alias('rows')).withColumn('new_rows', substract_mean(col('rows'))).withColumn('new_row', explode(col('new_rows'))).withColumn('id', col('new_row.id')).withColumn('v', col('new_row.v')).agg(count(col('v'))).show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n1 loops, best of 3: 2min 4s per loop\n
"]}}],"execution_count":8},{"cell_type":"code","source":["@pandas_udf(df.schema, PandasUDFType.GROUPED_MAP)\n# Input/output are both a pandas.DataFrame\ndef pandas_subtract_mean(pdf):\n\treturn pdf.assign(v=pdf.v - pdf.v.mean())\n\n%timeit df.groupby('id').apply(pandas_subtract_mean).agg(count(col('v'))).show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n+--------+\n|count(v)|\n+--------+\n|10000000|\n+--------+\n\n1 loops, best of 3: 4.65 s per loop\n
"]}}],"execution_count":9},{"cell_type":"code","source":["df2 = df.withColumn('y', rand()).withColumn('x1', rand()).withColumn('x2', rand()).select('id', 'y', 'x1', 'x2')\ndf2.show() "],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
+---+-------------------+--------------------+-------------------+\n id| y| x1| x2|\n+---+-------------------+--------------------+-------------------+\n 0| 0.681296060112836| 0.1288311079969241| 0.8181088445104816|\n 0| 0.8888773955549102| 0.25520608131769806| 0.8371196660049978|\n 0|0.12865336389189674| 0.05949353129319879|0.31240880781369607|\n 0|0.33102706063173315| 0.3184970944017924| 0.9934788617057889|\n 0|0.08530551734633984| 0.849098348411309|0.25958206625946156|\n 0| 0.1142436882234027| 0.09221618780441287|0.06660852847156451|\n 0| 0.3734801477601759| 0.16175735111155454|0.23741551784520665|\n 0| 0.4626832884602221| 0.4090520759820342|0.21143786407406573|\n 0| 0.3089074870133427| 0.7875508394004922|0.20291186344825263|\n 0| 0.6963359144225203| 0.24446551311290765|0.09530396721263867|\n 0|0.18601574521309183| 0.18283878773443607| 0.9049305072841698|\n 0| 0.9986921036424282| 0.5744991393169917| 0.4377204256577574|\n 0|0.47413548244645665| 0.8647990390377169| 0.6145253333423468|\n 0| 0.8678090740409161| 0.9349286905893688| 0.897022900084491|\n 0| 0.6752577347437083| 0.20625908730646103|0.10315736062362346|\n 0|0.22669523505013633| 0.6099324032866738| 0.8357508819755833|\n 0| 0.6880907870618188| 0.8392228400945341| 0.7226505258273653|\n 0|0.30101130104653884| 0.9651274666079585| 0.5422836657606281|\n 0| 0.7195503022011948| 0.9288544640693567|0.03643847265357025|\n 0|0.27410622722360234|0.051428600469085706| 0.7646588630569261|\n+---+-------------------+--------------------+-------------------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":10},{"cell_type":"code","source":["import pandas as pd\nimport statsmodels.api as sm\n# df has four columns: id, y, x1, x2\n\ngroup_column = 'id'\ny_column = 'y'\nx_columns = ['x1', 'x2']\nschema = df2.select(group_column, *x_columns).schema\n\n@pandas_udf(schema, PandasUDFType.GROUPED_MAP)\n# Input/output are both a pandas.DataFrame\ndef ols(pdf):\n group_key = pdf[group_column].iloc[0]\n y = pdf[y_column]\n X = pdf[x_columns]\n X = sm.add_constant(X)\n model = sm.OLS(y, X).fit()\n return pd.DataFrame([[group_key] + [model.params[i] for i in x_columns]], columns=[group_column] + x_columns)\n\nbeta = df2.groupby(group_column).apply(ols)\nbeta.show()"],"metadata":{},"outputs":[{"metadata":{},"output_type":"display_data","data":{"text/html":["\n
+---+--------------------+--------------------+\n id| x1| x2|\n+---+--------------------+--------------------+\n148|-0.00917654654646...|-0.01005665429668...|\n463|-0.01500674870033...|0.016025010035176222|\n471|-0.00832321162854...|-0.00404915919899...|\n496|-0.01122055554723648|-0.00280962655140...|\n833|-0.01629553352114...|0.002779410391406...|\n243|0.005259499685366535|0.011281848381048665|\n392|0.005589240115972826|-0.00950385069041...|\n540|5.918574070326934...|0.012159354453070217|\n623|0.020442434869455878|0.004083702101312427|\n737|0.006226657113389954|0.003961770851249408|\n858|0.001940560121997...|0.006720865070135...|\n897|-0.00142535705654...|0.004045227546180374|\n 31|0.005465606169062085|0.008832031597331093|\n516|-0.00531332000715...|0.001981946321763...|\n 85|0.012725673978444558|-0.02828045053679735|\n137|-0.00131062800808...|-5.30640018178707...|\n251|0.006229489454833485|0.002962616001996...|\n451|0.003804104279762211|-0.00447206880074...|\n580|0.026962287867315624|3.293459638984281E-4|\n808|-0.01025147566168...|0.004950671582079154|\n+---+--------------------+--------------------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":11}],"metadata":{"name":"Pandas UDFs Benchmark","notebookId":328232164696745},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Predict+Employee+Exit+-+Classification (1).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "hr_data = spark.read.csv('HR_comma_sep.csv',inferSchema=True,header=True)" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": { 18 | "collapsed": false 19 | }, 20 | "outputs": [ 21 | { 22 | "data": { 23 | "text/plain": [ 24 | "Row(satisfaction_level=0.38, last_evaluation=0.53, number_project=2, average_montly_hours=157, time_spend_company=3, Work_accident=0, left=1, promotion_last_5years=0, sales=u'sales', salary=u'low')" 25 | ] 26 | }, 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "output_type": "execute_result" 30 | } 31 | ], 32 | "source": [ 33 | "hr_data.head()" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "root\n", 48 | " |-- satisfaction_level: double (nullable = true)\n", 49 | " |-- last_evaluation: double (nullable = true)\n", 50 | " |-- number_project: integer (nullable = true)\n", 51 | " |-- average_montly_hours: integer (nullable = true)\n", 52 | " |-- time_spend_company: integer (nullable = true)\n", 53 | " |-- Work_accident: integer (nullable = true)\n", 54 | " |-- left: integer (nullable = true)\n", 55 | " |-- promotion_last_5years: integer (nullable = true)\n", 56 | " |-- sales: string (nullable = true)\n", 57 | " |-- salary: string (nullable = true)\n", 58 | "\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "hr_data.printSchema()" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "[Row(sales=u'management'),\n", 77 | " Row(sales=u'product_mng'),\n", 78 | " Row(sales=u'marketing'),\n", 79 | " Row(sales=u'sales'),\n", 80 | " Row(sales=u'hr'),\n", 81 | " Row(sales=u'accounting'),\n", 82 | " Row(sales=u'support'),\n", 83 | " Row(sales=u'IT'),\n", 84 | " Row(sales=u'technical'),\n", 85 | " Row(sales=u'RandD')]" 86 | ] 87 | }, 88 | "execution_count": 4, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "#Get unique data of sales col\n", 95 | "hr_data[['sales']].distinct().collect()" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "#Rename col from sales to dept\n", 107 | "hr_data = hr_data.withColumnRenamed('sales','dept')" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "metadata": { 114 | "collapsed": false 115 | }, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "root\n", 122 | " |-- satisfaction_level: double (nullable = true)\n", 123 | " |-- last_evaluation: double (nullable = true)\n", 124 | " |-- number_project: integer (nullable = true)\n", 125 | " |-- average_montly_hours: integer (nullable = true)\n", 126 | " |-- time_spend_company: integer 
(nullable = true)\n", 127 | " |-- Work_accident: integer (nullable = true)\n", 128 | " |-- left: integer (nullable = true)\n", 129 | " |-- promotion_last_5years: integer (nullable = true)\n", 130 | " |-- dept: string (nullable = true)\n", 131 | " |-- salary: string (nullable = true)\n", 132 | "\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "hr_data.printSchema()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "[Row(salary=u'low'), Row(salary=u'high'), Row(salary=u'medium')]" 151 | ] 152 | }, 153 | "execution_count": 7, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "#Get unique data of sales col\n", 160 | "\n", 161 | "#hr_data[['salary']].distinct().collect()\n", 162 | "\n", 163 | "hr_data.select('salary').distinct().collect()" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 8, 169 | "metadata": { 170 | "collapsed": false 171 | }, 172 | "outputs": [ 173 | { 174 | "name": "stdout", 175 | "output_type": "stream", 176 | "text": [ 177 | "+-------+-------------------+-------------------+------------------+\n", 178 | "|summary| satisfaction_level| left| number_project|\n", 179 | "+-------+-------------------+-------------------+------------------+\n", 180 | "| count| 14999| 14999| 14999|\n", 181 | "| mean| 0.6128335222348166| 0.2380825388359224| 3.80305353690246|\n", 182 | "| stddev|0.24863065106114257|0.42592409938029885|1.2325923553183513|\n", 183 | "| min| 0.09| 0| 2|\n", 184 | "| max| 1.0| 1| 7|\n", 185 | "+-------+-------------------+-------------------+------------------+\n", 186 | "\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "hr_data.describe()['summary','satisfaction_level','left','number_project'].show()" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "### Featurization - Convert string data to numbers\n", 199 | "* dept & salary are categorical information\n", 200 | "* Need to convert them to number" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 9, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "import pyspark.ml.feature as ft\n", 212 | "\n", 213 | "#StringIndexer - converts string data to numbers\n", 214 | "#input cols are dept 7 salary.\n", 215 | "#output are *_en\n", 216 | "\n", 217 | "transformer_dept = ft.StringIndexer(inputCol='dept', outputCol='dept_en')\n", 218 | "transformer_salary = ft.StringIndexer(inputCol='salary', outputCol='salary_en')" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 11, 224 | "metadata": { 225 | "collapsed": true 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "# Convert numerical data into vector\n", 230 | "# VectorAssembler for creating vector" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 13, 236 | "metadata": { 237 | "collapsed": false 238 | }, 239 | "outputs": [ 240 | { 241 | "data": { 242 | "text/plain": [ 243 | "['satisfaction_level',\n", 244 | " 'last_evaluation',\n", 245 | " 'number_project',\n", 246 | " 'average_montly_hours',\n", 247 | " 'time_spend_company',\n", 248 | " 'Work_accident',\n", 249 | " 'left',\n", 250 | " 'promotion_last_5years',\n", 251 | " 'dept',\n", 252 | " 'salary']" 253 | ] 254 | }, 255 | "execution_count": 13, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 
| } 259 | ], 260 | "source": [ 261 | "hr_data.columns" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 17, 267 | "metadata": { 268 | "collapsed": false 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "#Convert all numerical data to vector\n", 273 | "featurescreator = ft.VectorAssembler(inputCols=['satisfaction_level',\n", 274 | " 'last_evaluation',\n", 275 | " 'number_project',\n", 276 | " 'average_montly_hours',\n", 277 | " 'time_spend_company',\n", 278 | " 'Work_accident',\n", 279 | " 'promotion_last_5years','dept_en','salary_en'], outputCol='features')" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 18, 285 | "metadata": { 286 | "collapsed": true 287 | }, 288 | "outputs": [], 289 | "source": [ 290 | "import pyspark.ml.classification as cl" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 19, 296 | "metadata": { 297 | "collapsed": true 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "#creating estimator\n", 302 | "logistic = cl.LogisticRegression(maxIter=10, regParam=0.01, labelCol='left')" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 20, 308 | "metadata": { 309 | "collapsed": true 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "from pyspark.ml import Pipeline" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 21, 319 | "metadata": { 320 | "collapsed": true 321 | }, 322 | "outputs": [], 323 | "source": [ 324 | "#create pipeline connecting 3 transformers & one estimator\n", 325 | "pipeline = Pipeline(stages=[transformer_dept, \n", 326 | " transformer_salary, \n", 327 | " featurescreator,\n", 328 | " logistic])" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 22, 334 | "metadata": { 335 | "collapsed": true 336 | }, 337 | "outputs": [], 338 | "source": [ 339 | "#Split data for test & train\n", 340 | "#seed controls the random data generated\n", 341 | "hr_data_train, hr_data_test = hr_data.randomSplit([0.7,0.3],seed=100)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 24, 347 | "metadata": { 348 | "collapsed": true 349 | }, 350 | "outputs": [], 351 | "source": [ 352 | "#Training the pipeline\n", 353 | "model = pipeline.fit(hr_data_train)" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 25, 359 | "metadata": { 360 | "collapsed": true 361 | }, 362 | "outputs": [], 363 | "source": [ 364 | "test_out = model.transform(hr_data_test)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 26, 370 | "metadata": { 371 | "collapsed": false 372 | }, 373 | "outputs": [ 374 | { 375 | "data": { 376 | "text/plain": [ 377 | "DataFrame[satisfaction_level: double, last_evaluation: double, number_project: int, average_montly_hours: int, time_spend_company: int, Work_accident: int, left: int, promotion_last_5years: int, dept: string, salary: string, dept_en: double, salary_en: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]" 378 | ] 379 | }, 380 | "execution_count": 26, 381 | "metadata": {}, 382 | "output_type": "execute_result" 383 | } 384 | ], 385 | "source": [ 386 | "test_out" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 37, 392 | "metadata": { 393 | "collapsed": false 394 | }, 395 | "outputs": [ 396 | { 397 | "data": { 398 | "text/plain": [ 399 | "[Row(probability=DenseVector([0.3824, 0.6176])),\n", 400 | " Row(probability=DenseVector([0.327, 0.673])),\n", 401 | " 
Row(probability=DenseVector([0.3909, 0.6091])),\n", 402 | " Row(probability=DenseVector([0.359, 0.641])),\n", 403 | " Row(probability=DenseVector([0.3142, 0.6858]))]" 404 | ] 405 | }, 406 | "execution_count": 37, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "test_out[['probability']].take(5)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 31, 418 | "metadata": { 419 | "collapsed": true 420 | }, 421 | "outputs": [], 422 | "source": [ 423 | "#To find accuracy of the algo under processing\n", 424 | "import pyspark.ml.evaluation as ev" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 33, 430 | "metadata": { 431 | "collapsed": true 432 | }, 433 | "outputs": [], 434 | "source": [ 435 | "#BinaryClassification\n", 436 | "evaluator = ev.BinaryClassificationEvaluator(rawPredictionCol='probability', \n", 437 | " labelCol='left')" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 34, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [ 447 | { 448 | "data": { 449 | "text/plain": [ 450 | "0.822542214798108" 451 | ] 452 | }, 453 | "execution_count": 34, 454 | "metadata": {}, 455 | "output_type": "execute_result" 456 | } 457 | ], 458 | "source": [ 459 | "evaluator.evaluate(test_out)" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 45, 465 | "metadata": { 466 | "collapsed": false 467 | }, 468 | "outputs": [ 469 | { 470 | "data": { 471 | "text/plain": [ 472 | "[Row(prediction=1.0),\n", 473 | " Row(prediction=1.0),\n", 474 | " Row(prediction=1.0),\n", 475 | " Row(prediction=1.0),\n", 476 | " Row(prediction=1.0)]" 477 | ] 478 | }, 479 | "execution_count": 45, 480 | "metadata": {}, 481 | "output_type": "execute_result" 482 | } 483 | ], 484 | "source": [ 485 | "test_out[['prediction']].take(5)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 48, 491 | "metadata": { 492 | "collapsed": false 493 | }, 494 | "outputs": [ 495 | { 496 | "name": "stdout", 497 | "output_type": "stream", 498 | "text": [ 499 | "+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----------+------+\n", 500 | "|satisfaction_level|last_evaluation|number_project|average_montly_hours|time_spend_company|Work_accident|left|promotion_last_5years| dept|salary|\n", 501 | "+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----------+------+\n", 502 | "| 0.09| 0.77| 5| 275| 4| 0| 1| 0|product_mng|medium|\n", 503 | "| 0.09| 0.77| 6| 244| 4| 0| 1| 0|product_mng| low|\n", 504 | "| 0.09| 0.77| 6| 256| 5| 0| 1| 0| support|medium|\n", 505 | "| 0.09| 0.77| 6| 282| 5| 0| 1| 0| sales|medium|\n", 506 | "| 0.09| 0.78| 6| 254| 4| 0| 1| 0| support| low|\n", 507 | "+------------------+---------------+--------------+--------------------+------------------+-------------+----+---------------------+-----------+------+\n", 508 | "only showing top 5 rows\n", 509 | "\n" 510 | ] 511 | } 512 | ], 513 | "source": [ 514 | "test_out[hr_data.columns].show(5)" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 51, 520 | "metadata": { 521 | "collapsed": false 522 | }, 523 | "outputs": [ 524 | { 525 | "data": { 526 | "text/plain": [ 527 | "-0.3883749834241161" 528 | ] 529 | }, 530 | "execution_count": 51, 531 | "metadata": {}, 532 | "output_type": "execute_result" 533 | } 534 
| ], 535 | "source": [ 536 | "hr_data.corr(col1='satisfaction_level', col2='left')" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": null, 542 | "metadata": { 543 | "collapsed": true 544 | }, 545 | "outputs": [], 546 | "source": [] 547 | } 548 | ], 549 | "metadata": { 550 | "kernelspec": { 551 | "display_name": "Python 2", 552 | "language": "python", 553 | "name": "python2" 554 | }, 555 | "language_info": { 556 | "codemirror_mode": { 557 | "name": "ipython", 558 | "version": 2 559 | }, 560 | "file_extension": ".py", 561 | "mimetype": "text/x-python", 562 | "name": "python", 563 | "nbconvert_exporter": "python", 564 | "pygments_lexer": "ipython2", 565 | "version": "2.7.13" 566 | } 567 | }, 568 | "nbformat": 4, 569 | "nbformat_minor": 2 570 | } 571 | -------------------------------------------------------------------------------- /Project-PySpark.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Project-PySpark.zip -------------------------------------------------------------------------------- /PySpark-Structured+Streaming.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Structured Streaming using Python DataFrames API\n", 8 | "\n", 9 | "Apache Spark 2.0 adds the first version of a new higher-level stream processing API, Structured Streaming. In this notebook we are going to take a quick look at how to use DataFrame API to build Structured Streaming applications. We want to compute real-time metrics like running counts and windowed counts on a stream of timestamped actions (e.g. Open, Close, etc)." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Sample Data\n", 17 | "We have some sample action data as files in `/databricks-datasets/structured-streaming/events/` which we are going to use to build this appication. Let's take a look at the contents of this directory." 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 3, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "%fs ls /databricks-datasets/structured-streaming/" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "There are about 50 JSON files in the directory. Let's see what each JSON file contains." 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 5, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "%fs head /databricks-datasets/structured-streaming/events/file-0.json" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "Each line in the file contains JSON record with two fields - `time` and `action`. Let's try to analyze these files interactively." 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Batch/Interactive Processing\n", 61 | "The usual first step in attempting to process the data is to interactively query the data. Let's define a static DataFrame on the files, and give it a table name." 
62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 8, 67 | "metadata": { 68 | "collapsed": true 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "from pyspark.sql.types import *\n", 73 | "\n", 74 | "inputPath = \"/databricks-datasets/structured-streaming/events/\"\n", 75 | "\n", 76 | "# Since we know the data format already, let's define the schema to speed up processing (no need for Spark to infer schema)\n", 77 | "jsonSchema = StructType([ StructField(\"time\", TimestampType(), True), StructField(\"action\", StringType(), True) ])\n", 78 | "\n", 79 | "# Static DataFrame representing data in the JSON files\n", 80 | "staticInputDF = (\n", 81 | " spark\n", 82 | " .read\n", 83 | " .schema(jsonSchema)\n", 84 | " .json(inputPath)\n", 85 | ")\n", 86 | "\n", 87 | "display(staticInputDF)" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "Now we can compute the number of \"open\" and \"close\" actions with one hour windows. To do this, we will group by the `action` column and 1 hour windows over the `time` column." 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 10, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "from pyspark.sql.functions import * # for window() function\n", 106 | "\n", 107 | "staticCountsDF = (\n", 108 | " staticInputDF\n", 109 | " .groupBy(\n", 110 | " staticInputDF.action, \n", 111 | " window(staticInputDF.time, \"1 hour\")) \n", 112 | " .count()\n", 113 | ")\n", 114 | "staticCountsDF.cache()\n", 115 | "\n", 116 | "# Register the DataFrame as table 'static_counts'\n", 117 | "staticCountsDF.createOrReplaceTempView(\"static_counts\")" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Now we can directly use SQL to query the table. For example, here are the total counts across all the hours." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 12, 130 | "metadata": { 131 | "collapsed": true 132 | }, 133 | "outputs": [], 134 | "source": [ 135 | "%sql select action, sum(count) as total_count from static_counts group by action" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "How about a timeline of windowed counts?" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 14, 148 | "metadata": { 149 | "collapsed": true 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "%sql select action, date_format(window.end, \"MMM-dd HH:mm\") as time, count from static_counts order by time, action" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "Note the two ends of the graph. The close actions are generated such that they are after the corresponding open actions, so there are more \"opens\" in the beginning and more \"closes\" in the end." 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "## Stream Processing \n", 168 | "Now that we have analyzed the data interactively, let's convert this to a streaming query that continuously updates as data comes. Since we just have a static set of files, we are going to emulate a stream from them by reading one file at a time, in the chronological order they were created. The query we have to write is pretty much the same as the interactive query above." 
169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 17, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "from pyspark.sql.functions import *\n", 180 | "\n", 181 | "# Similar to definition of staticInputDF above, just using `readStream` instead of `read`\n", 182 | "streamingInputDF = (\n", 183 | " spark\n", 184 | " .readStream \n", 185 | " .schema(jsonSchema) # Set the schema of the JSON data\n", 186 | " .option(\"maxFilesPerTrigger\", 1) # Treat a sequence of files as a stream by picking one file at a time\n", 187 | " .json(inputPath)\n", 188 | ")\n", 189 | "\n", 190 | "# Same query as staticInputDF\n", 191 | "streamingCountsDF = ( \n", 192 | " streamingInputDF\n", 193 | " .groupBy(\n", 194 | " streamingInputDF.action, \n", 195 | " window(streamingInputDF.time, \"2 hours\"))\n", 196 | " .count()\n", 197 | ")\n", 198 | "\n", 199 | "# Is this DF actually a streaming DF?\n", 200 | "streamingCountsDF.isStreaming" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "As you can see, `streamingCountsDF` is a streaming Dataframe (`streamingCountsDF.isStreaming` was `true`). You can start streaming computation, by defining the sink and starting it. \n", 208 | "In our case, we want to interactively query the counts (same queries as above), so we will set the complete set of 1 hour counts to be in a in-memory table (note that this for testing purpose only in Spark 2.0)." 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 19, 214 | "metadata": { 215 | "collapsed": true 216 | }, 217 | "outputs": [], 218 | "source": [ 219 | "spark.conf.set(\"spark.sql.shuffle.partitions\", \"2\") # keep the size of shuffles small\n", 220 | "\n", 221 | "query = (\n", 222 | " streamingCountsDF\n", 223 | " .writeStream\n", 224 | " .format(\"memory\") # memory = store in-memory table (for testing only in Spark 2.0)\n", 225 | " .queryName(\"counts\") # counts = name of the in-memory table\n", 226 | " .outputMode(\"complete\") # complete = all the counts should be in the table\n", 227 | " .start()\n", 228 | ")" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "`query` is a handle to the streaming query that is running in the background. This query is continuously picking up files and updating the windowed counts. \n", 236 | "\n", 237 | "Note the status of query in the above cell. Both the `Status: ACTIVE` and the progress bar shows that the query is active. \n", 238 | "Furthermore, if you expand the `>Details` above, you will find the number of files they have already processed. \n", 239 | "\n", 240 | "Let's wait a bit for a few files to be processed and then interactively query the in-memory `counts` table." 
241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 21, 246 | "metadata": { 247 | "collapsed": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "from time import sleep\n", 252 | "sleep(5) # wait a bit for computation to start" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 22, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "%sql select action, date_format(window.end, \"MMM-dd HH:mm\") as time, count from counts order by time, action" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": {}, 269 | "source": [ 270 | "We see the timeline of windowed counts (similar to the static one ealrier) building up. If we keep running this interactive query repeatedly, we will see the latest updated counts which the streaming query is updating in the background." 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 24, 276 | "metadata": { 277 | "collapsed": true 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "sleep(5) # wait a bit more for more data to be computed" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 25, 287 | "metadata": { 288 | "collapsed": true 289 | }, 290 | "outputs": [], 291 | "source": [ 292 | "%sql select action, date_format(window.end, \"MMM-dd HH:mm\") as time, count from counts order by time, action" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 26, 298 | "metadata": { 299 | "collapsed": true 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "sleep(5) # wait a bit more for more data to be computed" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 27, 309 | "metadata": { 310 | "collapsed": true 311 | }, 312 | "outputs": [], 313 | "source": [ 314 | "%sql select action, date_format(window.end, \"MMM-dd HH:mm\") as time, count from counts order by time, action" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "Also, let's see the total number of \"opens\" and \"closes\"." 
322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 29, 327 | "metadata": { 328 | "collapsed": true 329 | }, 330 | "outputs": [], 331 | "source": [ 332 | "%sql select action, sum(count) as total_count from counts group by action order by action" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [] 339 | } 340 | ], 341 | "metadata": { 342 | "kernelspec": { 343 | "display_name": "Python 2", 344 | "language": "python", 345 | "name": "python2" 346 | }, 347 | "language_info": { 348 | "codemirror_mode": { 349 | "name": "ipython", 350 | "version": 2 351 | }, 352 | "file_extension": ".py", 353 | "mimetype": "text/x-python", 354 | "name": "python", 355 | "nbconvert_exporter": "python", 356 | "pygments_lexer": "ipython2", 357 | "version": "2.7.13" 358 | }, 359 | "name": "Introduction to Structured Streaming", 360 | "notebookId": 2229079593072345 361 | }, 362 | "nbformat": 4, 363 | "nbformat_minor": 0 364 | } 365 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pyspark 2 | -------------------------------------------------------------------------------- /Recommendation+Engine+.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 10, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#data in textfile separated by ::\n", 12 | "#so loading as textFile\n", 13 | "lines = spark.read.text('data/mllib/als/sample_movielens_ratings.txt').rdd" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 11, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "[Row(value=u'0::2::3::1424380312'),\n", 27 | " Row(value=u'0::3::1::1424380312'),\n", 28 | " Row(value=u'0::5::2::1424380312'),\n", 29 | " Row(value=u'0::9::4::1424380312'),\n", 30 | " Row(value=u'0::11::1::1424380312')]" 31 | ] 32 | }, 33 | "execution_count": 11, 34 | "metadata": {}, 35 | "output_type": "execute_result" 36 | } 37 | ], 38 | "source": [ 39 | "lines.take(5)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 19, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "#Split based on ::\n", 51 | "data = lines.map(lambda row:row.split('::'))" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 13, 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "[[u'0', u'2', u'3', u'1424380312'],\n", 65 | " [u'0', u'3', u'1', u'1424380312'],\n", 66 | " [u'0', u'5', u'2', u'1424380312'],\n", 67 | " [u'0', u'9', u'4', u'1424380312'],\n", 68 | " [u'0', u'11', u'1', u'1424380312']]" 69 | ] 70 | }, 71 | "execution_count": 13, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "data.take(5)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 17, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [], 87 | "source": [ 88 | "from pyspark.sql import Row\n", 89 | "\n", 90 | "rdd = data.map(lambda d: Row(userId= int(d[0]), \n", 91 | " movieId=int(d[1]), \n", 92 | " rating=int(d[2]), \n", 93 | " timestamp=long(d[3]) ))" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 18, 99 | "metadata": { 100 | 
"collapsed": false 101 | }, 102 | "outputs": [ 103 | { 104 | "data": { 105 | "text/plain": [ 106 | "[Row(movieId=2, rating=3, timestamp=1424380312L, userId=0),\n", 107 | " Row(movieId=3, rating=1, timestamp=1424380312L, userId=0)]" 108 | ] 109 | }, 110 | "execution_count": 18, 111 | "metadata": {}, 112 | "output_type": "execute_result" 113 | } 114 | ], 115 | "source": [ 116 | "rdd.take(2)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 20, 122 | "metadata": { 123 | "collapsed": true 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "df = spark.createDataFrame(rdd)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 21, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "+-------+------+----------+------+\n", 142 | "|movieId|rating| timestamp|userId|\n", 143 | "+-------+------+----------+------+\n", 144 | "| 2| 3|1424380312| 0|\n", 145 | "| 3| 1|1424380312| 0|\n", 146 | "| 5| 2|1424380312| 0|\n", 147 | "| 9| 4|1424380312| 0|\n", 148 | "| 11| 1|1424380312| 0|\n", 149 | "| 12| 2|1424380312| 0|\n", 150 | "| 15| 1|1424380312| 0|\n", 151 | "| 17| 1|1424380312| 0|\n", 152 | "| 19| 1|1424380312| 0|\n", 153 | "| 21| 1|1424380312| 0|\n", 154 | "| 23| 1|1424380312| 0|\n", 155 | "| 26| 3|1424380312| 0|\n", 156 | "| 27| 1|1424380312| 0|\n", 157 | "| 28| 1|1424380312| 0|\n", 158 | "| 29| 1|1424380312| 0|\n", 159 | "| 30| 1|1424380312| 0|\n", 160 | "| 31| 1|1424380312| 0|\n", 161 | "| 34| 1|1424380312| 0|\n", 162 | "| 37| 1|1424380312| 0|\n", 163 | "| 41| 2|1424380312| 0|\n", 164 | "+-------+------+----------+------+\n", 165 | "only showing top 20 rows\n", 166 | "\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "df.show()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 22, 177 | "metadata": { 178 | "collapsed": false 179 | }, 180 | "outputs": [ 181 | { 182 | "name": "stdout", 183 | "output_type": "stream", 184 | "text": [ 185 | "+-------+------+----------+------+\n", 186 | "|movieId|rating| timestamp|userId|\n", 187 | "+-------+------+----------+------+\n", 188 | "| 2| 3|1424380312| 0|\n", 189 | "| 3| 1|1424380312| 0|\n", 190 | "| 5| 2|1424380312| 0|\n", 191 | "| 9| 4|1424380312| 0|\n", 192 | "| 11| 1|1424380312| 0|\n", 193 | "| 12| 2|1424380312| 0|\n", 194 | "| 15| 1|1424380312| 0|\n", 195 | "| 17| 1|1424380312| 0|\n", 196 | "| 19| 1|1424380312| 0|\n", 197 | "| 21| 1|1424380312| 0|\n", 198 | "| 23| 1|1424380312| 0|\n", 199 | "| 26| 3|1424380312| 0|\n", 200 | "| 27| 1|1424380312| 0|\n", 201 | "| 28| 1|1424380312| 0|\n", 202 | "| 29| 1|1424380312| 0|\n", 203 | "| 30| 1|1424380312| 0|\n", 204 | "| 31| 1|1424380312| 0|\n", 205 | "| 34| 1|1424380312| 0|\n", 206 | "| 37| 1|1424380312| 0|\n", 207 | "| 41| 2|1424380312| 0|\n", 208 | "+-------+------+----------+------+\n", 209 | "only showing top 20 rows\n", 210 | "\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "rdd.toDF().show()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 23, 221 | "metadata": { 222 | "collapsed": true 223 | }, 224 | "outputs": [], 225 | "source": [ 226 | "train, test = df.randomSplit([0.8,0.2])" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 24, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "from pyspark.ml.recommendation import ALS" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | 
"metadata": { 244 | "collapsed": true 245 | }, 246 | "outputs": [], 247 | "source": [ 248 | "als = ALS(maxIter=5, regParam=0.01, userCol=\"userId\", itemCol=\"movieId\")" 249 | ] 250 | } 251 | ], 252 | "metadata": { 253 | "kernelspec": { 254 | "display_name": "Python 2", 255 | "language": "python", 256 | "name": "python2" 257 | }, 258 | "language_info": { 259 | "codemirror_mode": { 260 | "name": "ipython", 261 | "version": 2 262 | }, 263 | "file_extension": ".py", 264 | "mimetype": "text/x-python", 265 | "name": "python", 266 | "nbconvert_exporter": "python", 267 | "pygments_lexer": "ipython2", 268 | "version": "2.7.13" 269 | } 270 | }, 271 | "nbformat": 4, 272 | "nbformat_minor": 2 273 | } 274 | -------------------------------------------------------------------------------- /Spark Architecture.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Spark Architecture.pptx -------------------------------------------------------------------------------- /Spark RDD.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Spark RDD.pptx -------------------------------------------------------------------------------- /Spark Storage Data Formats.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Spark Storage Data Formats.pptx -------------------------------------------------------------------------------- /Spark-Introduction.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Spark-Introduction.pptx -------------------------------------------------------------------------------- /Spark-Partitioning.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Spark-Partitioning.pptx -------------------------------------------------------------------------------- /Spark-Performance Tuning.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/Spark-Performance Tuning.pptx -------------------------------------------------------------------------------- /Spark_SQL: -------------------------------------------------------------------------------- 1 | spark.read.json("/FileStore/tables/2015_flight_data-f8a15.json")\ 2 | .createOrReplaceTempView("some_sql_view") # DF => SQL 3 | 4 | spark.sql(""" 5 | SELECT DEST_COUNTRY_NAME, sum(count) 6 | FROM some_sql_view GROUP BY DEST_COUNTRY_NAME 7 | """)\ 8 | .where("DEST_COUNTRY_NAME like 'S%'").where("`sum(count)` > 10")\ 9 | .count() # SQL => DF 10 | 11 | 12 | CREATE TABLE flights ( 13 | DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count LONG) 14 | USING JSON OPTIONS (path '/FileStore/tables/2015_flight_data-f8a15.json') 15 | 16 | 17 | ---- 18 | 19 | CREATE TABLE flights_csv ( 20 | DEST_COUNTRY_NAME STRING, 21 | ORIGIN_COUNTRY_NAME STRING COMMENT "remember, the US will be most prevalent", 22 | count LONG) 23 | USING csv OPTIONS (header true, path 
'/data/flight-data/csv/2015-summary.csv') 24 | 25 | 26 | ---- 27 | 28 | CREATE TABLE flights_from_select USING parquet AS SELECT * FROM flights 29 | 30 | 31 | ---- 32 | 33 | CREATE TABLE IF NOT EXISTS flights_from_select 34 | AS SELECT * FROM flights 35 | 36 | 37 | ---- 38 | 39 | CREATE TABLE partitioned_flights USING parquet PARTITIONED BY (DEST_COUNTRY_NAME) 40 | AS SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count FROM flights LIMIT 5 41 | 42 | 43 | ---- 44 | 45 | CREATE EXTERNAL TABLE hive_flights ( 46 | DEST_COUNTRY_NAME STRING, ORIGIN_COUNTRY_NAME STRING, count LONG) 47 | ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '/data/flight-data-hive/' 48 | 49 | 50 | ---- 51 | 52 | CREATE EXTERNAL TABLE hive_flights_2 53 | ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' 54 | LOCATION '/data/flight-data-hive/' AS SELECT * FROM flights 55 | 56 | 57 | ---- 58 | 59 | INSERT INTO flights_from_select 60 | SELECT DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME, count FROM flights LIMIT 20 61 | 62 | 63 | ---- 64 | 65 | INSERT INTO partitioned_flights 66 | PARTITION (DEST_COUNTRY_NAME="UNITED STATES") 67 | SELECT count, ORIGIN_COUNTRY_NAME FROM flights 68 | WHERE DEST_COUNTRY_NAME='UNITED STATES' LIMIT 12 69 | 70 | 71 | ---- 72 | 73 | DESCRIBE TABLE flights_csv 74 | 75 | 76 | ---- 77 | 78 | SHOW PARTITIONS partitioned_flights 79 | 80 | 81 | ---- 82 | 83 | REFRESH table partitioned_flights 84 | 85 | 86 | ---- 87 | 88 | MSCK REPAIR TABLE partitioned_flights 89 | 90 | 91 | ---- 92 | 93 | DROP TABLE flights_csv; 94 | 95 | 96 | ---- 97 | 98 | DROP TABLE IF EXISTS flights_csv; 99 | 100 | 101 | ---- 102 | 103 | CACHE TABLE flights 104 | 105 | 106 | ---- 107 | 108 | UNCACHE TABLE FLIGHTS 109 | 110 | 111 | ---- 112 | 113 | CREATE VIEW just_usa_view AS 114 | SELECT * FROM flights WHERE dest_country_name = 'United States' 115 | 116 | 117 | ---- 118 | 119 | CREATE TEMP VIEW just_usa_view_temp AS 120 | SELECT * FROM flights WHERE dest_country_name = 'United States' 121 | 122 | 123 | ---- 124 | 125 | CREATE GLOBAL TEMP VIEW just_usa_global_view_temp AS 126 | SELECT * FROM flights WHERE dest_country_name = 'United States' 127 | 128 | 129 | ---- 130 | 131 | SHOW TABLES 132 | 133 | 134 | ---- 135 | 136 | CREATE OR REPLACE TEMP VIEW just_usa_view_temp AS 137 | SELECT * FROM flights WHERE dest_country_name = 'United States' 138 | 139 | 140 | ---- 141 | 142 | SELECT * FROM just_usa_view_temp 143 | 144 | 145 | ---- 146 | 147 | EXPLAIN SELECT * FROM just_usa_view 148 | 149 | 150 | ---- 151 | 152 | EXPLAIN SELECT * FROM flights WHERE dest_country_name = 'United States' 153 | 154 | 155 | ---- 156 | 157 | DROP VIEW IF EXISTS just_usa_view; 158 | 159 | 160 | ---- 161 | 162 | SHOW DATABASES 163 | 164 | 165 | ---- 166 | 167 | CREATE DATABASE some_db 168 | 169 | 170 | ---- 171 | 172 | USE some_db 173 | 174 | 175 | ---- 176 | 177 | SHOW tables 178 | 179 | SELECT * FROM flights -- fails with table/view not found 180 | 181 | 182 | ---- 183 | 184 | SELECT * FROM default.flights 185 | 186 | 187 | ---- 188 | 189 | SELECT current_database() 190 | 191 | 192 | ---- 193 | 194 | USE default; 195 | 196 | 197 | ---- 198 | 199 | DROP DATABASE IF EXISTS some_db; 200 | 201 | 202 | ---- 203 | 204 | SELECT [ALL|DISTINCT] named_expression[, named_expression, ...] 205 | FROM relation[, relation, ...] 
206 | [lateral_view[, lateral_view, ...]] 207 | [WHERE boolean_expression] 208 | [aggregation [HAVING boolean_expression]] 209 | [ORDER BY sort_expressions] 210 | [CLUSTER BY expressions] 211 | [DISTRIBUTE BY expressions] 212 | [SORT BY sort_expressions] 213 | [WINDOW named_window[, WINDOW named_window, ...]] 214 | [LIMIT num_rows] 215 | 216 | named_expression: 217 | : expression [AS alias] 218 | 219 | relation: 220 | | join_relation 221 | | (table_name|query|relation) [sample] [AS alias] 222 | : VALUES (expressions)[, (expressions), ...] 223 | [AS (column_name[, column_name, ...])] 224 | 225 | expressions: 226 | : expression[, expression, ...] 227 | 228 | sort_expressions: 229 | : expression [ASC|DESC][, expression [ASC|DESC], ...] 230 | 231 | 232 | ---- 233 | 234 | SELECT 235 | CASE WHEN DEST_COUNTRY_NAME = 'UNITED STATES' THEN 1 236 | WHEN DEST_COUNTRY_NAME = 'Egypt' THEN 0 237 | ELSE -1 END 238 | FROM partitioned_flights 239 | 240 | 241 | ---- 242 | 243 | CREATE VIEW IF NOT EXISTS nested_data AS 244 | SELECT (DEST_COUNTRY_NAME, ORIGIN_COUNTRY_NAME) as country, count FROM flights 245 | 246 | 247 | ---- 248 | 249 | SELECT * FROM nested_data 250 | 251 | 252 | ---- 253 | 254 | SELECT country.DEST_COUNTRY_NAME, count FROM nested_data 255 | 256 | 257 | ---- 258 | 259 | SELECT country.*, count FROM nested_data 260 | 261 | 262 | ---- 263 | 264 | SELECT DEST_COUNTRY_NAME as new_name, collect_list(count) as flight_counts, 265 | collect_set(ORIGIN_COUNTRY_NAME) as origin_set 266 | FROM flights GROUP BY DEST_COUNTRY_NAME 267 | 268 | 269 | ---- 270 | 271 | SELECT DEST_COUNTRY_NAME, ARRAY(1, 2, 3) FROM flights 272 | 273 | 274 | ---- 275 | 276 | SELECT DEST_COUNTRY_NAME as new_name, collect_list(count)[0] 277 | FROM flights GROUP BY DEST_COUNTRY_NAME 278 | 279 | 280 | ---- 281 | 282 | CREATE OR REPLACE TEMP VIEW flights_agg AS 283 | SELECT DEST_COUNTRY_NAME, collect_list(count) as collected_counts 284 | FROM flights GROUP BY DEST_COUNTRY_NAME 285 | 286 | 287 | ---- 288 | 289 | SELECT explode(collected_counts), DEST_COUNTRY_NAME FROM flights_agg 290 | 291 | 292 | ---- 293 | 294 | SHOW FUNCTIONS 295 | 296 | 297 | ---- 298 | 299 | SHOW SYSTEM FUNCTIONS 300 | 301 | 302 | ---- 303 | 304 | SHOW USER FUNCTIONS 305 | 306 | 307 | ---- 308 | 309 | SHOW FUNCTIONS "s*"; 310 | 311 | 312 | ---- 313 | 314 | SHOW FUNCTIONS LIKE "collect*"; 315 | 316 | 317 | ---- 318 | 319 | SELECT count, power3(count) FROM flights 320 | 321 | 322 | ---- 323 | 324 | SELECT dest_country_name FROM flights 325 | GROUP BY dest_country_name ORDER BY sum(count) DESC LIMIT 5 326 | 327 | 328 | ---- 329 | 330 | SELECT * FROM flights 331 | WHERE origin_country_name IN (SELECT dest_country_name FROM flights 332 | GROUP BY dest_country_name ORDER BY sum(count) DESC LIMIT 5) 333 | 334 | 335 | ---- 336 | 337 | SELECT * FROM flights f1 338 | WHERE EXISTS (SELECT 1 FROM flights f2 339 | WHERE f1.dest_country_name = f2.origin_country_name) 340 | AND EXISTS (SELECT 1 FROM flights f2 341 | WHERE f2.dest_country_name = f1.origin_country_name) 342 | 343 | 344 | ---- 345 | 346 | SELECT *, (SELECT max(count) FROM flights) AS maximum FROM flights 347 | 348 | 349 | ---- 350 | 351 | SET spark.sql.shuffle.partitions=20 352 | 353 | 354 | ---- 355 | 356 | 357 | 358 | -------------------------------------------------------------------------------- /Tensorframes.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 
8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import tensorflow as tf\n", 12 | "import tensorframes as tfs\n", 13 | "from pyspark.sql import Row" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": { 20 | "collapsed": false 21 | }, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "[Row(z=3.0, x=0.0),\n", 27 | " Row(z=4.0, x=1.0),\n", 28 | " Row(z=5.0, x=2.0),\n", 29 | " Row(z=6.0, x=3.0),\n", 30 | " Row(z=7.0, x=4.0),\n", 31 | " Row(z=8.0, x=5.0),\n", 32 | " Row(z=9.0, x=6.0),\n", 33 | " Row(z=10.0, x=7.0),\n", 34 | " Row(z=11.0, x=8.0),\n", 35 | " Row(z=12.0, x=9.0)]" 36 | ] 37 | }, 38 | "execution_count": 3, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "data = [Row(x=float(x)) for x in range(10)]\n", 45 | "df = spark.createDataFrame(data)\n", 46 | "\n", 47 | "\n", 48 | "with tf.Graph().as_default() as g:\n", 49 | " # The TensorFlow placeholder that corresponds to column 'x'.\n", 50 | " # The shape of the placeholder is automatically inferred from the DataFrame.\n", 51 | " \n", 52 | " x = tfs.block(df, \"x\")\n", 53 | " \n", 54 | " # The output that adds 3 to x\n", 55 | " z = tf.add(x, 3, name='z')\n", 56 | " \n", 57 | " # The resulting dataframe\n", 58 | " df2 = tfs.map_blocks(z, df)\n", 59 | " tf.summary.FileWriter('tensorboard/logs',g)\n", 60 | "\n", 61 | "# The transform is lazy as for most DataFrame operations. This will trigger it:\n", 62 | "df2.collect()" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 4, 68 | "metadata": { 69 | "collapsed": false 70 | }, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "root\n", 77 | " |-- x: double (nullable = true) double[?]\n", 78 | "\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "tfs.print_schema(df)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "DataFrame[x: double]" 97 | ] 98 | }, 99 | "execution_count": 5, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "tfs.analyze(df)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 6, 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "root\n", 120 | " |-- z: double (nullable = false) double[?]\n", 121 | " |-- x: double (nullable = true) double[?]\n", 122 | "\n" 123 | ] 124 | } 125 | ], 126 | "source": [ 127 | "tfs.print_schema(df2)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 17, 133 | "metadata": { 134 | "collapsed": true 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "df3 = df2.select(df2.z.alias(\"y\"))" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 18, 144 | "metadata": { 145 | "collapsed": false 146 | }, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "+----+\n", 153 | "| y|\n", 154 | "+----+\n", 155 | "| 3.0|\n", 156 | "| 4.0|\n", 157 | "| 5.0|\n", 158 | "| 6.0|\n", 159 | "| 7.0|\n", 160 | "| 8.0|\n", 161 | "| 9.0|\n", 162 | "|10.0|\n", 163 | "|11.0|\n", 164 | "|12.0|\n", 165 | "+----+\n", 166 | "\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "df3.show()" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 21, 177 | "metadata": { 
178 | "collapsed": false 179 | }, 180 | "outputs": [ 181 | { 182 | "ename": "Exception", 183 | "evalue": "Could not find column with name {col_name}", 184 | "output_type": "error", 185 | "traceback": [ 186 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 187 | "\u001b[0;31mException\u001b[0m Traceback (most recent call last)", 188 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mGraph\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mas_default\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mg\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mz\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtfs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mblock\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"z\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mzz\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmultiply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mz\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mdf4\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtfs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap_blocks\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mzz\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mdf3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 189 | "\u001b[0;32m/tmp/spark-f9646918-662f-4ce3-8f31-c7b56277d5a5/userFiles-35f6b4e1-fa0e-4de1-b002-84d380644b39/databricks_tensorframes-0.2.9-s_2.11.jar/tensorframes/core.py\u001b[0m in \u001b[0;36mblock\u001b[0;34m(df, col_name, tf_name)\u001b[0m\n\u001b[1;32m 411\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0ma\u001b[0m \u001b[0mTensorFlow\u001b[0m \u001b[0mplaceholder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 412\u001b[0m \"\"\"\n\u001b[0;32m--> 413\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_auto_placeholder\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcol_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtf_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mblock\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 414\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 415\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcol_name\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtf_name\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 190 | "\u001b[0;32m/tmp/spark-f9646918-662f-4ce3-8f31-c7b56277d5a5/userFiles-35f6b4e1-fa0e-4de1-b002-84d380644b39/databricks_tensorframes-0.2.9-s_2.11.jar/tensorframes/core.py\u001b[0m in \u001b[0;36m_auto_placeholder\u001b[0;34m(df, col_name, tf_name, block)\u001b[0m\n\u001b[1;32m 432\u001b[0m \u001b[0mcol_shape\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0minfo\u001b[0m \u001b[0;32mif\u001b[0m 
\u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfieldName\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mcol_name\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 433\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcol_shape\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 434\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Could not find column with name {col_name}\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 435\u001b[0m \u001b[0mcol_shape\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcol_shape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 436\u001b[0m \u001b[0mcol_struct\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mx\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mx\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mschema\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfields\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mcol_name\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 191 | "\u001b[0;31mException\u001b[0m: Could not find column with name {col_name}" 192 | ] 193 | } 194 | ], 195 | "source": [ 196 | "with tf.Graph().as_default() as g:\n", 197 | " z = tfs.block(df3,\"z\")\n", 198 | " zz = tf.multiply(z,3)\n", 199 | " df4 = tfs.map_blocks(zz,df3)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 22, 205 | "metadata": { 206 | "collapsed": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "g = tf.Graph()" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 23, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "with g.as_default():\n", 222 | " b = tf.constant(7)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 24, 228 | "metadata": { 229 | "collapsed": false 230 | }, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/plain": [ 235 | "" 236 | ] 237 | }, 238 | "execution_count": 24, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "b.graph" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 25, 250 | "metadata": { 251 | "collapsed": false 252 | }, 253 | "outputs": [ 254 | { 255 | "data": { 256 | "text/plain": [ 257 | "" 258 | ] 259 | }, 260 | "execution_count": 25, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "g" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "collapsed": true 274 | }, 275 | "outputs": [], 276 | "source": [] 277 | } 278 | ], 279 | "metadata": { 280 | "kernelspec": { 281 | "display_name": "Python 2", 282 | "language": "python", 283 | "name": "python2" 284 | }, 285 | "language_info": { 286 | "codemirror_mode": { 287 | "name": "ipython", 288 | "version": 2 289 | }, 290 | "file_extension": ".py", 291 | "mimetype": "text/x-python", 292 | "name": "python", 293 | "nbconvert_exporter": "python", 294 | "pygments_lexer": "ipython2", 295 | "version": "2.7.13" 296 | } 297 | }, 298 | "nbformat": 4, 299 | "nbformat_minor": 2 300 | } 301 | 
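A note on the failing cell in Tensorframes.ipynb above: after df3 = df2.select(df2.z.alias("y")), the DataFrame only contains the column y, so tfs.block(df3, "z") has nothing to bind to (the literal {col_name} in the message is an unformatted string in tensorframes 0.2.9 itself, visible in the traceback). A minimal corrected sketch, assuming the same session, imports, and df3 from the notebook; the tensor name "out" is an arbitrary choice and this is untested here:

with tf.Graph().as_default() as g:
    # Bind the placeholder to the column that actually exists in df3.
    y = tfs.block(df3, "y")
    # Multiply each block by 3; the output tensor becomes the new column.
    out = tf.multiply(y, 3, name="out")
    df4 = tfs.map_blocks(out, df3)

df4.collect()  # the transform is lazy, as with df2 earlier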
-------------------------------------------------------------------------------- /Titanic Data Analysis using DataFrames.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","source":["titanic_data = spark.read.csv('/FileStore/tables/titanic/titanic_train-ac800.csv', header=True, inferSchema=True)"],"metadata":{},"outputs":[],"execution_count":1},{"cell_type":"code","source":["display(titanic_data)"],"metadata":{},"outputs":[],"execution_count":2},{"cell_type":"code","source":["titanic_data.printSchema()"],"metadata":{},"outputs":[],"execution_count":3},{"cell_type":"code","source":["#Finding maximum fare\ntitanic_data.agg({\"Fare\":\"max\"}).collect()"],"metadata":{},"outputs":[],"execution_count":4},{"cell_type":"code","source":["import pyspark.sql.functions as F"],"metadata":{},"outputs":[],"execution_count":5},{"cell_type":"code","source":["#Finding maximum fare - another way\ntitanic_data.agg(F.max(titanic_data.Fare)).collect()"],"metadata":{},"outputs":[],"execution_count":6},{"cell_type":"code","source":["display(titanic_data.groupBy('Pclass').count())"],"metadata":{},"outputs":[],"execution_count":7},{"cell_type":"code","source":["titanic_data.groupBy('Pclass').avg('Age').collect()"],"metadata":{},"outputs":[],"execution_count":8},{"cell_type":"code","source":["#Descending order of age\ndisplay(titanic_data.orderBy(titanic_data.Age.desc()))"],"metadata":{},"outputs":[],"execution_count":9},{"cell_type":"code","source":["#Person with longest name\ntitanic_data"],"metadata":{},"outputs":[],"execution_count":10},{"cell_type":"code","source":["### UDF in dataframes"],"metadata":{},"outputs":[],"execution_count":11},{"cell_type":"code","source":["from pyspark.sql.functions import udf"],"metadata":{},"outputs":[],"execution_count":12},{"cell_type":"code","source":["def getLen(word):\n return len(word)"],"metadata":{},"outputs":[],"execution_count":13},{"cell_type":"code","source":["from pyspark.sql.types import IntegerType\nlen_udf = udf(getLen, IntegerType())"],"metadata":{},"outputs":[],"execution_count":14},{"cell_type":"code","source":["titanic_data = titanic_data.select(\"*\", len_udf(\"Name\").alias(\"len_name\"))"],"metadata":{},"outputs":[],"execution_count":15},{"cell_type":"code","source":["titanic_data.agg({'len_name':'max'}).collect()"],"metadata":{},"outputs":[],"execution_count":16},{"cell_type":"code","source":["display(titanic_data[titanic_data.len_name == 82])"],"metadata":{},"outputs":[],"execution_count":17},{"cell_type":"code","source":["\n"],"metadata":{},"outputs":[],"execution_count":18},{"cell_type":"code","source":["titanic_data.select(len_udf(\"Name\").alias(\"len_name\")).len_name"],"metadata":{},"outputs":[],"execution_count":19},{"cell_type":"code","source":["##Remove all the rows in which age is missing"],"metadata":{},"outputs":[],"execution_count":20},{"cell_type":"code","source":["help(titanic_data.dropna)"],"metadata":{},"outputs":[],"execution_count":21},{"cell_type":"code","source":["display(titanic_data.dropna(subset=['Age']))"],"metadata":{},"outputs":[],"execution_count":22},{"cell_type":"code","source":[""],"metadata":{},"outputs":[],"execution_count":23}],"metadata":{"name":"Titanic Data Analysis using DataFrames","notebookId":3739718737890552},"nbformat":4,"nbformat_minor":0} 2 | -------------------------------------------------------------------------------- /Uber-Jan-Feb-FOIL.csv: -------------------------------------------------------------------------------- 1 | 
dispatching_base_number,date,active_vehicles,trips 2 | B02512,1/1/2015,190,1132 3 | B02765,1/1/2015,225,1765 4 | B02764,1/1/2015,3427,29421 5 | B02682,1/1/2015,945,7679 6 | B02617,1/1/2015,1228,9537 7 | B02598,1/1/2015,870,6903 8 | B02598,1/2/2015,785,4768 9 | B02617,1/2/2015,1137,7065 10 | B02512,1/2/2015,175,875 11 | B02682,1/2/2015,890,5506 12 | B02765,1/2/2015,196,1001 13 | B02764,1/2/2015,3147,19974 14 | B02765,1/3/2015,201,1526 15 | B02617,1/3/2015,1188,10664 16 | B02598,1/3/2015,818,7432 17 | B02682,1/3/2015,915,8010 18 | B02512,1/3/2015,173,1088 19 | B02764,1/3/2015,3215,29729 20 | B02512,1/4/2015,147,791 21 | B02682,1/4/2015,812,5621 22 | B02598,1/4/2015,746,5223 23 | B02765,1/4/2015,183,993 24 | B02617,1/4/2015,1088,7729 25 | B02764,1/4/2015,2862,20441 26 | B02512,1/5/2015,194,984 27 | B02682,1/5/2015,951,6012 28 | B02617,1/5/2015,1218,7899 29 | B02764,1/5/2015,3387,20926 30 | B02598,1/5/2015,907,5798 31 | B02765,1/5/2015,227,1133 32 | B02764,1/6/2015,3473,25301 33 | B02682,1/6/2015,1022,7491 34 | B02617,1/6/2015,1336,10128 35 | B02765,1/6/2015,234,1376 36 | B02512,1/6/2015,218,1314 37 | B02598,1/6/2015,933,6816 38 | B02617,1/7/2015,1363,11528 39 | B02682,1/7/2015,1039,9078 40 | B02764,1/7/2015,3603,29949 41 | B02765,1/7/2015,248,1704 42 | B02512,1/7/2015,217,1446 43 | B02598,1/7/2015,974,8397 44 | B02765,1/8/2015,262,1911 45 | B02598,1/8/2015,1070,10050 46 | B02512,1/8/2015,238,1772 47 | B02682,1/8/2015,1135,10416 48 | B02764,1/8/2015,3831,33802 49 | B02617,1/8/2015,1463,13462 50 | B02617,1/9/2015,1455,13165 51 | B02512,1/9/2015,224,1560 52 | B02764,1/9/2015,3820,33517 53 | B02682,1/9/2015,1140,10477 54 | B02598,1/9/2015,1070,9538 55 | B02765,1/9/2015,280,2039 56 | B02682,1/10/2015,1057,11629 57 | B02617,1/10/2015,1331,13856 58 | B02598,1/10/2015,949,10287 59 | B02512,1/10/2015,206,1646 60 | B02764,1/10/2015,3558,38864 61 | B02765,1/10/2015,245,2202 62 | B02765,1/11/2015,220,1672 63 | B02598,1/11/2015,832,7176 64 | B02682,1/11/2015,943,8461 65 | B02764,1/11/2015,3186,27681 66 | B02617,1/11/2015,1228,10932 67 | B02512,1/11/2015,162,1104 68 | B02764,1/12/2015,3499,26852 69 | B02765,1/12/2015,279,1711 70 | B02512,1/12/2015,217,1399 71 | B02598,1/12/2015,964,7915 72 | B02682,1/12/2015,1082,9107 73 | B02617,1/12/2015,1323,10662 74 | B02765,1/13/2015,258,1697 75 | B02598,1/13/2015,975,8713 76 | B02617,1/13/2015,1342,11825 77 | B02512,1/13/2015,234,1652 78 | B02764,1/13/2015,3658,29983 79 | B02682,1/13/2015,1092,9629 80 | B02764,1/14/2015,3736,29550 81 | B02765,1/14/2015,271,1600 82 | B02598,1/14/2015,1030,8870 83 | B02512,1/14/2015,233,1582 84 | B02617,1/14/2015,1405,11965 85 | B02682,1/14/2015,1174,9762 86 | B02512,1/15/2015,237,1636 87 | B02682,1/15/2015,1208,10391 88 | B02617,1/15/2015,1457,12539 89 | B02765,1/15/2015,270,1797 90 | B02764,1/15/2015,3840,31214 91 | B02598,1/15/2015,1068,9152 92 | B02617,1/16/2015,1445,12977 93 | B02765,1/16/2015,290,2082 94 | B02764,1/16/2015,3975,34822 95 | B02682,1/16/2015,1250,11280 96 | B02512,1/16/2015,234,1481 97 | B02598,1/16/2015,1079,9838 98 | B02598,1/17/2015,974,9546 99 | B02512,1/17/2015,201,1281 100 | B02682,1/17/2015,1137,11382 101 | B02765,1/17/2015,252,2160 102 | B02617,1/17/2015,1306,12676 103 | B02764,1/17/2015,3657,36318 104 | B02512,1/18/2015,177,1521 105 | B02598,1/18/2015,869,9443 106 | B02765,1/18/2015,248,2287 107 | B02764,1/18/2015,3290,35182 108 | B02682,1/18/2015,1056,11161 109 | B02617,1/18/2015,1223,12879 110 | B02682,1/19/2015,883,7028 111 | B02617,1/19/2015,992,7775 112 | B02765,1/19/2015,238,1568 113 | 
B02764,1/19/2015,2958,22750 114 | B02512,1/19/2015,168,1025 115 | B02598,1/19/2015,706,5609 116 | B02598,1/20/2015,944,7206 117 | B02682,1/20/2015,1151,8496 118 | B02512,1/20/2015,221,1310 119 | B02764,1/20/2015,3654,26137 120 | B02765,1/20/2015,272,1608 121 | B02617,1/20/2015,1350,10015 122 | B02764,1/21/2015,3718,27344 123 | B02512,1/21/2015,242,1519 124 | B02682,1/21/2015,1228,9472 125 | B02598,1/21/2015,1035,8041 126 | B02765,1/21/2015,296,1774 127 | B02617,1/21/2015,1429,10997 128 | B02617,1/22/2015,1471,12143 129 | B02764,1/22/2015,3889,30091 130 | B02512,1/22/2015,246,1551 131 | B02598,1/22/2015,1071,9080 132 | B02682,1/22/2015,1295,10699 133 | B02765,1/22/2015,295,2038 134 | B02598,1/23/2015,1093,9343 135 | B02512,1/23/2015,246,1670 136 | B02765,1/23/2015,299,2162 137 | B02764,1/23/2015,4040,33756 138 | B02617,1/23/2015,1482,13121 139 | B02682,1/23/2015,1330,11767 140 | B02598,1/24/2015,945,10040 141 | B02764,1/24/2015,3652,39187 142 | B02512,1/24/2015,211,1608 143 | B02617,1/24/2015,1367,14143 144 | B02682,1/24/2015,1223,13355 145 | B02765,1/24/2015,245,2376 146 | B02512,1/25/2015,183,1190 147 | B02764,1/25/2015,3300,28066 148 | B02765,1/25/2015,226,1755 149 | B02598,1/25/2015,829,7219 150 | B02682,1/25/2015,1046,9303 151 | B02617,1/25/2015,1203,10362 152 | B02617,1/26/2015,1150,7608 153 | B02598,1/26/2015,860,5919 154 | B02765,1/26/2015,230,1363 155 | B02764,1/26/2015,3012,19940 156 | B02682,1/26/2015,1084,7565 157 | B02512,1/26/2015,197,1000 158 | B02682,1/27/2015,600,4414 159 | B02765,1/27/2015,135,921 160 | B02617,1/27/2015,596,4325 161 | B02598,1/27/2015,434,2957 162 | B02512,1/27/2015,112,629 163 | B02764,1/27/2015,1619,11998 164 | B02764,1/28/2015,3692,28137 165 | B02682,1/28/2015,1235,10025 166 | B02765,1/28/2015,286,1913 167 | B02617,1/28/2015,1356,10862 168 | B02598,1/28/2015,1011,8071 169 | B02512,1/28/2015,235,1438 170 | B02617,1/29/2015,1474,12600 171 | B02764,1/29/2015,3959,31637 172 | B02682,1/29/2015,1316,11485 173 | B02765,1/29/2015,295,2086 174 | B02512,1/29/2015,250,1687 175 | B02598,1/29/2015,1082,9499 176 | B02512,1/30/2015,256,2016 177 | B02617,1/30/2015,1501,14793 178 | B02682,1/30/2015,1384,13852 179 | B02764,1/30/2015,4124,39110 180 | B02765,1/30/2015,322,2785 181 | B02598,1/30/2015,1106,11167 182 | B02765,1/31/2015,309,3282 183 | B02512,1/31/2015,225,1892 184 | B02617,1/31/2015,1394,15756 185 | B02682,1/31/2015,1321,15388 186 | B02764,1/31/2015,3947,44297 187 | B02598,1/31/2015,1027,11642 188 | B02598,2/1/2015,961,9499 189 | B02682,2/1/2015,1214,12436 190 | B02512,2/1/2015,193,1377 191 | B02765,2/1/2015,289,2672 192 | B02617,2/1/2015,1355,13458 193 | B02764,2/1/2015,3740,37468 194 | B02617,2/2/2015,1217,12216 195 | B02682,2/2/2015,1152,11981 196 | B02765,2/2/2015,275,2607 197 | B02598,2/2/2015,939,9511 198 | B02764,2/2/2015,3270,30761 199 | B02512,2/2/2015,227,1904 200 | B02765,2/3/2015,299,2410 201 | B02598,2/3/2015,991,9602 202 | B02512,2/3/2015,257,1915 203 | B02764,2/3/2015,3674,31641 204 | B02617,2/3/2015,1350,12665 205 | B02682,2/3/2015,1269,11955 206 | B02764,2/4/2015,3856,29994 207 | B02765,2/4/2015,309,2334 208 | B02512,2/4/2015,244,1639 209 | B02682,2/4/2015,1311,11309 210 | B02617,2/4/2015,1393,11959 211 | B02598,2/4/2015,1072,9600 212 | B02617,2/5/2015,1524,14499 213 | B02682,2/5/2015,1418,13782 214 | B02598,2/5/2015,1179,11609 215 | B02512,2/5/2015,264,2022 216 | B02765,2/5/2015,355,3011 217 | B02764,2/5/2015,4093,35990 218 | B02617,2/6/2015,1526,15417 219 | B02765,2/6/2015,385,3569 220 | B02598,2/6/2015,1181,11897 221 | 
B02512,2/6/2015,261,1989 222 | B02764,2/6/2015,4170,38693 223 | B02682,2/6/2015,1414,14375 224 | B02598,2/7/2015,1031,10512 225 | B02512,2/7/2015,211,1504 226 | B02617,2/7/2015,1383,13688 227 | B02682,2/7/2015,1300,13450 228 | B02764,2/7/2015,3849,38530 229 | B02765,2/7/2015,345,3473 230 | B02764,2/8/2015,3422,29692 231 | B02765,2/8/2015,313,2623 232 | B02598,2/8/2015,923,8129 233 | B02617,2/8/2015,1256,11004 234 | B02682,2/8/2015,1136,10356 235 | B02512,2/8/2015,176,1196 236 | B02617,2/9/2015,1312,10887 237 | B02682,2/9/2015,1241,10209 238 | B02598,2/9/2015,976,8135 239 | B02764,2/9/2015,3543,28266 240 | B02512,2/9/2015,228,1565 241 | B02765,2/9/2015,388,2894 242 | B02764,2/10/2015,3700,29124 243 | B02512,2/10/2015,233,1555 244 | B02617,2/10/2015,1364,11401 245 | B02765,2/10/2015,422,3432 246 | B02682,2/10/2015,1281,10536 247 | B02598,2/10/2015,1029,8718 248 | B02617,2/11/2015,1450,12749 249 | B02764,2/11/2015,3849,31889 250 | B02512,2/11/2015,255,1831 251 | B02598,2/11/2015,1115,10034 252 | B02765,2/11/2015,450,3778 253 | B02682,2/11/2015,1396,12189 254 | B02617,2/12/2015,1532,14263 255 | B02512,2/12/2015,269,2092 256 | B02682,2/12/2015,1468,13786 257 | B02765,2/12/2015,536,4609 258 | B02598,2/12/2015,1181,11640 259 | B02764,2/12/2015,4137,36844 260 | B02617,2/13/2015,1590,16996 261 | B02682,2/13/2015,1523,16088 262 | B02764,2/13/2015,4395,43561 263 | B02765,2/13/2015,599,5909 264 | B02512,2/13/2015,281,2408 265 | B02598,2/13/2015,1216,13062 266 | B02764,2/14/2015,4129,45858 267 | B02512,2/14/2015,236,2055 268 | B02598,2/14/2015,1111,12678 269 | B02765,2/14/2015,583,6307 270 | B02617,2/14/2015,1486,16999 271 | B02682,2/14/2015,1428,16448 272 | B02682,2/15/2015,1261,14517 273 | B02764,2/15/2015,3651,41209 274 | B02617,2/15/2015,1293,14662 275 | B02765,2/15/2015,521,5500 276 | B02512,2/15/2015,210,1996 277 | B02598,2/15/2015,1003,11517 278 | B02598,2/16/2015,934,9052 279 | B02512,2/16/2015,207,1576 280 | B02617,2/16/2015,1214,11824 281 | B02764,2/16/2015,3524,33448 282 | B02682,2/16/2015,1164,11323 283 | B02765,2/16/2015,508,4875 284 | B02764,2/17/2015,3826,32473 285 | B02512,2/17/2015,241,1797 286 | B02682,2/17/2015,1314,11887 287 | B02617,2/17/2015,1378,12524 288 | B02598,2/17/2015,1066,9463 289 | B02765,2/17/2015,578,4907 290 | B02598,2/18/2015,1078,9538 291 | B02682,2/18/2015,1314,11724 292 | B02617,2/18/2015,1394,12016 293 | B02765,2/18/2015,586,5059 294 | B02764,2/18/2015,3842,32317 295 | B02512,2/18/2015,228,1589 296 | B02598,2/19/2015,1127,11739 297 | B02512,2/19/2015,250,2120 298 | B02682,2/19/2015,1428,14591 299 | B02764,2/19/2015,4110,39110 300 | B02765,2/19/2015,663,6447 301 | B02617,2/19/2015,1452,14750 302 | B02764,2/20/2015,4384,44755 303 | B02617,2/20/2015,1574,16856 304 | B02598,2/20/2015,1186,12758 305 | B02682,2/20/2015,1497,16342 306 | B02765,2/20/2015,736,7824 307 | B02512,2/20/2015,272,2380 308 | B02598,2/21/2015,1044,12132 309 | B02682,2/21/2015,1374,16149 310 | B02765,2/21/2015,685,7658 311 | B02617,2/21/2015,1443,16098 312 | B02512,2/21/2015,238,2149 313 | B02764,2/21/2015,3981,44194 314 | B02512,2/22/2015,199,1312 315 | B02617,2/22/2015,1248,10696 316 | B02682,2/22/2015,1220,10970 317 | B02764,2/22/2015,3478,30157 318 | B02598,2/22/2015,909,8271 319 | B02765,2/22/2015,566,5034 320 | B02598,2/23/2015,966,8943 321 | B02617,2/23/2015,1332,11720 322 | B02764,2/23/2015,3734,31173 323 | B02682,2/23/2015,1262,11714 324 | B02765,2/23/2015,665,5823 325 | B02512,2/23/2015,238,1844 326 | B02764,2/24/2015,3965,34686 327 | B02512,2/24/2015,247,1869 328 | 
B02598,2/24/2015,1061,9954 329 | B02682,2/24/2015,1346,12497 330 | B02617,2/24/2015,1456,13719 331 | B02765,2/24/2015,698,6390 332 | B02512,2/25/2015,246,1647 333 | B02598,2/25/2015,1076,9405 334 | B02765,2/25/2015,706,6178 335 | B02682,2/25/2015,1395,12693 336 | B02617,2/25/2015,1473,12811 337 | B02764,2/25/2015,3934,31957 338 | B02598,2/26/2015,1134,10661 339 | B02617,2/26/2015,1539,14461 340 | B02682,2/26/2015,1465,13814 341 | B02512,2/26/2015,243,1797 342 | B02765,2/26/2015,745,6744 343 | B02764,2/26/2015,4101,36091 344 | B02765,2/27/2015,786,7563 345 | B02617,2/27/2015,1551,14677 346 | B02598,2/27/2015,1114,10755 347 | B02512,2/27/2015,272,2056 348 | B02764,2/27/2015,4253,38780 349 | B02682,2/27/2015,1510,14975 350 | B02598,2/28/2015,994,10319 351 | B02764,2/28/2015,3952,39812 352 | B02617,2/28/2015,1372,14022 353 | B02682,2/28/2015,1386,14472 354 | B02512,2/28/2015,230,1803 355 | B02765,2/28/2015,747,7753 356 | -------------------------------------------------------------------------------- /Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "sc" 12 | ] 13 | } 14 | ], 15 | "metadata": { 16 | "kernelspec": { 17 | "display_name": "Python 2", 18 | "language": "python", 19 | "name": "python2" 20 | } 21 | }, 22 | "nbformat": 4, 23 | "nbformat_minor": 2 24 | } 25 | -------------------------------------------------------------------------------- /Untitled1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [] 11 | } 12 | ], 13 | "metadata": { 14 | "kernelspec": { 15 | "display_name": "Python 2", 16 | "language": "python", 17 | "name": "python2" 18 | }, 19 | "language_info": { 20 | "codemirror_mode": { 21 | "name": "ipython", 22 | "version": 2 23 | }, 24 | "file_extension": ".py", 25 | "mimetype": "text/x-python", 26 | "name": "python", 27 | "nbconvert_exporter": "python", 28 | "pygments_lexer": "ipython2", 29 | "version": "2.7.13" 30 | } 31 | }, 32 | "nbformat": 4, 33 | "nbformat_minor": 2 34 | } 35 | -------------------------------------------------------------------------------- /abc.txt/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /abc.txt/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/abc.txt/.part-00000.crc -------------------------------------------------------------------------------- /abc.txt/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/abc.txt/_SUCCESS -------------------------------------------------------------------------------- /abc.txt/part-00000: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 2 4 | 3 5 | 4 6 | 5 7 | 6 8 | 7 9 | 8 10 | 9 11 | -------------------------------------------------------------------------------- /allstate_test.csv.zip: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/allstate_test.csv.zip -------------------------------------------------------------------------------- /allstate_train.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/allstate_train.csv.zip -------------------------------------------------------------------------------- /births_train.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/births_train.csv.gz -------------------------------------------------------------------------------- /births_transformed.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/births_transformed.csv.gz -------------------------------------------------------------------------------- /cass_code.py: -------------------------------------------------------------------------------- 1 | from cassandra.cluster import Cluster 2 | 3 | cluster = Cluster() 4 | session = cluster.connect() 5 | 6 | session.set_keyspace('office') 7 | 8 | session.execute('CREATE TABLE user (id int PRIMARY KEY, location text)') 9 | session.execute("INSERT INTO user (id, location) VALUES (%s, %s)",(11,'abc')) 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /data/2015-summary.csv.txt: -------------------------------------------------------------------------------- 1 | DEST_COUNTRY_NAME,ORIGIN_COUNTRY_NAME,count 2 | United States,Romania,15 3 | United States,Croatia,1 4 | United States,Ireland,344 5 | Egypt,United States,15 6 | United States,India,62 7 | United States,Singapore,1 8 | United States,Grenada,62 9 | Costa Rica,United States,588 10 | Senegal,United States,40 11 | Moldova,United States,1 12 | United States,Sint Maarten,325 13 | United States,Marshall Islands,39 14 | Guyana,United States,64 15 | Malta,United States,1 16 | Anguilla,United States,41 17 | Bolivia,United States,30 18 | United States,Paraguay,6 19 | Algeria,United States,4 20 | Turks and Caicos Islands,United States,230 21 | United States,Gibraltar,1 22 | Saint Vincent and the Grenadines,United States,1 23 | Italy,United States,382 24 | United States,Federated States of Micronesia,69 25 | United States,Russia,161 26 | Pakistan,United States,12 27 | United States,Netherlands,660 28 | Iceland,United States,181 29 | Marshall Islands,United States,42 30 | Luxembourg,United States,155 31 | Honduras,United States,362 32 | The Bahamas,United States,955 33 | United States,Senegal,42 34 | El Salvador,United States,561 35 | Samoa,United States,25 36 | United States,Angola,13 37 | Switzerland,United States,294 38 | United States,Anguilla,38 39 | Sint Maarten,United States,325 40 | Hong Kong,United States,332 41 | Trinidad and Tobago,United States,211 42 | Latvia,United States,19 43 | United States,Ecuador,300 44 | Suriname,United States,1 45 | Mexico,United States,7140 46 | United States,Cyprus,1 47 | Ecuador,United States,268 48 | United States,Portugal,134 49 | United States,Costa Rica,608 50 | United States,Guatemala,318 51 | United States,Suriname,34 52 | Colombia,United States,873 53 | United 
States,Cape Verde,14 54 | United States,Jamaica,712 55 | Norway,United States,121 56 | United States,Malaysia,3 57 | United States,Morocco,19 58 | Thailand,United States,3 59 | United States,Samoa,25 60 | Venezuela,United States,290 61 | United States,Palau,31 62 | United States,Venezuela,246 63 | Panama,United States,510 64 | Antigua and Barbuda,United States,126 65 | United States,Chile,185 66 | Morocco,United States,15 67 | United States,Finland,28 68 | Azerbaijan,United States,21 69 | United States,Greece,23 70 | United States,The Bahamas,986 71 | New Zealand,United States,111 72 | Liberia,United States,2 73 | United States,Hong Kong,414 74 | Hungary,United States,2 75 | United States,China,920 76 | United States,Vietnam,2 77 | Burkina Faso,United States,1 78 | Sweden,United States,118 79 | United States,Kuwait,28 80 | United States,Dominican Republic,1420 81 | United States,Egypt,12 82 | Israel,United States,134 83 | United States,United States,370002 84 | Ethiopia,United States,13 85 | United States,Luxembourg,134 86 | United States,Poland,33 87 | Martinique,United States,44 88 | United States,Saint Barthelemy,41 89 | Saint Barthelemy,United States,39 90 | Barbados,United States,154 91 | United States,Turkey,129 92 | Djibouti,United States,1 93 | United States,Azerbaijan,21 94 | United States,Estonia,1 95 | Germany,United States,1468 96 | United States,South Korea,827 97 | United States,El Salvador,508 98 | Ireland,United States,335 99 | United States,Hungary,3 100 | Zambia,United States,1 101 | Malaysia,United States,2 102 | United States,Ethiopia,12 103 | United States,Panama,465 104 | United States,Aruba,342 105 | United States,Thailand,4 106 | United States,Turks and Caicos Islands,236 107 | Croatia,United States,2 108 | United States,Pakistan,12 109 | Cyprus,United States,1 110 | United States,Honduras,407 111 | Fiji,United States,24 112 | Qatar,United States,108 113 | Saint Kitts and Nevis,United States,139 114 | Kuwait,United States,32 115 | Taiwan,United States,266 116 | Haiti,United States,226 117 | Canada,United States,8399 118 | Federated States of Micronesia,United States,69 119 | United States,Liberia,2 120 | Jamaica,United States,666 121 | United States,Malta,2 122 | Dominican Republic,United States,1353 123 | Japan,United States,1548 124 | United States,Lithuania,1 125 | Finland,United States,26 126 | United States,Guadeloupe,59 127 | United States,Ukraine,13 128 | United States,France,952 129 | United States,Norway,115 130 | Aruba,United States,346 131 | French Guiana,United States,5 132 | United States,Kiribati,35 133 | India,United States,61 134 | British Virgin Islands,United States,107 135 | Brazil,United States,853 136 | United States,Germany,1336 137 | United States,New Zealand,74 138 | French Polynesia,United States,43 139 | United Arab Emirates,United States,320 140 | Singapore,United States,3 141 | United States,Mexico,7187 142 | United States,Sweden,119 143 | Netherlands,United States,776 144 | United States,Martinique,43 145 | United States,United Arab Emirates,313 146 | United States,Bulgaria,1 147 | Denmark,United States,153 148 | China,United States,772 149 | United States,Nicaragua,201 150 | United States,Philippines,126 151 | United States,Georgia,1 152 | United States,Belgium,228 153 | Cayman Islands,United States,314 154 | Argentina,United States,180 155 | Peru,United States,279 156 | South Africa,United States,36 157 | United States,Iceland,202 158 | United States,Argentina,141 159 | Spain,United States,420 160 | Bermuda,United States,183 161 | 
United States,Nigeria,50 162 | United States,Austria,63 163 | United States,"Bonaire, Sint Eustatius, and Saba",59 164 | Kiribati,United States,26 165 | Saudi Arabia,United States,83 166 | Czech Republic,United States,13 167 | United States,Israel,127 168 | Belgium,United States,259 169 | United States,Saint Lucia,136 170 | United States,Bahrain,1 171 | United States,British Virgin Islands,80 172 | Curacao,United States,90 173 | Georgia,United States,2 174 | United States,Denmark,152 175 | United States,Guyana,63 176 | Philippines,United States,134 177 | Grenada,United States,53 178 | Cape Verde,United States,20 179 | Cote d'Ivoire,United States,1 180 | Ukraine,United States,14 181 | United States,Papua New Guinea,1 182 | Russia,United States,176 183 | United States,Saudi Arabia,70 184 | Guatemala,United States,397 185 | Saint Lucia,United States,123 186 | Paraguay,United States,60 187 | United States,Curacao,83 188 | Kosovo,United States,1 189 | United States,Taiwan,235 190 | Tunisia,United States,3 191 | United States,South Africa,40 192 | Niger,United States,2 193 | Turkey,United States,138 194 | United Kingdom,United States,2025 195 | Romania,United States,14 196 | United States,Greenland,4 197 | Papua New Guinea,United States,3 198 | United States,Spain,442 199 | Iraq,United States,1 200 | United States,Italy,438 201 | Cuba,United States,466 202 | United States,Switzerland,305 203 | Dominica,United States,20 204 | United States,Japan,1496 205 | Portugal,United States,127 206 | United States,Brazil,619 207 | Bahrain,United States,19 208 | United States,Peru,337 209 | Indonesia,United States,1 210 | United States,Belize,193 211 | United States,United Kingdom,1970 212 | Belize,United States,188 213 | United States,Ghana,20 214 | United States,Indonesia,2 215 | United States,Fiji,25 216 | United States,Canada,8483 217 | United States,Antigua and Barbuda,117 218 | United States,French Polynesia,40 219 | Nicaragua,United States,179 220 | United States,Latvia,15 221 | United States,Dominica,27 222 | United States,Czech Republic,12 223 | United States,Australia,258 224 | United States,Cook Islands,13 225 | Austria,United States,62 226 | Jordan,United States,44 227 | Palau,United States,30 228 | South Korea,United States,1048 229 | Angola,United States,15 230 | Ghana,United States,18 231 | New Caledonia,United States,1 232 | Guadeloupe,United States,56 233 | France,United States,935 234 | Poland,United States,32 235 | Nigeria,United States,59 236 | United States,Uruguay,13 237 | Greenland,United States,2 238 | United States,Bermuda,193 239 | Chile,United States,174 240 | United States,Cuba,478 241 | United States,Montenegro,1 242 | United States,Colombia,867 243 | United States,Barbados,130 244 | United States,Qatar,109 245 | Australia,United States,329 246 | United States,Cayman Islands,310 247 | United States,Jordan,44 248 | United States,Namibia,1 249 | United States,Trinidad and Tobago,217 250 | United States,Bolivia,13 251 | Cook Islands,United States,13 252 | Bulgaria,United States,3 253 | United States,Saint Kitts and Nevis,145 254 | Uruguay,United States,43 255 | United States,Haiti,225 256 | "Bonaire, Sint Eustatius, and Saba",United States,58 257 | Greece,United States,30 258 | -------------------------------------------------------------------------------- /data/graphx/followers.txt: -------------------------------------------------------------------------------- 1 | 2 1 2 | 4 1 3 | 1 2 4 | 6 3 5 | 7 3 6 | 7 6 7 | 6 7 8 | 3 7 9 | 
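followers.txt is the edge list that pairs with data/graphx/users.txt just below. A minimal loading sketch, assuming a SparkSession named spark and the graphframes package on the classpath (as used by GraphFrame Application.ipynb elsewhere in this repo); untested here:

from graphframes import GraphFrame

# Vertices: id,username,display name (the last row omits the display name).
vertices = spark.read.csv("data/graphx/users.txt").toDF("id", "name", "alias")

# Edges: whitespace-separated "src dst" pairs.
edges = (spark.read.text("data/graphx/followers.txt").rdd
         .map(lambda r: r.value.split())
         .toDF(["src", "dst"]))

g = GraphFrame(vertices, edges)
ranks = g.pageRank(resetProbability=0.15, maxIter=10)
ranks.vertices.select("id", "pagerank").show()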
-------------------------------------------------------------------------------- /data/graphx/users.txt: -------------------------------------------------------------------------------- 1 | 1,BarackObama,Barack Obama 2 | 2,ladygaga,Goddess of Love 3 | 3,jeresig,John Resig 4 | 4,justinbieber,Justin Bieber 5 | 6,matei_zaharia,Matei Zaharia 6 | 7,odersky,Martin Odersky 7 | 8,anonsys 8 | -------------------------------------------------------------------------------- /data/mllib/als/test.data: -------------------------------------------------------------------------------- 1 | 1,1,5.0 2 | 1,2,1.0 3 | 1,3,5.0 4 | 1,4,1.0 5 | 2,1,5.0 6 | 2,2,1.0 7 | 2,3,5.0 8 | 2,4,1.0 9 | 3,1,1.0 10 | 3,2,5.0 11 | 3,3,1.0 12 | 3,4,5.0 13 | 4,1,1.0 14 | 4,2,5.0 15 | 4,3,1.0 16 | 4,4,5.0 17 | -------------------------------------------------------------------------------- /data/mllib/kmeans_data.txt: -------------------------------------------------------------------------------- 1 | 0.0 0.0 0.0 2 | 0.1 0.1 0.1 3 | 0.2 0.2 0.2 4 | 9.0 9.0 9.0 5 | 9.1 9.1 9.1 6 | 9.2 9.2 9.2 7 | -------------------------------------------------------------------------------- /data/mllib/pagerank_data.txt: -------------------------------------------------------------------------------- 1 | 1 2 2 | 1 3 3 | 1 4 4 | 2 1 5 | 3 1 6 | 4 1 7 | -------------------------------------------------------------------------------- /data/mllib/pic_data.txt: -------------------------------------------------------------------------------- 1 | 0 1 1.0 2 | 0 2 1.0 3 | 0 3 1.0 4 | 1 2 1.0 5 | 1 3 1.0 6 | 2 3 1.0 7 | 3 4 0.1 8 | 4 5 1.0 9 | 4 15 1.0 10 | 5 6 1.0 11 | 6 7 1.0 12 | 7 8 1.0 13 | 8 9 1.0 14 | 9 10 1.0 15 | 10 11 1.0 16 | 11 12 1.0 17 | 12 13 1.0 18 | 13 14 1.0 19 | 14 15 1.0 20 | -------------------------------------------------------------------------------- /data/mllib/ridge-data/lpsa.data: -------------------------------------------------------------------------------- 1 | -0.4307829,-1.63735562648104 -2.00621178480549 -1.86242597251066 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 2 | -0.1625189,-1.98898046126935 -0.722008756122123 -0.787896192088153 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 3 | -0.1625189,-1.57881887548545 -2.1887840293994 1.36116336875686 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.155348103855541 4 | -0.1625189,-2.16691708463163 -0.807993896938655 -0.787896192088153 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 5 | 0.3715636,-0.507874475300631 -0.458834049396776 -0.250631301876899 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 6 | 0.7654678,-2.03612849966376 -0.933954647105133 -1.86242597251066 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 7 | 0.8544153,-0.557312518810673 -0.208756571683607 -0.787896192088153 0.990146852537193 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 8 | 1.2669476,-0.929360463147704 -0.0578991819441687 0.152317365781542 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 9 | 1.2669476,-2.28833047634983 -0.0706369432557794 -0.116315079324086 0.80409888772376 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 10 | 1.2669476,0.223498042876113 -1.41471935455355 -0.116315079324086 -1.02470580167082 -0.522940888712441 -0.29928234305568 
0.342627053981254 0.199211097885341 11 | 1.3480731,0.107785900236813 -1.47221551299731 0.420949810887169 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.687186906466865 12 | 1.446919,0.162180092313795 -1.32557369901905 0.286633588334355 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 13 | 1.4701758,-1.49795329918548 -0.263601072284232 0.823898478545609 0.788388310173035 -0.522940888712441 -0.29928234305568 0.342627053981254 0.199211097885341 14 | 1.4929041,0.796247055396743 0.0476559407005752 0.286633588334355 -1.02470580167082 -0.522940888712441 0.394013435896129 -1.04215728919298 -0.864466507337306 15 | 1.5581446,-1.62233848461465 -0.843294091975396 -3.07127197548598 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 16 | 1.5993876,-0.990720665490831 0.458513517212311 0.823898478545609 1.07379746308195 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 17 | 1.6389967,-0.171901281967138 -0.489197399065355 -0.65357996953534 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 18 | 1.6956156,-1.60758252338831 -0.590700340358265 -0.65357996953534 -0.619561070667254 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 19 | 1.7137979,0.366273918511144 -0.414014962912583 -0.116315079324086 0.232904453212813 -0.522940888712441 0.971228997418125 0.342627053981254 1.26288870310799 20 | 1.8000583,-0.710307384579833 0.211731938156277 0.152317365781542 -1.02470580167082 -0.522940888712441 -0.442797990776478 0.342627053981254 1.61744790484887 21 | 1.8484548,-0.262791728113881 -1.16708345615721 0.420949810887169 0.0846342590816532 -0.522940888712441 0.163172393491611 0.342627053981254 1.97200710658975 22 | 1.8946169,0.899043117369237 -0.590700340358265 0.152317365781542 -1.02470580167082 -0.522940888712441 1.28643254437683 -1.04215728919298 -0.864466507337306 23 | 1.9242487,-0.903451690500615 1.07659722048274 0.152317365781542 1.28380453408541 -0.522940888712441 -0.442797990776478 -1.04215728919298 -0.864466507337306 24 | 2.008214,-0.0633337899773081 -1.38088970920094 0.958214701098423 0.80409888772376 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 25 | 2.0476928,-1.15393789990757 -0.961853075398404 -0.116315079324086 -1.02470580167082 -0.522940888712441 -0.442797990776478 -1.04215728919298 -0.864466507337306 26 | 2.1575593,0.0620203721138446 0.0657973885499142 1.22684714620405 -0.468824786336838 -0.522940888712441 1.31421001659859 1.72741139715549 -0.332627704725983 27 | 2.1916535,-0.75731027755674 -2.92717970468456 0.018001143228728 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983 28 | 2.2137539,1.11226993252773 1.06484916245061 0.555266033439982 0.877691038550889 1.89254797819741 1.43890404648442 0.342627053981254 0.376490698755783 29 | 2.2772673,-0.468768642850639 -1.43754788774533 -1.05652863719378 0.576050411655607 -0.522940888712441 0.0120483832567209 0.342627053981254 -0.687186906466865 30 | 2.2975726,-0.618884859896728 -1.1366360750781 -0.519263746982526 -1.02470580167082 -0.522940888712441 -0.863171185425945 3.11219574032972 1.97200710658975 31 | 2.3272777,-0.651431999123483 0.55329161145762 -0.250631301876899 1.11210019001038 -0.522940888712441 -0.179808625688859 -1.04215728919298 -0.864466507337306 32 | 2.5217206,0.115499102435224 -0.512233676577595 0.286633588334355 1.13650173283446 -0.522940888712441 -0.179808625688859 
0.342627053981254 -0.155348103855541 33 | 2.5533438,0.266341329949937 -0.551137885443386 -0.384947524429713 0.354857790686005 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983 34 | 2.5687881,1.16902610257751 0.855491905752846 2.03274448152093 1.22628985326088 1.89254797819741 2.02833774827712 3.11219574032972 2.68112551007152 35 | 2.6567569,-0.218972367124187 0.851192298581141 0.555266033439982 -1.02470580167082 -0.522940888712441 -0.863171185425945 0.342627053981254 0.908329501367106 36 | 2.677591,0.263121415733908 1.4142681068416 0.018001143228728 1.35980653053822 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 37 | 2.7180005,-0.0704736333296423 1.52000996595417 0.286633588334355 1.39364261119802 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.332627704725983 38 | 2.7942279,-0.751957286017338 0.316843561689933 -1.99674219506348 0.911736065044475 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 39 | 2.8063861,-0.685277652430997 1.28214038482516 0.823898478545609 0.232904453212813 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.155348103855541 40 | 2.8124102,-0.244991501432929 0.51882005949686 -0.384947524429713 0.823246560137838 -0.522940888712441 -0.863171185425945 0.342627053981254 0.553770299626224 41 | 2.8419982,-0.75731027755674 2.09041984898851 1.22684714620405 1.53428167116843 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 42 | 2.8535925,1.20962937075363 -0.242882661178889 1.09253092365124 -1.02470580167082 -0.522940888712441 1.24263233939889 3.11219574032972 2.50384590920108 43 | 2.9204698,0.570886990493502 0.58243883987948 0.555266033439982 1.16006887775962 -0.522940888712441 1.07357183940747 0.342627053981254 1.61744790484887 44 | 2.9626924,0.719758684343624 0.984970304132004 1.09253092365124 1.52137230773457 -0.522940888712441 -0.179808625688859 0.342627053981254 -0.509907305596424 45 | 2.9626924,-1.52406140158064 1.81975700990333 0.689582255992796 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 46 | 2.9729753,-0.132431544081234 2.68769877553723 1.09253092365124 1.53428167116843 -0.522940888712441 -0.442797990776478 0.342627053981254 -0.687186906466865 47 | 3.0130809,0.436161292804989 -0.0834447307428255 -0.519263746982526 -1.02470580167082 1.89254797819741 1.07357183940747 0.342627053981254 1.26288870310799 48 | 3.0373539,-0.161195191984091 -0.671900359186746 1.7641120364153 1.13650173283446 -0.522940888712441 -0.863171185425945 0.342627053981254 0.0219314970149 49 | 3.2752562,1.39927182372944 0.513852869452676 0.689582255992796 -1.02470580167082 1.89254797819741 1.49394503405693 0.342627053981254 -0.155348103855541 50 | 3.3375474,1.51967002306341 -0.852203755696565 0.555266033439982 -0.104527297798983 1.89254797819741 1.85927724828569 0.342627053981254 0.908329501367106 51 | 3.3928291,0.560725834706224 1.87867703391426 1.09253092365124 1.39364261119802 -0.522940888712441 0.486423065822545 0.342627053981254 1.26288870310799 52 | 3.4355988,1.00765532502814 1.69426310090641 1.89842825896812 1.53428167116843 -0.522940888712441 -0.863171185425945 0.342627053981254 -0.509907305596424 53 | 3.4578927,1.10152996153577 -0.10927271844907 0.689582255992796 -1.02470580167082 1.89254797819741 1.97630171771485 0.342627053981254 1.61744790484887 54 | 3.5160131,0.100001934217311 -1.30380956369388 0.286633588334355 0.316555063757567 -0.522940888712441 0.28786643052924 0.342627053981254 0.553770299626224 55 | 
3.5307626,0.987291634724086 -0.36279314978779 -0.922212414640967 0.232904453212813 -0.522940888712441 1.79270085261407 0.342627053981254 1.26288870310799 56 | 3.5652984,1.07158528137575 0.606453149641961 1.7641120364153 -0.432854616994416 1.89254797819741 0.528504607720369 0.342627053981254 0.199211097885341 57 | 3.5876769,0.180156323255198 0.188987436375017 -0.519263746982526 1.09956763075594 -0.522940888712441 0.708239632330506 0.342627053981254 0.199211097885341 58 | 3.6309855,1.65687973755377 -0.256675483533719 0.018001143228728 -1.02470580167082 1.89254797819741 1.79270085261407 0.342627053981254 1.26288870310799 59 | 3.6800909,0.5720085322365 0.239854450210939 -0.787896192088153 1.0605418233138 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 60 | 3.7123518,0.323806133438225 -0.606717660886078 -0.250631301876899 -1.02470580167082 1.89254797819741 0.342907418101747 0.342627053981254 0.199211097885341 61 | 3.9843437,1.23668206715898 2.54220539083611 0.152317365781542 -1.02470580167082 1.89254797819741 1.89037692416194 0.342627053981254 1.26288870310799 62 | 3.993603,0.180156323255198 0.154448192444669 1.62979581386249 0.576050411655607 1.89254797819741 0.708239632330506 0.342627053981254 1.79472750571931 63 | 4.029806,1.60906277046565 1.10378605019827 0.555266033439982 -1.02470580167082 -0.522940888712441 -0.863171185425945 -1.04215728919298 -0.864466507337306 64 | 4.1295508,1.0036214996026 0.113496885050331 -0.384947524429713 0.860016436332751 1.89254797819741 -0.863171185425945 0.342627053981254 -0.332627704725983 65 | 4.3851468,1.25591974271076 0.577607033774471 0.555266033439982 -1.02470580167082 1.89254797819741 1.07357183940747 0.342627053981254 1.26288870310799 66 | 4.6844434,2.09650591351268 0.625488598331018 -2.66832330782754 -1.02470580167082 1.89254797819741 1.67954222367555 0.342627053981254 0.553770299626224 67 | 5.477509,1.30028987435881 0.338383613253713 0.555266033439982 1.00481276295349 1.89254797819741 1.24263233939889 0.342627053981254 1.97200710658975 68 | -------------------------------------------------------------------------------- /data/mllib/sample_fpgrowth.txt: -------------------------------------------------------------------------------- 1 | r z h k p 2 | z y x w v u t s 3 | s x o n r 4 | x z y m t s q e 5 | z 6 | x z y r q t p 7 | -------------------------------------------------------------------------------- /data/mllib/sample_isotonic_regression_libsvm_data.txt: -------------------------------------------------------------------------------- 1 | 0.24579296 1:0.01 2 | 0.28505864 1:0.02 3 | 0.31208567 1:0.03 4 | 0.35900051 1:0.04 5 | 0.35747068 1:0.05 6 | 0.16675166 1:0.06 7 | 0.17491076 1:0.07 8 | 0.04181540 1:0.08 9 | 0.04793473 1:0.09 10 | 0.03926568 1:0.10 11 | 0.12952575 1:0.11 12 | 0.00000000 1:0.12 13 | 0.01376849 1:0.13 14 | 0.13105558 1:0.14 15 | 0.08873024 1:0.15 16 | 0.12595614 1:0.16 17 | 0.15247323 1:0.17 18 | 0.25956145 1:0.18 19 | 0.20040796 1:0.19 20 | 0.19581846 1:0.20 21 | 0.15757267 1:0.21 22 | 0.13717491 1:0.22 23 | 0.19020908 1:0.23 24 | 0.19581846 1:0.24 25 | 0.20091790 1:0.25 26 | 0.16879143 1:0.26 27 | 0.18510964 1:0.27 28 | 0.20040796 1:0.28 29 | 0.29576747 1:0.29 30 | 0.43396226 1:0.30 31 | 0.53391127 1:0.31 32 | 0.52116267 1:0.32 33 | 0.48546660 1:0.33 34 | 0.49209587 1:0.34 35 | 0.54156043 1:0.35 36 | 0.59765426 1:0.36 37 | 0.56144824 1:0.37 38 | 0.58592555 1:0.38 39 | 0.52983172 1:0.39 40 | 0.50178480 1:0.40 41 | 0.52626211 1:0.41 42 | 0.58286588 1:0.42 43 | 0.64660887 1:0.43 44 | 0.68077511 1:0.44 
45 | 0.74298827 1:0.45 46 | 0.64864865 1:0.46 47 | 0.67261601 1:0.47 48 | 0.65782764 1:0.48 49 | 0.69811321 1:0.49 50 | 0.63029067 1:0.50 51 | 0.61601224 1:0.51 52 | 0.63233044 1:0.52 53 | 0.65323814 1:0.53 54 | 0.65323814 1:0.54 55 | 0.67363590 1:0.55 56 | 0.67006629 1:0.56 57 | 0.51555329 1:0.57 58 | 0.50892402 1:0.58 59 | 0.33299337 1:0.59 60 | 0.36206017 1:0.60 61 | 0.43090260 1:0.61 62 | 0.45996940 1:0.62 63 | 0.56348802 1:0.63 64 | 0.54920959 1:0.64 65 | 0.48393677 1:0.65 66 | 0.48495665 1:0.66 67 | 0.46965834 1:0.67 68 | 0.45181030 1:0.68 69 | 0.45843957 1:0.69 70 | 0.47118817 1:0.70 71 | 0.51555329 1:0.71 72 | 0.58031617 1:0.72 73 | 0.55481897 1:0.73 74 | 0.56297807 1:0.74 75 | 0.56603774 1:0.75 76 | 0.57929628 1:0.76 77 | 0.64762876 1:0.77 78 | 0.66241713 1:0.78 79 | 0.69301377 1:0.79 80 | 0.65119837 1:0.80 81 | 0.68332483 1:0.81 82 | 0.66598674 1:0.82 83 | 0.73890872 1:0.83 84 | 0.73992861 1:0.84 85 | 0.84242733 1:0.85 86 | 0.91330954 1:0.86 87 | 0.88016318 1:0.87 88 | 0.90719021 1:0.88 89 | 0.93115757 1:0.89 90 | 0.93115757 1:0.90 91 | 0.91942886 1:0.91 92 | 0.92911780 1:0.92 93 | 0.95665477 1:0.93 94 | 0.95002550 1:0.94 95 | 0.96940337 1:0.95 96 | 1.00000000 1:0.96 97 | 0.89801122 1:0.97 98 | 0.90311066 1:0.98 99 | 0.90362060 1:0.99 100 | 0.83477817 1:1.0 -------------------------------------------------------------------------------- /data/mllib/sample_kmeans_data.txt: -------------------------------------------------------------------------------- 1 | 0 1:0.0 2:0.0 3:0.0 2 | 1 1:0.1 2:0.1 3:0.1 3 | 2 1:0.2 2:0.2 3:0.2 4 | 3 1:9.0 2:9.0 3:9.0 5 | 4 1:9.1 2:9.1 3:9.1 6 | 5 1:9.2 2:9.2 3:9.2 7 | -------------------------------------------------------------------------------- /data/mllib/sample_lda_data.txt: -------------------------------------------------------------------------------- 1 | 1 2 6 0 2 3 1 1 0 0 3 2 | 1 3 0 1 3 0 0 2 0 0 1 3 | 1 4 1 0 0 4 9 0 1 2 0 4 | 2 1 0 3 0 0 5 0 2 3 9 5 | 3 1 1 9 3 0 2 0 0 1 3 6 | 4 2 0 3 4 5 1 1 1 4 0 7 | 2 1 0 3 0 0 5 0 2 2 9 8 | 1 1 1 9 2 1 2 0 0 1 3 9 | 4 4 0 3 4 2 1 3 0 0 0 10 | 2 8 2 0 3 0 2 0 2 7 2 11 | 1 1 1 9 0 2 2 0 0 3 3 12 | 4 1 0 0 4 5 1 3 0 1 0 13 | -------------------------------------------------------------------------------- /data/mllib/sample_lda_libsvm_data.txt: -------------------------------------------------------------------------------- 1 | 0 1:1 2:2 3:6 4:0 5:2 6:3 7:1 8:1 9:0 10:0 11:3 2 | 1 1:1 2:3 3:0 4:1 5:3 6:0 7:0 8:2 9:0 10:0 11:1 3 | 2 1:1 2:4 3:1 4:0 5:0 6:4 7:9 8:0 9:1 10:2 11:0 4 | 3 1:2 2:1 3:0 4:3 5:0 6:0 7:5 8:0 9:2 10:3 11:9 5 | 4 1:3 2:1 3:1 4:9 5:3 6:0 7:2 8:0 9:0 10:1 11:3 6 | 5 1:4 2:2 3:0 4:3 5:4 6:5 7:1 8:1 9:1 10:4 11:0 7 | 6 1:2 2:1 3:0 4:3 5:0 6:0 7:5 8:0 9:2 10:2 11:9 8 | 7 1:1 2:1 3:1 4:9 5:2 6:1 7:2 8:0 9:0 10:1 11:3 9 | 8 1:4 2:4 3:0 4:3 5:4 6:2 7:1 8:3 9:0 10:0 11:0 10 | 9 1:2 2:8 3:2 4:0 5:3 6:0 7:2 8:0 9:2 10:7 11:2 11 | 10 1:1 2:1 3:1 4:9 5:0 6:2 7:2 8:0 9:0 10:3 11:3 12 | 11 1:4 2:1 3:0 4:0 5:4 6:5 7:1 8:3 9:0 10:1 11:0 13 | -------------------------------------------------------------------------------- /data/mllib/sample_multiclass_classification_data.txt: -------------------------------------------------------------------------------- 1 | 1 1:-0.222222 2:0.5 3:-0.762712 4:-0.833333 2 | 1 1:-0.555556 2:0.25 3:-0.864407 4:-0.916667 3 | 1 1:-0.722222 2:-0.166667 3:-0.864407 4:-0.833333 4 | 1 1:-0.722222 2:0.166667 3:-0.694915 4:-0.916667 5 | 0 1:0.166667 2:-0.416667 3:0.457627 4:0.5 6 | 1 1:-0.833333 3:-0.864407 4:-0.916667 7 | 2 1:-1.32455e-07 2:-0.166667 3:0.220339 4:0.0833333 8 | 
2 1:-1.32455e-07 2:-0.333333 3:0.0169491 4:-4.03573e-08 9 | 1 1:-0.5 2:0.75 3:-0.830508 4:-1 10 | 0 1:0.611111 3:0.694915 4:0.416667 11 | 0 1:0.222222 2:-0.166667 3:0.423729 4:0.583333 12 | 1 1:-0.722222 2:-0.166667 3:-0.864407 4:-1 13 | 1 1:-0.5 2:0.166667 3:-0.864407 4:-0.916667 14 | 2 1:-0.222222 2:-0.333333 3:0.0508474 4:-4.03573e-08 15 | 2 1:-0.0555556 2:-0.833333 3:0.0169491 4:-0.25 16 | 2 1:-0.166667 2:-0.416667 3:-0.0169491 4:-0.0833333 17 | 1 1:-0.944444 3:-0.898305 4:-0.916667 18 | 2 1:-0.277778 2:-0.583333 3:-0.0169491 4:-0.166667 19 | 0 1:0.111111 2:-0.333333 3:0.38983 4:0.166667 20 | 2 1:-0.222222 2:-0.166667 3:0.0847457 4:-0.0833333 21 | 0 1:0.166667 2:-0.333333 3:0.559322 4:0.666667 22 | 1 1:-0.611111 2:0.0833333 3:-0.864407 4:-0.916667 23 | 2 1:-0.333333 2:-0.583333 3:0.0169491 4:-4.03573e-08 24 | 0 1:0.555555 2:-0.166667 3:0.661017 4:0.666667 25 | 2 1:0.166667 3:0.186441 4:0.166667 26 | 2 1:0.111111 2:-0.75 3:0.152542 4:-4.03573e-08 27 | 2 1:0.166667 2:-0.25 3:0.118644 4:-4.03573e-08 28 | 0 1:-0.0555556 2:-0.833333 3:0.355932 4:0.166667 29 | 0 1:-0.277778 2:-0.333333 3:0.322034 4:0.583333 30 | 2 1:-0.222222 2:-0.5 3:-0.152542 4:-0.25 31 | 2 1:-0.111111 3:0.288136 4:0.416667 32 | 2 1:-0.0555556 2:-0.25 3:0.186441 4:0.166667 33 | 2 1:0.333333 2:-0.166667 3:0.355932 4:0.333333 34 | 1 1:-0.611111 2:0.25 3:-0.898305 4:-0.833333 35 | 0 1:0.166667 2:-0.333333 3:0.559322 4:0.75 36 | 0 1:0.111111 2:-0.25 3:0.559322 4:0.416667 37 | 0 1:0.833333 2:-0.166667 3:0.898305 4:0.666667 38 | 2 1:-0.277778 2:-0.166667 3:0.186441 4:0.166667 39 | 0 1:-0.666667 2:-0.583333 3:0.186441 4:0.333333 40 | 1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1 41 | 1 1:-0.166667 2:0.666667 3:-0.932203 4:-0.916667 42 | 0 1:0.0555554 2:-0.333333 3:0.288136 4:0.416667 43 | 1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1 44 | 1 1:-0.833333 2:0.166667 3:-0.864407 4:-0.833333 45 | 0 1:0.0555554 2:0.166667 3:0.491525 4:0.833333 46 | 0 1:0.722222 2:-0.333333 3:0.728813 4:0.5 47 | 2 1:-0.166667 2:-0.416667 3:0.0508474 4:-0.25 48 | 2 1:0.5 3:0.254237 4:0.0833333 49 | 0 1:0.111111 2:-0.583333 3:0.355932 4:0.5 50 | 1 1:-0.944444 2:-0.166667 3:-0.898305 4:-0.916667 51 | 2 1:0.277778 2:-0.25 3:0.220339 4:-4.03573e-08 52 | 0 1:0.666667 2:-0.25 3:0.79661 4:0.416667 53 | 0 1:0.111111 2:0.0833333 3:0.694915 4:1 54 | 0 1:0.444444 3:0.59322 4:0.833333 55 | 2 1:-0.0555556 2:0.166667 3:0.186441 4:0.25 56 | 1 1:-0.833333 2:0.333333 3:-1 4:-0.916667 57 | 1 1:-0.555556 2:0.416667 3:-0.830508 4:-0.75 58 | 2 1:-0.333333 2:-0.5 3:0.152542 4:-0.0833333 59 | 1 1:-1 2:-0.166667 3:-0.966102 4:-1 60 | 1 1:-0.333333 2:0.25 3:-0.898305 4:-0.916667 61 | 2 1:0.388889 2:-0.333333 3:0.288136 4:0.0833333 62 | 2 1:0.277778 2:-0.166667 3:0.152542 4:0.0833333 63 | 0 1:0.333333 2:0.0833333 3:0.59322 4:0.666667 64 | 1 1:-0.777778 3:-0.79661 4:-0.916667 65 | 1 1:-0.444444 2:0.416667 3:-0.830508 4:-0.916667 66 | 0 1:0.222222 2:-0.166667 3:0.627119 4:0.75 67 | 1 1:-0.555556 2:0.5 3:-0.79661 4:-0.916667 68 | 1 1:-0.555556 2:0.5 3:-0.694915 4:-0.75 69 | 2 1:-1.32455e-07 2:-0.25 3:0.254237 4:0.0833333 70 | 1 1:-0.5 2:0.25 3:-0.830508 4:-0.916667 71 | 0 1:0.166667 3:0.457627 4:0.833333 72 | 2 1:0.444444 2:-0.0833334 3:0.322034 4:0.166667 73 | 0 1:0.111111 2:0.166667 3:0.559322 4:0.916667 74 | 1 1:-0.611111 2:0.25 3:-0.79661 4:-0.583333 75 | 0 1:0.388889 3:0.661017 4:0.833333 76 | 1 1:-0.722222 2:0.166667 3:-0.79661 4:-0.916667 77 | 1 1:-0.722222 2:-0.0833334 3:-0.79661 4:-0.916667 78 | 1 1:-0.555556 2:0.166667 3:-0.830508 4:-0.916667 79 | 2 1:-0.666667 2:-0.666667 
3:-0.220339 4:-0.25 80 | 2 1:-0.611111 2:-0.75 3:-0.220339 4:-0.25 81 | 2 1:0.0555554 2:-0.833333 3:0.186441 4:0.166667 82 | 0 1:-0.166667 2:-0.416667 3:0.38983 4:0.5 83 | 0 1:0.611111 2:0.333333 3:0.728813 4:1 84 | 2 1:0.0555554 2:-0.25 3:0.118644 4:-4.03573e-08 85 | 1 1:-0.666667 2:-0.166667 3:-0.864407 4:-0.916667 86 | 1 1:-0.833333 2:-0.0833334 3:-0.830508 4:-0.916667 87 | 0 1:0.611111 2:-0.166667 3:0.627119 4:0.25 88 | 0 1:0.888889 2:0.5 3:0.932203 4:0.75 89 | 2 1:0.222222 2:-0.333333 3:0.220339 4:0.166667 90 | 1 1:-0.555556 2:0.25 3:-0.864407 4:-0.833333 91 | 0 1:-1.32455e-07 2:-0.166667 3:0.322034 4:0.416667 92 | 0 1:-1.32455e-07 2:-0.5 3:0.559322 4:0.0833333 93 | 1 1:-0.611111 3:-0.932203 4:-0.916667 94 | 1 1:-0.333333 2:0.833333 3:-0.864407 4:-0.916667 95 | 0 1:-0.166667 2:-0.333333 3:0.38983 4:0.916667 96 | 2 1:-0.333333 2:-0.666667 3:-0.0847458 4:-0.25 97 | 2 1:-0.0555556 2:-0.416667 3:0.38983 4:0.25 98 | 1 1:-0.388889 2:0.416667 3:-0.830508 4:-0.916667 99 | 0 1:0.444444 2:-0.0833334 3:0.38983 4:0.833333 100 | 1 1:-0.611111 2:0.333333 3:-0.864407 4:-0.916667 101 | 0 1:0.111111 2:-0.416667 3:0.322034 4:0.416667 102 | 0 1:0.166667 2:-0.0833334 3:0.525424 4:0.416667 103 | 2 1:0.333333 2:-0.0833334 3:0.152542 4:0.0833333 104 | 0 1:-0.0555556 2:-0.166667 3:0.288136 4:0.416667 105 | 0 1:-0.166667 2:-0.416667 3:0.38983 4:0.5 106 | 1 1:-0.611111 2:0.166667 3:-0.830508 4:-0.916667 107 | 0 1:0.888889 2:-0.166667 3:0.728813 4:0.833333 108 | 2 1:-0.277778 2:-0.25 3:-0.118644 4:-4.03573e-08 109 | 2 1:-0.222222 2:-0.333333 3:0.186441 4:-4.03573e-08 110 | 0 1:0.333333 2:-0.583333 3:0.627119 4:0.416667 111 | 0 1:0.444444 2:-0.0833334 3:0.491525 4:0.666667 112 | 2 1:-0.222222 2:-0.25 3:0.0847457 4:-4.03573e-08 113 | 1 1:-0.611111 2:0.166667 3:-0.79661 4:-0.75 114 | 2 1:-0.277778 2:-0.166667 3:0.0508474 4:-4.03573e-08 115 | 0 1:1 2:0.5 3:0.830508 4:0.583333 116 | 2 1:-0.333333 2:-0.666667 3:-0.0508475 4:-0.166667 117 | 2 1:-0.277778 2:-0.416667 3:0.0847457 4:-4.03573e-08 118 | 0 1:0.888889 2:-0.333333 3:0.932203 4:0.583333 119 | 2 1:-0.111111 2:-0.166667 3:0.0847457 4:0.166667 120 | 2 1:0.111111 2:-0.583333 3:0.322034 4:0.166667 121 | 0 1:0.333333 2:0.0833333 3:0.59322 4:1 122 | 0 1:0.222222 2:-0.166667 3:0.525424 4:0.416667 123 | 1 1:-0.555556 2:0.5 3:-0.830508 4:-0.833333 124 | 0 1:-0.111111 2:-0.166667 3:0.38983 4:0.416667 125 | 0 1:0.888889 2:-0.5 3:1 4:0.833333 126 | 1 1:-0.388889 2:0.583333 3:-0.898305 4:-0.75 127 | 2 1:0.111111 2:0.0833333 3:0.254237 4:0.25 128 | 0 1:0.333333 2:-0.166667 3:0.423729 4:0.833333 129 | 1 1:-0.388889 2:0.166667 3:-0.762712 4:-0.916667 130 | 0 1:0.333333 2:-0.0833334 3:0.559322 4:0.916667 131 | 2 1:-0.333333 2:-0.75 3:0.0169491 4:-4.03573e-08 132 | 1 1:-0.222222 2:1 3:-0.830508 4:-0.75 133 | 1 1:-0.388889 2:0.583333 3:-0.762712 4:-0.75 134 | 2 1:-0.611111 2:-1 3:-0.152542 4:-0.25 135 | 2 1:-1.32455e-07 2:-0.333333 3:0.254237 4:-0.0833333 136 | 2 1:-0.5 2:-0.416667 3:-0.0169491 4:0.0833333 137 | 1 1:-0.888889 2:-0.75 3:-0.898305 4:-0.833333 138 | 1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1 139 | 2 1:-0.555556 2:-0.583333 3:-0.322034 4:-0.166667 140 | 2 1:-0.166667 2:-0.5 3:0.0169491 4:-0.0833333 141 | 1 1:-0.555556 2:0.0833333 3:-0.762712 4:-0.666667 142 | 1 1:-0.777778 3:-0.898305 4:-0.916667 143 | 0 1:0.388889 2:-0.166667 3:0.525424 4:0.666667 144 | 0 1:0.222222 3:0.38983 4:0.583333 145 | 2 1:0.333333 2:-0.0833334 3:0.254237 4:0.166667 146 | 2 1:-0.388889 2:-0.166667 3:0.186441 4:0.166667 147 | 0 1:-0.222222 2:-0.583333 3:0.355932 4:0.583333 148 | 1 1:-0.611111 
2:-0.166667 3:-0.79661 4:-0.916667 149 | 1 1:-0.944444 2:-0.25 3:-0.864407 4:-0.916667 150 | 1 1:-0.388889 2:0.166667 3:-0.830508 4:-0.75 151 | -------------------------------------------------------------------------------- /data/mllib/streaming_kmeans_data_test.txt: -------------------------------------------------------------------------------- 1 | (1.0), [1.7, 0.4, 0.9] 2 | (2.0), [2.2, 1.8, 0.0] 3 | -------------------------------------------------------------------------------- /data/sales-data.csv: -------------------------------------------------------------------------------- 1 | Month,Sales 2 | 1-01,266 3 | 1-02,145.9 4 | 1-03,183.1 5 | 1-04,119.3 6 | 1-05,180.3 7 | 1-06,168.5 8 | 1-07,231.8 9 | 1-08,224.5 10 | 1-09,192.8 11 | 1-10,122.9 12 | 1-11,336.5 13 | 1-12,185.9 14 | 2-01,194.3 15 | 2-02,149.5 16 | 2-03,210.1 17 | 2-04,273.3 18 | 2-05,191.4 19 | 2-06,287 20 | 2-07,226 21 | 2-08,303.6 22 | 2-09,289.9 23 | 2-10,421.6 24 | 2-11,264.5 25 | 2-12,342.3 26 | 3-01,339.7 27 | 3-02,440.4 28 | 3-03,315.9 29 | 3-04,439.3 30 | 3-05,401.3 31 | 3-06,437.4 32 | 3-07,575.5 33 | 3-08,407.6 34 | 3-09,682 35 | 3-10,475.3 36 | 3-11,581.3 37 | 3-12,646.9 38 | -------------------------------------------------------------------------------- /data/sales-funnel.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/data/sales-funnel.xlsx -------------------------------------------------------------------------------- /data/sales-of-shampoo-over-a-three-ye.csv: -------------------------------------------------------------------------------- 1 | "Month","Sales of shampoo over a three year period" 2 | "1-01",266.0 3 | "1-02",145.9 4 | "1-03",183.1 5 | "1-04",119.3 6 | "1-05",180.3 7 | "1-06",168.5 8 | "1-07",231.8 9 | "1-08",224.5 10 | "1-09",192.8 11 | "1-10",122.9 12 | "1-11",336.5 13 | "1-12",185.9 14 | "2-01",194.3 15 | "2-02",149.5 16 | "2-03",210.1 17 | "2-04",273.3 18 | "2-05",191.4 19 | "2-06",287.0 20 | "2-07",226.0 21 | "2-08",303.6 22 | "2-09",289.9 23 | "2-10",421.6 24 | "2-11",264.5 25 | "2-12",342.3 26 | "3-01",339.7 27 | "3-02",440.4 28 | "3-03",315.9 29 | "3-04",439.3 30 | "3-05",401.3 31 | "3-06",437.4 32 | "3-07",575.5 33 | "3-08",407.6 34 | "3-09",682.0 35 | "3-10",475.3 36 | "3-11",581.3 37 | "3-12",646.9 38 | 39 | Sales of shampoo over a three year period 40 | 41 | -------------------------------------------------------------------------------- /entry.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | import argparse 4 | 5 | if __name__ == '__main__': 6 | spark = SparkSession.builder.appName('PySpark-App').getOrCreate() 7 | print('Session created') 8 | emp_data = spark.createDataFrame([(1,2),(3,4),(5,6)],['a','b']) 9 | print (emp_data.count()) 10 | -------------------------------------------------------------------------------- /fakefriends.csv: -------------------------------------------------------------------------------- 1 | 0,Will,33,385 2 | 1,Jean-Luc,26,2 3 | 2,Hugh,55,221 4 | 3,Deanna,40,465 5 | 4,Quark,68,21 6 | 5,Weyoun,59,318 7 | 6,Gowron,37,220 8 | 7,Will,54,307 9 | 8,Jadzia,38,380 10 | 9,Hugh,27,181 11 | 10,Odo,53,191 12 | 11,Ben,57,372 13 | 12,Keiko,54,253 14 | 13,Jean-Luc,56,444 15 | 14,Hugh,43,49 16 | 15,Rom,36,49 17 | 16,Weyoun,22,323 18 | 17,Odo,35,13 19 | 18,Jean-Luc,45,455 20 | 19,Geordi,60,246 21 | 20,Odo,67,220 22 | 21,Miles,19,268 23 | 
22,Quark,30,72 24 | 23,Keiko,51,271 25 | 24,Julian,25,1 26 | 25,Ben,21,445 27 | 26,Julian,22,100 28 | 27,Leeta,42,363 29 | 28,Martok,49,476 30 | 29,Nog,48,364 31 | 30,Keiko,50,175 32 | 31,Miles,39,161 33 | 32,Nog,26,281 34 | 33,Dukat,53,197 35 | 34,Jean-Luc,43,249 36 | 35,Beverly,27,305 37 | 36,Kasidy,32,81 38 | 37,Geordi,58,21 39 | 38,Deanna,64,65 40 | 39,Morn,31,192 41 | 40,Odo,52,413 42 | 41,Hugh,67,167 43 | 42,Brunt,54,75 44 | 43,Guinan,58,345 45 | 44,Nerys,35,244 46 | 45,Dukat,52,77 47 | 46,Morn,25,96 48 | 47,Brunt,24,49 49 | 48,Nog,20,1 50 | 49,Ezri,40,254 51 | 50,Quark,51,283 52 | 51,Lwaxana,36,212 53 | 52,Beverly,19,269 54 | 53,Geordi,62,31 55 | 54,Brunt,19,5 56 | 55,Keiko,41,278 57 | 56,Gowron,44,194 58 | 57,Odo,57,294 59 | 58,Hugh,59,158 60 | 59,Morn,59,284 61 | 60,Geordi,20,100 62 | 61,Kasidy,62,442 63 | 62,Keiko,69,9 64 | 63,Jean-Luc,58,54 65 | 64,Elim,31,15 66 | 65,Guinan,52,169 67 | 66,Geordi,21,477 68 | 67,Jadzia,48,135 69 | 68,Guinan,33,74 70 | 69,Jean-Luc,30,204 71 | 70,Brunt,52,393 72 | 71,Geordi,45,184 73 | 72,Kasidy,22,179 74 | 73,Brunt,20,384 75 | 74,Leeta,65,208 76 | 75,Morn,40,459 77 | 76,Will,62,201 78 | 77,Weyoun,40,407 79 | 78,Data,61,337 80 | 79,Leeta,58,348 81 | 80,Dukat,67,445 82 | 81,Jadzia,54,440 83 | 82,Hugh,57,465 84 | 83,Geordi,32,308 85 | 84,Ben,28,311 86 | 85,Quark,66,383 87 | 86,Hugh,55,257 88 | 87,Ezri,31,481 89 | 88,Ben,66,188 90 | 89,Worf,24,492 91 | 90,Kasidy,33,471 92 | 91,Rom,46,88 93 | 92,Gowron,54,7 94 | 93,Elim,46,63 95 | 94,Morn,62,133 96 | 95,Odo,29,173 97 | 96,Ezri,25,233 98 | 97,Nerys,69,361 99 | 98,Will,44,178 100 | 99,Keiko,69,491 101 | 100,Jean-Luc,61,460 102 | 101,Morn,67,123 103 | 102,Dukat,40,18 104 | 103,Ezri,61,2 105 | 104,Dukat,32,142 106 | 105,Morn,50,417 107 | 106,Beverly,18,499 108 | 107,Will,64,419 109 | 108,Leeta,25,274 110 | 109,Quark,53,417 111 | 110,Nog,64,137 112 | 111,Nerys,37,46 113 | 112,Morn,25,13 114 | 113,Quark,41,244 115 | 114,Worf,33,275 116 | 115,Dukat,18,397 117 | 116,Ben,69,75 118 | 117,Rom,52,487 119 | 118,Ben,28,304 120 | 119,Worf,29,344 121 | 120,Jean-Luc,68,264 122 | 121,Deanna,35,355 123 | 122,Data,45,400 124 | 123,Jadzia,45,439 125 | 124,Data,47,429 126 | 125,Rom,66,284 127 | 126,Brunt,26,84 128 | 127,Miles,40,284 129 | 128,Julian,34,221 130 | 129,Kasidy,45,252 131 | 130,Gowron,67,350 132 | 131,Hugh,65,309 133 | 132,Odo,46,462 134 | 133,Quark,19,265 135 | 134,Ben,45,340 136 | 135,Rom,42,427 137 | 136,Will,19,335 138 | 137,Martok,28,32 139 | 138,Dukat,32,384 140 | 139,Nog,36,193 141 | 140,Elim,64,234 142 | 141,Miles,36,424 143 | 142,Guinan,59,335 144 | 143,Data,60,124 145 | 144,Miles,22,93 146 | 145,Leeta,45,470 147 | 146,Nerys,58,174 148 | 147,Quark,61,373 149 | 148,Nerys,39,248 150 | 149,Beverly,49,340 151 | 150,Nerys,55,313 152 | 151,Keiko,54,441 153 | 152,Kasidy,54,235 154 | 153,Morn,63,342 155 | 154,Geordi,40,389 156 | 155,Beverly,50,126 157 | 156,Deanna,44,360 158 | 157,Dukat,34,319 159 | 158,Odo,31,340 160 | 159,Kasidy,67,438 161 | 160,Beverly,58,112 162 | 161,Odo,39,207 163 | 162,Ezri,59,14 164 | 163,Nerys,67,204 165 | 164,Will,31,172 166 | 165,Leeta,26,282 167 | 166,Lwaxana,25,10 168 | 167,Quark,48,57 169 | 168,Martok,68,112 170 | 169,Beverly,53,92 171 | 170,Jean-Luc,68,490 172 | 171,Weyoun,29,126 173 | 172,Kasidy,55,204 174 | 173,Leeta,23,129 175 | 174,Deanna,47,87 176 | 175,Will,38,459 177 | 176,Worf,55,474 178 | 177,Brunt,67,316 179 | 178,Kasidy,26,381 180 | 179,Elim,37,426 181 | 180,Kasidy,30,108 182 | 181,Rom,43,404 183 | 182,Weyoun,26,145 184 | 183,Ben,47,488 185 | 184,Julian,44,84 186 | 
185,Weyoun,48,287 187 | 186,Miles,31,109 188 | 187,Nerys,47,225 189 | 188,Keiko,54,369 190 | 189,Quark,62,23 191 | 190,Geordi,60,294 192 | 191,Nog,40,349 193 | 192,Jadzia,45,497 194 | 193,Nerys,60,125 195 | 194,Kasidy,38,2 196 | 195,Ben,30,376 197 | 196,Data,38,173 198 | 197,Leeta,38,76 199 | 198,Brunt,48,381 200 | 199,Hugh,38,180 201 | 200,Kasidy,21,472 202 | 201,Ezri,23,174 203 | 202,Lwaxana,63,469 204 | 203,Ezri,46,125 205 | 204,Deanna,64,164 206 | 205,Morn,69,236 207 | 206,Will,21,491 208 | 207,Lwaxana,41,206 209 | 208,Nog,37,271 210 | 209,Brunt,27,174 211 | 210,Data,33,245 212 | 211,Ben,61,73 213 | 212,Geordi,55,284 214 | 213,Worf,28,312 215 | 214,Miles,32,182 216 | 215,Will,22,6 217 | 216,Brunt,34,116 218 | 217,Keiko,29,260 219 | 218,Gowron,66,350 220 | 219,Lwaxana,26,345 221 | 220,Jean-Luc,41,394 222 | 221,Dukat,27,150 223 | 222,Rom,34,346 224 | 223,Odo,40,406 225 | 224,Keiko,44,277 226 | 225,Elim,19,106 227 | 226,Lwaxana,37,207 228 | 227,Ezri,40,198 229 | 228,Martok,26,293 230 | 229,Gowron,24,150 231 | 230,Beverly,54,397 232 | 231,Ezri,59,42 233 | 232,Worf,68,481 234 | 233,Gowron,67,70 235 | 234,Deanna,49,22 236 | 235,Elim,57,8 237 | 236,Brunt,62,442 238 | 237,Nerys,61,469 239 | 238,Deanna,25,305 240 | 239,Nog,48,345 241 | 240,Deanna,46,154 242 | 241,Quark,45,332 243 | 242,Data,25,101 244 | 243,Martok,61,68 245 | 244,Dukat,21,471 246 | 245,Jean-Luc,28,174 247 | 246,Leeta,41,260 248 | 247,Ezri,52,338 249 | 248,Dukat,21,138 250 | 249,Nerys,66,41 251 | 250,Hugh,36,342 252 | 251,Rom,55,57 253 | 252,Will,36,174 254 | 253,Leeta,69,116 255 | 254,Ezri,67,79 256 | 255,Deanna,60,324 257 | 256,Worf,32,412 258 | 257,Data,51,161 259 | 258,Worf,68,217 260 | 259,Kasidy,29,11 261 | 260,Brunt,38,96 262 | 261,Jadzia,40,172 263 | 262,Will,51,334 264 | 263,Martok,40,33 265 | 264,Julian,29,228 266 | 265,Gowron,27,471 267 | 266,Jean-Luc,66,496 268 | 267,Dukat,49,106 269 | 268,Ezri,26,298 270 | 269,Beverly,55,289 271 | 270,Data,44,353 272 | 271,Morn,25,446 273 | 272,Quark,29,367 274 | 273,Data,51,493 275 | 274,Julian,64,244 276 | 275,Will,47,13 277 | 276,Dukat,54,462 278 | 277,Hugh,46,300 279 | 278,Data,44,499 280 | 279,Beverly,23,133 281 | 280,Nerys,26,492 282 | 281,Worf,21,89 283 | 282,Geordi,32,404 284 | 283,Dukat,65,443 285 | 284,Nog,26,269 286 | 285,Data,43,101 287 | 286,Lwaxana,30,384 288 | 287,Beverly,64,396 289 | 288,Hugh,56,354 290 | 289,Ezri,30,221 291 | 290,Beverly,62,290 292 | 291,Dukat,23,373 293 | 292,Nog,63,380 294 | 293,Deanna,23,65 295 | 294,Leeta,38,410 296 | 295,Nerys,40,56 297 | 296,Data,38,454 298 | 297,Ben,45,395 299 | 298,Guinan,57,207 300 | 299,Rom,57,311 301 | 300,Beverly,49,147 302 | 301,Weyoun,28,108 303 | 302,Beverly,37,263 304 | 303,Deanna,46,319 305 | 304,Will,19,404 306 | 305,Quark,29,182 307 | 306,Beverly,23,323 308 | 307,Keiko,41,340 309 | 308,Morn,45,59 310 | 309,Geordi,67,153 311 | 310,Odo,68,189 312 | 311,Martok,43,48 313 | 312,Jadzia,61,421 314 | 313,Dukat,59,169 315 | 314,Geordi,36,168 316 | 315,Weyoun,25,208 317 | 316,Hugh,64,391 318 | 317,Guinan,59,439 319 | 318,Deanna,35,251 320 | 319,Leeta,30,476 321 | 320,Worf,62,450 322 | 321,Data,44,61 323 | 322,Rom,58,92 324 | 323,Nog,29,236 325 | 324,Miles,56,343 326 | 325,Keiko,51,492 327 | 326,Beverly,46,407 328 | 327,Julian,20,63 329 | 328,Deanna,62,41 330 | 329,Dukat,67,35 331 | 330,Ezri,33,356 332 | 331,Martok,30,17 333 | 332,Julian,55,362 334 | 333,Ben,29,207 335 | 334,Leeta,40,7 336 | 335,Odo,27,337 337 | 336,Gowron,47,4 338 | 337,Miles,58,10 339 | 338,Will,28,180 340 | 339,Morn,66,305 341 | 340,Nerys,57,275 342 | 
341,Data,18,326 343 | 342,Guinan,46,151 344 | 343,Odo,26,254 345 | 344,Data,30,487 346 | 345,Ezri,31,394 347 | 346,Hugh,29,329 348 | 347,Geordi,32,24 349 | 348,Weyoun,33,460 350 | 349,Kasidy,20,277 351 | 350,Nog,55,464 352 | 351,Keiko,54,72 353 | 352,Deanna,27,53 354 | 353,Julian,64,499 355 | 354,Kasidy,69,15 356 | 355,Keiko,46,352 357 | 356,Weyoun,67,149 358 | 357,Brunt,26,7 359 | 358,Will,52,276 360 | 359,Nog,54,442 361 | 360,Nerys,39,68 362 | 361,Worf,68,206 363 | 362,Ezri,39,120 364 | 363,Dukat,41,397 365 | 364,Lwaxana,54,115 366 | 365,Brunt,65,430 367 | 366,Keiko,19,119 368 | 367,Data,39,106 369 | 368,Elim,26,383 370 | 369,Quark,48,266 371 | 370,Jadzia,53,86 372 | 371,Guinan,31,435 373 | 372,Brunt,62,273 374 | 373,Quark,19,272 375 | 374,Nog,68,293 376 | 375,Hugh,66,201 377 | 376,Gowron,23,392 378 | 377,Beverly,18,418 379 | 378,Guinan,47,97 380 | 379,Data,60,304 381 | 380,Brunt,35,65 382 | 381,Nog,38,95 383 | 382,Worf,66,240 384 | 383,Data,69,148 385 | 384,Martok,67,355 386 | 385,Beverly,57,436 387 | 386,Data,35,428 388 | 387,Will,43,335 389 | 388,Nog,30,184 390 | 389,Weyoun,38,38 391 | 390,Martok,22,266 392 | 391,Ben,64,309 393 | 392,Data,64,343 394 | 393,Quark,50,436 395 | 394,Keiko,23,230 396 | 395,Jean-Luc,56,15 397 | 396,Keiko,67,38 398 | 397,Quark,69,470 399 | 398,Lwaxana,26,124 400 | 399,Beverly,24,401 401 | 400,Data,29,128 402 | 401,Jean-Luc,42,467 403 | 402,Hugh,58,98 404 | 403,Weyoun,21,224 405 | 404,Kasidy,18,24 406 | 405,Nog,56,371 407 | 406,Ben,57,121 408 | 407,Miles,36,68 409 | 408,Dukat,62,496 410 | 409,Nog,19,267 411 | 410,Odo,35,299 412 | 411,Lwaxana,58,22 413 | 412,Jadzia,53,451 414 | 413,Hugh,45,147 415 | 414,Martok,56,313 416 | 415,Quark,30,65 417 | 416,Nerys,33,294 418 | 417,Julian,37,106 419 | 418,Guinan,32,212 420 | 419,Kasidy,55,176 421 | 420,Jadzia,26,391 422 | 421,Will,40,261 423 | 422,Ben,67,292 424 | 423,Will,44,388 425 | 424,Keiko,55,470 426 | 425,Quark,33,243 427 | 426,Worf,24,77 428 | 427,Brunt,28,258 429 | 428,Lwaxana,68,423 430 | 429,Jean-Luc,63,345 431 | 430,Geordi,36,493 432 | 431,Quark,36,343 433 | 432,Brunt,45,54 434 | 433,Ezri,38,203 435 | 434,Deanna,57,289 436 | 435,Guinan,42,275 437 | 436,Geordi,57,229 438 | 437,Morn,59,221 439 | 438,Nog,42,95 440 | 439,Data,18,417 441 | 440,Elim,48,394 442 | 441,Jadzia,38,143 443 | 442,Nog,46,105 444 | 443,Geordi,64,175 445 | 444,Keiko,18,472 446 | 445,Guinan,40,286 447 | 446,Quark,32,41 448 | 447,Julian,38,34 449 | 448,Nerys,48,439 450 | 449,Data,52,419 451 | 450,Weyoun,37,234 452 | 451,Martok,28,34 453 | 452,Ezri,58,6 454 | 453,Julian,44,337 455 | 454,Weyoun,52,456 456 | 455,Elim,33,463 457 | 456,Ezri,37,471 458 | 457,Worf,51,81 459 | 458,Elim,44,335 460 | 459,Geordi,26,84 461 | 460,Hugh,47,400 462 | 461,Geordi,41,236 463 | 462,Nerys,23,287 464 | 463,Keiko,40,220 465 | 464,Beverly,25,485 466 | 465,Morn,53,126 467 | 466,Brunt,33,228 468 | 467,Weyoun,42,194 469 | 468,Ezri,46,227 470 | 469,Brunt,55,271 471 | 470,Deanna,38,160 472 | 471,Brunt,52,273 473 | 472,Nog,27,154 474 | 473,Morn,35,38 475 | 474,Keiko,34,48 476 | 475,Ben,52,446 477 | 476,Jean-Luc,28,378 478 | 477,Gowron,50,119 479 | 478,Dukat,41,62 480 | 479,Kasidy,44,320 481 | 480,Geordi,43,428 482 | 481,Elim,32,97 483 | 482,Ben,48,146 484 | 483,Hugh,57,99 485 | 484,Leeta,22,478 486 | 485,Rom,47,356 487 | 486,Elim,49,17 488 | 487,Brunt,69,431 489 | 488,Nog,61,103 490 | 489,Odo,33,410 491 | 490,Nerys,65,101 492 | 491,Rom,60,2 493 | 492,Dukat,19,36 494 | 493,Hugh,23,357 495 | 494,Kasidy,18,194 496 | 495,Data,46,155 497 | 496,Gowron,39,275 498 | 
497,Lwaxana,34,423 499 | 498,Jadzia,62,36 500 | 499,Leeta,62,12 501 | -------------------------------------------------------------------------------- /hadoop-2.7.1.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/hadoop-2.7.1.zip -------------------------------------------------------------------------------- /hr_data_analysis.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql import functions as F 3 | 4 | def getCountHardWorkingLessPaid(hr_data): 5 | return hr_data[(hr_data.satisfaction_level > 0.9) & (hr_data.salary == "low")].count() 6 | 7 | def increaseSalary(hr_data): 8 | hr_data = hr_data.withColumn('ActualSalary', hr_data.last_evaluation * 10000) 9 | hr_data = hr_data.withColumn('multifactor', 10 | F.when(hr_data.salary == "low",1) 11 | .when(hr_data.salary == "medium",2) 12 | .otherwise(3)) 13 | hr_data = hr_data.withColumn('ActualSalary', hr_data.ActualSalary * hr_data.multifactor) 14 | hr_data = hr_data.drop('multifactor') 15 | return hr_data  # DataFrames are immutable, so hand the transformed frame back to the caller 16 | 17 | if __name__ == '__main__': 18 | 19 | spark = SparkSession.builder.appName('HR-Data-Analysis').getOrCreate() 20 | print('Session created') 21 | 22 | hr_data = spark.read.csv('hr_data.csv', inferSchema=True, header=True) 23 | 24 | hr_data = hr_data.withColumnRenamed("sales","department") 25 | 26 | hr_data = hr_data.cache() 27 | count = getCountHardWorkingLessPaid(hr_data) 28 | print('Count of hardworking & less-paid folks:', count) 29 | 30 | hr_data = increaseSalary(hr_data)  # reassign: the original call discarded the returned DataFrame 31 | hr_data.show()  # show() prints the rows itself and returns None, so no print() wrapper 32 | -------------------------------------------------------------------------------- /kddcup.data_10_percent.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/kddcup.data_10_percent.gz -------------------------------------------------------------------------------- /metastore_db/README_DO_NOT_TOUCH_FILES.txt: -------------------------------------------------------------------------------- 1 | 2 | # ************************************************************************* 3 | # *** DO NOT TOUCH FILES IN THIS DIRECTORY! *** 4 | # *** FILES IN THIS DIRECTORY AND SUBDIRECTORIES CONSTITUTE A DERBY *** 5 | # *** DATABASE, WHICH INCLUDES THE DATA (USER AND SYSTEM) AND THE *** 6 | # *** FILES NECESSARY FOR DATABASE RECOVERY. *** 7 | # *** EDITING, ADDING, OR DELETING ANY OF THESE FILES MAY CAUSE DATA *** 8 | # *** CORRUPTION AND LEAVE THE DATABASE IN A NON-RECOVERABLE STATE. *** 9 | # ************************************************************************* -------------------------------------------------------------------------------- /metastore_db/db.lck: -------------------------------------------------------------------------------- 1 | $40348015-015f-c31d-a9cd-000000fc8a88 -------------------------------------------------------------------------------- /metastore_db/dbex.lck: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /metastore_db/log/README_DO_NOT_TOUCH_FILES.txt: -------------------------------------------------------------------------------- 1 | 2 | # ************************************************************************* 3 | # *** DO NOT TOUCH FILES IN THIS DIRECTORY!
*** 4 | # *** FILES IN THIS DIRECTORY ARE USED BY THE DERBY DATABASE RECOVERY *** 5 | # *** SYSTEM. EDITING, ADDING, OR DELETING FILES IN THIS DIRECTORY *** 6 | # *** WILL CAUSE THE DERBY RECOVERY SYSTEM TO FAIL, LEADING TO *** 7 | # *** NON-RECOVERABLE CORRUPT DATABASES. *** 8 | # ************************************************************************* -------------------------------------------------------------------------------- /metastore_db/log/log.ctrl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/log/log.ctrl -------------------------------------------------------------------------------- /metastore_db/log/log1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/log/log1.dat -------------------------------------------------------------------------------- /metastore_db/log/logmirror.ctrl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/log/logmirror.ctrl -------------------------------------------------------------------------------- /metastore_db/seg0/README_DO_NOT_TOUCH_FILES.txt: -------------------------------------------------------------------------------- 1 | 2 | # ************************************************************************* 3 | # *** DO NOT TOUCH FILES IN THIS DIRECTORY! *** 4 | # *** FILES IN THIS DIRECTORY ARE USED BY THE DERBY DATABASE TO STORE *** 5 | # *** USER AND SYSTEM DATA. EDITING, ADDING, OR DELETING FILES IN THIS *** 6 | # *** DIRECTORY WILL CORRUPT THE ASSOCIATED DERBY DATABASE AND MAKE *** 7 | # *** IT NON-RECOVERABLE. 
*** 8 | # ************************************************************************* -------------------------------------------------------------------------------- /metastore_db/seg0/c10.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c10.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c101.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c101.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c111.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c111.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c121.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c121.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c130.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c130.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c141.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c141.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c150.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c150.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c161.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c161.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c171.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c171.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c180.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c180.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c191.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c191.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c1a1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c1a1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c1b1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c1b1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c1c0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c1c0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c1d1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c1d1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c1e0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c1e0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c1f1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c1f1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c20.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c20.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c200.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c200.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c211.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c211.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c221.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c221.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c230.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c230.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c241.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c241.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c251.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c251.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c260.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c260.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c271.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c271.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c281.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c281.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c290.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c290.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c2a1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c2a1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c2b1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c2b1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c2c1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c2c1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c2d0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c2d0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c2e1.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c2e1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c2f0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c2f0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c300.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c300.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c31.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c31.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c311.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c311.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c321.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c321.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c331.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c331.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c340.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c340.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c351.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c351.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c361.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c361.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c371.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c371.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c380.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c380.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c391.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c391.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c3a1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c3a1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c3b1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c3b1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c3c0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c3c0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c3d1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c3d1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c3e1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c3e1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c3f1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c3f1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c400.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c400.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c41.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c41.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c411.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c411.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c421.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c421.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c430.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c430.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c441.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c441.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c451.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c451.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c461.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c461.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c470.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c470.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c481.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c481.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c490.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c490.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c4a1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c4a1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c4b0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c4b0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c4c1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c4c1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c4d1.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c4d1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c4e1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c4e1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c4f0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c4f0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c501.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c501.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c51.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c51.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c510.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c510.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c521.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c521.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c530.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c530.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c541.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c541.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c550.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c550.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c561.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c561.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c570.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c570.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c581.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c581.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c590.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c590.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c5a1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c5a1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c5b0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c5b0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c5c1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c5c1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c5d0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c5d0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c5e1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c5e1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c5f0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c5f0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c60.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c60.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c601.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c601.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c610.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c610.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c621.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c621.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c630.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c630.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c641.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c641.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c650.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c650.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c661.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c661.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c670.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c670.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c681.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c681.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c690.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c690.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c6a1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c6a1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c6b0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c6b0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c6c1.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c6c1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c6d0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c6d0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c6e1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c6e1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c6f0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c6f0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c701.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c701.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c71.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c71.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c711.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c711.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c721.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c721.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c731.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c731.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c741.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c741.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c751.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c751.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c761.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c761.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c771.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c771.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c781.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c781.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c791.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c791.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c7a1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c7a1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c7b1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c7b1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c7c1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c7c1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c7d1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c7d1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c7e1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c7e1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c7f1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c7f1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c801.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c801.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c81.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c81.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c811.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c811.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c821.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c821.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c831.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c831.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c840.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c840.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c851.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c851.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c860.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c860.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c871.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c871.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c880.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c880.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c891.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c891.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c8a0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c8a0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c8b1.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c8b1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c8c1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c8c1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c8d1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c8d1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c8e1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c8e1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c8f1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c8f1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c90.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c90.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c901.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c901.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c911.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c911.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c920.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c920.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c931.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c931.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c940.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c940.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c951.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c951.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c960.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c960.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c971.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c971.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c981.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c981.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c990.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c990.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c9a1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c9a1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c9b1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c9b1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c9c0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c9c0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c9d1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c9d1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c9e0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c9e0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/c9f1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/c9f1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/ca01.dat: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/ca01.dat -------------------------------------------------------------------------------- /metastore_db/seg0/ca1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/ca1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/ca11.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/ca11.dat -------------------------------------------------------------------------------- /metastore_db/seg0/ca21.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/ca21.dat -------------------------------------------------------------------------------- /metastore_db/seg0/cb1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/cb1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/cc0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/cc0.dat -------------------------------------------------------------------------------- /metastore_db/seg0/cd1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/cd1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/ce1.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/ce1.dat -------------------------------------------------------------------------------- /metastore_db/seg0/cf0.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/metastore_db/seg0/cf0.dat -------------------------------------------------------------------------------- /metastore_db/service.properties: -------------------------------------------------------------------------------- 1 | #/tensorFolder/pyspark/metastore_db 2 | # ******************************************************************** 3 | # *** Please do NOT edit this file. *** 4 | # *** CHANGING THE CONTENT OF THIS FILE MAY CAUSE DATA CORRUPTION. 
*** 5 | # ******************************************************************** 6 | #Mon Nov 13 05:00:30 UTC 2017 7 | SysschemasIndex2Identifier=225 8 | SyscolumnsIdentifier=144 9 | SysconglomeratesIndex1Identifier=49 10 | SysconglomeratesIdentifier=32 11 | SyscolumnsIndex2Identifier=177 12 | SysschemasIndex1Identifier=209 13 | SysconglomeratesIndex3Identifier=81 14 | SystablesIndex2Identifier=129 15 | SyscolumnsIndex1Identifier=161 16 | derby.serviceProtocol=org.apache.derby.database.Database 17 | SysschemasIdentifier=192 18 | derby.storage.propertiesId=16 19 | SysconglomeratesIndex2Identifier=65 20 | derby.serviceLocale=en_US 21 | SystablesIdentifier=96 22 | SystablesIndex1Identifier=113 23 | #--- last line, don't put anything after this line --- 24 | -------------------------------------------------------------------------------- /nifi_script.py: -------------------------------------------------------------------------------- 1 | from java.io import BufferedReader, InputStreamReader 2 | import json 3 | import java.io 4 | from org.apache.commons.io import IOUtils 5 | from java.nio.charset import StandardCharsets 6 | from org.apache.nifi.processor.io import StreamCallback 7 | 8 | class ModJSON(StreamCallback): 9 | def __init__(self): 10 | pass 11 | 12 | def process(self, inputStream, outputStream): 13 | text = IOUtils.toString(inputStream, StandardCharsets.UTF_8) 14 | obj = json.loads(text) 15 | newObj = { 16 | "Source": "NiFi", 17 | "Dest":"Stuff", 18 | } 19 | outputStream.write(bytearray(json.dumps(newObj, indent=4).encode('utf-8'))) 20 | 21 | flowFile = session.get() 22 | if (flowFile != None): 23 | flowFile = session.write(flowFile, ModJSON()) 24 | flowFile = session.putAttribute(flowFile, "filename", flowFile.getAttribute('filename').split('.')[0]+'_translated.json') 25 | session.transfer(flowFile, REL_SUCCESS) 26 | session.commit() 27 | -------------------------------------------------------------------------------- /om.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | from cassandra.cqlengine import columns 3 | from cassandra.cqlengine import connection 4 | from datetime import datetime 5 | from cassandra.cqlengine.management import sync_table 6 | from cassandra.cqlengine.models import Model 7 | 8 | #first, define a model 9 | class ExampleModel(Model): 10 | example_id = columns.UUID(primary_key=True, default=uuid.uuid4) 11 | example_type = columns.Integer(index=True) 12 | created_at = columns.DateTime() 13 | description = columns.Text(required=False) 14 | 15 | 16 | connection.setup(['127.0.0.1'], "cqlengine", protocol_version=3) 17 | 18 | sync_table(ExampleModel) 19 | 20 | em1 = ExampleModel.create(example_type=0, description="example1", created_at=datetime.now()) 21 | em2 = ExampleModel.create(example_type=0, description="example2", created_at=datetime.now()) 22 | em3 = ExampleModel.create(example_type=0, description="example3", created_at=datetime.now()) 23 | em4 = ExampleModel.create(example_type=0, description="example4", created_at=datetime.now()) 24 | em5 = ExampleModel.create(example_type=1, description="example5", created_at=datetime.now()) 25 | em6 = ExampleModel.create(example_type=1, description="example6", created_at=datetime.now()) 26 | em7 = ExampleModel.create(example_type=1, description="example7", created_at=datetime.now()) 27 | em8 = ExampleModel.create(example_type=1, description="example8", created_at=datetime.now()) 28 | 29 | print (ExampleModel.objects.count()) 30 | 
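A minimal follow-up sketch for om.py above, assuming its connection.setup/sync_table/create calls have already run: the secondary index declared on example_type is what makes an equality filter on that column legal in cqlengine.

# hedged sketch, not a file in this repo: querying the model defined in om.py
type1 = ExampleModel.objects.filter(example_type=1)
print(type1.count())                       # -> 4, given the eight creates above
for em in type1:
    print(em.example_id, em.created_at, em.description)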
-------------------------------------------------------------------------------- /pyspark1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/pyspark1.png -------------------------------------------------------------------------------- /pyspark2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/pyspark2.png -------------------------------------------------------------------------------- /resources/employees.json: -------------------------------------------------------------------------------- 1 | {"name":"Michael", "salary":3000} 2 | {"name":"Andy", "salary":4500} 3 | {"name":"Justin", "salary":3500} 4 | {"name":"Berta", "salary":4000} 5 | -------------------------------------------------------------------------------- /resources/full_user.avsc: -------------------------------------------------------------------------------- 1 | {"type": "record", "namespace": "example.avro", "name": "User", "fields": [{"type": "string", "name": "name"}, {"type": ["string", "null"], "name": "favorite_color"}, {"type": {"items": "int", "type": "array"}, "name": "favorite_numbers"}]} -------------------------------------------------------------------------------- /resources/kv1.txt: -------------------------------------------------------------------------------- 1 | 238val_238 2 | 86val_86 3 | 311val_311 4 | 27val_27 5 | 165val_165 6 | 409val_409 7 | 255val_255 8 | 278val_278 9 | 98val_98 10 | 484val_484 11 | 265val_265 12 | 193val_193 13 | 401val_401 14 | 150val_150 15 | 273val_273 16 | 224val_224 17 | 369val_369 18 | 66val_66 19 | 128val_128 20 | 213val_213 21 | 146val_146 22 | 406val_406 23 | 429val_429 24 | 374val_374 25 | 152val_152 26 | 469val_469 27 | 145val_145 28 | 495val_495 29 | 37val_37 30 | 327val_327 31 | 281val_281 32 | 277val_277 33 | 209val_209 34 | 15val_15 35 | 82val_82 36 | 403val_403 37 | 166val_166 38 | 417val_417 39 | 430val_430 40 | 252val_252 41 | 292val_292 42 | 219val_219 43 | 287val_287 44 | 153val_153 45 | 193val_193 46 | 338val_338 47 | 446val_446 48 | 459val_459 49 | 394val_394 50 | 237val_237 51 | 482val_482 52 | 174val_174 53 | 413val_413 54 | 494val_494 55 | 207val_207 56 | 199val_199 57 | 466val_466 58 | 208val_208 59 | 174val_174 60 | 399val_399 61 | 396val_396 62 | 247val_247 63 | 417val_417 64 | 489val_489 65 | 162val_162 66 | 377val_377 67 | 397val_397 68 | 309val_309 69 | 365val_365 70 | 266val_266 71 | 439val_439 72 | 342val_342 73 | 367val_367 74 | 325val_325 75 | 167val_167 76 | 195val_195 77 | 475val_475 78 | 17val_17 79 | 113val_113 80 | 155val_155 81 | 203val_203 82 | 339val_339 83 | 0val_0 84 | 455val_455 85 | 128val_128 86 | 311val_311 87 | 316val_316 88 | 57val_57 89 | 302val_302 90 | 205val_205 91 | 149val_149 92 | 438val_438 93 | 345val_345 94 | 129val_129 95 | 170val_170 96 | 20val_20 97 | 489val_489 98 | 157val_157 99 | 378val_378 100 | 221val_221 101 | 92val_92 102 | 111val_111 103 | 47val_47 104 | 72val_72 105 | 4val_4 106 | 280val_280 107 | 35val_35 108 | 427val_427 109 | 277val_277 110 | 208val_208 111 | 356val_356 112 | 399val_399 113 | 169val_169 114 | 382val_382 115 | 498val_498 116 | 125val_125 117 | 386val_386 118 | 437val_437 119 | 469val_469 120 | 192val_192 121 | 286val_286 122 | 187val_187 123 | 176val_176 124 | 54val_54 125 | 459val_459 126 | 51val_51 127 | 138val_138 128 | 
103val_103 129 | 239val_239 130 | 213val_213 131 | 216val_216 132 | 430val_430 133 | 278val_278 134 | 176val_176 135 | 289val_289 136 | 221val_221 137 | 65val_65 138 | 318val_318 139 | 332val_332 140 | 311val_311 141 | 275val_275 142 | 137val_137 143 | 241val_241 144 | 83val_83 145 | 333val_333 146 | 180val_180 147 | 284val_284 148 | 12val_12 149 | 230val_230 150 | 181val_181 151 | 67val_67 152 | 260val_260 153 | 404val_404 154 | 384val_384 155 | 489val_489 156 | 353val_353 157 | 373val_373 158 | 272val_272 159 | 138val_138 160 | 217val_217 161 | 84val_84 162 | 348val_348 163 | 466val_466 164 | 58val_58 165 | 8val_8 166 | 411val_411 167 | 230val_230 168 | 208val_208 169 | 348val_348 170 | 24val_24 171 | 463val_463 172 | 431val_431 173 | 179val_179 174 | 172val_172 175 | 42val_42 176 | 129val_129 177 | 158val_158 178 | 119val_119 179 | 496val_496 180 | 0val_0 181 | 322val_322 182 | 197val_197 183 | 468val_468 184 | 393val_393 185 | 454val_454 186 | 100val_100 187 | 298val_298 188 | 199val_199 189 | 191val_191 190 | 418val_418 191 | 96val_96 192 | 26val_26 193 | 165val_165 194 | 327val_327 195 | 230val_230 196 | 205val_205 197 | 120val_120 198 | 131val_131 199 | 51val_51 200 | 404val_404 201 | 43val_43 202 | 436val_436 203 | 156val_156 204 | 469val_469 205 | 468val_468 206 | 308val_308 207 | 95val_95 208 | 196val_196 209 | 288val_288 210 | 481val_481 211 | 457val_457 212 | 98val_98 213 | 282val_282 214 | 197val_197 215 | 187val_187 216 | 318val_318 217 | 318val_318 218 | 409val_409 219 | 470val_470 220 | 137val_137 221 | 369val_369 222 | 316val_316 223 | 169val_169 224 | 413val_413 225 | 85val_85 226 | 77val_77 227 | 0val_0 228 | 490val_490 229 | 87val_87 230 | 364val_364 231 | 179val_179 232 | 118val_118 233 | 134val_134 234 | 395val_395 235 | 282val_282 236 | 138val_138 237 | 238val_238 238 | 419val_419 239 | 15val_15 240 | 118val_118 241 | 72val_72 242 | 90val_90 243 | 307val_307 244 | 19val_19 245 | 435val_435 246 | 10val_10 247 | 277val_277 248 | 273val_273 249 | 306val_306 250 | 224val_224 251 | 309val_309 252 | 389val_389 253 | 327val_327 254 | 242val_242 255 | 369val_369 256 | 392val_392 257 | 272val_272 258 | 331val_331 259 | 401val_401 260 | 242val_242 261 | 452val_452 262 | 177val_177 263 | 226val_226 264 | 5val_5 265 | 497val_497 266 | 402val_402 267 | 396val_396 268 | 317val_317 269 | 395val_395 270 | 58val_58 271 | 35val_35 272 | 336val_336 273 | 95val_95 274 | 11val_11 275 | 168val_168 276 | 34val_34 277 | 229val_229 278 | 233val_233 279 | 143val_143 280 | 472val_472 281 | 322val_322 282 | 498val_498 283 | 160val_160 284 | 195val_195 285 | 42val_42 286 | 321val_321 287 | 430val_430 288 | 119val_119 289 | 489val_489 290 | 458val_458 291 | 78val_78 292 | 76val_76 293 | 41val_41 294 | 223val_223 295 | 492val_492 296 | 149val_149 297 | 449val_449 298 | 218val_218 299 | 228val_228 300 | 138val_138 301 | 453val_453 302 | 30val_30 303 | 209val_209 304 | 64val_64 305 | 468val_468 306 | 76val_76 307 | 74val_74 308 | 342val_342 309 | 69val_69 310 | 230val_230 311 | 33val_33 312 | 368val_368 313 | 103val_103 314 | 296val_296 315 | 113val_113 316 | 216val_216 317 | 367val_367 318 | 344val_344 319 | 167val_167 320 | 274val_274 321 | 219val_219 322 | 239val_239 323 | 485val_485 324 | 116val_116 325 | 223val_223 326 | 256val_256 327 | 263val_263 328 | 70val_70 329 | 487val_487 330 | 480val_480 331 | 401val_401 332 | 288val_288 333 | 191val_191 334 | 5val_5 335 | 244val_244 336 | 438val_438 337 | 128val_128 338 | 467val_467 339 | 432val_432 340 | 202val_202 341 | 316val_316 342 | 229val_229 
343 | 469val_469 344 | 463val_463 345 | 280val_280 346 | 2val_2 347 | 35val_35 348 | 283val_283 349 | 331val_331 350 | 235val_235 351 | 80val_80 352 | 44val_44 353 | 193val_193 354 | 321val_321 355 | 335val_335 356 | 104val_104 357 | 466val_466 358 | 366val_366 359 | 175val_175 360 | 403val_403 361 | 483val_483 362 | 53val_53 363 | 105val_105 364 | 257val_257 365 | 406val_406 366 | 409val_409 367 | 190val_190 368 | 406val_406 369 | 401val_401 370 | 114val_114 371 | 258val_258 372 | 90val_90 373 | 203val_203 374 | 262val_262 375 | 348val_348 376 | 424val_424 377 | 12val_12 378 | 396val_396 379 | 201val_201 380 | 217val_217 381 | 164val_164 382 | 431val_431 383 | 454val_454 384 | 478val_478 385 | 298val_298 386 | 125val_125 387 | 431val_431 388 | 164val_164 389 | 424val_424 390 | 187val_187 391 | 382val_382 392 | 5val_5 393 | 70val_70 394 | 397val_397 395 | 480val_480 396 | 291val_291 397 | 24val_24 398 | 351val_351 399 | 255val_255 400 | 104val_104 401 | 70val_70 402 | 163val_163 403 | 438val_438 404 | 119val_119 405 | 414val_414 406 | 200val_200 407 | 491val_491 408 | 237val_237 409 | 439val_439 410 | 360val_360 411 | 248val_248 412 | 479val_479 413 | 305val_305 414 | 417val_417 415 | 199val_199 416 | 444val_444 417 | 120val_120 418 | 429val_429 419 | 169val_169 420 | 443val_443 421 | 323val_323 422 | 325val_325 423 | 277val_277 424 | 230val_230 425 | 478val_478 426 | 178val_178 427 | 468val_468 428 | 310val_310 429 | 317val_317 430 | 333val_333 431 | 493val_493 432 | 460val_460 433 | 207val_207 434 | 249val_249 435 | 265val_265 436 | 480val_480 437 | 83val_83 438 | 136val_136 439 | 353val_353 440 | 172val_172 441 | 214val_214 442 | 462val_462 443 | 233val_233 444 | 406val_406 445 | 133val_133 446 | 175val_175 447 | 189val_189 448 | 454val_454 449 | 375val_375 450 | 401val_401 451 | 421val_421 452 | 407val_407 453 | 384val_384 454 | 256val_256 455 | 26val_26 456 | 134val_134 457 | 67val_67 458 | 384val_384 459 | 379val_379 460 | 18val_18 461 | 462val_462 462 | 492val_492 463 | 100val_100 464 | 298val_298 465 | 9val_9 466 | 341val_341 467 | 498val_498 468 | 146val_146 469 | 458val_458 470 | 362val_362 471 | 186val_186 472 | 285val_285 473 | 348val_348 474 | 167val_167 475 | 18val_18 476 | 273val_273 477 | 183val_183 478 | 281val_281 479 | 344val_344 480 | 97val_97 481 | 469val_469 482 | 315val_315 483 | 84val_84 484 | 28val_28 485 | 37val_37 486 | 448val_448 487 | 152val_152 488 | 348val_348 489 | 307val_307 490 | 194val_194 491 | 414val_414 492 | 477val_477 493 | 222val_222 494 | 126val_126 495 | 90val_90 496 | 169val_169 497 | 403val_403 498 | 400val_400 499 | 200val_200 500 | 97val_97 501 | -------------------------------------------------------------------------------- /resources/people.json: -------------------------------------------------------------------------------- 1 | {"name":"Michael"} 2 | {"name":"Andy", "age":30} 3 | {"name":"Justin", "age":19} 4 | -------------------------------------------------------------------------------- /resources/people.txt: -------------------------------------------------------------------------------- 1 | Michael, 29 2 | Andy, 30 3 | Justin, 19 4 | -------------------------------------------------------------------------------- /resources/user.avsc: -------------------------------------------------------------------------------- 1 | {"namespace": "example.avro", 2 | "type": "record", 3 | "name": "User", 4 | "fields": [ 5 | {"name": "name", "type": "string"}, 6 | {"name": "favorite_color", "type": ["string", "null"]} 7 | ] 8 | } 9 | 
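The resources/ files above and the users.avro/users.parquet binaries that follow mirror Spark's bundled SQL examples. A minimal hedged sketch of loading them, assuming Spark 2.4 with the repo root as the working directory (users.avro would additionally require the external spark-avro package):

# hedged sketch, not a file in this repo
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('resources-demo').getOrCreate()
spark.read.json('resources/people.json').show()       # schema (age, name) inferred from JSON-lines records
spark.read.parquet('resources/users.parquet').show()  # Parquet carries its own schema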
-------------------------------------------------------------------------------- /resources/users.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/resources/users.avro -------------------------------------------------------------------------------- /resources/users.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/resources/users.parquet -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='PySparkUtilities', 5 | version='0.1dev', 6 | packages=['utilities'], 7 | license=''' 8 | Creative Commons 9 | Attribution-Noncommercial-Share Alike license''', 10 | long_description=''' 11 | An example of how to package code for PySpark''' 12 | ) -------------------------------------------------------------------------------- /spark_hive.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser, join, abspath 2 | 3 | from pyspark.sql import SparkSession 4 | from pyspark.sql import Row 5 | 6 | # warehouse_location points to the default location for managed databases and tables 7 | warehouse_location = '/home/awantik/spark-warehouse' 8 | 9 | spark = SparkSession \ 10 | .builder \ 11 | .appName("Python Spark SQL Hive integration example") \ 12 | .config("spark.sql.warehouse.dir", warehouse_location) \ 13 | .enableHiveSupport() \ 14 | .getOrCreate() 15 | 16 | # spark is an existing SparkSession 17 | spark.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive") 18 | spark.sql("LOAD DATA LOCAL INPATH '/home/awantik/packages/spark-2.4.3-bin-hadoop2.7/examples/src/main/resources/kv1.txt' INTO TABLE src") 19 | df = spark.sql("SELECT * FROM src") 20 | df.show() 21 | 22 | spark.sql("CREATE TABLE IF NOT EXISTS newsrc (key INT, value STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LINES TERMINATED BY '\n'") 23 | spark.sql("LOAD DATA LOCAL INPATH '/home/awantik/emp.txt' INTO TABLE newsrc") 24 | df2 = spark.sql("SELECT * FROM newsrc") 25 | df2.show() 26 | 27 | df = df2.unionAll(df) 28 | df.show() 29 | -------------------------------------------------------------------------------- /test_file.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | import pyspark.sql 3 | import pytest 4 | from spark_utilities import df_count 5 | 6 | 7 | testdata = [ 8 | ([[1, 3], [2, 4]]), 9 | ([[1, 3], [2, 4], [3,3]]) 10 | ] 11 | @pytest.mark.parametrize("a",testdata) 12 | def test_spark_session_dataframe(spark_session,a): 13 | test_df = spark_session.createDataFrame(a, "a: int, b: int") 14 | assert type(test_df) == pyspark.sql.dataframe.DataFrame 15 | assert df_count(test_df) == len(a)  # row count must track the parametrized input (2 rows, then 3) 16 | 17 | 18 | @pytest.mark.new 19 | def test_spark_session_sql(spark_session): 20 | test_df = spark_session.createDataFrame([[1, 3], [2, 4]], "a: int, b: int") 21 | test_df.registerTempTable('test') 22 | 23 | test_filtered_df = spark_session.sql('SELECT a, b from test where a > 1') 24 | assert test_filtered_df.count() == 1 25 | 26 | @pytest.mark.webtest 27 | def test_spark_session_s(spark_session): 28 | test_df = spark_session.createDataFrame([[1, 3], [2, 4]], "a: int, b: int")
29 | test_df.registerTempTable('test') 30 | 31 | test_filtered_df = spark_session.sql('SELECT a, b from test where a > 1') 32 | assert test_filtered_df.count() == 1 33 | -------------------------------------------------------------------------------- /test_hr_data.csv: -------------------------------------------------------------------------------- 1 | satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary 2 | 0.37,0.5,2,154,3,0,1,0,hr,medium 3 | 0.11,0.93,5,140,5,0,1,0,hr,medium 4 | 0.45,0.46,2,159,3,0,1,0,hr,high 5 | 0.44,0.48,2,158,3,0,1,0,technical,low 6 | 0.44,0.56,2,133,3,0,1,0,technical,medium 7 | 0.11,0.77,6,247,4,0,1,0,technical,medium 8 | 0.79,0.93,5,268,5,0,1,0,technical,medium 9 | 0.8,0.9,5,267,5,0,1,0,technical,medium 10 | 0.1,0.87,7,251,5,0,1,0,technical,low 11 | 0.09,0.93,6,279,4,0,1,0,technical,low 12 | 0.7,0.84,6,161,4,0,1,0,technical,low 13 | 0.72,0.84,4,256,5,0,1,0,technical,low 14 | 0.11,0.8,6,304,4,0,1,0,technical,low 15 | 0.39,0.51,2,137,3,0,1,0,technical,low 16 | 0.4,0.49,2,144,3,0,1,0,support,low 17 | 0.43,0.54,2,142,3,0,1,0,support,low 18 | 0.76,0.87,5,262,5,0,1,0,support,low 19 | 0.4,0.48,2,142,3,0,1,0,support,low 20 | 0.09,0.89,6,282,4,0,1,0,support,low 21 | 0.37,0.54,2,157,3,0,1,0,support,low 22 | 0.87,0.91,5,228,5,0,1,0,support,low 23 | 0.1,0.86,6,283,4,0,1,0,support,low 24 | 0.11,0.86,6,286,4,0,1,0,support,low 25 | 0.43,0.5,2,148,3,0,1,0,support,low 26 | 0.1,0.81,6,245,4,0,1,0,support,low 27 | 0.11,0.95,6,279,4,0,1,0,technical,low 28 | 0.85,0.87,5,245,5,0,1,0,technical,low 29 | 0.37,0.49,2,138,3,0,1,0,technical,low 30 | 0.44,0.52,2,141,3,0,1,0,management,low 31 | 0.1,0.83,7,302,5,0,1,0,IT,medium 32 | 0.11,0.89,6,268,4,0,1,0,IT,medium 33 | 0.87,0.88,5,240,5,0,1,0,IT,medium 34 | 0.39,0.49,2,127,3,0,1,0,IT,medium 35 | 0.1,0.94,7,264,4,0,1,0,IT,medium 36 | 0.44,0.53,2,155,3,0,1,0,product_mng,medium 37 | 0.4,0.49,2,143,3,0,1,0,product_mng,medium 38 | 0.76,0.98,5,217,6,0,1,0,product_mng,medium 39 | 0.46,0.55,2,147,3,0,1,0,product_mng,medium 40 | 0.9,0.92,4,271,5,0,1,0,IT,medium 41 | 0.85,0.87,4,273,5,0,1,0,RandD,medium 42 | 0.1,0.78,5,285,4,1,1,0,RandD,medium 43 | 0.43,0.49,2,131,3,0,1,0,RandD,high 44 | 0.2,0.5,5,135,6,0,1,0,RandD,low 45 | 0.81,0.92,5,239,5,0,1,0,RandD,medium 46 | 0.83,0.85,5,237,5,0,1,0,marketing,medium 47 | 0.14,0.75,4,277,5,1,1,0,sales,medium 48 | 0.1,0.84,5,303,5,0,1,0,accounting,medium 49 | 0.91,0.98,4,242,6,0,1,0,support,low 50 | 0.37,0.57,2,158,3,0,1,0,technical,low 51 | 0.42,0.57,2,147,3,1,1,0,management,low 52 | 0.39,0.68,2,282,5,0,1,0,marketing,low 53 | 0.39,0.54,2,154,3,0,1,0,marketing,low 54 | 0.44,0.52,2,149,3,0,1,0,marketing,low 55 | 0.37,0.45,2,149,3,0,1,0,sales,low 56 | 0.39,0.53,2,146,3,0,1,0,sales,low 57 | 0.72,0.94,4,258,5,0,1,0,sales,low 58 | 0.37,0.49,2,148,3,0,1,0,sales,low 59 | 0.82,0.94,5,236,5,0,1,0,sales,low 60 | 0.42,0.52,2,134,3,0,1,0,sales,low 61 | 0.59,1,2,155,5,0,1,0,sales,low 62 | 0.82,0.86,5,257,5,0,1,0,sales,low 63 | 0.73,0.97,6,189,2,0,1,0,sales,low 64 | 0.78,0.66,3,164,3,0,1,0,sales,low 65 | 0.09,0.95,6,271,4,0,1,0,sales,low 66 | 0.1,0.97,6,280,4,0,1,0,sales,low 67 | 0.45,0.46,2,149,3,0,1,0,sales,low 68 | 0.83,0.81,5,219,5,0,1,0,sales,low 69 | 0.43,0.51,2,128,3,0,1,0,sales,low 70 | 0.4,0.47,2,128,3,0,1,0,sales,medium 71 | 0.43,0.46,2,157,3,0,1,0,sales,medium 72 | 0.78,0.93,4,225,5,0,1,0,sales,medium 73 | 0.39,0.45,2,140,3,0,1,0,sales,medium 74 | 0.11,0.97,6,310,4,0,1,0,accounting,medium 75 | 
0.36,0.52,2,143,3,0,1,0,accounting,medium 76 | 0.36,0.54,2,153,3,0,1,0,accounting,medium 77 | 0.1,0.79,7,310,4,0,1,0,hr,medium 78 | 0.4,0.47,2,136,3,0,1,0,hr,medium 79 | 0.81,0.85,4,251,6,0,1,0,hr,medium 80 | 0.4,0.47,2,144,3,0,1,0,hr,medium 81 | 0.09,0.93,6,296,4,0,1,0,technical,medium 82 | 0.76,0.89,5,238,5,0,1,0,technical,high 83 | 0.73,0.93,5,162,4,0,1,0,technical,low 84 | 0.38,0.49,2,137,3,0,1,0,technical,medium 85 | 0.72,0.84,5,257,5,0,1,0,technical,medium 86 | 0.4,0.56,2,148,3,0,1,0,technical,medium 87 | 0.91,0.99,5,254,5,0,1,0,technical,medium 88 | 0.85,0.85,4,247,6,0,1,0,technical,low 89 | 0.9,0.7,5,206,4,0,1,0,technical,low 90 | 0.46,0.55,2,145,3,0,1,0,technical,low 91 | 0.43,0.57,2,159,3,1,1,0,technical,low 92 | 0.89,0.88,5,228,5,1,1,0,support,low 93 | 0.09,0.81,6,257,4,0,1,0,support,low 94 | 0.4,0.48,2,155,3,0,1,0,support,low 95 | 0.76,0.83,6,293,6,0,1,0,support,low 96 | 0.4,0.57,2,151,3,0,1,0,support,low 97 | 0.37,0.48,2,160,3,0,1,0,support,low 98 | 0.37,0.53,2,143,3,0,1,0,support,low 99 | 0.11,0.96,6,280,4,0,1,0,support,low 100 | 0.37,0.52,2,158,3,0,1,0,support,low 101 | -------------------------------------------------------------------------------- /test_hr_data_analysis.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | import pyspark.sql 3 | import pytest 4 | import hr_data_analysis 5 | 6 | @pytest.mark.old 7 | def test_spark_session_sql0(spark_session): 8 | test_df = spark_session.read.csv('test_hr_data.csv',inferSchema=True, header=True) 9 | assert hr_data_analysis.getCountHardWorkingLessPaid(test_df) == 1 10 | 11 | 12 | @pytest.mark.new 13 | def test_spark_session_sql(spark_session): 14 | test_df = spark_session.createDataFrame([[1, 3], [2, 4]], "a: int, b: int") 15 | test_df.registerTempTable('test') 16 | 17 | test_filtered_df = spark_session.sql('SELECT a, b from test where a > 1') 18 | assert test_filtered_df.count() == 1 19 | 20 | @pytest.mark.old 21 | def test_spark_session_sql2(spark_session): 22 | test_df = spark_session.createDataFrame([[1, 3], [2, 4]], "a: int, b: int") 23 | test_df.registerTempTable('test') 24 | 25 | test_filtered_df = spark_session.sql('SELECT a, b from test where a > 1') 26 | assert test_filtered_df.count() == 1 27 | 28 | -------------------------------------------------------------------------------- /titanic-survival-project.tar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edyoda/pyspark-tutorial/d031b052ce5879b3c810419de709cbef84d5371a/titanic-survival-project.tar -------------------------------------------------------------------------------- /titanic.py: -------------------------------------------------------------------------------- 1 | import pyspark.ml.classification as cl 2 | from pyspark.ml.feature import PCA 3 | from pyspark.ml.feature import StringIndexer,OneHotEncoderEstimator,VectorAssembler 4 | from pyspark.ml import Pipeline 5 | 6 | class Titanic: 7 | def __init__(self,spark,input_data,output_data): 8 | self.spark = spark 9 | self.input = input_data 10 | self.output = output_data 11 | 12 | def load(self): 13 | self.data_df = self.spark.read.csv(self.input,inferSchema=True,header=True) 14 | self.data_df.cache() 15 | 16 | def clean(self): 17 | self.data_df = self.data_df.fillna('S',['Embarked']) 18 | self.data_df = self.data_df.fillna(29,['Age']) 19 | 20 | def create_preprocessors(self): 21 | self.stages = [] 22 | 23 | cat_cols = ['Sex','Embarked'] 24 | 25 | st_list = [] 26 | for col in cat_cols: 27 | 
st = StringIndexer(inputCol=col, outputCol=col+'_si') 28 | st_list.append(st) 29 | 30 | self.stages.extend(st_list) 31 | 32 | ohe = OneHotEncoderEstimator(inputCols=['Sex_si','Embarked_si'], \ 33 | outputCols=['Sex_en','Embarked_en']) 34 | 35 | self.stages.append(ohe) 36 | 37 | num_cols = ['Pclass','Age','Fare'] 38 | 39 | feature_cols = num_cols + ['Sex_en','Embarked_en'] 40 | 41 | va = VectorAssembler(inputCols=feature_cols, outputCol='feature_vec') 42 | 43 | self.stages.append(va) 44 | 45 | def dimensionality_reduction(self): 46 | 47 | pca = PCA(k=3, inputCol='feature_vec', outputCol='feature_data') 48 | self.stages.append(pca) 49 | 50 | def create_estimators(self): 51 | 52 | logistic = cl.LogisticRegression(maxIter=10, regParam=0.01, labelCol='Survived',featuresCol='feature_data') 53 | self.stages.append(logistic) 54 | 55 | def create_pipeline(self): 56 | 57 | self.pipeline = Pipeline(stages=self.stages) 58 | 59 | def split_data(self): 60 | return self.data_df.randomSplit([0.7,0.3]) 61 | 62 | def fit(self,train): 63 | 64 | self.pipeline_model = self.pipeline.fit(train) 65 | 66 | def predict(self,test): 67 | 68 | return self.pipeline_model.transform(test) 69 | -------------------------------------------------------------------------------- /udt.py: -------------------------------------------------------------------------------- 1 | from cassandra.cluster import Cluster 2 | 3 | cluster = Cluster() 4 | session = cluster.connect() 5 | session.set_keyspace('university') 6 | 7 | session.execute("CREATE TYPE address (street text, zipcode int)") 8 | session.execute("CREATE TABLE user (id int PRIMARY KEY, location frozen<address>
)") 9 | 10 | 11 | 12 | # create a class to map to the "address" UDT 13 | class Address(object): 14 | 15 | def __init__(self, street, zipcode): 16 | self.street = street 17 | self.zipcode = zipcode 18 | 19 | #cluster.register_user_type('university', 'address', Address) 20 | 21 | data = [Address("123 Main St.", 78723), Address("123 Main St.", 78723)] 22 | 23 | # insert a row using an instance of Address 24 | for idx,d in ennumerate(data): 25 | session.execute("INSERT INTO user (id, location) VALUES (%s, %s)", 26 | (idx, d)) 27 | 28 | # results will include Address instances 29 | results = session.execute("SELECT * FROM user") 30 | row = results.one() 31 | print (row.id, row.location.street, row.location.zipcode) 32 | --------------------------------------------------------------------------------