├── project └── build.properties ├── docker-clean.sh ├── src ├── main │ ├── resources │ │ └── data │ │ │ ├── guitarPlayers │ │ │ └── guitarPlayers.json │ │ │ ├── bands │ │ │ └── bands.json │ │ │ ├── guitars │ │ │ └── guitars.json │ │ │ ├── lipsum │ │ │ └── words.txt │ │ │ ├── employees │ │ │ └── employees.csv │ │ │ ├── employees_headers │ │ │ └── employees_headers.csv │ │ │ └── cars │ │ │ └── cars.json │ └── scala │ │ ├── generator │ │ ├── LaptopsDomain.scala │ │ └── DataGenerator.scala │ │ ├── playground │ │ └── Playground.scala │ │ ├── part2foundations │ │ ├── TestDeployApp.scala │ │ ├── ReadingDAGs.scala │ │ ├── SparkJobAnatomy.scala │ │ ├── ReadingQueryPlans.scala │ │ └── SparkAPIs.scala │ │ ├── part3dfjoins │ │ ├── BroadcastJoins.scala │ │ ├── JoinsRecap.scala │ │ ├── SkewedJoins.scala │ │ ├── ColumnPruning.scala │ │ ├── PrePartitioning.scala │ │ └── Bucketing.scala │ │ ├── part4rddjoins │ │ ├── RDDBroadcastJoins.scala │ │ ├── CogroupingRDDs.scala │ │ ├── RDDSkewedJoins.scala │ │ └── SimpleRDDJoins.scala │ │ ├── part1recap │ │ ├── SparkRecap.scala │ │ └── ScalaRecap.scala │ │ └── part5rddtransformations │ │ ├── ByKeyFunctions.scala │ │ ├── I2ITransformations.scala │ │ └── ReusingObjects.scala └── META-INF │ └── MANIFEST.MF ├── spark-cluster ├── build-images.sh ├── docker │ ├── spark-worker │ │ ├── Dockerfile │ │ └── start-worker.sh │ ├── spark-submit │ │ ├── spark-submit.sh │ │ └── Dockerfile │ ├── spark-master │ │ ├── Dockerfile │ │ └── start-master.sh │ └── base │ │ └── Dockerfile ├── env │ └── spark-worker.sh ├── docker-compose.yml └── README.md ├── .bsp └── sbt.json ├── README.md ├── .gitignore └── HadoopWindowsUserSetup.md /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.9.6 -------------------------------------------------------------------------------- /docker-clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker rm -f $(docker ps -aq) -------------------------------------------------------------------------------- /src/main/resources/data/guitarPlayers/guitarPlayers.json: -------------------------------------------------------------------------------- 1 | {"id":0,"name":"Jimmy Page","guitars":[0],"band":0} 2 | {"id":1,"name":"Angus Young","guitars":[1],"band":1} 3 | {"id":2,"name":"Eric Clapton","guitars":[1,5],"band":2} 4 | {"id":3,"name":"Kirk Hammett","guitars":[3],"band":3} 5 | -------------------------------------------------------------------------------- /spark-cluster/build-images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | docker build -t spark-base:latest ./docker/base 6 | docker build -t spark-master:latest ./docker/spark-master 7 | docker build -t spark-worker:latest ./docker/spark-worker 8 | docker build -t spark-submit:latest ./docker/spark-submit -------------------------------------------------------------------------------- /src/main/resources/data/bands/bands.json: -------------------------------------------------------------------------------- 1 | {"id":1,"name":"AC/DC","hometown":"Sydney","year":1973} 2 | {"id":0,"name":"Led Zeppelin","hometown":"London","year":1968} 3 | {"id":3,"name":"Metallica","hometown":"Los Angeles","year":1981} 4 | {"id":4,"name":"The Beatles","hometown":"Liverpool","year":1960} 5 | -------------------------------------------------------------------------------- /spark-cluster/docker/spark-worker/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM spark-base:latest 2 | 3 | COPY start-worker.sh / 4 | 5 | ENV SPARK_WORKER_WEBUI_PORT 8081 6 | ENV SPARK_WORKER_LOG /spark/logs 7 | ENV SPARK_MASTER "spark://spark-master:7077" 8 | 9 | EXPOSE 8081 10 | 11 | CMD ["/bin/bash", "/start-worker.sh"] 12 | -------------------------------------------------------------------------------- /src/main/scala/generator/LaptopsDomain.scala: -------------------------------------------------------------------------------- 1 | package generator 2 | 3 | case class LaptopModel(make: String, model: String) 4 | case class Laptop(registration: String, make: String, model: String, procSpeed: Double) 5 | case class LaptopOffer(make: String, model: String, procSpeed: Double, salePrice: Double) 6 | -------------------------------------------------------------------------------- /spark-cluster/env/spark-worker.sh: -------------------------------------------------------------------------------- 1 | #Environment variables used by the spark workers 2 | #Do not touch this unless you modify the compose master 3 | SPARK_MASTER=spark://spark-master:7077 4 | #Allocation Parameters 5 | SPARK_WORKER_CORES=1 6 | SPARK_WORKER_MEMORY=1G 7 | SPARK_DRIVER_MEMORY=128m 8 | SPARK_EXECUTOR_MEMORY=256m -------------------------------------------------------------------------------- /spark-cluster/docker/spark-submit/spark-submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /spark/bin/spark-submit \ 4 | --class ${SPARK_APPLICATION_MAIN_CLASS} \ 5 | --master ${SPARK_MASTER_URL} \ 6 | --deploy-mode cluster \ 7 | --total-executor-cores 1 \ 8 | ${SPARK_SUBMIT_ARGS} \ 9 | ${SPARK_APPLICATION_JAR_LOCATION} \ 10 | ${SPARK_APPLICATION_ARGS} \ -------------------------------------------------------------------------------- /.bsp/sbt.json: -------------------------------------------------------------------------------- 1 | {"name":"sbt","version":"1.9.6","bspVersion":"2.1.0-M1","languages":["scala"],"argv":["/Users/daniel/Library/Java/JavaVirtualMachines/adopt-openjdk-1.8.0_265/Contents/Home/jre/bin/java","-Xms100m","-Xmx100m","-classpath","/Users/daniel/Library/Application Support/JetBrains/IdeaIC2023.1/plugins/Scala/launcher/sbt-launch.jar","-Dsbt.script=/usr/local/bin/sbt","xsbt.boot.Boot","-bsp"]} -------------------------------------------------------------------------------- /src/main/resources/data/guitars/guitars.json: -------------------------------------------------------------------------------- 1 | {"id":0,"model":"EDS-1275","make":"Gibson","guitarType":"Electric double-necked"} 2 | {"id":5,"model":"Stratocaster","make":"Fender","guitarType":"Electric"} 3 | {"id":1,"model":"SG","make":"Gibson","guitarType":"Electric"} 4 | {"id":2,"model":"914","make":"Taylor","guitarType":"Acoustic"} 5 | {"id":3,"model":"M-II","make":"ESP","guitarType":"Electric"} 6 | -------------------------------------------------------------------------------- /spark-cluster/docker/spark-submit/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM spark-base:latest 2 | 3 | COPY spark-submit.sh / 4 | 5 | ENV SPARK_MASTER_URL="spark://spark-master:7077" 6 | ENV SPARK_SUBMIT_ARGS="" 7 | ENV SPARK_APPLICATION_ARGS "" 8 | #ENV SPARK_APPLICATION_JAR_LOCATION /opt/spark-apps/myjar.jar 9 | #ENV SPARK_APPLICATION_MAIN_CLASS my.main.Application 10 | 11 | 12 | CMD ["/bin/bash", "/spark-submit.sh"] 13 | 
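
(Editor's note: a minimal sketch of an application one might package and submit through the `spark-submit` container above. The object name `ClusterTestApp` and the jar location are assumptions for illustration, not part of the repo - the repo's own `TestDeployApp` further below is the version actually used in the course. To submit it, one would set `SPARK_APPLICATION_MAIN_CLASS` and `SPARK_APPLICATION_JAR_LOCATION` to match.)

```scala
package playground

import org.apache.spark.sql.SparkSession

// hypothetical smoke-test app, e.g. SPARK_APPLICATION_MAIN_CLASS=playground.ClusterTestApp
object ClusterTestApp {
  def main(args: Array[String]): Unit = {
    // no .master() here: the master URL is supplied by spark-submit via --master ${SPARK_MASTER_URL}
    val spark = SparkSession.builder()
      .appName("Cluster Test App")
      .getOrCreate()

    // a tiny distributed job, just enough to prove the cluster executes work
    println(spark.range(1, 1000000).count())

    spark.stop()
  }
}
```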
-------------------------------------------------------------------------------- /spark-cluster/docker/spark-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # This assumes spark-base was built first. 2 | # Usually we'd run the build-images.sh script which builds spark-base 3 | 4 | FROM spark-base:latest 5 | 6 | COPY start-master.sh / 7 | 8 | ENV SPARK_MASTER_PORT 7077 9 | ENV SPARK_MASTER_WEBUI_PORT 8080 10 | ENV SPARK_MASTER_LOG /spark/logs 11 | 12 | EXPOSE 8080 7077 6066 13 | 14 | CMD ["/bin/bash", "/start-master.sh"] 15 | -------------------------------------------------------------------------------- /spark-cluster/docker/spark-worker/start-worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . "/spark/sbin/spark-config.sh" 4 | . "/spark/bin/load-spark-env.sh" 5 | 6 | mkdir -p $SPARK_WORKER_LOG 7 | 8 | export SPARK_HOME=/spark 9 | 10 | ln -sf /dev/stdout $SPARK_WORKER_LOG/spark-worker.out 11 | 12 | /spark/sbin/../bin/spark-class org.apache.spark.deploy.worker.Worker --webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG/spark-worker.out -------------------------------------------------------------------------------- /spark-cluster/docker/spark-master/start-master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export SPARK_MASTER_HOST=`hostname` 4 | 5 | . "/spark/sbin/spark-config.sh" 6 | 7 | . "/spark/bin/load-spark-env.sh" 8 | 9 | mkdir -p $SPARK_MASTER_LOG 10 | 11 | export SPARK_HOME=/spark 12 | 13 | ln -sf /dev/stdout $SPARK_MASTER_LOG/spark-master.out 14 | 15 | cd /spark/bin && /spark/sbin/../bin/spark-class org.apache.spark.deploy.master.Master --ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG/spark-master.out 16 | -------------------------------------------------------------------------------- /spark-cluster/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | services: 3 | spark-master: 4 | image: spark-master:latest 5 | ports: 6 | - "4040:4040" 7 | - "9090:8080" 8 | - "7077:7077" 9 | volumes: 10 | - ./apps:/opt/spark-apps 11 | - ./data:/opt/spark-data 12 | environment: 13 | - "SPARK_LOCAL_IP=spark-master" 14 | spark-worker: 15 | image: spark-worker:latest 16 | depends_on: 17 | - spark-master 18 | environment: 19 | - SPARK_MASTER=spark://spark-master:7077 20 | - SPARK_WORKER_CORES=1 21 | - SPARK_WORKER_MEMORY=2G 22 | - SPARK_DRIVER_MEMORY=256m 23 | - SPARK_EXECUTOR_MEMORY=1G 24 | volumes: 25 | - ./apps:/opt/spark-apps 26 | - ./data:/opt/spark-data 27 | -------------------------------------------------------------------------------- /src/main/scala/playground/Playground.scala: -------------------------------------------------------------------------------- 1 | package playground 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * A simple Scala application where I'll invite you to play and fiddle with the code that we write in this course. 7 | * (not that you couldn't create your own, mind you.) 8 | * 9 | * If you can compile and run this application, it means that the libraries were downloaded correctly. 10 | * In that case, you should be good to go for the rest of the course. 11 | * 12 | * Enjoy! 
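 * (Editor's note: on Windows, if this app fails with a winutils/Hadoop-related error, follow HadoopWindowsUserSetup.md in the repo root first.)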
13 |  *
14 |  * Daniel @ Rock the JVM
15 |  */
16 | object Playground {
17 | 
18 |   val spark = SparkSession.builder()
19 |     .appName("Spark Optimization Playground")
20 |     .master("local")
21 |     .getOrCreate()
22 | 
23 |   val sc = spark.sparkContext
24 | 
25 |   def main(args: Array[String]): Unit = {
26 |     val rdd = sc.parallelize(1 to 1000)
27 |     println(s"I have my first RDD, it has ${rdd.count} rows. Now let me go optimize massive jobs.")
28 |   }
29 | }
30 | 
--------------------------------------------------------------------------------
/src/main/scala/part2foundations/TestDeployApp.scala:
--------------------------------------------------------------------------------
1 | package part2foundations
2 | 
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 | 
5 | object TestDeployApp {
6 | 
7 |   // TestDeployApp inputFile outputFile
8 |   def main(args: Array[String]): Unit = {
9 | 
10 |     if (args.length != 2) {
11 |       println("Need input file and output file")
12 |       System.exit(1)
13 |     }
14 | 
15 |     val spark = SparkSession.builder()
16 |       .appName("Test Deploy App")
17 |       // method 1
18 |       .config("spark.executor.memory", "1g")
19 |       .getOrCreate()
20 | 
21 |     import spark.implicits._
22 | 
23 |     val moviesDF = spark.read
24 |       .option("inferSchema", "true")
25 |       .json(args(0))
26 | 
27 |     val goodComediesDF = moviesDF.select(
28 |       $"Title",
29 |       $"IMDB_Rating".as("Rating"),
30 |       $"Release_Date".as("Release")
31 |     )
32 |       .where(($"Major_Genre" === "Comedy") and ($"IMDB_Rating" > 6.5))
33 |       .orderBy($"Rating".desc_nulls_last)
34 | 
35 |     // method 2
36 |     spark.conf.set("spark.executor.memory", "1g") // warning - not all configurations available
37 | 
38 |     /*
39 |       method 3: pass configs as command line arguments:
40 | 
41 |       spark-submit ... --conf spark.executor.memory=1g
42 | 
43 |       You can also use dedicated command line arguments for certain configurations:
44 |       --master = spark.master
45 |       --executor-memory = spark.executor.memory
46 |       --driver-memory = spark.driver.memory
47 | 
48 |       and many more.
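
      (Editor's note - per the Spark configuration docs, when the same property is set in more than
      one place, the precedence is: values set in code via SparkConf/.config (methods 1-2), then
      flags passed to spark-submit (method 3), then values in spark-defaults.conf.)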
49 | */ 50 | goodComediesDF.show() 51 | 52 | goodComediesDF.write 53 | .mode(SaveMode.Overwrite) 54 | .format("json") 55 | .save(args(1)) 56 | 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/part3dfjoins/BroadcastJoins.scala: -------------------------------------------------------------------------------- 1 | package part3dfjoins 2 | 3 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 4 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 5 | import org.apache.spark.sql.functions._ 6 | 7 | object BroadcastJoins { 8 | 9 | val spark = SparkSession.builder() 10 | .appName("Broadcast Joins") 11 | .master("local") 12 | .getOrCreate() 13 | 14 | val sc = spark.sparkContext 15 | 16 | val rows = sc.parallelize(List( 17 | Row(0, "zero"), 18 | Row(1, "first"), 19 | Row(2, "second"), 20 | Row(3, "third") 21 | )) 22 | 23 | val rowsSchema = StructType(Array( 24 | StructField("id", IntegerType), 25 | StructField("order", StringType) 26 | )) 27 | 28 | // small table 29 | val lookupTable: DataFrame = spark.createDataFrame(rows, rowsSchema) 30 | 31 | // large table 32 | val table = spark.range(1, 100000000) // column is "id" 33 | 34 | // the innocent join 35 | val joined = table.join(lookupTable, "id") 36 | joined.explain 37 | // joined.show - takes an ice age 38 | 39 | // a smarter join 40 | val joinedSmart = table.join(broadcast(lookupTable), "id") 41 | joinedSmart.explain() 42 | // joinedSmart.show() 43 | 44 | // auto-broadcast detection 45 | val bigTable = spark.range(1, 100000000) 46 | val smallTable = spark.range(1, 10000) // size estimated by Spark - auto-broadcast 47 | val joinedNumbers = smallTable.join(bigTable, "id") 48 | 49 | // deactivate auto-broadcast 50 | spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1) 51 | 52 | joinedNumbers.explain() 53 | 54 | def main(args: Array[String]): Unit = { 55 | Thread.sleep(1000000) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/part2foundations/ReadingDAGs.scala: -------------------------------------------------------------------------------- 1 | package part2foundations 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ReadingDAGs { 6 | 7 | ///////////////////////////////////////////////////////////////////// Boilerplate 8 | // you don't need this code in the Spark shell 9 | // this code is needed if you want to run it locally in IntelliJ 10 | 11 | val spark = SparkSession.builder() 12 | .config("spark.master", "local") 13 | .appName("Reading Query Plans") 14 | .getOrCreate() 15 | 16 | val sc = spark.sparkContext 17 | 18 | ///////////////////////////////////////////////////////////////////// Boilerplate 19 | 20 | // job 1 21 | sc.parallelize(1 to 1000000).count() 22 | // DAG with a single "box" - the creation of the RDD 23 | 24 | val rdd1 = sc.parallelize(1 to 1000000) 25 | 26 | // job 2 27 | rdd1.map(_ * 2).count() 28 | // DAG with one stage and two "boxes": one for creating the RDD and one for the map 29 | 30 | // job 3 31 | rdd1.repartition(23).count() 32 | // DAG with two stages: 33 | // stage 1 - the creation of the RDD + exchange 34 | // stage 2 - computation of the count 35 | 36 | // job 4 - same as query plans: 37 | val ds1 = spark.range(1, 10000000) 38 | val ds2 = spark.range(1, 20000000, 2) 39 | val ds3 = ds1.repartition(7) 40 | val ds4 = ds2.repartition(9) 41 | val ds5 = ds3.selectExpr("id * 3 as id") 42 | val joined = ds5.join(ds4, "id") 43 | val 
sum = joined.selectExpr("sum(id)")
44 |   // complex DAG
45 | 
46 |   /**
47 |    * Takeaway: the DAG is a visual representation of the steps Spark will perform to run a job.
48 |    * It's the "drawing" version of the physical query plan.
49 |    * Unlike query plans, which are only available for DataFrames/Spark SQL, DAGs show up for ANY job.
50 |    */
51 | 
52 | }
--------------------------------------------------------------------------------
/src/main/resources/data/lipsum/words.txt:
--------------------------------------------------------------------------------
1 | elit
2 | est
3 | consequat
4 | pulvinar
5 | tortor
6 | fringilla
7 | consectetur
8 | sed
9 | aliquet
10 | taciti
11 | in
12 | accumsan
13 | sapien
14 | sagittis
15 | torquent
16 | molestie
17 | volutpat
18 | dui
19 | auctor
20 | eu
21 | ultricies
22 | nam
23 | aliquam
24 | nec
25 | justo
26 | laoreet
27 | sit
28 | mattis
29 | quis
30 | ultrices
31 | vitae
32 | risus
33 | fusce
34 | dapibus
35 | ipsum
36 | felis
37 | cubilia
38 | conubia
39 | vel
40 | ligula
41 | per
42 | mollis
43 | tellus
44 | orci
45 | aenean
46 | purus
47 | scelerisque
48 | malesuada
49 | inceptos
50 | luctus
51 | himenaeos
52 | curabitur
53 | potenti
54 | cursus
55 | suspendisse
56 | nisl
57 | lorem
58 | a
59 | eget
60 | convallis
61 | metus
62 | amet
63 | nullam
64 | enim
65 | praesent
66 | primis
67 | cras
68 | consectetuer
69 | commodo
70 | vestibulum
71 | condimentum
72 | blandit
73 | ut
74 | neque
75 | fermentum
76 | viverra
77 | ante
78 | et
79 | faucibus
80 | massa
81 | egestas
82 | porttitor
83 | facilisi
84 | sodales
85 | magna
86 | suscipit
87 | iaculis
88 | dolor
89 | at
90 | nisi
91 | sem
92 | semper
93 | id
94 | arcu
95 | dignissim
96 | ac
97 | nostra
98 | nunc
99 | lacus
100 | euismod
101 | pharetra
102 | aptent
103 | tristique
104 | posuere
105 | proin
106 | nibh
107 | pede
108 | facilisis
109 | etiam
110 | morbi
111 | nulla
112 | ad
113 | turpis
114 | class
115 | curae
116 | sollicitudin
117 | venenatis
118 | ullamcorper
119 | litora
120 | lectus
121 | integer
122 | mi
123 | quam
124 | vivamus
125 | pretium
126 | imperdiet
127 | odio
128 | porta
129 | mauris
130 | lacinia
131 | donec
132 | pellentesque
133 | duis
134 | quisque
135 | maecenas
136 | augue
137 | velit
138 | congue
139 | diam
140 | tincidunt
141 | libero
142 | interdum
143 | non
144 | urna
145 | sociosqu
146 | feugiat
147 | adipiscing
148 | elementum
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # The official repository for the Rock the JVM Spark Optimization with Scala course
2 | 
3 | Powered by [Rock the JVM!](https://rockthejvm.com)
4 | 
5 | This repository contains the code we wrote during [Rock the JVM's Spark Optimization with Scala](https://rockthejvm.com/course/spark-optimization) course. Unless explicitly mentioned, the code in this repository is exactly what we wrote on camera.
6 | 
7 | ### Install and setup
8 | 
9 | - install [IntelliJ IDEA](https://jetbrains.com/idea)
10 | - install [Docker Desktop](https://docker.com)
11 | - either clone the repo or download as zip
12 | - open with IntelliJ as an SBT project
13 | - Windows users, you need to set up some Hadoop-related configs - use [this guide](/HadoopWindowsUserSetup.md)
14 | 
15 | As you open the project, the IDE will take care of downloading and applying the appropriate library dependencies.
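To verify everything works, run the `Playground` application under `src/main/scala/playground` - if it compiles and prints an RDD count, the libraries were downloaded correctly.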
16 | 
17 | To set up the dockerized Spark cluster we will be using in the course, do the following:
18 | 
19 | - open a terminal and navigate to `spark-cluster`
20 | - run `build-images.sh` (if you don't have a bash terminal, just open the file and run each line one by one)
21 | - run `docker-compose up`
22 | 
23 | To interact with the Spark cluster, the folders `data` and `apps` inside the `spark-cluster` folder are mounted onto the Docker containers under `/opt/spark-data` and `/opt/spark-apps` respectively.
24 | 
25 | To run a Spark shell, first run `docker-compose up` inside the `spark-cluster` directory, then in another terminal, do
26 | 
27 | ```
28 | docker exec -it spark-cluster_spark-master_1 bash
29 | ```
30 | 
31 | and then
32 | 
33 | ```
34 | /spark/bin/spark-shell
35 | ```
36 | 
37 | ### How to use intermediate states of this repository
38 | 
39 | Start by cloning this repository and check out the `start` tag:
40 | 
41 | ```
42 | git checkout start
43 | ```
44 | 
45 | ### For questions or suggestions
46 | 
47 | If you have changes to suggest to this repo, either
48 | - submit a GitHub issue
49 | - tell me in the course Q/A forum
50 | - submit a pull request!
51 | 
--------------------------------------------------------------------------------
/spark-cluster/docker/base/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM eclipse-temurin:17-jdk
2 | LABEL author="Daniel Ciocirlan" email="daniel@rockthejvm.com"
3 | LABEL version="0.3"
4 | 
5 | ENV DAEMON_RUN=true
6 | ENV SPARK_VERSION=3.5.0
7 | ENV SCALA_VERSION_BASE=2.13
8 | ENV HADOOP_VERSION=3
9 | ENV SCALA_VERSION=2.13.12
10 | ENV SCALA_HOME=/usr/share/scala
11 | ENV SPARK_HOME=/spark
12 | 
13 | 
14 | RUN apt-get update && apt-get install -y curl vim wget software-properties-common ssh net-tools ca-certificates jq dbus-x11
15 | RUN echo exit 0 > /usr/sbin/policy-rc.d
16 | 
17 | RUN cd "/tmp" && \
18 |     wget --no-verbose "https://downloads.typesafe.com/scala/${SCALA_VERSION}/scala-${SCALA_VERSION}.tgz" && \
19 |     tar xzf "scala-${SCALA_VERSION}.tgz" && \
20 |     mkdir "${SCALA_HOME}" && \
21 |     rm "/tmp/scala-${SCALA_VERSION}/bin/"*.bat && \
22 |     mv "/tmp/scala-${SCALA_VERSION}/bin" "/tmp/scala-${SCALA_VERSION}/lib" "${SCALA_HOME}" && \
23 |     ln -s "${SCALA_HOME}/bin/"* "/usr/bin/" && \
24 |     rm -rf "/tmp/"*
25 | 
26 | # Add Dependencies for PySpark
27 | RUN apt-get install -y python3 python3-pip python3-numpy python3-matplotlib python3-scipy python3-pandas python3-simpy
28 | RUN update-alternatives --install "/usr/bin/python" "python" "$(which python3)" 1
29 | 
30 | 
31 | # sbt installation
32 | RUN export PATH="/usr/local/sbt/bin:$PATH" && apt update && apt install -y ca-certificates wget tar && mkdir -p "/usr/local/sbt" && wget -qO - --no-check-certificate "https://github.com/sbt/sbt/releases/download/v1.9.6/sbt-1.9.6.tgz" | tar xz -C /usr/local/sbt --strip-components=1 && sbt sbtVersion -Dsbt.rootdir=true
33 | 
34 | RUN wget --no-verbose https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${SCALA_VERSION_BASE}.tgz && tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${SCALA_VERSION_BASE}.tgz \
35 |     && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${SCALA_VERSION_BASE} spark \
36 |     && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${SCALA_VERSION_BASE}.tgz
37 | 
38 | 
39 | 
40 | # Fix the value of PYTHONHASHSEED
41 | # Note: this is needed when you use Python 3.3 or greater
42 | ENV PYTHONHASHSEED 1
43 | 
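
(Editor's note: once the `spark-shell` from the README instructions above is up, a quick sanity check one might paste in. Everything here is standard Spark shell API; the expected version comes from the `SPARK_VERSION` baked into the base image above.)

```scala
// sc and spark are pre-created by the Spark shell
val testRdd = sc.parallelize(1 to 100000)
println(testRdd.count())  // expect 100000
println(spark.version)    // expect 3.5.0, the version installed by the base image
```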
--------------------------------------------------------------------------------
/src/main/scala/part4rddjoins/RDDBroadcastJoins.scala:
--------------------------------------------------------------------------------
1 | package part4rddjoins
2 | 
3 | import org.apache.spark.sql.SparkSession
4 | 
5 | import scala.util.Random
6 | 
7 | /**
8 |  * Shown on camera in the Spark Shell.
9 |  */
10 | object RDDBroadcastJoins {
11 | 
12 |   val spark = SparkSession.builder()
13 |     .appName("Broadcast Joins")
14 |     .master("local[*]")
15 |     .getOrCreate()
16 | 
17 |   val sc = spark.sparkContext
18 | 
19 |   val random = new Random()
20 | 
21 |   /*
22 |     Scenario: assign prizes to a wide-scale competition (10M+ people).
23 |     Goal: find out who won what.
24 |    */
25 | 
26 |   // small lookup table
27 |   val prizes = sc.parallelize(List(
28 |     (1, "gold"),
29 |     (2, "silver"),
30 |     (3, "bronze")
31 |   ))
32 | 
33 |   // the competition has ended - the leaderboard is known
34 |   val leaderboard = sc.parallelize(1 to 10000000).map((_, random.alphanumeric.take(8).mkString))
35 |   val medalists = leaderboard.join(prizes)
36 |   medalists.foreach(println) // 38s for 10M elements!
37 | 
38 |   /*
39 |     We know from SQL joins that the small RDD can be broadcast so that we can avoid the shuffle on the big RDD.
40 |     However, for the RDD API, we'll have to do this manually.
41 |     This lesson is more about how to actually implement the broadcasting technique on RDDs.
42 |    */
43 | 
44 |   // need to collect the RDD locally, so that we can broadcast to the executors
45 |   val medalsMap = prizes.collectAsMap()
46 |   // broadcast the map and keep the returned handle - referring to medalsMap directly in a closure would ship it with every task instead of broadcasting it once per executor
47 |   val medalsBroadcast = sc.broadcast(medalsMap)
48 |   // need to avoid shuffles by manually going through the partitions of the big RDD
49 |   val improvedMedalists = leaderboard.mapPartitions { iterator => // iterator of all the tuples in this partition; all the tuples are local to this executor
50 |     iterator.flatMap { record =>
51 |       val (index, name) = record
52 |       medalsBroadcast.value.get(index) match { // read the map through the broadcast handle, now available locally on every executor
53 |         case None => Seq.empty
54 |         case Some(medal) => Seq((name, medal))
55 |       }
56 |     }
57 |   }
58 | 
59 |   improvedMedalists.foreach(println) // 2s, blazing fast, no shuffles or anything at all.
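
  // (editor's note - an optional sanity check: this lineage should contain no ShuffledRDD,
  // unlike the plain join above, confirming the broadcast version avoids the shuffle)
  println(improvedMedalists.toDebugString)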
60 | 61 | def main(args: Array[String]): Unit = { 62 | Thread.sleep(1000000) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/part4rddjoins/CogroupingRDDs.scala: -------------------------------------------------------------------------------- 1 | package part4rddjoins 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object CogroupingRDDs { 7 | 8 | val spark = SparkSession.builder() 9 | .appName("Cogrouping RDDs") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val sc = spark.sparkContext 14 | val rootFolder = "src/main/resources/generated/examData" 15 | 16 | /* 17 | Take all the student attempts 18 | - if a student passed (at least one attempt > 9.0), send them an email "PASSED" 19 | - else send them an email with "FAILED" 20 | */ 21 | 22 | def readIds() = sc.textFile(s"$rootFolder/examIds.txt") 23 | .map { line => 24 | val tokens = line.split(" ") 25 | (tokens(0).toLong, tokens(1)) 26 | } 27 | 28 | def readExamScores() = sc.textFile(s"$rootFolder/examScores.txt") 29 | .map { line => 30 | val tokens = line.split(" ") 31 | (tokens(0).toLong, tokens(1).toDouble) 32 | } 33 | 34 | def readExamEmails() = sc.textFile(s"$rootFolder/examEmails.txt") 35 | .map { line => 36 | val tokens = line.split(" ") 37 | (tokens(0).toLong, tokens(1)) 38 | } 39 | 40 | def plainJoin() = { 41 | val scores = readExamScores().reduceByKey(Math.max) 42 | val candidates = readIds() 43 | val emails = readExamEmails() 44 | 45 | val results = candidates 46 | .join(scores) // RDD[(Long, (String, Double))] 47 | .join(emails) // RDD[(Long, ((String, Double), String))] 48 | .mapValues { 49 | case ((_, maxAttempt), email) => 50 | if (maxAttempt >= 9.0) (email, "PASSED") 51 | else (email, "FAILED") 52 | } 53 | 54 | results.count() 55 | } 56 | 57 | def coGroupedJoin() = { 58 | val scores = readExamScores().reduceByKey(Math.max) 59 | val candidates = readIds() 60 | val emails = readExamEmails() 61 | 62 | val result: RDD[(Long, Option[(String, String)])] = candidates.cogroup(scores, emails) // co-partition the 3 RDDs: RDD[(Long, (Iterable[String], Iterable[Double], Iterable[String]))] 63 | .mapValues { 64 | case (nameIterable, maxAttemptIterable, emailIterable) => 65 | val name = nameIterable.headOption 66 | val maxScore = maxAttemptIterable.headOption 67 | val email = emailIterable.headOption 68 | 69 | for { 70 | e <- email 71 | s <- maxScore 72 | } yield (e, if (s >= 9.0) "PASSED" else "FAILED") 73 | } 74 | 75 | result.count() 76 | result.count() 77 | } 78 | 79 | 80 | def main(args: Array[String]): Unit = { 81 | plainJoin() 82 | coGroupedJoin() 83 | Thread.sleep(1000000) 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/main/resources/data/employees/employees.csv: -------------------------------------------------------------------------------- 1 | 7584740,Devin,Jeramy,Vedenichev,103,1951-12-31T05:00:00.000Z,958-67-2937,55214 2 | 7677553,Marcus,Arlie,Tibb,103,1951-12-31T05:00:00.000Z,999-90-6698,47746 3 | 7736171,Pat,Johnie,De Keep,103,1951-12-31T05:00:00.000Z,960-92-7355,98868 4 | 7813417,Brooks,Dannie,Lemmens,103,1951-12-31T05:00:00.000Z,914-82-1490,69065 5 | 7929094,Milan,Alex,Setterfield,103,1951-12-31T05:00:00.000Z,989-42-1725,79877 6 | 8037265,Dexter,Otis,Brahmer,103,1951-12-31T05:00:00.000Z,959-50-1621,91473 7 | 8050195,Anibal,Nicky,Springford,103,1951-12-31T05:00:00.000Z,998-90-1376,62784 8 | 
8053263,Karl,Horace,Thompson,103,1951-12-31T05:00:00.000Z,930-70-7868,90481 9 | 8125007,Quinton,Eduardo,Rooze,103,1951-12-31T05:00:00.000Z,903-21-6089,87389 10 | 8169581,Jc,Edison,Fruchter,103,1951-12-31T05:00:00.000Z,905-35-8052,58120 11 | 8206921,Sylvester,Faustino,Glasgow,103,1951-12-31T05:00:00.000Z,907-30-2443,43279 12 | 8230707,Juan,Dewayne,Leggitt,103,1951-12-31T05:00:00.000Z,965-56-5537,74107 13 | 8353995,Bill,Dirk,Negri,103,1951-12-31T05:00:00.000Z,905-76-4894,79389 14 | 8430873,Oren,August,Lachaize,103,1951-12-31T05:00:00.000Z,990-14-4688,82777 15 | 8448144,Dewayne,Hubert,Sturton,103,1951-12-31T05:00:00.000Z,917-37-8590,91109 16 | 8479988,Courtney,Long,Bellin,103,1951-12-31T05:00:00.000Z,906-26-2150,51099 17 | 8481731,Sung,Jamie,Haslin,103,1951-12-31T05:00:00.000Z,982-29-1779,75742 18 | 8596468,Walker,Francesco,O'Fallone,103,1951-12-31T05:00:00.000Z,962-42-6521,107650 19 | 8650756,Clyde,Royce,Readshall,103,1951-12-31T05:00:00.000Z,983-38-1342,88851 20 | 5073404,Dale,Brice,Casolla,105,1951-12-31T05:00:00.000Z,963-52-1303,102445 21 | 5106546,Lupe,Rod,Bullard,105,1951-12-31T05:00:00.000Z,901-76-7197,84274 22 | 5178737,Douglass,Seth,Cummine,105,1951-12-31T05:00:00.000Z,958-74-4222,81743 23 | 5265931,Abram,Eddie,Izzett,105,1951-12-31T05:00:00.000Z,962-45-3424,63223 24 | 5268793,Wilson,Landon,Chellam,105,1951-12-31T05:00:00.000Z,911-11-7847,88966 25 | 5371143,Kieth,Malcolm,Formigli,105,1951-12-31T05:00:00.000Z,666-37-6950,70655 26 | 5407756,Ted,Pablo,Lidgely,105,1951-12-31T05:00:00.000Z,994-52-7326,63100 27 | 5413070,Reid,Reyes,Kabos,105,1951-12-31T05:00:00.000Z,918-28-4915,90488 28 | 5422998,Buford,Leonel,Stanbro,105,1951-12-31T05:00:00.000Z,927-29-6346,100472 29 | 5530003,Milford,Samual,Manis,105,1951-12-31T05:00:00.000Z,907-28-9627,40950 30 | 5550873,Wm,Fredrick,Filshin,105,1951-12-31T05:00:00.000Z,934-69-5826,39809 31 | 5597718,Jeromy,Von,Daley,105,1951-12-31T05:00:00.000Z,991-24-2641,29644 32 | 5889924,Tyrell,Alonso,Hagyard,105,1951-12-31T05:00:00.000Z,903-55-7969,39093 33 | 5890431,George,Moises,Shurville,105,1951-12-31T05:00:00.000Z,956-30-5990,44500 34 | 6130212,Percy,Les,Lafontaine,105,1951-12-31T05:00:00.000Z,922-95-3154,73006 35 | -------------------------------------------------------------------------------- /src/main/resources/data/employees_headers/employees_headers.csv: -------------------------------------------------------------------------------- 1 | id,firstName,middleName,lastName,dept,birthDate,ssn,salary 2 | 7584740,Devin,Jeramy,Vedenichev,103,1951-12-31T05:00:00.000Z,958-67-2937,55214 3 | 7677553,Marcus,Arlie,Tibb,103,1951-12-31T05:00:00.000Z,999-90-6698,47746 4 | 7736171,Pat,Johnie,De Keep,103,1951-12-31T05:00:00.000Z,960-92-7355,98868 5 | 7813417,Brooks,Dannie,Lemmens,103,1951-12-31T05:00:00.000Z,914-82-1490,69065 6 | 7929094,Milan,Alex,Setterfield,103,1951-12-31T05:00:00.000Z,989-42-1725,79877 7 | 8037265,Dexter,Otis,Brahmer,103,1951-12-31T05:00:00.000Z,959-50-1621,91473 8 | 8050195,Anibal,Nicky,Springford,103,1951-12-31T05:00:00.000Z,998-90-1376,62784 9 | 8053263,Karl,Horace,Thompson,103,1951-12-31T05:00:00.000Z,930-70-7868,90481 10 | 8125007,Quinton,Eduardo,Rooze,103,1951-12-31T05:00:00.000Z,903-21-6089,87389 11 | 8169581,Jc,Edison,Fruchter,103,1951-12-31T05:00:00.000Z,905-35-8052,58120 12 | 8206921,Sylvester,Faustino,Glasgow,103,1951-12-31T05:00:00.000Z,907-30-2443,43279 13 | 8230707,Juan,Dewayne,Leggitt,103,1951-12-31T05:00:00.000Z,965-56-5537,74107 14 | 8353995,Bill,Dirk,Negri,103,1951-12-31T05:00:00.000Z,905-76-4894,79389 15 | 
8430873,Oren,August,Lachaize,103,1951-12-31T05:00:00.000Z,990-14-4688,82777 16 | 8448144,Dewayne,Hubert,Sturton,103,1951-12-31T05:00:00.000Z,917-37-8590,91109 17 | 8479988,Courtney,Long,Bellin,103,1951-12-31T05:00:00.000Z,906-26-2150,51099 18 | 8481731,Sung,Jamie,Haslin,103,1951-12-31T05:00:00.000Z,982-29-1779,75742 19 | 8596468,Walker,Francesco,O'Fallone,103,1951-12-31T05:00:00.000Z,962-42-6521,107650 20 | 8650756,Clyde,Royce,Readshall,103,1951-12-31T05:00:00.000Z,983-38-1342,88851 21 | 5073404,Dale,Brice,Casolla,105,1951-12-31T05:00:00.000Z,963-52-1303,102445 22 | 5106546,Lupe,Rod,Bullard,105,1951-12-31T05:00:00.000Z,901-76-7197,84274 23 | 5178737,Douglass,Seth,Cummine,105,1951-12-31T05:00:00.000Z,958-74-4222,81743 24 | 5265931,Abram,Eddie,Izzett,105,1951-12-31T05:00:00.000Z,962-45-3424,63223 25 | 5268793,Wilson,Landon,Chellam,105,1951-12-31T05:00:00.000Z,911-11-7847,88966 26 | 5371143,Kieth,Malcolm,Formigli,105,1951-12-31T05:00:00.000Z,666-37-6950,70655 27 | 5407756,Ted,Pablo,Lidgely,105,1951-12-31T05:00:00.000Z,994-52-7326,63100 28 | 5413070,Reid,Reyes,Kabos,105,1951-12-31T05:00:00.000Z,918-28-4915,90488 29 | 5422998,Buford,Leonel,Stanbro,105,1951-12-31T05:00:00.000Z,927-29-6346,100472 30 | 5530003,Milford,Samual,Manis,105,1951-12-31T05:00:00.000Z,907-28-9627,40950 31 | 5550873,Wm,Fredrick,Filshin,105,1951-12-31T05:00:00.000Z,934-69-5826,39809 32 | 5597718,Jeromy,Von,Daley,105,1951-12-31T05:00:00.000Z,991-24-2641,29644 33 | 5889924,Tyrell,Alonso,Hagyard,105,1951-12-31T05:00:00.000Z,903-55-7969,39093 34 | 5890431,George,Moises,Shurville,105,1951-12-31T05:00:00.000Z,956-30-5990,44500 35 | 6130212,Percy,Les,Lafontaine,105,1951-12-31T05:00:00.000Z,922-95-3154,73006 36 | -------------------------------------------------------------------------------- /src/main/scala/part3dfjoins/JoinsRecap.scala: -------------------------------------------------------------------------------- 1 | package part3dfjoins 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object JoinsRecap { 7 | 8 | val spark = SparkSession.builder() 9 | .master("local[2]") 10 | .appName("Joins Recap") 11 | .getOrCreate() 12 | 13 | val sc = spark.sparkContext 14 | 15 | val guitarsDF = spark.read 16 | .option("inferSchema", "true") 17 | .json("src/main/resources/data/guitars") 18 | 19 | val guitaristsDF = spark.read 20 | .option("inferSchema", "true") 21 | .json("src/main/resources/data/guitarPlayers") 22 | 23 | val bandsDF = spark.read 24 | .option("inferSchema", "true") 25 | .json("src/main/resources/data/bands") 26 | 27 | // inner joins 28 | val joinCondition = guitaristsDF.col("band") === bandsDF.col("id") 29 | val guitaristsBandsDF = guitaristsDF.join(bandsDF, joinCondition, "inner") 30 | 31 | // outer joins 32 | // left outer = everything in inner join + all the rows in the LEFT table, with nulls in the rows not passing the condition in the RIGHT table 33 | guitaristsDF.join(bandsDF, joinCondition, "left_outer") 34 | // right outer = everything in inner join + all the rows in the RIGHT table, with nulls in the rows not passing the condition in the LEFT table 35 | guitaristsDF.join(bandsDF, joinCondition, "right_outer") 36 | // outer join = everything in left_outer + right_outer 37 | guitaristsDF.join(bandsDF, joinCondition, "outer") 38 | 39 | // semi joins = everything in the left DF for which THERE IS a row in the right DF satisfying the condition 40 | // essentially a filter 41 | guitaristsDF.join(bandsDF, joinCondition, "left_semi") 42 | 43 | // anti join = everything in the left 
DF for which THERE IS NOT a row in the right DF satisfying the condition
44 |   // also a filter
45 |   guitaristsDF.join(bandsDF, joinCondition, "left_anti")
46 | 
47 |   // cross join = everything in the left table with everything in the right table
48 |   // dangerous: NRows(crossjoin) = NRows(left) x NRows(right)
49 |   // careful with outer joins with non-unique keys
50 | 
51 |   // RDD joins
52 |   val colorsScores = Seq(
53 |     ("blue", 1),
54 |     ("red", 4),
55 |     ("green", 5),
56 |     ("yellow", 2),
57 |     ("orange", 3),
58 |     ("cyan", 0)
59 |   )
60 |   val colorsRDD: RDD[(String, Int)] = sc.parallelize(colorsScores)
61 |   val text = "The sky is blue, but the orange pale sun turns from yellow to red"
62 |   val words = text.split(" ").map(_.toLowerCase()).map((_, 1)) // standard technique for counting words with RDDs
63 |   val wordsRDD = sc.parallelize(words).reduceByKey(_ + _) // counting word occurrence
64 |   val scores: RDD[(String, (Int, Int))] = wordsRDD.join(colorsRDD) // implied join type is INNER
65 | 
66 | 
67 |   def main(args: Array[String]): Unit = {
68 | 
69 |   }
70 | }
71 | 
--------------------------------------------------------------------------------
/src/main/scala/part2foundations/SparkJobAnatomy.scala:
--------------------------------------------------------------------------------
1 | package part2foundations
2 | 
3 | import org.apache.spark.sql.SparkSession
4 | 
5 | 
6 | object SparkJobAnatomy {
7 | 
8 |   ///////////////////////////////////////////////////////////////////// Boilerplate
9 |   // you don't need this code in the Spark shell
10 |   // this code is needed if you want to run it locally in IntelliJ
11 | 
12 |   val spark = SparkSession.builder()
13 |     .config("spark.master", "local")
14 |     .appName("Spark Job Anatomy")
15 |     .getOrCreate()
16 | 
17 |   val sc = spark.sparkContext
18 | 
19 |   ///////////////////////////////////////////////////////////////////// Boilerplate
20 | 
21 |   /**
22 |    * Cluster prep
23 |    *
24 |    * 1. Navigate to the spark-optimization folder, go to spark-cluster/
25 |    * 2. docker-compose up --scale spark-worker=3
26 |    * 3. In another terminal:
27 |    *   - docker exec -it spark-cluster_spark-master_1 bash
28 |    *   - cd spark/bin
29 |    *   - ./spark-shell
30 |    * 4. In (yet) another terminal:
31 |    *   - go to spark-optimization
32 |    *   - docker cp (the data folder) spark-cluster_spark-master_1:/tmp
33 |    * 5.
Open http://localhost:4040 for the Spark UI
34 |    */
35 | 
36 |   // job 1 - a count
37 |   val rdd1 = sc.parallelize(1 to 1000000)
38 |   rdd1.count
39 |   // inspect the UI, one stage with 6 tasks
40 |   // task = a unit of computation applied to a unit of data (a partition)
41 | 
42 |   // job 2 - a count with a small transformation
43 |   rdd1.map(_ * 2).count
44 |   // inspect the UI, another job with (still) one stage, 6 tasks
45 |   // all parallelizable computations (like maps) are done in a single stage
46 | 
47 |   // job 3 - a count with a shuffle
48 |   rdd1.repartition(23).count
49 |   // UI: 2 stages, one with 6 tasks, one with 23 tasks
50 |   // each stage is delimited by shuffles
51 | 
52 |   // job 4, a more complex computation: load a file and compute the average salary of the employees by department
53 |   val employees = sc.textFile("/tmp/employees.csv")
54 |   // process the lines
55 |   val empTokens = employees.map(line => line.split(","))
56 |   // extract relevant data
57 |   val empDetails = empTokens.map(tokens => (tokens(4), tokens(7)))
58 |   // group the elements
59 |   val empGroups = empDetails.groupByKey(2)
60 |   // process the values associated to each group
61 |   val avgSalaries = empGroups.mapValues(salaries => salaries.map(_.toInt).sum / salaries.size)
62 |   // show the result
63 |   avgSalaries
64 |     .collect() // this is an action
65 |     .foreach(println)
66 | 
67 |   // look at the Spark UI: one job, 2 stages
68 |   // the groupByKey triggers a shuffle, and thus the beginning of another stage
69 |   // all other computations (maps, mapValues) are done in their respective stage
70 |   // the number of tasks = the number of partitions processed in a given stage
71 | }
--------------------------------------------------------------------------------
/src/main/scala/part1recap/SparkRecap.scala:
--------------------------------------------------------------------------------
1 | package part1recap
2 | 
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.functions._
6 | 
7 | object SparkRecap {
8 | 
9 |   // the entry point to the Spark structured API
10 |   val spark = SparkSession.builder()
11 |     .appName("Spark Recap")
12 |     .master("local[2]")
13 |     .getOrCreate()
14 | 
15 |   // read a DF
16 |   val cars = spark.read
17 |     .format("json")
18 |     .option("inferSchema", "true")
19 |     .load("src/main/resources/data/cars")
20 | 
21 |   import spark.implicits._
22 | 
23 |   // select
24 |   val usefulCarsData = cars.select(
25 |     col("Name"), // column object
26 |     $"Year", // another column object (needs spark implicits)
27 |     (col("Weight_in_lbs") / 2.2).as("Weight_in_kg"),
28 |     expr("Weight_in_lbs / 2.2").as("Weight_in_kg_2")
29 |   )
30 | 
31 |   val carsWeights = cars.selectExpr("Weight_in_lbs / 2.2")
32 | 
33 |   // filter
34 |   val europeanCars = cars.where(col("Origin") =!= "USA")
35 | 
36 |   // aggregations
37 |   val averageHP = cars.select(avg(col("Horsepower")).as("average_hp")) // sum, mean, stddev, min, max
38 | 
39 |   // grouping
40 |   val countByOrigin = cars
41 |     .groupBy(col("Origin")) // a RelationalGroupedDataset
42 |     .count()
43 | 
44 |   // joining
45 |   val guitarPlayers = spark.read
46 |     .option("inferSchema", "true")
47 |     .json("src/main/resources/data/guitarPlayers")
48 | 
49 |   val bands = spark.read
50 |     .option("inferSchema", "true")
51 |     .json("src/main/resources/data/bands")
52 | 
53 |   val guitaristsBands = guitarPlayers.join(bands, guitarPlayers.col("band") === bands.col("id"))
54 |   /*
55 |     join types
56 |     - inner: only the matching rows are kept
57 |     - left/right/full outer join
58 | - semi/anti 59 | */ 60 | 61 | // datasets = typed distributed collection of objects 62 | case class GuitarPlayer(id: Long, name: String, guitars: Seq[Long], band: Long) 63 | val guitarPlayersDS = guitarPlayers.as[GuitarPlayer] // needs spark.implicits 64 | guitarPlayersDS.map(_.name) 65 | 66 | // Spark SQL 67 | cars.createOrReplaceTempView("cars") 68 | val americanCars = spark.sql( 69 | """ 70 | |select Name from cars where Origin = 'USA' 71 | """.stripMargin 72 | ) 73 | 74 | // low-level API: RDDs 75 | val sc = spark.sparkContext 76 | val numbersRDD: RDD[Int] = sc.parallelize(1 to 1000000) 77 | 78 | // functional operators 79 | val doubles = numbersRDD.map(_ * 2) 80 | 81 | // RDD -> DF 82 | val numbersDF = numbersRDD.toDF("number") // you lose type info, you get SQL capability 83 | 84 | // RDD -> DS 85 | val numbersDS = spark.createDataset(numbersRDD) 86 | 87 | // DS -> RDD 88 | val guitarPlayersRDD = guitarPlayersDS.rdd 89 | 90 | // DF -> RDD 91 | val carsRDD = cars.rdd // RDD[Row] 92 | 93 | def main(args: Array[String]): Unit = { 94 | // showing a DF to the console 95 | cars.show() 96 | cars.printSchema() 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/scala/part1recap/ScalaRecap.scala: -------------------------------------------------------------------------------- 1 | package part1recap 2 | 3 | import scala.concurrent.Future 4 | import scala.util.{Failure, Success} 5 | 6 | object ScalaRecap extends App { 7 | 8 | // values and variables 9 | val aBoolean: Boolean = false 10 | 11 | // expressions 12 | val anIfExpression = if(2 > 3) "bigger" else "smaller" 13 | 14 | // instructions vs expressions 15 | val theUnit = println("Hello, Scala") // Unit = "no meaningful value" = void in other languages 16 | 17 | // functions 18 | def myFunction(x: Int) = 42 19 | 20 | // OOP 21 | class Animal 22 | class Cat extends Animal 23 | trait Carnivore { 24 | def eat(animal: Animal): Unit 25 | } 26 | 27 | class Crocodile extends Animal with Carnivore { 28 | override def eat(animal: Animal): Unit = println("Crunch!") 29 | } 30 | 31 | // singleton pattern 32 | object MySingleton 33 | 34 | // companions 35 | object Carnivore 36 | 37 | // generics 38 | trait MyList[A] 39 | 40 | // method notation 41 | val x = 1 + 2 42 | val y = 1.+(2) 43 | 44 | // Functional Programming 45 | val incrementer: Int => Int = x => x + 1 46 | val incremented = incrementer(42) 47 | 48 | // map, flatMap, filter 49 | val processedList = List(1,2,3).map(incrementer) 50 | 51 | // Pattern Matching 52 | val unknown: Any = 45 53 | val ordinal = unknown match { 54 | case 1 => "first" 55 | case 2 => "second" 56 | case _ => "unknown" 57 | } 58 | 59 | // try-catch 60 | try { 61 | throw new NullPointerException 62 | } catch { 63 | case _: NullPointerException => "some returned value" 64 | case _: Throwable => "something else" 65 | } 66 | 67 | // Future 68 | import scala.concurrent.ExecutionContext.Implicits.global 69 | val aFuture = Future { 70 | // some expensive computation, runs on another thread 71 | 42 72 | } 73 | 74 | aFuture.onComplete { 75 | case Success(meaningOfLife) => println(s"I've found $meaningOfLife") 76 | case Failure(ex) => println(s"I have failed: $ex") 77 | } 78 | 79 | // Partial functions 80 | val aPartialFunction: PartialFunction[Int, Int] = { 81 | case 1 => 43 82 | case 8 => 56 83 | case _ => 999 84 | } 85 | 86 | // Implicits 87 | 88 | // auto-injection by the compiler 89 | def methodWithImplicitArgument(implicit x: Int) = x + 43 90 | implicit val 
implicitInt = 67 91 | val implicitCall = methodWithImplicitArgument 92 | 93 | // implicit conversions - implicit defs 94 | case class Person(name: String) { 95 | def greet = println(s"Hi, my name is $name") 96 | } 97 | 98 | implicit def fromStringToPerson(name: String) = Person(name) 99 | "Bob".greet // fromStringToPerson("Bob").greet 100 | 101 | // implicit conversion - implicit classes 102 | implicit class Dog(name: String) { 103 | def bark = println("Bark!") 104 | } 105 | "Lassie".bark 106 | 107 | /* 108 | - local scope 109 | - imported scope 110 | - companion objects of the types involved in the method call 111 | */ 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/main/scala/part4rddjoins/RDDSkewedJoins.scala: -------------------------------------------------------------------------------- 1 | package part4rddjoins 2 | 3 | import generator.{DataGenerator, Laptop, LaptopOffer} 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object RDDSkewedJoins { 7 | 8 | val spark = SparkSession.builder() 9 | .appName("RDD Skewed Joins") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val sc = spark.sparkContext 14 | 15 | /* 16 | An online store selling gaming laptops. 17 | 2 laptops are "similar" if they have the same make & model, but proc speed within 0.1 18 | 19 | For each laptop configuration, we are interested in the average sale price of "similar" models. 20 | 21 | Acer Predator 2.9Ghz aylfaskjhrw -> average sale price of all Acer Predators with CPU speed between 2.8 and 3.0 GHz 22 | */ 23 | 24 | val laptops = sc.parallelize(Seq.fill(40000)(DataGenerator.randomLaptop())) 25 | val laptopOffers = sc.parallelize(Seq.fill(100000)(DataGenerator.randomLaptopOffer())) 26 | 27 | def plainJoin() = { 28 | val preparedLaptops = laptops.map { 29 | case Laptop(registration, make, model, procSpeed) => ((make, model), (registration, procSpeed)) 30 | } 31 | 32 | val preparedOffers = laptopOffers.map { 33 | case LaptopOffer(make, model, procSpeed, salePrice) => ((make, model), (procSpeed, salePrice)) 34 | } 35 | 36 | val result = preparedLaptops.join(preparedOffers) // RDD[(make, model), ((reg, cpu), (cpu, salePrice)))] 37 | .filter { 38 | case ((make, model), ((reg, laptopCpu), (offerCpu, salePrice))) => Math.abs(laptopCpu - offerCpu) <= 0.1 39 | } 40 | .map { 41 | case ((make, model), ((reg, laptopCpu), (offerCpu, salePrice))) => (reg, salePrice) 42 | } 43 | .aggregateByKey((0.0, 0))( 44 | { 45 | case ((totalPrice, numPrices), salePrice) => (totalPrice + salePrice, numPrices + 1) // combine state with record 46 | }, 47 | { 48 | case ((totalPrices1, numPrices1), (totalPrices2, numPrices2)) => (totalPrices1 + totalPrices2, numPrices1 + numPrices2) // combine 2 states into one 49 | } 50 | ) // RDD[(String, (Double, Int))] 51 | .mapValues { 52 | case (totalPrices, numPrices) => totalPrices / numPrices 53 | } 54 | 55 | result.count() 56 | } 57 | 58 | def noSkewJoin() = { 59 | val preparedLaptops = laptops 60 | .flatMap { laptop => 61 | Seq( 62 | laptop, 63 | laptop.copy(procSpeed = laptop.procSpeed - 0.1), 64 | laptop.copy(procSpeed = laptop.procSpeed + 0.1), 65 | ) 66 | } 67 | .map { 68 | case Laptop(registration, make, model, procSpeed) => ((make, model, procSpeed), registration) 69 | } 70 | 71 | val preparedOffers = laptopOffers.map { 72 | case LaptopOffer(make, model, procSpeed, salePrice) => ((make, model, procSpeed), salePrice) 73 | } 74 | 75 | val result = preparedLaptops.join(preparedOffers) // RDD[(make, model, procSpeed), (reg, 
salePrice)) 76 | .map(_._2) 77 | .aggregateByKey((0.0, 0))( 78 | { 79 | case ((totalPrice, numPrices), salePrice) => (totalPrice + salePrice, numPrices + 1) // combine state with record 80 | }, 81 | { 82 | case ((totalPrices1, numPrices1), (totalPrices2, numPrices2)) => (totalPrices1 + totalPrices2, numPrices1 + numPrices2) // combine 2 states into one 83 | } 84 | ) // RDD[(String, (Double, Int))] 85 | .mapValues { 86 | case (totalPrices, numPrices) => totalPrices / numPrices 87 | } 88 | 89 | result.count() 90 | } 91 | 92 | 93 | def main(args: Array[String]): Unit = { 94 | noSkewJoin() 95 | Thread.sleep(1000000) 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/main/scala/part4rddjoins/SimpleRDDJoins.scala: -------------------------------------------------------------------------------- 1 | package part4rddjoins 2 | 3 | import generator.DataGenerator 4 | import org.apache.spark.HashPartitioner 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.sql.SparkSession 7 | 8 | object SimpleRDDJoins { 9 | 10 | val spark = SparkSession.builder() 11 | .appName("RDD joins") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val sc = spark.sparkContext 16 | val rootFolder = "src/main/resources/generated/examData" 17 | 18 | // DataGenerator.generateExamData(rootFolder, 1000000, 5) 19 | 20 | def readIds() = sc.textFile(s"$rootFolder/examIds.txt") 21 | .map { line => 22 | val tokens = line.split(" ") 23 | (tokens(0).toLong, tokens(1)) 24 | } 25 | .partitionBy(new HashPartitioner(10)) 26 | 27 | def readExamScores() = sc.textFile(s"$rootFolder/examScores.txt") 28 | .map { line => 29 | val tokens = line.split(" ") 30 | (tokens(0).toLong, tokens(1).toDouble) 31 | } 32 | 33 | // goal: the number of students who passed the exam (= at least one attempt > 9.0) 34 | 35 | def plainJoin() = { 36 | val candidates = readIds() 37 | val scores = readExamScores() 38 | 39 | // simple join 40 | val joined: RDD[(Long, (Double, String))] = scores.join(candidates) // (score attempt, candidate name) 41 | val finalScores = joined 42 | .reduceByKey((pair1, pair2) => if(pair1._1 > pair2._1) pair1 else pair2) 43 | .filter(_._2._1 > 9.0) 44 | 45 | finalScores.count 46 | } 47 | 48 | def preAggregate() = { 49 | val candidates = readIds() 50 | val scores = readExamScores() 51 | 52 | // do aggregation first - 10% perf increase 53 | val maxScores: RDD[(Long, Double)] = scores.reduceByKey(Math.max) 54 | val finalScores = maxScores.join(candidates).filter(_._2._1 > 9.0) 55 | 56 | finalScores.count 57 | } 58 | 59 | def preFiltering() = { 60 | val candidates = readIds() 61 | val scores = readExamScores() 62 | 63 | // do filtering first before the join 64 | val maxScores = scores.reduceByKey(Math.max).filter(_._2 > 9.0) 65 | val finalScores = maxScores.join(candidates) 66 | 67 | finalScores.count 68 | } 69 | 70 | def coPartitioning() = { 71 | val candidates = readIds() 72 | val scores = readExamScores() 73 | 74 | val partitionerForScores = candidates.partitioner match { 75 | case None => new HashPartitioner(candidates.getNumPartitions) 76 | case Some(partitioner) => partitioner 77 | } 78 | 79 | val repartitionedScores = scores.partitionBy(partitionerForScores) 80 | val joined: RDD[(Long, (Double, String))] = repartitionedScores.join(candidates) 81 | val finalScores = joined 82 | .reduceByKey((pair1, pair2) => if(pair1._1 > pair2._1) pair1 else pair2) 83 | .filter(_._2._1 > 9.0) 84 | 85 | finalScores.count 86 | } 87 | 88 | def combined() = { 89 | val candidates = readIds() 
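    // (editor's note: this method stacks the two previous optimizations - copartitioning the
    // scores with the candidates' partitioner, plus pre-aggregation and pre-filtering before the join)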
90 | val scores = readExamScores() 91 | 92 | val partitionerForScores = candidates.partitioner match { 93 | case None => new HashPartitioner(candidates.getNumPartitions) 94 | case Some(partitioner) => partitioner 95 | } 96 | 97 | val repartitionedScores = scores.partitionBy(partitionerForScores) 98 | 99 | // do filtering first before the join 100 | val maxScores = repartitionedScores.reduceByKey(Math.max).filter(_._2 > 9.0) 101 | val finalScores = maxScores.join(candidates) 102 | 103 | finalScores.count 104 | } 105 | 106 | def main(args: Array[String]): Unit = { 107 | plainJoin() 108 | preAggregate() 109 | preFiltering() 110 | coPartitioning() 111 | combined() 112 | 113 | Thread.sleep(1000000) 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/scala/part3dfjoins/SkewedJoins.scala: -------------------------------------------------------------------------------- 1 | package part3dfjoins 2 | 3 | import generator.DataGenerator 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | 7 | object SkewedJoins { 8 | 9 | val spark = SparkSession.builder() 10 | .appName("Skewed Joins") 11 | .master("local[*]") 12 | .config("spark.sql.autoBroadcastJoinThreshold", -1) // deactivate broadcast joins 13 | .getOrCreate() 14 | 15 | import spark.implicits._ 16 | 17 | /* 18 | An online store selling gaming laptops. 19 | 2 laptops are "similar" if they have the same make & model, but proc speed within 0.1 20 | 21 | For each laptop configuration, we are interested in the average sale price of "similar" models. 22 | 23 | Acer Predator 2.9Ghz aylfaskjhrw -> average sale price of all Acer Predators with CPU speed between 2.8 and 3.0 GHz 24 | */ 25 | 26 | val laptops = Seq.fill(40000)(DataGenerator.randomLaptop()).toDS 27 | val laptopOffers = Seq.fill(100000)(DataGenerator.randomLaptopOffer()).toDS 28 | 29 | val joined = laptops.join(laptopOffers, Seq("make", "model")) 30 | .filter(abs(laptopOffers.col("procSpeed") - laptops.col("procSpeed")) <= 0.1) 31 | .groupBy("registration") 32 | .agg(avg("salePrice").as("averagePrice")) 33 | /* 34 | == Physical Plan == 35 | *(4) HashAggregate(keys=[registration#4], functions=[avg(salePrice#20)]) 36 | +- Exchange hashpartitioning(registration#4, 200), true, [id=#99] 37 | +- *(3) HashAggregate(keys=[registration#4], functions=[partial_avg(salePrice#20)]) 38 | +- *(3) Project [registration#4, salePrice#20] 39 | +- *(3) SortMergeJoin [make#5, model#6], [make#17, model#18], Inner, (abs((procSpeed#19 - procSpeed#7)) <= 0.1) 40 | :- *(1) Sort [make#5 ASC NULLS FIRST, model#6 ASC NULLS FIRST], false, 0 41 | : +- Exchange hashpartitioning(make#5, model#6, 200), true, [id=#77] 42 | : +- LocalTableScan [registration#4, make#5, model#6, procSpeed#7] 43 | +- *(2) Sort [make#17 ASC NULLS FIRST, model#18 ASC NULLS FIRST], false, 0 44 | +- Exchange hashpartitioning(make#17, model#18, 200), true, [id=#78] 45 | +- LocalTableScan [make#17, model#18, procSpeed#19, salePrice#20] 46 | */ 47 | 48 | val laptops2 = laptops.withColumn("procSpeed", explode(array($"procSpeed" - 0.1, $"procSpeed", $"procSpeed" + 0.1))) 49 | val joined2 = laptops2.join(laptopOffers, Seq("make", "model", "procSpeed")) 50 | .groupBy("registration") 51 | .agg(avg("salePrice").as("averagePrice")) 52 | /* 53 | == Physical Plan == 54 | *(4) HashAggregate(keys=[registration#4], functions=[avg(salePrice#20)]) 55 | +- Exchange hashpartitioning(registration#4, 200), true, [id=#107] 56 | +- *(3) HashAggregate(keys=[registration#4], 
functions=[partial_avg(salePrice#20)])
57 |        +- *(3) Project [registration#4, salePrice#20]
58 |           +- *(3) SortMergeJoin [make#5, model#6, knownfloatingpointnormalized(normalizenanandzero(procSpeed#43))], [make#17, model#18, knownfloatingpointnormalized(normalizenanandzero(procSpeed#19))], Inner
59 |              :- *(1) Sort [make#5 ASC NULLS FIRST, model#6 ASC NULLS FIRST, knownfloatingpointnormalized(normalizenanandzero(procSpeed#43)) ASC NULLS FIRST], false, 0
60 |              :  +- Exchange hashpartitioning(make#5, model#6, knownfloatingpointnormalized(normalizenanandzero(procSpeed#43)), 200), true, [id=#85]
61 |              :     +- Generate explode(array((procSpeed#7 - 0.1), procSpeed#7, (procSpeed#7 + 0.1))), [registration#4, make#5, model#6], false, [procSpeed#43]
62 |              :        +- LocalTableScan [registration#4, make#5, model#6, procSpeed#7]
63 |              +- *(2) Sort [make#17 ASC NULLS FIRST, model#18 ASC NULLS FIRST, knownfloatingpointnormalized(normalizenanandzero(procSpeed#19)) ASC NULLS FIRST], false, 0
64 |                 +- Exchange hashpartitioning(make#17, model#18, knownfloatingpointnormalized(normalizenanandzero(procSpeed#19)), 200), true, [id=#86]
65 |                    +- LocalTableScan [make#17, model#18, procSpeed#19, salePrice#20]
66 |    */
67 | 
68 |   def main(args: Array[String]): Unit = {
69 |     joined2.show()
70 |     joined2.explain()
71 |     Thread.sleep(1000000)
72 |   }
73 | }
74 | 
--------------------------------------------------------------------------------
/src/main/scala/part5rddtransformations/ByKeyFunctions.scala:
--------------------------------------------------------------------------------
1 | package part5rddtransformations
2 | 
3 | import org.apache.spark.sql.SparkSession
4 | 
5 | import scala.io.Source
6 | import scala.util.{Random, Using}
7 | 
8 | object ByKeyFunctions {
9 | 
10 |   val spark = SparkSession.builder()
11 |     .appName("ByKey Functions")
12 |     .master("local[2]")
13 |     .getOrCreate()
14 | 
15 |   val sc = spark.sparkContext
16 | 
17 |   /*
18 |     In the video, we copied the file from src/main/resources/data/lipsum/words.txt to spark-cluster/data.
19 |     This will make it available under /opt/spark-data/.
20 |     Alternatively, if you copied the entire src/main/resources/data folder to Docker, you will need to use the path where you copied it.
21 | 
22 |     Scenario: assume we have a dataset with (word, occurrences) which we obtained after scraping a big document or website.
23 |     We want to aggregate and sum all values under a single map.
24 |    */
25 |   val words: Seq[String] = Using.resource(Source.fromFile("/opt/spark-data/words.txt")) { source =>
26 |     source.getLines().toSeq
27 |   }
28 | 
29 |   // generate data
30 |   val random = new Random
31 |   val wordCounts = sc.parallelize(Seq.fill(2000000)(words(random.nextInt(words.length)), random.nextInt(1000)))
32 | 
33 |   // the most intuitive solution can be the most dangerous
34 |   val totalCounts = wordCounts
35 |     .groupByKey() // RDD of key = word, value = iterable of all previous values
36 |     .mapValues(_.sum)
37 | 
38 |   // call an action
39 |   totalCounts.collectAsMap()
40 |   // ^^ ~6s for 2M records
41 |   // ^^ look at the shuffle write - it shuffles the entire data
42 | 
43 |   /*
44 |     groupByKey is dangerous in 2 ways:
45 |     - it causes a shuffle so that data associated with one key stays on the same machine
46 |     - it can cause memory errors if the data is skewed, i.e.
data associated with one key is disproportionately large and may not fit in an executor's memory 47 | */ 48 | 49 | /** 50 | Other byKey functions 51 | */ 52 | 53 | /* 54 | ReduceByKey is the simplest - like the reduce function on collections 55 | Also faster and safer because 56 | - it does a partial aggregate on the executor (operations done on the executors without shuffling are called map-side) 57 | - avoids the data skew problem 58 | - shuffles much less data 59 | */ 60 | val totalCountsReduce = wordCounts.reduceByKey(_ + _) 61 | totalCountsReduce.collectAsMap() 62 | 63 | /* 64 | FoldByKey is similar to the collection fold function 65 | - needs a 0 value to start with 66 | - needs a combination function 67 | 68 | Similar performance 69 | */ 70 | val totalCountsFold = wordCounts.foldByKey(0)(_ + _) 71 | totalCountsFold.collectAsMap() 72 | 73 | /* 74 | AggregateByKey is more general and needs a zero value and 2 combination functions 75 | - one that combines the current aggregated value with a new element 76 | - one that combines two aggregated values from different executors 77 | 78 | Similar performance 79 | */ 80 | val totalCountsAggregate = wordCounts.aggregateByKey(0.0)(_ + _, _ + _) 81 | totalCountsAggregate.collectAsMap() 82 | 83 | /* 84 | CombineByKey is the most general function available that can combine values inside your RDD. You need 85 | - a function that turns a value into an aggregate value so that further aggregates can start from it 86 | - a function to combine a current aggregate with a value in the RDD inside the executor 87 | - a function to combine 2 aggregates between executors 88 | - a number of partitions, or a partitioner so that you can do further operations e.g. joins without additional shuffles 89 | 90 | CombineByKey can be as dangerous as groupByKey when the combination functions grow the data instead of shrinking it. 91 | Used correctly (i.e. when the functions are reduction functions), combineByKey is efficient and potentially much more efficient later on if you do joins. 92 | */ 93 | val totalCountsCombine = wordCounts.combineByKey( 94 | (count: Int) => count, 95 | (currentSum: Int, newValue: Int) => currentSum + newValue, 96 | (partialSum1: Int, partialSum2: Int) => partialSum1 + partialSum2, 97 | numPartitions = 10 98 | ) 99 | totalCountsCombine.collectAsMap() 100 | // collect still takes 2 seconds 101 | 102 | def main(args: Array[String]): Unit = { 103 | Thread.sleep(10000000) 104 | } 105 | } 106 |
-------------------------------------------------------------------------------- /src/META-INF/MANIFEST.MF: --------------------------------------------------------------------------------
1 | Manifest-Version: 1.0 2 | Class-Path: commons-compiler-3.0.15.jar hadoop-mapreduce-client-common 3 | -2.7.4.jar hadoop-yarn-server-nodemanager-2.7.4.jar hadoop-yarn-api-2 4 | .7.4.jar avro-1.8.2.jar avro-mapred-1.8.2-hadoop2.jar hadoop-mapreduc 5 | e-client-jobclient-2.7.4.jar jackson-mapper-asl-1.9.13.jar scala-xml_ 6 | 2.12-1.2.0.jar commons-compress-1.8.1.jar javassist-3.22.0-CR2.jar ha 7 | doop-yarn-common-2.7.4.jar commons-httpclient-3.1.jar spark-catalyst_ 8 | 2.12-3.0.0-preview2.jar jersey-common-2.29.1.jar jackson-core-2.10.0.
9 | jar spark-tags_2.12-3.0.0-preview2.jar parquet-column-1.10.1.jar json 10 | 4s-scalap_2.12-3.6.6.jar javax.servlet-api-3.1.0.jar jsr305-3.0.2.jar 11 | jackson-module-scala_2.12-2.10.0.jar metrics-graphite-4.1.1.jar metr 12 | ics-jmx-4.1.1.jar leveldbjni-all-1.8.jar guice-3.0.jar curator-recipe 13 | s-2.7.1.jar avro-ipc-1.8.2.jar hadoop-mapreduce-client-core-2.7.4.jar 14 | jersey-hk2-2.29.1.jar spark-core_2.12-3.0.0-preview2.jar RoaringBitm 15 | ap-0.7.45.jar hadoop-yarn-server-common-2.7.4.jar metrics-json-4.1.1. 16 | jar jackson-core-asl-1.9.13.jar hadoop-annotations-2.7.4.jar pyrolite 17 | -4.30.jar orc-shims-1.5.8.jar jakarta.inject-2.6.1.jar jetty-util-6.1 18 | .26.jar httpcore-4.2.4.jar hk2-locator-2.6.1.jar xz-1.5.jar commons-m 19 | ath3-3.4.1.jar commons-cli-1.2.jar gson-2.2.4.jar jsp-api-2.1.jar act 20 | ivation-1.1.1.jar curator-framework-2.7.1.jar parquet-hadoop-1.10.1.j 21 | ar hadoop-common-2.7.4.jar slf4j-api-1.7.28.jar jersey-container-serv 22 | let-2.29.1.jar jetty-sslengine-6.1.26.jar commons-crypto-1.0.0.jar ao 23 | palliance-repackaged-2.6.1.jar jakarta.ws.rs-api-2.1.6.jar jcl-over-s 24 | lf4j-1.7.16.jar jackson-databind-2.10.0.jar osgi-resource-locator-1.0 25 | .3.jar arrow-memory-0.15.1.jar aopalliance-1.0.jar orc-mapreduce-1.5. 26 | 8.jar kryo-shaded-4.0.2.jar commons-io-2.4.jar stax-api-1.0-2.jar par 27 | quet-jackson-1.10.1.jar log4j-1.2.17.jar jersey-client-2.29.1.jar sna 28 | ppy-java-1.1.7.3.jar parquet-format-2.4.0.jar flatbuffers-java-1.9.0. 29 | jar metrics-core-4.1.1.jar slf4j-log4j12-1.7.25.jar xercesImpl-2.9.1. 30 | jar chill-java-0.9.3.jar jakarta.validation-api-2.0.2.jar jakarta.ann 31 | otation-api-1.3.5.jar jersey-server-2.29.1.jar jersey-container-servl 32 | et-core-2.29.1.jar zstd-jni-1.4.4-3.jar jackson-annotations-2.10.0.ja 33 | r objenesis-2.5.1.jar scala-parser-combinators_2.12-1.1.2.jar commons 34 | -beanutils-1.7.0.jar ivy-2.4.0.jar json4s-core_2.12-3.6.6.jar commons 35 | -net-3.1.jar oro-2.0.8.jar spark-launcher_2.12-3.0.0-preview2.jar ant 36 | lr4-runtime-4.7.1.jar hadoop-mapreduce-client-app-2.7.4.jar hadoop-cl 37 | ient-2.7.4.jar hk2-api-2.6.1.jar stream-2.9.6.jar commons-configurati 38 | on-1.6.jar zookeeper-3.4.14.jar orc-core-1.5.8.jar xbean-asm7-shaded- 39 | 4.15.jar log4j-api-2.4.1.jar api-asn1-api-1.0.0-M20.jar curator-clien 40 | t-2.7.1.jar protobuf-java-2.5.0.jar compress-lzf-1.0.3.jar jackson-ja 41 | xrs-1.9.13.jar arrow-format-0.15.1.jar scala-library-2.12.4.jar spark 42 | -unsafe_2.12-3.0.0-preview2.jar spark-sql_2.12-3.0.0-preview2.jar air 43 | compressor-0.10.jar jline-0.9.94.jar minlog-1.3.0.jar lz4-java-1.7.0. 44 | jar unused-1.0.0.jar chill_2.12-0.9.3.jar commons-text-1.6.jar py4j-0 45 | .10.8.1.jar parquet-encoding-1.10.1.jar jackson-xc-1.9.13.jar hadoop- 46 | mapreduce-client-shuffle-2.7.4.jar audience-annotations-0.5.0.jar jet 47 | tison-1.1.jar netty-all-4.1.42.Final.jar jaxb-api-2.2.2.jar jersey-me 48 | dia-jaxb-2.29.1.jar apacheds-kerberos-codec-2.0.0-M15.jar janino-3.0. 49 | 15.jar hadoop-yarn-client-2.7.4.jar arrow-vector-0.15.1.jar log4j-cor 50 | e-2.4.1.jar hive-storage-api-2.6.0.jar guava-16.0.1.jar spotbugs-anno 51 | tations-3.1.9.jar spark-sketch_2.12-3.0.0-preview2.jar xmlenc-0.52.ja 52 | r json4s-ast_2.12-3.6.6.jar scala-reflect-2.12.4.jar hk2-utils-2.6.1. 
53 | jar spark-network-common_2.12-3.0.0-preview2.jar paranamer-2.8.jar ap 54 | acheds-i18n-2.0.0-M15.jar jul-to-slf4j-1.7.16.jar commons-lang3-3.9.j 55 | ar metrics-jvm-4.1.1.jar jackson-module-paranamer-2.10.0.jar hadoop-h 56 | dfs-2.7.4.jar spark-network-shuffle_2.12-3.0.0-preview2.jar xml-apis- 57 | 1.3.04.jar json4s-jackson_2.12-3.6.6.jar htrace-core-3.1.0-incubating 58 | .jar javax.inject-1.jar httpclient-4.2.5.jar hadoop-auth-2.7.4.jar co 59 | mmons-codec-1.10.jar commons-collections-3.2.2.jar shims-0.7.45.jar s 60 | park-kvstore_2.12-3.0.0-preview2.jar netty-3.10.6.Final.jar parquet-c 61 | ommon-1.10.1.jar univocity-parsers-2.8.3.jar api-util-1.0.0-M20.jar c 62 | ommons-lang-2.6.jar commons-digester-1.8.jar 63 | Main-Class: 64 | 65 | -------------------------------------------------------------------------------- /src/main/scala/part5rddtransformations/I2ITransformations.scala: -------------------------------------------------------------------------------- 1 | package part5rddtransformations 2 | 3 | import generator.DataGenerator 4 | import org.apache.spark.sql.SparkSession 5 | import scala.collection.mutable 6 | 7 | object I2ITransformations { 8 | 9 | val spark = SparkSession.builder() 10 | .appName("I2I Transformations") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val sc = spark.sparkContext 15 | 16 | /* 17 | Science project 18 | each metric has identifier, value 19 | 20 | Return the smallest ("best") 10 metrics (identifiers + values) 21 | */ 22 | 23 | val LIMIT = 10 24 | 25 | def readMetrics() = sc.textFile("src/main/resources/generated/metrics/metrics10m.txt") 26 | .map { line => 27 | val tokens = line.split(" ") 28 | val name = tokens(0) 29 | val value = tokens(1) 30 | 31 | (name, value.toDouble) 32 | } 33 | 34 | def printTopMetrics() = { 35 | val sortedMetrics = readMetrics().sortBy(_._2).take(LIMIT) 36 | sortedMetrics.foreach(println) 37 | } 38 | 39 | def printTopMetricsI2I() = { 40 | 41 | val iteratorToIteratorTransformation = (records: Iterator[(String, Double)]) => { 42 | /* 43 | i2i transformation 44 | - they are NARROW TRANSFORMATIONS 45 | - Spark will "selectively" spill data to disk when partitions are too big for memory 46 | 47 | Warning: don't traverse more than once or convert to collections 48 | */ 49 | 50 | implicit val ordering: Ordering[(String, Double)] = Ordering.by[(String, Double), Double](_._2) 51 | val limitedCollection = new mutable.TreeSet[(String, Double)]() 52 | 53 | records.foreach { record => 54 | limitedCollection.add(record) 55 | if (limitedCollection.size > LIMIT) { 56 | limitedCollection.remove(limitedCollection.last) 57 | } 58 | } 59 | 60 | // I've traversed the iterator 61 | 62 | limitedCollection.iterator 63 | } 64 | 65 | val topMetrics = readMetrics() 66 | .mapPartitions(iteratorToIteratorTransformation) 67 | .repartition(1) 68 | .mapPartitions(iteratorToIteratorTransformation) 69 | 70 | val result = topMetrics.take(LIMIT) 71 | result.foreach(println) 72 | } 73 | 74 | /** 75 | * Exercises 76 | */ 77 | 78 | def printTopMetricsEx1() = { 79 | /* 80 | Better than the "dummy" approach 81 | - not sorting the entire RDD 82 | 83 | Bad (worse than the optimal) 84 | - sorting the entire partition 85 | - forcing the iterator in memory - this can OOM your executors 86 | */ 87 | val topMetrics = readMetrics() 88 | .mapPartitions(_.toList.sortBy(_._2).take(LIMIT).iterator) 89 | .repartition(1) 90 | .mapPartitions(_.toList.sortBy(_._2).take(LIMIT).iterator) 91 | .take(LIMIT) 92 | 93 | topMetrics.foreach(println) 94 | } 95 | 96 | /* 97 | Better 
than ex1 98 | - extracting top 10 values per partition instead of sorting the entire partition 99 | 100 | Bad because 101 | - forcing toList can OOM your executors 102 | - iterating over the list twice 103 | - if the list is immutable, time spent allocating objects (and GC) 104 | */ 105 | def printTopMetricsEx2() = { 106 | val topMetrics = readMetrics() 107 | .mapPartitions { records => 108 | 109 | implicit val ordering: Ordering[(String, Double)] = Ordering.by[(String, Double), Double](_._2) 110 | val limitedCollection = new mutable.TreeSet[(String, Double)]() 111 | 112 | records.toList.foreach { record => 113 | limitedCollection.add(record) 114 | if (limitedCollection.size > LIMIT) { 115 | limitedCollection.remove(limitedCollection.last) 116 | } 117 | } 118 | 119 | // I've traversed the iterator 120 | 121 | limitedCollection.iterator 122 | } 123 | .repartition(1) 124 | .mapPartitions { records => 125 | 126 | implicit val ordering: Ordering[(String, Double)] = Ordering.by[(String, Double), Double](_._2) 127 | val limitedCollection = new mutable.TreeSet[(String, Double)]() 128 | 129 | records.toList.foreach { record => 130 | limitedCollection.add(record) 131 | if (limitedCollection.size > LIMIT) { 132 | limitedCollection.remove(limitedCollection.last) 133 | } 134 | } 135 | 136 | // I've traversed the iterator 137 | 138 | limitedCollection.iterator 139 | } 140 | .take(LIMIT) 141 | 142 | topMetrics.foreach(println) 143 | } 144 | 145 | def main(args: Array[String]): Unit = { 146 | printTopMetrics() 147 | printTopMetricsI2I() 148 | Thread.sleep(1000000) 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/sbt,java,scala,spark,intellij 3 | # Edit at https://www.gitignore.io/?templates=sbt,java,scala,spark,intellij 4 | 5 | ### Intellij ### 6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 8 | 9 | # User-specific stuff 10 | .idea/**/workspace.xml 11 | .idea/**/tasks.xml 12 | .idea/**/usage.statistics.xml 13 | .idea/**/dictionaries 14 | .idea/**/shelf 15 | 16 | # Generated files 17 | .idea/**/contentModel.xml 18 | 19 | # Sensitive or high-churn files 20 | .idea/**/dataSources/ 21 | .idea/**/dataSources.ids 22 | .idea/**/dataSources.local.xml 23 | .idea/**/sqlDataSources.xml 24 | .idea/**/dynamic.xml 25 | .idea/**/uiDesigner.xml 26 | .idea/**/dbnavigator.xml 27 | 28 | # Gradle 29 | .idea/**/gradle.xml 30 | .idea/**/libraries 31 | 32 | # Gradle and Maven with auto-import 33 | # When using Gradle or Maven with auto-import, you should exclude module files, 34 | # since they will be recreated, and may cause churn. Uncomment if using 35 | # auto-import. 
36 | # .idea/modules.xml 37 | # .idea/*.iml 38 | # .idea/modules 39 | # *.iml 40 | # *.ipr 41 | 42 | # CMake 43 | cmake-build-*/ 44 | 45 | # Mongo Explorer plugin 46 | .idea/**/mongoSettings.xml 47 | 48 | # File-based project format 49 | *.iws 50 | 51 | # IntelliJ 52 | out/ 53 | 54 | # mpeltonen/sbt-idea plugin 55 | .idea_modules/ 56 | 57 | # JIRA plugin 58 | atlassian-ide-plugin.xml 59 | 60 | # Cursive Clojure plugin 61 | .idea/replstate.xml 62 | 63 | # Crashlytics plugin (for Android Studio and IntelliJ) 64 | com_crashlytics_export_strings.xml 65 | crashlytics.properties 66 | crashlytics-build.properties 67 | fabric.properties 68 | 69 | # Editor-based Rest Client 70 | .idea/httpRequests 71 | 72 | # Android studio 3.1+ serialized cache file 73 | .idea/caches/build_file_checksums.ser 74 | 75 | ### Intellij Patch ### 76 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 77 | 78 | # *.iml 79 | # modules.xml 80 | # .idea/misc.xml 81 | # *.ipr 82 | 83 | # Sonarlint plugin 84 | .idea/**/sonarlint/ 85 | 86 | # SonarQube Plugin 87 | .idea/**/sonarIssues.xml 88 | 89 | # Markdown Navigator plugin 90 | .idea/**/markdown-navigator.xml 91 | .idea/**/markdown-navigator/ 92 | 93 | ### Java ### 94 | # Compiled class file 95 | *.class 96 | 97 | # Log file 98 | *.log 99 | 100 | # BlueJ files 101 | *.ctxt 102 | 103 | # Mobile Tools for Java (J2ME) 104 | .mtj.tmp/ 105 | 106 | # Package Files # 107 | *.jar 108 | *.war 109 | *.nar 110 | *.ear 111 | *.zip 112 | *.tar.gz 113 | *.rar 114 | 115 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 116 | hs_err_pid* 117 | 118 | ### SBT ### 119 | # Simple Build Tool 120 | # http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control 121 | 122 | dist/* 123 | target/ 124 | lib_managed/ 125 | src_managed/ 126 | project/boot/ 127 | project/plugins/project/ 128 | .history 129 | .cache 130 | .lib/ 131 | 132 | ### Scala ### 133 | *.metals 134 | 135 | ### Spark ### 136 | *#*# 137 | *.#* 138 | *.iml 139 | *.ipr 140 | *.pyc 141 | *.pyo 142 | *.swp 143 | *~ 144 | .DS_Store 145 | .classpath 146 | .ensime 147 | .ensime_cache/ 148 | .ensime_lucene 149 | .generated-mima* 150 | .idea/ 151 | .project 152 | .pydevproject 153 | .scala_dependencies 154 | .settings 155 | /lib/ 156 | R-unit-tests.log 157 | R/unit-tests.out 158 | R/cran-check.out 159 | R/pkg/vignettes/sparkr-vignettes.html 160 | R/pkg/tests/fulltests/Rplots.pdf 161 | build/*.jar 162 | build/apache-maven* 163 | build/scala* 164 | build/zinc* 165 | cache 166 | checkpoint 167 | conf/*.cmd 168 | conf/*.conf 169 | conf/*.properties 170 | conf/*.sh 171 | conf/*.xml 172 | conf/java-opts 173 | conf/slaves 174 | dependency-reduced-pom.xml 175 | derby.log 176 | dev/create-release/*final 177 | dev/create-release/*txt 178 | dev/pr-deps/ 179 | dist/ 180 | docs/_site 181 | docs/api 182 | sql/docs 183 | sql/site 184 | lint-r-report.log 185 | log/ 186 | logs/ 187 | project/build/target/ 188 | project/plugins/lib_managed/ 189 | project/plugins/project/build.properties 190 | project/plugins/src_managed/ 191 | project/plugins/target/ 192 | python/lib/pyspark.zip 193 | python/deps 194 | python/test_coverage/coverage_data 195 | python/test_coverage/htmlcov 196 | python/pyspark/python 197 | reports/ 198 | scalastyle-on-compile.generated.xml 199 | scalastyle-output.xml 200 | scalastyle.txt 201 | spark-*-bin-*.tgz 202 | spark-tests.log 203 | streaming-tests.log 204 | unit-tests.log 205 | work/ 206 | docs/.jekyll-metadata 207 | 
208 | # For Hive 209 | TempStatsStore/ 210 | metastore/ 211 | metastore_db/ 212 | sql/hive-thriftserver/test_warehouses 213 | warehouse/ 214 | spark-warehouse/ 215 | 216 | # For R session data 217 | .RData 218 | .RHistory 219 | .Rhistory 220 | *.Rproj 221 | *.Rproj.* 222 | 223 | .Rproj.user 224 | 225 | # For SBT 226 | .jvmopts 227 | 228 | 229 | # End of https://www.gitignore.io/api/sbt,java,scala,spark,intellij 230 | 231 | # Daniel added 232 | src/main/resources/generated/ 233 | -------------------------------------------------------------------------------- /src/main/scala/part2foundations/ReadingQueryPlans.scala: -------------------------------------------------------------------------------- 1 | package part2foundations 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ReadingQueryPlans { 6 | ///////////////////////////////////////////////////////////////////// Boilerplate 7 | // you don't need this code in the Spark shell 8 | // this code is needed if you want to run it locally in IntelliJ 9 | 10 | val spark = SparkSession.builder() 11 | .config("spark.master", "local") 12 | .appName("Reading Query Plans") 13 | .getOrCreate() 14 | 15 | val sc = spark.sparkContext 16 | 17 | ///////////////////////////////////////////////////////////////////// Boilerplate 18 | 19 | // plan 1 - a simple transformation 20 | val simpleNumbers = spark.range(1, 1000000) 21 | val times5 = simpleNumbers.selectExpr("id * 5 as id") 22 | times5.explain() // this is how you show a query plan 23 | /* 24 | == Physical Plan == 25 | *(1) Project [(id#0L * 5) AS id#2L] 26 | +- *(1) Range (1, 1000000, step=1, splits=6) 27 | */ 28 | 29 | // plan 2 - a shuffle 30 | val moreNumbers = spark.range(1, 1000000, 2) 31 | val split7 = moreNumbers.repartition(7) 32 | 33 | split7.explain() 34 | /* 35 | == Physical Plan == 36 | Exchange RoundRobinPartitioning(7), false, [id=#16] 37 | +- *(1) Range (1, 1000000, step=2, splits=6) 38 | */ 39 | 40 | // plan 3 - shuffle + transformation 41 | split7.selectExpr("id * 5 as id").explain() 42 | /* 43 | == Physical Plan == 44 | *(2) Project [(id#4L * 5) AS id#8L] 45 | +- Exchange RoundRobinPartitioning(7), false, [id=#29] 46 | +- *(1) Range (1, 1000000, step=2, splits=6) 47 | */ 48 | 49 | 50 | // plan 4 - a more complex job with a join 51 | val ds1 = spark.range(1, 10000000) 52 | val ds2 = spark.range(1, 20000000, 2) 53 | val ds3 = ds1.repartition(7) 54 | val ds4 = ds2.repartition(9) 55 | val ds5 = ds3.selectExpr("id * 3 as id") 56 | val joined = ds5.join(ds4, "id") 57 | val sum = joined.selectExpr("sum(id)") 58 | sum.explain() 59 | /* 60 | 61 | == Physical Plan == 62 | *(7) HashAggregate(keys=[], functions=[sum(id#18L)]) 63 | +- Exchange SinglePartition, true, [id=#99] 64 | +- *(6) HashAggregate(keys=[], functions=[partial_sum(id#18L)]) 65 | +- *(6) Project [id#18L] 66 | +- *(6) SortMergeJoin [id#18L], [id#12L], Inner 67 | :- *(3) Sort [id#18L ASC NULLS FIRST], false, 0 68 | : +- Exchange hashpartitioning(id#18L, 200), true, [id=#83] 69 | : +- *(2) Project [(id#10L * 3) AS id#18L] 70 | : +- Exchange RoundRobinPartitioning(7), false, [id=#79] 71 | : +- *(1) Range (1, 10000000, step=1, splits=6) 72 | +- *(5) Sort [id#12L ASC NULLS FIRST], false, 0 73 | +- Exchange hashpartitioning(id#12L, 200), true, [id=#90] 74 | +- Exchange RoundRobinPartitioning(9), false, [id=#89] 75 | +- *(4) Range (1, 20000000, step=2, splits=6) 76 | */ 77 | 78 | /** 79 | * Exercises - read the Query Plans and try to understand the code that generated them. 
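* A hint added here (not from the plans themselves): read each plan bottom-up - the data source
* (Range/FileScan) sits at the bottom, every Exchange is a shuffle, and the *(n) prefixes mark
* whole-stage code generation boundaries: operators sharing the same n run fused, in a single pass.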
80 | */ 81 | 82 | // exercise 1 83 | /* 84 | == Physical Plan == 85 | *(1) Project [firstName#153, lastName#155, (cast(salary#159 as double) / 1.1) AS salary_EUR#168] 86 | +- *(1) FileScan csv [firstName#153,lastName#155,salary#159] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/tmp/employees_headers.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct 87 | */ 88 | val employeesDF = spark.read.option("header", true).csv("/tmp/employees_headers.csv") 89 | val empEur = employeesDF.selectExpr("firstName", "lastName", "salary / 1.1 as salary_EUR") 90 | 91 | // exercise 2 92 | /* 93 | == Physical Plan == 94 | *(2) HashAggregate(keys=[dept#156], functions=[avg(cast(salary#181 as bigint))]) 95 | +- Exchange hashpartitioning(dept#156, 200) 96 | +- *(1) HashAggregate(keys=[dept#156], functions=[partial_avg(cast(salary#181 as bigint))]) 97 | +- *(1) Project [dept#156, cast(salary#159 as int) AS salary#181] 98 | +- *(1) FileScan csv [dept#156,salary#159] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/tmp/employees_headers.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct 99 | */ 100 | val avgSals = employeesDF 101 | .selectExpr("dept", "cast(salary as int) as salary") 102 | .groupBy("dept") 103 | .avg("salary") 104 | 105 | 106 | // exercise 3 107 | /* 108 | == Physical Plan == 109 | *(5) Project [id#195L] 110 | +- *(5) SortMergeJoin [id#195L], [id#197L], Inner 111 | :- *(2) Sort [id#195L ASC NULLS FIRST], false, 0 112 | : +- Exchange hashpartitioning(id#195L, 200) 113 | : +- *(1) Range (1, 10000000, step=3, splits=6) 114 | +- *(4) Sort [id#197L ASC NULLS FIRST], false, 0 115 | +- Exchange hashpartitioning(id#197L, 200) 116 | +- *(3) Range (1, 10000000, step=5, splits=6) 117 | */ 118 | val d1 = spark.range(1, 10000000, 3) 119 | val d2 = spark.range(1, 10000000, 5) 120 | val j1 = d1.join(d2, "id") 121 | 122 | } 123 | -------------------------------------------------------------------------------- /src/main/scala/part2foundations/SparkAPIs.scala: -------------------------------------------------------------------------------- 1 | package part2foundations 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.expr 5 | 6 | object SparkAPIs { 7 | 8 | /** 9 | * This application contains the code we wrote during the "Different Spark APIs" video. 10 | */ 11 | 12 | val spark = SparkSession.builder() 13 | .config("spark.master", "local") 14 | .appName("Different Spark APIs") 15 | .getOrCreate() 16 | 17 | // for toDF 18 | import spark.implicits._ 19 | 20 | val sc = spark.sparkContext 21 | 22 | // small count comparison 23 | val numbers = 1 to 1000000000 24 | val rdd = sc.parallelize(1 to 1000000000) 25 | rdd.count() // ~10s on camera - might vary on your PC 26 | 27 | val df = rdd.toDF("id") 28 | df.count() // ~16s - might vary 29 | val dfCount = df.selectExpr("count(*)") // same 30 | // look at the Spark UI - there's a wholestagecodegen step in the stage - that's Spark generating the appropriate bytecode to process RDDs behind the scenes 31 | // most of the time taken is just the RDD transformation - look at the time taken in stage 1 32 | 33 | val ds = spark.range(1, 1000000000) 34 | ds.count() // instant, 0.1s 35 | val dsCount = ds.selectExpr("count(*)") 36 | dsCount.show() // same 37 | ds.toDF("value").count() // same 38 | 39 | ds.rdd.count() // ~25s 40 | // cmd-click on the `rdd` implementation to see why this is so slow. 41 | 42 | /** 43 | * Notice that inside the same "realm", i.e. 
RDDs or DFs, the computation time is small. 44 | * Converting between them takes a long time. 45 | * That's because each row is processed individually. 46 | * Conversions are particularly bad in Python, because the data needs to go from the Python interpreter to the JVM AND back. 47 | * 48 | * Lesson 1: once decided on the API level, STAY THERE. 49 | */ 50 | 51 | val rddTimes5 = rdd.map(_ * 5) 52 | rddTimes5.count() // ~20s 53 | // one stage 54 | 55 | val dfTimes5 = df.selectExpr("id * 5 as id") 56 | val dfTimes5Count = dfTimes5.selectExpr("count(*)") 57 | dfTimes5Count.show() // still 11-12s 58 | /* 59 | Notice there's no difference in the time taken, comparing with the original count. 60 | The RDD version multiplied every single row, but here, the multiplication is instant. 61 | Or is it? 62 | 63 | WHY? 64 | 65 | scala> dfTimes5Count.explain 66 | == Physical Plan == 67 | *(2) HashAggregate(keys=[], functions=[count(1)]) 68 | +- Exchange SinglePartition 69 | +- *(1) HashAggregate(keys=[], functions=[partial_count(1)]) 70 | +- *(1) Project 71 | +- *(1) SerializeFromObject [input[0, int, false] AS value#2] 72 | +- Scan[obj#1] 73 | 74 | scala> dfCount.explain 75 | == Physical Plan == 76 | *(2) HashAggregate(keys=[], functions=[count(1)]) 77 | +- Exchange SinglePartition 78 | +- *(1) HashAggregate(keys=[], functions=[partial_count(1)]) 79 | +- *(1) Project 80 | +- *(1) SerializeFromObject [input[0, int, false] AS value#2] 81 | +- Scan[obj#1] 82 | 83 | Same query plan! Spark removed the select altogether. 84 | */ 85 | 86 | /** 87 | * Exercise: measure the time it takes to count the number of elements from the DS, multiplied by 5. 88 | * Try to explain the difference. It's ok if you have like an 80% explanation. 89 | */ 90 | val dsTimes5 = ds.map(_ * 5) 91 | val dsTimes5Count = dsTimes5.selectExpr("count(*)") 92 | dsTimes5Count.show() 93 | /* 94 | 7 seconds from 0.1 seconds! That's a 70x time increase. 95 | Let's explain: 96 | 97 | scala> dsCount.explain 98 | == Physical Plan == 99 | *(2) HashAggregate(keys=[], functions=[count(1)]) 100 | +- Exchange SinglePartition 101 | +- *(1) HashAggregate(keys=[], functions=[partial_count(1)]) 102 | +- *(1) Project 103 | +- *(1) Range (1, 1000000000, step=1, splits=6) 104 | 105 | scala> dsTimes5Count.explain 106 | == Physical Plan == 107 | *(2) HashAggregate(keys=[], functions=[count(1)]) 108 | +- Exchange SinglePartition 109 | +- *(1) HashAggregate(keys=[], functions=[partial_count(1)]) 110 | +- *(1) Project 111 | +- *(1) SerializeFromObject [input[0, bigint, false] AS value#71L] 112 | +- *(1) MapElements , obj#70: bigint 113 | +- *(1) DeserializeToObject staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, id#13L, true, false), obj#69: java.lang.Long 114 | +- *(1) Range (1, 1000000000, step=1, splits=6) 115 | 116 | Different query plans. Because we're using a lambda there, Spark can't optimize it. 117 | So Spark has to "deserializeObject" by invoking Long.valueOf on each element in the DS, then map each element with the function, then serialize it back as a DS. 118 | 119 | The reason why Spark has to do that is that Spark doesn't have any information on the lambda, and thus is forced to apply it to each element. 120 | */ 121 | 122 | /** 123 | * Lesson 2: use DFs most of the time. Spark optimizes most stuff away. 124 | * Lesson 3: Lambdas are impossible to optimize. 
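* For instance (an added sketch): instead of the lambda in ds.map(_ * 5), the same computation can be
* written as ds.selectExpr("id * 5 as id") - an expression the optimizer can see through and, as shown
* above for DFs, even remove entirely when all we do afterwards is count.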
125 | */ 126 | 127 | } 128 | -------------------------------------------------------------------------------- /src/main/scala/part5rddtransformations/ReusingObjects.scala: -------------------------------------------------------------------------------- 1 | package part5rddtransformations 2 | 3 | import generator.DataGenerator 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object ReusingObjects { 8 | 9 | val spark = SparkSession.builder() 10 | .appName("Reusing JVM objects") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val sc = spark.sparkContext 15 | 16 | /* 17 | Analyze text 18 | Receive batches of text from data sources 19 | "35 // some text" 20 | 21 | Stats per each data source id: 22 | - the number of lines in total 23 | - total number of words in total 24 | - length of the longest word 25 | - the number of occurrences of the word "imperdiet" 26 | 27 | Results should be VERY FAST. 28 | */ 29 | 30 | val textPath = "src/main/resources/generated/lipsum/3m.txt" 31 | val criticalWord = "imperdiet" 32 | 33 | val text = sc.textFile(textPath).map { line => 34 | val tokens = line.split("//") 35 | (tokens(0), tokens(1)) 36 | } 37 | 38 | def generateData() = { 39 | DataGenerator.generateText(textPath, 60000000, 3000000, 200) 40 | } 41 | 42 | 43 | //////////////////// Version 1 44 | 45 | case class TextStats(nLines: Int, nWords: Int, maxWordLength: Int, occurrences: Int) 46 | 47 | object TextStats { 48 | val zero = TextStats(0, 0, 0, 0) 49 | } 50 | 51 | def collectStats() = { 52 | 53 | def aggregateNewRecord(textStats: TextStats, record: String): TextStats = { 54 | val newWords = record.split(" ") 55 | val longestWord = newWords.maxBy(_.length) 56 | val newOccurrences = newWords.count(_ == criticalWord) 57 | TextStats( 58 | textStats.nLines + 1, 59 | textStats.nWords + newWords.length, 60 | if (longestWord.length > textStats.maxWordLength) longestWord.length else textStats.maxWordLength, 61 | textStats.occurrences + newOccurrences 62 | ) 63 | } 64 | 65 | def combineStats(stats1: TextStats, stats2: TextStats): TextStats = { 66 | TextStats( 67 | stats1.nLines + stats2.nLines, 68 | stats1.nWords + stats2.nWords, 69 | if (stats1.maxWordLength > stats2.maxWordLength) stats1.maxWordLength else stats2.maxWordLength, 70 | stats1.occurrences + stats2.occurrences 71 | ) 72 | } 73 | 74 | val aggregate: RDD[(String, TextStats)] = text.aggregateByKey(TextStats.zero)(aggregateNewRecord, combineStats) 75 | aggregate.collectAsMap() 76 | } 77 | 78 | //////////////////// Version 2 79 | 80 | class MutableTextStats(var nLines: Int, var nWords: Int, var maxWordLength: Int, var occurrences: Int) extends Serializable 81 | object MutableTextStats extends Serializable { 82 | def zero = new MutableTextStats(0,0,0,0) 83 | } 84 | 85 | def collectStats2() = { 86 | 87 | def aggregateNewRecord(textStats: MutableTextStats, record: String): MutableTextStats = { 88 | val newWords = record.split(" ") 89 | val longestWord = newWords.maxBy(_.length) 90 | val newOccurrences = newWords.count(_ == criticalWord) 91 | 92 | textStats.nLines += 1 93 | textStats.nWords += newWords.length 94 | textStats.maxWordLength = if (longestWord.length > textStats.maxWordLength) longestWord.length else textStats.maxWordLength 95 | textStats.occurrences += newOccurrences 96 | 97 | textStats 98 | } 99 | 100 | def combineStats(stats1: MutableTextStats, stats2: MutableTextStats): MutableTextStats = { 101 | stats1.nLines += stats2.nLines 102 | stats1.nWords += stats2.nWords 103 | stats1.maxWordLength = if 
(stats1.maxWordLength > stats2.maxWordLength) stats1.maxWordLength else stats2.maxWordLength 104 | stats1.occurrences += stats2.occurrences 105 | 106 | stats1 107 | } 108 | 109 | val aggregate: RDD[(String, MutableTextStats)] = text.aggregateByKey(MutableTextStats.zero)(aggregateNewRecord, combineStats) 110 | aggregate.collectAsMap() 111 | } 112 | 113 | ///////////////////////// Version 3 - JVM arrays 114 | 115 | object UglyTextStats extends Serializable { 116 | val nLinesIndex = 0 117 | val nWordsIndex = 1 118 | val longestWordIndex = 2 119 | val occurrencesIndex = 3 120 | 121 | def aggregateNewRecord(textStats: Array[Int], record: String): Array[Int] = { 122 | val newWords = record.split(" ") // Array of strings 123 | 124 | var i = 0 125 | while (i < newWords.length) { 126 | val word = newWords(i) 127 | val wordLength = word.length 128 | 129 | textStats(longestWordIndex) = if (wordLength > textStats(longestWordIndex)) wordLength else textStats(longestWordIndex) 130 | textStats(occurrencesIndex) += (if (word == criticalWord) 1 else 0) 131 | 132 | i += 1 133 | } 134 | 135 | textStats(nLinesIndex) += 1 136 | textStats(nWordsIndex) += newWords.length 137 | 138 | textStats 139 | } 140 | 141 | def combineStats(stats1: Array[Int], stats2: Array[Int]): Array[Int] = { 142 | stats1(nLinesIndex) += stats2(nLinesIndex) 143 | stats1(nWordsIndex) += stats2(nWordsIndex) 144 | stats1(longestWordIndex) = if (stats1(longestWordIndex) > stats2(longestWordIndex)) stats1(longestWordIndex) else stats2(longestWordIndex) 145 | stats1(occurrencesIndex) += stats2(occurrencesIndex) 146 | 147 | stats1 148 | } 149 | } 150 | 151 | def collectStats3() = { 152 | val aggregate: RDD[(String, Array[Int])] = text.aggregateByKey(Array.fill(4)(0))(UglyTextStats.aggregateNewRecord, UglyTextStats.combineStats) 153 | aggregate.collectAsMap() 154 | } 155 | 156 | def main(args: Array[String]): Unit = { 157 | collectStats() 158 | collectStats2() 159 | collectStats3() 160 | 161 | Thread.sleep(1000000) 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/main/scala/part3dfjoins/ColumnPruning.scala: -------------------------------------------------------------------------------- 1 | package part3dfjoins 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | 6 | object ColumnPruning { 7 | 8 | val spark = SparkSession.builder() 9 | .appName("Column Pruning") 10 | .master("local[2]") 11 | .getOrCreate() 12 | 13 | val sc = spark.sparkContext 14 | import spark.implicits._ 15 | 16 | val guitarsDF = spark.read 17 | .option("inferSchema", "true") 18 | .json("src/main/resources/data/guitars/guitars.json") 19 | 20 | val guitarPlayersDF = spark.read 21 | .option("inferSchema", "true") 22 | .json("src/main/resources/data/guitarPlayers/guitarPlayers.json") 23 | 24 | val bandsDF = spark.read 25 | .option("inferSchema", "true") 26 | .json("src/main/resources/data/bands/bands.json") 27 | 28 | val joinCondition = guitarPlayersDF.col("band") === bandsDF.col("id") 29 | val guitaristsBandsDF = guitarPlayersDF.join(bandsDF, joinCondition, "inner") 30 | guitaristsBandsDF.explain() 31 | 32 | /* 33 | == Physical Plan == 34 | *(2) BroadcastHashJoin [band#22L], [id#38L], Inner, BuildLeft 35 | :- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, true])), [id=#34] 36 | : +- *(1) Project [band#22L, guitars#23, id#24L, name#25] <-- UNNECESSARY 37 | : +- *(1) Filter isnotnull(band#22L) 38 | : +- BatchScan[band#22L, guitars#23, id#24L, name#25] 
JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct,id:bigint,name:string> 39 | +- *(2) Project [hometown#37, id#38L, name#39, year#40L] 40 | +- *(2) Filter isnotnull(id#38L) 41 | +- BatchScan[hometown#37, id#38L, name#39, year#40L] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct 42 | */ 43 | 44 | val guitaristsWithoutBandsDF = guitarPlayersDF.join(bandsDF, joinCondition, "left_anti") 45 | guitaristsWithoutBandsDF.explain() 46 | /* 47 | == Physical Plan == 48 | *(2) BroadcastHashJoin [band#22L], [id#38L], LeftAnti, BuildRight 49 | :- *(2) Project [band#22L, guitars#23, id#24L, name#25] <- UNNECESSARY 50 | : +- BatchScan[band#22L, guitars#23, id#24L, name#25] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct,id:bigint,name:string> 51 | +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, true])), [id=#66] 52 | +- *(1) Project [id#38L] <- COLUMN PRUNING 53 | +- *(1) Filter isnotnull(id#38L) 54 | +- BatchScan[id#38L] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct 55 | 56 | Column pruning = cut off columns that are not relevant 57 | = shrinks DF 58 | * useful for joins and groups 59 | */ 60 | 61 | // project and filter pushdown 62 | val namesDF = guitaristsBandsDF.select(guitarPlayersDF.col("name"), bandsDF.col("name")) 63 | namesDF.explain() 64 | 65 | /* 66 | == Physical Plan == 67 | *(2) Project [name#25, name#39] 68 | +- *(2) BroadcastHashJoin [band#22L], [id#38L], Inner, BuildLeft 69 | :- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, true])), [id=#100] 70 | : +- *(1) Project [band#22L, name#25] <- COLUMN PRUNING 71 | : +- *(1) Filter isnotnull(band#22L) 72 | : +- BatchScan[band#22L, name#25] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct 73 | +- *(2) Project [id#38L, name#39] 74 | +- *(2) Filter isnotnull(id#38L) 75 | +- BatchScan[id#38L, name#39] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct 76 | 77 | Spark tends to drop columns as early as possible. 78 | Should be YOUR goal as well. 
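For example (a sketch, not from the video), you can prune manually right after reading, before the join:

      val slimBands = bandsDF.select("id", "name") // keep only the columns the query needs
      val names2 = guitarPlayersDF.join(slimBands, guitarPlayersDF.col("band") === slimBands.col("id"))

    Spark infers the pruning in this simple query anyway; selecting explicitly guarantees it
    in queries too complex for the optimizer to see through.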
79 | */ 80 | 81 | val rockDF = guitarPlayersDF 82 | .join(bandsDF, joinCondition) 83 | .join(guitarsDF, array_contains(guitarPlayersDF.col("guitars"), guitarsDF.col("id"))) 84 | 85 | val essentialsDF = rockDF.select(guitarPlayersDF.col("name"), bandsDF.col("name"), upper(guitarsDF.col("make"))) 86 | essentialsDF.explain() 87 | /* 88 | == Physical Plan == 89 | *(4) Project [name#25, name#39, upper(make#9) AS upper(make)#164] TODO the upper function is done LAST 90 | +- BroadcastNestedLoopJoin BuildRight, Inner, array_contains(guitars#23, id#8L) 91 | :- *(2) Project [guitars#23, name#25, name#39] 92 | : +- *(2) BroadcastHashJoin [band#22L], [id#38L], Inner, BuildLeft 93 | : :- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, true])), [id=#156] 94 | : : +- *(1) Project [band#22L, guitars#23, name#25] TODO <- Column pruning 95 | : : +- *(1) Filter isnotnull(band#22L) 96 | : : +- BatchScan[band#22L, guitars#23, name#25] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct,name:string> 97 | : +- *(2) Project [id#38L, name#39] TODO <- Column pruning 98 | : +- *(2) Filter isnotnull(id#38L) 99 | : +- BatchScan[id#38L, name#39] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct 100 | +- BroadcastExchange IdentityBroadcastMode, [id=#167] 101 | +- *(3) Project [id#8L, make#9] TODO <- Column pruning 102 | +- BatchScan[id#8L, make#9] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct 103 | */ 104 | 105 | /** 106 | * LESSON: if you anticipate that the joined table is much larger than the table on whose column you are applying the 107 | * map-side operation, e.g. " * 5", or "upper", do this operation on the small table FIRST. 108 | * 109 | * Particularly useful for outer joins. 110 | */ 111 | 112 | def main(args: Array[String]): Unit = { 113 | 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /HadoopWindowsUserSetup.md: -------------------------------------------------------------------------------- 1 | *Apache Spark doesn't have its own system to organize files in a distributed way (the file system), so it requires 2 | external file systems to store and process large datasets. For this reason, programmers install Spark 3 | on top of Hadoop so that Spark's advanced analytics applications can make use of the data stored using the Hadoop Distributed 4 | File System (HDFS).* 5 | 6 | ****Prerequisites:**** 7 | 8 | Before you start installing Hadoop on Windows, there are a few prerequisites that you need to have in place: 9 | 10 | 1. Java Development Kit (JDK) version 11 or higher 11 | 2. Apache Hadoop distribution suitable for Windows 12 | 13 | **Step 1:** *Install the Java Development Kit* 14 | 15 | Hadoop is built using Java, so you’ll need to install the Java Development Kit (JDK) version 11 16 | or higher on your computer. You can download the JDK from the Oracle website. 17 | (https://www.oracle.com/in/java/technologies/javase/javase8-archive-downloads.html) Once the download is 18 | complete, run the installer and follow the instructions to install the JDK. 
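You can verify the installation by opening a new Command Prompt and running `java -version`; it should report the JDK version you just installed.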
19 | 20 | **Step 2:** *Download the Hadoop distribution* 21 | 22 | To install Hadoop on Windows, you’ll need to download the appropriate distribution from the 23 | Apache Hadoop website (https://hadoop.apache.org/releases.html). 24 | You’ll want to choose the distribution that is compatible with your version of Windows (hadoop-3.3.6) and click on binary. 25 | Once you’ve downloaded the distribution, extract the contents of hadoop-3.3.6.tar.gz and place them under `C:\Hadoop`. 26 | 27 | **Step 3:** *Set up the Environment Variables* 28 | 29 | To use Java & Hadoop, you’ll need to set up some environment variables. 30 | This will allow you to run Java & Hadoop commands from any directory on your computer. 31 | To set up the environment variables, follow these steps: 32 | 33 | 1. Open the Start menu and search for “Environment Variables”. 34 | 2. Click on “Edit the system environment variables”. 35 | 3. Click on the “Environment Variables” button. 36 | 4. Under “System Variables”, click on “New”. 37 | 5. Enter “JAVA_HOME” as the variable name and the path to the directory where your Java is installed (for example, C:\Program Files\Java\jdk1.8.0) as the variable value. 38 | 6. Click “OK”. 39 | 7. Enter “HADOOP_HOME” as the variable name and the path to the directory where you extracted the Hadoop distribution (for example, C:\hadoop) as the variable value. 40 | 8. Click “OK”. 41 | 9. Locate the “Path” variable in the “System Variables” list and click “Edit”. 42 | 10. Add the following to the end of the “Variable value” field: `%JAVA_HOME%\bin;%HADOOP_HOME%\bin;%HADOOP_HOME%\sbin;` 43 | 11. Click “OK” to close all the windows. 44 | 45 | **Step 4:** *Install Hadoop native IO binary* 46 | 47 | Clone or download the winutils repository (https://github.com/cdarlint/winutils/tree/master/hadoop-3.3.5/bin) 48 | and copy the contents of `hadoop-3.3.5/bin` into the extracted location of the Hadoop binary package. 49 | In our example, it will be `C:\Hadoop\bin`. 50 | 51 | **Important Note:** The following steps are not necessary for Spark to run; the above is sufficient to work with Spark. 52 | However, you can proceed if you really want the entire Hadoop distribution working locally. 53 | 54 | **Step 5:** *Hadoop Configuration* 55 | 56 | To configure Hadoop, you’ll need to modify a few configuration files. 57 | These files are located in the `etc/hadoop` directory of the Hadoop folder. 58 | Open each of the following files in a text editor, make the changes described below and save the files: 59 | 60 | 1. `core-site.xml`: Add the following lines to the file inside `<configuration>` like this: 61 | ``` 62 | <configuration> 63 | <property> 64 | <name>fs.defaultFS</name> 65 | <value>hdfs://localhost:9000</value> 66 | </property> 67 | </configuration> 68 | ``` 69 | 70 | 2. Open the file `hadoop-env.cmd` (Windows command script) and replace `set JAVA_HOME=%JAVA_HOME%` 71 | with the Java installation location, e.g. `set JAVA_HOME=C:\Program Files\Java\jdk1.11.0`, 72 | or if that doesn't work, use `set JAVA_HOME=C:\Progra~1\Java\jdk1.11.0`. Also, go to the bottom of the file 73 | and set your name in this variable: `set HADOOP_IDENT_STRING=RockTheJVM`. 74 | 75 | 76 | 3. `hdfs-site.xml`: First create these folders - `C:/hadoop/data/dfs/namenode` and `C:/hadoop/data/dfs/datanode`. 77 | Add the following lines to the file inside `<configuration>` like this: 78 | ``` 79 | <property> 80 | <name>dfs.replication</name> 81 | <value>1</value> 82 | </property> 83 | <property> 84 | <name>dfs.namenode.name.dir</name> 85 | <value>file:///C:/hadoop/data/dfs/namenode</value> 86 | </property> 87 | <property> 88 | <name>dfs.datanode.data.dir</name> 89 | <value>file:///C:/hadoop/data/dfs/datanode</value> 90 | </property> 91 | ``` 92 | 93 | 4.
`mapred-site.xml`: Add the following lines to the file inside `<configuration>` like this: 94 | ``` 95 | <configuration> 96 | <property> 97 | <name>mapreduce.framework.name</name> 98 | <value>yarn</value> 99 | </property> 100 | </configuration> 101 | ``` 102 | 103 | 5. `yarn-site.xml`: Add the following lines to the file inside `<configuration>` like this: 104 | ``` 105 | <configuration> 106 | <property> 107 | <name>yarn.nodemanager.aux-services</name> 108 | <value>mapreduce_shuffle</value> 109 | <description>Yarn Node Manager Aux Service</description> 110 | </property> 111 | </configuration> 112 | ``` 113 | 114 | **Step 6:** **If you want to start Hadoop:** 115 | 116 | To start Hadoop, open a command prompt and navigate to the directory where you extracted the Hadoop distribution. 117 | Then, run the following commands: 118 | ``` 119 | cd sbin 120 | start-all.cmd 121 | ``` 122 | This will start the Hadoop daemons and launch the web interfaces. You can access the NameNode web interface by going to http://localhost:9870/ in your web browser (the HDFS service itself listens at hdfs://localhost:9000, as configured in core-site.xml). 123 | 124 | ****Conclusion:**** 125 | 126 | Setting up Hadoop on a Windows system can pose some challenges, but by following this comprehensive guide, 127 | you'll be able to configure it smoothly and quickly. Hadoop is a robust solution for handling extensive datasets and executing distributed applications, 128 | making it a favored choice for numerous enterprises and institutions worldwide. 129 | Whether you're a data scientist or a software developer, integrating Hadoop into your toolkit is highly beneficial. 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 |
-------------------------------------------------------------------------------- /src/main/scala/part3dfjoins/PrePartitioning.scala: --------------------------------------------------------------------------------
1 | package part3dfjoins 2 | 3 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} 4 | 5 | object PrePartitioning { 6 | 7 | val spark = SparkSession.builder() 8 | .appName("Pre-partitioning") 9 | .master("local") 10 | .getOrCreate() 11 | 12 | import spark.implicits._ 13 | 14 | // deactivate broadcast joins 15 | spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1) 16 | 17 | /* 18 | addColumns(initialTable, 3) => dataframe with columns "id", "newCol1", "newCol2", "newCol3" 19 | */ 20 | def addColumns[T](df: Dataset[T], n: Int): DataFrame = { 21 | val newColumns = (1 to n).map(i => s"id * $i as newCol$i") 22 | df.selectExpr(("id" +: newColumns): _*) 23 | } 24 | 25 | // don't touch this 26 | val initialTable = spark.range(1, 10000000).repartition(10) // RoundRobinPartitioning(10) 27 | val narrowTable = spark.range(1, 5000000).repartition(7) // RoundRobinPartitioning(7) 28 | 29 | // scenario 1 30 | val wideTable = addColumns(initialTable, 30) 31 | val join1 = wideTable.join(narrowTable, "id") 32 | join1.explain() 33 | // println(join1.count()) // around 20s 34 | /* 35 | == Physical Plan == 36 | *(6) Project [id#0L, newCol1#8L, newCol2#9L, newCol3#10L, newCol4#11L, newCol5#12L, newCol6#13L, newCol7#14L, newCol8#15L, newCol9#16L, newCol10#17L, newCol11#18L, newCol12#19L, newCol13#20L, newCol14#21L, newCol15#22L, newCol16#23L, newCol17#24L, newCol18#25L, newCol19#26L, newCol20#27L, newCol21#28L, newCol22#29L, newCol23#30L, ...
7 more fields] 37 | +- *(6) SortMergeJoin [id#0L], [id#4L], Inner 38 | :- *(3) Sort [id#0L ASC NULLS FIRST], false, 0 39 | : +- Exchange hashpartitioning(id#0L, 200), true, [id=#39] 40 | : +- *(2) Project [id#0L, (id#0L * 1) AS newCol1#8L, (id#0L * 2) AS newCol2#9L, (id#0L * 3) AS newCol3#10L, (id#0L * 4) AS newCol4#11L, (id#0L * 5) AS newCol5#12L, (id#0L * 6) AS newCol6#13L, (id#0L * 7) AS newCol7#14L, (id#0L * 8) AS newCol8#15L, (id#0L * 9) AS newCol9#16L, (id#0L * 10) AS newCol10#17L, (id#0L * 11) AS newCol11#18L, (id#0L * 12) AS newCol12#19L, (id#0L * 13) AS newCol13#20L, (id#0L * 14) AS newCol14#21L, (id#0L * 15) AS newCol15#22L, (id#0L * 16) AS newCol16#23L, (id#0L * 17) AS newCol17#24L, (id#0L * 18) AS newCol18#25L, (id#0L * 19) AS newCol19#26L, (id#0L * 20) AS newCol20#27L, (id#0L * 21) AS newCol21#28L, (id#0L * 22) AS newCol22#29L, (id#0L * 23) AS newCol23#30L, ... 7 more fields] 41 | : +- Exchange RoundRobinPartitioning(10), false, [id=#35] 42 | : +- *(1) Range (1, 10000000, step=1, splits=1) 43 | +- *(5) Sort [id#4L ASC NULLS FIRST], false, 0 44 | +- Exchange hashpartitioning(id#4L, 200), true, [id=#46] 45 | +- Exchange RoundRobinPartitioning(7), false, [id=#45] 46 | +- *(4) Range (1, 5000000, step=1, splits=1) 47 | */ 48 | 49 | // scenario 2 50 | val altNarrow = narrowTable.repartition($"id") // use a HashPartitioner 51 | val altInitial = initialTable.repartition($"id") 52 | // join on co-partitioned DFs 53 | val join2 = altInitial.join(altNarrow, "id") 54 | val result2 = addColumns(join2, 30) 55 | result2.explain() 56 | // println(result2.count()) // 6s 57 | 58 | /* 59 | == Physical Plan == 60 | *(5) Project [id#0L, (id#0L * 1) AS newCol1#105L, (id#0L * 2) AS newCol2#106L, (id#0L * 3) AS newCol3#107L, (id#0L * 4) AS newCol4#108L, (id#0L * 5) AS newCol5#109L, (id#0L * 6) AS newCol6#110L, (id#0L * 7) AS newCol7#111L, (id#0L * 8) AS newCol8#112L, (id#0L * 9) AS newCol9#113L, (id#0L * 10) AS newCol10#114L, (id#0L * 11) AS newCol11#115L, (id#0L * 12) AS newCol12#116L, (id#0L * 13) AS newCol13#117L, (id#0L * 14) AS newCol14#118L, (id#0L * 15) AS newCol15#119L, (id#0L * 16) AS newCol16#120L, (id#0L * 17) AS newCol17#121L, (id#0L * 18) AS newCol18#122L, (id#0L * 19) AS newCol19#123L, (id#0L * 20) AS newCol20#124L, (id#0L * 21) AS newCol21#125L, (id#0L * 22) AS newCol22#126L, (id#0L * 23) AS newCol23#127L, ... 7 more fields] 61 | +- *(5) SortMergeJoin [id#0L], [id#4L], Inner 62 | :- *(2) Sort [id#0L ASC NULLS FIRST], false, 0 63 | : +- Exchange hashpartitioning(id#0L, 200), false, [id=#91] 64 | : +- *(1) Range (1, 10000000, step=1, splits=1) 65 | +- *(4) Sort [id#4L ASC NULLS FIRST], false, 0 66 | +- Exchange hashpartitioning(id#4L, 200), false, [id=#97] 67 | +- *(3) Range (1, 5000000, step=1, splits=1) 68 | */ 69 | 70 | /** 71 | * Lesson: partition early. 72 | * Partitioning late is AT BEST what Spark naturally does. 73 | */ 74 | 75 | // scenario 3 76 | val enhanceColumnsFirst = addColumns(initialTable, 30) 77 | val repartitionedNarrow = narrowTable.repartition($"id") 78 | val repartitionedEnhanced = enhanceColumnsFirst.repartition($"id") // USELESS! 
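// (useless twice over: the join below uses enhanceColumnsFirst, so repartitionedEnhanced never enters
// the query plan - and even if it did, repartitioning AFTER addColumns means the exchange would have
// to move all 30 extra columns, which is exactly the work we are trying to avoid)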
79 | val result3 = enhanceColumnsFirst.join(repartitionedNarrow, "id") 80 | // println(result3.count()) // around 19-20s 81 | result3.explain() 82 | /* 83 | == Physical Plan == 84 | *(6) Project [id#0L, newCol1#166L, newCol2#167L, newCol3#168L, newCol4#169L, newCol5#170L, newCol6#171L, newCol7#172L, newCol8#173L, newCol9#174L, newCol10#175L, newCol11#176L, newCol12#177L, newCol13#178L, newCol14#179L, newCol15#180L, newCol16#181L, newCol17#182L, newCol18#183L, newCol19#184L, newCol20#185L, newCol21#186L, newCol22#187L, newCol23#188L, ... 7 more fields] 85 | +- *(6) SortMergeJoin [id#0L], [id#4L], Inner 86 | :- *(3) Sort [id#0L ASC NULLS FIRST], false, 0 87 | : +- Exchange hashpartitioning(id#0L, 200), false, [id=#154] 88 | : +- *(2) Project [id#0L, (id#0L * 1) AS newCol1#166L, (id#0L * 2) AS newCol2#167L, (id#0L * 3) AS newCol3#168L, (id#0L * 4) AS newCol4#169L, (id#0L * 5) AS newCol5#170L, (id#0L * 6) AS newCol6#171L, (id#0L * 7) AS newCol7#172L, (id#0L * 8) AS newCol8#173L, (id#0L * 9) AS newCol9#174L, (id#0L * 10) AS newCol10#175L, (id#0L * 11) AS newCol11#176L, (id#0L * 12) AS newCol12#177L, (id#0L * 13) AS newCol13#178L, (id#0L * 14) AS newCol14#179L, (id#0L * 15) AS newCol15#180L, (id#0L * 16) AS newCol16#181L, (id#0L * 17) AS newCol17#182L, (id#0L * 18) AS newCol18#183L, (id#0L * 19) AS newCol19#184L, (id#0L * 20) AS newCol20#185L, (id#0L * 21) AS newCol21#186L, (id#0L * 22) AS newCol22#187L, (id#0L * 23) AS newCol23#188L, ... 7 more fields] 89 | : +- Exchange RoundRobinPartitioning(10), false, [id=#150] 90 | : +- *(1) Range (1, 10000000, step=1, splits=1) 91 | +- *(5) Sort [id#4L ASC NULLS FIRST], false, 0 92 | +- Exchange hashpartitioning(id#4L, 200), false, [id=#160] 93 | +- *(4) Range (1, 5000000, step=1, splits=1) 94 | */ 95 | 96 | /** 97 | * Exercise: what would happen if we just repartitioned the smaller table to 10 partitions? 98 | * TERRIBLE! 
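* (repartition(10) is a round-robin repartition, not a hash repartition on "id", so nothing gets
* co-partitioned - Spark still inserts the full hashpartitioning exchanges on both sides of the
* sort-merge join, as the explain() below confirms)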
99 | * 100 | */ 101 | 102 | initialTable.join(narrowTable.repartition(10), "id").explain() // identical to scenario 1 103 | 104 | def main(args: Array[String]): Unit = { 105 | 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/main/scala/part3dfjoins/Bucketing.scala: -------------------------------------------------------------------------------- 1 | package part3dfjoins 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object Bucketing { 6 | 7 | val spark = SparkSession.builder() 8 | .appName("Bucketing") 9 | .master("local") 10 | .getOrCreate() 11 | 12 | import spark.implicits._ 13 | 14 | // deactivate broadcasting 15 | spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1) 16 | 17 | val large = spark.range(1000000).selectExpr("id * 5 as id").repartition(10) 18 | val small = spark.range(10000).selectExpr("id * 3 as id").repartition(3) 19 | 20 | val joined = large.join(small, "id") 21 | joined.explain() 22 | /* 23 | == Physical Plan == 24 | *(5) Project [id#2L] 25 | +- *(5) SortMergeJoin [id#2L], [id#6L], Inner 26 | :- *(2) Sort [id#2L ASC NULLS FIRST], false, 0 27 | : +- Exchange hashpartitioning(id#2L, 200), true, [id=#40] 28 | : +- Exchange RoundRobinPartitioning(10), false, [id=#39] 29 | : +- *(1) Project [(id#0L * 5) AS id#2L] 30 | : +- *(1) Range (0, 1000000, step=1, splits=1) 31 | +- *(4) Sort [id#6L ASC NULLS FIRST], false, 0 32 | +- Exchange hashpartitioning(id#6L, 200), true, [id=#47] 33 | +- Exchange RoundRobinPartitioning(3), false, [id=#46] 34 | +- *(3) Project [(id#4L * 3) AS id#6L] 35 | +- *(3) Range (0, 10000, step=1, splits=1) 36 | 37 | */ 38 | 39 | // bucketing 40 | large.write 41 | .bucketBy(4, "id") 42 | .sortBy("id") 43 | .mode("overwrite") 44 | .saveAsTable("bucketed_large") 45 | 46 | small.write 47 | .bucketBy(4, "id") 48 | .sortBy("id") 49 | .mode("overwrite") 50 | .saveAsTable("bucketed_small") // bucketing and saving almost as expensive as a regular shuffle 51 | 52 | spark.sql("use default") 53 | val bucketedLarge = spark.table("bucketed_large") 54 | val bucketedSmall = spark.table("bucketed_small") 55 | val bucketedJoin = bucketedLarge.join(bucketedSmall, "id") 56 | bucketedJoin.explain() 57 | /* 58 | *(3) Project [id#11L] 59 | +- *(3) SortMergeJoin [id#11L], [id#13L], Inner 60 | :- *(1) Sort [id#11L ASC NULLS FIRST], false, 0 61 | : +- *(1) Project [id#11L] 62 | : +- *(1) Filter isnotnull(id#11L) 63 | : +- *(1) ColumnarToRow 64 | : +- FileScan parquet default.bucketed_large[id#11L] Batched: true, DataFilters: [isnotnull(id#11L)], Format: Parquet, Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/spark-warehouse/bu..., PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct, SelectedBucketsCount: 4 out of 4 65 | +- *(2) Sort [id#13L ASC NULLS FIRST], false, 0 66 | +- *(2) Project [id#13L] 67 | +- *(2) Filter isnotnull(id#13L) 68 | +- *(2) ColumnarToRow 69 | +- FileScan parquet default.bucketed_small[id#13L] Batched: true, DataFilters: [isnotnull(id#13L)], Format: Parquet, Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/spark-warehouse/bu..., PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct, SelectedBucketsCount: 4 out of 4 70 | 71 | */ 72 | 73 | // bucketing for groups 74 | val flightsDF = spark.read 75 | .option("inferSchema", "true") 76 | .json("src/main/resources/data/flights/flights.json") 77 | .repartition(2) 78 | 79 | val mostDelayed = flightsDF 80 | .filter("origin = 
'DEN' and arrdelay > 1") 81 | .groupBy("origin", "dest", "carrier") 82 | .avg("arrdelay") 83 | .orderBy($"avg(arrdelay)".desc_nulls_last) 84 | mostDelayed.explain() 85 | 86 | /* 87 | == Physical Plan == 88 | *(4) Sort [avg(arrdelay)#53 DESC NULLS LAST], true, 0 89 | +- Exchange rangepartitioning(avg(arrdelay)#53 DESC NULLS LAST, 200), true, [id=#111] 90 | +- *(3) HashAggregate(keys=[origin#27, dest#24, carrier#18], functions=[avg(arrdelay#17)]) 91 | +- Exchange hashpartitioning(origin#27, dest#24, carrier#18, 200), true, [id=#107] 92 | +- *(2) HashAggregate(keys=[origin#27, dest#24, carrier#18], functions=[partial_avg(arrdelay#17)]) 93 | +- Exchange RoundRobinPartitioning(2), false, [id=#103] 94 | +- *(1) Project [arrdelay#17, carrier#18, dest#24, origin#27] 95 | +- *(1) Filter (((isnotnull(origin#27) AND isnotnull(arrdelay#17)) AND (origin#27 = DEN)) AND (arrdelay#17 > 1.0)) 96 | +- BatchScan[arrdelay#17, carrier#18, dest#24, origin#27] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct 97 | */ 98 | 99 | // flightsDF.write 100 | // .partitionBy("origin") 101 | // .bucketBy(4, "dest", "carrier") 102 | // .saveAsTable("flights_bucketed") // just as long as a shuffle 103 | // 104 | // val flightsBucketed = spark.table("flights_bucketed") 105 | // val mostDelayed2 = flightsBucketed 106 | // .filter("origin = 'DEN' and arrdelay > 1") 107 | // .groupBy("origin", "dest", "carrier") 108 | // .avg("arrdelay") 109 | // .orderBy($"avg(arrdelay)".desc_nulls_last) 110 | // mostDelayed2.explain() 111 | /* 112 | == Physical Plan == 113 | *(2) Sort [avg(arrdelay)#140 DESC NULLS LAST], true, 0 114 | +- Exchange rangepartitioning(avg(arrdelay)#140 DESC NULLS LAST, 200), true, [id=#172] 115 | +- *(1) HashAggregate(keys=[origin#114, dest#111, carrier#105], functions=[avg(arrdelay#104)]) 116 | +- *(1) HashAggregate(keys=[origin#114, dest#111, carrier#105], functions=[partial_avg(arrdelay#104)]) 117 | +- *(1) Project [arrdelay#104, carrier#105, dest#111, origin#114] 118 | +- *(1) Filter (isnotnull(arrdelay#104) AND (arrdelay#104 > 1.0)) 119 | +- *(1) ColumnarToRow 120 | +- FileScan parquet default.flights_bucketed[arrdelay#104,carrier#105,dest#111,origin#114] Batched: true, DataFilters: [isnotnull(arrdelay#104), (arrdelay#104 > 1.0)], Format: Parquet, Location: PrunedInMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/spark-wareho..., PartitionFilters: [isnotnull(origin#114), (origin#114 = DEN)], PushedFilters: [IsNotNull(arrdelay), GreaterThan(arrdelay,1.0)], ReadSchema: struct, SelectedBucketsCount: 4 out of 4 121 | */ 122 | 123 | /** 124 | * Bucket pruning 125 | */ 126 | val the10 = bucketedLarge.filter($"id" === 10) 127 | the10.show() 128 | the10.explain() 129 | /* 130 | == Physical Plan == 131 | *(1) Project [id#11L] 132 | +- *(1) Filter (isnotnull(id#11L) AND (id#11L = 10)) 133 | +- *(1) ColumnarToRow 134 | +- FileScan parquet default.bucketed_large[id#11L] Batched: true, DataFilters: [isnotnull(id#11L), (id#11L = 10)], Format: Parquet, Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/spark-warehouse/bu..., PartitionFilters: [], PushedFilters: [IsNotNull(id), EqualTo(id,10)], ReadSchema: struct, SelectedBucketsCount: 1 out of 4 135 | */ 136 | 137 | def main(args: Array[String]): Unit = { 138 | // joined.count() // 4-5s 139 | // bucketedJoin.count() // 4s for bucketing + 0.5s for counting 140 | // mostDelayed.show() // ~1s 141 | // mostDelayed2.show() 
// ~0.2s = 5x perf!
142 |   }
143 | 
144 | }
145 | 
--------------------------------------------------------------------------------
/spark-cluster/README.md:
--------------------------------------------------------------------------------
1 | # Spark Cluster with Docker & docker-compose
2 | 
3 | # General
4 | 
5 | A simple Spark standalone cluster for your testing environment: your Spark development environment is just a *docker-compose up* away.
6 | 
7 | The Docker compose will create the following containers:
8 | 
9 | Container|IP address
10 | ---|---
11 | spark-master|10.5.0.2
12 | spark-worker-1|10.5.0.3
13 | spark-worker-2|10.5.0.4
14 | spark-worker-3|10.5.0.5
15 | 
16 | # Installation
17 | 
18 | The following steps will get your Spark cluster's containers up and running.
19 | 
20 | ## Prerequisites
21 | 
22 | * Docker installed
23 | 
24 | * Docker compose installed
25 | 
26 | * A Spark application JAR to play with (optional)
27 | 
28 | ## Build the images
29 | 
30 | The first step to deploy the cluster is building the custom images; these builds can be performed with the *build-images.sh* script.
31 | 
32 | Running it is as simple as:
33 | 
34 | ```sh
35 | chmod +x build-images.sh
36 | ./build-images.sh
37 | ```
38 | 
39 | This will create the following Docker images:
40 | 
41 | * spark-base:3.5.0: A base image based on java:alpine-jdk-8 which ships Scala, Python 3 and Spark 3.5.0
42 | 
43 | * spark-master:3.5.0: An image based on the previously created Spark base image, used to create Spark master containers.
44 | 
45 | * spark-worker:3.5.0: An image based on the previously created Spark base image, used to create Spark worker containers.
46 | 
47 | * spark-submit:3.5.0: An image based on the previously created Spark base image, used to create spark-submit containers (run, deliver the driver, and die gracefully).
48 | 
49 | ## Run the docker-compose
50 | 
51 | The final step to create your test cluster is to run the compose file:
52 | 
53 | ```sh
54 | docker-compose up --scale spark-worker=3
55 | ```
56 | 
57 | ## Validate your cluster
58 | 
59 | Validate your cluster by accessing the Spark UI at the master and worker URLs.
60 | 
61 | ### Spark Master
62 | 
63 | http://10.5.0.2:8080/
64 | 
65 | ![alt text](docs/spark-master.png "Spark master UI")
66 | 
67 | ### Spark Worker 1
68 | 
69 | http://10.5.0.3:8081/
70 | 
71 | ![alt text](docs/spark-worker-1.png "Spark worker 1 UI")
72 | 
73 | ### Spark Worker 2
74 | 
75 | http://10.5.0.4:8081/
76 | 
77 | ![alt text](docs/spark-worker-2.png "Spark worker 2 UI")
78 | 
79 | ### Spark Worker 3
80 | 
81 | http://10.5.0.5:8081/
82 | 
83 | ![alt text](docs/spark-worker-3.png "Spark worker 3 UI")
84 | 
85 | # Resource Allocation
86 | 
87 | This cluster ships with three workers and one Spark master, each with a particular resource allocation (basically RAM & CPU core allocation).
88 | 
89 | * The default CPU core allocation for each Spark worker is 1 core.
90 | 
91 | * The default RAM for each spark-worker is 1024 MB.
92 | 
93 | * The default RAM allocation for Spark executors is 256 MB.
94 | 
95 | * The default RAM allocation for the Spark driver is 128 MB.
96 | 
97 | * If you wish to modify these allocations, just edit the env/spark-worker.sh file (see the example below).
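For instance, here is a minimal sketch (my example values, not shipped defaults) that doubles each worker's resources via the standard Spark standalone environment variables:

```sh
# env/spark-worker.sh -- example override; tune to your machine's resources
SPARK_WORKER_CORES=2       # CPU cores per worker (default: 1)
SPARK_WORKER_MEMORY=2G     # RAM per worker (default: 1024 MB)
SPARK_DRIVER_MEMORY=256m   # RAM for the driver (default: 128 MB)
SPARK_EXECUTOR_MEMORY=512m # RAM per executor (default: 256 MB)
```

Restart the cluster (docker-compose down, then docker-compose up --scale spark-worker=3 again) for the new values to take effect.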
98 | 
99 | # Mounted Volumes
100 | 
101 | To make running apps easier, I've shipped two volume mounts, described in the following chart:
102 | 
103 | Host Mount|Container Mount|Purpose
104 | ---|---|---
105 | /mnt/spark-apps|/opt/spark-apps|Used to make your app's jars available on all workers & the master
106 | /mnt/spark-data|/opt/spark-data|Used to make your app's data available on all workers & the master
107 | 
108 | This is basically a dummy DFS created from Docker volumes... (well, maybe not quite).
109 | 
110 | # Run a sample application
111 | 
112 | Now let's make a **wild spark-submit** to validate the distributed nature of our new toy, following these steps:
113 | 
114 | ## Create a Scala spark app
115 | 
116 | The first thing you need to do is make a Spark application. Our spark-submit image is designed to run Scala code (PySpark support will ship soon; I guess I was just too lazy to add it).
117 | 
118 | In my case I am using an app called [crimes-app](https://). You can make or use your own Scala app; I've just used this one because I had it at hand.
119 | 
120 | 
121 | ## Ship your jar & dependencies to the Workers and Master
122 | 
123 | A necessary step before a **spark-submit** is to copy your application bundle onto all workers, along with any configuration or input files it needs.
124 | 
125 | Luckily for us, we are using Docker volumes, so you just have to copy your app and configs into /mnt/spark-apps, and your input files into /mnt/spark-data.
126 | 
127 | ```bash
128 | # Copy the spark application into all workers' app folder
129 | cp /home/workspace/crimes-app/build/libs/crimes-app.jar /mnt/spark-apps
130 | 
131 | # Copy the spark application configs into all workers' app folder
132 | cp -r /home/workspace/crimes-app/config /mnt/spark-apps
133 | 
134 | # Copy the file to be processed to all workers' data folder
135 | cp /home/Crimes_-_2001_to_present.csv /mnt/spark-data
136 | ```
137 | 
138 | ## Check the successful copy of the data and app jar (Optional)
139 | 
140 | This is not a necessary step, but if you are curious you can check that your app code and files are in place before running the spark-submit.
141 | 
142 | ```sh
143 | # Worker 1 Validations
144 | docker exec -ti spark-worker-1 ls -l /opt/spark-apps
145 | 
146 | docker exec -ti spark-worker-1 ls -l /opt/spark-data
147 | 
148 | # Worker 2 Validations
149 | docker exec -ti spark-worker-2 ls -l /opt/spark-apps
150 | 
151 | docker exec -ti spark-worker-2 ls -l /opt/spark-data
152 | 
153 | # Worker 3 Validations
154 | docker exec -ti spark-worker-3 ls -l /opt/spark-apps
155 | 
156 | docker exec -ti spark-worker-3 ls -l /opt/spark-data
157 | ```
158 | After running any of these commands you should see your app's jar and files.
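If you'd rather check all three workers in one go, a small loop over the container names listed above does the same job (a convenience sketch, not part of the original scripts):

```bash
# Check the app jar and data files on every worker in one pass
for w in spark-worker-1 spark-worker-2 spark-worker-3; do
  echo "== $w =="
  docker exec -ti "$w" ls -l /opt/spark-apps /opt/spark-data
done
```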
159 | 
160 | 
161 | ## Use docker spark-submit
162 | 
163 | ```bash
164 | # Create some variables to make the docker run command more readable
165 | # App jar location used by the spark-submit image
166 | SPARK_APPLICATION_JAR_LOCATION="/opt/spark-apps/crimes-app.jar"
167 | # App main class used by the spark-submit image
168 | SPARK_APPLICATION_MAIN_CLASS="org.mvb.applications.CrimesApp"
169 | # Extra submit args used by the spark-submit image
170 | SPARK_SUBMIT_ARGS="--conf spark.executor.extraJavaOptions='-Dconfig-path=/opt/spark-apps/dev/config.conf'"
171 | 
172 | # We have to use the same network as the spark cluster (internally, the image resolves the Spark master as spark://spark-master:7077)
173 | docker run --network docker-spark-cluster_spark-network \
174 | -v /mnt/spark-apps:/opt/spark-apps \
175 | --env SPARK_APPLICATION_JAR_LOCATION=$SPARK_APPLICATION_JAR_LOCATION \
176 | --env SPARK_APPLICATION_MAIN_CLASS=$SPARK_APPLICATION_MAIN_CLASS \
177 | --env SPARK_SUBMIT_ARGS="$SPARK_SUBMIT_ARGS" \
178 | spark-submit:3.5.0
179 | ```
180 | 
181 | After running this you will see output pretty much like this:
182 | 
183 | ```bash
184 | Running Spark using the REST application submission protocol.
185 | 2018-09-23 15:17:52 INFO RestSubmissionClient:54 - Submitting a request to launch an application in spark://spark-master:6066.
186 | 2018-09-23 15:17:53 INFO RestSubmissionClient:54 - Submission successfully created as driver-20180923151753-0000. Polling submission state...
187 | 2018-09-23 15:17:53 INFO RestSubmissionClient:54 - Submitting a request for the status of submission driver-20180923151753-0000 in spark://spark-master:6066.
188 | 2018-09-23 15:17:53 INFO RestSubmissionClient:54 - State of driver driver-20180923151753-0000 is now RUNNING.
189 | 2018-09-23 15:17:53 INFO RestSubmissionClient:54 - Driver is running on worker worker-20180923151711-10.5.0.4-45381 at 10.5.0.4:45381.
190 | 2018-09-23 15:17:53 INFO RestSubmissionClient:54 - Server responded with CreateSubmissionResponse:
191 | {
192 | "action" : "CreateSubmissionResponse",
193 | "message" : "Driver successfully submitted as driver-20180923151753-0000",
194 | "serverSparkVersion" : "3.5.0",
195 | "submissionId" : "driver-20180923151753-0000",
196 | "success" : true
197 | }
198 | ```
199 | 
200 | # Summary (What have I done :O?)
201 | 
202 | * We compiled the necessary Docker images to run Spark master and worker containers.
203 | 
204 | * We created a Spark standalone cluster using 3 worker nodes and 1 master node, using Docker & docker-compose.
205 | 
206 | * We copied the resources necessary to run a sample application.
207 | 
208 | * We submitted an application to the cluster using a **spark-submit** Docker image.
209 | 
210 | * We ran a distributed application at home (you just need enough CPU cores and RAM to do so).
211 | 
212 | # Why a standalone cluster?
213 | 
214 | * This is intended for testing purposes: basically, a way of running distributed Spark apps on your laptop or desktop.
215 | 
216 | * Right now I don't have enough resources to set up a YARN, Mesos or Kubernetes based cluster :(.
217 | 
218 | * This will be useful for building CI/CD pipelines for your Spark apps (a really difficult and hot topic).
219 | 
--------------------------------------------------------------------------------
/src/main/scala/generator/DataGenerator.scala:
--------------------------------------------------------------------------------
1 | package generator
2 | 
3 | import java.io.{File, FileWriter, PrintWriter}
4 | 
5 | import scala.annotation.tailrec
6 | import scala.io.Source
7 | import scala.util.Random
8 | 
9 | object DataGenerator {
10 | 
11 |   val random = new Random()
12 | 
13 |   /////////////////////////////////////////////////////////////////////////////////
14 |   // General data generation
15 |   /////////////////////////////////////////////////////////////////////////////////
16 | 
17 |   def randomDouble(limit: Double): Double = random.nextDouble() * limit
18 | 
19 |   def randomLong(limit: Long = Long.MaxValue): Long = Math.abs(random.nextLong()) % limit
20 | 
21 |   def randomInt(limit: Int = Int.MaxValue): Int = random.nextInt(limit)
22 | 
23 |   def randomIntBetween(low: Int, high: Int) = {
24 |     assert(low <= high)
25 |     random.nextInt(high - low) + low
26 |   }
27 | 
28 |   def randomString(n: Int) =
29 |     new String((1 to n).map(_ => ('a' + random.nextInt(26)).toChar).toArray) // exactly n characters (was `0 to n`, which produced n + 1)
30 | 
31 |   /////////////////////////////////////////////////////////////////////////////////
32 |   // Laptop models generation - skewed data lectures
33 |   /////////////////////////////////////////////////////////////////////////////////
34 | 
35 |   val laptopModelsSet: Seq[LaptopModel] = Seq(
36 |     LaptopModel("Razer", "Blade"),
37 |     LaptopModel("Alienware", "Area-51"),
38 |     LaptopModel("HP", "Omen"),
39 |     LaptopModel("Acer", "Predator"),
40 |     LaptopModel("Asus", "ROG"),
41 |     LaptopModel("Lenovo", "Legion"),
42 |     LaptopModel("MSI", "Raider")
43 |   )
44 | 
45 |   def randomLaptopModel(uniform: Boolean = false): LaptopModel = {
46 |     val makeModelIndex = if (!uniform && random.nextBoolean()) 0 else random.nextInt(laptopModelsSet.size) // 50% of the data is of the first kind
47 |     laptopModelsSet(makeModelIndex)
48 |   }
49 | 
50 |   def randomProcSpeed() = s"3.${random.nextInt(9)}".toDouble
51 | 
52 |   def randomRegistration(): String = s"${random.alphanumeric.take(7).mkString("")}"
53 | 
54 |   def randomPrice() = 500 + random.nextInt(1500)
55 | 
56 |   def randomLaptop(uniformDist: Boolean = false): Laptop = {
57 |     val makeModel = randomLaptopModel(uniformDist) // was randomLaptopModel(), which ignored uniformDist
58 |     Laptop(randomRegistration(), makeModel.make, makeModel.model, randomProcSpeed())
59 |   }
60 | 
61 |   def randomLaptopOffer(uniformDist: Boolean = false): LaptopOffer = {
62 |     val makeModel = randomLaptopModel(uniformDist) // was randomLaptopModel(), which ignored uniformDist
63 |     LaptopOffer(makeModel.make, makeModel.model, randomProcSpeed(), randomPrice())
64 |   }
65 | 
66 |   /////////////////////////////////////////////////////////////////////////////////
67 |   // Misc data generation
68 |   /////////////////////////////////////////////////////////////////////////////////
69 | 
70 |   /**
71 |    * For the iterator-to-iterator transformations lecture.
72 |    * Generates a number of metrics in the style of "metricName metricValue", where metricName is a string and metricValue is a double.
73 |    *
74 |    * @param destPath the path of the file the metrics will be written to.
75 |    * @param nMetrics the number of metrics to generate
76 |    * @param limit the maximum value any metric can take
77 |    */
78 |   def generateMetrics(destPath: String, nMetrics: Int, limit: Double = 1000000) = {
79 |     val writer = new PrintWriter(new FileWriter(new File(destPath)))
80 |     (1 to nMetrics).foreach(_ => writer.println(s"${randomString(16)} ${randomDouble(limit)}")) // was randomDouble(1000000), which ignored the limit parameter
81 |     writer.flush()
82 |     writer.close()
83 |   }
84 | 
85 |   /**
86 |    * For the RDD joins & cogroup lectures. Generates 3 files:
87 |    * 1) with student IDs and names
88 |    * 2) with student IDs and emails
89 |    * 3) with student IDs and exam attempt grades
90 |    *
91 |    * @param rootFolderPath the path where the 3 files will be written
92 |    * @param nStudents the number of students
93 |    * @param nAttempts the number of exam attempts per student
94 |    */
95 |   def generateExamData(rootFolderPath: String, nStudents: Int, nAttempts: Int): Unit = {
96 |     val studentNames = (1 to nStudents).map(_ => randomString(16)) // exactly nStudents names (was `0 to nStudents`)
97 |     val studentIds = studentNames.map(_ => randomLong())
98 |     val idWriter = new PrintWriter(new FileWriter(new File(s"$rootFolderPath/examIds.txt")))
99 |     val emailWriter = new PrintWriter(new FileWriter(new File(s"$rootFolderPath/examEmails.txt")))
100 |     val scoreWriter = new PrintWriter(new FileWriter(new File(s"$rootFolderPath/examScores.txt")))
101 | 
102 |     studentNames
103 |       .zip(studentIds)
104 |       .foreach {
105 |         case (name, id) =>
106 |           idWriter.println(s"$id $name")
107 |           emailWriter.println(s"$id $name@rockthejvm.com")
108 |       }
109 | 
110 |     val scores = studentIds
111 |       .flatMap(id => Seq.fill(nAttempts)(id)) // was Seq.fill(5), which ignored nAttempts
112 |       .map(id => (id, randomInt(10), randomInt(10)))
113 |       .toSet
114 | 
115 |     scores.foreach {
116 |       case (id, scoreMaj, scoreMin) => scoreWriter.println(s"$id $scoreMaj.$scoreMin")
117 |     }
118 | 
119 |     idWriter.flush()
120 |     idWriter.close()
121 |     emailWriter.flush()
122 |     emailWriter.close()
123 |     scoreWriter.flush()
124 |     scoreWriter.close()
125 |   }
126 | 
127 |   /**
128 |    * For the Secondary Sort lesson.
129 |    * Generates random person encounters as key-value pairs in a CSV file.
130 |    * The key is the person identifier and the value is the distance to the closest person, as measured by a hypothetical "approach device".
131 |    *
132 |    * @param path the file path to write the data to
133 |    * @param nPeople the number of people involved in the data
134 |    * @param nValuesPerPerson the number of encounters per person
135 |    * @param skew the fraction (between 0 and 1) of the data that belongs to a single person
136 |    */
137 |   def generatePeopleEncounters(path: String, nPeople: Int, nValuesPerPerson: Int, skew: Double = 0): Unit = {
138 |     val writer = new PrintWriter(new FileWriter(new File(path)))
139 |     val nEntries = nPeople * nValuesPerPerson
140 | 
141 |     writer.println("personId,approachValue")
142 |     (1 to nEntries).foreach { _ =>
143 |       val personIndex = if (random.nextDouble() < skew) 0 else 1 + random.nextInt(nPeople)
144 |       val approachValue = 10000 * random.nextDouble()
145 | 
146 |       writer.println(s"person_$personIndex,$approachValue")
147 |     }
148 | 
149 |     writer.flush()
150 |     writer.close()
151 |   }
152 | 
153 |   /**
154 |    * A function which generates random text in lorem-ipsum fashion, in chunks, as normal "paragraphs".
155 |    * Supports an optional senders argument to attach every paragraph to a sender/broker ID for key-value crunching.
156 |    * If the number of senders is positive, then each paragraph will have a prefix "senderID // ", where senderID is randomly picked between 1 and nSenders.
157 | * 158 | * @param dstPath the file path where you want to write the text 159 | * @param nWords the number of words 160 | * @param nParagraphs the number of lines 161 | * @param nSenders (default 0) the number of unique senders 162 | */ 163 | def generateText(dstPath: String, nWords: Int, nParagraphs: Int, nSenders: Int = 0): Unit = { 164 | assert(nSenders >= 0) 165 | assert(nWords > 1) 166 | assert(nParagraphs > 0) 167 | 168 | val words = Source.fromFile("src/main/resources/data/lipsum/words.txt").getLines().toSeq 169 | val numWords = words.length 170 | 171 | def pickRandomWord(isLast: Boolean = false) = 172 | words(random.nextInt(numWords)) + (if (!isLast && random.nextInt() % 5 == 0) "," else "") 173 | 174 | val lowSentenceLimit = 2 175 | val highSentenceLimit = 14 176 | val avgParLength = nWords / nParagraphs 177 | val lowParLimit = avgParLength / 2 178 | val highParLimit = avgParLength * 3 / 2 179 | val writer = new PrintWriter(new FileWriter(new File(dstPath))) 180 | 181 | @tailrec 182 | def generateLipsumRec(nWords: Int, nParagraphs: Int, nWordsInParagraph: Int, attachSender: Boolean = false): Unit = { 183 | val sentenceLength = 184 | if (nWordsInParagraph < highSentenceLimit) nWordsInParagraph 185 | else randomIntBetween(lowSentenceLimit, highSentenceLimit) 186 | 187 | val ending = if (sentenceLength == nWordsInParagraph) "." else ". " 188 | val sentence = ((1 until sentenceLength).map(_ => pickRandomWord()) :+ pickRandomWord(true)).mkString(" ") + ending 189 | 190 | if (attachSender) { 191 | val sender = (randomInt(nSenders) + 1) + " // " 192 | writer.print(sender) 193 | } 194 | writer.print(sentence.capitalize) 195 | 196 | val nWordsInParagraphLeft = nWordsInParagraph - sentenceLength 197 | val nParagraphsLeft = nParagraphs - 1 198 | 199 | if (nWordsInParagraphLeft == 0) { 200 | if (nParagraphsLeft > 0) { 201 | val nWordsLeft = nWords - sentenceLength 202 | val nextParLength = 203 | if (nParagraphsLeft == 1) nWordsLeft 204 | else randomIntBetween(lowParLimit, highParLimit) 205 | 206 | writer.print("\n") 207 | generateLipsumRec(nWords - sentenceLength, nParagraphsLeft, nextParLength, nSenders > 0) 208 | } 209 | } else { 210 | generateLipsumRec(nWords - sentenceLength, nParagraphs, nWordsInParagraphLeft) 211 | } 212 | } 213 | 214 | val nWordsInFirstParagraph = if (nWords < highParLimit) nWords else randomIntBetween(lowParLimit, highParLimit) 215 | 216 | generateLipsumRec(nWords, nParagraphs, nWordsInFirstParagraph, nSenders > 0) 217 | writer.flush() 218 | writer.close() 219 | } 220 | 221 | def main(args: Array[String]): Unit = { 222 | generateExamData("src/main/resources/data/studentgen", 100000, 5) 223 | } 224 | } -------------------------------------------------------------------------------- /src/main/resources/data/cars/cars.json: -------------------------------------------------------------------------------- 1 | {"Name":"chevrolet chevelle malibu", "Miles_per_Gallon":18, "Cylinders":8, "Displacement":307, "Horsepower":130, "Weight_in_lbs":3504, "Acceleration":12, "Year":"1970-01-01", "Origin":"USA"} 2 | {"Name":"buick skylark 320", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":350, "Horsepower":165, "Weight_in_lbs":3693, "Acceleration":11.5, "Year":"1970-01-01", "Origin":"USA"} 3 | {"Name":"plymouth satellite", "Miles_per_Gallon":18, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":3436, "Acceleration":11, "Year":"1970-01-01", "Origin":"USA"} 4 | {"Name":"amc rebel sst", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":304, "Horsepower":150, 
"Weight_in_lbs":3433, "Acceleration":12, "Year":"1970-01-01", "Origin":"USA"} 5 | {"Name":"ford torino", "Miles_per_Gallon":17, "Cylinders":8, "Displacement":302, "Horsepower":140, "Weight_in_lbs":3449, "Acceleration":10.5, "Year":"1970-01-01", "Origin":"USA"} 6 | {"Name":"ford galaxie 500", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":429, "Horsepower":198, "Weight_in_lbs":4341, "Acceleration":10, "Year":"1970-01-01", "Origin":"USA"} 7 | {"Name":"chevrolet impala", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":454, "Horsepower":220, "Weight_in_lbs":4354, "Acceleration":9, "Year":"1970-01-01", "Origin":"USA"} 8 | {"Name":"plymouth fury iii", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":440, "Horsepower":215, "Weight_in_lbs":4312, "Acceleration":8.5, "Year":"1970-01-01", "Origin":"USA"} 9 | {"Name":"pontiac catalina", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":455, "Horsepower":225, "Weight_in_lbs":4425, "Acceleration":10, "Year":"1970-01-01", "Origin":"USA"} 10 | {"Name":"amc ambassador dpl", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":390, "Horsepower":190, "Weight_in_lbs":3850, "Acceleration":8.5, "Year":"1970-01-01", "Origin":"USA"} 11 | {"Name":"citroen ds-21 pallas", "Miles_per_Gallon":null, "Cylinders":4, "Displacement":133, "Horsepower":115, "Weight_in_lbs":3090, "Acceleration":17.5, "Year":"1970-01-01", "Origin":"Europe"} 12 | {"Name":"chevrolet chevelle concours (sw)", "Miles_per_Gallon":null, "Cylinders":8, "Displacement":350, "Horsepower":165, "Weight_in_lbs":4142, "Acceleration":11.5, "Year":"1970-01-01", "Origin":"USA"} 13 | {"Name":"ford torino (sw)", "Miles_per_Gallon":null, "Cylinders":8, "Displacement":351, "Horsepower":153, "Weight_in_lbs":4034, "Acceleration":11, "Year":"1970-01-01", "Origin":"USA"} 14 | {"Name":"plymouth satellite (sw)", "Miles_per_Gallon":null, "Cylinders":8, "Displacement":383, "Horsepower":175, "Weight_in_lbs":4166, "Acceleration":10.5, "Year":"1970-01-01", "Origin":"USA"} 15 | {"Name":"amc rebel sst (sw)", "Miles_per_Gallon":null, "Cylinders":8, "Displacement":360, "Horsepower":175, "Weight_in_lbs":3850, "Acceleration":11, "Year":"1970-01-01", "Origin":"USA"} 16 | {"Name":"dodge challenger se", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":383, "Horsepower":170, "Weight_in_lbs":3563, "Acceleration":10, "Year":"1970-01-01", "Origin":"USA"} 17 | {"Name":"plymouth 'cuda 340", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":340, "Horsepower":160, "Weight_in_lbs":3609, "Acceleration":8, "Year":"1970-01-01", "Origin":"USA"} 18 | {"Name":"ford mustang boss 302", "Miles_per_Gallon":null, "Cylinders":8, "Displacement":302, "Horsepower":140, "Weight_in_lbs":3353, "Acceleration":8, "Year":"1970-01-01", "Origin":"USA"} 19 | {"Name":"chevrolet monte carlo", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":400, "Horsepower":150, "Weight_in_lbs":3761, "Acceleration":9.5, "Year":"1970-01-01", "Origin":"USA"} 20 | {"Name":"buick estate wagon (sw)", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":455, "Horsepower":225, "Weight_in_lbs":3086, "Acceleration":10, "Year":"1970-01-01", "Origin":"USA"} 21 | {"Name":"toyota corona mark ii", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":113, "Horsepower":95, "Weight_in_lbs":2372, "Acceleration":15, "Year":"1970-01-01", "Origin":"Japan"} 22 | {"Name":"plymouth duster", "Miles_per_Gallon":22, "Cylinders":6, "Displacement":198, "Horsepower":95, "Weight_in_lbs":2833, "Acceleration":15.5, "Year":"1970-01-01", "Origin":"USA"} 23 | {"Name":"amc hornet", 
"Miles_per_Gallon":18, "Cylinders":6, "Displacement":199, "Horsepower":97, "Weight_in_lbs":2774, "Acceleration":15.5, "Year":"1970-01-01", "Origin":"USA"} 24 | {"Name":"ford maverick", "Miles_per_Gallon":21, "Cylinders":6, "Displacement":200, "Horsepower":85, "Weight_in_lbs":2587, "Acceleration":16, "Year":"1970-01-01", "Origin":"USA"} 25 | {"Name":"datsun pl510", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":97, "Horsepower":88, "Weight_in_lbs":2130, "Acceleration":14.5, "Year":"1970-01-01", "Origin":"Japan"} 26 | {"Name":"volkswagen 1131 deluxe sedan", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":97, "Horsepower":46, "Weight_in_lbs":1835, "Acceleration":20.5, "Year":"1970-01-01", "Origin":"Europe"} 27 | {"Name":"peugeot 504", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":110, "Horsepower":87, "Weight_in_lbs":2672, "Acceleration":17.5, "Year":"1970-01-01", "Origin":"Europe"} 28 | {"Name":"audi 100 ls", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":107, "Horsepower":90, "Weight_in_lbs":2430, "Acceleration":14.5, "Year":"1970-01-01", "Origin":"Europe"} 29 | {"Name":"saab 99e", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":104, "Horsepower":95, "Weight_in_lbs":2375, "Acceleration":17.5, "Year":"1970-01-01", "Origin":"Europe"} 30 | {"Name":"bmw 2002", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":121, "Horsepower":113, "Weight_in_lbs":2234, "Acceleration":12.5, "Year":"1970-01-01", "Origin":"Europe"} 31 | {"Name":"amc gremlin", "Miles_per_Gallon":21, "Cylinders":6, "Displacement":199, "Horsepower":90, "Weight_in_lbs":2648, "Acceleration":15, "Year":"1970-01-01", "Origin":"USA"} 32 | {"Name":"ford f250", "Miles_per_Gallon":10, "Cylinders":8, "Displacement":360, "Horsepower":215, "Weight_in_lbs":4615, "Acceleration":14, "Year":"1970-01-01", "Origin":"USA"} 33 | {"Name":"chevy c20", "Miles_per_Gallon":10, "Cylinders":8, "Displacement":307, "Horsepower":200, "Weight_in_lbs":4376, "Acceleration":15, "Year":"1970-01-01", "Origin":"USA"} 34 | {"Name":"dodge d200", "Miles_per_Gallon":11, "Cylinders":8, "Displacement":318, "Horsepower":210, "Weight_in_lbs":4382, "Acceleration":13.5, "Year":"1970-01-01", "Origin":"USA"} 35 | {"Name":"hi 1200d", "Miles_per_Gallon":9, "Cylinders":8, "Displacement":304, "Horsepower":193, "Weight_in_lbs":4732, "Acceleration":18.5, "Year":"1970-01-01", "Origin":"USA"} 36 | {"Name":"datsun pl510", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":97, "Horsepower":88, "Weight_in_lbs":2130, "Acceleration":14.5, "Year":"1971-01-01", "Origin":"Japan"} 37 | {"Name":"chevrolet vega 2300", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":140, "Horsepower":90, "Weight_in_lbs":2264, "Acceleration":15.5, "Year":"1971-01-01", "Origin":"USA"} 38 | {"Name":"toyota corona", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":113, "Horsepower":95, "Weight_in_lbs":2228, "Acceleration":14, "Year":"1971-01-01", "Origin":"Japan"} 39 | {"Name":"ford pinto", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":98, "Horsepower":null, "Weight_in_lbs":2046, "Acceleration":19, "Year":"1971-01-01", "Origin":"USA"} 40 | {"Name":"volkswagen super beetle 117", "Miles_per_Gallon":null, "Cylinders":4, "Displacement":97, "Horsepower":48, "Weight_in_lbs":1978, "Acceleration":20, "Year":"1971-01-01", "Origin":"Europe"} 41 | {"Name":"amc gremlin", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":232, "Horsepower":100, "Weight_in_lbs":2634, "Acceleration":13, "Year":"1971-01-01", "Origin":"USA"} 42 | {"Name":"plymouth satellite custom", 
"Miles_per_Gallon":16, "Cylinders":6, "Displacement":225, "Horsepower":105, "Weight_in_lbs":3439, "Acceleration":15.5, "Year":"1971-01-01", "Origin":"USA"} 43 | {"Name":"chevrolet chevelle malibu", "Miles_per_Gallon":17, "Cylinders":6, "Displacement":250, "Horsepower":100, "Weight_in_lbs":3329, "Acceleration":15.5, "Year":"1971-01-01", "Origin":"USA"} 44 | {"Name":"ford torino 500", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":250, "Horsepower":88, "Weight_in_lbs":3302, "Acceleration":15.5, "Year":"1971-01-01", "Origin":"USA"} 45 | {"Name":"amc matador", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":232, "Horsepower":100, "Weight_in_lbs":3288, "Acceleration":15.5, "Year":"1971-01-01", "Origin":"USA"} 46 | {"Name":"chevrolet impala", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":350, "Horsepower":165, "Weight_in_lbs":4209, "Acceleration":12, "Year":"1971-01-01", "Origin":"USA"} 47 | {"Name":"pontiac catalina brougham", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":400, "Horsepower":175, "Weight_in_lbs":4464, "Acceleration":11.5, "Year":"1971-01-01", "Origin":"USA"} 48 | {"Name":"ford galaxie 500", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":351, "Horsepower":153, "Weight_in_lbs":4154, "Acceleration":13.5, "Year":"1971-01-01", "Origin":"USA"} 49 | {"Name":"plymouth fury iii", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":4096, "Acceleration":13, "Year":"1971-01-01", "Origin":"USA"} 50 | {"Name":"dodge monaco (sw)", "Miles_per_Gallon":12, "Cylinders":8, "Displacement":383, "Horsepower":180, "Weight_in_lbs":4955, "Acceleration":11.5, "Year":"1971-01-01", "Origin":"USA"} 51 | {"Name":"ford country squire (sw)", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":400, "Horsepower":170, "Weight_in_lbs":4746, "Acceleration":12, "Year":"1971-01-01", "Origin":"USA"} 52 | {"Name":"pontiac safari (sw)", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":400, "Horsepower":175, "Weight_in_lbs":5140, "Acceleration":12, "Year":"1971-01-01", "Origin":"USA"} 53 | {"Name":"amc hornet sportabout (sw)", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":258, "Horsepower":110, "Weight_in_lbs":2962, "Acceleration":13.5, "Year":"1971-01-01", "Origin":"USA"} 54 | {"Name":"chevrolet vega (sw)", "Miles_per_Gallon":22, "Cylinders":4, "Displacement":140, "Horsepower":72, "Weight_in_lbs":2408, "Acceleration":19, "Year":"1971-01-01", "Origin":"USA"} 55 | {"Name":"pontiac firebird", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":250, "Horsepower":100, "Weight_in_lbs":3282, "Acceleration":15, "Year":"1971-01-01", "Origin":"USA"} 56 | {"Name":"ford mustang", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":250, "Horsepower":88, "Weight_in_lbs":3139, "Acceleration":14.5, "Year":"1971-01-01", "Origin":"USA"} 57 | {"Name":"mercury capri 2000", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":122, "Horsepower":86, "Weight_in_lbs":2220, "Acceleration":14, "Year":"1971-01-01", "Origin":"USA"} 58 | {"Name":"opel 1900", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":116, "Horsepower":90, "Weight_in_lbs":2123, "Acceleration":14, "Year":"1971-01-01", "Origin":"Europe"} 59 | {"Name":"peugeot 304", "Miles_per_Gallon":30, "Cylinders":4, "Displacement":79, "Horsepower":70, "Weight_in_lbs":2074, "Acceleration":19.5, "Year":"1971-01-01", "Origin":"Europe"} 60 | {"Name":"fiat 124b", "Miles_per_Gallon":30, "Cylinders":4, "Displacement":88, "Horsepower":76, "Weight_in_lbs":2065, "Acceleration":14.5, "Year":"1971-01-01", 
"Origin":"Europe"} 61 | {"Name":"toyota corolla 1200", "Miles_per_Gallon":31, "Cylinders":4, "Displacement":71, "Horsepower":65, "Weight_in_lbs":1773, "Acceleration":19, "Year":"1971-01-01", "Origin":"Japan"} 62 | {"Name":"datsun 1200", "Miles_per_Gallon":35, "Cylinders":4, "Displacement":72, "Horsepower":69, "Weight_in_lbs":1613, "Acceleration":18, "Year":"1971-01-01", "Origin":"Japan"} 63 | {"Name":"volkswagen model 111", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":97, "Horsepower":60, "Weight_in_lbs":1834, "Acceleration":19, "Year":"1971-01-01", "Origin":"Europe"} 64 | {"Name":"plymouth cricket", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":91, "Horsepower":70, "Weight_in_lbs":1955, "Acceleration":20.5, "Year":"1971-01-01", "Origin":"USA"} 65 | {"Name":"toyota corona hardtop", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":113, "Horsepower":95, "Weight_in_lbs":2278, "Acceleration":15.5, "Year":"1972-01-01", "Origin":"Japan"} 66 | {"Name":"dodge colt hardtop", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":97.5, "Horsepower":80, "Weight_in_lbs":2126, "Acceleration":17, "Year":"1972-01-01", "Origin":"USA"} 67 | {"Name":"volkswagen type 3", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":97, "Horsepower":54, "Weight_in_lbs":2254, "Acceleration":23.5, "Year":"1972-01-01", "Origin":"Europe"} 68 | {"Name":"chevrolet vega", "Miles_per_Gallon":20, "Cylinders":4, "Displacement":140, "Horsepower":90, "Weight_in_lbs":2408, "Acceleration":19.5, "Year":"1972-01-01", "Origin":"USA"} 69 | {"Name":"ford pinto runabout", "Miles_per_Gallon":21, "Cylinders":4, "Displacement":122, "Horsepower":86, "Weight_in_lbs":2226, "Acceleration":16.5, "Year":"1972-01-01", "Origin":"USA"} 70 | {"Name":"chevrolet impala", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":350, "Horsepower":165, "Weight_in_lbs":4274, "Acceleration":12, "Year":"1972-01-01", "Origin":"USA"} 71 | {"Name":"pontiac catalina", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":400, "Horsepower":175, "Weight_in_lbs":4385, "Acceleration":12, "Year":"1972-01-01", "Origin":"USA"} 72 | {"Name":"plymouth fury iii", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":4135, "Acceleration":13.5, "Year":"1972-01-01", "Origin":"USA"} 73 | {"Name":"ford galaxie 500", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":351, "Horsepower":153, "Weight_in_lbs":4129, "Acceleration":13, "Year":"1972-01-01", "Origin":"USA"} 74 | {"Name":"amc ambassador sst", "Miles_per_Gallon":17, "Cylinders":8, "Displacement":304, "Horsepower":150, "Weight_in_lbs":3672, "Acceleration":11.5, "Year":"1972-01-01", "Origin":"USA"} 75 | {"Name":"mercury marquis", "Miles_per_Gallon":11, "Cylinders":8, "Displacement":429, "Horsepower":208, "Weight_in_lbs":4633, "Acceleration":11, "Year":"1972-01-01", "Origin":"USA"} 76 | {"Name":"buick lesabre custom", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":350, "Horsepower":155, "Weight_in_lbs":4502, "Acceleration":13.5, "Year":"1972-01-01", "Origin":"USA"} 77 | {"Name":"oldsmobile delta 88 royale", "Miles_per_Gallon":12, "Cylinders":8, "Displacement":350, "Horsepower":160, "Weight_in_lbs":4456, "Acceleration":13.5, "Year":"1972-01-01", "Origin":"USA"} 78 | {"Name":"chrysler newport royal", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":400, "Horsepower":190, "Weight_in_lbs":4422, "Acceleration":12.5, "Year":"1972-01-01", "Origin":"USA"} 79 | {"Name":"mazda rx2 coupe", "Miles_per_Gallon":19, "Cylinders":3, "Displacement":70, "Horsepower":97, 
"Weight_in_lbs":2330, "Acceleration":13.5, "Year":"1972-01-01", "Origin":"Japan"} 80 | {"Name":"amc matador (sw)", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":304, "Horsepower":150, "Weight_in_lbs":3892, "Acceleration":12.5, "Year":"1972-01-01", "Origin":"USA"} 81 | {"Name":"chevrolet chevelle concours (sw)", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":307, "Horsepower":130, "Weight_in_lbs":4098, "Acceleration":14, "Year":"1972-01-01", "Origin":"USA"} 82 | {"Name":"ford gran torino (sw)", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":302, "Horsepower":140, "Weight_in_lbs":4294, "Acceleration":16, "Year":"1972-01-01", "Origin":"USA"} 83 | {"Name":"plymouth satellite custom (sw)", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":4077, "Acceleration":14, "Year":"1972-01-01", "Origin":"USA"} 84 | {"Name":"volvo 145e (sw)", "Miles_per_Gallon":18, "Cylinders":4, "Displacement":121, "Horsepower":112, "Weight_in_lbs":2933, "Acceleration":14.5, "Year":"1972-01-01", "Origin":"Europe"} 85 | {"Name":"volkswagen 411 (sw)", "Miles_per_Gallon":22, "Cylinders":4, "Displacement":121, "Horsepower":76, "Weight_in_lbs":2511, "Acceleration":18, "Year":"1972-01-01", "Origin":"Europe"} 86 | {"Name":"peugeot 504 (sw)", "Miles_per_Gallon":21, "Cylinders":4, "Displacement":120, "Horsepower":87, "Weight_in_lbs":2979, "Acceleration":19.5, "Year":"1972-01-01", "Origin":"Europe"} 87 | {"Name":"renault 12 (sw)", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":96, "Horsepower":69, "Weight_in_lbs":2189, "Acceleration":18, "Year":"1972-01-01", "Origin":"Europe"} 88 | {"Name":"ford pinto (sw)", "Miles_per_Gallon":22, "Cylinders":4, "Displacement":122, "Horsepower":86, "Weight_in_lbs":2395, "Acceleration":16, "Year":"1972-01-01", "Origin":"USA"} 89 | {"Name":"datsun 510 (sw)", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":97, "Horsepower":92, "Weight_in_lbs":2288, "Acceleration":17, "Year":"1972-01-01", "Origin":"Japan"} 90 | {"Name":"toyouta corona mark ii (sw)", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":120, "Horsepower":97, "Weight_in_lbs":2506, "Acceleration":14.5, "Year":"1972-01-01", "Origin":"Japan"} 91 | {"Name":"dodge colt (sw)", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":98, "Horsepower":80, "Weight_in_lbs":2164, "Acceleration":15, "Year":"1972-01-01", "Origin":"USA"} 92 | {"Name":"toyota corolla 1600 (sw)", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":97, "Horsepower":88, "Weight_in_lbs":2100, "Acceleration":16.5, "Year":"1972-01-01", "Origin":"Japan"} 93 | {"Name":"buick century 350", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":350, "Horsepower":175, "Weight_in_lbs":4100, "Acceleration":13, "Year":"1973-01-01", "Origin":"USA"} 94 | {"Name":"amc matador", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":304, "Horsepower":150, "Weight_in_lbs":3672, "Acceleration":11.5, "Year":"1973-01-01", "Origin":"USA"} 95 | {"Name":"chevrolet malibu", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":350, "Horsepower":145, "Weight_in_lbs":3988, "Acceleration":13, "Year":"1973-01-01", "Origin":"USA"} 96 | {"Name":"ford gran torino", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":302, "Horsepower":137, "Weight_in_lbs":4042, "Acceleration":14.5, "Year":"1973-01-01", "Origin":"USA"} 97 | {"Name":"dodge coronet custom", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":3777, "Acceleration":12.5, "Year":"1973-01-01", "Origin":"USA"} 98 | {"Name":"mercury marquis 
brougham", "Miles_per_Gallon":12, "Cylinders":8, "Displacement":429, "Horsepower":198, "Weight_in_lbs":4952, "Acceleration":11.5, "Year":"1973-01-01", "Origin":"USA"} 99 | {"Name":"chevrolet caprice classic", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":400, "Horsepower":150, "Weight_in_lbs":4464, "Acceleration":12, "Year":"1973-01-01", "Origin":"USA"} 100 | {"Name":"ford ltd", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":351, "Horsepower":158, "Weight_in_lbs":4363, "Acceleration":13, "Year":"1973-01-01", "Origin":"USA"} 101 | {"Name":"plymouth fury gran sedan", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":4237, "Acceleration":14.5, "Year":"1973-01-01", "Origin":"USA"} 102 | {"Name":"chrysler new yorker brougham", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":440, "Horsepower":215, "Weight_in_lbs":4735, "Acceleration":11, "Year":"1973-01-01", "Origin":"USA"} 103 | {"Name":"buick electra 225 custom", "Miles_per_Gallon":12, "Cylinders":8, "Displacement":455, "Horsepower":225, "Weight_in_lbs":4951, "Acceleration":11, "Year":"1973-01-01", "Origin":"USA"} 104 | {"Name":"amc ambassador brougham", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":360, "Horsepower":175, "Weight_in_lbs":3821, "Acceleration":11, "Year":"1973-01-01", "Origin":"USA"} 105 | {"Name":"plymouth valiant", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":225, "Horsepower":105, "Weight_in_lbs":3121, "Acceleration":16.5, "Year":"1973-01-01", "Origin":"USA"} 106 | {"Name":"chevrolet nova custom", "Miles_per_Gallon":16, "Cylinders":6, "Displacement":250, "Horsepower":100, "Weight_in_lbs":3278, "Acceleration":18, "Year":"1973-01-01", "Origin":"USA"} 107 | {"Name":"amc hornet", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":232, "Horsepower":100, "Weight_in_lbs":2945, "Acceleration":16, "Year":"1973-01-01", "Origin":"USA"} 108 | {"Name":"ford maverick", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":250, "Horsepower":88, "Weight_in_lbs":3021, "Acceleration":16.5, "Year":"1973-01-01", "Origin":"USA"} 109 | {"Name":"plymouth duster", "Miles_per_Gallon":23, "Cylinders":6, "Displacement":198, "Horsepower":95, "Weight_in_lbs":2904, "Acceleration":16, "Year":"1973-01-01", "Origin":"USA"} 110 | {"Name":"volkswagen super beetle", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":97, "Horsepower":46, "Weight_in_lbs":1950, "Acceleration":21, "Year":"1973-01-01", "Origin":"Europe"} 111 | {"Name":"chevrolet impala", "Miles_per_Gallon":11, "Cylinders":8, "Displacement":400, "Horsepower":150, "Weight_in_lbs":4997, "Acceleration":14, "Year":"1973-01-01", "Origin":"USA"} 112 | {"Name":"ford country", "Miles_per_Gallon":12, "Cylinders":8, "Displacement":400, "Horsepower":167, "Weight_in_lbs":4906, "Acceleration":12.5, "Year":"1973-01-01", "Origin":"USA"} 113 | {"Name":"plymouth custom suburb", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":360, "Horsepower":170, "Weight_in_lbs":4654, "Acceleration":13, "Year":"1973-01-01", "Origin":"USA"} 114 | {"Name":"oldsmobile vista cruiser", "Miles_per_Gallon":12, "Cylinders":8, "Displacement":350, "Horsepower":180, "Weight_in_lbs":4499, "Acceleration":12.5, "Year":"1973-01-01", "Origin":"USA"} 115 | {"Name":"amc gremlin", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":232, "Horsepower":100, "Weight_in_lbs":2789, "Acceleration":15, "Year":"1973-01-01", "Origin":"USA"} 116 | {"Name":"toyota carina", "Miles_per_Gallon":20, "Cylinders":4, "Displacement":97, "Horsepower":88, "Weight_in_lbs":2279, 
"Acceleration":19, "Year":"1973-01-01", "Origin":"Japan"} 117 | {"Name":"chevrolet vega", "Miles_per_Gallon":21, "Cylinders":4, "Displacement":140, "Horsepower":72, "Weight_in_lbs":2401, "Acceleration":19.5, "Year":"1973-01-01", "Origin":"USA"} 118 | {"Name":"datsun 610", "Miles_per_Gallon":22, "Cylinders":4, "Displacement":108, "Horsepower":94, "Weight_in_lbs":2379, "Acceleration":16.5, "Year":"1973-01-01", "Origin":"Japan"} 119 | {"Name":"maxda rx3", "Miles_per_Gallon":18, "Cylinders":3, "Displacement":70, "Horsepower":90, "Weight_in_lbs":2124, "Acceleration":13.5, "Year":"1973-01-01", "Origin":"Japan"} 120 | {"Name":"ford pinto", "Miles_per_Gallon":19, "Cylinders":4, "Displacement":122, "Horsepower":85, "Weight_in_lbs":2310, "Acceleration":18.5, "Year":"1973-01-01", "Origin":"USA"} 121 | {"Name":"mercury capri v6", "Miles_per_Gallon":21, "Cylinders":6, "Displacement":155, "Horsepower":107, "Weight_in_lbs":2472, "Acceleration":14, "Year":"1973-01-01", "Origin":"USA"} 122 | {"Name":"fiat 124 sport coupe", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":98, "Horsepower":90, "Weight_in_lbs":2265, "Acceleration":15.5, "Year":"1973-01-01", "Origin":"Europe"} 123 | {"Name":"chevrolet monte carlo s", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":350, "Horsepower":145, "Weight_in_lbs":4082, "Acceleration":13, "Year":"1973-01-01", "Origin":"USA"} 124 | {"Name":"pontiac grand prix", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":400, "Horsepower":230, "Weight_in_lbs":4278, "Acceleration":9.5, "Year":"1973-01-01", "Origin":"USA"} 125 | {"Name":"fiat 128", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":68, "Horsepower":49, "Weight_in_lbs":1867, "Acceleration":19.5, "Year":"1973-01-01", "Origin":"Europe"} 126 | {"Name":"opel manta", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":116, "Horsepower":75, "Weight_in_lbs":2158, "Acceleration":15.5, "Year":"1973-01-01", "Origin":"Europe"} 127 | {"Name":"audi 100ls", "Miles_per_Gallon":20, "Cylinders":4, "Displacement":114, "Horsepower":91, "Weight_in_lbs":2582, "Acceleration":14, "Year":"1973-01-01", "Origin":"Europe"} 128 | {"Name":"volvo 144ea", "Miles_per_Gallon":19, "Cylinders":4, "Displacement":121, "Horsepower":112, "Weight_in_lbs":2868, "Acceleration":15.5, "Year":"1973-01-01", "Origin":"Europe"} 129 | {"Name":"dodge dart custom", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":3399, "Acceleration":11, "Year":"1973-01-01", "Origin":"USA"} 130 | {"Name":"saab 99le", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":121, "Horsepower":110, "Weight_in_lbs":2660, "Acceleration":14, "Year":"1973-01-01", "Origin":"Europe"} 131 | {"Name":"toyota mark ii", "Miles_per_Gallon":20, "Cylinders":6, "Displacement":156, "Horsepower":122, "Weight_in_lbs":2807, "Acceleration":13.5, "Year":"1973-01-01", "Origin":"Japan"} 132 | {"Name":"oldsmobile omega", "Miles_per_Gallon":11, "Cylinders":8, "Displacement":350, "Horsepower":180, "Weight_in_lbs":3664, "Acceleration":11, "Year":"1973-01-01", "Origin":"USA"} 133 | {"Name":"plymouth duster", "Miles_per_Gallon":20, "Cylinders":6, "Displacement":198, "Horsepower":95, "Weight_in_lbs":3102, "Acceleration":16.5, "Year":"1974-01-01", "Origin":"USA"} 134 | {"Name":"ford maverick", "Miles_per_Gallon":21, "Cylinders":6, "Displacement":200, "Horsepower":null, "Weight_in_lbs":2875, "Acceleration":17, "Year":"1974-01-01", "Origin":"USA"} 135 | {"Name":"amc hornet", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":232, "Horsepower":100, 
"Weight_in_lbs":2901, "Acceleration":16, "Year":"1974-01-01", "Origin":"USA"} 136 | {"Name":"chevrolet nova", "Miles_per_Gallon":15, "Cylinders":6, "Displacement":250, "Horsepower":100, "Weight_in_lbs":3336, "Acceleration":17, "Year":"1974-01-01", "Origin":"USA"} 137 | {"Name":"datsun b210", "Miles_per_Gallon":31, "Cylinders":4, "Displacement":79, "Horsepower":67, "Weight_in_lbs":1950, "Acceleration":19, "Year":"1974-01-01", "Origin":"Japan"} 138 | {"Name":"ford pinto", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":122, "Horsepower":80, "Weight_in_lbs":2451, "Acceleration":16.5, "Year":"1974-01-01", "Origin":"USA"} 139 | {"Name":"toyota corolla 1200", "Miles_per_Gallon":32, "Cylinders":4, "Displacement":71, "Horsepower":65, "Weight_in_lbs":1836, "Acceleration":21, "Year":"1974-01-01", "Origin":"Japan"} 140 | {"Name":"chevrolet vega", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":140, "Horsepower":75, "Weight_in_lbs":2542, "Acceleration":17, "Year":"1974-01-01", "Origin":"USA"} 141 | {"Name":"chevrolet chevelle malibu classic", "Miles_per_Gallon":16, "Cylinders":6, "Displacement":250, "Horsepower":100, "Weight_in_lbs":3781, "Acceleration":17, "Year":"1974-01-01", "Origin":"USA"} 142 | {"Name":"amc matador", "Miles_per_Gallon":16, "Cylinders":6, "Displacement":258, "Horsepower":110, "Weight_in_lbs":3632, "Acceleration":18, "Year":"1974-01-01", "Origin":"USA"} 143 | {"Name":"plymouth satellite sebring", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":225, "Horsepower":105, "Weight_in_lbs":3613, "Acceleration":16.5, "Year":"1974-01-01", "Origin":"USA"} 144 | {"Name":"ford gran torino", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":302, "Horsepower":140, "Weight_in_lbs":4141, "Acceleration":14, "Year":"1974-01-01", "Origin":"USA"} 145 | {"Name":"buick century luxus (sw)", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":350, "Horsepower":150, "Weight_in_lbs":4699, "Acceleration":14.5, "Year":"1974-01-01", "Origin":"USA"} 146 | {"Name":"dodge coronet custom (sw)", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":4457, "Acceleration":13.5, "Year":"1974-01-01", "Origin":"USA"} 147 | {"Name":"ford gran torino (sw)", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":302, "Horsepower":140, "Weight_in_lbs":4638, "Acceleration":16, "Year":"1974-01-01", "Origin":"USA"} 148 | {"Name":"amc matador (sw)", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":304, "Horsepower":150, "Weight_in_lbs":4257, "Acceleration":15.5, "Year":"1974-01-01", "Origin":"USA"} 149 | {"Name":"audi fox", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":98, "Horsepower":83, "Weight_in_lbs":2219, "Acceleration":16.5, "Year":"1974-01-01", "Origin":"Europe"} 150 | {"Name":"volkswagen dasher", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":79, "Horsepower":67, "Weight_in_lbs":1963, "Acceleration":15.5, "Year":"1974-01-01", "Origin":"Europe"} 151 | {"Name":"opel manta", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":97, "Horsepower":78, "Weight_in_lbs":2300, "Acceleration":14.5, "Year":"1974-01-01", "Origin":"Europe"} 152 | {"Name":"toyota corona", "Miles_per_Gallon":31, "Cylinders":4, "Displacement":76, "Horsepower":52, "Weight_in_lbs":1649, "Acceleration":16.5, "Year":"1974-01-01", "Origin":"Japan"} 153 | {"Name":"datsun 710", "Miles_per_Gallon":32, "Cylinders":4, "Displacement":83, "Horsepower":61, "Weight_in_lbs":2003, "Acceleration":19, "Year":"1974-01-01", "Origin":"Japan"} 154 | {"Name":"dodge colt", "Miles_per_Gallon":28, 
"Cylinders":4, "Displacement":90, "Horsepower":75, "Weight_in_lbs":2125, "Acceleration":14.5, "Year":"1974-01-01", "Origin":"USA"} 155 | {"Name":"fiat 128", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":90, "Horsepower":75, "Weight_in_lbs":2108, "Acceleration":15.5, "Year":"1974-01-01", "Origin":"Europe"} 156 | {"Name":"fiat 124 tc", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":116, "Horsepower":75, "Weight_in_lbs":2246, "Acceleration":14, "Year":"1974-01-01", "Origin":"Europe"} 157 | {"Name":"honda civic", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":120, "Horsepower":97, "Weight_in_lbs":2489, "Acceleration":15, "Year":"1974-01-01", "Origin":"Japan"} 158 | {"Name":"subaru", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":108, "Horsepower":93, "Weight_in_lbs":2391, "Acceleration":15.5, "Year":"1974-01-01", "Origin":"Japan"} 159 | {"Name":"fiat x1.9", "Miles_per_Gallon":31, "Cylinders":4, "Displacement":79, "Horsepower":67, "Weight_in_lbs":2000, "Acceleration":16, "Year":"1974-01-01", "Origin":"Europe"} 160 | {"Name":"plymouth valiant custom", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":225, "Horsepower":95, "Weight_in_lbs":3264, "Acceleration":16, "Year":"1975-01-01", "Origin":"USA"} 161 | {"Name":"chevrolet nova", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":250, "Horsepower":105, "Weight_in_lbs":3459, "Acceleration":16, "Year":"1975-01-01", "Origin":"USA"} 162 | {"Name":"mercury monarch", "Miles_per_Gallon":15, "Cylinders":6, "Displacement":250, "Horsepower":72, "Weight_in_lbs":3432, "Acceleration":21, "Year":"1975-01-01", "Origin":"USA"} 163 | {"Name":"ford maverick", "Miles_per_Gallon":15, "Cylinders":6, "Displacement":250, "Horsepower":72, "Weight_in_lbs":3158, "Acceleration":19.5, "Year":"1975-01-01", "Origin":"USA"} 164 | {"Name":"pontiac catalina", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":400, "Horsepower":170, "Weight_in_lbs":4668, "Acceleration":11.5, "Year":"1975-01-01", "Origin":"USA"} 165 | {"Name":"chevrolet bel air", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":350, "Horsepower":145, "Weight_in_lbs":4440, "Acceleration":14, "Year":"1975-01-01", "Origin":"USA"} 166 | {"Name":"plymouth grand fury", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":4498, "Acceleration":14.5, "Year":"1975-01-01", "Origin":"USA"} 167 | {"Name":"ford ltd", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":351, "Horsepower":148, "Weight_in_lbs":4657, "Acceleration":13.5, "Year":"1975-01-01", "Origin":"USA"} 168 | {"Name":"buick century", "Miles_per_Gallon":17, "Cylinders":6, "Displacement":231, "Horsepower":110, "Weight_in_lbs":3907, "Acceleration":21, "Year":"1975-01-01", "Origin":"USA"} 169 | {"Name":"chevroelt chevelle malibu", "Miles_per_Gallon":16, "Cylinders":6, "Displacement":250, "Horsepower":105, "Weight_in_lbs":3897, "Acceleration":18.5, "Year":"1975-01-01", "Origin":"USA"} 170 | {"Name":"amc matador", "Miles_per_Gallon":15, "Cylinders":6, "Displacement":258, "Horsepower":110, "Weight_in_lbs":3730, "Acceleration":19, "Year":"1975-01-01", "Origin":"USA"} 171 | {"Name":"plymouth fury", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":225, "Horsepower":95, "Weight_in_lbs":3785, "Acceleration":19, "Year":"1975-01-01", "Origin":"USA"} 172 | {"Name":"buick skyhawk", "Miles_per_Gallon":21, "Cylinders":6, "Displacement":231, "Horsepower":110, "Weight_in_lbs":3039, "Acceleration":15, "Year":"1975-01-01", "Origin":"USA"} 173 | {"Name":"chevrolet monza 2+2", "Miles_per_Gallon":20, 
"Cylinders":8, "Displacement":262, "Horsepower":110, "Weight_in_lbs":3221, "Acceleration":13.5, "Year":"1975-01-01", "Origin":"USA"} 174 | {"Name":"ford mustang ii", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":302, "Horsepower":129, "Weight_in_lbs":3169, "Acceleration":12, "Year":"1975-01-01", "Origin":"USA"} 175 | {"Name":"toyota corolla", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":97, "Horsepower":75, "Weight_in_lbs":2171, "Acceleration":16, "Year":"1975-01-01", "Origin":"Japan"} 176 | {"Name":"ford pinto", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":140, "Horsepower":83, "Weight_in_lbs":2639, "Acceleration":17, "Year":"1975-01-01", "Origin":"USA"} 177 | {"Name":"amc gremlin", "Miles_per_Gallon":20, "Cylinders":6, "Displacement":232, "Horsepower":100, "Weight_in_lbs":2914, "Acceleration":16, "Year":"1975-01-01", "Origin":"USA"} 178 | {"Name":"pontiac astro", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":140, "Horsepower":78, "Weight_in_lbs":2592, "Acceleration":18.5, "Year":"1975-01-01", "Origin":"USA"} 179 | {"Name":"toyota corona", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":134, "Horsepower":96, "Weight_in_lbs":2702, "Acceleration":13.5, "Year":"1975-01-01", "Origin":"Japan"} 180 | {"Name":"volkswagen dasher", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":90, "Horsepower":71, "Weight_in_lbs":2223, "Acceleration":16.5, "Year":"1975-01-01", "Origin":"Europe"} 181 | {"Name":"datsun 710", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":119, "Horsepower":97, "Weight_in_lbs":2545, "Acceleration":17, "Year":"1975-01-01", "Origin":"Japan"} 182 | {"Name":"ford pinto", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":171, "Horsepower":97, "Weight_in_lbs":2984, "Acceleration":14.5, "Year":"1975-01-01", "Origin":"USA"} 183 | {"Name":"volkswagen rabbit", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":90, "Horsepower":70, "Weight_in_lbs":1937, "Acceleration":14, "Year":"1975-01-01", "Origin":"Europe"} 184 | {"Name":"amc pacer", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":232, "Horsepower":90, "Weight_in_lbs":3211, "Acceleration":17, "Year":"1975-01-01", "Origin":"USA"} 185 | {"Name":"audi 100ls", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":115, "Horsepower":95, "Weight_in_lbs":2694, "Acceleration":15, "Year":"1975-01-01", "Origin":"Europe"} 186 | {"Name":"peugeot 504", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":120, "Horsepower":88, "Weight_in_lbs":2957, "Acceleration":17, "Year":"1975-01-01", "Origin":"Europe"} 187 | {"Name":"volvo 244dl", "Miles_per_Gallon":22, "Cylinders":4, "Displacement":121, "Horsepower":98, "Weight_in_lbs":2945, "Acceleration":14.5, "Year":"1975-01-01", "Origin":"Europe"} 188 | {"Name":"saab 99le", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":121, "Horsepower":115, "Weight_in_lbs":2671, "Acceleration":13.5, "Year":"1975-01-01", "Origin":"Europe"} 189 | {"Name":"honda civic cvcc", "Miles_per_Gallon":33, "Cylinders":4, "Displacement":91, "Horsepower":53, "Weight_in_lbs":1795, "Acceleration":17.5, "Year":"1975-01-01", "Origin":"Japan"} 190 | {"Name":"fiat 131", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":107, "Horsepower":86, "Weight_in_lbs":2464, "Acceleration":15.5, "Year":"1976-01-01", "Origin":"Europe"} 191 | {"Name":"opel 1900", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":116, "Horsepower":81, "Weight_in_lbs":2220, "Acceleration":16.9, "Year":"1976-01-01", "Origin":"Europe"} 192 | {"Name":"capri ii", "Miles_per_Gallon":25, "Cylinders":4, 
"Displacement":140, "Horsepower":92, "Weight_in_lbs":2572, "Acceleration":14.9, "Year":"1976-01-01", "Origin":"USA"} 193 | {"Name":"dodge colt", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":98, "Horsepower":79, "Weight_in_lbs":2255, "Acceleration":17.7, "Year":"1976-01-01", "Origin":"USA"} 194 | {"Name":"renault 12tl", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":101, "Horsepower":83, "Weight_in_lbs":2202, "Acceleration":15.3, "Year":"1976-01-01", "Origin":"Europe"} 195 | {"Name":"chevrolet chevelle malibu classic", "Miles_per_Gallon":17.5, "Cylinders":8, "Displacement":305, "Horsepower":140, "Weight_in_lbs":4215, "Acceleration":13, "Year":"1976-01-01", "Origin":"USA"} 196 | {"Name":"dodge coronet brougham", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":4190, "Acceleration":13, "Year":"1976-01-01", "Origin":"USA"} 197 | {"Name":"amc matador", "Miles_per_Gallon":15.5, "Cylinders":8, "Displacement":304, "Horsepower":120, "Weight_in_lbs":3962, "Acceleration":13.9, "Year":"1976-01-01", "Origin":"USA"} 198 | {"Name":"ford gran torino", "Miles_per_Gallon":14.5, "Cylinders":8, "Displacement":351, "Horsepower":152, "Weight_in_lbs":4215, "Acceleration":12.8, "Year":"1976-01-01", "Origin":"USA"} 199 | {"Name":"plymouth valiant", "Miles_per_Gallon":22, "Cylinders":6, "Displacement":225, "Horsepower":100, "Weight_in_lbs":3233, "Acceleration":15.4, "Year":"1976-01-01", "Origin":"USA"} 200 | {"Name":"chevrolet nova", "Miles_per_Gallon":22, "Cylinders":6, "Displacement":250, "Horsepower":105, "Weight_in_lbs":3353, "Acceleration":14.5, "Year":"1976-01-01", "Origin":"USA"} 201 | {"Name":"ford maverick", "Miles_per_Gallon":24, "Cylinders":6, "Displacement":200, "Horsepower":81, "Weight_in_lbs":3012, "Acceleration":17.6, "Year":"1976-01-01", "Origin":"USA"} 202 | {"Name":"amc hornet", "Miles_per_Gallon":22.5, "Cylinders":6, "Displacement":232, "Horsepower":90, "Weight_in_lbs":3085, "Acceleration":17.6, "Year":"1976-01-01", "Origin":"USA"} 203 | {"Name":"chevrolet chevette", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":85, "Horsepower":52, "Weight_in_lbs":2035, "Acceleration":22.2, "Year":"1976-01-01", "Origin":"USA"} 204 | {"Name":"chevrolet woody", "Miles_per_Gallon":24.5, "Cylinders":4, "Displacement":98, "Horsepower":60, "Weight_in_lbs":2164, "Acceleration":22.1, "Year":"1976-01-01", "Origin":"USA"} 205 | {"Name":"vw rabbit", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":90, "Horsepower":70, "Weight_in_lbs":1937, "Acceleration":14.2, "Year":"1976-01-01", "Origin":"Europe"} 206 | {"Name":"honda civic", "Miles_per_Gallon":33, "Cylinders":4, "Displacement":91, "Horsepower":53, "Weight_in_lbs":1795, "Acceleration":17.4, "Year":"1976-01-01", "Origin":"Japan"} 207 | {"Name":"dodge aspen se", "Miles_per_Gallon":20, "Cylinders":6, "Displacement":225, "Horsepower":100, "Weight_in_lbs":3651, "Acceleration":17.7, "Year":"1976-01-01", "Origin":"USA"} 208 | {"Name":"ford granada ghia", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":250, "Horsepower":78, "Weight_in_lbs":3574, "Acceleration":21, "Year":"1976-01-01", "Origin":"USA"} 209 | {"Name":"pontiac ventura sj", "Miles_per_Gallon":18.5, "Cylinders":6, "Displacement":250, "Horsepower":110, "Weight_in_lbs":3645, "Acceleration":16.2, "Year":"1976-01-01", "Origin":"USA"} 210 | {"Name":"amc pacer d/l", "Miles_per_Gallon":17.5, "Cylinders":6, "Displacement":258, "Horsepower":95, "Weight_in_lbs":3193, "Acceleration":17.8, "Year":"1976-01-01", "Origin":"USA"} 211 | {"Name":"volkswagen 
rabbit", "Miles_per_Gallon":29.5, "Cylinders":4, "Displacement":97, "Horsepower":71, "Weight_in_lbs":1825, "Acceleration":12.2, "Year":"1976-01-01", "Origin":"Europe"} 212 | {"Name":"datsun b-210", "Miles_per_Gallon":32, "Cylinders":4, "Displacement":85, "Horsepower":70, "Weight_in_lbs":1990, "Acceleration":17, "Year":"1976-01-01", "Origin":"Japan"} 213 | {"Name":"toyota corolla", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":97, "Horsepower":75, "Weight_in_lbs":2155, "Acceleration":16.4, "Year":"1976-01-01", "Origin":"Japan"} 214 | {"Name":"ford pinto", "Miles_per_Gallon":26.5, "Cylinders":4, "Displacement":140, "Horsepower":72, "Weight_in_lbs":2565, "Acceleration":13.6, "Year":"1976-01-01", "Origin":"USA"} 215 | {"Name":"volvo 245", "Miles_per_Gallon":20, "Cylinders":4, "Displacement":130, "Horsepower":102, "Weight_in_lbs":3150, "Acceleration":15.7, "Year":"1976-01-01", "Origin":"Europe"} 216 | {"Name":"plymouth volare premier v8", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":3940, "Acceleration":13.2, "Year":"1976-01-01", "Origin":"USA"} 217 | {"Name":"peugeot 504", "Miles_per_Gallon":19, "Cylinders":4, "Displacement":120, "Horsepower":88, "Weight_in_lbs":3270, "Acceleration":21.9, "Year":"1976-01-01", "Origin":"Europe"} 218 | {"Name":"toyota mark ii", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":156, "Horsepower":108, "Weight_in_lbs":2930, "Acceleration":15.5, "Year":"1976-01-01", "Origin":"Japan"} 219 | {"Name":"mercedes-benz 280s", "Miles_per_Gallon":16.5, "Cylinders":6, "Displacement":168, "Horsepower":120, "Weight_in_lbs":3820, "Acceleration":16.7, "Year":"1976-01-01", "Origin":"Europe"} 220 | {"Name":"cadillac seville", "Miles_per_Gallon":16.5, "Cylinders":8, "Displacement":350, "Horsepower":180, "Weight_in_lbs":4380, "Acceleration":12.1, "Year":"1976-01-01", "Origin":"USA"} 221 | {"Name":"chevy c10", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":350, "Horsepower":145, "Weight_in_lbs":4055, "Acceleration":12, "Year":"1976-01-01", "Origin":"USA"} 222 | {"Name":"ford f108", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":302, "Horsepower":130, "Weight_in_lbs":3870, "Acceleration":15, "Year":"1976-01-01", "Origin":"USA"} 223 | {"Name":"dodge d100", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":3755, "Acceleration":14, "Year":"1976-01-01", "Origin":"USA"} 224 | {"Name":"honda Accelerationord cvcc", "Miles_per_Gallon":31.5, "Cylinders":4, "Displacement":98, "Horsepower":68, "Weight_in_lbs":2045, "Acceleration":18.5, "Year":"1977-01-01", "Origin":"Japan"} 225 | {"Name":"buick opel isuzu deluxe", "Miles_per_Gallon":30, "Cylinders":4, "Displacement":111, "Horsepower":80, "Weight_in_lbs":2155, "Acceleration":14.8, "Year":"1977-01-01", "Origin":"USA"} 226 | {"Name":"renault 5 gtl", "Miles_per_Gallon":36, "Cylinders":4, "Displacement":79, "Horsepower":58, "Weight_in_lbs":1825, "Acceleration":18.6, "Year":"1977-01-01", "Origin":"Europe"} 227 | {"Name":"plymouth arrow gs", "Miles_per_Gallon":25.5, "Cylinders":4, "Displacement":122, "Horsepower":96, "Weight_in_lbs":2300, "Acceleration":15.5, "Year":"1977-01-01", "Origin":"USA"} 228 | {"Name":"datsun f-10 hatchback", "Miles_per_Gallon":33.5, "Cylinders":4, "Displacement":85, "Horsepower":70, "Weight_in_lbs":1945, "Acceleration":16.8, "Year":"1977-01-01", "Origin":"Japan"} 229 | {"Name":"chevrolet caprice classic", "Miles_per_Gallon":17.5, "Cylinders":8, "Displacement":305, "Horsepower":145, "Weight_in_lbs":3880, 
"Acceleration":12.5, "Year":"1977-01-01", "Origin":"USA"} 230 | {"Name":"oldsmobile cutlass supreme", "Miles_per_Gallon":17, "Cylinders":8, "Displacement":260, "Horsepower":110, "Weight_in_lbs":4060, "Acceleration":19, "Year":"1977-01-01", "Origin":"USA"} 231 | {"Name":"dodge monaco brougham", "Miles_per_Gallon":15.5, "Cylinders":8, "Displacement":318, "Horsepower":145, "Weight_in_lbs":4140, "Acceleration":13.7, "Year":"1977-01-01", "Origin":"USA"} 232 | {"Name":"mercury cougar brougham", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":302, "Horsepower":130, "Weight_in_lbs":4295, "Acceleration":14.9, "Year":"1977-01-01", "Origin":"USA"} 233 | {"Name":"chevrolet concours", "Miles_per_Gallon":17.5, "Cylinders":6, "Displacement":250, "Horsepower":110, "Weight_in_lbs":3520, "Acceleration":16.4, "Year":"1977-01-01", "Origin":"USA"} 234 | {"Name":"buick skylark", "Miles_per_Gallon":20.5, "Cylinders":6, "Displacement":231, "Horsepower":105, "Weight_in_lbs":3425, "Acceleration":16.9, "Year":"1977-01-01", "Origin":"USA"} 235 | {"Name":"plymouth volare custom", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":225, "Horsepower":100, "Weight_in_lbs":3630, "Acceleration":17.7, "Year":"1977-01-01", "Origin":"USA"} 236 | {"Name":"ford granada", "Miles_per_Gallon":18.5, "Cylinders":6, "Displacement":250, "Horsepower":98, "Weight_in_lbs":3525, "Acceleration":19, "Year":"1977-01-01", "Origin":"USA"} 237 | {"Name":"pontiac grand prix lj", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":400, "Horsepower":180, "Weight_in_lbs":4220, "Acceleration":11.1, "Year":"1977-01-01", "Origin":"USA"} 238 | {"Name":"chevrolet monte carlo landau", "Miles_per_Gallon":15.5, "Cylinders":8, "Displacement":350, "Horsepower":170, "Weight_in_lbs":4165, "Acceleration":11.4, "Year":"1977-01-01", "Origin":"USA"} 239 | {"Name":"chrysler cordoba", "Miles_per_Gallon":15.5, "Cylinders":8, "Displacement":400, "Horsepower":190, "Weight_in_lbs":4325, "Acceleration":12.2, "Year":"1977-01-01", "Origin":"USA"} 240 | {"Name":"ford thunderbird", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":351, "Horsepower":149, "Weight_in_lbs":4335, "Acceleration":14.5, "Year":"1977-01-01", "Origin":"USA"} 241 | {"Name":"volkswagen rabbit custom", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":97, "Horsepower":78, "Weight_in_lbs":1940, "Acceleration":14.5, "Year":"1977-01-01", "Origin":"Europe"} 242 | {"Name":"pontiac sunbird coupe", "Miles_per_Gallon":24.5, "Cylinders":4, "Displacement":151, "Horsepower":88, "Weight_in_lbs":2740, "Acceleration":16, "Year":"1977-01-01", "Origin":"USA"} 243 | {"Name":"toyota corolla liftback", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":97, "Horsepower":75, "Weight_in_lbs":2265, "Acceleration":18.2, "Year":"1977-01-01", "Origin":"Japan"} 244 | {"Name":"ford mustang ii 2+2", "Miles_per_Gallon":25.5, "Cylinders":4, "Displacement":140, "Horsepower":89, "Weight_in_lbs":2755, "Acceleration":15.8, "Year":"1977-01-01", "Origin":"USA"} 245 | {"Name":"chevrolet chevette", "Miles_per_Gallon":30.5, "Cylinders":4, "Displacement":98, "Horsepower":63, "Weight_in_lbs":2051, "Acceleration":17, "Year":"1977-01-01", "Origin":"USA"} 246 | {"Name":"dodge colt m/m", "Miles_per_Gallon":33.5, "Cylinders":4, "Displacement":98, "Horsepower":83, "Weight_in_lbs":2075, "Acceleration":15.9, "Year":"1977-01-01", "Origin":"USA"} 247 | {"Name":"subaru dl", "Miles_per_Gallon":30, "Cylinders":4, "Displacement":97, "Horsepower":67, "Weight_in_lbs":1985, "Acceleration":16.4, "Year":"1977-01-01", "Origin":"Japan"} 248 | 
{"Name":"volkswagen dasher", "Miles_per_Gallon":30.5, "Cylinders":4, "Displacement":97, "Horsepower":78, "Weight_in_lbs":2190, "Acceleration":14.1, "Year":"1977-01-01", "Origin":"Europe"} 249 | {"Name":"datsun 810", "Miles_per_Gallon":22, "Cylinders":6, "Displacement":146, "Horsepower":97, "Weight_in_lbs":2815, "Acceleration":14.5, "Year":"1977-01-01", "Origin":"Japan"} 250 | {"Name":"bmw 320i", "Miles_per_Gallon":21.5, "Cylinders":4, "Displacement":121, "Horsepower":110, "Weight_in_lbs":2600, "Acceleration":12.8, "Year":"1977-01-01", "Origin":"Europe"} 251 | {"Name":"mazda rx-4", "Miles_per_Gallon":21.5, "Cylinders":3, "Displacement":80, "Horsepower":110, "Weight_in_lbs":2720, "Acceleration":13.5, "Year":"1977-01-01", "Origin":"Japan"} 252 | {"Name":"volkswagen rabbit custom diesel", "Miles_per_Gallon":43.1, "Cylinders":4, "Displacement":90, "Horsepower":48, "Weight_in_lbs":1985, "Acceleration":21.5, "Year":"1978-01-01", "Origin":"Europe"} 253 | {"Name":"ford fiesta", "Miles_per_Gallon":36.1, "Cylinders":4, "Displacement":98, "Horsepower":66, "Weight_in_lbs":1800, "Acceleration":14.4, "Year":"1978-01-01", "Origin":"USA"} 254 | {"Name":"mazda glc deluxe", "Miles_per_Gallon":32.8, "Cylinders":4, "Displacement":78, "Horsepower":52, "Weight_in_lbs":1985, "Acceleration":19.4, "Year":"1978-01-01", "Origin":"Japan"} 255 | {"Name":"datsun b210 gx", "Miles_per_Gallon":39.4, "Cylinders":4, "Displacement":85, "Horsepower":70, "Weight_in_lbs":2070, "Acceleration":18.6, "Year":"1978-01-01", "Origin":"Japan"} 256 | {"Name":"honda civic cvcc", "Miles_per_Gallon":36.1, "Cylinders":4, "Displacement":91, "Horsepower":60, "Weight_in_lbs":1800, "Acceleration":16.4, "Year":"1978-01-01", "Origin":"Japan"} 257 | {"Name":"oldsmobile cutlass salon brougham", "Miles_per_Gallon":19.9, "Cylinders":8, "Displacement":260, "Horsepower":110, "Weight_in_lbs":3365, "Acceleration":15.5, "Year":"1978-01-01", "Origin":"USA"} 258 | {"Name":"dodge diplomat", "Miles_per_Gallon":19.4, "Cylinders":8, "Displacement":318, "Horsepower":140, "Weight_in_lbs":3735, "Acceleration":13.2, "Year":"1978-01-01", "Origin":"USA"} 259 | {"Name":"mercury monarch ghia", "Miles_per_Gallon":20.2, "Cylinders":8, "Displacement":302, "Horsepower":139, "Weight_in_lbs":3570, "Acceleration":12.8, "Year":"1978-01-01", "Origin":"USA"} 260 | {"Name":"pontiac phoenix lj", "Miles_per_Gallon":19.2, "Cylinders":6, "Displacement":231, "Horsepower":105, "Weight_in_lbs":3535, "Acceleration":19.2, "Year":"1978-01-01", "Origin":"USA"} 261 | {"Name":"chevrolet malibu", "Miles_per_Gallon":20.5, "Cylinders":6, "Displacement":200, "Horsepower":95, "Weight_in_lbs":3155, "Acceleration":18.2, "Year":"1978-01-01", "Origin":"USA"} 262 | {"Name":"ford fairmont (auto)", "Miles_per_Gallon":20.2, "Cylinders":6, "Displacement":200, "Horsepower":85, "Weight_in_lbs":2965, "Acceleration":15.8, "Year":"1978-01-01", "Origin":"USA"} 263 | {"Name":"ford fairmont (man)", "Miles_per_Gallon":25.1, "Cylinders":4, "Displacement":140, "Horsepower":88, "Weight_in_lbs":2720, "Acceleration":15.4, "Year":"1978-01-01", "Origin":"USA"} 264 | {"Name":"plymouth volare", "Miles_per_Gallon":20.5, "Cylinders":6, "Displacement":225, "Horsepower":100, "Weight_in_lbs":3430, "Acceleration":17.2, "Year":"1978-01-01", "Origin":"USA"} 265 | {"Name":"amc concord", "Miles_per_Gallon":19.4, "Cylinders":6, "Displacement":232, "Horsepower":90, "Weight_in_lbs":3210, "Acceleration":17.2, "Year":"1978-01-01", "Origin":"USA"} 266 | {"Name":"buick century special", "Miles_per_Gallon":20.6, "Cylinders":6, 
"Displacement":231, "Horsepower":105, "Weight_in_lbs":3380, "Acceleration":15.8, "Year":"1978-01-01", "Origin":"USA"} 267 | {"Name":"mercury zephyr", "Miles_per_Gallon":20.8, "Cylinders":6, "Displacement":200, "Horsepower":85, "Weight_in_lbs":3070, "Acceleration":16.7, "Year":"1978-01-01", "Origin":"USA"} 268 | {"Name":"dodge aspen", "Miles_per_Gallon":18.6, "Cylinders":6, "Displacement":225, "Horsepower":110, "Weight_in_lbs":3620, "Acceleration":18.7, "Year":"1978-01-01", "Origin":"USA"} 269 | {"Name":"amc concord d/l", "Miles_per_Gallon":18.1, "Cylinders":6, "Displacement":258, "Horsepower":120, "Weight_in_lbs":3410, "Acceleration":15.1, "Year":"1978-01-01", "Origin":"USA"} 270 | {"Name":"chevrolet monte carlo landau", "Miles_per_Gallon":19.2, "Cylinders":8, "Displacement":305, "Horsepower":145, "Weight_in_lbs":3425, "Acceleration":13.2, "Year":"1978-01-01", "Origin":"USA"} 271 | {"Name":"buick regal sport coupe (turbo)", "Miles_per_Gallon":17.7, "Cylinders":6, "Displacement":231, "Horsepower":165, "Weight_in_lbs":3445, "Acceleration":13.4, "Year":"1978-01-01", "Origin":"USA"} 272 | {"Name":"ford futura", "Miles_per_Gallon":18.1, "Cylinders":8, "Displacement":302, "Horsepower":139, "Weight_in_lbs":3205, "Acceleration":11.2, "Year":"1978-01-01", "Origin":"USA"} 273 | {"Name":"dodge magnum xe", "Miles_per_Gallon":17.5, "Cylinders":8, "Displacement":318, "Horsepower":140, "Weight_in_lbs":4080, "Acceleration":13.7, "Year":"1978-01-01", "Origin":"USA"} 274 | {"Name":"chevrolet chevette", "Miles_per_Gallon":30, "Cylinders":4, "Displacement":98, "Horsepower":68, "Weight_in_lbs":2155, "Acceleration":16.5, "Year":"1978-01-01", "Origin":"USA"} 275 | {"Name":"toyota corona", "Miles_per_Gallon":27.5, "Cylinders":4, "Displacement":134, "Horsepower":95, "Weight_in_lbs":2560, "Acceleration":14.2, "Year":"1978-01-01", "Origin":"Japan"} 276 | {"Name":"datsun 510", "Miles_per_Gallon":27.2, "Cylinders":4, "Displacement":119, "Horsepower":97, "Weight_in_lbs":2300, "Acceleration":14.7, "Year":"1978-01-01", "Origin":"Japan"} 277 | {"Name":"dodge omni", "Miles_per_Gallon":30.9, "Cylinders":4, "Displacement":105, "Horsepower":75, "Weight_in_lbs":2230, "Acceleration":14.5, "Year":"1978-01-01", "Origin":"USA"} 278 | {"Name":"toyota celica gt liftback", "Miles_per_Gallon":21.1, "Cylinders":4, "Displacement":134, "Horsepower":95, "Weight_in_lbs":2515, "Acceleration":14.8, "Year":"1978-01-01", "Origin":"Japan"} 279 | {"Name":"plymouth sapporo", "Miles_per_Gallon":23.2, "Cylinders":4, "Displacement":156, "Horsepower":105, "Weight_in_lbs":2745, "Acceleration":16.7, "Year":"1978-01-01", "Origin":"USA"} 280 | {"Name":"oldsmobile starfire sx", "Miles_per_Gallon":23.8, "Cylinders":4, "Displacement":151, "Horsepower":85, "Weight_in_lbs":2855, "Acceleration":17.6, "Year":"1978-01-01", "Origin":"USA"} 281 | {"Name":"datsun 200-sx", "Miles_per_Gallon":23.9, "Cylinders":4, "Displacement":119, "Horsepower":97, "Weight_in_lbs":2405, "Acceleration":14.9, "Year":"1978-01-01", "Origin":"Japan"} 282 | {"Name":"audi 5000", "Miles_per_Gallon":20.3, "Cylinders":5, "Displacement":131, "Horsepower":103, "Weight_in_lbs":2830, "Acceleration":15.9, "Year":"1978-01-01", "Origin":"Europe"} 283 | {"Name":"volvo 264gl", "Miles_per_Gallon":17, "Cylinders":6, "Displacement":163, "Horsepower":125, "Weight_in_lbs":3140, "Acceleration":13.6, "Year":"1978-01-01", "Origin":"Europe"} 284 | {"Name":"saab 99gle", "Miles_per_Gallon":21.6, "Cylinders":4, "Displacement":121, "Horsepower":115, "Weight_in_lbs":2795, "Acceleration":15.7, "Year":"1978-01-01", 
"Origin":"Europe"} 285 | {"Name":"peugeot 604sl", "Miles_per_Gallon":16.2, "Cylinders":6, "Displacement":163, "Horsepower":133, "Weight_in_lbs":3410, "Acceleration":15.8, "Year":"1978-01-01", "Origin":"Europe"} 286 | {"Name":"volkswagen scirocco", "Miles_per_Gallon":31.5, "Cylinders":4, "Displacement":89, "Horsepower":71, "Weight_in_lbs":1990, "Acceleration":14.9, "Year":"1978-01-01", "Origin":"Europe"} 287 | {"Name":"honda Accelerationord lx", "Miles_per_Gallon":29.5, "Cylinders":4, "Displacement":98, "Horsepower":68, "Weight_in_lbs":2135, "Acceleration":16.6, "Year":"1978-01-01", "Origin":"Japan"} 288 | {"Name":"pontiac lemans v6", "Miles_per_Gallon":21.5, "Cylinders":6, "Displacement":231, "Horsepower":115, "Weight_in_lbs":3245, "Acceleration":15.4, "Year":"1979-01-01", "Origin":"USA"} 289 | {"Name":"mercury zephyr 6", "Miles_per_Gallon":19.8, "Cylinders":6, "Displacement":200, "Horsepower":85, "Weight_in_lbs":2990, "Acceleration":18.2, "Year":"1979-01-01", "Origin":"USA"} 290 | {"Name":"ford fairmont 4", "Miles_per_Gallon":22.3, "Cylinders":4, "Displacement":140, "Horsepower":88, "Weight_in_lbs":2890, "Acceleration":17.3, "Year":"1979-01-01", "Origin":"USA"} 291 | {"Name":"amc concord dl 6", "Miles_per_Gallon":20.2, "Cylinders":6, "Displacement":232, "Horsepower":90, "Weight_in_lbs":3265, "Acceleration":18.2, "Year":"1979-01-01", "Origin":"USA"} 292 | {"Name":"dodge aspen 6", "Miles_per_Gallon":20.6, "Cylinders":6, "Displacement":225, "Horsepower":110, "Weight_in_lbs":3360, "Acceleration":16.6, "Year":"1979-01-01", "Origin":"USA"} 293 | {"Name":"chevrolet caprice classic", "Miles_per_Gallon":17, "Cylinders":8, "Displacement":305, "Horsepower":130, "Weight_in_lbs":3840, "Acceleration":15.4, "Year":"1979-01-01", "Origin":"USA"} 294 | {"Name":"ford ltd landau", "Miles_per_Gallon":17.6, "Cylinders":8, "Displacement":302, "Horsepower":129, "Weight_in_lbs":3725, "Acceleration":13.4, "Year":"1979-01-01", "Origin":"USA"} 295 | {"Name":"mercury grand marquis", "Miles_per_Gallon":16.5, "Cylinders":8, "Displacement":351, "Horsepower":138, "Weight_in_lbs":3955, "Acceleration":13.2, "Year":"1979-01-01", "Origin":"USA"} 296 | {"Name":"dodge st. 
regis", "Miles_per_Gallon":18.2, "Cylinders":8, "Displacement":318, "Horsepower":135, "Weight_in_lbs":3830, "Acceleration":15.2, "Year":"1979-01-01", "Origin":"USA"} 297 | {"Name":"buick estate wagon (sw)", "Miles_per_Gallon":16.9, "Cylinders":8, "Displacement":350, "Horsepower":155, "Weight_in_lbs":4360, "Acceleration":14.9, "Year":"1979-01-01", "Origin":"USA"} 298 | {"Name":"ford country squire (sw)", "Miles_per_Gallon":15.5, "Cylinders":8, "Displacement":351, "Horsepower":142, "Weight_in_lbs":4054, "Acceleration":14.3, "Year":"1979-01-01", "Origin":"USA"} 299 | {"Name":"chevrolet malibu classic (sw)", "Miles_per_Gallon":19.2, "Cylinders":8, "Displacement":267, "Horsepower":125, "Weight_in_lbs":3605, "Acceleration":15, "Year":"1979-01-01", "Origin":"USA"} 300 | {"Name":"chrysler lebaron town @ country (sw)", "Miles_per_Gallon":18.5, "Cylinders":8, "Displacement":360, "Horsepower":150, "Weight_in_lbs":3940, "Acceleration":13, "Year":"1979-01-01", "Origin":"USA"} 301 | {"Name":"vw rabbit custom", "Miles_per_Gallon":31.9, "Cylinders":4, "Displacement":89, "Horsepower":71, "Weight_in_lbs":1925, "Acceleration":14, "Year":"1979-01-01", "Origin":"Europe"} 302 | {"Name":"maxda glc deluxe", "Miles_per_Gallon":34.1, "Cylinders":4, "Displacement":86, "Horsepower":65, "Weight_in_lbs":1975, "Acceleration":15.2, "Year":"1979-01-01", "Origin":"Japan"} 303 | {"Name":"dodge colt hatchback custom", "Miles_per_Gallon":35.7, "Cylinders":4, "Displacement":98, "Horsepower":80, "Weight_in_lbs":1915, "Acceleration":14.4, "Year":"1979-01-01", "Origin":"USA"} 304 | {"Name":"amc spirit dl", "Miles_per_Gallon":27.4, "Cylinders":4, "Displacement":121, "Horsepower":80, "Weight_in_lbs":2670, "Acceleration":15, "Year":"1979-01-01", "Origin":"USA"} 305 | {"Name":"mercedes benz 300d", "Miles_per_Gallon":25.4, "Cylinders":5, "Displacement":183, "Horsepower":77, "Weight_in_lbs":3530, "Acceleration":20.1, "Year":"1979-01-01", "Origin":"Europe"} 306 | {"Name":"cadillac eldorado", "Miles_per_Gallon":23, "Cylinders":8, "Displacement":350, "Horsepower":125, "Weight_in_lbs":3900, "Acceleration":17.4, "Year":"1979-01-01", "Origin":"USA"} 307 | {"Name":"peugeot 504", "Miles_per_Gallon":27.2, "Cylinders":4, "Displacement":141, "Horsepower":71, "Weight_in_lbs":3190, "Acceleration":24.8, "Year":"1979-01-01", "Origin":"Europe"} 308 | {"Name":"oldsmobile cutlass salon brougham", "Miles_per_Gallon":23.9, "Cylinders":8, "Displacement":260, "Horsepower":90, "Weight_in_lbs":3420, "Acceleration":22.2, "Year":"1979-01-01", "Origin":"USA"} 309 | {"Name":"plymouth horizon", "Miles_per_Gallon":34.2, "Cylinders":4, "Displacement":105, "Horsepower":70, "Weight_in_lbs":2200, "Acceleration":13.2, "Year":"1979-01-01", "Origin":"USA"} 310 | {"Name":"plymouth horizon tc3", "Miles_per_Gallon":34.5, "Cylinders":4, "Displacement":105, "Horsepower":70, "Weight_in_lbs":2150, "Acceleration":14.9, "Year":"1979-01-01", "Origin":"USA"} 311 | {"Name":"datsun 210", "Miles_per_Gallon":31.8, "Cylinders":4, "Displacement":85, "Horsepower":65, "Weight_in_lbs":2020, "Acceleration":19.2, "Year":"1979-01-01", "Origin":"Japan"} 312 | {"Name":"fiat strada custom", "Miles_per_Gallon":37.3, "Cylinders":4, "Displacement":91, "Horsepower":69, "Weight_in_lbs":2130, "Acceleration":14.7, "Year":"1979-01-01", "Origin":"Europe"} 313 | {"Name":"buick skylark limited", "Miles_per_Gallon":28.4, "Cylinders":4, "Displacement":151, "Horsepower":90, "Weight_in_lbs":2670, "Acceleration":16, "Year":"1979-01-01", "Origin":"USA"} 314 | {"Name":"chevrolet citation", "Miles_per_Gallon":28.8, 
"Cylinders":6, "Displacement":173, "Horsepower":115, "Weight_in_lbs":2595, "Acceleration":11.3, "Year":"1979-01-01", "Origin":"USA"} 315 | {"Name":"oldsmobile omega brougham", "Miles_per_Gallon":26.8, "Cylinders":6, "Displacement":173, "Horsepower":115, "Weight_in_lbs":2700, "Acceleration":12.9, "Year":"1979-01-01", "Origin":"USA"} 316 | {"Name":"pontiac phoenix", "Miles_per_Gallon":33.5, "Cylinders":4, "Displacement":151, "Horsepower":90, "Weight_in_lbs":2556, "Acceleration":13.2, "Year":"1979-01-01", "Origin":"USA"} 317 | {"Name":"vw rabbit", "Miles_per_Gallon":41.5, "Cylinders":4, "Displacement":98, "Horsepower":76, "Weight_in_lbs":2144, "Acceleration":14.7, "Year":"1980-01-01", "Origin":"Europe"} 318 | {"Name":"toyota corolla tercel", "Miles_per_Gallon":38.1, "Cylinders":4, "Displacement":89, "Horsepower":60, "Weight_in_lbs":1968, "Acceleration":18.8, "Year":"1980-01-01", "Origin":"Japan"} 319 | {"Name":"chevrolet chevette", "Miles_per_Gallon":32.1, "Cylinders":4, "Displacement":98, "Horsepower":70, "Weight_in_lbs":2120, "Acceleration":15.5, "Year":"1980-01-01", "Origin":"USA"} 320 | {"Name":"datsun 310", "Miles_per_Gallon":37.2, "Cylinders":4, "Displacement":86, "Horsepower":65, "Weight_in_lbs":2019, "Acceleration":16.4, "Year":"1980-01-01", "Origin":"Japan"} 321 | {"Name":"chevrolet citation", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":151, "Horsepower":90, "Weight_in_lbs":2678, "Acceleration":16.5, "Year":"1980-01-01", "Origin":"USA"} 322 | {"Name":"ford fairmont", "Miles_per_Gallon":26.4, "Cylinders":4, "Displacement":140, "Horsepower":88, "Weight_in_lbs":2870, "Acceleration":18.1, "Year":"1980-01-01", "Origin":"USA"} 323 | {"Name":"amc concord", "Miles_per_Gallon":24.3, "Cylinders":4, "Displacement":151, "Horsepower":90, "Weight_in_lbs":3003, "Acceleration":20.1, "Year":"1980-01-01", "Origin":"USA"} 324 | {"Name":"dodge aspen", "Miles_per_Gallon":19.1, "Cylinders":6, "Displacement":225, "Horsepower":90, "Weight_in_lbs":3381, "Acceleration":18.7, "Year":"1980-01-01", "Origin":"USA"} 325 | {"Name":"audi 4000", "Miles_per_Gallon":34.3, "Cylinders":4, "Displacement":97, "Horsepower":78, "Weight_in_lbs":2188, "Acceleration":15.8, "Year":"1980-01-01", "Origin":"Europe"} 326 | {"Name":"toyota corona liftback", "Miles_per_Gallon":29.8, "Cylinders":4, "Displacement":134, "Horsepower":90, "Weight_in_lbs":2711, "Acceleration":15.5, "Year":"1980-01-01", "Origin":"Japan"} 327 | {"Name":"mazda 626", "Miles_per_Gallon":31.3, "Cylinders":4, "Displacement":120, "Horsepower":75, "Weight_in_lbs":2542, "Acceleration":17.5, "Year":"1980-01-01", "Origin":"Japan"} 328 | {"Name":"datsun 510 hatchback", "Miles_per_Gallon":37, "Cylinders":4, "Displacement":119, "Horsepower":92, "Weight_in_lbs":2434, "Acceleration":15, "Year":"1980-01-01", "Origin":"Japan"} 329 | {"Name":"toyota corolla", "Miles_per_Gallon":32.2, "Cylinders":4, "Displacement":108, "Horsepower":75, "Weight_in_lbs":2265, "Acceleration":15.2, "Year":"1980-01-01", "Origin":"Japan"} 330 | {"Name":"mazda glc", "Miles_per_Gallon":46.6, "Cylinders":4, "Displacement":86, "Horsepower":65, "Weight_in_lbs":2110, "Acceleration":17.9, "Year":"1980-01-01", "Origin":"Japan"} 331 | {"Name":"dodge colt", "Miles_per_Gallon":27.9, "Cylinders":4, "Displacement":156, "Horsepower":105, "Weight_in_lbs":2800, "Acceleration":14.4, "Year":"1980-01-01", "Origin":"USA"} 332 | {"Name":"datsun 210", "Miles_per_Gallon":40.8, "Cylinders":4, "Displacement":85, "Horsepower":65, "Weight_in_lbs":2110, "Acceleration":19.2, "Year":"1980-01-01", "Origin":"Japan"} 333 | 
{"Name":"vw rabbit c (diesel)", "Miles_per_Gallon":44.3, "Cylinders":4, "Displacement":90, "Horsepower":48, "Weight_in_lbs":2085, "Acceleration":21.7, "Year":"1980-01-01", "Origin":"Europe"} 334 | {"Name":"vw dasher (diesel)", "Miles_per_Gallon":43.4, "Cylinders":4, "Displacement":90, "Horsepower":48, "Weight_in_lbs":2335, "Acceleration":23.7, "Year":"1980-01-01", "Origin":"Europe"} 335 | {"Name":"audi 5000s (diesel)", "Miles_per_Gallon":36.4, "Cylinders":5, "Displacement":121, "Horsepower":67, "Weight_in_lbs":2950, "Acceleration":19.9, "Year":"1980-01-01", "Origin":"Europe"} 336 | {"Name":"mercedes-benz 240d", "Miles_per_Gallon":30, "Cylinders":4, "Displacement":146, "Horsepower":67, "Weight_in_lbs":3250, "Acceleration":21.8, "Year":"1980-01-01", "Origin":"Europe"} 337 | {"Name":"honda civic 1500 gl", "Miles_per_Gallon":44.6, "Cylinders":4, "Displacement":91, "Horsepower":67, "Weight_in_lbs":1850, "Acceleration":13.8, "Year":"1980-01-01", "Origin":"Japan"} 338 | {"Name":"renault lecar deluxe", "Miles_per_Gallon":40.9, "Cylinders":4, "Displacement":85, "Horsepower":null, "Weight_in_lbs":1835, "Acceleration":17.3, "Year":"1980-01-01", "Origin":"Europe"} 339 | {"Name":"subaru dl", "Miles_per_Gallon":33.8, "Cylinders":4, "Displacement":97, "Horsepower":67, "Weight_in_lbs":2145, "Acceleration":18, "Year":"1980-01-01", "Origin":"Japan"} 340 | {"Name":"vokswagen rabbit", "Miles_per_Gallon":29.8, "Cylinders":4, "Displacement":89, "Horsepower":62, "Weight_in_lbs":1845, "Acceleration":15.3, "Year":"1980-01-01", "Origin":"Europe"} 341 | {"Name":"datsun 280-zx", "Miles_per_Gallon":32.7, "Cylinders":6, "Displacement":168, "Horsepower":132, "Weight_in_lbs":2910, "Acceleration":11.4, "Year":"1980-01-01", "Origin":"Japan"} 342 | {"Name":"mazda rx-7 gs", "Miles_per_Gallon":23.7, "Cylinders":3, "Displacement":70, "Horsepower":100, "Weight_in_lbs":2420, "Acceleration":12.5, "Year":"1980-01-01", "Origin":"Japan"} 343 | {"Name":"triumph tr7 coupe", "Miles_per_Gallon":35, "Cylinders":4, "Displacement":122, "Horsepower":88, "Weight_in_lbs":2500, "Acceleration":15.1, "Year":"1980-01-01", "Origin":"Europe"} 344 | {"Name":"ford mustang cobra", "Miles_per_Gallon":23.6, "Cylinders":4, "Displacement":140, "Horsepower":null, "Weight_in_lbs":2905, "Acceleration":14.3, "Year":"1980-01-01", "Origin":"USA"} 345 | {"Name":"honda Accelerationord", "Miles_per_Gallon":32.4, "Cylinders":4, "Displacement":107, "Horsepower":72, "Weight_in_lbs":2290, "Acceleration":17, "Year":"1980-01-01", "Origin":"Japan"} 346 | {"Name":"plymouth reliant", "Miles_per_Gallon":27.2, "Cylinders":4, "Displacement":135, "Horsepower":84, "Weight_in_lbs":2490, "Acceleration":15.7, "Year":"1982-01-01", "Origin":"USA"} 347 | {"Name":"buick skylark", "Miles_per_Gallon":26.6, "Cylinders":4, "Displacement":151, "Horsepower":84, "Weight_in_lbs":2635, "Acceleration":16.4, "Year":"1982-01-01", "Origin":"USA"} 348 | {"Name":"dodge aries wagon (sw)", "Miles_per_Gallon":25.8, "Cylinders":4, "Displacement":156, "Horsepower":92, "Weight_in_lbs":2620, "Acceleration":14.4, "Year":"1982-01-01", "Origin":"USA"} 349 | {"Name":"chevrolet citation", "Miles_per_Gallon":23.5, "Cylinders":6, "Displacement":173, "Horsepower":110, "Weight_in_lbs":2725, "Acceleration":12.6, "Year":"1982-01-01", "Origin":"USA"} 350 | {"Name":"plymouth reliant", "Miles_per_Gallon":30, "Cylinders":4, "Displacement":135, "Horsepower":84, "Weight_in_lbs":2385, "Acceleration":12.9, "Year":"1982-01-01", "Origin":"USA"} 351 | {"Name":"toyota starlet", "Miles_per_Gallon":39.1, "Cylinders":4, 
"Displacement":79, "Horsepower":58, "Weight_in_lbs":1755, "Acceleration":16.9, "Year":"1982-01-01", "Origin":"Japan"} 352 | {"Name":"plymouth champ", "Miles_per_Gallon":39, "Cylinders":4, "Displacement":86, "Horsepower":64, "Weight_in_lbs":1875, "Acceleration":16.4, "Year":"1982-01-01", "Origin":"USA"} 353 | {"Name":"honda civic 1300", "Miles_per_Gallon":35.1, "Cylinders":4, "Displacement":81, "Horsepower":60, "Weight_in_lbs":1760, "Acceleration":16.1, "Year":"1982-01-01", "Origin":"Japan"} 354 | {"Name":"subaru", "Miles_per_Gallon":32.3, "Cylinders":4, "Displacement":97, "Horsepower":67, "Weight_in_lbs":2065, "Acceleration":17.8, "Year":"1982-01-01", "Origin":"Japan"} 355 | {"Name":"datsun 210", "Miles_per_Gallon":37, "Cylinders":4, "Displacement":85, "Horsepower":65, "Weight_in_lbs":1975, "Acceleration":19.4, "Year":"1982-01-01", "Origin":"Japan"} 356 | {"Name":"toyota tercel", "Miles_per_Gallon":37.7, "Cylinders":4, "Displacement":89, "Horsepower":62, "Weight_in_lbs":2050, "Acceleration":17.3, "Year":"1982-01-01", "Origin":"Japan"} 357 | {"Name":"mazda glc 4", "Miles_per_Gallon":34.1, "Cylinders":4, "Displacement":91, "Horsepower":68, "Weight_in_lbs":1985, "Acceleration":16, "Year":"1982-01-01", "Origin":"Japan"} 358 | {"Name":"plymouth horizon 4", "Miles_per_Gallon":34.7, "Cylinders":4, "Displacement":105, "Horsepower":63, "Weight_in_lbs":2215, "Acceleration":14.9, "Year":"1982-01-01", "Origin":"USA"} 359 | {"Name":"ford escort 4w", "Miles_per_Gallon":34.4, "Cylinders":4, "Displacement":98, "Horsepower":65, "Weight_in_lbs":2045, "Acceleration":16.2, "Year":"1982-01-01", "Origin":"USA"} 360 | {"Name":"ford escort 2h", "Miles_per_Gallon":29.9, "Cylinders":4, "Displacement":98, "Horsepower":65, "Weight_in_lbs":2380, "Acceleration":20.7, "Year":"1982-01-01", "Origin":"USA"} 361 | {"Name":"volkswagen jetta", "Miles_per_Gallon":33, "Cylinders":4, "Displacement":105, "Horsepower":74, "Weight_in_lbs":2190, "Acceleration":14.2, "Year":"1982-01-01", "Origin":"Europe"} 362 | {"Name":"renault 18i", "Miles_per_Gallon":34.5, "Cylinders":4, "Displacement":100, "Horsepower":null, "Weight_in_lbs":2320, "Acceleration":15.8, "Year":"1982-01-01", "Origin":"Europe"} 363 | {"Name":"honda prelude", "Miles_per_Gallon":33.7, "Cylinders":4, "Displacement":107, "Horsepower":75, "Weight_in_lbs":2210, "Acceleration":14.4, "Year":"1982-01-01", "Origin":"Japan"} 364 | {"Name":"toyota corolla", "Miles_per_Gallon":32.4, "Cylinders":4, "Displacement":108, "Horsepower":75, "Weight_in_lbs":2350, "Acceleration":16.8, "Year":"1982-01-01", "Origin":"Japan"} 365 | {"Name":"datsun 200sx", "Miles_per_Gallon":32.9, "Cylinders":4, "Displacement":119, "Horsepower":100, "Weight_in_lbs":2615, "Acceleration":14.8, "Year":"1982-01-01", "Origin":"Japan"} 366 | {"Name":"mazda 626", "Miles_per_Gallon":31.6, "Cylinders":4, "Displacement":120, "Horsepower":74, "Weight_in_lbs":2635, "Acceleration":18.3, "Year":"1982-01-01", "Origin":"Japan"} 367 | {"Name":"peugeot 505s turbo diesel", "Miles_per_Gallon":28.1, "Cylinders":4, "Displacement":141, "Horsepower":80, "Weight_in_lbs":3230, "Acceleration":20.4, "Year":"1982-01-01", "Origin":"Europe"} 368 | {"Name":"saab 900s", "Miles_per_Gallon":null, "Cylinders":4, "Displacement":121, "Horsepower":110, "Weight_in_lbs":2800, "Acceleration":15.4, "Year":"1982-01-01", "Origin":"Europe"} 369 | {"Name":"volvo diesel", "Miles_per_Gallon":30.7, "Cylinders":6, "Displacement":145, "Horsepower":76, "Weight_in_lbs":3160, "Acceleration":19.6, "Year":"1982-01-01", "Origin":"Europe"} 370 | {"Name":"toyota 
cressida", "Miles_per_Gallon":25.4, "Cylinders":6, "Displacement":168, "Horsepower":116, "Weight_in_lbs":2900, "Acceleration":12.6, "Year":"1982-01-01", "Origin":"Japan"} 371 | {"Name":"datsun 810 maxima", "Miles_per_Gallon":24.2, "Cylinders":6, "Displacement":146, "Horsepower":120, "Weight_in_lbs":2930, "Acceleration":13.8, "Year":"1982-01-01", "Origin":"Japan"} 372 | {"Name":"buick century", "Miles_per_Gallon":22.4, "Cylinders":6, "Displacement":231, "Horsepower":110, "Weight_in_lbs":3415, "Acceleration":15.8, "Year":"1982-01-01", "Origin":"USA"} 373 | {"Name":"oldsmobile cutlass ls", "Miles_per_Gallon":26.6, "Cylinders":8, "Displacement":350, "Horsepower":105, "Weight_in_lbs":3725, "Acceleration":19, "Year":"1982-01-01", "Origin":"USA"} 374 | {"Name":"ford granada gl", "Miles_per_Gallon":20.2, "Cylinders":6, "Displacement":200, "Horsepower":88, "Weight_in_lbs":3060, "Acceleration":17.1, "Year":"1982-01-01", "Origin":"USA"} 375 | {"Name":"chrysler lebaron salon", "Miles_per_Gallon":17.6, "Cylinders":6, "Displacement":225, "Horsepower":85, "Weight_in_lbs":3465, "Acceleration":16.6, "Year":"1982-01-01", "Origin":"USA"} 376 | {"Name":"chevrolet cavalier", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":112, "Horsepower":88, "Weight_in_lbs":2605, "Acceleration":19.6, "Year":"1982-01-01", "Origin":"USA"} 377 | {"Name":"chevrolet cavalier wagon", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":112, "Horsepower":88, "Weight_in_lbs":2640, "Acceleration":18.6, "Year":"1982-01-01", "Origin":"USA"} 378 | {"Name":"chevrolet cavalier 2-door", "Miles_per_Gallon":34, "Cylinders":4, "Displacement":112, "Horsepower":88, "Weight_in_lbs":2395, "Acceleration":18, "Year":"1982-01-01", "Origin":"USA"} 379 | {"Name":"pontiac j2000 se hatchback", "Miles_per_Gallon":31, "Cylinders":4, "Displacement":112, "Horsepower":85, "Weight_in_lbs":2575, "Acceleration":16.2, "Year":"1982-01-01", "Origin":"USA"} 380 | {"Name":"dodge aries se", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":135, "Horsepower":84, "Weight_in_lbs":2525, "Acceleration":16, "Year":"1982-01-01", "Origin":"USA"} 381 | {"Name":"pontiac phoenix", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":151, "Horsepower":90, "Weight_in_lbs":2735, "Acceleration":18, "Year":"1982-01-01", "Origin":"USA"} 382 | {"Name":"ford fairmont futura", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":140, "Horsepower":92, "Weight_in_lbs":2865, "Acceleration":16.4, "Year":"1982-01-01", "Origin":"USA"} 383 | {"Name":"amc concord dl", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":151, "Horsepower":null, "Weight_in_lbs":3035, "Acceleration":20.5, "Year":"1982-01-01", "Origin":"USA"} 384 | {"Name":"volkswagen rabbit l", "Miles_per_Gallon":36, "Cylinders":4, "Displacement":105, "Horsepower":74, "Weight_in_lbs":1980, "Acceleration":15.3, "Year":"1982-01-01", "Origin":"Europe"} 385 | {"Name":"mazda glc custom l", "Miles_per_Gallon":37, "Cylinders":4, "Displacement":91, "Horsepower":68, "Weight_in_lbs":2025, "Acceleration":18.2, "Year":"1982-01-01", "Origin":"Japan"} 386 | {"Name":"mazda glc custom", "Miles_per_Gallon":31, "Cylinders":4, "Displacement":91, "Horsepower":68, "Weight_in_lbs":1970, "Acceleration":17.6, "Year":"1982-01-01", "Origin":"Japan"} 387 | {"Name":"plymouth horizon miser", "Miles_per_Gallon":38, "Cylinders":4, "Displacement":105, "Horsepower":63, "Weight_in_lbs":2125, "Acceleration":14.7, "Year":"1982-01-01", "Origin":"USA"} 388 | {"Name":"mercury lynx l", "Miles_per_Gallon":36, "Cylinders":4, "Displacement":98, "Horsepower":70, 
"Weight_in_lbs":2125, "Acceleration":17.3, "Year":"1982-01-01", "Origin":"USA"} 389 | {"Name":"nissan stanza xe", "Miles_per_Gallon":36, "Cylinders":4, "Displacement":120, "Horsepower":88, "Weight_in_lbs":2160, "Acceleration":14.5, "Year":"1982-01-01", "Origin":"Japan"} 390 | {"Name":"honda Accelerationord", "Miles_per_Gallon":36, "Cylinders":4, "Displacement":107, "Horsepower":75, "Weight_in_lbs":2205, "Acceleration":14.5, "Year":"1982-01-01", "Origin":"Japan"} 391 | {"Name":"toyota corolla", "Miles_per_Gallon":34, "Cylinders":4, "Displacement":108, "Horsepower":70, "Weight_in_lbs":2245, "Acceleration":16.9, "Year":"1982-01-01", "Origin":"Japan"} 392 | {"Name":"honda civic", "Miles_per_Gallon":38, "Cylinders":4, "Displacement":91, "Horsepower":67, "Weight_in_lbs":1965, "Acceleration":15, "Year":"1982-01-01", "Origin":"Japan"} 393 | {"Name":"honda civic (auto)", "Miles_per_Gallon":32, "Cylinders":4, "Displacement":91, "Horsepower":67, "Weight_in_lbs":1965, "Acceleration":15.7, "Year":"1982-01-01", "Origin":"Japan"} 394 | {"Name":"datsun 310 gx", "Miles_per_Gallon":38, "Cylinders":4, "Displacement":91, "Horsepower":67, "Weight_in_lbs":1995, "Acceleration":16.2, "Year":"1982-01-01", "Origin":"Japan"} 395 | {"Name":"buick century limited", "Miles_per_Gallon":25, "Cylinders":6, "Displacement":181, "Horsepower":110, "Weight_in_lbs":2945, "Acceleration":16.4, "Year":"1982-01-01", "Origin":"USA"} 396 | {"Name":"oldsmobile cutlass ciera (diesel)", "Miles_per_Gallon":38, "Cylinders":6, "Displacement":262, "Horsepower":85, "Weight_in_lbs":3015, "Acceleration":17, "Year":"1982-01-01", "Origin":"USA"} 397 | {"Name":"chrysler lebaron medallion", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":156, "Horsepower":92, "Weight_in_lbs":2585, "Acceleration":14.5, "Year":"1982-01-01", "Origin":"USA"} 398 | {"Name":"ford granada l", "Miles_per_Gallon":22, "Cylinders":6, "Displacement":232, "Horsepower":112, "Weight_in_lbs":2835, "Acceleration":14.7, "Year":"1982-01-01", "Origin":"USA"} 399 | {"Name":"toyota celica gt", "Miles_per_Gallon":32, "Cylinders":4, "Displacement":144, "Horsepower":96, "Weight_in_lbs":2665, "Acceleration":13.9, "Year":"1982-01-01", "Origin":"Japan"} 400 | {"Name":"dodge charger 2.2", "Miles_per_Gallon":36, "Cylinders":4, "Displacement":135, "Horsepower":84, "Weight_in_lbs":2370, "Acceleration":13, "Year":"1982-01-01", "Origin":"USA"} 401 | {"Name":"chevrolet camaro", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":151, "Horsepower":90, "Weight_in_lbs":2950, "Acceleration":17.3, "Year":"1982-01-01", "Origin":"USA"} 402 | {"Name":"ford mustang gl", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":140, "Horsepower":86, "Weight_in_lbs":2790, "Acceleration":15.6, "Year":"1982-01-01", "Origin":"USA"} 403 | {"Name":"vw pickup", "Miles_per_Gallon":44, "Cylinders":4, "Displacement":97, "Horsepower":52, "Weight_in_lbs":2130, "Acceleration":24.6, "Year":"1982-01-01", "Origin":"Europe"} 404 | {"Name":"dodge rampage", "Miles_per_Gallon":32, "Cylinders":4, "Displacement":135, "Horsepower":84, "Weight_in_lbs":2295, "Acceleration":11.6, "Year":"1982-01-01", "Origin":"USA"} 405 | {"Name":"ford ranger", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":120, "Horsepower":79, "Weight_in_lbs":2625, "Acceleration":18.6, "Year":"1982-01-01", "Origin":"USA"} 406 | {"Name":"chevy s-10", "Miles_per_Gallon":31, "Cylinders":4, "Displacement":119, "Horsepower":82, "Weight_in_lbs":2720, "Acceleration":19.4, "Year":"1982-01-01", "Origin":"USA"} 
--------------------------------------------------------------------------------
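
Note: cars.json above is JSON Lines (one object per line), and some records carry explicit nulls in Horsepower or Miles_per_Gallon and repeated Name values (e.g. "ford pinto", "toyota corolla"), which matters for aggregations and joins. As a minimal sketch (not a file from this repo; the object name and output calls are illustrative), assuming the local SparkSession style used in the project's Scala sources, the file could be loaded like this:

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical demo object, not part of the repo.
object CarsReadDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Cars Read Demo")
      .master("local[*]")
      .getOrCreate()

    // JSON Lines: one record per line, so the plain JSON reader works directly.
    // Spark infers the schema; Horsepower and Miles_per_Gallon come out as
    // nullable numeric columns because some records contain explicit nulls.
    val carsDF = spark.read.json("src/main/resources/data/cars/cars.json")

    carsDF.printSchema()
    carsDF.show(5, truncate = false)

    spark.stop()
  }
}
```

Filtering out the null-Horsepower rows (e.g. `carsDF.where(col("Horsepower").isNotNull)`) is a sensible first step before using the column in computations.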