├── project └── build.properties ├── docker-clean.sh ├── src ├── main │ ├── resources │ │ └── data │ │ │ ├── guitarPlayers │ │ │ └── guitarPlayers.json │ │ │ ├── bands │ │ │ └── bands.json │ │ │ ├── guitars │ │ │ └── guitars.json │ │ │ ├── lipsum │ │ │ └── words.txt │ │ │ ├── employees │ │ │ └── employees.csv │ │ │ ├── employees_headers │ │ │ └── employees_headers.csv │ │ │ └── cars │ │ │ └── cars.json │ └── scala │ │ ├── generator │ │ ├── LaptopsDomain.scala │ │ └── DataGenerator.scala │ │ ├── playground │ │ └── Playground.scala │ │ ├── part2foundations │ │ ├── TestDeployApp.scala │ │ ├── ReadingDAGs.scala │ │ ├── SparkJobAnatomy.scala │ │ ├── ReadingQueryPlans.scala │ │ └── SparkAPIs.scala │ │ ├── part3dfjoins │ │ ├── BroadcastJoins.scala │ │ ├── JoinsRecap.scala │ │ ├── SkewedJoins.scala │ │ ├── ColumnPruning.scala │ │ ├── PrePartitioning.scala │ │ └── Bucketing.scala │ │ ├── part4rddjoins │ │ ├── RDDBroadcastJoins.scala │ │ ├── CogroupingRDDs.scala │ │ ├── RDDSkewedJoins.scala │ │ └── SimpleRDDJoins.scala │ │ ├── part1recap │ │ ├── SparkRecap.scala │ │ └── ScalaRecap.scala │ │ └── part5rddtransformations │ │ ├── ByKeyFunctions.scala │ │ ├── I2ITransformations.scala │ │ └── ReusingObjects.scala └── META-INF │ └── MANIFEST.MF ├── spark-cluster ├── build-images.sh ├── docker │ ├── spark-worker │ │ ├── Dockerfile │ │ └── start-worker.sh │ ├── spark-submit │ │ ├── spark-submit.sh │ │ └── Dockerfile │ ├── spark-master │ │ ├── Dockerfile │ │ └── start-master.sh │ └── base │ │ └── Dockerfile ├── env │ └── spark-worker.sh ├── docker-compose.yml └── README.md ├── .bsp └── sbt.json ├── README.md ├── .gitignore └── HadoopWindowsUserSetup.md /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 1.9.6 -------------------------------------------------------------------------------- /docker-clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker rm -f $(docker ps -aq) -------------------------------------------------------------------------------- /src/main/resources/data/guitarPlayers/guitarPlayers.json: -------------------------------------------------------------------------------- 1 | {"id":0,"name":"Jimmy Page","guitars":[0],"band":0} 2 | {"id":1,"name":"Angus Young","guitars":[1],"band":1} 3 | {"id":2,"name":"Eric Clapton","guitars":[1,5],"band":2} 4 | {"id":3,"name":"Kirk Hammett","guitars":[3],"band":3} 5 | -------------------------------------------------------------------------------- /spark-cluster/build-images.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | docker build -t spark-base:latest ./docker/base 6 | docker build -t spark-master:latest ./docker/spark-master 7 | docker build -t spark-worker:latest ./docker/spark-worker 8 | docker build -t spark-submit:latest ./docker/spark-submit -------------------------------------------------------------------------------- /src/main/resources/data/bands/bands.json: -------------------------------------------------------------------------------- 1 | {"id":1,"name":"AC/DC","hometown":"Sydney","year":1973} 2 | {"id":0,"name":"Led Zeppelin","hometown":"London","year":1968} 3 | {"id":3,"name":"Metallica","hometown":"Los Angeles","year":1981} 4 | {"id":4,"name":"The Beatles","hometown":"Liverpool","year":1960} 5 | -------------------------------------------------------------------------------- /spark-cluster/docker/spark-worker/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM spark-base:latest 2 | 3 | COPY start-worker.sh / 4 | 5 | ENV SPARK_WORKER_WEBUI_PORT 8081 6 | ENV SPARK_WORKER_LOG /spark/logs 7 | ENV SPARK_MASTER "spark://spark-master:7077" 8 | 9 | EXPOSE 8081 10 | 11 | CMD ["/bin/bash", "/start-worker.sh"] 12 | -------------------------------------------------------------------------------- /src/main/scala/generator/LaptopsDomain.scala: -------------------------------------------------------------------------------- 1 | package generator 2 | 3 | case class LaptopModel(make: String, model: String) 4 | case class Laptop(registration: String, make: String, model: String, procSpeed: Double) 5 | case class LaptopOffer(make: String, model: String, procSpeed: Double, salePrice: Double) 6 | -------------------------------------------------------------------------------- /spark-cluster/env/spark-worker.sh: -------------------------------------------------------------------------------- 1 | #Environment variables used by the spark workers 2 | #Do not touch this unless you modify the compose master 3 | SPARK_MASTER=spark://spark-master:7077 4 | #Allocation Parameters 5 | SPARK_WORKER_CORES=1 6 | SPARK_WORKER_MEMORY=1G 7 | SPARK_DRIVER_MEMORY=128m 8 | SPARK_EXECUTOR_MEMORY=256m -------------------------------------------------------------------------------- /spark-cluster/docker/spark-submit/spark-submit.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /spark/bin/spark-submit \ 4 | --class ${SPARK_APPLICATION_MAIN_CLASS} \ 5 | --master ${SPARK_MASTER_URL} \ 6 | --deploy-mode cluster \ 7 | --total-executor-cores 1 \ 8 | ${SPARK_SUBMIT_ARGS} \ 9 | ${SPARK_APPLICATION_JAR_LOCATION} \ 10 | ${SPARK_APPLICATION_ARGS} \ -------------------------------------------------------------------------------- /.bsp/sbt.json: -------------------------------------------------------------------------------- 1 | {"name":"sbt","version":"1.9.6","bspVersion":"2.1.0-M1","languages":["scala"],"argv":["/Users/daniel/Library/Java/JavaVirtualMachines/adopt-openjdk-1.8.0_265/Contents/Home/jre/bin/java","-Xms100m","-Xmx100m","-classpath","/Users/daniel/Library/Application Support/JetBrains/IdeaIC2023.1/plugins/Scala/launcher/sbt-launch.jar","-Dsbt.script=/usr/local/bin/sbt","xsbt.boot.Boot","-bsp"]} -------------------------------------------------------------------------------- /src/main/resources/data/guitars/guitars.json: -------------------------------------------------------------------------------- 1 | {"id":0,"model":"EDS-1275","make":"Gibson","guitarType":"Electric double-necked"} 2 | {"id":5,"model":"Stratocaster","make":"Fender","guitarType":"Electric"} 3 | {"id":1,"model":"SG","make":"Gibson","guitarType":"Electric"} 4 | {"id":2,"model":"914","make":"Taylor","guitarType":"Acoustic"} 5 | {"id":3,"model":"M-II","make":"ESP","guitarType":"Electric"} 6 | -------------------------------------------------------------------------------- /spark-cluster/docker/spark-submit/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM spark-base:latest 2 | 3 | COPY spark-submit.sh / 4 | 5 | ENV SPARK_MASTER_URL="spark://spark-master:7077" 6 | ENV SPARK_SUBMIT_ARGS="" 7 | ENV SPARK_APPLICATION_ARGS "" 8 | #ENV SPARK_APPLICATION_JAR_LOCATION /opt/spark-apps/myjar.jar 9 | #ENV SPARK_APPLICATION_MAIN_CLASS my.main.Application 10 | 11 | 12 | CMD ["/bin/bash", "/spark-submit.sh"] 13 | 
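
(Editor's note: a minimal sketch of an application one might package and submit through the `spark-submit` container above. The object name `ClusterTestApp` and the jar location are assumptions for illustration, not part of the repo - the repo's own `TestDeployApp` further below is the version actually used in the course. To submit it, one would set `SPARK_APPLICATION_MAIN_CLASS` and `SPARK_APPLICATION_JAR_LOCATION` to match.)

```scala
package playground

import org.apache.spark.sql.SparkSession

// hypothetical smoke-test app, e.g. SPARK_APPLICATION_MAIN_CLASS=playground.ClusterTestApp
object ClusterTestApp {
  def main(args: Array[String]): Unit = {
    // no .master() here: the master URL is supplied by spark-submit via --master ${SPARK_MASTER_URL}
    val spark = SparkSession.builder()
      .appName("Cluster Test App")
      .getOrCreate()

    // a tiny distributed job, just enough to prove the cluster executes work
    println(spark.range(1, 1000000).count())

    spark.stop()
  }
}
```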
-------------------------------------------------------------------------------- /spark-cluster/docker/spark-master/Dockerfile: -------------------------------------------------------------------------------- 1 | # This assumes spark-base was built first. 2 | # Usually we'd run the build-images.sh script which builds spark-base 3 | 4 | FROM spark-base:latest 5 | 6 | COPY start-master.sh / 7 | 8 | ENV SPARK_MASTER_PORT 7077 9 | ENV SPARK_MASTER_WEBUI_PORT 8080 10 | ENV SPARK_MASTER_LOG /spark/logs 11 | 12 | EXPOSE 8080 7077 6066 13 | 14 | CMD ["/bin/bash", "/start-master.sh"] 15 | -------------------------------------------------------------------------------- /spark-cluster/docker/spark-worker/start-worker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | . "/spark/sbin/spark-config.sh" 4 | . "/spark/bin/load-spark-env.sh" 5 | 6 | mkdir -p $SPARK_WORKER_LOG 7 | 8 | export SPARK_HOME=/spark 9 | 10 | ln -sf /dev/stdout $SPARK_WORKER_LOG/spark-worker.out 11 | 12 | /spark/sbin/../bin/spark-class org.apache.spark.deploy.worker.Worker --webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG/spark-worker.out -------------------------------------------------------------------------------- /spark-cluster/docker/spark-master/start-master.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export SPARK_MASTER_HOST=`hostname` 4 | 5 | . "/spark/sbin/spark-config.sh" 6 | 7 | . "/spark/bin/load-spark-env.sh" 8 | 9 | mkdir -p $SPARK_MASTER_LOG 10 | 11 | export SPARK_HOME=/spark 12 | 13 | ln -sf /dev/stdout $SPARK_MASTER_LOG/spark-master.out 14 | 15 | cd /spark/bin && /spark/sbin/../bin/spark-class org.apache.spark.deploy.master.Master --ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG/spark-master.out 16 | -------------------------------------------------------------------------------- /spark-cluster/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | services: 3 | spark-master: 4 | image: spark-master:latest 5 | ports: 6 | - "4040:4040" 7 | - "9090:8080" 8 | - "7077:7077" 9 | volumes: 10 | - ./apps:/opt/spark-apps 11 | - ./data:/opt/spark-data 12 | environment: 13 | - "SPARK_LOCAL_IP=spark-master" 14 | spark-worker: 15 | image: spark-worker:latest 16 | depends_on: 17 | - spark-master 18 | environment: 19 | - SPARK_MASTER=spark://spark-master:7077 20 | - SPARK_WORKER_CORES=1 21 | - SPARK_WORKER_MEMORY=2G 22 | - SPARK_DRIVER_MEMORY=256m 23 | - SPARK_EXECUTOR_MEMORY=1G 24 | volumes: 25 | - ./apps:/opt/spark-apps 26 | - ./data:/opt/spark-data 27 | -------------------------------------------------------------------------------- /src/main/scala/playground/Playground.scala: -------------------------------------------------------------------------------- 1 | package playground 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * A simple Scala application where I'll invite you to play and fiddle with the code that we write in this course. 7 | * (not that you couldn't create your own, mind you.) 8 | * 9 | * If you can compile and run this application, it means that the libraries were downloaded correctly. 10 | * In that case, you should be good to go for the rest of the course. 11 | * 12 | * Enjoy! 
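 * (Editor's note: on Windows, if this app fails with a winutils/Hadoop-related error, follow HadoopWindowsUserSetup.md in the repo root first.)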
13 |  *
14 |  * Daniel @ Rock the JVM
15 |  */
16 | object Playground {
17 | 
18 |   val spark = SparkSession.builder()
19 |     .appName("Spark Optimization Playground")
20 |     .master("local")
21 |     .getOrCreate()
22 | 
23 |   val sc = spark.sparkContext
24 | 
25 |   def main(args: Array[String]): Unit = {
26 |     val rdd = sc.parallelize(1 to 1000)
27 |     println(s"I have my first RDD, it has ${rdd.count} rows. Now let me go optimize massive jobs.")
28 |   }
29 | }
30 | 
--------------------------------------------------------------------------------
/src/main/scala/part2foundations/TestDeployApp.scala:
--------------------------------------------------------------------------------
1 | package part2foundations
2 | 
3 | import org.apache.spark.sql.{SaveMode, SparkSession}
4 | 
5 | object TestDeployApp {
6 | 
7 |   // TestDeployApp inputFile outputFile
8 |   def main(args: Array[String]): Unit = {
9 | 
10 |     if (args.length != 2) {
11 |       println("Need input file and output file")
12 |       System.exit(1)
13 |     }
14 | 
15 |     val spark = SparkSession.builder()
16 |       .appName("Test Deploy App")
17 |       // method 1
18 |       .config("spark.executor.memory", "1g")
19 |       .getOrCreate()
20 | 
21 |     import spark.implicits._
22 | 
23 |     val moviesDF = spark.read
24 |       .option("inferSchema", "true")
25 |       .json(args(0))
26 | 
27 |     val goodComediesDF = moviesDF.select(
28 |       $"Title",
29 |       $"IMDB_Rating".as("Rating"),
30 |       $"Release_Date".as("Release")
31 |     )
32 |       .where(($"Major_Genre" === "Comedy") and ($"IMDB_Rating" > 6.5))
33 |       .orderBy($"Rating".desc_nulls_last)
34 | 
35 |     // method 2
36 |     spark.conf.set("spark.executor.memory", "1g") // warning - not all configurations available
37 | 
38 |     /*
39 |       method 3: pass configs as command line arguments:
40 | 
41 |       spark-submit ... --conf spark.executor.memory=1g
42 | 
43 |       You can also use dedicated command line arguments for certain configurations:
44 |       --master = spark.master
45 |       --executor-memory = spark.executor.memory
46 |       --driver-memory = spark.driver.memory
47 | 
48 |       and many more.
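
      (Editor's note - per the Spark configuration docs, when the same property is set in more than
      one place, the precedence is: values set in code via SparkConf/.config (methods 1-2), then
      flags passed to spark-submit (method 3), then values in spark-defaults.conf.)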
49 | */ 50 | goodComediesDF.show() 51 | 52 | goodComediesDF.write 53 | .mode(SaveMode.Overwrite) 54 | .format("json") 55 | .save(args(1)) 56 | 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main/scala/part3dfjoins/BroadcastJoins.scala: -------------------------------------------------------------------------------- 1 | package part3dfjoins 2 | 3 | import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} 4 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 5 | import org.apache.spark.sql.functions._ 6 | 7 | object BroadcastJoins { 8 | 9 | val spark = SparkSession.builder() 10 | .appName("Broadcast Joins") 11 | .master("local") 12 | .getOrCreate() 13 | 14 | val sc = spark.sparkContext 15 | 16 | val rows = sc.parallelize(List( 17 | Row(0, "zero"), 18 | Row(1, "first"), 19 | Row(2, "second"), 20 | Row(3, "third") 21 | )) 22 | 23 | val rowsSchema = StructType(Array( 24 | StructField("id", IntegerType), 25 | StructField("order", StringType) 26 | )) 27 | 28 | // small table 29 | val lookupTable: DataFrame = spark.createDataFrame(rows, rowsSchema) 30 | 31 | // large table 32 | val table = spark.range(1, 100000000) // column is "id" 33 | 34 | // the innocent join 35 | val joined = table.join(lookupTable, "id") 36 | joined.explain 37 | // joined.show - takes an ice age 38 | 39 | // a smarter join 40 | val joinedSmart = table.join(broadcast(lookupTable), "id") 41 | joinedSmart.explain() 42 | // joinedSmart.show() 43 | 44 | // auto-broadcast detection 45 | val bigTable = spark.range(1, 100000000) 46 | val smallTable = spark.range(1, 10000) // size estimated by Spark - auto-broadcast 47 | val joinedNumbers = smallTable.join(bigTable, "id") 48 | 49 | // deactivate auto-broadcast 50 | spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1) 51 | 52 | joinedNumbers.explain() 53 | 54 | def main(args: Array[String]): Unit = { 55 | Thread.sleep(1000000) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/scala/part2foundations/ReadingDAGs.scala: -------------------------------------------------------------------------------- 1 | package part2foundations 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ReadingDAGs { 6 | 7 | ///////////////////////////////////////////////////////////////////// Boilerplate 8 | // you don't need this code in the Spark shell 9 | // this code is needed if you want to run it locally in IntelliJ 10 | 11 | val spark = SparkSession.builder() 12 | .config("spark.master", "local") 13 | .appName("Reading Query Plans") 14 | .getOrCreate() 15 | 16 | val sc = spark.sparkContext 17 | 18 | ///////////////////////////////////////////////////////////////////// Boilerplate 19 | 20 | // job 1 21 | sc.parallelize(1 to 1000000).count() 22 | // DAG with a single "box" - the creation of the RDD 23 | 24 | val rdd1 = sc.parallelize(1 to 1000000) 25 | 26 | // job 2 27 | rdd1.map(_ * 2).count() 28 | // DAG with one stage and two "boxes": one for creating the RDD and one for the map 29 | 30 | // job 3 31 | rdd1.repartition(23).count() 32 | // DAG with two stages: 33 | // stage 1 - the creation of the RDD + exchange 34 | // stage 2 - computation of the count 35 | 36 | // job 4 - same as query plans: 37 | val ds1 = spark.range(1, 10000000) 38 | val ds2 = spark.range(1, 20000000, 2) 39 | val ds3 = ds1.repartition(7) 40 | val ds4 = ds2.repartition(9) 41 | val ds5 = ds3.selectExpr("id * 3 as id") 42 | val joined = ds5.join(ds4, "id") 43 | val 
sum = joined.selectExpr("sum(id)")
44 |   // complex DAG
45 | 
46 |   /**
47 |    * Takeaway: the DAG is a visual representation of the steps Spark will perform to run a job.
48 |    * It's the "drawing" version of the physical query plan.
49 |    * Unlike query plans, which are only available for DataFrames/Spark SQL, DAGs show up for ANY job.
50 |    */
51 | 
52 | }
--------------------------------------------------------------------------------
/src/main/resources/data/lipsum/words.txt:
--------------------------------------------------------------------------------
1 | elit
2 | est
3 | consequat
4 | pulvinar
5 | tortor
6 | fringilla
7 | consectetur
8 | sed
9 | aliquet
10 | taciti
11 | in
12 | accumsan
13 | sapien
14 | sagittis
15 | torquent
16 | molestie
17 | volutpat
18 | dui
19 | auctor
20 | eu
21 | ultricies
22 | nam
23 | aliquam
24 | nec
25 | justo
26 | laoreet
27 | sit
28 | mattis
29 | quis
30 | ultrices
31 | vitae
32 | risus
33 | fusce
34 | dapibus
35 | ipsum
36 | felis
37 | cubilia
38 | conubia
39 | vel
40 | ligula
41 | per
42 | mollis
43 | tellus
44 | orci
45 | aenean
46 | purus
47 | scelerisque
48 | malesuada
49 | inceptos
50 | luctus
51 | himenaeos
52 | curabitur
53 | potenti
54 | cursus
55 | suspendisse
56 | nisl
57 | lorem
58 | a
59 | eget
60 | convallis
61 | metus
62 | amet
63 | nullam
64 | enim
65 | praesent
66 | primis
67 | cras
68 | consectetuer
69 | commodo
70 | vestibulum
71 | condimentum
72 | blandit
73 | ut
74 | neque
75 | fermentum
76 | viverra
77 | ante
78 | et
79 | faucibus
80 | massa
81 | egestas
82 | porttitor
83 | facilisi
84 | sodales
85 | magna
86 | suscipit
87 | iaculis
88 | dolor
89 | at
90 | nisi
91 | sem
92 | semper
93 | id
94 | arcu
95 | dignissim
96 | ac
97 | nostra
98 | nunc
99 | lacus
100 | euismod
101 | pharetra
102 | aptent
103 | tristique
104 | posuere
105 | proin
106 | nibh
107 | pede
108 | facilisis
109 | etiam
110 | morbi
111 | nulla
112 | ad
113 | turpis
114 | class
115 | curae
116 | sollicitudin
117 | venenatis
118 | ullamcorper
119 | litora
120 | lectus
121 | integer
122 | mi
123 | quam
124 | vivamus
125 | pretium
126 | imperdiet
127 | odio
128 | porta
129 | mauris
130 | lacinia
131 | donec
132 | pellentesque
133 | duis
134 | quisque
135 | maecenas
136 | augue
137 | velit
138 | congue
139 | diam
140 | tincidunt
141 | libero
142 | interdum
143 | non
144 | urna
145 | sociosqu
146 | feugiat
147 | adipiscing
148 | elementum
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # The official repository for the Rock the JVM Spark Optimization with Scala course
2 | 
3 | Powered by [Rock the JVM!](https://rockthejvm.com)
4 | 
5 | This repository contains the code we wrote during [Rock the JVM's Spark Optimization with Scala](https://rockthejvm.com/course/spark-optimization) course. Unless explicitly mentioned, the code in this repository is exactly what we wrote on camera.
6 | 
7 | ### Install and setup
8 | 
9 | - install [IntelliJ IDEA](https://jetbrains.com/idea)
10 | - install [Docker Desktop](https://docker.com)
11 | - either clone the repo or download as zip
12 | - open with IntelliJ as an SBT project
13 | - Windows users, you need to set up some Hadoop-related configs - use [this guide](/HadoopWindowsUserSetup.md)
14 | 
15 | As you open the project, the IDE will take care of downloading and applying the appropriate library dependencies.
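To verify everything works, run the `Playground` application under `src/main/scala/playground` - if it compiles and prints an RDD count, the libraries were downloaded correctly.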
16 | 
17 | To set up the dockerized Spark cluster we will be using in the course, do the following:
18 | 
19 | - open a terminal and navigate to `spark-cluster`
20 | - run `build-images.sh` (if you don't have a bash terminal, just open the file and run each line one by one)
21 | - run `docker-compose up`
22 | 
23 | To interact with the Spark cluster, the folders `data` and `apps` inside the `spark-cluster` folder are mounted onto the Docker containers under `/opt/spark-data` and `/opt/spark-apps` respectively.
24 | 
25 | To run a Spark shell, first run `docker-compose up` inside the `spark-cluster` directory, then in another terminal, do
26 | 
27 | ```
28 | docker exec -it spark-cluster_spark-master_1 bash
29 | ```
30 | 
31 | and then
32 | 
33 | ```
34 | /spark/bin/spark-shell
35 | ```
36 | 
37 | ### How to use intermediate states of this repository
38 | 
39 | Start by cloning this repository and check out the `start` tag:
40 | 
41 | ```
42 | git checkout start
43 | ```
44 | 
45 | ### For questions or suggestions
46 | 
47 | If you have changes to suggest to this repo, either
48 | - submit a GitHub issue
49 | - tell me in the course Q/A forum
50 | - submit a pull request!
51 | 
--------------------------------------------------------------------------------
/spark-cluster/docker/base/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM eclipse-temurin:17-jdk
2 | LABEL author="Daniel Ciocirlan" email="daniel@rockthejvm.com"
3 | LABEL version="0.3"
4 | 
5 | ENV DAEMON_RUN=true
6 | ENV SPARK_VERSION=3.5.0
7 | ENV SCALA_VERSION_BASE=2.13
8 | ENV HADOOP_VERSION=3
9 | ENV SCALA_VERSION=2.13.12
10 | ENV SCALA_HOME=/usr/share/scala
11 | ENV SPARK_HOME=/spark
12 | 
13 | 
14 | RUN apt-get update && apt-get install -y curl vim wget software-properties-common ssh net-tools ca-certificates jq dbus-x11
15 | RUN echo exit 0 > /usr/sbin/policy-rc.d
16 | 
17 | RUN cd "/tmp" && \
18 |     wget --no-verbose "https://downloads.typesafe.com/scala/${SCALA_VERSION}/scala-${SCALA_VERSION}.tgz" && \
19 |     tar xzf "scala-${SCALA_VERSION}.tgz" && \
20 |     mkdir "${SCALA_HOME}" && \
21 |     rm "/tmp/scala-${SCALA_VERSION}/bin/"*.bat && \
22 |     mv "/tmp/scala-${SCALA_VERSION}/bin" "/tmp/scala-${SCALA_VERSION}/lib" "${SCALA_HOME}" && \
23 |     ln -s "${SCALA_HOME}/bin/"* "/usr/bin/" && \
24 |     rm -rf "/tmp/"*
25 | 
26 | # Add Dependencies for PySpark
27 | RUN apt-get install -y python3 python3-pip python3-numpy python3-matplotlib python3-scipy python3-pandas python3-simpy
28 | RUN update-alternatives --install "/usr/bin/python" "python" "$(which python3)" 1
29 | 
30 | 
31 | # sbt installation
32 | RUN export PATH="/usr/local/sbt/bin:$PATH" && apt update && apt install -y ca-certificates wget tar && mkdir -p "/usr/local/sbt" && wget -qO - --no-check-certificate "https://github.com/sbt/sbt/releases/download/v1.9.6/sbt-1.9.6.tgz" | tar xz -C /usr/local/sbt --strip-components=1 && sbt sbtVersion -Dsbt.rootdir=true
33 | 
34 | RUN wget --no-verbose https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${SCALA_VERSION_BASE}.tgz && tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${SCALA_VERSION_BASE}.tgz \
35 |     && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${SCALA_VERSION_BASE} spark \
36 |     && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala${SCALA_VERSION_BASE}.tgz
37 | 
38 | 
39 | 
40 | # Fix the value of PYTHONHASHSEED
41 | # Note: this is needed when you use Python 3.3 or greater
42 | ENV PYTHONHASHSEED 1
43 | 
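
(Editor's note: once the `spark-shell` from the README instructions above is up, a quick sanity check one might paste in. Everything here is standard Spark shell API; the expected version comes from the `SPARK_VERSION` baked into the base image above.)

```scala
// sc and spark are pre-created by the Spark shell
val testRdd = sc.parallelize(1 to 100000)
println(testRdd.count())  // expect 100000
println(spark.version)    // expect 3.5.0, the version installed by the base image
```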
--------------------------------------------------------------------------------
/src/main/scala/part4rddjoins/RDDBroadcastJoins.scala:
--------------------------------------------------------------------------------
1 | package part4rddjoins
2 | 
3 | import org.apache.spark.sql.SparkSession
4 | 
5 | import scala.util.Random
6 | 
7 | /**
8 |  * Shown on camera in the Spark Shell.
9 |  */
10 | object RDDBroadcastJoins {
11 | 
12 |   val spark = SparkSession.builder()
13 |     .appName("Broadcast Joins")
14 |     .master("local[*]")
15 |     .getOrCreate()
16 | 
17 |   val sc = spark.sparkContext
18 | 
19 |   val random = new Random()
20 | 
21 |   /*
22 |     Scenario: assign prizes to a wide-scale competition (10M+ people).
23 |     Goal: find out who won what.
24 |    */
25 | 
26 |   // small lookup table
27 |   val prizes = sc.parallelize(List(
28 |     (1, "gold"),
29 |     (2, "silver"),
30 |     (3, "bronze")
31 |   ))
32 | 
33 |   // the competition has ended - the leaderboard is known
34 |   val leaderboard = sc.parallelize(1 to 10000000).map((_, random.alphanumeric.take(8).mkString))
35 |   val medalists = leaderboard.join(prizes)
36 |   medalists.foreach(println) // 38s for 10M elements!
37 | 
38 |   /*
39 |     We know from SQL joins that the small RDD can be broadcast so that we can avoid the shuffle on the big RDD.
40 |     However, for the RDD API, we'll have to do this manually.
41 |     This lesson is more about how to actually implement the broadcasting technique on RDDs.
42 |    */
43 | 
44 |   // need to collect the RDD locally, so that we can broadcast to the executors
45 |   val medalsMap = prizes.collectAsMap()
46 |   // broadcast the map and keep the returned handle - referring to medalsMap directly in a closure would ship it with every task instead of broadcasting it once per executor
47 |   val medalsBroadcast = sc.broadcast(medalsMap)
48 |   // need to avoid shuffles by manually going through the partitions of the big RDD
49 |   val improvedMedalists = leaderboard.mapPartitions { iterator => // iterator of all the tuples in this partition; all the tuples are local to this executor
50 |     iterator.flatMap { record =>
51 |       val (index, name) = record
52 |       medalsBroadcast.value.get(index) match { // read the map through the broadcast handle, now available locally on every executor
53 |         case None => Seq.empty
54 |         case Some(medal) => Seq((name, medal))
55 |       }
56 |     }
57 |   }
58 | 
59 |   improvedMedalists.foreach(println) // 2s, blazing fast, no shuffles or anything at all.
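
  // (editor's note - an optional sanity check: this lineage should contain no ShuffledRDD,
  // unlike the plain join above, confirming the broadcast version avoids the shuffle)
  println(improvedMedalists.toDebugString)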
60 | 61 | def main(args: Array[String]): Unit = { 62 | Thread.sleep(1000000) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/scala/part4rddjoins/CogroupingRDDs.scala: -------------------------------------------------------------------------------- 1 | package part4rddjoins 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object CogroupingRDDs { 7 | 8 | val spark = SparkSession.builder() 9 | .appName("Cogrouping RDDs") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val sc = spark.sparkContext 14 | val rootFolder = "src/main/resources/generated/examData" 15 | 16 | /* 17 | Take all the student attempts 18 | - if a student passed (at least one attempt > 9.0), send them an email "PASSED" 19 | - else send them an email with "FAILED" 20 | */ 21 | 22 | def readIds() = sc.textFile(s"$rootFolder/examIds.txt") 23 | .map { line => 24 | val tokens = line.split(" ") 25 | (tokens(0).toLong, tokens(1)) 26 | } 27 | 28 | def readExamScores() = sc.textFile(s"$rootFolder/examScores.txt") 29 | .map { line => 30 | val tokens = line.split(" ") 31 | (tokens(0).toLong, tokens(1).toDouble) 32 | } 33 | 34 | def readExamEmails() = sc.textFile(s"$rootFolder/examEmails.txt") 35 | .map { line => 36 | val tokens = line.split(" ") 37 | (tokens(0).toLong, tokens(1)) 38 | } 39 | 40 | def plainJoin() = { 41 | val scores = readExamScores().reduceByKey(Math.max) 42 | val candidates = readIds() 43 | val emails = readExamEmails() 44 | 45 | val results = candidates 46 | .join(scores) // RDD[(Long, (String, Double))] 47 | .join(emails) // RDD[(Long, ((String, Double), String))] 48 | .mapValues { 49 | case ((_, maxAttempt), email) => 50 | if (maxAttempt >= 9.0) (email, "PASSED") 51 | else (email, "FAILED") 52 | } 53 | 54 | results.count() 55 | } 56 | 57 | def coGroupedJoin() = { 58 | val scores = readExamScores().reduceByKey(Math.max) 59 | val candidates = readIds() 60 | val emails = readExamEmails() 61 | 62 | val result: RDD[(Long, Option[(String, String)])] = candidates.cogroup(scores, emails) // co-partition the 3 RDDs: RDD[(Long, (Iterable[String], Iterable[Double], Iterable[String]))] 63 | .mapValues { 64 | case (nameIterable, maxAttemptIterable, emailIterable) => 65 | val name = nameIterable.headOption 66 | val maxScore = maxAttemptIterable.headOption 67 | val email = emailIterable.headOption 68 | 69 | for { 70 | e <- email 71 | s <- maxScore 72 | } yield (e, if (s >= 9.0) "PASSED" else "FAILED") 73 | } 74 | 75 | result.count() 76 | result.count() 77 | } 78 | 79 | 80 | def main(args: Array[String]): Unit = { 81 | plainJoin() 82 | coGroupedJoin() 83 | Thread.sleep(1000000) 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/main/resources/data/employees/employees.csv: -------------------------------------------------------------------------------- 1 | 7584740,Devin,Jeramy,Vedenichev,103,1951-12-31T05:00:00.000Z,958-67-2937,55214 2 | 7677553,Marcus,Arlie,Tibb,103,1951-12-31T05:00:00.000Z,999-90-6698,47746 3 | 7736171,Pat,Johnie,De Keep,103,1951-12-31T05:00:00.000Z,960-92-7355,98868 4 | 7813417,Brooks,Dannie,Lemmens,103,1951-12-31T05:00:00.000Z,914-82-1490,69065 5 | 7929094,Milan,Alex,Setterfield,103,1951-12-31T05:00:00.000Z,989-42-1725,79877 6 | 8037265,Dexter,Otis,Brahmer,103,1951-12-31T05:00:00.000Z,959-50-1621,91473 7 | 8050195,Anibal,Nicky,Springford,103,1951-12-31T05:00:00.000Z,998-90-1376,62784 8 | 
8053263,Karl,Horace,Thompson,103,1951-12-31T05:00:00.000Z,930-70-7868,90481 9 | 8125007,Quinton,Eduardo,Rooze,103,1951-12-31T05:00:00.000Z,903-21-6089,87389 10 | 8169581,Jc,Edison,Fruchter,103,1951-12-31T05:00:00.000Z,905-35-8052,58120 11 | 8206921,Sylvester,Faustino,Glasgow,103,1951-12-31T05:00:00.000Z,907-30-2443,43279 12 | 8230707,Juan,Dewayne,Leggitt,103,1951-12-31T05:00:00.000Z,965-56-5537,74107 13 | 8353995,Bill,Dirk,Negri,103,1951-12-31T05:00:00.000Z,905-76-4894,79389 14 | 8430873,Oren,August,Lachaize,103,1951-12-31T05:00:00.000Z,990-14-4688,82777 15 | 8448144,Dewayne,Hubert,Sturton,103,1951-12-31T05:00:00.000Z,917-37-8590,91109 16 | 8479988,Courtney,Long,Bellin,103,1951-12-31T05:00:00.000Z,906-26-2150,51099 17 | 8481731,Sung,Jamie,Haslin,103,1951-12-31T05:00:00.000Z,982-29-1779,75742 18 | 8596468,Walker,Francesco,O'Fallone,103,1951-12-31T05:00:00.000Z,962-42-6521,107650 19 | 8650756,Clyde,Royce,Readshall,103,1951-12-31T05:00:00.000Z,983-38-1342,88851 20 | 5073404,Dale,Brice,Casolla,105,1951-12-31T05:00:00.000Z,963-52-1303,102445 21 | 5106546,Lupe,Rod,Bullard,105,1951-12-31T05:00:00.000Z,901-76-7197,84274 22 | 5178737,Douglass,Seth,Cummine,105,1951-12-31T05:00:00.000Z,958-74-4222,81743 23 | 5265931,Abram,Eddie,Izzett,105,1951-12-31T05:00:00.000Z,962-45-3424,63223 24 | 5268793,Wilson,Landon,Chellam,105,1951-12-31T05:00:00.000Z,911-11-7847,88966 25 | 5371143,Kieth,Malcolm,Formigli,105,1951-12-31T05:00:00.000Z,666-37-6950,70655 26 | 5407756,Ted,Pablo,Lidgely,105,1951-12-31T05:00:00.000Z,994-52-7326,63100 27 | 5413070,Reid,Reyes,Kabos,105,1951-12-31T05:00:00.000Z,918-28-4915,90488 28 | 5422998,Buford,Leonel,Stanbro,105,1951-12-31T05:00:00.000Z,927-29-6346,100472 29 | 5530003,Milford,Samual,Manis,105,1951-12-31T05:00:00.000Z,907-28-9627,40950 30 | 5550873,Wm,Fredrick,Filshin,105,1951-12-31T05:00:00.000Z,934-69-5826,39809 31 | 5597718,Jeromy,Von,Daley,105,1951-12-31T05:00:00.000Z,991-24-2641,29644 32 | 5889924,Tyrell,Alonso,Hagyard,105,1951-12-31T05:00:00.000Z,903-55-7969,39093 33 | 5890431,George,Moises,Shurville,105,1951-12-31T05:00:00.000Z,956-30-5990,44500 34 | 6130212,Percy,Les,Lafontaine,105,1951-12-31T05:00:00.000Z,922-95-3154,73006 35 | -------------------------------------------------------------------------------- /src/main/resources/data/employees_headers/employees_headers.csv: -------------------------------------------------------------------------------- 1 | id,firstName,middleName,lastName,dept,birthDate,ssn,salary 2 | 7584740,Devin,Jeramy,Vedenichev,103,1951-12-31T05:00:00.000Z,958-67-2937,55214 3 | 7677553,Marcus,Arlie,Tibb,103,1951-12-31T05:00:00.000Z,999-90-6698,47746 4 | 7736171,Pat,Johnie,De Keep,103,1951-12-31T05:00:00.000Z,960-92-7355,98868 5 | 7813417,Brooks,Dannie,Lemmens,103,1951-12-31T05:00:00.000Z,914-82-1490,69065 6 | 7929094,Milan,Alex,Setterfield,103,1951-12-31T05:00:00.000Z,989-42-1725,79877 7 | 8037265,Dexter,Otis,Brahmer,103,1951-12-31T05:00:00.000Z,959-50-1621,91473 8 | 8050195,Anibal,Nicky,Springford,103,1951-12-31T05:00:00.000Z,998-90-1376,62784 9 | 8053263,Karl,Horace,Thompson,103,1951-12-31T05:00:00.000Z,930-70-7868,90481 10 | 8125007,Quinton,Eduardo,Rooze,103,1951-12-31T05:00:00.000Z,903-21-6089,87389 11 | 8169581,Jc,Edison,Fruchter,103,1951-12-31T05:00:00.000Z,905-35-8052,58120 12 | 8206921,Sylvester,Faustino,Glasgow,103,1951-12-31T05:00:00.000Z,907-30-2443,43279 13 | 8230707,Juan,Dewayne,Leggitt,103,1951-12-31T05:00:00.000Z,965-56-5537,74107 14 | 8353995,Bill,Dirk,Negri,103,1951-12-31T05:00:00.000Z,905-76-4894,79389 15 | 
8430873,Oren,August,Lachaize,103,1951-12-31T05:00:00.000Z,990-14-4688,82777 16 | 8448144,Dewayne,Hubert,Sturton,103,1951-12-31T05:00:00.000Z,917-37-8590,91109 17 | 8479988,Courtney,Long,Bellin,103,1951-12-31T05:00:00.000Z,906-26-2150,51099 18 | 8481731,Sung,Jamie,Haslin,103,1951-12-31T05:00:00.000Z,982-29-1779,75742 19 | 8596468,Walker,Francesco,O'Fallone,103,1951-12-31T05:00:00.000Z,962-42-6521,107650 20 | 8650756,Clyde,Royce,Readshall,103,1951-12-31T05:00:00.000Z,983-38-1342,88851 21 | 5073404,Dale,Brice,Casolla,105,1951-12-31T05:00:00.000Z,963-52-1303,102445 22 | 5106546,Lupe,Rod,Bullard,105,1951-12-31T05:00:00.000Z,901-76-7197,84274 23 | 5178737,Douglass,Seth,Cummine,105,1951-12-31T05:00:00.000Z,958-74-4222,81743 24 | 5265931,Abram,Eddie,Izzett,105,1951-12-31T05:00:00.000Z,962-45-3424,63223 25 | 5268793,Wilson,Landon,Chellam,105,1951-12-31T05:00:00.000Z,911-11-7847,88966 26 | 5371143,Kieth,Malcolm,Formigli,105,1951-12-31T05:00:00.000Z,666-37-6950,70655 27 | 5407756,Ted,Pablo,Lidgely,105,1951-12-31T05:00:00.000Z,994-52-7326,63100 28 | 5413070,Reid,Reyes,Kabos,105,1951-12-31T05:00:00.000Z,918-28-4915,90488 29 | 5422998,Buford,Leonel,Stanbro,105,1951-12-31T05:00:00.000Z,927-29-6346,100472 30 | 5530003,Milford,Samual,Manis,105,1951-12-31T05:00:00.000Z,907-28-9627,40950 31 | 5550873,Wm,Fredrick,Filshin,105,1951-12-31T05:00:00.000Z,934-69-5826,39809 32 | 5597718,Jeromy,Von,Daley,105,1951-12-31T05:00:00.000Z,991-24-2641,29644 33 | 5889924,Tyrell,Alonso,Hagyard,105,1951-12-31T05:00:00.000Z,903-55-7969,39093 34 | 5890431,George,Moises,Shurville,105,1951-12-31T05:00:00.000Z,956-30-5990,44500 35 | 6130212,Percy,Les,Lafontaine,105,1951-12-31T05:00:00.000Z,922-95-3154,73006 36 | -------------------------------------------------------------------------------- /src/main/scala/part3dfjoins/JoinsRecap.scala: -------------------------------------------------------------------------------- 1 | package part3dfjoins 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object JoinsRecap { 7 | 8 | val spark = SparkSession.builder() 9 | .master("local[2]") 10 | .appName("Joins Recap") 11 | .getOrCreate() 12 | 13 | val sc = spark.sparkContext 14 | 15 | val guitarsDF = spark.read 16 | .option("inferSchema", "true") 17 | .json("src/main/resources/data/guitars") 18 | 19 | val guitaristsDF = spark.read 20 | .option("inferSchema", "true") 21 | .json("src/main/resources/data/guitarPlayers") 22 | 23 | val bandsDF = spark.read 24 | .option("inferSchema", "true") 25 | .json("src/main/resources/data/bands") 26 | 27 | // inner joins 28 | val joinCondition = guitaristsDF.col("band") === bandsDF.col("id") 29 | val guitaristsBandsDF = guitaristsDF.join(bandsDF, joinCondition, "inner") 30 | 31 | // outer joins 32 | // left outer = everything in inner join + all the rows in the LEFT table, with nulls in the rows not passing the condition in the RIGHT table 33 | guitaristsDF.join(bandsDF, joinCondition, "left_outer") 34 | // right outer = everything in inner join + all the rows in the RIGHT table, with nulls in the rows not passing the condition in the LEFT table 35 | guitaristsDF.join(bandsDF, joinCondition, "right_outer") 36 | // outer join = everything in left_outer + right_outer 37 | guitaristsDF.join(bandsDF, joinCondition, "outer") 38 | 39 | // semi joins = everything in the left DF for which THERE IS a row in the right DF satisfying the condition 40 | // essentially a filter 41 | guitaristsDF.join(bandsDF, joinCondition, "left_semi") 42 | 43 | // anti join = everything in the left 
DF for which THERE IS NOT a row in the right DF satisfying the condition
44 |   // also a filter
45 |   guitaristsDF.join(bandsDF, joinCondition, "left_anti")
46 | 
47 |   // cross join = everything in the left table with everything in the right table
48 |   // dangerous: NRows(crossjoin) = NRows(left) x NRows(right)
49 |   // careful with outer joins with non-unique keys
50 | 
51 |   // RDD joins
52 |   val colorsScores = Seq(
53 |     ("blue", 1),
54 |     ("red", 4),
55 |     ("green", 5),
56 |     ("yellow", 2),
57 |     ("orange", 3),
58 |     ("cyan", 0)
59 |   )
60 |   val colorsRDD: RDD[(String, Int)] = sc.parallelize(colorsScores)
61 |   val text = "The sky is blue, but the orange pale sun turns from yellow to red"
62 |   val words = text.split(" ").map(_.toLowerCase()).map((_, 1)) // standard technique for counting words with RDDs
63 |   val wordsRDD = sc.parallelize(words).reduceByKey(_ + _) // counting word occurrence
64 |   val scores: RDD[(String, (Int, Int))] = wordsRDD.join(colorsRDD) // implied join type is INNER
65 | 
66 | 
67 |   def main(args: Array[String]): Unit = {
68 | 
69 |   }
70 | }
71 | 
--------------------------------------------------------------------------------
/src/main/scala/part2foundations/SparkJobAnatomy.scala:
--------------------------------------------------------------------------------
1 | package part2foundations
2 | 
3 | import org.apache.spark.sql.SparkSession
4 | 
5 | 
6 | object SparkJobAnatomy {
7 | 
8 |   ///////////////////////////////////////////////////////////////////// Boilerplate
9 |   // you don't need this code in the Spark shell
10 |   // this code is needed if you want to run it locally in IntelliJ
11 | 
12 |   val spark = SparkSession.builder()
13 |     .config("spark.master", "local")
14 |     .appName("Spark Job Anatomy")
15 |     .getOrCreate()
16 | 
17 |   val sc = spark.sparkContext
18 | 
19 |   ///////////////////////////////////////////////////////////////////// Boilerplate
20 | 
21 |   /**
22 |    * Cluster prep
23 |    *
24 |    * 1. Navigate to the spark-optimization folder, go to spark-cluster/
25 |    * 2. docker-compose up --scale spark-worker=3
26 |    * 3. In another terminal:
27 |    *   - docker exec -it spark-cluster_spark-master_1 bash
28 |    *   - cd spark/bin
29 |    *   - ./spark-shell
30 |    * 4. In (yet) another terminal:
31 |    *   - go to spark-optimization
32 |    *   - docker cp (the data folder) spark-cluster_spark-master_1:/tmp
33 |    * 5.
Open http://localhost:4040 for the Spark UI
34 |    */
35 | 
36 |   // job 1 - a count
37 |   val rdd1 = sc.parallelize(1 to 1000000)
38 |   rdd1.count
39 |   // inspect the UI, one stage with 6 tasks
40 |   // task = a unit of computation applied to a unit of data (a partition)
41 | 
42 |   // job 2 - a count with a small transformation
43 |   rdd1.map(_ * 2).count
44 |   // inspect the UI, another job with (still) one stage, 6 tasks
45 |   // all parallelizable computations (like maps) are done in a single stage
46 | 
47 |   // job 3 - a count with a shuffle
48 |   rdd1.repartition(23).count
49 |   // UI: 2 stages, one with 6 tasks, one with 23 tasks
50 |   // each stage is delimited by shuffles
51 | 
52 |   // job 4, a more complex computation: load a file and compute the average salary of the employees by department
53 |   val employees = sc.textFile("/tmp/employees.csv")
54 |   // process the lines
55 |   val empTokens = employees.map(line => line.split(","))
56 |   // extract relevant data
57 |   val empDetails = empTokens.map(tokens => (tokens(4), tokens(7)))
58 |   // group the elements
59 |   val empGroups = empDetails.groupByKey(2)
60 |   // process the values associated to each group
61 |   val avgSalaries = empGroups.mapValues(salaries => salaries.map(_.toInt).sum / salaries.size)
62 |   // show the result
63 |   avgSalaries
64 |     .collect() // this is an action
65 |     .foreach(println)
66 | 
67 |   // look at the Spark UI: one job, 2 stages
68 |   // the groupByKey triggers a shuffle, and thus the beginning of another stage
69 |   // all other computations (maps, mapValues) are done in their respective stage
70 |   // the number of tasks = the number of partitions processed in a given stage
71 | }
--------------------------------------------------------------------------------
/src/main/scala/part1recap/SparkRecap.scala:
--------------------------------------------------------------------------------
1 | package part1recap
2 | 
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.functions._
6 | 
7 | object SparkRecap {
8 | 
9 |   // the entry point to the Spark structured API
10 |   val spark = SparkSession.builder()
11 |     .appName("Spark Recap")
12 |     .master("local[2]")
13 |     .getOrCreate()
14 | 
15 |   // read a DF
16 |   val cars = spark.read
17 |     .format("json")
18 |     .option("inferSchema", "true")
19 |     .load("src/main/resources/data/cars")
20 | 
21 |   import spark.implicits._
22 | 
23 |   // select
24 |   val usefulCarsData = cars.select(
25 |     col("Name"), // column object
26 |     $"Year", // another column object (needs spark implicits)
27 |     (col("Weight_in_lbs") / 2.2).as("Weight_in_kg"),
28 |     expr("Weight_in_lbs / 2.2").as("Weight_in_kg_2")
29 |   )
30 | 
31 |   val carsWeights = cars.selectExpr("Weight_in_lbs / 2.2")
32 | 
33 |   // filter
34 |   val europeanCars = cars.where(col("Origin") =!= "USA")
35 | 
36 |   // aggregations
37 |   val averageHP = cars.select(avg(col("Horsepower")).as("average_hp")) // sum, mean, stddev, min, max
38 | 
39 |   // grouping
40 |   val countByOrigin = cars
41 |     .groupBy(col("Origin")) // a RelationalGroupedDataset
42 |     .count()
43 | 
44 |   // joining
45 |   val guitarPlayers = spark.read
46 |     .option("inferSchema", "true")
47 |     .json("src/main/resources/data/guitarPlayers")
48 | 
49 |   val bands = spark.read
50 |     .option("inferSchema", "true")
51 |     .json("src/main/resources/data/bands")
52 | 
53 |   val guitaristsBands = guitarPlayers.join(bands, guitarPlayers.col("band") === bands.col("id"))
54 |   /*
55 |     join types
56 |     - inner: only the matching rows are kept
57 |     - left/right/full outer join
58 | - semi/anti 59 | */ 60 | 61 | // datasets = typed distributed collection of objects 62 | case class GuitarPlayer(id: Long, name: String, guitars: Seq[Long], band: Long) 63 | val guitarPlayersDS = guitarPlayers.as[GuitarPlayer] // needs spark.implicits 64 | guitarPlayersDS.map(_.name) 65 | 66 | // Spark SQL 67 | cars.createOrReplaceTempView("cars") 68 | val americanCars = spark.sql( 69 | """ 70 | |select Name from cars where Origin = 'USA' 71 | """.stripMargin 72 | ) 73 | 74 | // low-level API: RDDs 75 | val sc = spark.sparkContext 76 | val numbersRDD: RDD[Int] = sc.parallelize(1 to 1000000) 77 | 78 | // functional operators 79 | val doubles = numbersRDD.map(_ * 2) 80 | 81 | // RDD -> DF 82 | val numbersDF = numbersRDD.toDF("number") // you lose type info, you get SQL capability 83 | 84 | // RDD -> DS 85 | val numbersDS = spark.createDataset(numbersRDD) 86 | 87 | // DS -> RDD 88 | val guitarPlayersRDD = guitarPlayersDS.rdd 89 | 90 | // DF -> RDD 91 | val carsRDD = cars.rdd // RDD[Row] 92 | 93 | def main(args: Array[String]): Unit = { 94 | // showing a DF to the console 95 | cars.show() 96 | cars.printSchema() 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/scala/part1recap/ScalaRecap.scala: -------------------------------------------------------------------------------- 1 | package part1recap 2 | 3 | import scala.concurrent.Future 4 | import scala.util.{Failure, Success} 5 | 6 | object ScalaRecap extends App { 7 | 8 | // values and variables 9 | val aBoolean: Boolean = false 10 | 11 | // expressions 12 | val anIfExpression = if(2 > 3) "bigger" else "smaller" 13 | 14 | // instructions vs expressions 15 | val theUnit = println("Hello, Scala") // Unit = "no meaningful value" = void in other languages 16 | 17 | // functions 18 | def myFunction(x: Int) = 42 19 | 20 | // OOP 21 | class Animal 22 | class Cat extends Animal 23 | trait Carnivore { 24 | def eat(animal: Animal): Unit 25 | } 26 | 27 | class Crocodile extends Animal with Carnivore { 28 | override def eat(animal: Animal): Unit = println("Crunch!") 29 | } 30 | 31 | // singleton pattern 32 | object MySingleton 33 | 34 | // companions 35 | object Carnivore 36 | 37 | // generics 38 | trait MyList[A] 39 | 40 | // method notation 41 | val x = 1 + 2 42 | val y = 1.+(2) 43 | 44 | // Functional Programming 45 | val incrementer: Int => Int = x => x + 1 46 | val incremented = incrementer(42) 47 | 48 | // map, flatMap, filter 49 | val processedList = List(1,2,3).map(incrementer) 50 | 51 | // Pattern Matching 52 | val unknown: Any = 45 53 | val ordinal = unknown match { 54 | case 1 => "first" 55 | case 2 => "second" 56 | case _ => "unknown" 57 | } 58 | 59 | // try-catch 60 | try { 61 | throw new NullPointerException 62 | } catch { 63 | case _: NullPointerException => "some returned value" 64 | case _: Throwable => "something else" 65 | } 66 | 67 | // Future 68 | import scala.concurrent.ExecutionContext.Implicits.global 69 | val aFuture = Future { 70 | // some expensive computation, runs on another thread 71 | 42 72 | } 73 | 74 | aFuture.onComplete { 75 | case Success(meaningOfLife) => println(s"I've found $meaningOfLife") 76 | case Failure(ex) => println(s"I have failed: $ex") 77 | } 78 | 79 | // Partial functions 80 | val aPartialFunction: PartialFunction[Int, Int] = { 81 | case 1 => 43 82 | case 8 => 56 83 | case _ => 999 84 | } 85 | 86 | // Implicits 87 | 88 | // auto-injection by the compiler 89 | def methodWithImplicitArgument(implicit x: Int) = x + 43 90 | implicit val 
implicitInt = 67 91 | val implicitCall = methodWithImplicitArgument 92 | 93 | // implicit conversions - implicit defs 94 | case class Person(name: String) { 95 | def greet = println(s"Hi, my name is $name") 96 | } 97 | 98 | implicit def fromStringToPerson(name: String) = Person(name) 99 | "Bob".greet // fromStringToPerson("Bob").greet 100 | 101 | // implicit conversion - implicit classes 102 | implicit class Dog(name: String) { 103 | def bark = println("Bark!") 104 | } 105 | "Lassie".bark 106 | 107 | /* 108 | - local scope 109 | - imported scope 110 | - companion objects of the types involved in the method call 111 | */ 112 | 113 | } 114 | -------------------------------------------------------------------------------- /src/main/scala/part4rddjoins/RDDSkewedJoins.scala: -------------------------------------------------------------------------------- 1 | package part4rddjoins 2 | 3 | import generator.{DataGenerator, Laptop, LaptopOffer} 4 | import org.apache.spark.sql.SparkSession 5 | 6 | object RDDSkewedJoins { 7 | 8 | val spark = SparkSession.builder() 9 | .appName("RDD Skewed Joins") 10 | .master("local[*]") 11 | .getOrCreate() 12 | 13 | val sc = spark.sparkContext 14 | 15 | /* 16 | An online store selling gaming laptops. 17 | 2 laptops are "similar" if they have the same make & model, but proc speed within 0.1 18 | 19 | For each laptop configuration, we are interested in the average sale price of "similar" models. 20 | 21 | Acer Predator 2.9Ghz aylfaskjhrw -> average sale price of all Acer Predators with CPU speed between 2.8 and 3.0 GHz 22 | */ 23 | 24 | val laptops = sc.parallelize(Seq.fill(40000)(DataGenerator.randomLaptop())) 25 | val laptopOffers = sc.parallelize(Seq.fill(100000)(DataGenerator.randomLaptopOffer())) 26 | 27 | def plainJoin() = { 28 | val preparedLaptops = laptops.map { 29 | case Laptop(registration, make, model, procSpeed) => ((make, model), (registration, procSpeed)) 30 | } 31 | 32 | val preparedOffers = laptopOffers.map { 33 | case LaptopOffer(make, model, procSpeed, salePrice) => ((make, model), (procSpeed, salePrice)) 34 | } 35 | 36 | val result = preparedLaptops.join(preparedOffers) // RDD[(make, model), ((reg, cpu), (cpu, salePrice)))] 37 | .filter { 38 | case ((make, model), ((reg, laptopCpu), (offerCpu, salePrice))) => Math.abs(laptopCpu - offerCpu) <= 0.1 39 | } 40 | .map { 41 | case ((make, model), ((reg, laptopCpu), (offerCpu, salePrice))) => (reg, salePrice) 42 | } 43 | .aggregateByKey((0.0, 0))( 44 | { 45 | case ((totalPrice, numPrices), salePrice) => (totalPrice + salePrice, numPrices + 1) // combine state with record 46 | }, 47 | { 48 | case ((totalPrices1, numPrices1), (totalPrices2, numPrices2)) => (totalPrices1 + totalPrices2, numPrices1 + numPrices2) // combine 2 states into one 49 | } 50 | ) // RDD[(String, (Double, Int))] 51 | .mapValues { 52 | case (totalPrices, numPrices) => totalPrices / numPrices 53 | } 54 | 55 | result.count() 56 | } 57 | 58 | def noSkewJoin() = { 59 | val preparedLaptops = laptops 60 | .flatMap { laptop => 61 | Seq( 62 | laptop, 63 | laptop.copy(procSpeed = laptop.procSpeed - 0.1), 64 | laptop.copy(procSpeed = laptop.procSpeed + 0.1), 65 | ) 66 | } 67 | .map { 68 | case Laptop(registration, make, model, procSpeed) => ((make, model, procSpeed), registration) 69 | } 70 | 71 | val preparedOffers = laptopOffers.map { 72 | case LaptopOffer(make, model, procSpeed, salePrice) => ((make, model, procSpeed), salePrice) 73 | } 74 | 75 | val result = preparedLaptops.join(preparedOffers) // RDD[(make, model, procSpeed), (reg, 
salePrice)) 76 | .map(_._2) 77 | .aggregateByKey((0.0, 0))( 78 | { 79 | case ((totalPrice, numPrices), salePrice) => (totalPrice + salePrice, numPrices + 1) // combine state with record 80 | }, 81 | { 82 | case ((totalPrices1, numPrices1), (totalPrices2, numPrices2)) => (totalPrices1 + totalPrices2, numPrices1 + numPrices2) // combine 2 states into one 83 | } 84 | ) // RDD[(String, (Double, Int))] 85 | .mapValues { 86 | case (totalPrices, numPrices) => totalPrices / numPrices 87 | } 88 | 89 | result.count() 90 | } 91 | 92 | 93 | def main(args: Array[String]): Unit = { 94 | noSkewJoin() 95 | Thread.sleep(1000000) 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/main/scala/part4rddjoins/SimpleRDDJoins.scala: -------------------------------------------------------------------------------- 1 | package part4rddjoins 2 | 3 | import generator.DataGenerator 4 | import org.apache.spark.HashPartitioner 5 | import org.apache.spark.rdd.RDD 6 | import org.apache.spark.sql.SparkSession 7 | 8 | object SimpleRDDJoins { 9 | 10 | val spark = SparkSession.builder() 11 | .appName("RDD joins") 12 | .master("local[*]") 13 | .getOrCreate() 14 | 15 | val sc = spark.sparkContext 16 | val rootFolder = "src/main/resources/generated/examData" 17 | 18 | // DataGenerator.generateExamData(rootFolder, 1000000, 5) 19 | 20 | def readIds() = sc.textFile(s"$rootFolder/examIds.txt") 21 | .map { line => 22 | val tokens = line.split(" ") 23 | (tokens(0).toLong, tokens(1)) 24 | } 25 | .partitionBy(new HashPartitioner(10)) 26 | 27 | def readExamScores() = sc.textFile(s"$rootFolder/examScores.txt") 28 | .map { line => 29 | val tokens = line.split(" ") 30 | (tokens(0).toLong, tokens(1).toDouble) 31 | } 32 | 33 | // goal: the number of students who passed the exam (= at least one attempt > 9.0) 34 | 35 | def plainJoin() = { 36 | val candidates = readIds() 37 | val scores = readExamScores() 38 | 39 | // simple join 40 | val joined: RDD[(Long, (Double, String))] = scores.join(candidates) // (score attempt, candidate name) 41 | val finalScores = joined 42 | .reduceByKey((pair1, pair2) => if(pair1._1 > pair2._1) pair1 else pair2) 43 | .filter(_._2._1 > 9.0) 44 | 45 | finalScores.count 46 | } 47 | 48 | def preAggregate() = { 49 | val candidates = readIds() 50 | val scores = readExamScores() 51 | 52 | // do aggregation first - 10% perf increase 53 | val maxScores: RDD[(Long, Double)] = scores.reduceByKey(Math.max) 54 | val finalScores = maxScores.join(candidates).filter(_._2._1 > 9.0) 55 | 56 | finalScores.count 57 | } 58 | 59 | def preFiltering() = { 60 | val candidates = readIds() 61 | val scores = readExamScores() 62 | 63 | // do filtering first before the join 64 | val maxScores = scores.reduceByKey(Math.max).filter(_._2 > 9.0) 65 | val finalScores = maxScores.join(candidates) 66 | 67 | finalScores.count 68 | } 69 | 70 | def coPartitioning() = { 71 | val candidates = readIds() 72 | val scores = readExamScores() 73 | 74 | val partitionerForScores = candidates.partitioner match { 75 | case None => new HashPartitioner(candidates.getNumPartitions) 76 | case Some(partitioner) => partitioner 77 | } 78 | 79 | val repartitionedScores = scores.partitionBy(partitionerForScores) 80 | val joined: RDD[(Long, (Double, String))] = repartitionedScores.join(candidates) 81 | val finalScores = joined 82 | .reduceByKey((pair1, pair2) => if(pair1._1 > pair2._1) pair1 else pair2) 83 | .filter(_._2._1 > 9.0) 84 | 85 | finalScores.count 86 | } 87 | 88 | def combined() = { 89 | val candidates = readIds() 
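    // (editor's note: this method stacks the two previous optimizations - copartitioning the
    // scores with the candidates' partitioner, plus pre-aggregation and pre-filtering before the join)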
90 | val scores = readExamScores() 91 | 92 | val partitionerForScores = candidates.partitioner match { 93 | case None => new HashPartitioner(candidates.getNumPartitions) 94 | case Some(partitioner) => partitioner 95 | } 96 | 97 | val repartitionedScores = scores.partitionBy(partitionerForScores) 98 | 99 | // do filtering first before the join 100 | val maxScores = repartitionedScores.reduceByKey(Math.max).filter(_._2 > 9.0) 101 | val finalScores = maxScores.join(candidates) 102 | 103 | finalScores.count 104 | } 105 | 106 | def main(args: Array[String]): Unit = { 107 | plainJoin() 108 | preAggregate() 109 | preFiltering() 110 | coPartitioning() 111 | combined() 112 | 113 | Thread.sleep(1000000) 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/scala/part3dfjoins/SkewedJoins.scala: -------------------------------------------------------------------------------- 1 | package part3dfjoins 2 | 3 | import generator.DataGenerator 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.functions._ 6 | 7 | object SkewedJoins { 8 | 9 | val spark = SparkSession.builder() 10 | .appName("Skewed Joins") 11 | .master("local[*]") 12 | .config("spark.sql.autoBroadcastJoinThreshold", -1) // deactivate broadcast joins 13 | .getOrCreate() 14 | 15 | import spark.implicits._ 16 | 17 | /* 18 | An online store selling gaming laptops. 19 | 2 laptops are "similar" if they have the same make & model, but proc speed within 0.1 20 | 21 | For each laptop configuration, we are interested in the average sale price of "similar" models. 22 | 23 | Acer Predator 2.9Ghz aylfaskjhrw -> average sale price of all Acer Predators with CPU speed between 2.8 and 3.0 GHz 24 | */ 25 | 26 | val laptops = Seq.fill(40000)(DataGenerator.randomLaptop()).toDS 27 | val laptopOffers = Seq.fill(100000)(DataGenerator.randomLaptopOffer()).toDS 28 | 29 | val joined = laptops.join(laptopOffers, Seq("make", "model")) 30 | .filter(abs(laptopOffers.col("procSpeed") - laptops.col("procSpeed")) <= 0.1) 31 | .groupBy("registration") 32 | .agg(avg("salePrice").as("averagePrice")) 33 | /* 34 | == Physical Plan == 35 | *(4) HashAggregate(keys=[registration#4], functions=[avg(salePrice#20)]) 36 | +- Exchange hashpartitioning(registration#4, 200), true, [id=#99] 37 | +- *(3) HashAggregate(keys=[registration#4], functions=[partial_avg(salePrice#20)]) 38 | +- *(3) Project [registration#4, salePrice#20] 39 | +- *(3) SortMergeJoin [make#5, model#6], [make#17, model#18], Inner, (abs((procSpeed#19 - procSpeed#7)) <= 0.1) 40 | :- *(1) Sort [make#5 ASC NULLS FIRST, model#6 ASC NULLS FIRST], false, 0 41 | : +- Exchange hashpartitioning(make#5, model#6, 200), true, [id=#77] 42 | : +- LocalTableScan [registration#4, make#5, model#6, procSpeed#7] 43 | +- *(2) Sort [make#17 ASC NULLS FIRST, model#18 ASC NULLS FIRST], false, 0 44 | +- Exchange hashpartitioning(make#17, model#18, 200), true, [id=#78] 45 | +- LocalTableScan [make#17, model#18, procSpeed#19, salePrice#20] 46 | */ 47 | 48 | val laptops2 = laptops.withColumn("procSpeed", explode(array($"procSpeed" - 0.1, $"procSpeed", $"procSpeed" + 0.1))) 49 | val joined2 = laptops2.join(laptopOffers, Seq("make", "model", "procSpeed")) 50 | .groupBy("registration") 51 | .agg(avg("salePrice").as("averagePrice")) 52 | /* 53 | == Physical Plan == 54 | *(4) HashAggregate(keys=[registration#4], functions=[avg(salePrice#20)]) 55 | +- Exchange hashpartitioning(registration#4, 200), true, [id=#107] 56 | +- *(3) HashAggregate(keys=[registration#4], 
functions=[partial_avg(salePrice#20)])
57 |        +- *(3) Project [registration#4, salePrice#20]
58 |           +- *(3) SortMergeJoin [make#5, model#6, knownfloatingpointnormalized(normalizenanandzero(procSpeed#43))], [make#17, model#18, knownfloatingpointnormalized(normalizenanandzero(procSpeed#19))], Inner
59 |              :- *(1) Sort [make#5 ASC NULLS FIRST, model#6 ASC NULLS FIRST, knownfloatingpointnormalized(normalizenanandzero(procSpeed#43)) ASC NULLS FIRST], false, 0
60 |              :  +- Exchange hashpartitioning(make#5, model#6, knownfloatingpointnormalized(normalizenanandzero(procSpeed#43)), 200), true, [id=#85]
61 |              :     +- Generate explode(array((procSpeed#7 - 0.1), procSpeed#7, (procSpeed#7 + 0.1))), [registration#4, make#5, model#6], false, [procSpeed#43]
62 |              :        +- LocalTableScan [registration#4, make#5, model#6, procSpeed#7]
63 |              +- *(2) Sort [make#17 ASC NULLS FIRST, model#18 ASC NULLS FIRST, knownfloatingpointnormalized(normalizenanandzero(procSpeed#19)) ASC NULLS FIRST], false, 0
64 |                 +- Exchange hashpartitioning(make#17, model#18, knownfloatingpointnormalized(normalizenanandzero(procSpeed#19)), 200), true, [id=#86]
65 |                    +- LocalTableScan [make#17, model#18, procSpeed#19, salePrice#20]
66 |    */
67 | 
68 |   def main(args: Array[String]): Unit = {
69 |     joined2.show()
70 |     joined2.explain()
71 |     Thread.sleep(1000000)
72 |   }
73 | }
74 | 
--------------------------------------------------------------------------------
/src/main/scala/part5rddtransformations/ByKeyFunctions.scala:
--------------------------------------------------------------------------------
1 | package part5rddtransformations
2 | 
3 | import org.apache.spark.sql.SparkSession
4 | 
5 | import scala.io.Source
6 | import scala.util.{Random, Using}
7 | 
8 | object ByKeyFunctions {
9 | 
10 |   val spark = SparkSession.builder()
11 |     .appName("ByKey Functions")
12 |     .master("local[2]")
13 |     .getOrCreate()
14 | 
15 |   val sc = spark.sparkContext
16 | 
17 |   /*
18 |     In the video, we copied the file from src/main/resources/data/lipsum/words.txt to spark-cluster/data.
19 |     This will make it available under /opt/spark-data/.
20 |     Alternatively, if you copied the entire src/main/resources/data folder to Docker, you will need to use the path where you copied it.
21 | 
22 |     Scenario: assume we have a dataset with (word, occurrences) which we obtained after scraping a big document or website.
23 |     We want to aggregate and sum all values under a single map.
24 |    */
25 |   val words: Seq[String] = Using.resource(Source.fromFile("/opt/spark-data/words.txt")) { source =>
26 |     source.getLines().toSeq
27 |   }
28 | 
29 |   // generate data
30 |   val random = new Random
31 |   val wordCounts = sc.parallelize(Seq.fill(2000000)(words(random.nextInt(words.length)), random.nextInt(1000)))
32 | 
33 |   // the most intuitive solution can be the most dangerous
34 |   val totalCounts = wordCounts
35 |     .groupByKey() // RDD of key = word, value = iterable of all previous values
36 |     .mapValues(_.sum)
37 | 
38 |   // call an action
39 |   totalCounts.collectAsMap()
40 |   // ^^ ~6s for 2M records
41 |   // ^^ look at the shuffle write - it shuffles the entire data
42 | 
43 |   /*
44 |     groupByKey is dangerous in 2 ways:
45 |     - it causes a shuffle so that data associated with one key stays on the same machine
46 |     - it can cause memory errors if the data is skewed, i.e.
data associated with one key is disproportionately large and may not fit in an executor's memory 47 | */ 48 | 49 | /** 50 | Other byKey functions 51 | */ 52 | 53 | /* 54 | ReduceByKey is the simplest - like the reduce function on collections 55 | Also faster and safer because 56 | - it does a partial aggregate on the executor (operations done on the executors without shuffling are called map-side) 57 | - avoids the data skew problem 58 | - shuffles much less data 59 | */ 60 | val totalCountsReduce = wordCounts.reduceByKey(_ + _) 61 | totalCountsReduce.collectAsMap() 62 | 63 | /* 64 | FoldByKey is similar to the collection fold function 65 | - needs a 0 value to start with 66 | - needs a combination function 67 | 68 | Similar performance 69 | */ 70 | val totalCountsFold = wordCounts.foldByKey(0)(_ + _) 71 | totalCountsFold.collectAsMap() 72 | 73 | /* 74 | AggregateByKey is more general and needs a zero value and 2 combination functions 75 | - one that combines the current aggregated value with a new element 76 | - one that combines two aggregated values from different executors 77 | 78 | Similar performance 79 | */ 80 | val totalCountsAggregate = wordCounts.aggregateByKey(0.0)(_ + _, _ + _) 81 | totalCountsAggregate.collectAsMap() 82 | 83 | /* 84 | CombineByKey is the most general function available that can combine values inside your RDD. You need 85 | - a function that turns a value into an aggregate value so that further aggregates can start from it 86 | - a function to combine a current aggregate with a value in the RDD inside the executor 87 | - a function to combine 2 aggregates between executors 88 | - a number of partitions, or a partitioner so that you can do further operations e.g. joins without additional shuffles 89 | 90 | CombineByKey can be as dangerous as groupByKey when the combination functions grow the data instead of shrinking it. 91 | Used correctly (i.e. when the functions are reduction functions), combineByKey is efficient and potentially much more efficient later on if you do joins. 92 | */ 93 | val totalCountsCombine = wordCounts.combineByKey( 94 | (count: Int) => count, 95 | (currentSum: Int, newValue: Int) => currentSum + newValue, 96 | (partialSum1: Int, partialSum2: Int) => partialSum1 + partialSum2, 97 | numPartitions = 10 98 | ) 99 | totalCountsCombine.collectAsMap() 100 | // collect still takes 2 seconds 101 | 102 | def main(args: Array[String]): Unit = { 103 | Thread.sleep(10000000) 104 | } 105 | } 106 |
-------------------------------------------------------------------------------- /src/META-INF/MANIFEST.MF: --------------------------------------------------------------------------------
1 | Manifest-Version: 1.0 2 | Class-Path: commons-compiler-3.0.15.jar hadoop-mapreduce-client-common 3 | -2.7.4.jar hadoop-yarn-server-nodemanager-2.7.4.jar hadoop-yarn-api-2 4 | .7.4.jar avro-1.8.2.jar avro-mapred-1.8.2-hadoop2.jar hadoop-mapreduc 5 | e-client-jobclient-2.7.4.jar jackson-mapper-asl-1.9.13.jar scala-xml_ 6 | 2.12-1.2.0.jar commons-compress-1.8.1.jar javassist-3.22.0-CR2.jar ha 7 | doop-yarn-common-2.7.4.jar commons-httpclient-3.1.jar spark-catalyst_ 8 | 2.12-3.0.0-preview2.jar jersey-common-2.29.1.jar jackson-core-2.10.0.
9 | jar spark-tags_2.12-3.0.0-preview2.jar parquet-column-1.10.1.jar json 10 | 4s-scalap_2.12-3.6.6.jar javax.servlet-api-3.1.0.jar jsr305-3.0.2.jar 11 | jackson-module-scala_2.12-2.10.0.jar metrics-graphite-4.1.1.jar metr 12 | ics-jmx-4.1.1.jar leveldbjni-all-1.8.jar guice-3.0.jar curator-recipe 13 | s-2.7.1.jar avro-ipc-1.8.2.jar hadoop-mapreduce-client-core-2.7.4.jar 14 | jersey-hk2-2.29.1.jar spark-core_2.12-3.0.0-preview2.jar RoaringBitm 15 | ap-0.7.45.jar hadoop-yarn-server-common-2.7.4.jar metrics-json-4.1.1. 16 | jar jackson-core-asl-1.9.13.jar hadoop-annotations-2.7.4.jar pyrolite 17 | -4.30.jar orc-shims-1.5.8.jar jakarta.inject-2.6.1.jar jetty-util-6.1 18 | .26.jar httpcore-4.2.4.jar hk2-locator-2.6.1.jar xz-1.5.jar commons-m 19 | ath3-3.4.1.jar commons-cli-1.2.jar gson-2.2.4.jar jsp-api-2.1.jar act 20 | ivation-1.1.1.jar curator-framework-2.7.1.jar parquet-hadoop-1.10.1.j 21 | ar hadoop-common-2.7.4.jar slf4j-api-1.7.28.jar jersey-container-serv 22 | let-2.29.1.jar jetty-sslengine-6.1.26.jar commons-crypto-1.0.0.jar ao 23 | palliance-repackaged-2.6.1.jar jakarta.ws.rs-api-2.1.6.jar jcl-over-s 24 | lf4j-1.7.16.jar jackson-databind-2.10.0.jar osgi-resource-locator-1.0 25 | .3.jar arrow-memory-0.15.1.jar aopalliance-1.0.jar orc-mapreduce-1.5. 26 | 8.jar kryo-shaded-4.0.2.jar commons-io-2.4.jar stax-api-1.0-2.jar par 27 | quet-jackson-1.10.1.jar log4j-1.2.17.jar jersey-client-2.29.1.jar sna 28 | ppy-java-1.1.7.3.jar parquet-format-2.4.0.jar flatbuffers-java-1.9.0. 29 | jar metrics-core-4.1.1.jar slf4j-log4j12-1.7.25.jar xercesImpl-2.9.1. 30 | jar chill-java-0.9.3.jar jakarta.validation-api-2.0.2.jar jakarta.ann 31 | otation-api-1.3.5.jar jersey-server-2.29.1.jar jersey-container-servl 32 | et-core-2.29.1.jar zstd-jni-1.4.4-3.jar jackson-annotations-2.10.0.ja 33 | r objenesis-2.5.1.jar scala-parser-combinators_2.12-1.1.2.jar commons 34 | -beanutils-1.7.0.jar ivy-2.4.0.jar json4s-core_2.12-3.6.6.jar commons 35 | -net-3.1.jar oro-2.0.8.jar spark-launcher_2.12-3.0.0-preview2.jar ant 36 | lr4-runtime-4.7.1.jar hadoop-mapreduce-client-app-2.7.4.jar hadoop-cl 37 | ient-2.7.4.jar hk2-api-2.6.1.jar stream-2.9.6.jar commons-configurati 38 | on-1.6.jar zookeeper-3.4.14.jar orc-core-1.5.8.jar xbean-asm7-shaded- 39 | 4.15.jar log4j-api-2.4.1.jar api-asn1-api-1.0.0-M20.jar curator-clien 40 | t-2.7.1.jar protobuf-java-2.5.0.jar compress-lzf-1.0.3.jar jackson-ja 41 | xrs-1.9.13.jar arrow-format-0.15.1.jar scala-library-2.12.4.jar spark 42 | -unsafe_2.12-3.0.0-preview2.jar spark-sql_2.12-3.0.0-preview2.jar air 43 | compressor-0.10.jar jline-0.9.94.jar minlog-1.3.0.jar lz4-java-1.7.0. 44 | jar unused-1.0.0.jar chill_2.12-0.9.3.jar commons-text-1.6.jar py4j-0 45 | .10.8.1.jar parquet-encoding-1.10.1.jar jackson-xc-1.9.13.jar hadoop- 46 | mapreduce-client-shuffle-2.7.4.jar audience-annotations-0.5.0.jar jet 47 | tison-1.1.jar netty-all-4.1.42.Final.jar jaxb-api-2.2.2.jar jersey-me 48 | dia-jaxb-2.29.1.jar apacheds-kerberos-codec-2.0.0-M15.jar janino-3.0. 49 | 15.jar hadoop-yarn-client-2.7.4.jar arrow-vector-0.15.1.jar log4j-cor 50 | e-2.4.1.jar hive-storage-api-2.6.0.jar guava-16.0.1.jar spotbugs-anno 51 | tations-3.1.9.jar spark-sketch_2.12-3.0.0-preview2.jar xmlenc-0.52.ja 52 | r json4s-ast_2.12-3.6.6.jar scala-reflect-2.12.4.jar hk2-utils-2.6.1. 
53 | jar spark-network-common_2.12-3.0.0-preview2.jar paranamer-2.8.jar ap 54 | acheds-i18n-2.0.0-M15.jar jul-to-slf4j-1.7.16.jar commons-lang3-3.9.j 55 | ar metrics-jvm-4.1.1.jar jackson-module-paranamer-2.10.0.jar hadoop-h 56 | dfs-2.7.4.jar spark-network-shuffle_2.12-3.0.0-preview2.jar xml-apis- 57 | 1.3.04.jar json4s-jackson_2.12-3.6.6.jar htrace-core-3.1.0-incubating 58 | .jar javax.inject-1.jar httpclient-4.2.5.jar hadoop-auth-2.7.4.jar co 59 | mmons-codec-1.10.jar commons-collections-3.2.2.jar shims-0.7.45.jar s 60 | park-kvstore_2.12-3.0.0-preview2.jar netty-3.10.6.Final.jar parquet-c 61 | ommon-1.10.1.jar univocity-parsers-2.8.3.jar api-util-1.0.0-M20.jar c 62 | ommons-lang-2.6.jar commons-digester-1.8.jar 63 | Main-Class: 64 | 65 | -------------------------------------------------------------------------------- /src/main/scala/part5rddtransformations/I2ITransformations.scala: -------------------------------------------------------------------------------- 1 | package part5rddtransformations 2 | 3 | import generator.DataGenerator 4 | import org.apache.spark.sql.SparkSession 5 | import scala.collection.mutable 6 | 7 | object I2ITransformations { 8 | 9 | val spark = SparkSession.builder() 10 | .appName("I2I Transformations") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val sc = spark.sparkContext 15 | 16 | /* 17 | Science project 18 | each metric has identifier, value 19 | 20 | Return the smallest ("best") 10 metrics (identifiers + values) 21 | */ 22 | 23 | val LIMIT = 10 24 | 25 | def readMetrics() = sc.textFile("src/main/resources/generated/metrics/metrics10m.txt") 26 | .map { line => 27 | val tokens = line.split(" ") 28 | val name = tokens(0) 29 | val value = tokens(1) 30 | 31 | (name, value.toDouble) 32 | } 33 | 34 | def printTopMetrics() = { 35 | val sortedMetrics = readMetrics().sortBy(_._2).take(LIMIT) 36 | sortedMetrics.foreach(println) 37 | } 38 | 39 | def printTopMetricsI2I() = { 40 | 41 | val iteratorToIteratorTransformation = (records: Iterator[(String, Double)]) => { 42 | /* 43 | i2i transformation 44 | - they are NARROW TRANSFORMATIONS 45 | - Spark will "selectively" spill data to disk when partitions are too big for memory 46 | 47 | Warning: don't traverse more than once or convert to collections 48 | */ 49 | 50 | implicit val ordering: Ordering[(String, Double)] = Ordering.by[(String, Double), Double](_._2) 51 | val limitedCollection = new mutable.TreeSet[(String, Double)]() 52 | 53 | records.foreach { record => 54 | limitedCollection.add(record) 55 | if (limitedCollection.size > LIMIT) { 56 | limitedCollection.remove(limitedCollection.last) 57 | } 58 | } 59 | 60 | // I've traversed the iterator 61 | 62 | limitedCollection.iterator 63 | } 64 | 65 | val topMetrics = readMetrics() 66 | .mapPartitions(iteratorToIteratorTransformation) 67 | .repartition(1) 68 | .mapPartitions(iteratorToIteratorTransformation) 69 | 70 | val result = topMetrics.take(LIMIT) 71 | result.foreach(println) 72 | } 73 | 74 | /** 75 | * Exercises 76 | */ 77 | 78 | def printTopMetricsEx1() = { 79 | /* 80 | Better than the "dummy" approach 81 | - not sorting the entire RDD 82 | 83 | Bad (worse than the optimal) 84 | - sorting the entire partition 85 | - forcing the iterator in memory - this can OOM your executors 86 | */ 87 | val topMetrics = readMetrics() 88 | .mapPartitions(_.toList.sortBy(_._2).take(LIMIT).iterator) 89 | .repartition(1) 90 | .mapPartitions(_.toList.sortBy(_._2).take(LIMIT).iterator) 91 | .take(LIMIT) 92 | 93 | topMetrics.foreach(println) 94 | } 95 | 96 | /* 97 | Better 
than ex1 98 | - extracting top 10 values per partition instead of sorting the entire partition 99 | 100 | Bad because 101 | - forcing toList can OOM your executors 102 | - iterating over the list twice 103 | - if the list is immutable, time spent allocating objects (and GC) 104 | */ 105 | def printTopMetricsEx2() = { 106 | val topMetrics = readMetrics() 107 | .mapPartitions { records => 108 | 109 | implicit val ordering: Ordering[(String, Double)] = Ordering.by[(String, Double), Double](_._2) 110 | val limitedCollection = new mutable.TreeSet[(String, Double)]() 111 | 112 | records.toList.foreach { record => 113 | limitedCollection.add(record) 114 | if (limitedCollection.size > LIMIT) { 115 | limitedCollection.remove(limitedCollection.last) 116 | } 117 | } 118 | 119 | // I've traversed the iterator 120 | 121 | limitedCollection.iterator 122 | } 123 | .repartition(1) 124 | .mapPartitions { records => 125 | 126 | implicit val ordering: Ordering[(String, Double)] = Ordering.by[(String, Double), Double](_._2) 127 | val limitedCollection = new mutable.TreeSet[(String, Double)]() 128 | 129 | records.toList.foreach { record => 130 | limitedCollection.add(record) 131 | if (limitedCollection.size > LIMIT) { 132 | limitedCollection.remove(limitedCollection.last) 133 | } 134 | } 135 | 136 | // I've traversed the iterator 137 | 138 | limitedCollection.iterator 139 | } 140 | .take(LIMIT) 141 | 142 | topMetrics.foreach(println) 143 | } 144 | 145 | def main(args: Array[String]): Unit = { 146 | printTopMetrics() 147 | printTopMetricsI2I() 148 | Thread.sleep(1000000) 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/sbt,java,scala,spark,intellij 3 | # Edit at https://www.gitignore.io/?templates=sbt,java,scala,spark,intellij 4 | 5 | ### Intellij ### 6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm 7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 8 | 9 | # User-specific stuff 10 | .idea/**/workspace.xml 11 | .idea/**/tasks.xml 12 | .idea/**/usage.statistics.xml 13 | .idea/**/dictionaries 14 | .idea/**/shelf 15 | 16 | # Generated files 17 | .idea/**/contentModel.xml 18 | 19 | # Sensitive or high-churn files 20 | .idea/**/dataSources/ 21 | .idea/**/dataSources.ids 22 | .idea/**/dataSources.local.xml 23 | .idea/**/sqlDataSources.xml 24 | .idea/**/dynamic.xml 25 | .idea/**/uiDesigner.xml 26 | .idea/**/dbnavigator.xml 27 | 28 | # Gradle 29 | .idea/**/gradle.xml 30 | .idea/**/libraries 31 | 32 | # Gradle and Maven with auto-import 33 | # When using Gradle or Maven with auto-import, you should exclude module files, 34 | # since they will be recreated, and may cause churn. Uncomment if using 35 | # auto-import. 
36 | # .idea/modules.xml 37 | # .idea/*.iml 38 | # .idea/modules 39 | # *.iml 40 | # *.ipr 41 | 42 | # CMake 43 | cmake-build-*/ 44 | 45 | # Mongo Explorer plugin 46 | .idea/**/mongoSettings.xml 47 | 48 | # File-based project format 49 | *.iws 50 | 51 | # IntelliJ 52 | out/ 53 | 54 | # mpeltonen/sbt-idea plugin 55 | .idea_modules/ 56 | 57 | # JIRA plugin 58 | atlassian-ide-plugin.xml 59 | 60 | # Cursive Clojure plugin 61 | .idea/replstate.xml 62 | 63 | # Crashlytics plugin (for Android Studio and IntelliJ) 64 | com_crashlytics_export_strings.xml 65 | crashlytics.properties 66 | crashlytics-build.properties 67 | fabric.properties 68 | 69 | # Editor-based Rest Client 70 | .idea/httpRequests 71 | 72 | # Android studio 3.1+ serialized cache file 73 | .idea/caches/build_file_checksums.ser 74 | 75 | ### Intellij Patch ### 76 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 77 | 78 | # *.iml 79 | # modules.xml 80 | # .idea/misc.xml 81 | # *.ipr 82 | 83 | # Sonarlint plugin 84 | .idea/**/sonarlint/ 85 | 86 | # SonarQube Plugin 87 | .idea/**/sonarIssues.xml 88 | 89 | # Markdown Navigator plugin 90 | .idea/**/markdown-navigator.xml 91 | .idea/**/markdown-navigator/ 92 | 93 | ### Java ### 94 | # Compiled class file 95 | *.class 96 | 97 | # Log file 98 | *.log 99 | 100 | # BlueJ files 101 | *.ctxt 102 | 103 | # Mobile Tools for Java (J2ME) 104 | .mtj.tmp/ 105 | 106 | # Package Files # 107 | *.jar 108 | *.war 109 | *.nar 110 | *.ear 111 | *.zip 112 | *.tar.gz 113 | *.rar 114 | 115 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 116 | hs_err_pid* 117 | 118 | ### SBT ### 119 | # Simple Build Tool 120 | # http://www.scala-sbt.org/release/docs/Getting-Started/Directories.html#configuring-version-control 121 | 122 | dist/* 123 | target/ 124 | lib_managed/ 125 | src_managed/ 126 | project/boot/ 127 | project/plugins/project/ 128 | .history 129 | .cache 130 | .lib/ 131 | 132 | ### Scala ### 133 | *.metals 134 | 135 | ### Spark ### 136 | *#*# 137 | *.#* 138 | *.iml 139 | *.ipr 140 | *.pyc 141 | *.pyo 142 | *.swp 143 | *~ 144 | .DS_Store 145 | .classpath 146 | .ensime 147 | .ensime_cache/ 148 | .ensime_lucene 149 | .generated-mima* 150 | .idea/ 151 | .project 152 | .pydevproject 153 | .scala_dependencies 154 | .settings 155 | /lib/ 156 | R-unit-tests.log 157 | R/unit-tests.out 158 | R/cran-check.out 159 | R/pkg/vignettes/sparkr-vignettes.html 160 | R/pkg/tests/fulltests/Rplots.pdf 161 | build/*.jar 162 | build/apache-maven* 163 | build/scala* 164 | build/zinc* 165 | cache 166 | checkpoint 167 | conf/*.cmd 168 | conf/*.conf 169 | conf/*.properties 170 | conf/*.sh 171 | conf/*.xml 172 | conf/java-opts 173 | conf/slaves 174 | dependency-reduced-pom.xml 175 | derby.log 176 | dev/create-release/*final 177 | dev/create-release/*txt 178 | dev/pr-deps/ 179 | dist/ 180 | docs/_site 181 | docs/api 182 | sql/docs 183 | sql/site 184 | lint-r-report.log 185 | log/ 186 | logs/ 187 | project/build/target/ 188 | project/plugins/lib_managed/ 189 | project/plugins/project/build.properties 190 | project/plugins/src_managed/ 191 | project/plugins/target/ 192 | python/lib/pyspark.zip 193 | python/deps 194 | python/test_coverage/coverage_data 195 | python/test_coverage/htmlcov 196 | python/pyspark/python 197 | reports/ 198 | scalastyle-on-compile.generated.xml 199 | scalastyle-output.xml 200 | scalastyle.txt 201 | spark-*-bin-*.tgz 202 | spark-tests.log 203 | streaming-tests.log 204 | unit-tests.log 205 | work/ 206 | docs/.jekyll-metadata 207 | 
208 | # For Hive 209 | TempStatsStore/ 210 | metastore/ 211 | metastore_db/ 212 | sql/hive-thriftserver/test_warehouses 213 | warehouse/ 214 | spark-warehouse/ 215 | 216 | # For R session data 217 | .RData 218 | .RHistory 219 | .Rhistory 220 | *.Rproj 221 | *.Rproj.* 222 | 223 | .Rproj.user 224 | 225 | # For SBT 226 | .jvmopts 227 | 228 | 229 | # End of https://www.gitignore.io/api/sbt,java,scala,spark,intellij 230 | 231 | # Daniel added 232 | src/main/resources/generated/ 233 | -------------------------------------------------------------------------------- /src/main/scala/part2foundations/ReadingQueryPlans.scala: -------------------------------------------------------------------------------- 1 | package part2foundations 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object ReadingQueryPlans { 6 | ///////////////////////////////////////////////////////////////////// Boilerplate 7 | // you don't need this code in the Spark shell 8 | // this code is needed if you want to run it locally in IntelliJ 9 | 10 | val spark = SparkSession.builder() 11 | .config("spark.master", "local") 12 | .appName("Reading Query Plans") 13 | .getOrCreate() 14 | 15 | val sc = spark.sparkContext 16 | 17 | ///////////////////////////////////////////////////////////////////// Boilerplate 18 | 19 | // plan 1 - a simple transformation 20 | val simpleNumbers = spark.range(1, 1000000) 21 | val times5 = simpleNumbers.selectExpr("id * 5 as id") 22 | times5.explain() // this is how you show a query plan 23 | /* 24 | == Physical Plan == 25 | *(1) Project [(id#0L * 5) AS id#2L] 26 | +- *(1) Range (1, 1000000, step=1, splits=6) 27 | */ 28 | 29 | // plan 2 - a shuffle 30 | val moreNumbers = spark.range(1, 1000000, 2) 31 | val split7 = moreNumbers.repartition(7) 32 | 33 | split7.explain() 34 | /* 35 | == Physical Plan == 36 | Exchange RoundRobinPartitioning(7), false, [id=#16] 37 | +- *(1) Range (1, 1000000, step=2, splits=6) 38 | */ 39 | 40 | // plan 3 - shuffle + transformation 41 | split7.selectExpr("id * 5 as id").explain() 42 | /* 43 | == Physical Plan == 44 | *(2) Project [(id#4L * 5) AS id#8L] 45 | +- Exchange RoundRobinPartitioning(7), false, [id=#29] 46 | +- *(1) Range (1, 1000000, step=2, splits=6) 47 | */ 48 | 49 | 50 | // plan 4 - a more complex job with a join 51 | val ds1 = spark.range(1, 10000000) 52 | val ds2 = spark.range(1, 20000000, 2) 53 | val ds3 = ds1.repartition(7) 54 | val ds4 = ds2.repartition(9) 55 | val ds5 = ds3.selectExpr("id * 3 as id") 56 | val joined = ds5.join(ds4, "id") 57 | val sum = joined.selectExpr("sum(id)") 58 | sum.explain() 59 | /* 60 | 61 | == Physical Plan == 62 | *(7) HashAggregate(keys=[], functions=[sum(id#18L)]) 63 | +- Exchange SinglePartition, true, [id=#99] 64 | +- *(6) HashAggregate(keys=[], functions=[partial_sum(id#18L)]) 65 | +- *(6) Project [id#18L] 66 | +- *(6) SortMergeJoin [id#18L], [id#12L], Inner 67 | :- *(3) Sort [id#18L ASC NULLS FIRST], false, 0 68 | : +- Exchange hashpartitioning(id#18L, 200), true, [id=#83] 69 | : +- *(2) Project [(id#10L * 3) AS id#18L] 70 | : +- Exchange RoundRobinPartitioning(7), false, [id=#79] 71 | : +- *(1) Range (1, 10000000, step=1, splits=6) 72 | +- *(5) Sort [id#12L ASC NULLS FIRST], false, 0 73 | +- Exchange hashpartitioning(id#12L, 200), true, [id=#90] 74 | +- Exchange RoundRobinPartitioning(9), false, [id=#89] 75 | +- *(4) Range (1, 20000000, step=2, splits=6) 76 | */ 77 | 78 | /** 79 | * Exercises - read the Query Plans and try to understand the code that generated them. 
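* A hint added here (not from the plans themselves): read each plan bottom-up - the data source
* (Range/FileScan) sits at the bottom, every Exchange is a shuffle, and the *(n) prefixes mark
* whole-stage code generation boundaries: operators sharing the same n run fused, in a single pass.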
80 | */ 81 | 82 | // exercise 1 83 | /* 84 | == Physical Plan == 85 | *(1) Project [firstName#153, lastName#155, (cast(salary#159 as double) / 1.1) AS salary_EUR#168] 86 | +- *(1) FileScan csv [firstName#153,lastName#155,salary#159] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/tmp/employees_headers.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct 87 | */ 88 | val employeesDF = spark.read.option("header", true).csv("/tmp/employees_headers.csv") 89 | val empEur = employeesDF.selectExpr("firstName", "lastName", "salary / 1.1 as salary_EUR") 90 | 91 | // exercise 2 92 | /* 93 | == Physical Plan == 94 | *(2) HashAggregate(keys=[dept#156], functions=[avg(cast(salary#181 as bigint))]) 95 | +- Exchange hashpartitioning(dept#156, 200) 96 | +- *(1) HashAggregate(keys=[dept#156], functions=[partial_avg(cast(salary#181 as bigint))]) 97 | +- *(1) Project [dept#156, cast(salary#159 as int) AS salary#181] 98 | +- *(1) FileScan csv [dept#156,salary#159] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/tmp/employees_headers.csv], PartitionFilters: [], PushedFilters: [], ReadSchema: struct 99 | */ 100 | val avgSals = employeesDF 101 | .selectExpr("dept", "cast(salary as int) as salary") 102 | .groupBy("dept") 103 | .avg("salary") 104 | 105 | 106 | // exercise 3 107 | /* 108 | == Physical Plan == 109 | *(5) Project [id#195L] 110 | +- *(5) SortMergeJoin [id#195L], [id#197L], Inner 111 | :- *(2) Sort [id#195L ASC NULLS FIRST], false, 0 112 | : +- Exchange hashpartitioning(id#195L, 200) 113 | : +- *(1) Range (1, 10000000, step=3, splits=6) 114 | +- *(4) Sort [id#197L ASC NULLS FIRST], false, 0 115 | +- Exchange hashpartitioning(id#197L, 200) 116 | +- *(3) Range (1, 10000000, step=5, splits=6) 117 | */ 118 | val d1 = spark.range(1, 10000000, 3) 119 | val d2 = spark.range(1, 10000000, 5) 120 | val j1 = d1.join(d2, "id") 121 | 122 | } 123 | -------------------------------------------------------------------------------- /src/main/scala/part2foundations/SparkAPIs.scala: -------------------------------------------------------------------------------- 1 | package part2foundations 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions.expr 5 | 6 | object SparkAPIs { 7 | 8 | /** 9 | * This application contains the code we wrote during the "Different Spark APIs" video. 10 | */ 11 | 12 | val spark = SparkSession.builder() 13 | .config("spark.master", "local") 14 | .appName("Different Spark APIs") 15 | .getOrCreate() 16 | 17 | // for toDF 18 | import spark.implicits._ 19 | 20 | val sc = spark.sparkContext 21 | 22 | // small count comparison 23 | val numbers = 1 to 1000000000 24 | val rdd = sc.parallelize(1 to 1000000000) 25 | rdd.count() // ~10s on camera - might vary on your PC 26 | 27 | val df = rdd.toDF("id") 28 | df.count() // ~16s - might vary 29 | val dfCount = df.selectExpr("count(*)") // same 30 | // look at the Spark UI - there's a wholestagecodegen step in the stage - that's Spark generating the appropriate bytecode to process RDDs behind the scenes 31 | // most of the time taken is just the RDD transformation - look at the time taken in stage 1 32 | 33 | val ds = spark.range(1, 1000000000) 34 | ds.count() // instant, 0.1s 35 | val dsCount = ds.selectExpr("count(*)") 36 | dsCount.show() // same 37 | ds.toDF("value").count() // same 38 | 39 | ds.rdd.count() // ~25s 40 | // cmd-click on the `rdd` implementation to see why this is so slow. 41 | 42 | /** 43 | * Notice that inside the same "realm", i.e. 
RDDs or DFs, the computation time is small. 44 | * Converting between them takes a long time. 45 | * That's because each row is processed individually. 46 | * Conversions are particularly bad in Python, because the data needs to go from the Python interpreter to the JVM AND back. 47 | * 48 | * Lesson 1: once decided on the API level, STAY THERE. 49 | */ 50 | 51 | val rddTimes5 = rdd.map(_ * 5) 52 | rddTimes5.count() // ~20s 53 | // one stage 54 | 55 | val dfTimes5 = df.selectExpr("id * 5 as id") 56 | val dfTimes5Count = dfTimes5.selectExpr("count(*)") 57 | dfTimes5Count.show() // still 11-12s 58 | /* 59 | Notice there's no difference in the time taken, comparing with the original count. 60 | The RDD version multiplied every single row, but here, the multiplication is instant. 61 | Or is it? 62 | 63 | WHY? 64 | 65 | scala> dfTimes5Count.explain 66 | == Physical Plan == 67 | *(2) HashAggregate(keys=[], functions=[count(1)]) 68 | +- Exchange SinglePartition 69 | +- *(1) HashAggregate(keys=[], functions=[partial_count(1)]) 70 | +- *(1) Project 71 | +- *(1) SerializeFromObject [input[0, int, false] AS value#2] 72 | +- Scan[obj#1] 73 | 74 | scala> dfCount.explain 75 | == Physical Plan == 76 | *(2) HashAggregate(keys=[], functions=[count(1)]) 77 | +- Exchange SinglePartition 78 | +- *(1) HashAggregate(keys=[], functions=[partial_count(1)]) 79 | +- *(1) Project 80 | +- *(1) SerializeFromObject [input[0, int, false] AS value#2] 81 | +- Scan[obj#1] 82 | 83 | Same query plan! Spark removed the select altogether. 84 | */ 85 | 86 | /** 87 | * Exercise: measure the time it takes to count the number of elements from the DS, multiplied by 5. 88 | * Try to explain the difference. It's ok if you have like an 80% explanation. 89 | */ 90 | val dsTimes5 = ds.map(_ * 5) 91 | val dsTimes5Count = dsTimes5.selectExpr("count(*)") 92 | dsTimes5Count.show() 93 | /* 94 | 7 seconds from 0.1 seconds! That's a 70x time increase. 95 | Let's explain: 96 | 97 | scala> dsCount.explain 98 | == Physical Plan == 99 | *(2) HashAggregate(keys=[], functions=[count(1)]) 100 | +- Exchange SinglePartition 101 | +- *(1) HashAggregate(keys=[], functions=[partial_count(1)]) 102 | +- *(1) Project 103 | +- *(1) Range (1, 1000000000, step=1, splits=6) 104 | 105 | scala> dsTimes5Count.explain 106 | == Physical Plan == 107 | *(2) HashAggregate(keys=[], functions=[count(1)]) 108 | +- Exchange SinglePartition 109 | +- *(1) HashAggregate(keys=[], functions=[partial_count(1)]) 110 | +- *(1) Project 111 | +- *(1) SerializeFromObject [input[0, bigint, false] AS value#71L] 112 | +- *(1) MapElements , obj#70: bigint 113 | +- *(1) DeserializeToObject staticinvoke(class java.lang.Long, ObjectType(class java.lang.Long), valueOf, id#13L, true, false), obj#69: java.lang.Long 114 | +- *(1) Range (1, 1000000000, step=1, splits=6) 115 | 116 | Different query plans. Because we're using a lambda there, Spark can't optimize it. 117 | So Spark has to "deserializeObject" by invoking Long.valueOf on each element in the DS, then map each element with the function, then serialize it back as a DS. 118 | 119 | The reason why Spark has to do that is that Spark doesn't have any information on the lambda, and thus is forced to apply it to each element. 120 | */ 121 | 122 | /** 123 | * Lesson 2: use DFs most of the time. Spark optimizes most stuff away. 124 | * Lesson 3: Lambdas are impossible to optimize. 
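* For instance (an added sketch): instead of the lambda in ds.map(_ * 5), the same computation can be
* written as ds.selectExpr("id * 5 as id") - an expression the optimizer can see through and, as shown
* above for DFs, even remove entirely when all we do afterwards is count.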
125 | */ 126 | 127 | } 128 | -------------------------------------------------------------------------------- /src/main/scala/part5rddtransformations/ReusingObjects.scala: -------------------------------------------------------------------------------- 1 | package part5rddtransformations 2 | 3 | import generator.DataGenerator 4 | import org.apache.spark.rdd.RDD 5 | import org.apache.spark.sql.SparkSession 6 | 7 | object ReusingObjects { 8 | 9 | val spark = SparkSession.builder() 10 | .appName("Reusing JVM objects") 11 | .master("local[*]") 12 | .getOrCreate() 13 | 14 | val sc = spark.sparkContext 15 | 16 | /* 17 | Analyze text 18 | Receive batches of text from data sources 19 | "35 // some text" 20 | 21 | Stats per each data source id: 22 | - the number of lines in total 23 | - total number of words in total 24 | - length of the longest word 25 | - the number of occurrences of the word "imperdiet" 26 | 27 | Results should be VERY FAST. 28 | */ 29 | 30 | val textPath = "src/main/resources/generated/lipsum/3m.txt" 31 | val criticalWord = "imperdiet" 32 | 33 | val text = sc.textFile(textPath).map { line => 34 | val tokens = line.split("//") 35 | (tokens(0), tokens(1)) 36 | } 37 | 38 | def generateData() = { 39 | DataGenerator.generateText(textPath, 60000000, 3000000, 200) 40 | } 41 | 42 | 43 | //////////////////// Version 1 44 | 45 | case class TextStats(nLines: Int, nWords: Int, maxWordLength: Int, occurrences: Int) 46 | 47 | object TextStats { 48 | val zero = TextStats(0, 0, 0, 0) 49 | } 50 | 51 | def collectStats() = { 52 | 53 | def aggregateNewRecord(textStats: TextStats, record: String): TextStats = { 54 | val newWords = record.split(" ") 55 | val longestWord = newWords.maxBy(_.length) 56 | val newOccurrences = newWords.count(_ == criticalWord) 57 | TextStats( 58 | textStats.nLines + 1, 59 | textStats.nWords + newWords.length, 60 | if (longestWord.length > textStats.maxWordLength) longestWord.length else textStats.maxWordLength, 61 | textStats.occurrences + newOccurrences 62 | ) 63 | } 64 | 65 | def combineStats(stats1: TextStats, stats2: TextStats): TextStats = { 66 | TextStats( 67 | stats1.nLines + stats2.nLines, 68 | stats1.nWords + stats2.nWords, 69 | if (stats1.maxWordLength > stats2.maxWordLength) stats1.maxWordLength else stats2.maxWordLength, 70 | stats1.occurrences + stats2.occurrences 71 | ) 72 | } 73 | 74 | val aggregate: RDD[(String, TextStats)] = text.aggregateByKey(TextStats.zero)(aggregateNewRecord, combineStats) 75 | aggregate.collectAsMap() 76 | } 77 | 78 | //////////////////// Version 2 79 | 80 | class MutableTextStats(var nLines: Int, var nWords: Int, var maxWordLength: Int, var occurrences: Int) extends Serializable 81 | object MutableTextStats extends Serializable { 82 | def zero = new MutableTextStats(0,0,0,0) 83 | } 84 | 85 | def collectStats2() = { 86 | 87 | def aggregateNewRecord(textStats: MutableTextStats, record: String): MutableTextStats = { 88 | val newWords = record.split(" ") 89 | val longestWord = newWords.maxBy(_.length) 90 | val newOccurrences = newWords.count(_ == criticalWord) 91 | 92 | textStats.nLines += 1 93 | textStats.nWords += newWords.length 94 | textStats.maxWordLength = if (longestWord.length > textStats.maxWordLength) longestWord.length else textStats.maxWordLength 95 | textStats.occurrences += newOccurrences 96 | 97 | textStats 98 | } 99 | 100 | def combineStats(stats1: MutableTextStats, stats2: MutableTextStats): MutableTextStats = { 101 | stats1.nLines += stats2.nLines 102 | stats1.nWords += stats2.nWords 103 | stats1.maxWordLength = if 
(stats1.maxWordLength > stats2.maxWordLength) stats1.maxWordLength else stats2.maxWordLength 104 | stats1.occurrences += stats2.occurrences 105 | 106 | stats1 107 | } 108 | 109 | val aggregate: RDD[(String, MutableTextStats)] = text.aggregateByKey(MutableTextStats.zero)(aggregateNewRecord, combineStats) 110 | aggregate.collectAsMap() 111 | } 112 | 113 | ///////////////////////// Version 3 - JVM arrays 114 | 115 | object UglyTextStats extends Serializable { 116 | val nLinesIndex = 0 117 | val nWordsIndex = 1 118 | val longestWordIndex = 2 119 | val occurrencesIndex = 3 120 | 121 | def aggregateNewRecord(textStats: Array[Int], record: String): Array[Int] = { 122 | val newWords = record.split(" ") // Array of strings 123 | 124 | var i = 0 125 | while (i < newWords.length) { 126 | val word = newWords(i) 127 | val wordLength = word.length 128 | 129 | textStats(longestWordIndex) = if (wordLength > textStats(longestWordIndex)) wordLength else textStats(longestWordIndex) 130 | textStats(occurrencesIndex) += (if (word == criticalWord) 1 else 0) 131 | 132 | i += 1 133 | } 134 | 135 | textStats(nLinesIndex) += 1 136 | textStats(nWordsIndex) += newWords.length 137 | 138 | textStats 139 | } 140 | 141 | def combineStats(stats1: Array[Int], stats2: Array[Int]): Array[Int] = { 142 | stats1(nLinesIndex) += stats2(nLinesIndex) 143 | stats1(nWordsIndex) += stats2(nWordsIndex) 144 | stats1(longestWordIndex) = if (stats1(longestWordIndex) > stats2(longestWordIndex)) stats1(longestWordIndex) else stats2(longestWordIndex) 145 | stats1(occurrencesIndex) += stats2(occurrencesIndex) 146 | 147 | stats1 148 | } 149 | } 150 | 151 | def collectStats3() = { 152 | val aggregate: RDD[(String, Array[Int])] = text.aggregateByKey(Array.fill(4)(0))(UglyTextStats.aggregateNewRecord, UglyTextStats.combineStats) 153 | aggregate.collectAsMap() 154 | } 155 | 156 | def main(args: Array[String]): Unit = { 157 | collectStats() 158 | collectStats2() 159 | collectStats3() 160 | 161 | Thread.sleep(1000000) 162 | } 163 | } 164 | -------------------------------------------------------------------------------- /src/main/scala/part3dfjoins/ColumnPruning.scala: -------------------------------------------------------------------------------- 1 | package part3dfjoins 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.sql.functions._ 5 | 6 | object ColumnPruning { 7 | 8 | val spark = SparkSession.builder() 9 | .appName("Column Pruning") 10 | .master("local[2]") 11 | .getOrCreate() 12 | 13 | val sc = spark.sparkContext 14 | import spark.implicits._ 15 | 16 | val guitarsDF = spark.read 17 | .option("inferSchema", "true") 18 | .json("src/main/resources/data/guitars/guitars.json") 19 | 20 | val guitarPlayersDF = spark.read 21 | .option("inferSchema", "true") 22 | .json("src/main/resources/data/guitarPlayers/guitarPlayers.json") 23 | 24 | val bandsDF = spark.read 25 | .option("inferSchema", "true") 26 | .json("src/main/resources/data/bands/bands.json") 27 | 28 | val joinCondition = guitarPlayersDF.col("band") === bandsDF.col("id") 29 | val guitaristsBandsDF = guitarPlayersDF.join(bandsDF, joinCondition, "inner") 30 | guitaristsBandsDF.explain() 31 | 32 | /* 33 | == Physical Plan == 34 | *(2) BroadcastHashJoin [band#22L], [id#38L], Inner, BuildLeft 35 | :- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, true])), [id=#34] 36 | : +- *(1) Project [band#22L, guitars#23, id#24L, name#25] <-- UNNECESSARY 37 | : +- *(1) Filter isnotnull(band#22L) 38 | : +- BatchScan[band#22L, guitars#23, id#24L, name#25] 
JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct,id:bigint,name:string> 39 | +- *(2) Project [hometown#37, id#38L, name#39, year#40L] 40 | +- *(2) Filter isnotnull(id#38L) 41 | +- BatchScan[hometown#37, id#38L, name#39, year#40L] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct 42 | */ 43 | 44 | val guitaristsWithoutBandsDF = guitarPlayersDF.join(bandsDF, joinCondition, "left_anti") 45 | guitaristsWithoutBandsDF.explain() 46 | /* 47 | == Physical Plan == 48 | *(2) BroadcastHashJoin [band#22L], [id#38L], LeftAnti, BuildRight 49 | :- *(2) Project [band#22L, guitars#23, id#24L, name#25] <- UNNECESSARY 50 | : +- BatchScan[band#22L, guitars#23, id#24L, name#25] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct,id:bigint,name:string> 51 | +- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, true])), [id=#66] 52 | +- *(1) Project [id#38L] <- COLUMN PRUNING 53 | +- *(1) Filter isnotnull(id#38L) 54 | +- BatchScan[id#38L] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct 55 | 56 | Column pruning = cut off columns that are not relevant 57 | = shrinks DF 58 | * useful for joins and groups 59 | */ 60 | 61 | // project and filter pushdown 62 | val namesDF = guitaristsBandsDF.select(guitarPlayersDF.col("name"), bandsDF.col("name")) 63 | namesDF.explain() 64 | 65 | /* 66 | == Physical Plan == 67 | *(2) Project [name#25, name#39] 68 | +- *(2) BroadcastHashJoin [band#22L], [id#38L], Inner, BuildLeft 69 | :- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, true])), [id=#100] 70 | : +- *(1) Project [band#22L, name#25] <- COLUMN PRUNING 71 | : +- *(1) Filter isnotnull(band#22L) 72 | : +- BatchScan[band#22L, name#25] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct 73 | +- *(2) Project [id#38L, name#39] 74 | +- *(2) Filter isnotnull(id#38L) 75 | +- BatchScan[id#38L, name#39] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct 76 | 77 | Spark tends to drop columns as early as possible. 78 | Should be YOUR goal as well. 
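For example (a sketch, not from the video), you can prune manually right after reading, before the join:

      val slimBands = bandsDF.select("id", "name") // keep only the columns the query needs
      val names2 = guitarPlayersDF.join(slimBands, guitarPlayersDF.col("band") === slimBands.col("id"))

    Spark infers the pruning in this simple query anyway; selecting explicitly guarantees it
    in queries too complex for the optimizer to see through.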
79 | */ 80 | 81 | val rockDF = guitarPlayersDF 82 | .join(bandsDF, joinCondition) 83 | .join(guitarsDF, array_contains(guitarPlayersDF.col("guitars"), guitarsDF.col("id"))) 84 | 85 | val essentialsDF = rockDF.select(guitarPlayersDF.col("name"), bandsDF.col("name"), upper(guitarsDF.col("make"))) 86 | essentialsDF.explain() 87 | /* 88 | == Physical Plan == 89 | *(4) Project [name#25, name#39, upper(make#9) AS upper(make)#164] TODO the upper function is done LAST 90 | +- BroadcastNestedLoopJoin BuildRight, Inner, array_contains(guitars#23, id#8L) 91 | :- *(2) Project [guitars#23, name#25, name#39] 92 | : +- *(2) BroadcastHashJoin [band#22L], [id#38L], Inner, BuildLeft 93 | : :- BroadcastExchange HashedRelationBroadcastMode(List(input[0, bigint, true])), [id=#156] 94 | : : +- *(1) Project [band#22L, guitars#23, name#25] TODO <- Column pruning 95 | : : +- *(1) Filter isnotnull(band#22L) 96 | : : +- BatchScan[band#22L, guitars#23, name#25] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct,name:string> 97 | : +- *(2) Project [id#38L, name#39] TODO <- Column pruning 98 | : +- *(2) Filter isnotnull(id#38L) 99 | : +- BatchScan[id#38L, name#39] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct 100 | +- BroadcastExchange IdentityBroadcastMode, [id=#167] 101 | +- *(3) Project [id#8L, make#9] TODO <- Column pruning 102 | +- BatchScan[id#8L, make#9] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct 103 | */ 104 | 105 | /** 106 | * LESSON: if you anticipate that the joined table is much larger than the table on whose column you are applying the 107 | * map-side operation, e.g. " * 5", or "upper", do this operation on the small table FIRST. 108 | * 109 | * Particularly useful for outer joins. 110 | */ 111 | 112 | def main(args: Array[String]): Unit = { 113 | 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /HadoopWindowsUserSetup.md: -------------------------------------------------------------------------------- 1 | *Apache Spark doesn't have its own system to organize files in a distributed way (the file system), so it requires 2 | external file systems to store and process large datasets. For this reason, programmers install Spark 3 | on top of Hadoop so that Spark's advanced analytics applications can make use of the data stored using the Hadoop Distributed 4 | File System (HDFS).* 5 | 6 | ****Prerequisites:**** 7 | 8 | Before you start installing Hadoop on Windows, there are a few prerequisites that you need to have in place: 9 | 10 | 1. Java Development Kit (JDK) version 11 or higher 11 | 2. Apache Hadoop distribution suitable for Windows 12 | 13 | **Step 1:** *Install the Java Development Kit* 14 | 15 | Hadoop is built using Java, so you’ll need to install the Java Development Kit (JDK) version 11 16 | or higher on your computer. You can download the JDK from the Oracle website. 17 | (https://www.oracle.com/in/java/technologies/javase/javase8-archive-downloads.html) Once the download is 18 | complete, run the installer and follow the instructions to install the JDK. 
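You can verify the installation by opening a new Command Prompt and running `java -version`; it should report the JDK version you just installed.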
19 | 20 | **Step 2:** *Download the Hadoop distribution* 21 | 22 | To install Hadoop on Windows, you’ll need to download the appropriate distribution from the 23 | Apache Hadoop website (https://hadoop.apache.org/releases.html). 24 | You’ll want to choose the distribution that is compatible with your version of Windows (hadoop-3.3.6) and click on binary. 25 | Once you’ve downloaded the distribution, extract the contents of hadoop-3.3.6.tar.gz and place them under `C:\Hadoop`. 26 | 27 | **Step 3:** *Set up the Environment Variables* 28 | 29 | To use Java & Hadoop, you’ll need to set up some environment variables. 30 | This will allow you to run Java & Hadoop commands from any directory on your computer. 31 | To set up the environment variables, follow these steps: 32 | 33 | 1. Open the Start menu and search for “Environment Variables”. 34 | 2. Click on “Edit the system environment variables”. 35 | 3. Click on the “Environment Variables” button. 36 | 4. Under “System Variables”, click on “New”. 37 | 5. Enter “JAVA_HOME” as the variable name and the path to the directory where your Java is installed (for example, C:\Program Files\Java\jdk1.8.0) as the variable value. 38 | 6. Click “OK”. 39 | 7. Enter “HADOOP_HOME” as the variable name and the path to the directory where you extracted the Hadoop distribution (for example, C:\hadoop) as the variable value. 40 | 8. Click “OK”. 41 | 9. Locate the “Path” variable in the “System Variables” list and click “Edit”. 42 | 10. Add the following to the end of the “Variable value” field: `%JAVA_HOME%\bin;%HADOOP_HOME%\bin;%HADOOP_HOME%\sbin;` 43 | 11. Click “OK” to close all the windows. 44 | 45 | **Step 4:** *Install Hadoop native IO binary* 46 | 47 | Clone or download the winutils repository (https://github.com/cdarlint/winutils/tree/master/hadoop-3.3.5/bin) 48 | and copy the contents of `hadoop-3.3.5/bin` into the extracted location of the Hadoop binary package. 49 | In our example, it will be `C:\Hadoop\bin`. 50 | 51 | **Important Note:** The following steps are not necessary for Spark to run; the above is sufficient to work with Spark. 52 | However, you can proceed if you really want the entire Hadoop distribution working locally. 53 | 54 | **Step 5:** *Hadoop Configuration* 55 | 56 | To configure Hadoop, you’ll need to modify a few configuration files. 57 | These files are located in the `etc/hadoop` directory of the Hadoop folder. 58 | Open each of the following files in a text editor, make the changes described below and save the files: 59 | 60 | 1. `core-site.xml`: Add the following lines to the file inside `<configuration>` like this: 61 | ``` 62 | <configuration> 63 | <property> 64 | <name>fs.defaultFS</name> 65 | <value>hdfs://localhost:9000</value> 66 | </property> 67 | </configuration> 68 | ``` 69 | 70 | 2. Open the file `hadoop-env.cmd` (Windows command script) and replace `set JAVA_HOME=%JAVA_HOME%` 71 | with the Java installation location, e.g. `set JAVA_HOME=C:\Program Files\Java\jdk1.11.0`, 72 | or if that doesn't work, use `set JAVA_HOME=C:\Progra~1\Java\jdk1.11.0`. Also, go to the bottom of the file 73 | and set your name in this variable: `set HADOOP_IDENT_STRING=RockTheJVM`. 74 | 75 | 76 | 3. `hdfs-site.xml`: First create these folders - `C:/hadoop/data/dfs/namenode` and `C:/hadoop/data/dfs/datanode`. 77 | Add the following lines to the file inside `<configuration>` like this: 78 | ``` 79 | <property> 80 | <name>dfs.replication</name> 81 | <value>1</value> 82 | </property> 83 | <property> 84 | <name>dfs.namenode.name.dir</name> 85 | <value>file:///C:/hadoop/data/dfs/namenode</value> 86 | </property> 87 | <property> 88 | <name>dfs.datanode.data.dir</name> 89 | <value>file:///C:/hadoop/data/dfs/datanode</value> 90 | </property> 91 | ``` 92 | 93 | 4.
`mapred-site.xml`: Add the following lines to the file inside `<configuration>` like this: 94 | ``` 95 | <configuration> 96 | <property> 97 | <name>mapreduce.framework.name</name> 98 | <value>yarn</value> 99 | </property> 100 | </configuration> 101 | ``` 102 | 103 | 5. `yarn-site.xml`: Add the following lines to the file inside `<configuration>` like this: 104 | ``` 105 | <configuration> 106 | <property> 107 | <name>yarn.nodemanager.aux-services</name> 108 | <value>mapreduce_shuffle</value> 109 | <description>Yarn Node Manager Aux Service</description> 110 | </property> 111 | </configuration> 112 | ``` 113 | 114 | **Step 6:** **If you want to start Hadoop:** 115 | 116 | To start Hadoop, open a command prompt and navigate to the directory where you extracted the Hadoop distribution. 117 | Then, run the following commands: 118 | ``` 119 | cd sbin 120 | start-all.cmd 121 | ``` 122 | This will start the Hadoop daemons and launch the web interfaces. You can access the NameNode web interface by going to http://localhost:9870/ in your web browser (the HDFS service itself listens at hdfs://localhost:9000, as configured in core-site.xml). 123 | 124 | ****Conclusion:**** 125 | 126 | Setting up Hadoop on a Windows system can pose some challenges, but by following this comprehensive guide, 127 | you'll be able to configure it smoothly and quickly. Hadoop is a robust solution for handling extensive datasets and executing distributed applications, 128 | making it a favored choice for numerous enterprises and institutions worldwide. 129 | Whether you're a data scientist or a software developer, integrating Hadoop into your toolkit is highly beneficial. 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 |
-------------------------------------------------------------------------------- /src/main/scala/part3dfjoins/PrePartitioning.scala: --------------------------------------------------------------------------------
1 | package part3dfjoins 2 | 3 | import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} 4 | 5 | object PrePartitioning { 6 | 7 | val spark = SparkSession.builder() 8 | .appName("Pre-partitioning") 9 | .master("local") 10 | .getOrCreate() 11 | 12 | import spark.implicits._ 13 | 14 | // deactivate broadcast joins 15 | spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1) 16 | 17 | /* 18 | addColumns(initialTable, 3) => dataframe with columns "id", "newCol1", "newCol2", "newCol3" 19 | */ 20 | def addColumns[T](df: Dataset[T], n: Int): DataFrame = { 21 | val newColumns = (1 to n).map(i => s"id * $i as newCol$i") 22 | df.selectExpr(("id" +: newColumns): _*) 23 | } 24 | 25 | // don't touch this 26 | val initialTable = spark.range(1, 10000000).repartition(10) // RoundRobinPartitioning(10) 27 | val narrowTable = spark.range(1, 5000000).repartition(7) // RoundRobinPartitioning(7) 28 | 29 | // scenario 1 30 | val wideTable = addColumns(initialTable, 30) 31 | val join1 = wideTable.join(narrowTable, "id") 32 | join1.explain() 33 | // println(join1.count()) // around 20s 34 | /* 35 | == Physical Plan == 36 | *(6) Project [id#0L, newCol1#8L, newCol2#9L, newCol3#10L, newCol4#11L, newCol5#12L, newCol6#13L, newCol7#14L, newCol8#15L, newCol9#16L, newCol10#17L, newCol11#18L, newCol12#19L, newCol13#20L, newCol14#21L, newCol15#22L, newCol16#23L, newCol17#24L, newCol18#25L, newCol19#26L, newCol20#27L, newCol21#28L, newCol22#29L, newCol23#30L, ...
7 more fields] 37 | +- *(6) SortMergeJoin [id#0L], [id#4L], Inner 38 | :- *(3) Sort [id#0L ASC NULLS FIRST], false, 0 39 | : +- Exchange hashpartitioning(id#0L, 200), true, [id=#39] 40 | : +- *(2) Project [id#0L, (id#0L * 1) AS newCol1#8L, (id#0L * 2) AS newCol2#9L, (id#0L * 3) AS newCol3#10L, (id#0L * 4) AS newCol4#11L, (id#0L * 5) AS newCol5#12L, (id#0L * 6) AS newCol6#13L, (id#0L * 7) AS newCol7#14L, (id#0L * 8) AS newCol8#15L, (id#0L * 9) AS newCol9#16L, (id#0L * 10) AS newCol10#17L, (id#0L * 11) AS newCol11#18L, (id#0L * 12) AS newCol12#19L, (id#0L * 13) AS newCol13#20L, (id#0L * 14) AS newCol14#21L, (id#0L * 15) AS newCol15#22L, (id#0L * 16) AS newCol16#23L, (id#0L * 17) AS newCol17#24L, (id#0L * 18) AS newCol18#25L, (id#0L * 19) AS newCol19#26L, (id#0L * 20) AS newCol20#27L, (id#0L * 21) AS newCol21#28L, (id#0L * 22) AS newCol22#29L, (id#0L * 23) AS newCol23#30L, ... 7 more fields] 41 | : +- Exchange RoundRobinPartitioning(10), false, [id=#35] 42 | : +- *(1) Range (1, 10000000, step=1, splits=1) 43 | +- *(5) Sort [id#4L ASC NULLS FIRST], false, 0 44 | +- Exchange hashpartitioning(id#4L, 200), true, [id=#46] 45 | +- Exchange RoundRobinPartitioning(7), false, [id=#45] 46 | +- *(4) Range (1, 5000000, step=1, splits=1) 47 | */ 48 | 49 | // scenario 2 50 | val altNarrow = narrowTable.repartition($"id") // use a HashPartitioner 51 | val altInitial = initialTable.repartition($"id") 52 | // join on co-partitioned DFs 53 | val join2 = altInitial.join(altNarrow, "id") 54 | val result2 = addColumns(join2, 30) 55 | result2.explain() 56 | // println(result2.count()) // 6s 57 | 58 | /* 59 | == Physical Plan == 60 | *(5) Project [id#0L, (id#0L * 1) AS newCol1#105L, (id#0L * 2) AS newCol2#106L, (id#0L * 3) AS newCol3#107L, (id#0L * 4) AS newCol4#108L, (id#0L * 5) AS newCol5#109L, (id#0L * 6) AS newCol6#110L, (id#0L * 7) AS newCol7#111L, (id#0L * 8) AS newCol8#112L, (id#0L * 9) AS newCol9#113L, (id#0L * 10) AS newCol10#114L, (id#0L * 11) AS newCol11#115L, (id#0L * 12) AS newCol12#116L, (id#0L * 13) AS newCol13#117L, (id#0L * 14) AS newCol14#118L, (id#0L * 15) AS newCol15#119L, (id#0L * 16) AS newCol16#120L, (id#0L * 17) AS newCol17#121L, (id#0L * 18) AS newCol18#122L, (id#0L * 19) AS newCol19#123L, (id#0L * 20) AS newCol20#124L, (id#0L * 21) AS newCol21#125L, (id#0L * 22) AS newCol22#126L, (id#0L * 23) AS newCol23#127L, ... 7 more fields] 61 | +- *(5) SortMergeJoin [id#0L], [id#4L], Inner 62 | :- *(2) Sort [id#0L ASC NULLS FIRST], false, 0 63 | : +- Exchange hashpartitioning(id#0L, 200), false, [id=#91] 64 | : +- *(1) Range (1, 10000000, step=1, splits=1) 65 | +- *(4) Sort [id#4L ASC NULLS FIRST], false, 0 66 | +- Exchange hashpartitioning(id#4L, 200), false, [id=#97] 67 | +- *(3) Range (1, 5000000, step=1, splits=1) 68 | */ 69 | 70 | /** 71 | * Lesson: partition early. 72 | * Partitioning late is AT BEST what Spark naturally does. 73 | */ 74 | 75 | // scenario 3 76 | val enhanceColumnsFirst = addColumns(initialTable, 30) 77 | val repartitionedNarrow = narrowTable.repartition($"id") 78 | val repartitionedEnhanced = enhanceColumnsFirst.repartition($"id") // USELESS! 
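// (useless twice over: the join below uses enhanceColumnsFirst, so repartitionedEnhanced never enters
// the query plan - and even if it did, repartitioning AFTER addColumns means the exchange would have
// to move all 30 extra columns, which is exactly the work we are trying to avoid)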
79 | val result3 = enhanceColumnsFirst.join(repartitionedNarrow, "id") 80 | // println(result3.count()) // around 19-20s 81 | result3.explain() 82 | /* 83 | == Physical Plan == 84 | *(6) Project [id#0L, newCol1#166L, newCol2#167L, newCol3#168L, newCol4#169L, newCol5#170L, newCol6#171L, newCol7#172L, newCol8#173L, newCol9#174L, newCol10#175L, newCol11#176L, newCol12#177L, newCol13#178L, newCol14#179L, newCol15#180L, newCol16#181L, newCol17#182L, newCol18#183L, newCol19#184L, newCol20#185L, newCol21#186L, newCol22#187L, newCol23#188L, ... 7 more fields] 85 | +- *(6) SortMergeJoin [id#0L], [id#4L], Inner 86 | :- *(3) Sort [id#0L ASC NULLS FIRST], false, 0 87 | : +- Exchange hashpartitioning(id#0L, 200), false, [id=#154] 88 | : +- *(2) Project [id#0L, (id#0L * 1) AS newCol1#166L, (id#0L * 2) AS newCol2#167L, (id#0L * 3) AS newCol3#168L, (id#0L * 4) AS newCol4#169L, (id#0L * 5) AS newCol5#170L, (id#0L * 6) AS newCol6#171L, (id#0L * 7) AS newCol7#172L, (id#0L * 8) AS newCol8#173L, (id#0L * 9) AS newCol9#174L, (id#0L * 10) AS newCol10#175L, (id#0L * 11) AS newCol11#176L, (id#0L * 12) AS newCol12#177L, (id#0L * 13) AS newCol13#178L, (id#0L * 14) AS newCol14#179L, (id#0L * 15) AS newCol15#180L, (id#0L * 16) AS newCol16#181L, (id#0L * 17) AS newCol17#182L, (id#0L * 18) AS newCol18#183L, (id#0L * 19) AS newCol19#184L, (id#0L * 20) AS newCol20#185L, (id#0L * 21) AS newCol21#186L, (id#0L * 22) AS newCol22#187L, (id#0L * 23) AS newCol23#188L, ... 7 more fields] 89 | : +- Exchange RoundRobinPartitioning(10), false, [id=#150] 90 | : +- *(1) Range (1, 10000000, step=1, splits=1) 91 | +- *(5) Sort [id#4L ASC NULLS FIRST], false, 0 92 | +- Exchange hashpartitioning(id#4L, 200), false, [id=#160] 93 | +- *(4) Range (1, 5000000, step=1, splits=1) 94 | */ 95 | 96 | /** 97 | * Exercise: what would happen if we just repartitioned the smaller table to 10 partitions? 98 | * TERRIBLE! 
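* (repartition(10) is a round-robin repartition, not a hash repartition on "id", so nothing gets
* co-partitioned - Spark still inserts the full hashpartitioning exchanges on both sides of the
* sort-merge join, as the explain() below confirms)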
99 | * 100 | */ 101 | 102 | initialTable.join(narrowTable.repartition(10), "id").explain() // identical to scenario 1 103 | 104 | def main(args: Array[String]): Unit = { 105 | 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/main/scala/part3dfjoins/Bucketing.scala: -------------------------------------------------------------------------------- 1 | package part3dfjoins 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | object Bucketing { 6 | 7 | val spark = SparkSession.builder() 8 | .appName("Bucketing") 9 | .master("local") 10 | .getOrCreate() 11 | 12 | import spark.implicits._ 13 | 14 | // deactivate broadcasting 15 | spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1) 16 | 17 | val large = spark.range(1000000).selectExpr("id * 5 as id").repartition(10) 18 | val small = spark.range(10000).selectExpr("id * 3 as id").repartition(3) 19 | 20 | val joined = large.join(small, "id") 21 | joined.explain() 22 | /* 23 | == Physical Plan == 24 | *(5) Project [id#2L] 25 | +- *(5) SortMergeJoin [id#2L], [id#6L], Inner 26 | :- *(2) Sort [id#2L ASC NULLS FIRST], false, 0 27 | : +- Exchange hashpartitioning(id#2L, 200), true, [id=#40] 28 | : +- Exchange RoundRobinPartitioning(10), false, [id=#39] 29 | : +- *(1) Project [(id#0L * 5) AS id#2L] 30 | : +- *(1) Range (0, 1000000, step=1, splits=1) 31 | +- *(4) Sort [id#6L ASC NULLS FIRST], false, 0 32 | +- Exchange hashpartitioning(id#6L, 200), true, [id=#47] 33 | +- Exchange RoundRobinPartitioning(3), false, [id=#46] 34 | +- *(3) Project [(id#4L * 3) AS id#6L] 35 | +- *(3) Range (0, 10000, step=1, splits=1) 36 | 37 | */ 38 | 39 | // bucketing 40 | large.write 41 | .bucketBy(4, "id") 42 | .sortBy("id") 43 | .mode("overwrite") 44 | .saveAsTable("bucketed_large") 45 | 46 | small.write 47 | .bucketBy(4, "id") 48 | .sortBy("id") 49 | .mode("overwrite") 50 | .saveAsTable("bucketed_small") // bucketing and saving almost as expensive as a regular shuffle 51 | 52 | spark.sql("use default") 53 | val bucketedLarge = spark.table("bucketed_large") 54 | val bucketedSmall = spark.table("bucketed_small") 55 | val bucketedJoin = bucketedLarge.join(bucketedSmall, "id") 56 | bucketedJoin.explain() 57 | /* 58 | *(3) Project [id#11L] 59 | +- *(3) SortMergeJoin [id#11L], [id#13L], Inner 60 | :- *(1) Sort [id#11L ASC NULLS FIRST], false, 0 61 | : +- *(1) Project [id#11L] 62 | : +- *(1) Filter isnotnull(id#11L) 63 | : +- *(1) ColumnarToRow 64 | : +- FileScan parquet default.bucketed_large[id#11L] Batched: true, DataFilters: [isnotnull(id#11L)], Format: Parquet, Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/spark-warehouse/bu..., PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct, SelectedBucketsCount: 4 out of 4 65 | +- *(2) Sort [id#13L ASC NULLS FIRST], false, 0 66 | +- *(2) Project [id#13L] 67 | +- *(2) Filter isnotnull(id#13L) 68 | +- *(2) ColumnarToRow 69 | +- FileScan parquet default.bucketed_small[id#13L] Batched: true, DataFilters: [isnotnull(id#13L)], Format: Parquet, Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/spark-warehouse/bu..., PartitionFilters: [], PushedFilters: [IsNotNull(id)], ReadSchema: struct, SelectedBucketsCount: 4 out of 4 70 | 71 | */ 72 | 73 | // bucketing for groups 74 | val flightsDF = spark.read 75 | .option("inferSchema", "true") 76 | .json("src/main/resources/data/flights/flights.json") 77 | .repartition(2) 78 | 79 | val mostDelayed = flightsDF 80 | .filter("origin = 
'DEN' and arrdelay > 1") 81 | .groupBy("origin", "dest", "carrier") 82 | .avg("arrdelay") 83 | .orderBy($"avg(arrdelay)".desc_nulls_last) 84 | mostDelayed.explain() 85 | 86 | /* 87 | == Physical Plan == 88 | *(4) Sort [avg(arrdelay)#53 DESC NULLS LAST], true, 0 89 | +- Exchange rangepartitioning(avg(arrdelay)#53 DESC NULLS LAST, 200), true, [id=#111] 90 | +- *(3) HashAggregate(keys=[origin#27, dest#24, carrier#18], functions=[avg(arrdelay#17)]) 91 | +- Exchange hashpartitioning(origin#27, dest#24, carrier#18, 200), true, [id=#107] 92 | +- *(2) HashAggregate(keys=[origin#27, dest#24, carrier#18], functions=[partial_avg(arrdelay#17)]) 93 | +- Exchange RoundRobinPartitioning(2), false, [id=#103] 94 | +- *(1) Project [arrdelay#17, carrier#18, dest#24, origin#27] 95 | +- *(1) Filter (((isnotnull(origin#27) AND isnotnull(arrdelay#17)) AND (origin#27 = DEN)) AND (arrdelay#17 > 1.0)) 96 | +- BatchScan[arrdelay#17, carrier#18, dest#24, origin#27] JsonScan Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/src/main/resources..., ReadSchema: struct 97 | */ 98 | 99 | // flightsDF.write 100 | // .partitionBy("origin") 101 | // .bucketBy(4, "dest", "carrier") 102 | // .saveAsTable("flights_bucketed") // just as long as a shuffle 103 | // 104 | // val flightsBucketed = spark.table("flights_bucketed") 105 | // val mostDelayed2 = flightsBucketed 106 | // .filter("origin = 'DEN' and arrdelay > 1") 107 | // .groupBy("origin", "dest", "carrier") 108 | // .avg("arrdelay") 109 | // .orderBy($"avg(arrdelay)".desc_nulls_last) 110 | // mostDelayed2.explain() 111 | /* 112 | == Physical Plan == 113 | *(2) Sort [avg(arrdelay)#140 DESC NULLS LAST], true, 0 114 | +- Exchange rangepartitioning(avg(arrdelay)#140 DESC NULLS LAST, 200), true, [id=#172] 115 | +- *(1) HashAggregate(keys=[origin#114, dest#111, carrier#105], functions=[avg(arrdelay#104)]) 116 | +- *(1) HashAggregate(keys=[origin#114, dest#111, carrier#105], functions=[partial_avg(arrdelay#104)]) 117 | +- *(1) Project [arrdelay#104, carrier#105, dest#111, origin#114] 118 | +- *(1) Filter (isnotnull(arrdelay#104) AND (arrdelay#104 > 1.0)) 119 | +- *(1) ColumnarToRow 120 | +- FileScan parquet default.flights_bucketed[arrdelay#104,carrier#105,dest#111,origin#114] Batched: true, DataFilters: [isnotnull(arrdelay#104), (arrdelay#104 > 1.0)], Format: Parquet, Location: PrunedInMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/spark-wareho..., PartitionFilters: [isnotnull(origin#114), (origin#114 = DEN)], PushedFilters: [IsNotNull(arrdelay), GreaterThan(arrdelay,1.0)], ReadSchema: struct, SelectedBucketsCount: 4 out of 4 121 | */ 122 | 123 | /** 124 | * Bucket pruning 125 | */ 126 | val the10 = bucketedLarge.filter($"id" === 10) 127 | the10.show() 128 | the10.explain() 129 | /* 130 | == Physical Plan == 131 | *(1) Project [id#11L] 132 | +- *(1) Filter (isnotnull(id#11L) AND (id#11L = 10)) 133 | +- *(1) ColumnarToRow 134 | +- FileScan parquet default.bucketed_large[id#11L] Batched: true, DataFilters: [isnotnull(id#11L), (id#11L = 10)], Format: Parquet, Location: InMemoryFileIndex[file:/Users/daniel/dev/rockthejvm/courses/spark-optimization/spark-warehouse/bu..., PartitionFilters: [], PushedFilters: [IsNotNull(id), EqualTo(id,10)], ReadSchema: struct, SelectedBucketsCount: 1 out of 4 135 | */ 136 | 137 | def main(args: Array[String]): Unit = { 138 | // joined.count() // 4-5s 139 | // bucketedJoin.count() // 4s for bucketing + 0.5s for counting 140 | // mostDelayed.show() // ~1s 141 | // mostDelayed2.show() 
// ~0.2s = 5x perf!
142 |   }
143 | 
144 | }
145 | 
--------------------------------------------------------------------------------
/spark-cluster/README.md:
--------------------------------------------------------------------------------
1 | # Spark Cluster with Docker & docker-compose
2 | 
3 | # General
4 | 
5 | A simple Spark standalone cluster for your testing environment: your Spark development environment is just a *docker-compose up* away.
6 | 
7 | The Docker compose will create the following containers:
8 | 
9 | Container|IP address
10 | ---|---
11 | spark-master|10.5.0.2
12 | spark-worker-1|10.5.0.3
13 | spark-worker-2|10.5.0.4
14 | spark-worker-3|10.5.0.5
15 | 
16 | # Installation
17 | 
18 | The following steps will get your Spark cluster's containers up and running.
19 | 
20 | ## Prerequisites
21 | 
22 | * Docker installed
23 | 
24 | * Docker compose installed
25 | 
26 | * A Spark application JAR to play with (optional)
27 | 
28 | ## Build the images
29 | 
30 | The first step to deploy the cluster is building the custom images; these builds can be performed with the *build-images.sh* script.
31 | 
32 | Running it is as simple as:
33 | 
34 | ```sh
35 | chmod +x build-images.sh
36 | ./build-images.sh
37 | ```
38 | 
39 | This will create the following Docker images:
40 | 
41 | * spark-base:3.5.0: A base image based on java:alpine-jdk-8 which ships Scala, Python 3 and Spark 3.5.0
42 | 
43 | * spark-master:3.5.0: An image based on the previously created Spark base image, used to create Spark master containers.
44 | 
45 | * spark-worker:3.5.0: An image based on the previously created Spark base image, used to create Spark worker containers.
46 | 
47 | * spark-submit:3.5.0: An image based on the previously created Spark base image, used to create spark-submit containers (run, deliver the driver, and die gracefully).
48 | 
49 | ## Run the docker-compose
50 | 
51 | The final step to create your test cluster is to run the compose file:
52 | 
53 | ```sh
54 | docker-compose up --scale spark-worker=3
55 | ```
56 | 
57 | ## Validate your cluster
58 | 
59 | Validate your cluster by accessing the Spark UI at the master and worker URLs.
60 | 
61 | ### Spark Master
62 | 
63 | http://10.5.0.2:8080/
64 | 
65 | ![alt text](docs/spark-master.png "Spark master UI")
66 | 
67 | ### Spark Worker 1
68 | 
69 | http://10.5.0.3:8081/
70 | 
71 | ![alt text](docs/spark-worker-1.png "Spark worker 1 UI")
72 | 
73 | ### Spark Worker 2
74 | 
75 | http://10.5.0.4:8081/
76 | 
77 | ![alt text](docs/spark-worker-2.png "Spark worker 2 UI")
78 | 
79 | ### Spark Worker 3
80 | 
81 | http://10.5.0.5:8081/
82 | 
83 | ![alt text](docs/spark-worker-3.png "Spark worker 3 UI")
84 | 
85 | # Resource Allocation
86 | 
87 | This cluster ships with three workers and one Spark master, each with a particular resource allocation (basically RAM & CPU core allocation).
88 | 
89 | * The default CPU core allocation for each Spark worker is 1 core.
90 | 
91 | * The default RAM for each spark-worker is 1024 MB.
92 | 
93 | * The default RAM allocation for Spark executors is 256 MB.
94 | 
95 | * The default RAM allocation for the Spark driver is 128 MB.
96 | 
97 | * If you wish to modify these allocations, just edit the env/spark-worker.sh file (see the example below).
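For instance, here is a minimal sketch (my example values, not shipped defaults) that doubles each worker's resources via the standard Spark standalone environment variables:

```sh
# env/spark-worker.sh -- example override; tune to your machine's resources
SPARK_WORKER_CORES=2       # CPU cores per worker (default: 1)
SPARK_WORKER_MEMORY=2G     # RAM per worker (default: 1024 MB)
SPARK_DRIVER_MEMORY=256m   # RAM for the driver (default: 128 MB)
SPARK_EXECUTOR_MEMORY=512m # RAM per executor (default: 256 MB)
```

Restart the cluster (docker-compose down, then docker-compose up --scale spark-worker=3 again) for the new values to take effect.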
98 | 
99 | # Mounted Volumes
100 | 
101 | To make running apps easier, I've shipped two volume mounts, described in the following chart:
102 | 
103 | Host Mount|Container Mount|Purpose
104 | ---|---|---
105 | /mnt/spark-apps|/opt/spark-apps|Used to make your app's jars available on all workers & the master
106 | /mnt/spark-data|/opt/spark-data|Used to make your app's data available on all workers & the master
107 | 
108 | This is basically a dummy DFS created from Docker volumes... (well, maybe not quite).
109 | 
110 | # Run a sample application
111 | 
112 | Now let's make a **wild spark-submit** to validate the distributed nature of our new toy, following these steps:
113 | 
114 | ## Create a Scala spark app
115 | 
116 | The first thing you need to do is make a Spark application. Our spark-submit image is designed to run Scala code (PySpark support will ship soon; I guess I was just too lazy to add it).
117 | 
118 | In my case I am using an app called [crimes-app](https://). You can make or use your own Scala app; I've just used this one because I had it at hand.
119 | 
120 | 
121 | ## Ship your jar & dependencies to the Workers and Master
122 | 
123 | A necessary step before a **spark-submit** is to copy your application bundle onto all workers, along with any configuration or input files it needs.
124 | 
125 | Luckily for us, we are using Docker volumes, so you just have to copy your app and configs into /mnt/spark-apps, and your input files into /mnt/spark-data.
126 | 
127 | ```bash
128 | # Copy the spark application into all workers' app folder
129 | cp /home/workspace/crimes-app/build/libs/crimes-app.jar /mnt/spark-apps
130 | 
131 | # Copy the spark application configs into all workers' app folder
132 | cp -r /home/workspace/crimes-app/config /mnt/spark-apps
133 | 
134 | # Copy the file to be processed to all workers' data folder
135 | cp /home/Crimes_-_2001_to_present.csv /mnt/spark-data
136 | ```
137 | 
138 | ## Check the successful copy of the data and app jar (Optional)
139 | 
140 | This is not a necessary step, but if you are curious you can check that your app code and files are in place before running the spark-submit.
141 | 
142 | ```sh
143 | # Worker 1 Validations
144 | docker exec -ti spark-worker-1 ls -l /opt/spark-apps
145 | 
146 | docker exec -ti spark-worker-1 ls -l /opt/spark-data
147 | 
148 | # Worker 2 Validations
149 | docker exec -ti spark-worker-2 ls -l /opt/spark-apps
150 | 
151 | docker exec -ti spark-worker-2 ls -l /opt/spark-data
152 | 
153 | # Worker 3 Validations
154 | docker exec -ti spark-worker-3 ls -l /opt/spark-apps
155 | 
156 | docker exec -ti spark-worker-3 ls -l /opt/spark-data
157 | ```
158 | After running any of these commands you should see your app's jar and files.
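If you'd rather check all three workers in one go, a small loop over the container names listed above does the same job (a convenience sketch, not part of the original scripts):

```bash
# Check the app jar and data files on every worker in one pass
for w in spark-worker-1 spark-worker-2 spark-worker-3; do
  echo "== $w =="
  docker exec -ti "$w" ls -l /opt/spark-apps /opt/spark-data
done
```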
159 | 
160 | 
161 | ## Use docker spark-submit
162 | 
163 | ```bash
164 | # Create some variables to make the docker run command more readable
165 | # App jar location used by the spark-submit image
166 | SPARK_APPLICATION_JAR_LOCATION="/opt/spark-apps/crimes-app.jar"
167 | # App main class used by the spark-submit image
168 | SPARK_APPLICATION_MAIN_CLASS="org.mvb.applications.CrimesApp"
169 | # Extra submit args used by the spark-submit image
170 | SPARK_SUBMIT_ARGS="--conf spark.executor.extraJavaOptions='-Dconfig-path=/opt/spark-apps/dev/config.conf'"
171 | 
172 | # We have to use the same network as the spark cluster (internally, the image resolves the Spark master as spark://spark-master:7077)
173 | docker run --network docker-spark-cluster_spark-network \
174 | -v /mnt/spark-apps:/opt/spark-apps \
175 | --env SPARK_APPLICATION_JAR_LOCATION=$SPARK_APPLICATION_JAR_LOCATION \
176 | --env SPARK_APPLICATION_MAIN_CLASS=$SPARK_APPLICATION_MAIN_CLASS \
177 | --env SPARK_SUBMIT_ARGS="$SPARK_SUBMIT_ARGS" \
178 | spark-submit:3.5.0
179 | ```
180 | 
181 | After running this you will see output pretty much like this:
182 | 
183 | ```bash
184 | Running Spark using the REST application submission protocol.
185 | 2018-09-23 15:17:52 INFO RestSubmissionClient:54 - Submitting a request to launch an application in spark://spark-master:6066.
186 | 2018-09-23 15:17:53 INFO RestSubmissionClient:54 - Submission successfully created as driver-20180923151753-0000. Polling submission state...
187 | 2018-09-23 15:17:53 INFO RestSubmissionClient:54 - Submitting a request for the status of submission driver-20180923151753-0000 in spark://spark-master:6066.
188 | 2018-09-23 15:17:53 INFO RestSubmissionClient:54 - State of driver driver-20180923151753-0000 is now RUNNING.
189 | 2018-09-23 15:17:53 INFO RestSubmissionClient:54 - Driver is running on worker worker-20180923151711-10.5.0.4-45381 at 10.5.0.4:45381.
190 | 2018-09-23 15:17:53 INFO RestSubmissionClient:54 - Server responded with CreateSubmissionResponse:
191 | {
192 | "action" : "CreateSubmissionResponse",
193 | "message" : "Driver successfully submitted as driver-20180923151753-0000",
194 | "serverSparkVersion" : "3.5.0",
195 | "submissionId" : "driver-20180923151753-0000",
196 | "success" : true
197 | }
198 | ```
199 | 
200 | # Summary (What have I done :O?)
201 | 
202 | * We compiled the necessary Docker images to run Spark master and worker containers.
203 | 
204 | * We created a Spark standalone cluster using 3 worker nodes and 1 master node, using Docker & docker-compose.
205 | 
206 | * We copied the resources necessary to run a sample application.
207 | 
208 | * We submitted an application to the cluster using a **spark-submit** Docker image.
209 | 
210 | * We ran a distributed application at home (you just need enough CPU cores and RAM to do so).
211 | 
212 | # Why a standalone cluster?
213 | 
214 | * This is intended for testing purposes: basically, a way of running distributed Spark apps on your laptop or desktop.
215 | 
216 | * Right now I don't have enough resources to set up a YARN, Mesos or Kubernetes based cluster :(.
217 | 
218 | * This will be useful for building CI/CD pipelines for your Spark apps (a really difficult and hot topic).
219 | 
--------------------------------------------------------------------------------
/src/main/scala/generator/DataGenerator.scala:
--------------------------------------------------------------------------------
1 | package generator
2 | 
3 | import java.io.{File, FileWriter, PrintWriter}
4 | 
5 | import scala.annotation.tailrec
6 | import scala.io.Source
7 | import scala.util.Random
8 | 
9 | object DataGenerator {
10 | 
11 |   val random = new Random()
12 | 
13 |   /////////////////////////////////////////////////////////////////////////////////
14 |   // General data generation
15 |   /////////////////////////////////////////////////////////////////////////////////
16 | 
17 |   def randomDouble(limit: Double): Double = random.nextDouble() * limit
18 | 
19 |   def randomLong(limit: Long = Long.MaxValue): Long = Math.abs(random.nextLong()) % limit
20 | 
21 |   def randomInt(limit: Int = Int.MaxValue): Int = random.nextInt(limit)
22 | 
23 |   def randomIntBetween(low: Int, high: Int) = {
24 |     assert(low <= high)
25 |     random.nextInt(high - low) + low
26 |   }
27 | 
28 |   def randomString(n: Int) =
29 |     new String((1 to n).map(_ => ('a' + random.nextInt(26)).toChar).toArray) // exactly n characters (was `0 to n`, which produced n + 1)
30 | 
31 |   /////////////////////////////////////////////////////////////////////////////////
32 |   // Laptop models generation - skewed data lectures
33 |   /////////////////////////////////////////////////////////////////////////////////
34 | 
35 |   val laptopModelsSet: Seq[LaptopModel] = Seq(
36 |     LaptopModel("Razer", "Blade"),
37 |     LaptopModel("Alienware", "Area-51"),
38 |     LaptopModel("HP", "Omen"),
39 |     LaptopModel("Acer", "Predator"),
40 |     LaptopModel("Asus", "ROG"),
41 |     LaptopModel("Lenovo", "Legion"),
42 |     LaptopModel("MSI", "Raider")
43 |   )
44 | 
45 |   def randomLaptopModel(uniform: Boolean = false): LaptopModel = {
46 |     val makeModelIndex = if (!uniform && random.nextBoolean()) 0 else random.nextInt(laptopModelsSet.size) // 50% of the data is of the first kind
47 |     laptopModelsSet(makeModelIndex)
48 |   }
49 | 
50 |   def randomProcSpeed() = s"3.${random.nextInt(9)}".toDouble
51 | 
52 |   def randomRegistration(): String = s"${random.alphanumeric.take(7).mkString("")}"
53 | 
54 |   def randomPrice() = 500 + random.nextInt(1500)
55 | 
56 |   def randomLaptop(uniformDist: Boolean = false): Laptop = {
57 |     val makeModel = randomLaptopModel(uniformDist) // was randomLaptopModel(), which ignored uniformDist
58 |     Laptop(randomRegistration(), makeModel.make, makeModel.model, randomProcSpeed())
59 |   }
60 | 
61 |   def randomLaptopOffer(uniformDist: Boolean = false): LaptopOffer = {
62 |     val makeModel = randomLaptopModel(uniformDist) // was randomLaptopModel(), which ignored uniformDist
63 |     LaptopOffer(makeModel.make, makeModel.model, randomProcSpeed(), randomPrice())
64 |   }
65 | 
66 |   /////////////////////////////////////////////////////////////////////////////////
67 |   // Misc data generation
68 |   /////////////////////////////////////////////////////////////////////////////////
69 | 
70 |   /**
71 |    * For the iterator-to-iterator transformations lecture.
72 |    * Generates a number of metrics in the style of "metricName metricValue", where metricName is a string and metricValue is a double.
73 |    *
74 |    * @param destPath the path of the file the metrics will be written to.
75 |    * @param nMetrics the number of metrics to generate
76 |    * @param limit the maximum value any metric can take
77 |    */
78 |   def generateMetrics(destPath: String, nMetrics: Int, limit: Double = 1000000) = {
79 |     val writer = new PrintWriter(new FileWriter(new File(destPath)))
80 |     (1 to nMetrics).foreach(_ => writer.println(s"${randomString(16)} ${randomDouble(limit)}")) // was randomDouble(1000000), which ignored the limit parameter
81 |     writer.flush()
82 |     writer.close()
83 |   }
84 | 
85 |   /**
86 |    * For the RDD joins & cogroup lectures. Generates 3 files:
87 |    * 1) with student IDs and names
88 |    * 2) with student IDs and emails
89 |    * 3) with student IDs and exam attempt grades
90 |    *
91 |    * @param rootFolderPath the path where the 3 files will be written
92 |    * @param nStudents the number of students
93 |    * @param nAttempts the number of exam attempts per student
94 |    */
95 |   def generateExamData(rootFolderPath: String, nStudents: Int, nAttempts: Int): Unit = {
96 |     val studentNames = (1 to nStudents).map(_ => randomString(16)) // exactly nStudents names (was `0 to nStudents`)
97 |     val studentIds = studentNames.map(_ => randomLong())
98 |     val idWriter = new PrintWriter(new FileWriter(new File(s"$rootFolderPath/examIds.txt")))
99 |     val emailWriter = new PrintWriter(new FileWriter(new File(s"$rootFolderPath/examEmails.txt")))
100 |     val scoreWriter = new PrintWriter(new FileWriter(new File(s"$rootFolderPath/examScores.txt")))
101 | 
102 |     studentNames
103 |       .zip(studentIds)
104 |       .foreach {
105 |         case (name, id) =>
106 |           idWriter.println(s"$id $name")
107 |           emailWriter.println(s"$id $name@rockthejvm.com")
108 |       }
109 | 
110 |     val scores = studentIds
111 |       .flatMap(id => Seq.fill(nAttempts)(id)) // was Seq.fill(5), which ignored nAttempts
112 |       .map(id => (id, randomInt(10), randomInt(10)))
113 |       .toSet
114 | 
115 |     scores.foreach {
116 |       case (id, scoreMaj, scoreMin) => scoreWriter.println(s"$id $scoreMaj.$scoreMin")
117 |     }
118 | 
119 |     idWriter.flush()
120 |     idWriter.close()
121 |     emailWriter.flush()
122 |     emailWriter.close()
123 |     scoreWriter.flush()
124 |     scoreWriter.close()
125 |   }
126 | 
127 |   /**
128 |    * For the Secondary Sort lesson.
129 |    * Generates random person encounters as key-value pairs in a CSV file.
130 |    * The key is the person identifier and the value is the distance to the closest person, as measured by a hypothetical "approach device".
131 |    *
132 |    * @param path the file path to write the data to
133 |    * @param nPeople the number of people involved in the data
134 |    * @param nValuesPerPerson the number of encounters per person
135 |    * @param skew the fraction (between 0 and 1) of the data that belongs to a single person
136 |    */
137 |   def generatePeopleEncounters(path: String, nPeople: Int, nValuesPerPerson: Int, skew: Double = 0): Unit = {
138 |     val writer = new PrintWriter(new FileWriter(new File(path)))
139 |     val nEntries = nPeople * nValuesPerPerson
140 | 
141 |     writer.println("personId,approachValue")
142 |     (1 to nEntries).foreach { _ =>
143 |       val personIndex = if (random.nextDouble() < skew) 0 else 1 + random.nextInt(nPeople)
144 |       val approachValue = 10000 * random.nextDouble()
145 | 
146 |       writer.println(s"person_$personIndex,$approachValue")
147 |     }
148 | 
149 |     writer.flush()
150 |     writer.close()
151 |   }
152 | 
153 |   /**
154 |    * A function which generates random text in lorem-ipsum fashion, in chunks, as normal "paragraphs".
155 |    * Supports an optional senders argument to attach every paragraph to a sender/broker ID for key-value crunching.
156 |    * If the number of senders is positive, then each paragraph will have a prefix "senderID // ", where senderID is randomly picked between 1 and nSenders.
157 | * 158 | * @param dstPath the file path where you want to write the text 159 | * @param nWords the number of words 160 | * @param nParagraphs the number of lines 161 | * @param nSenders (default 0) the number of unique senders 162 | */ 163 | def generateText(dstPath: String, nWords: Int, nParagraphs: Int, nSenders: Int = 0): Unit = { 164 | assert(nSenders >= 0) 165 | assert(nWords > 1) 166 | assert(nParagraphs > 0) 167 | 168 | val words = Source.fromFile("src/main/resources/data/lipsum/words.txt").getLines().toSeq 169 | val numWords = words.length 170 | 171 | def pickRandomWord(isLast: Boolean = false) = 172 | words(random.nextInt(numWords)) + (if (!isLast && random.nextInt() % 5 == 0) "," else "") 173 | 174 | val lowSentenceLimit = 2 175 | val highSentenceLimit = 14 176 | val avgParLength = nWords / nParagraphs 177 | val lowParLimit = avgParLength / 2 178 | val highParLimit = avgParLength * 3 / 2 179 | val writer = new PrintWriter(new FileWriter(new File(dstPath))) 180 | 181 | @tailrec 182 | def generateLipsumRec(nWords: Int, nParagraphs: Int, nWordsInParagraph: Int, attachSender: Boolean = false): Unit = { 183 | val sentenceLength = 184 | if (nWordsInParagraph < highSentenceLimit) nWordsInParagraph 185 | else randomIntBetween(lowSentenceLimit, highSentenceLimit) 186 | 187 | val ending = if (sentenceLength == nWordsInParagraph) "." else ". " 188 | val sentence = ((1 until sentenceLength).map(_ => pickRandomWord()) :+ pickRandomWord(true)).mkString(" ") + ending 189 | 190 | if (attachSender) { 191 | val sender = (randomInt(nSenders) + 1) + " // " 192 | writer.print(sender) 193 | } 194 | writer.print(sentence.capitalize) 195 | 196 | val nWordsInParagraphLeft = nWordsInParagraph - sentenceLength 197 | val nParagraphsLeft = nParagraphs - 1 198 | 199 | if (nWordsInParagraphLeft == 0) { 200 | if (nParagraphsLeft > 0) { 201 | val nWordsLeft = nWords - sentenceLength 202 | val nextParLength = 203 | if (nParagraphsLeft == 1) nWordsLeft 204 | else randomIntBetween(lowParLimit, highParLimit) 205 | 206 | writer.print("\n") 207 | generateLipsumRec(nWords - sentenceLength, nParagraphsLeft, nextParLength, nSenders > 0) 208 | } 209 | } else { 210 | generateLipsumRec(nWords - sentenceLength, nParagraphs, nWordsInParagraphLeft) 211 | } 212 | } 213 | 214 | val nWordsInFirstParagraph = if (nWords < highParLimit) nWords else randomIntBetween(lowParLimit, highParLimit) 215 | 216 | generateLipsumRec(nWords, nParagraphs, nWordsInFirstParagraph, nSenders > 0) 217 | writer.flush() 218 | writer.close() 219 | } 220 | 221 | def main(args: Array[String]): Unit = { 222 | generateExamData("src/main/resources/data/studentgen", 100000, 5) 223 | } 224 | } -------------------------------------------------------------------------------- /src/main/resources/data/cars/cars.json: -------------------------------------------------------------------------------- 1 | {"Name":"chevrolet chevelle malibu", "Miles_per_Gallon":18, "Cylinders":8, "Displacement":307, "Horsepower":130, "Weight_in_lbs":3504, "Acceleration":12, "Year":"1970-01-01", "Origin":"USA"} 2 | {"Name":"buick skylark 320", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":350, "Horsepower":165, "Weight_in_lbs":3693, "Acceleration":11.5, "Year":"1970-01-01", "Origin":"USA"} 3 | {"Name":"plymouth satellite", "Miles_per_Gallon":18, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":3436, "Acceleration":11, "Year":"1970-01-01", "Origin":"USA"} 4 | {"Name":"amc rebel sst", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":304, "Horsepower":150, 
"Weight_in_lbs":3433, "Acceleration":12, "Year":"1970-01-01", "Origin":"USA"} 5 | {"Name":"ford torino", "Miles_per_Gallon":17, "Cylinders":8, "Displacement":302, "Horsepower":140, "Weight_in_lbs":3449, "Acceleration":10.5, "Year":"1970-01-01", "Origin":"USA"} 6 | {"Name":"ford galaxie 500", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":429, "Horsepower":198, "Weight_in_lbs":4341, "Acceleration":10, "Year":"1970-01-01", "Origin":"USA"} 7 | {"Name":"chevrolet impala", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":454, "Horsepower":220, "Weight_in_lbs":4354, "Acceleration":9, "Year":"1970-01-01", "Origin":"USA"} 8 | {"Name":"plymouth fury iii", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":440, "Horsepower":215, "Weight_in_lbs":4312, "Acceleration":8.5, "Year":"1970-01-01", "Origin":"USA"} 9 | {"Name":"pontiac catalina", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":455, "Horsepower":225, "Weight_in_lbs":4425, "Acceleration":10, "Year":"1970-01-01", "Origin":"USA"} 10 | {"Name":"amc ambassador dpl", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":390, "Horsepower":190, "Weight_in_lbs":3850, "Acceleration":8.5, "Year":"1970-01-01", "Origin":"USA"} 11 | {"Name":"citroen ds-21 pallas", "Miles_per_Gallon":null, "Cylinders":4, "Displacement":133, "Horsepower":115, "Weight_in_lbs":3090, "Acceleration":17.5, "Year":"1970-01-01", "Origin":"Europe"} 12 | {"Name":"chevrolet chevelle concours (sw)", "Miles_per_Gallon":null, "Cylinders":8, "Displacement":350, "Horsepower":165, "Weight_in_lbs":4142, "Acceleration":11.5, "Year":"1970-01-01", "Origin":"USA"} 13 | {"Name":"ford torino (sw)", "Miles_per_Gallon":null, "Cylinders":8, "Displacement":351, "Horsepower":153, "Weight_in_lbs":4034, "Acceleration":11, "Year":"1970-01-01", "Origin":"USA"} 14 | {"Name":"plymouth satellite (sw)", "Miles_per_Gallon":null, "Cylinders":8, "Displacement":383, "Horsepower":175, "Weight_in_lbs":4166, "Acceleration":10.5, "Year":"1970-01-01", "Origin":"USA"} 15 | {"Name":"amc rebel sst (sw)", "Miles_per_Gallon":null, "Cylinders":8, "Displacement":360, "Horsepower":175, "Weight_in_lbs":3850, "Acceleration":11, "Year":"1970-01-01", "Origin":"USA"} 16 | {"Name":"dodge challenger se", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":383, "Horsepower":170, "Weight_in_lbs":3563, "Acceleration":10, "Year":"1970-01-01", "Origin":"USA"} 17 | {"Name":"plymouth 'cuda 340", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":340, "Horsepower":160, "Weight_in_lbs":3609, "Acceleration":8, "Year":"1970-01-01", "Origin":"USA"} 18 | {"Name":"ford mustang boss 302", "Miles_per_Gallon":null, "Cylinders":8, "Displacement":302, "Horsepower":140, "Weight_in_lbs":3353, "Acceleration":8, "Year":"1970-01-01", "Origin":"USA"} 19 | {"Name":"chevrolet monte carlo", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":400, "Horsepower":150, "Weight_in_lbs":3761, "Acceleration":9.5, "Year":"1970-01-01", "Origin":"USA"} 20 | {"Name":"buick estate wagon (sw)", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":455, "Horsepower":225, "Weight_in_lbs":3086, "Acceleration":10, "Year":"1970-01-01", "Origin":"USA"} 21 | {"Name":"toyota corona mark ii", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":113, "Horsepower":95, "Weight_in_lbs":2372, "Acceleration":15, "Year":"1970-01-01", "Origin":"Japan"} 22 | {"Name":"plymouth duster", "Miles_per_Gallon":22, "Cylinders":6, "Displacement":198, "Horsepower":95, "Weight_in_lbs":2833, "Acceleration":15.5, "Year":"1970-01-01", "Origin":"USA"} 23 | {"Name":"amc hornet", 
"Miles_per_Gallon":18, "Cylinders":6, "Displacement":199, "Horsepower":97, "Weight_in_lbs":2774, "Acceleration":15.5, "Year":"1970-01-01", "Origin":"USA"} 24 | {"Name":"ford maverick", "Miles_per_Gallon":21, "Cylinders":6, "Displacement":200, "Horsepower":85, "Weight_in_lbs":2587, "Acceleration":16, "Year":"1970-01-01", "Origin":"USA"} 25 | {"Name":"datsun pl510", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":97, "Horsepower":88, "Weight_in_lbs":2130, "Acceleration":14.5, "Year":"1970-01-01", "Origin":"Japan"} 26 | {"Name":"volkswagen 1131 deluxe sedan", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":97, "Horsepower":46, "Weight_in_lbs":1835, "Acceleration":20.5, "Year":"1970-01-01", "Origin":"Europe"} 27 | {"Name":"peugeot 504", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":110, "Horsepower":87, "Weight_in_lbs":2672, "Acceleration":17.5, "Year":"1970-01-01", "Origin":"Europe"} 28 | {"Name":"audi 100 ls", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":107, "Horsepower":90, "Weight_in_lbs":2430, "Acceleration":14.5, "Year":"1970-01-01", "Origin":"Europe"} 29 | {"Name":"saab 99e", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":104, "Horsepower":95, "Weight_in_lbs":2375, "Acceleration":17.5, "Year":"1970-01-01", "Origin":"Europe"} 30 | {"Name":"bmw 2002", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":121, "Horsepower":113, "Weight_in_lbs":2234, "Acceleration":12.5, "Year":"1970-01-01", "Origin":"Europe"} 31 | {"Name":"amc gremlin", "Miles_per_Gallon":21, "Cylinders":6, "Displacement":199, "Horsepower":90, "Weight_in_lbs":2648, "Acceleration":15, "Year":"1970-01-01", "Origin":"USA"} 32 | {"Name":"ford f250", "Miles_per_Gallon":10, "Cylinders":8, "Displacement":360, "Horsepower":215, "Weight_in_lbs":4615, "Acceleration":14, "Year":"1970-01-01", "Origin":"USA"} 33 | {"Name":"chevy c20", "Miles_per_Gallon":10, "Cylinders":8, "Displacement":307, "Horsepower":200, "Weight_in_lbs":4376, "Acceleration":15, "Year":"1970-01-01", "Origin":"USA"} 34 | {"Name":"dodge d200", "Miles_per_Gallon":11, "Cylinders":8, "Displacement":318, "Horsepower":210, "Weight_in_lbs":4382, "Acceleration":13.5, "Year":"1970-01-01", "Origin":"USA"} 35 | {"Name":"hi 1200d", "Miles_per_Gallon":9, "Cylinders":8, "Displacement":304, "Horsepower":193, "Weight_in_lbs":4732, "Acceleration":18.5, "Year":"1970-01-01", "Origin":"USA"} 36 | {"Name":"datsun pl510", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":97, "Horsepower":88, "Weight_in_lbs":2130, "Acceleration":14.5, "Year":"1971-01-01", "Origin":"Japan"} 37 | {"Name":"chevrolet vega 2300", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":140, "Horsepower":90, "Weight_in_lbs":2264, "Acceleration":15.5, "Year":"1971-01-01", "Origin":"USA"} 38 | {"Name":"toyota corona", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":113, "Horsepower":95, "Weight_in_lbs":2228, "Acceleration":14, "Year":"1971-01-01", "Origin":"Japan"} 39 | {"Name":"ford pinto", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":98, "Horsepower":null, "Weight_in_lbs":2046, "Acceleration":19, "Year":"1971-01-01", "Origin":"USA"} 40 | {"Name":"volkswagen super beetle 117", "Miles_per_Gallon":null, "Cylinders":4, "Displacement":97, "Horsepower":48, "Weight_in_lbs":1978, "Acceleration":20, "Year":"1971-01-01", "Origin":"Europe"} 41 | {"Name":"amc gremlin", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":232, "Horsepower":100, "Weight_in_lbs":2634, "Acceleration":13, "Year":"1971-01-01", "Origin":"USA"} 42 | {"Name":"plymouth satellite custom", 
"Miles_per_Gallon":16, "Cylinders":6, "Displacement":225, "Horsepower":105, "Weight_in_lbs":3439, "Acceleration":15.5, "Year":"1971-01-01", "Origin":"USA"} 43 | {"Name":"chevrolet chevelle malibu", "Miles_per_Gallon":17, "Cylinders":6, "Displacement":250, "Horsepower":100, "Weight_in_lbs":3329, "Acceleration":15.5, "Year":"1971-01-01", "Origin":"USA"} 44 | {"Name":"ford torino 500", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":250, "Horsepower":88, "Weight_in_lbs":3302, "Acceleration":15.5, "Year":"1971-01-01", "Origin":"USA"} 45 | {"Name":"amc matador", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":232, "Horsepower":100, "Weight_in_lbs":3288, "Acceleration":15.5, "Year":"1971-01-01", "Origin":"USA"} 46 | {"Name":"chevrolet impala", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":350, "Horsepower":165, "Weight_in_lbs":4209, "Acceleration":12, "Year":"1971-01-01", "Origin":"USA"} 47 | {"Name":"pontiac catalina brougham", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":400, "Horsepower":175, "Weight_in_lbs":4464, "Acceleration":11.5, "Year":"1971-01-01", "Origin":"USA"} 48 | {"Name":"ford galaxie 500", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":351, "Horsepower":153, "Weight_in_lbs":4154, "Acceleration":13.5, "Year":"1971-01-01", "Origin":"USA"} 49 | {"Name":"plymouth fury iii", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":4096, "Acceleration":13, "Year":"1971-01-01", "Origin":"USA"} 50 | {"Name":"dodge monaco (sw)", "Miles_per_Gallon":12, "Cylinders":8, "Displacement":383, "Horsepower":180, "Weight_in_lbs":4955, "Acceleration":11.5, "Year":"1971-01-01", "Origin":"USA"} 51 | {"Name":"ford country squire (sw)", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":400, "Horsepower":170, "Weight_in_lbs":4746, "Acceleration":12, "Year":"1971-01-01", "Origin":"USA"} 52 | {"Name":"pontiac safari (sw)", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":400, "Horsepower":175, "Weight_in_lbs":5140, "Acceleration":12, "Year":"1971-01-01", "Origin":"USA"} 53 | {"Name":"amc hornet sportabout (sw)", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":258, "Horsepower":110, "Weight_in_lbs":2962, "Acceleration":13.5, "Year":"1971-01-01", "Origin":"USA"} 54 | {"Name":"chevrolet vega (sw)", "Miles_per_Gallon":22, "Cylinders":4, "Displacement":140, "Horsepower":72, "Weight_in_lbs":2408, "Acceleration":19, "Year":"1971-01-01", "Origin":"USA"} 55 | {"Name":"pontiac firebird", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":250, "Horsepower":100, "Weight_in_lbs":3282, "Acceleration":15, "Year":"1971-01-01", "Origin":"USA"} 56 | {"Name":"ford mustang", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":250, "Horsepower":88, "Weight_in_lbs":3139, "Acceleration":14.5, "Year":"1971-01-01", "Origin":"USA"} 57 | {"Name":"mercury capri 2000", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":122, "Horsepower":86, "Weight_in_lbs":2220, "Acceleration":14, "Year":"1971-01-01", "Origin":"USA"} 58 | {"Name":"opel 1900", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":116, "Horsepower":90, "Weight_in_lbs":2123, "Acceleration":14, "Year":"1971-01-01", "Origin":"Europe"} 59 | {"Name":"peugeot 304", "Miles_per_Gallon":30, "Cylinders":4, "Displacement":79, "Horsepower":70, "Weight_in_lbs":2074, "Acceleration":19.5, "Year":"1971-01-01", "Origin":"Europe"} 60 | {"Name":"fiat 124b", "Miles_per_Gallon":30, "Cylinders":4, "Displacement":88, "Horsepower":76, "Weight_in_lbs":2065, "Acceleration":14.5, "Year":"1971-01-01", 
"Origin":"Europe"} 61 | {"Name":"toyota corolla 1200", "Miles_per_Gallon":31, "Cylinders":4, "Displacement":71, "Horsepower":65, "Weight_in_lbs":1773, "Acceleration":19, "Year":"1971-01-01", "Origin":"Japan"} 62 | {"Name":"datsun 1200", "Miles_per_Gallon":35, "Cylinders":4, "Displacement":72, "Horsepower":69, "Weight_in_lbs":1613, "Acceleration":18, "Year":"1971-01-01", "Origin":"Japan"} 63 | {"Name":"volkswagen model 111", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":97, "Horsepower":60, "Weight_in_lbs":1834, "Acceleration":19, "Year":"1971-01-01", "Origin":"Europe"} 64 | {"Name":"plymouth cricket", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":91, "Horsepower":70, "Weight_in_lbs":1955, "Acceleration":20.5, "Year":"1971-01-01", "Origin":"USA"} 65 | {"Name":"toyota corona hardtop", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":113, "Horsepower":95, "Weight_in_lbs":2278, "Acceleration":15.5, "Year":"1972-01-01", "Origin":"Japan"} 66 | {"Name":"dodge colt hardtop", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":97.5, "Horsepower":80, "Weight_in_lbs":2126, "Acceleration":17, "Year":"1972-01-01", "Origin":"USA"} 67 | {"Name":"volkswagen type 3", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":97, "Horsepower":54, "Weight_in_lbs":2254, "Acceleration":23.5, "Year":"1972-01-01", "Origin":"Europe"} 68 | {"Name":"chevrolet vega", "Miles_per_Gallon":20, "Cylinders":4, "Displacement":140, "Horsepower":90, "Weight_in_lbs":2408, "Acceleration":19.5, "Year":"1972-01-01", "Origin":"USA"} 69 | {"Name":"ford pinto runabout", "Miles_per_Gallon":21, "Cylinders":4, "Displacement":122, "Horsepower":86, "Weight_in_lbs":2226, "Acceleration":16.5, "Year":"1972-01-01", "Origin":"USA"} 70 | {"Name":"chevrolet impala", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":350, "Horsepower":165, "Weight_in_lbs":4274, "Acceleration":12, "Year":"1972-01-01", "Origin":"USA"} 71 | {"Name":"pontiac catalina", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":400, "Horsepower":175, "Weight_in_lbs":4385, "Acceleration":12, "Year":"1972-01-01", "Origin":"USA"} 72 | {"Name":"plymouth fury iii", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":4135, "Acceleration":13.5, "Year":"1972-01-01", "Origin":"USA"} 73 | {"Name":"ford galaxie 500", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":351, "Horsepower":153, "Weight_in_lbs":4129, "Acceleration":13, "Year":"1972-01-01", "Origin":"USA"} 74 | {"Name":"amc ambassador sst", "Miles_per_Gallon":17, "Cylinders":8, "Displacement":304, "Horsepower":150, "Weight_in_lbs":3672, "Acceleration":11.5, "Year":"1972-01-01", "Origin":"USA"} 75 | {"Name":"mercury marquis", "Miles_per_Gallon":11, "Cylinders":8, "Displacement":429, "Horsepower":208, "Weight_in_lbs":4633, "Acceleration":11, "Year":"1972-01-01", "Origin":"USA"} 76 | {"Name":"buick lesabre custom", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":350, "Horsepower":155, "Weight_in_lbs":4502, "Acceleration":13.5, "Year":"1972-01-01", "Origin":"USA"} 77 | {"Name":"oldsmobile delta 88 royale", "Miles_per_Gallon":12, "Cylinders":8, "Displacement":350, "Horsepower":160, "Weight_in_lbs":4456, "Acceleration":13.5, "Year":"1972-01-01", "Origin":"USA"} 78 | {"Name":"chrysler newport royal", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":400, "Horsepower":190, "Weight_in_lbs":4422, "Acceleration":12.5, "Year":"1972-01-01", "Origin":"USA"} 79 | {"Name":"mazda rx2 coupe", "Miles_per_Gallon":19, "Cylinders":3, "Displacement":70, "Horsepower":97, 
"Weight_in_lbs":2330, "Acceleration":13.5, "Year":"1972-01-01", "Origin":"Japan"} 80 | {"Name":"amc matador (sw)", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":304, "Horsepower":150, "Weight_in_lbs":3892, "Acceleration":12.5, "Year":"1972-01-01", "Origin":"USA"} 81 | {"Name":"chevrolet chevelle concours (sw)", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":307, "Horsepower":130, "Weight_in_lbs":4098, "Acceleration":14, "Year":"1972-01-01", "Origin":"USA"} 82 | {"Name":"ford gran torino (sw)", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":302, "Horsepower":140, "Weight_in_lbs":4294, "Acceleration":16, "Year":"1972-01-01", "Origin":"USA"} 83 | {"Name":"plymouth satellite custom (sw)", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":4077, "Acceleration":14, "Year":"1972-01-01", "Origin":"USA"} 84 | {"Name":"volvo 145e (sw)", "Miles_per_Gallon":18, "Cylinders":4, "Displacement":121, "Horsepower":112, "Weight_in_lbs":2933, "Acceleration":14.5, "Year":"1972-01-01", "Origin":"Europe"} 85 | {"Name":"volkswagen 411 (sw)", "Miles_per_Gallon":22, "Cylinders":4, "Displacement":121, "Horsepower":76, "Weight_in_lbs":2511, "Acceleration":18, "Year":"1972-01-01", "Origin":"Europe"} 86 | {"Name":"peugeot 504 (sw)", "Miles_per_Gallon":21, "Cylinders":4, "Displacement":120, "Horsepower":87, "Weight_in_lbs":2979, "Acceleration":19.5, "Year":"1972-01-01", "Origin":"Europe"} 87 | {"Name":"renault 12 (sw)", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":96, "Horsepower":69, "Weight_in_lbs":2189, "Acceleration":18, "Year":"1972-01-01", "Origin":"Europe"} 88 | {"Name":"ford pinto (sw)", "Miles_per_Gallon":22, "Cylinders":4, "Displacement":122, "Horsepower":86, "Weight_in_lbs":2395, "Acceleration":16, "Year":"1972-01-01", "Origin":"USA"} 89 | {"Name":"datsun 510 (sw)", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":97, "Horsepower":92, "Weight_in_lbs":2288, "Acceleration":17, "Year":"1972-01-01", "Origin":"Japan"} 90 | {"Name":"toyouta corona mark ii (sw)", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":120, "Horsepower":97, "Weight_in_lbs":2506, "Acceleration":14.5, "Year":"1972-01-01", "Origin":"Japan"} 91 | {"Name":"dodge colt (sw)", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":98, "Horsepower":80, "Weight_in_lbs":2164, "Acceleration":15, "Year":"1972-01-01", "Origin":"USA"} 92 | {"Name":"toyota corolla 1600 (sw)", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":97, "Horsepower":88, "Weight_in_lbs":2100, "Acceleration":16.5, "Year":"1972-01-01", "Origin":"Japan"} 93 | {"Name":"buick century 350", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":350, "Horsepower":175, "Weight_in_lbs":4100, "Acceleration":13, "Year":"1973-01-01", "Origin":"USA"} 94 | {"Name":"amc matador", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":304, "Horsepower":150, "Weight_in_lbs":3672, "Acceleration":11.5, "Year":"1973-01-01", "Origin":"USA"} 95 | {"Name":"chevrolet malibu", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":350, "Horsepower":145, "Weight_in_lbs":3988, "Acceleration":13, "Year":"1973-01-01", "Origin":"USA"} 96 | {"Name":"ford gran torino", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":302, "Horsepower":137, "Weight_in_lbs":4042, "Acceleration":14.5, "Year":"1973-01-01", "Origin":"USA"} 97 | {"Name":"dodge coronet custom", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":3777, "Acceleration":12.5, "Year":"1973-01-01", "Origin":"USA"} 98 | {"Name":"mercury marquis 
brougham", "Miles_per_Gallon":12, "Cylinders":8, "Displacement":429, "Horsepower":198, "Weight_in_lbs":4952, "Acceleration":11.5, "Year":"1973-01-01", "Origin":"USA"} 99 | {"Name":"chevrolet caprice classic", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":400, "Horsepower":150, "Weight_in_lbs":4464, "Acceleration":12, "Year":"1973-01-01", "Origin":"USA"} 100 | {"Name":"ford ltd", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":351, "Horsepower":158, "Weight_in_lbs":4363, "Acceleration":13, "Year":"1973-01-01", "Origin":"USA"} 101 | {"Name":"plymouth fury gran sedan", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":4237, "Acceleration":14.5, "Year":"1973-01-01", "Origin":"USA"} 102 | {"Name":"chrysler new yorker brougham", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":440, "Horsepower":215, "Weight_in_lbs":4735, "Acceleration":11, "Year":"1973-01-01", "Origin":"USA"} 103 | {"Name":"buick electra 225 custom", "Miles_per_Gallon":12, "Cylinders":8, "Displacement":455, "Horsepower":225, "Weight_in_lbs":4951, "Acceleration":11, "Year":"1973-01-01", "Origin":"USA"} 104 | {"Name":"amc ambassador brougham", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":360, "Horsepower":175, "Weight_in_lbs":3821, "Acceleration":11, "Year":"1973-01-01", "Origin":"USA"} 105 | {"Name":"plymouth valiant", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":225, "Horsepower":105, "Weight_in_lbs":3121, "Acceleration":16.5, "Year":"1973-01-01", "Origin":"USA"} 106 | {"Name":"chevrolet nova custom", "Miles_per_Gallon":16, "Cylinders":6, "Displacement":250, "Horsepower":100, "Weight_in_lbs":3278, "Acceleration":18, "Year":"1973-01-01", "Origin":"USA"} 107 | {"Name":"amc hornet", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":232, "Horsepower":100, "Weight_in_lbs":2945, "Acceleration":16, "Year":"1973-01-01", "Origin":"USA"} 108 | {"Name":"ford maverick", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":250, "Horsepower":88, "Weight_in_lbs":3021, "Acceleration":16.5, "Year":"1973-01-01", "Origin":"USA"} 109 | {"Name":"plymouth duster", "Miles_per_Gallon":23, "Cylinders":6, "Displacement":198, "Horsepower":95, "Weight_in_lbs":2904, "Acceleration":16, "Year":"1973-01-01", "Origin":"USA"} 110 | {"Name":"volkswagen super beetle", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":97, "Horsepower":46, "Weight_in_lbs":1950, "Acceleration":21, "Year":"1973-01-01", "Origin":"Europe"} 111 | {"Name":"chevrolet impala", "Miles_per_Gallon":11, "Cylinders":8, "Displacement":400, "Horsepower":150, "Weight_in_lbs":4997, "Acceleration":14, "Year":"1973-01-01", "Origin":"USA"} 112 | {"Name":"ford country", "Miles_per_Gallon":12, "Cylinders":8, "Displacement":400, "Horsepower":167, "Weight_in_lbs":4906, "Acceleration":12.5, "Year":"1973-01-01", "Origin":"USA"} 113 | {"Name":"plymouth custom suburb", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":360, "Horsepower":170, "Weight_in_lbs":4654, "Acceleration":13, "Year":"1973-01-01", "Origin":"USA"} 114 | {"Name":"oldsmobile vista cruiser", "Miles_per_Gallon":12, "Cylinders":8, "Displacement":350, "Horsepower":180, "Weight_in_lbs":4499, "Acceleration":12.5, "Year":"1973-01-01", "Origin":"USA"} 115 | {"Name":"amc gremlin", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":232, "Horsepower":100, "Weight_in_lbs":2789, "Acceleration":15, "Year":"1973-01-01", "Origin":"USA"} 116 | {"Name":"toyota carina", "Miles_per_Gallon":20, "Cylinders":4, "Displacement":97, "Horsepower":88, "Weight_in_lbs":2279, 
"Acceleration":19, "Year":"1973-01-01", "Origin":"Japan"} 117 | {"Name":"chevrolet vega", "Miles_per_Gallon":21, "Cylinders":4, "Displacement":140, "Horsepower":72, "Weight_in_lbs":2401, "Acceleration":19.5, "Year":"1973-01-01", "Origin":"USA"} 118 | {"Name":"datsun 610", "Miles_per_Gallon":22, "Cylinders":4, "Displacement":108, "Horsepower":94, "Weight_in_lbs":2379, "Acceleration":16.5, "Year":"1973-01-01", "Origin":"Japan"} 119 | {"Name":"maxda rx3", "Miles_per_Gallon":18, "Cylinders":3, "Displacement":70, "Horsepower":90, "Weight_in_lbs":2124, "Acceleration":13.5, "Year":"1973-01-01", "Origin":"Japan"} 120 | {"Name":"ford pinto", "Miles_per_Gallon":19, "Cylinders":4, "Displacement":122, "Horsepower":85, "Weight_in_lbs":2310, "Acceleration":18.5, "Year":"1973-01-01", "Origin":"USA"} 121 | {"Name":"mercury capri v6", "Miles_per_Gallon":21, "Cylinders":6, "Displacement":155, "Horsepower":107, "Weight_in_lbs":2472, "Acceleration":14, "Year":"1973-01-01", "Origin":"USA"} 122 | {"Name":"fiat 124 sport coupe", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":98, "Horsepower":90, "Weight_in_lbs":2265, "Acceleration":15.5, "Year":"1973-01-01", "Origin":"Europe"} 123 | {"Name":"chevrolet monte carlo s", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":350, "Horsepower":145, "Weight_in_lbs":4082, "Acceleration":13, "Year":"1973-01-01", "Origin":"USA"} 124 | {"Name":"pontiac grand prix", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":400, "Horsepower":230, "Weight_in_lbs":4278, "Acceleration":9.5, "Year":"1973-01-01", "Origin":"USA"} 125 | {"Name":"fiat 128", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":68, "Horsepower":49, "Weight_in_lbs":1867, "Acceleration":19.5, "Year":"1973-01-01", "Origin":"Europe"} 126 | {"Name":"opel manta", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":116, "Horsepower":75, "Weight_in_lbs":2158, "Acceleration":15.5, "Year":"1973-01-01", "Origin":"Europe"} 127 | {"Name":"audi 100ls", "Miles_per_Gallon":20, "Cylinders":4, "Displacement":114, "Horsepower":91, "Weight_in_lbs":2582, "Acceleration":14, "Year":"1973-01-01", "Origin":"Europe"} 128 | {"Name":"volvo 144ea", "Miles_per_Gallon":19, "Cylinders":4, "Displacement":121, "Horsepower":112, "Weight_in_lbs":2868, "Acceleration":15.5, "Year":"1973-01-01", "Origin":"Europe"} 129 | {"Name":"dodge dart custom", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":3399, "Acceleration":11, "Year":"1973-01-01", "Origin":"USA"} 130 | {"Name":"saab 99le", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":121, "Horsepower":110, "Weight_in_lbs":2660, "Acceleration":14, "Year":"1973-01-01", "Origin":"Europe"} 131 | {"Name":"toyota mark ii", "Miles_per_Gallon":20, "Cylinders":6, "Displacement":156, "Horsepower":122, "Weight_in_lbs":2807, "Acceleration":13.5, "Year":"1973-01-01", "Origin":"Japan"} 132 | {"Name":"oldsmobile omega", "Miles_per_Gallon":11, "Cylinders":8, "Displacement":350, "Horsepower":180, "Weight_in_lbs":3664, "Acceleration":11, "Year":"1973-01-01", "Origin":"USA"} 133 | {"Name":"plymouth duster", "Miles_per_Gallon":20, "Cylinders":6, "Displacement":198, "Horsepower":95, "Weight_in_lbs":3102, "Acceleration":16.5, "Year":"1974-01-01", "Origin":"USA"} 134 | {"Name":"ford maverick", "Miles_per_Gallon":21, "Cylinders":6, "Displacement":200, "Horsepower":null, "Weight_in_lbs":2875, "Acceleration":17, "Year":"1974-01-01", "Origin":"USA"} 135 | {"Name":"amc hornet", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":232, "Horsepower":100, 
"Weight_in_lbs":2901, "Acceleration":16, "Year":"1974-01-01", "Origin":"USA"} 136 | {"Name":"chevrolet nova", "Miles_per_Gallon":15, "Cylinders":6, "Displacement":250, "Horsepower":100, "Weight_in_lbs":3336, "Acceleration":17, "Year":"1974-01-01", "Origin":"USA"} 137 | {"Name":"datsun b210", "Miles_per_Gallon":31, "Cylinders":4, "Displacement":79, "Horsepower":67, "Weight_in_lbs":1950, "Acceleration":19, "Year":"1974-01-01", "Origin":"Japan"} 138 | {"Name":"ford pinto", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":122, "Horsepower":80, "Weight_in_lbs":2451, "Acceleration":16.5, "Year":"1974-01-01", "Origin":"USA"} 139 | {"Name":"toyota corolla 1200", "Miles_per_Gallon":32, "Cylinders":4, "Displacement":71, "Horsepower":65, "Weight_in_lbs":1836, "Acceleration":21, "Year":"1974-01-01", "Origin":"Japan"} 140 | {"Name":"chevrolet vega", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":140, "Horsepower":75, "Weight_in_lbs":2542, "Acceleration":17, "Year":"1974-01-01", "Origin":"USA"} 141 | {"Name":"chevrolet chevelle malibu classic", "Miles_per_Gallon":16, "Cylinders":6, "Displacement":250, "Horsepower":100, "Weight_in_lbs":3781, "Acceleration":17, "Year":"1974-01-01", "Origin":"USA"} 142 | {"Name":"amc matador", "Miles_per_Gallon":16, "Cylinders":6, "Displacement":258, "Horsepower":110, "Weight_in_lbs":3632, "Acceleration":18, "Year":"1974-01-01", "Origin":"USA"} 143 | {"Name":"plymouth satellite sebring", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":225, "Horsepower":105, "Weight_in_lbs":3613, "Acceleration":16.5, "Year":"1974-01-01", "Origin":"USA"} 144 | {"Name":"ford gran torino", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":302, "Horsepower":140, "Weight_in_lbs":4141, "Acceleration":14, "Year":"1974-01-01", "Origin":"USA"} 145 | {"Name":"buick century luxus (sw)", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":350, "Horsepower":150, "Weight_in_lbs":4699, "Acceleration":14.5, "Year":"1974-01-01", "Origin":"USA"} 146 | {"Name":"dodge coronet custom (sw)", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":4457, "Acceleration":13.5, "Year":"1974-01-01", "Origin":"USA"} 147 | {"Name":"ford gran torino (sw)", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":302, "Horsepower":140, "Weight_in_lbs":4638, "Acceleration":16, "Year":"1974-01-01", "Origin":"USA"} 148 | {"Name":"amc matador (sw)", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":304, "Horsepower":150, "Weight_in_lbs":4257, "Acceleration":15.5, "Year":"1974-01-01", "Origin":"USA"} 149 | {"Name":"audi fox", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":98, "Horsepower":83, "Weight_in_lbs":2219, "Acceleration":16.5, "Year":"1974-01-01", "Origin":"Europe"} 150 | {"Name":"volkswagen dasher", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":79, "Horsepower":67, "Weight_in_lbs":1963, "Acceleration":15.5, "Year":"1974-01-01", "Origin":"Europe"} 151 | {"Name":"opel manta", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":97, "Horsepower":78, "Weight_in_lbs":2300, "Acceleration":14.5, "Year":"1974-01-01", "Origin":"Europe"} 152 | {"Name":"toyota corona", "Miles_per_Gallon":31, "Cylinders":4, "Displacement":76, "Horsepower":52, "Weight_in_lbs":1649, "Acceleration":16.5, "Year":"1974-01-01", "Origin":"Japan"} 153 | {"Name":"datsun 710", "Miles_per_Gallon":32, "Cylinders":4, "Displacement":83, "Horsepower":61, "Weight_in_lbs":2003, "Acceleration":19, "Year":"1974-01-01", "Origin":"Japan"} 154 | {"Name":"dodge colt", "Miles_per_Gallon":28, 
"Cylinders":4, "Displacement":90, "Horsepower":75, "Weight_in_lbs":2125, "Acceleration":14.5, "Year":"1974-01-01", "Origin":"USA"} 155 | {"Name":"fiat 128", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":90, "Horsepower":75, "Weight_in_lbs":2108, "Acceleration":15.5, "Year":"1974-01-01", "Origin":"Europe"} 156 | {"Name":"fiat 124 tc", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":116, "Horsepower":75, "Weight_in_lbs":2246, "Acceleration":14, "Year":"1974-01-01", "Origin":"Europe"} 157 | {"Name":"honda civic", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":120, "Horsepower":97, "Weight_in_lbs":2489, "Acceleration":15, "Year":"1974-01-01", "Origin":"Japan"} 158 | {"Name":"subaru", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":108, "Horsepower":93, "Weight_in_lbs":2391, "Acceleration":15.5, "Year":"1974-01-01", "Origin":"Japan"} 159 | {"Name":"fiat x1.9", "Miles_per_Gallon":31, "Cylinders":4, "Displacement":79, "Horsepower":67, "Weight_in_lbs":2000, "Acceleration":16, "Year":"1974-01-01", "Origin":"Europe"} 160 | {"Name":"plymouth valiant custom", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":225, "Horsepower":95, "Weight_in_lbs":3264, "Acceleration":16, "Year":"1975-01-01", "Origin":"USA"} 161 | {"Name":"chevrolet nova", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":250, "Horsepower":105, "Weight_in_lbs":3459, "Acceleration":16, "Year":"1975-01-01", "Origin":"USA"} 162 | {"Name":"mercury monarch", "Miles_per_Gallon":15, "Cylinders":6, "Displacement":250, "Horsepower":72, "Weight_in_lbs":3432, "Acceleration":21, "Year":"1975-01-01", "Origin":"USA"} 163 | {"Name":"ford maverick", "Miles_per_Gallon":15, "Cylinders":6, "Displacement":250, "Horsepower":72, "Weight_in_lbs":3158, "Acceleration":19.5, "Year":"1975-01-01", "Origin":"USA"} 164 | {"Name":"pontiac catalina", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":400, "Horsepower":170, "Weight_in_lbs":4668, "Acceleration":11.5, "Year":"1975-01-01", "Origin":"USA"} 165 | {"Name":"chevrolet bel air", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":350, "Horsepower":145, "Weight_in_lbs":4440, "Acceleration":14, "Year":"1975-01-01", "Origin":"USA"} 166 | {"Name":"plymouth grand fury", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":4498, "Acceleration":14.5, "Year":"1975-01-01", "Origin":"USA"} 167 | {"Name":"ford ltd", "Miles_per_Gallon":14, "Cylinders":8, "Displacement":351, "Horsepower":148, "Weight_in_lbs":4657, "Acceleration":13.5, "Year":"1975-01-01", "Origin":"USA"} 168 | {"Name":"buick century", "Miles_per_Gallon":17, "Cylinders":6, "Displacement":231, "Horsepower":110, "Weight_in_lbs":3907, "Acceleration":21, "Year":"1975-01-01", "Origin":"USA"} 169 | {"Name":"chevroelt chevelle malibu", "Miles_per_Gallon":16, "Cylinders":6, "Displacement":250, "Horsepower":105, "Weight_in_lbs":3897, "Acceleration":18.5, "Year":"1975-01-01", "Origin":"USA"} 170 | {"Name":"amc matador", "Miles_per_Gallon":15, "Cylinders":6, "Displacement":258, "Horsepower":110, "Weight_in_lbs":3730, "Acceleration":19, "Year":"1975-01-01", "Origin":"USA"} 171 | {"Name":"plymouth fury", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":225, "Horsepower":95, "Weight_in_lbs":3785, "Acceleration":19, "Year":"1975-01-01", "Origin":"USA"} 172 | {"Name":"buick skyhawk", "Miles_per_Gallon":21, "Cylinders":6, "Displacement":231, "Horsepower":110, "Weight_in_lbs":3039, "Acceleration":15, "Year":"1975-01-01", "Origin":"USA"} 173 | {"Name":"chevrolet monza 2+2", "Miles_per_Gallon":20, 
"Cylinders":8, "Displacement":262, "Horsepower":110, "Weight_in_lbs":3221, "Acceleration":13.5, "Year":"1975-01-01", "Origin":"USA"} 174 | {"Name":"ford mustang ii", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":302, "Horsepower":129, "Weight_in_lbs":3169, "Acceleration":12, "Year":"1975-01-01", "Origin":"USA"} 175 | {"Name":"toyota corolla", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":97, "Horsepower":75, "Weight_in_lbs":2171, "Acceleration":16, "Year":"1975-01-01", "Origin":"Japan"} 176 | {"Name":"ford pinto", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":140, "Horsepower":83, "Weight_in_lbs":2639, "Acceleration":17, "Year":"1975-01-01", "Origin":"USA"} 177 | {"Name":"amc gremlin", "Miles_per_Gallon":20, "Cylinders":6, "Displacement":232, "Horsepower":100, "Weight_in_lbs":2914, "Acceleration":16, "Year":"1975-01-01", "Origin":"USA"} 178 | {"Name":"pontiac astro", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":140, "Horsepower":78, "Weight_in_lbs":2592, "Acceleration":18.5, "Year":"1975-01-01", "Origin":"USA"} 179 | {"Name":"toyota corona", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":134, "Horsepower":96, "Weight_in_lbs":2702, "Acceleration":13.5, "Year":"1975-01-01", "Origin":"Japan"} 180 | {"Name":"volkswagen dasher", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":90, "Horsepower":71, "Weight_in_lbs":2223, "Acceleration":16.5, "Year":"1975-01-01", "Origin":"Europe"} 181 | {"Name":"datsun 710", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":119, "Horsepower":97, "Weight_in_lbs":2545, "Acceleration":17, "Year":"1975-01-01", "Origin":"Japan"} 182 | {"Name":"ford pinto", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":171, "Horsepower":97, "Weight_in_lbs":2984, "Acceleration":14.5, "Year":"1975-01-01", "Origin":"USA"} 183 | {"Name":"volkswagen rabbit", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":90, "Horsepower":70, "Weight_in_lbs":1937, "Acceleration":14, "Year":"1975-01-01", "Origin":"Europe"} 184 | {"Name":"amc pacer", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":232, "Horsepower":90, "Weight_in_lbs":3211, "Acceleration":17, "Year":"1975-01-01", "Origin":"USA"} 185 | {"Name":"audi 100ls", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":115, "Horsepower":95, "Weight_in_lbs":2694, "Acceleration":15, "Year":"1975-01-01", "Origin":"Europe"} 186 | {"Name":"peugeot 504", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":120, "Horsepower":88, "Weight_in_lbs":2957, "Acceleration":17, "Year":"1975-01-01", "Origin":"Europe"} 187 | {"Name":"volvo 244dl", "Miles_per_Gallon":22, "Cylinders":4, "Displacement":121, "Horsepower":98, "Weight_in_lbs":2945, "Acceleration":14.5, "Year":"1975-01-01", "Origin":"Europe"} 188 | {"Name":"saab 99le", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":121, "Horsepower":115, "Weight_in_lbs":2671, "Acceleration":13.5, "Year":"1975-01-01", "Origin":"Europe"} 189 | {"Name":"honda civic cvcc", "Miles_per_Gallon":33, "Cylinders":4, "Displacement":91, "Horsepower":53, "Weight_in_lbs":1795, "Acceleration":17.5, "Year":"1975-01-01", "Origin":"Japan"} 190 | {"Name":"fiat 131", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":107, "Horsepower":86, "Weight_in_lbs":2464, "Acceleration":15.5, "Year":"1976-01-01", "Origin":"Europe"} 191 | {"Name":"opel 1900", "Miles_per_Gallon":25, "Cylinders":4, "Displacement":116, "Horsepower":81, "Weight_in_lbs":2220, "Acceleration":16.9, "Year":"1976-01-01", "Origin":"Europe"} 192 | {"Name":"capri ii", "Miles_per_Gallon":25, "Cylinders":4, 
"Displacement":140, "Horsepower":92, "Weight_in_lbs":2572, "Acceleration":14.9, "Year":"1976-01-01", "Origin":"USA"} 193 | {"Name":"dodge colt", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":98, "Horsepower":79, "Weight_in_lbs":2255, "Acceleration":17.7, "Year":"1976-01-01", "Origin":"USA"} 194 | {"Name":"renault 12tl", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":101, "Horsepower":83, "Weight_in_lbs":2202, "Acceleration":15.3, "Year":"1976-01-01", "Origin":"Europe"} 195 | {"Name":"chevrolet chevelle malibu classic", "Miles_per_Gallon":17.5, "Cylinders":8, "Displacement":305, "Horsepower":140, "Weight_in_lbs":4215, "Acceleration":13, "Year":"1976-01-01", "Origin":"USA"} 196 | {"Name":"dodge coronet brougham", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":4190, "Acceleration":13, "Year":"1976-01-01", "Origin":"USA"} 197 | {"Name":"amc matador", "Miles_per_Gallon":15.5, "Cylinders":8, "Displacement":304, "Horsepower":120, "Weight_in_lbs":3962, "Acceleration":13.9, "Year":"1976-01-01", "Origin":"USA"} 198 | {"Name":"ford gran torino", "Miles_per_Gallon":14.5, "Cylinders":8, "Displacement":351, "Horsepower":152, "Weight_in_lbs":4215, "Acceleration":12.8, "Year":"1976-01-01", "Origin":"USA"} 199 | {"Name":"plymouth valiant", "Miles_per_Gallon":22, "Cylinders":6, "Displacement":225, "Horsepower":100, "Weight_in_lbs":3233, "Acceleration":15.4, "Year":"1976-01-01", "Origin":"USA"} 200 | {"Name":"chevrolet nova", "Miles_per_Gallon":22, "Cylinders":6, "Displacement":250, "Horsepower":105, "Weight_in_lbs":3353, "Acceleration":14.5, "Year":"1976-01-01", "Origin":"USA"} 201 | {"Name":"ford maverick", "Miles_per_Gallon":24, "Cylinders":6, "Displacement":200, "Horsepower":81, "Weight_in_lbs":3012, "Acceleration":17.6, "Year":"1976-01-01", "Origin":"USA"} 202 | {"Name":"amc hornet", "Miles_per_Gallon":22.5, "Cylinders":6, "Displacement":232, "Horsepower":90, "Weight_in_lbs":3085, "Acceleration":17.6, "Year":"1976-01-01", "Origin":"USA"} 203 | {"Name":"chevrolet chevette", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":85, "Horsepower":52, "Weight_in_lbs":2035, "Acceleration":22.2, "Year":"1976-01-01", "Origin":"USA"} 204 | {"Name":"chevrolet woody", "Miles_per_Gallon":24.5, "Cylinders":4, "Displacement":98, "Horsepower":60, "Weight_in_lbs":2164, "Acceleration":22.1, "Year":"1976-01-01", "Origin":"USA"} 205 | {"Name":"vw rabbit", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":90, "Horsepower":70, "Weight_in_lbs":1937, "Acceleration":14.2, "Year":"1976-01-01", "Origin":"Europe"} 206 | {"Name":"honda civic", "Miles_per_Gallon":33, "Cylinders":4, "Displacement":91, "Horsepower":53, "Weight_in_lbs":1795, "Acceleration":17.4, "Year":"1976-01-01", "Origin":"Japan"} 207 | {"Name":"dodge aspen se", "Miles_per_Gallon":20, "Cylinders":6, "Displacement":225, "Horsepower":100, "Weight_in_lbs":3651, "Acceleration":17.7, "Year":"1976-01-01", "Origin":"USA"} 208 | {"Name":"ford granada ghia", "Miles_per_Gallon":18, "Cylinders":6, "Displacement":250, "Horsepower":78, "Weight_in_lbs":3574, "Acceleration":21, "Year":"1976-01-01", "Origin":"USA"} 209 | {"Name":"pontiac ventura sj", "Miles_per_Gallon":18.5, "Cylinders":6, "Displacement":250, "Horsepower":110, "Weight_in_lbs":3645, "Acceleration":16.2, "Year":"1976-01-01", "Origin":"USA"} 210 | {"Name":"amc pacer d/l", "Miles_per_Gallon":17.5, "Cylinders":6, "Displacement":258, "Horsepower":95, "Weight_in_lbs":3193, "Acceleration":17.8, "Year":"1976-01-01", "Origin":"USA"} 211 | {"Name":"volkswagen 
rabbit", "Miles_per_Gallon":29.5, "Cylinders":4, "Displacement":97, "Horsepower":71, "Weight_in_lbs":1825, "Acceleration":12.2, "Year":"1976-01-01", "Origin":"Europe"} 212 | {"Name":"datsun b-210", "Miles_per_Gallon":32, "Cylinders":4, "Displacement":85, "Horsepower":70, "Weight_in_lbs":1990, "Acceleration":17, "Year":"1976-01-01", "Origin":"Japan"} 213 | {"Name":"toyota corolla", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":97, "Horsepower":75, "Weight_in_lbs":2155, "Acceleration":16.4, "Year":"1976-01-01", "Origin":"Japan"} 214 | {"Name":"ford pinto", "Miles_per_Gallon":26.5, "Cylinders":4, "Displacement":140, "Horsepower":72, "Weight_in_lbs":2565, "Acceleration":13.6, "Year":"1976-01-01", "Origin":"USA"} 215 | {"Name":"volvo 245", "Miles_per_Gallon":20, "Cylinders":4, "Displacement":130, "Horsepower":102, "Weight_in_lbs":3150, "Acceleration":15.7, "Year":"1976-01-01", "Origin":"Europe"} 216 | {"Name":"plymouth volare premier v8", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":3940, "Acceleration":13.2, "Year":"1976-01-01", "Origin":"USA"} 217 | {"Name":"peugeot 504", "Miles_per_Gallon":19, "Cylinders":4, "Displacement":120, "Horsepower":88, "Weight_in_lbs":3270, "Acceleration":21.9, "Year":"1976-01-01", "Origin":"Europe"} 218 | {"Name":"toyota mark ii", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":156, "Horsepower":108, "Weight_in_lbs":2930, "Acceleration":15.5, "Year":"1976-01-01", "Origin":"Japan"} 219 | {"Name":"mercedes-benz 280s", "Miles_per_Gallon":16.5, "Cylinders":6, "Displacement":168, "Horsepower":120, "Weight_in_lbs":3820, "Acceleration":16.7, "Year":"1976-01-01", "Origin":"Europe"} 220 | {"Name":"cadillac seville", "Miles_per_Gallon":16.5, "Cylinders":8, "Displacement":350, "Horsepower":180, "Weight_in_lbs":4380, "Acceleration":12.1, "Year":"1976-01-01", "Origin":"USA"} 221 | {"Name":"chevy c10", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":350, "Horsepower":145, "Weight_in_lbs":4055, "Acceleration":12, "Year":"1976-01-01", "Origin":"USA"} 222 | {"Name":"ford f108", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":302, "Horsepower":130, "Weight_in_lbs":3870, "Acceleration":15, "Year":"1976-01-01", "Origin":"USA"} 223 | {"Name":"dodge d100", "Miles_per_Gallon":13, "Cylinders":8, "Displacement":318, "Horsepower":150, "Weight_in_lbs":3755, "Acceleration":14, "Year":"1976-01-01", "Origin":"USA"} 224 | {"Name":"honda Accelerationord cvcc", "Miles_per_Gallon":31.5, "Cylinders":4, "Displacement":98, "Horsepower":68, "Weight_in_lbs":2045, "Acceleration":18.5, "Year":"1977-01-01", "Origin":"Japan"} 225 | {"Name":"buick opel isuzu deluxe", "Miles_per_Gallon":30, "Cylinders":4, "Displacement":111, "Horsepower":80, "Weight_in_lbs":2155, "Acceleration":14.8, "Year":"1977-01-01", "Origin":"USA"} 226 | {"Name":"renault 5 gtl", "Miles_per_Gallon":36, "Cylinders":4, "Displacement":79, "Horsepower":58, "Weight_in_lbs":1825, "Acceleration":18.6, "Year":"1977-01-01", "Origin":"Europe"} 227 | {"Name":"plymouth arrow gs", "Miles_per_Gallon":25.5, "Cylinders":4, "Displacement":122, "Horsepower":96, "Weight_in_lbs":2300, "Acceleration":15.5, "Year":"1977-01-01", "Origin":"USA"} 228 | {"Name":"datsun f-10 hatchback", "Miles_per_Gallon":33.5, "Cylinders":4, "Displacement":85, "Horsepower":70, "Weight_in_lbs":1945, "Acceleration":16.8, "Year":"1977-01-01", "Origin":"Japan"} 229 | {"Name":"chevrolet caprice classic", "Miles_per_Gallon":17.5, "Cylinders":8, "Displacement":305, "Horsepower":145, "Weight_in_lbs":3880, 
"Acceleration":12.5, "Year":"1977-01-01", "Origin":"USA"} 230 | {"Name":"oldsmobile cutlass supreme", "Miles_per_Gallon":17, "Cylinders":8, "Displacement":260, "Horsepower":110, "Weight_in_lbs":4060, "Acceleration":19, "Year":"1977-01-01", "Origin":"USA"} 231 | {"Name":"dodge monaco brougham", "Miles_per_Gallon":15.5, "Cylinders":8, "Displacement":318, "Horsepower":145, "Weight_in_lbs":4140, "Acceleration":13.7, "Year":"1977-01-01", "Origin":"USA"} 232 | {"Name":"mercury cougar brougham", "Miles_per_Gallon":15, "Cylinders":8, "Displacement":302, "Horsepower":130, "Weight_in_lbs":4295, "Acceleration":14.9, "Year":"1977-01-01", "Origin":"USA"} 233 | {"Name":"chevrolet concours", "Miles_per_Gallon":17.5, "Cylinders":6, "Displacement":250, "Horsepower":110, "Weight_in_lbs":3520, "Acceleration":16.4, "Year":"1977-01-01", "Origin":"USA"} 234 | {"Name":"buick skylark", "Miles_per_Gallon":20.5, "Cylinders":6, "Displacement":231, "Horsepower":105, "Weight_in_lbs":3425, "Acceleration":16.9, "Year":"1977-01-01", "Origin":"USA"} 235 | {"Name":"plymouth volare custom", "Miles_per_Gallon":19, "Cylinders":6, "Displacement":225, "Horsepower":100, "Weight_in_lbs":3630, "Acceleration":17.7, "Year":"1977-01-01", "Origin":"USA"} 236 | {"Name":"ford granada", "Miles_per_Gallon":18.5, "Cylinders":6, "Displacement":250, "Horsepower":98, "Weight_in_lbs":3525, "Acceleration":19, "Year":"1977-01-01", "Origin":"USA"} 237 | {"Name":"pontiac grand prix lj", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":400, "Horsepower":180, "Weight_in_lbs":4220, "Acceleration":11.1, "Year":"1977-01-01", "Origin":"USA"} 238 | {"Name":"chevrolet monte carlo landau", "Miles_per_Gallon":15.5, "Cylinders":8, "Displacement":350, "Horsepower":170, "Weight_in_lbs":4165, "Acceleration":11.4, "Year":"1977-01-01", "Origin":"USA"} 239 | {"Name":"chrysler cordoba", "Miles_per_Gallon":15.5, "Cylinders":8, "Displacement":400, "Horsepower":190, "Weight_in_lbs":4325, "Acceleration":12.2, "Year":"1977-01-01", "Origin":"USA"} 240 | {"Name":"ford thunderbird", "Miles_per_Gallon":16, "Cylinders":8, "Displacement":351, "Horsepower":149, "Weight_in_lbs":4335, "Acceleration":14.5, "Year":"1977-01-01", "Origin":"USA"} 241 | {"Name":"volkswagen rabbit custom", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":97, "Horsepower":78, "Weight_in_lbs":1940, "Acceleration":14.5, "Year":"1977-01-01", "Origin":"Europe"} 242 | {"Name":"pontiac sunbird coupe", "Miles_per_Gallon":24.5, "Cylinders":4, "Displacement":151, "Horsepower":88, "Weight_in_lbs":2740, "Acceleration":16, "Year":"1977-01-01", "Origin":"USA"} 243 | {"Name":"toyota corolla liftback", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":97, "Horsepower":75, "Weight_in_lbs":2265, "Acceleration":18.2, "Year":"1977-01-01", "Origin":"Japan"} 244 | {"Name":"ford mustang ii 2+2", "Miles_per_Gallon":25.5, "Cylinders":4, "Displacement":140, "Horsepower":89, "Weight_in_lbs":2755, "Acceleration":15.8, "Year":"1977-01-01", "Origin":"USA"} 245 | {"Name":"chevrolet chevette", "Miles_per_Gallon":30.5, "Cylinders":4, "Displacement":98, "Horsepower":63, "Weight_in_lbs":2051, "Acceleration":17, "Year":"1977-01-01", "Origin":"USA"} 246 | {"Name":"dodge colt m/m", "Miles_per_Gallon":33.5, "Cylinders":4, "Displacement":98, "Horsepower":83, "Weight_in_lbs":2075, "Acceleration":15.9, "Year":"1977-01-01", "Origin":"USA"} 247 | {"Name":"subaru dl", "Miles_per_Gallon":30, "Cylinders":4, "Displacement":97, "Horsepower":67, "Weight_in_lbs":1985, "Acceleration":16.4, "Year":"1977-01-01", "Origin":"Japan"} 248 | 
{"Name":"volkswagen dasher", "Miles_per_Gallon":30.5, "Cylinders":4, "Displacement":97, "Horsepower":78, "Weight_in_lbs":2190, "Acceleration":14.1, "Year":"1977-01-01", "Origin":"Europe"} 249 | {"Name":"datsun 810", "Miles_per_Gallon":22, "Cylinders":6, "Displacement":146, "Horsepower":97, "Weight_in_lbs":2815, "Acceleration":14.5, "Year":"1977-01-01", "Origin":"Japan"} 250 | {"Name":"bmw 320i", "Miles_per_Gallon":21.5, "Cylinders":4, "Displacement":121, "Horsepower":110, "Weight_in_lbs":2600, "Acceleration":12.8, "Year":"1977-01-01", "Origin":"Europe"} 251 | {"Name":"mazda rx-4", "Miles_per_Gallon":21.5, "Cylinders":3, "Displacement":80, "Horsepower":110, "Weight_in_lbs":2720, "Acceleration":13.5, "Year":"1977-01-01", "Origin":"Japan"} 252 | {"Name":"volkswagen rabbit custom diesel", "Miles_per_Gallon":43.1, "Cylinders":4, "Displacement":90, "Horsepower":48, "Weight_in_lbs":1985, "Acceleration":21.5, "Year":"1978-01-01", "Origin":"Europe"} 253 | {"Name":"ford fiesta", "Miles_per_Gallon":36.1, "Cylinders":4, "Displacement":98, "Horsepower":66, "Weight_in_lbs":1800, "Acceleration":14.4, "Year":"1978-01-01", "Origin":"USA"} 254 | {"Name":"mazda glc deluxe", "Miles_per_Gallon":32.8, "Cylinders":4, "Displacement":78, "Horsepower":52, "Weight_in_lbs":1985, "Acceleration":19.4, "Year":"1978-01-01", "Origin":"Japan"} 255 | {"Name":"datsun b210 gx", "Miles_per_Gallon":39.4, "Cylinders":4, "Displacement":85, "Horsepower":70, "Weight_in_lbs":2070, "Acceleration":18.6, "Year":"1978-01-01", "Origin":"Japan"} 256 | {"Name":"honda civic cvcc", "Miles_per_Gallon":36.1, "Cylinders":4, "Displacement":91, "Horsepower":60, "Weight_in_lbs":1800, "Acceleration":16.4, "Year":"1978-01-01", "Origin":"Japan"} 257 | {"Name":"oldsmobile cutlass salon brougham", "Miles_per_Gallon":19.9, "Cylinders":8, "Displacement":260, "Horsepower":110, "Weight_in_lbs":3365, "Acceleration":15.5, "Year":"1978-01-01", "Origin":"USA"} 258 | {"Name":"dodge diplomat", "Miles_per_Gallon":19.4, "Cylinders":8, "Displacement":318, "Horsepower":140, "Weight_in_lbs":3735, "Acceleration":13.2, "Year":"1978-01-01", "Origin":"USA"} 259 | {"Name":"mercury monarch ghia", "Miles_per_Gallon":20.2, "Cylinders":8, "Displacement":302, "Horsepower":139, "Weight_in_lbs":3570, "Acceleration":12.8, "Year":"1978-01-01", "Origin":"USA"} 260 | {"Name":"pontiac phoenix lj", "Miles_per_Gallon":19.2, "Cylinders":6, "Displacement":231, "Horsepower":105, "Weight_in_lbs":3535, "Acceleration":19.2, "Year":"1978-01-01", "Origin":"USA"} 261 | {"Name":"chevrolet malibu", "Miles_per_Gallon":20.5, "Cylinders":6, "Displacement":200, "Horsepower":95, "Weight_in_lbs":3155, "Acceleration":18.2, "Year":"1978-01-01", "Origin":"USA"} 262 | {"Name":"ford fairmont (auto)", "Miles_per_Gallon":20.2, "Cylinders":6, "Displacement":200, "Horsepower":85, "Weight_in_lbs":2965, "Acceleration":15.8, "Year":"1978-01-01", "Origin":"USA"} 263 | {"Name":"ford fairmont (man)", "Miles_per_Gallon":25.1, "Cylinders":4, "Displacement":140, "Horsepower":88, "Weight_in_lbs":2720, "Acceleration":15.4, "Year":"1978-01-01", "Origin":"USA"} 264 | {"Name":"plymouth volare", "Miles_per_Gallon":20.5, "Cylinders":6, "Displacement":225, "Horsepower":100, "Weight_in_lbs":3430, "Acceleration":17.2, "Year":"1978-01-01", "Origin":"USA"} 265 | {"Name":"amc concord", "Miles_per_Gallon":19.4, "Cylinders":6, "Displacement":232, "Horsepower":90, "Weight_in_lbs":3210, "Acceleration":17.2, "Year":"1978-01-01", "Origin":"USA"} 266 | {"Name":"buick century special", "Miles_per_Gallon":20.6, "Cylinders":6, 
"Displacement":231, "Horsepower":105, "Weight_in_lbs":3380, "Acceleration":15.8, "Year":"1978-01-01", "Origin":"USA"} 267 | {"Name":"mercury zephyr", "Miles_per_Gallon":20.8, "Cylinders":6, "Displacement":200, "Horsepower":85, "Weight_in_lbs":3070, "Acceleration":16.7, "Year":"1978-01-01", "Origin":"USA"} 268 | {"Name":"dodge aspen", "Miles_per_Gallon":18.6, "Cylinders":6, "Displacement":225, "Horsepower":110, "Weight_in_lbs":3620, "Acceleration":18.7, "Year":"1978-01-01", "Origin":"USA"} 269 | {"Name":"amc concord d/l", "Miles_per_Gallon":18.1, "Cylinders":6, "Displacement":258, "Horsepower":120, "Weight_in_lbs":3410, "Acceleration":15.1, "Year":"1978-01-01", "Origin":"USA"} 270 | {"Name":"chevrolet monte carlo landau", "Miles_per_Gallon":19.2, "Cylinders":8, "Displacement":305, "Horsepower":145, "Weight_in_lbs":3425, "Acceleration":13.2, "Year":"1978-01-01", "Origin":"USA"} 271 | {"Name":"buick regal sport coupe (turbo)", "Miles_per_Gallon":17.7, "Cylinders":6, "Displacement":231, "Horsepower":165, "Weight_in_lbs":3445, "Acceleration":13.4, "Year":"1978-01-01", "Origin":"USA"} 272 | {"Name":"ford futura", "Miles_per_Gallon":18.1, "Cylinders":8, "Displacement":302, "Horsepower":139, "Weight_in_lbs":3205, "Acceleration":11.2, "Year":"1978-01-01", "Origin":"USA"} 273 | {"Name":"dodge magnum xe", "Miles_per_Gallon":17.5, "Cylinders":8, "Displacement":318, "Horsepower":140, "Weight_in_lbs":4080, "Acceleration":13.7, "Year":"1978-01-01", "Origin":"USA"} 274 | {"Name":"chevrolet chevette", "Miles_per_Gallon":30, "Cylinders":4, "Displacement":98, "Horsepower":68, "Weight_in_lbs":2155, "Acceleration":16.5, "Year":"1978-01-01", "Origin":"USA"} 275 | {"Name":"toyota corona", "Miles_per_Gallon":27.5, "Cylinders":4, "Displacement":134, "Horsepower":95, "Weight_in_lbs":2560, "Acceleration":14.2, "Year":"1978-01-01", "Origin":"Japan"} 276 | {"Name":"datsun 510", "Miles_per_Gallon":27.2, "Cylinders":4, "Displacement":119, "Horsepower":97, "Weight_in_lbs":2300, "Acceleration":14.7, "Year":"1978-01-01", "Origin":"Japan"} 277 | {"Name":"dodge omni", "Miles_per_Gallon":30.9, "Cylinders":4, "Displacement":105, "Horsepower":75, "Weight_in_lbs":2230, "Acceleration":14.5, "Year":"1978-01-01", "Origin":"USA"} 278 | {"Name":"toyota celica gt liftback", "Miles_per_Gallon":21.1, "Cylinders":4, "Displacement":134, "Horsepower":95, "Weight_in_lbs":2515, "Acceleration":14.8, "Year":"1978-01-01", "Origin":"Japan"} 279 | {"Name":"plymouth sapporo", "Miles_per_Gallon":23.2, "Cylinders":4, "Displacement":156, "Horsepower":105, "Weight_in_lbs":2745, "Acceleration":16.7, "Year":"1978-01-01", "Origin":"USA"} 280 | {"Name":"oldsmobile starfire sx", "Miles_per_Gallon":23.8, "Cylinders":4, "Displacement":151, "Horsepower":85, "Weight_in_lbs":2855, "Acceleration":17.6, "Year":"1978-01-01", "Origin":"USA"} 281 | {"Name":"datsun 200-sx", "Miles_per_Gallon":23.9, "Cylinders":4, "Displacement":119, "Horsepower":97, "Weight_in_lbs":2405, "Acceleration":14.9, "Year":"1978-01-01", "Origin":"Japan"} 282 | {"Name":"audi 5000", "Miles_per_Gallon":20.3, "Cylinders":5, "Displacement":131, "Horsepower":103, "Weight_in_lbs":2830, "Acceleration":15.9, "Year":"1978-01-01", "Origin":"Europe"} 283 | {"Name":"volvo 264gl", "Miles_per_Gallon":17, "Cylinders":6, "Displacement":163, "Horsepower":125, "Weight_in_lbs":3140, "Acceleration":13.6, "Year":"1978-01-01", "Origin":"Europe"} 284 | {"Name":"saab 99gle", "Miles_per_Gallon":21.6, "Cylinders":4, "Displacement":121, "Horsepower":115, "Weight_in_lbs":2795, "Acceleration":15.7, "Year":"1978-01-01", 
"Origin":"Europe"} 285 | {"Name":"peugeot 604sl", "Miles_per_Gallon":16.2, "Cylinders":6, "Displacement":163, "Horsepower":133, "Weight_in_lbs":3410, "Acceleration":15.8, "Year":"1978-01-01", "Origin":"Europe"} 286 | {"Name":"volkswagen scirocco", "Miles_per_Gallon":31.5, "Cylinders":4, "Displacement":89, "Horsepower":71, "Weight_in_lbs":1990, "Acceleration":14.9, "Year":"1978-01-01", "Origin":"Europe"} 287 | {"Name":"honda Accelerationord lx", "Miles_per_Gallon":29.5, "Cylinders":4, "Displacement":98, "Horsepower":68, "Weight_in_lbs":2135, "Acceleration":16.6, "Year":"1978-01-01", "Origin":"Japan"} 288 | {"Name":"pontiac lemans v6", "Miles_per_Gallon":21.5, "Cylinders":6, "Displacement":231, "Horsepower":115, "Weight_in_lbs":3245, "Acceleration":15.4, "Year":"1979-01-01", "Origin":"USA"} 289 | {"Name":"mercury zephyr 6", "Miles_per_Gallon":19.8, "Cylinders":6, "Displacement":200, "Horsepower":85, "Weight_in_lbs":2990, "Acceleration":18.2, "Year":"1979-01-01", "Origin":"USA"} 290 | {"Name":"ford fairmont 4", "Miles_per_Gallon":22.3, "Cylinders":4, "Displacement":140, "Horsepower":88, "Weight_in_lbs":2890, "Acceleration":17.3, "Year":"1979-01-01", "Origin":"USA"} 291 | {"Name":"amc concord dl 6", "Miles_per_Gallon":20.2, "Cylinders":6, "Displacement":232, "Horsepower":90, "Weight_in_lbs":3265, "Acceleration":18.2, "Year":"1979-01-01", "Origin":"USA"} 292 | {"Name":"dodge aspen 6", "Miles_per_Gallon":20.6, "Cylinders":6, "Displacement":225, "Horsepower":110, "Weight_in_lbs":3360, "Acceleration":16.6, "Year":"1979-01-01", "Origin":"USA"} 293 | {"Name":"chevrolet caprice classic", "Miles_per_Gallon":17, "Cylinders":8, "Displacement":305, "Horsepower":130, "Weight_in_lbs":3840, "Acceleration":15.4, "Year":"1979-01-01", "Origin":"USA"} 294 | {"Name":"ford ltd landau", "Miles_per_Gallon":17.6, "Cylinders":8, "Displacement":302, "Horsepower":129, "Weight_in_lbs":3725, "Acceleration":13.4, "Year":"1979-01-01", "Origin":"USA"} 295 | {"Name":"mercury grand marquis", "Miles_per_Gallon":16.5, "Cylinders":8, "Displacement":351, "Horsepower":138, "Weight_in_lbs":3955, "Acceleration":13.2, "Year":"1979-01-01", "Origin":"USA"} 296 | {"Name":"dodge st. 
regis", "Miles_per_Gallon":18.2, "Cylinders":8, "Displacement":318, "Horsepower":135, "Weight_in_lbs":3830, "Acceleration":15.2, "Year":"1979-01-01", "Origin":"USA"} 297 | {"Name":"buick estate wagon (sw)", "Miles_per_Gallon":16.9, "Cylinders":8, "Displacement":350, "Horsepower":155, "Weight_in_lbs":4360, "Acceleration":14.9, "Year":"1979-01-01", "Origin":"USA"} 298 | {"Name":"ford country squire (sw)", "Miles_per_Gallon":15.5, "Cylinders":8, "Displacement":351, "Horsepower":142, "Weight_in_lbs":4054, "Acceleration":14.3, "Year":"1979-01-01", "Origin":"USA"} 299 | {"Name":"chevrolet malibu classic (sw)", "Miles_per_Gallon":19.2, "Cylinders":8, "Displacement":267, "Horsepower":125, "Weight_in_lbs":3605, "Acceleration":15, "Year":"1979-01-01", "Origin":"USA"} 300 | {"Name":"chrysler lebaron town @ country (sw)", "Miles_per_Gallon":18.5, "Cylinders":8, "Displacement":360, "Horsepower":150, "Weight_in_lbs":3940, "Acceleration":13, "Year":"1979-01-01", "Origin":"USA"} 301 | {"Name":"vw rabbit custom", "Miles_per_Gallon":31.9, "Cylinders":4, "Displacement":89, "Horsepower":71, "Weight_in_lbs":1925, "Acceleration":14, "Year":"1979-01-01", "Origin":"Europe"} 302 | {"Name":"maxda glc deluxe", "Miles_per_Gallon":34.1, "Cylinders":4, "Displacement":86, "Horsepower":65, "Weight_in_lbs":1975, "Acceleration":15.2, "Year":"1979-01-01", "Origin":"Japan"} 303 | {"Name":"dodge colt hatchback custom", "Miles_per_Gallon":35.7, "Cylinders":4, "Displacement":98, "Horsepower":80, "Weight_in_lbs":1915, "Acceleration":14.4, "Year":"1979-01-01", "Origin":"USA"} 304 | {"Name":"amc spirit dl", "Miles_per_Gallon":27.4, "Cylinders":4, "Displacement":121, "Horsepower":80, "Weight_in_lbs":2670, "Acceleration":15, "Year":"1979-01-01", "Origin":"USA"} 305 | {"Name":"mercedes benz 300d", "Miles_per_Gallon":25.4, "Cylinders":5, "Displacement":183, "Horsepower":77, "Weight_in_lbs":3530, "Acceleration":20.1, "Year":"1979-01-01", "Origin":"Europe"} 306 | {"Name":"cadillac eldorado", "Miles_per_Gallon":23, "Cylinders":8, "Displacement":350, "Horsepower":125, "Weight_in_lbs":3900, "Acceleration":17.4, "Year":"1979-01-01", "Origin":"USA"} 307 | {"Name":"peugeot 504", "Miles_per_Gallon":27.2, "Cylinders":4, "Displacement":141, "Horsepower":71, "Weight_in_lbs":3190, "Acceleration":24.8, "Year":"1979-01-01", "Origin":"Europe"} 308 | {"Name":"oldsmobile cutlass salon brougham", "Miles_per_Gallon":23.9, "Cylinders":8, "Displacement":260, "Horsepower":90, "Weight_in_lbs":3420, "Acceleration":22.2, "Year":"1979-01-01", "Origin":"USA"} 309 | {"Name":"plymouth horizon", "Miles_per_Gallon":34.2, "Cylinders":4, "Displacement":105, "Horsepower":70, "Weight_in_lbs":2200, "Acceleration":13.2, "Year":"1979-01-01", "Origin":"USA"} 310 | {"Name":"plymouth horizon tc3", "Miles_per_Gallon":34.5, "Cylinders":4, "Displacement":105, "Horsepower":70, "Weight_in_lbs":2150, "Acceleration":14.9, "Year":"1979-01-01", "Origin":"USA"} 311 | {"Name":"datsun 210", "Miles_per_Gallon":31.8, "Cylinders":4, "Displacement":85, "Horsepower":65, "Weight_in_lbs":2020, "Acceleration":19.2, "Year":"1979-01-01", "Origin":"Japan"} 312 | {"Name":"fiat strada custom", "Miles_per_Gallon":37.3, "Cylinders":4, "Displacement":91, "Horsepower":69, "Weight_in_lbs":2130, "Acceleration":14.7, "Year":"1979-01-01", "Origin":"Europe"} 313 | {"Name":"buick skylark limited", "Miles_per_Gallon":28.4, "Cylinders":4, "Displacement":151, "Horsepower":90, "Weight_in_lbs":2670, "Acceleration":16, "Year":"1979-01-01", "Origin":"USA"} 314 | {"Name":"chevrolet citation", "Miles_per_Gallon":28.8, 
"Cylinders":6, "Displacement":173, "Horsepower":115, "Weight_in_lbs":2595, "Acceleration":11.3, "Year":"1979-01-01", "Origin":"USA"} 315 | {"Name":"oldsmobile omega brougham", "Miles_per_Gallon":26.8, "Cylinders":6, "Displacement":173, "Horsepower":115, "Weight_in_lbs":2700, "Acceleration":12.9, "Year":"1979-01-01", "Origin":"USA"} 316 | {"Name":"pontiac phoenix", "Miles_per_Gallon":33.5, "Cylinders":4, "Displacement":151, "Horsepower":90, "Weight_in_lbs":2556, "Acceleration":13.2, "Year":"1979-01-01", "Origin":"USA"} 317 | {"Name":"vw rabbit", "Miles_per_Gallon":41.5, "Cylinders":4, "Displacement":98, "Horsepower":76, "Weight_in_lbs":2144, "Acceleration":14.7, "Year":"1980-01-01", "Origin":"Europe"} 318 | {"Name":"toyota corolla tercel", "Miles_per_Gallon":38.1, "Cylinders":4, "Displacement":89, "Horsepower":60, "Weight_in_lbs":1968, "Acceleration":18.8, "Year":"1980-01-01", "Origin":"Japan"} 319 | {"Name":"chevrolet chevette", "Miles_per_Gallon":32.1, "Cylinders":4, "Displacement":98, "Horsepower":70, "Weight_in_lbs":2120, "Acceleration":15.5, "Year":"1980-01-01", "Origin":"USA"} 320 | {"Name":"datsun 310", "Miles_per_Gallon":37.2, "Cylinders":4, "Displacement":86, "Horsepower":65, "Weight_in_lbs":2019, "Acceleration":16.4, "Year":"1980-01-01", "Origin":"Japan"} 321 | {"Name":"chevrolet citation", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":151, "Horsepower":90, "Weight_in_lbs":2678, "Acceleration":16.5, "Year":"1980-01-01", "Origin":"USA"} 322 | {"Name":"ford fairmont", "Miles_per_Gallon":26.4, "Cylinders":4, "Displacement":140, "Horsepower":88, "Weight_in_lbs":2870, "Acceleration":18.1, "Year":"1980-01-01", "Origin":"USA"} 323 | {"Name":"amc concord", "Miles_per_Gallon":24.3, "Cylinders":4, "Displacement":151, "Horsepower":90, "Weight_in_lbs":3003, "Acceleration":20.1, "Year":"1980-01-01", "Origin":"USA"} 324 | {"Name":"dodge aspen", "Miles_per_Gallon":19.1, "Cylinders":6, "Displacement":225, "Horsepower":90, "Weight_in_lbs":3381, "Acceleration":18.7, "Year":"1980-01-01", "Origin":"USA"} 325 | {"Name":"audi 4000", "Miles_per_Gallon":34.3, "Cylinders":4, "Displacement":97, "Horsepower":78, "Weight_in_lbs":2188, "Acceleration":15.8, "Year":"1980-01-01", "Origin":"Europe"} 326 | {"Name":"toyota corona liftback", "Miles_per_Gallon":29.8, "Cylinders":4, "Displacement":134, "Horsepower":90, "Weight_in_lbs":2711, "Acceleration":15.5, "Year":"1980-01-01", "Origin":"Japan"} 327 | {"Name":"mazda 626", "Miles_per_Gallon":31.3, "Cylinders":4, "Displacement":120, "Horsepower":75, "Weight_in_lbs":2542, "Acceleration":17.5, "Year":"1980-01-01", "Origin":"Japan"} 328 | {"Name":"datsun 510 hatchback", "Miles_per_Gallon":37, "Cylinders":4, "Displacement":119, "Horsepower":92, "Weight_in_lbs":2434, "Acceleration":15, "Year":"1980-01-01", "Origin":"Japan"} 329 | {"Name":"toyota corolla", "Miles_per_Gallon":32.2, "Cylinders":4, "Displacement":108, "Horsepower":75, "Weight_in_lbs":2265, "Acceleration":15.2, "Year":"1980-01-01", "Origin":"Japan"} 330 | {"Name":"mazda glc", "Miles_per_Gallon":46.6, "Cylinders":4, "Displacement":86, "Horsepower":65, "Weight_in_lbs":2110, "Acceleration":17.9, "Year":"1980-01-01", "Origin":"Japan"} 331 | {"Name":"dodge colt", "Miles_per_Gallon":27.9, "Cylinders":4, "Displacement":156, "Horsepower":105, "Weight_in_lbs":2800, "Acceleration":14.4, "Year":"1980-01-01", "Origin":"USA"} 332 | {"Name":"datsun 210", "Miles_per_Gallon":40.8, "Cylinders":4, "Displacement":85, "Horsepower":65, "Weight_in_lbs":2110, "Acceleration":19.2, "Year":"1980-01-01", "Origin":"Japan"} 333 | 
{"Name":"vw rabbit c (diesel)", "Miles_per_Gallon":44.3, "Cylinders":4, "Displacement":90, "Horsepower":48, "Weight_in_lbs":2085, "Acceleration":21.7, "Year":"1980-01-01", "Origin":"Europe"} 334 | {"Name":"vw dasher (diesel)", "Miles_per_Gallon":43.4, "Cylinders":4, "Displacement":90, "Horsepower":48, "Weight_in_lbs":2335, "Acceleration":23.7, "Year":"1980-01-01", "Origin":"Europe"} 335 | {"Name":"audi 5000s (diesel)", "Miles_per_Gallon":36.4, "Cylinders":5, "Displacement":121, "Horsepower":67, "Weight_in_lbs":2950, "Acceleration":19.9, "Year":"1980-01-01", "Origin":"Europe"} 336 | {"Name":"mercedes-benz 240d", "Miles_per_Gallon":30, "Cylinders":4, "Displacement":146, "Horsepower":67, "Weight_in_lbs":3250, "Acceleration":21.8, "Year":"1980-01-01", "Origin":"Europe"} 337 | {"Name":"honda civic 1500 gl", "Miles_per_Gallon":44.6, "Cylinders":4, "Displacement":91, "Horsepower":67, "Weight_in_lbs":1850, "Acceleration":13.8, "Year":"1980-01-01", "Origin":"Japan"} 338 | {"Name":"renault lecar deluxe", "Miles_per_Gallon":40.9, "Cylinders":4, "Displacement":85, "Horsepower":null, "Weight_in_lbs":1835, "Acceleration":17.3, "Year":"1980-01-01", "Origin":"Europe"} 339 | {"Name":"subaru dl", "Miles_per_Gallon":33.8, "Cylinders":4, "Displacement":97, "Horsepower":67, "Weight_in_lbs":2145, "Acceleration":18, "Year":"1980-01-01", "Origin":"Japan"} 340 | {"Name":"vokswagen rabbit", "Miles_per_Gallon":29.8, "Cylinders":4, "Displacement":89, "Horsepower":62, "Weight_in_lbs":1845, "Acceleration":15.3, "Year":"1980-01-01", "Origin":"Europe"} 341 | {"Name":"datsun 280-zx", "Miles_per_Gallon":32.7, "Cylinders":6, "Displacement":168, "Horsepower":132, "Weight_in_lbs":2910, "Acceleration":11.4, "Year":"1980-01-01", "Origin":"Japan"} 342 | {"Name":"mazda rx-7 gs", "Miles_per_Gallon":23.7, "Cylinders":3, "Displacement":70, "Horsepower":100, "Weight_in_lbs":2420, "Acceleration":12.5, "Year":"1980-01-01", "Origin":"Japan"} 343 | {"Name":"triumph tr7 coupe", "Miles_per_Gallon":35, "Cylinders":4, "Displacement":122, "Horsepower":88, "Weight_in_lbs":2500, "Acceleration":15.1, "Year":"1980-01-01", "Origin":"Europe"} 344 | {"Name":"ford mustang cobra", "Miles_per_Gallon":23.6, "Cylinders":4, "Displacement":140, "Horsepower":null, "Weight_in_lbs":2905, "Acceleration":14.3, "Year":"1980-01-01", "Origin":"USA"} 345 | {"Name":"honda Accelerationord", "Miles_per_Gallon":32.4, "Cylinders":4, "Displacement":107, "Horsepower":72, "Weight_in_lbs":2290, "Acceleration":17, "Year":"1980-01-01", "Origin":"Japan"} 346 | {"Name":"plymouth reliant", "Miles_per_Gallon":27.2, "Cylinders":4, "Displacement":135, "Horsepower":84, "Weight_in_lbs":2490, "Acceleration":15.7, "Year":"1982-01-01", "Origin":"USA"} 347 | {"Name":"buick skylark", "Miles_per_Gallon":26.6, "Cylinders":4, "Displacement":151, "Horsepower":84, "Weight_in_lbs":2635, "Acceleration":16.4, "Year":"1982-01-01", "Origin":"USA"} 348 | {"Name":"dodge aries wagon (sw)", "Miles_per_Gallon":25.8, "Cylinders":4, "Displacement":156, "Horsepower":92, "Weight_in_lbs":2620, "Acceleration":14.4, "Year":"1982-01-01", "Origin":"USA"} 349 | {"Name":"chevrolet citation", "Miles_per_Gallon":23.5, "Cylinders":6, "Displacement":173, "Horsepower":110, "Weight_in_lbs":2725, "Acceleration":12.6, "Year":"1982-01-01", "Origin":"USA"} 350 | {"Name":"plymouth reliant", "Miles_per_Gallon":30, "Cylinders":4, "Displacement":135, "Horsepower":84, "Weight_in_lbs":2385, "Acceleration":12.9, "Year":"1982-01-01", "Origin":"USA"} 351 | {"Name":"toyota starlet", "Miles_per_Gallon":39.1, "Cylinders":4, 
"Displacement":79, "Horsepower":58, "Weight_in_lbs":1755, "Acceleration":16.9, "Year":"1982-01-01", "Origin":"Japan"} 352 | {"Name":"plymouth champ", "Miles_per_Gallon":39, "Cylinders":4, "Displacement":86, "Horsepower":64, "Weight_in_lbs":1875, "Acceleration":16.4, "Year":"1982-01-01", "Origin":"USA"} 353 | {"Name":"honda civic 1300", "Miles_per_Gallon":35.1, "Cylinders":4, "Displacement":81, "Horsepower":60, "Weight_in_lbs":1760, "Acceleration":16.1, "Year":"1982-01-01", "Origin":"Japan"} 354 | {"Name":"subaru", "Miles_per_Gallon":32.3, "Cylinders":4, "Displacement":97, "Horsepower":67, "Weight_in_lbs":2065, "Acceleration":17.8, "Year":"1982-01-01", "Origin":"Japan"} 355 | {"Name":"datsun 210", "Miles_per_Gallon":37, "Cylinders":4, "Displacement":85, "Horsepower":65, "Weight_in_lbs":1975, "Acceleration":19.4, "Year":"1982-01-01", "Origin":"Japan"} 356 | {"Name":"toyota tercel", "Miles_per_Gallon":37.7, "Cylinders":4, "Displacement":89, "Horsepower":62, "Weight_in_lbs":2050, "Acceleration":17.3, "Year":"1982-01-01", "Origin":"Japan"} 357 | {"Name":"mazda glc 4", "Miles_per_Gallon":34.1, "Cylinders":4, "Displacement":91, "Horsepower":68, "Weight_in_lbs":1985, "Acceleration":16, "Year":"1982-01-01", "Origin":"Japan"} 358 | {"Name":"plymouth horizon 4", "Miles_per_Gallon":34.7, "Cylinders":4, "Displacement":105, "Horsepower":63, "Weight_in_lbs":2215, "Acceleration":14.9, "Year":"1982-01-01", "Origin":"USA"} 359 | {"Name":"ford escort 4w", "Miles_per_Gallon":34.4, "Cylinders":4, "Displacement":98, "Horsepower":65, "Weight_in_lbs":2045, "Acceleration":16.2, "Year":"1982-01-01", "Origin":"USA"} 360 | {"Name":"ford escort 2h", "Miles_per_Gallon":29.9, "Cylinders":4, "Displacement":98, "Horsepower":65, "Weight_in_lbs":2380, "Acceleration":20.7, "Year":"1982-01-01", "Origin":"USA"} 361 | {"Name":"volkswagen jetta", "Miles_per_Gallon":33, "Cylinders":4, "Displacement":105, "Horsepower":74, "Weight_in_lbs":2190, "Acceleration":14.2, "Year":"1982-01-01", "Origin":"Europe"} 362 | {"Name":"renault 18i", "Miles_per_Gallon":34.5, "Cylinders":4, "Displacement":100, "Horsepower":null, "Weight_in_lbs":2320, "Acceleration":15.8, "Year":"1982-01-01", "Origin":"Europe"} 363 | {"Name":"honda prelude", "Miles_per_Gallon":33.7, "Cylinders":4, "Displacement":107, "Horsepower":75, "Weight_in_lbs":2210, "Acceleration":14.4, "Year":"1982-01-01", "Origin":"Japan"} 364 | {"Name":"toyota corolla", "Miles_per_Gallon":32.4, "Cylinders":4, "Displacement":108, "Horsepower":75, "Weight_in_lbs":2350, "Acceleration":16.8, "Year":"1982-01-01", "Origin":"Japan"} 365 | {"Name":"datsun 200sx", "Miles_per_Gallon":32.9, "Cylinders":4, "Displacement":119, "Horsepower":100, "Weight_in_lbs":2615, "Acceleration":14.8, "Year":"1982-01-01", "Origin":"Japan"} 366 | {"Name":"mazda 626", "Miles_per_Gallon":31.6, "Cylinders":4, "Displacement":120, "Horsepower":74, "Weight_in_lbs":2635, "Acceleration":18.3, "Year":"1982-01-01", "Origin":"Japan"} 367 | {"Name":"peugeot 505s turbo diesel", "Miles_per_Gallon":28.1, "Cylinders":4, "Displacement":141, "Horsepower":80, "Weight_in_lbs":3230, "Acceleration":20.4, "Year":"1982-01-01", "Origin":"Europe"} 368 | {"Name":"saab 900s", "Miles_per_Gallon":null, "Cylinders":4, "Displacement":121, "Horsepower":110, "Weight_in_lbs":2800, "Acceleration":15.4, "Year":"1982-01-01", "Origin":"Europe"} 369 | {"Name":"volvo diesel", "Miles_per_Gallon":30.7, "Cylinders":6, "Displacement":145, "Horsepower":76, "Weight_in_lbs":3160, "Acceleration":19.6, "Year":"1982-01-01", "Origin":"Europe"} 370 | {"Name":"toyota 
cressida", "Miles_per_Gallon":25.4, "Cylinders":6, "Displacement":168, "Horsepower":116, "Weight_in_lbs":2900, "Acceleration":12.6, "Year":"1982-01-01", "Origin":"Japan"} 371 | {"Name":"datsun 810 maxima", "Miles_per_Gallon":24.2, "Cylinders":6, "Displacement":146, "Horsepower":120, "Weight_in_lbs":2930, "Acceleration":13.8, "Year":"1982-01-01", "Origin":"Japan"} 372 | {"Name":"buick century", "Miles_per_Gallon":22.4, "Cylinders":6, "Displacement":231, "Horsepower":110, "Weight_in_lbs":3415, "Acceleration":15.8, "Year":"1982-01-01", "Origin":"USA"} 373 | {"Name":"oldsmobile cutlass ls", "Miles_per_Gallon":26.6, "Cylinders":8, "Displacement":350, "Horsepower":105, "Weight_in_lbs":3725, "Acceleration":19, "Year":"1982-01-01", "Origin":"USA"} 374 | {"Name":"ford granada gl", "Miles_per_Gallon":20.2, "Cylinders":6, "Displacement":200, "Horsepower":88, "Weight_in_lbs":3060, "Acceleration":17.1, "Year":"1982-01-01", "Origin":"USA"} 375 | {"Name":"chrysler lebaron salon", "Miles_per_Gallon":17.6, "Cylinders":6, "Displacement":225, "Horsepower":85, "Weight_in_lbs":3465, "Acceleration":16.6, "Year":"1982-01-01", "Origin":"USA"} 376 | {"Name":"chevrolet cavalier", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":112, "Horsepower":88, "Weight_in_lbs":2605, "Acceleration":19.6, "Year":"1982-01-01", "Origin":"USA"} 377 | {"Name":"chevrolet cavalier wagon", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":112, "Horsepower":88, "Weight_in_lbs":2640, "Acceleration":18.6, "Year":"1982-01-01", "Origin":"USA"} 378 | {"Name":"chevrolet cavalier 2-door", "Miles_per_Gallon":34, "Cylinders":4, "Displacement":112, "Horsepower":88, "Weight_in_lbs":2395, "Acceleration":18, "Year":"1982-01-01", "Origin":"USA"} 379 | {"Name":"pontiac j2000 se hatchback", "Miles_per_Gallon":31, "Cylinders":4, "Displacement":112, "Horsepower":85, "Weight_in_lbs":2575, "Acceleration":16.2, "Year":"1982-01-01", "Origin":"USA"} 380 | {"Name":"dodge aries se", "Miles_per_Gallon":29, "Cylinders":4, "Displacement":135, "Horsepower":84, "Weight_in_lbs":2525, "Acceleration":16, "Year":"1982-01-01", "Origin":"USA"} 381 | {"Name":"pontiac phoenix", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":151, "Horsepower":90, "Weight_in_lbs":2735, "Acceleration":18, "Year":"1982-01-01", "Origin":"USA"} 382 | {"Name":"ford fairmont futura", "Miles_per_Gallon":24, "Cylinders":4, "Displacement":140, "Horsepower":92, "Weight_in_lbs":2865, "Acceleration":16.4, "Year":"1982-01-01", "Origin":"USA"} 383 | {"Name":"amc concord dl", "Miles_per_Gallon":23, "Cylinders":4, "Displacement":151, "Horsepower":null, "Weight_in_lbs":3035, "Acceleration":20.5, "Year":"1982-01-01", "Origin":"USA"} 384 | {"Name":"volkswagen rabbit l", "Miles_per_Gallon":36, "Cylinders":4, "Displacement":105, "Horsepower":74, "Weight_in_lbs":1980, "Acceleration":15.3, "Year":"1982-01-01", "Origin":"Europe"} 385 | {"Name":"mazda glc custom l", "Miles_per_Gallon":37, "Cylinders":4, "Displacement":91, "Horsepower":68, "Weight_in_lbs":2025, "Acceleration":18.2, "Year":"1982-01-01", "Origin":"Japan"} 386 | {"Name":"mazda glc custom", "Miles_per_Gallon":31, "Cylinders":4, "Displacement":91, "Horsepower":68, "Weight_in_lbs":1970, "Acceleration":17.6, "Year":"1982-01-01", "Origin":"Japan"} 387 | {"Name":"plymouth horizon miser", "Miles_per_Gallon":38, "Cylinders":4, "Displacement":105, "Horsepower":63, "Weight_in_lbs":2125, "Acceleration":14.7, "Year":"1982-01-01", "Origin":"USA"} 388 | {"Name":"mercury lynx l", "Miles_per_Gallon":36, "Cylinders":4, "Displacement":98, "Horsepower":70, 
"Weight_in_lbs":2125, "Acceleration":17.3, "Year":"1982-01-01", "Origin":"USA"} 389 | {"Name":"nissan stanza xe", "Miles_per_Gallon":36, "Cylinders":4, "Displacement":120, "Horsepower":88, "Weight_in_lbs":2160, "Acceleration":14.5, "Year":"1982-01-01", "Origin":"Japan"} 390 | {"Name":"honda Accelerationord", "Miles_per_Gallon":36, "Cylinders":4, "Displacement":107, "Horsepower":75, "Weight_in_lbs":2205, "Acceleration":14.5, "Year":"1982-01-01", "Origin":"Japan"} 391 | {"Name":"toyota corolla", "Miles_per_Gallon":34, "Cylinders":4, "Displacement":108, "Horsepower":70, "Weight_in_lbs":2245, "Acceleration":16.9, "Year":"1982-01-01", "Origin":"Japan"} 392 | {"Name":"honda civic", "Miles_per_Gallon":38, "Cylinders":4, "Displacement":91, "Horsepower":67, "Weight_in_lbs":1965, "Acceleration":15, "Year":"1982-01-01", "Origin":"Japan"} 393 | {"Name":"honda civic (auto)", "Miles_per_Gallon":32, "Cylinders":4, "Displacement":91, "Horsepower":67, "Weight_in_lbs":1965, "Acceleration":15.7, "Year":"1982-01-01", "Origin":"Japan"} 394 | {"Name":"datsun 310 gx", "Miles_per_Gallon":38, "Cylinders":4, "Displacement":91, "Horsepower":67, "Weight_in_lbs":1995, "Acceleration":16.2, "Year":"1982-01-01", "Origin":"Japan"} 395 | {"Name":"buick century limited", "Miles_per_Gallon":25, "Cylinders":6, "Displacement":181, "Horsepower":110, "Weight_in_lbs":2945, "Acceleration":16.4, "Year":"1982-01-01", "Origin":"USA"} 396 | {"Name":"oldsmobile cutlass ciera (diesel)", "Miles_per_Gallon":38, "Cylinders":6, "Displacement":262, "Horsepower":85, "Weight_in_lbs":3015, "Acceleration":17, "Year":"1982-01-01", "Origin":"USA"} 397 | {"Name":"chrysler lebaron medallion", "Miles_per_Gallon":26, "Cylinders":4, "Displacement":156, "Horsepower":92, "Weight_in_lbs":2585, "Acceleration":14.5, "Year":"1982-01-01", "Origin":"USA"} 398 | {"Name":"ford granada l", "Miles_per_Gallon":22, "Cylinders":6, "Displacement":232, "Horsepower":112, "Weight_in_lbs":2835, "Acceleration":14.7, "Year":"1982-01-01", "Origin":"USA"} 399 | {"Name":"toyota celica gt", "Miles_per_Gallon":32, "Cylinders":4, "Displacement":144, "Horsepower":96, "Weight_in_lbs":2665, "Acceleration":13.9, "Year":"1982-01-01", "Origin":"Japan"} 400 | {"Name":"dodge charger 2.2", "Miles_per_Gallon":36, "Cylinders":4, "Displacement":135, "Horsepower":84, "Weight_in_lbs":2370, "Acceleration":13, "Year":"1982-01-01", "Origin":"USA"} 401 | {"Name":"chevrolet camaro", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":151, "Horsepower":90, "Weight_in_lbs":2950, "Acceleration":17.3, "Year":"1982-01-01", "Origin":"USA"} 402 | {"Name":"ford mustang gl", "Miles_per_Gallon":27, "Cylinders":4, "Displacement":140, "Horsepower":86, "Weight_in_lbs":2790, "Acceleration":15.6, "Year":"1982-01-01", "Origin":"USA"} 403 | {"Name":"vw pickup", "Miles_per_Gallon":44, "Cylinders":4, "Displacement":97, "Horsepower":52, "Weight_in_lbs":2130, "Acceleration":24.6, "Year":"1982-01-01", "Origin":"Europe"} 404 | {"Name":"dodge rampage", "Miles_per_Gallon":32, "Cylinders":4, "Displacement":135, "Horsepower":84, "Weight_in_lbs":2295, "Acceleration":11.6, "Year":"1982-01-01", "Origin":"USA"} 405 | {"Name":"ford ranger", "Miles_per_Gallon":28, "Cylinders":4, "Displacement":120, "Horsepower":79, "Weight_in_lbs":2625, "Acceleration":18.6, "Year":"1982-01-01", "Origin":"USA"} 406 | {"Name":"chevy s-10", "Miles_per_Gallon":31, "Cylinders":4, "Displacement":119, "Horsepower":82, "Weight_in_lbs":2720, "Acceleration":19.4, "Year":"1982-01-01", "Origin":"USA"} 
--------------------------------------------------------------------------------
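
Note: cars.json above is JSON Lines (one object per line), and some records carry explicit nulls in Horsepower or Miles_per_Gallon and repeated Name values (e.g. "ford pinto", "toyota corolla"), which matters for aggregations and joins. As a minimal sketch (not a file from this repo; the object name and output calls are illustrative), assuming the local SparkSession style used in the project's Scala sources, the file could be loaded like this:

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical demo object, not part of the repo.
object CarsReadDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("Cars Read Demo")
      .master("local[*]")
      .getOrCreate()

    // JSON Lines: one record per line, so the plain JSON reader works directly.
    // Spark infers the schema; Horsepower and Miles_per_Gallon come out as
    // nullable numeric columns because some records contain explicit nulls.
    val carsDF = spark.read.json("src/main/resources/data/cars/cars.json")

    carsDF.printSchema()
    carsDF.show(5, truncate = false)

    spark.stop()
  }
}
```

Filtering out the null-Horsepower rows (e.g. `carsDF.where(col("Horsepower").isNotNull)`) is a sensible first step before using the column in computations.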