├── .env.spark ├── .gitignore ├── .isort.cfg ├── .pre-commit-config.yaml ├── .pydocstyle ├── .pylintrc ├── .relint.yml ├── Dockerfile ├── Dockerfile-yarn ├── LICENSE ├── Makefile ├── Readme.md ├── book_data ├── 2019-01-01.csv ├── BroadcastLogs_2018_Q3_M8-SAMPLE.csv ├── Periodic_Table_Of_Elements.csv ├── ReferenceTables │ ├── BroadcastProducers.csv │ ├── CD_AirLanguage.csv │ ├── CD_AudienceTargetAge.csv │ ├── CD_AudienceTargetEthnic.csv │ ├── CD_BroadcastOriginPoint.csv │ ├── CD_Category.csv │ ├── CD_ClosedCaption.csv │ ├── CD_Composition.csv │ ├── CD_CountryOfOrigin.csv │ ├── CD_DubDramaCredit.csv │ ├── CD_EthnicProgram.csv │ ├── CD_Exhibition.csv │ ├── CD_FilmClassification.csv │ ├── CD_NetworkAffiliation.csv │ ├── CD_ProductionSource.csv │ ├── CD_ProgramClass.csv │ ├── CD_SpecialAttention.csv │ ├── Call_Signs.csv │ └── LogIdentifier.csv ├── pokedex.dsv ├── pride-and-prejudice.txt ├── sample.csv ├── sample_frame.csv ├── shows-breaking-bad.json ├── shows-silicon-valley.json └── shows-the-golden-girls.json ├── conf └── spark-defaults.conf ├── docker-compose.yarn.yml ├── docker-compose.yml ├── entrypoint-yarn.sh ├── entrypoint.sh ├── requirements ├── requirements.in └── requirements.txt ├── spark_apps └── data_analysis_book │ ├── chapter02 │ ├── ex2.py │ └── word_non_null.py │ ├── chapter03 │ ├── ex3_3.py │ ├── ex3_4.py │ ├── ex3_5.py │ ├── ex3_5_2.py │ ├── word_non_null.py │ ├── word_non_null_short.py │ └── word_non_null_short_multiple_files.py │ ├── chapter04 │ ├── broadcast_logs.py │ ├── broadcast_logs_new_column.py │ ├── broadcast_logs_stats.py │ ├── broadcast_logs_tidy.py │ ├── broadcast_logs_unpacking.py │ ├── ex4_1.py │ ├── ex4_3.py │ ├── ex4_4.py │ └── tabular_data.py │ ├── chapter05 │ ├── broadcast_logs.py │ ├── broadcast_logs_naming.py │ ├── ex5.py │ ├── ex5_5.py │ ├── ex5_6.py │ └── ex5_7.py │ ├── chapter06 │ ├── defining_schema.py │ ├── defining_schema_json.py │ ├── ex6.py │ ├── ex6_6.py │ ├── ex6_7.py │ ├── ex6_8.py │ ├── reading_json.py │ ├── reading_json_explode_collect.py │ └── reading_json_struct.py │ └── chapter07 │ ├── backblaze.py │ ├── blending_sql_python.py │ ├── creating_view.py │ ├── download_backblaze_data.py │ ├── ex7_2.py │ ├── ex7_3.py │ ├── ex7_4.py │ ├── ex7_5.py │ ├── periodic_table.py │ ├── spark_catalog.py │ ├── sql_querying.py │ └── subquery_cte.py ├── ssh_config └── yarn ├── capacity-scheduler.xml ├── core-site.xml ├── hdfs-site.xml ├── mapred-site.xml ├── spark-defaults.conf └── yarn-site.xml /.env.spark: -------------------------------------------------------------------------------- 1 | SPARK_NO_DAEMONIZE=true 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .DS_Store 3 | book_data/results 4 | book_data/current 5 | book_data/broadcast_logs/ReferenceTables 6 | book_data/broadcast_logs 7 | book_data/elements 8 | book_data/gsod_noaa 9 | book_data/gutenberg_books 10 | book_data/list_of_numbers 11 | book_data/recipes 12 | book_data/window 13 | book_data/shows 14 | /backup/ 15 | /docker-compose.generated.yml 16 | /yarn-generated/ 17 | /book_data/backblaze_data/ 18 | -------------------------------------------------------------------------------- /.isort.cfg: -------------------------------------------------------------------------------- 1 | [settings] 2 | known_third_party =numpy,py4j,pyspark,wget 3 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: 
-------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3 3 | 4 | repos: 5 | - repo: https://github.com/pre-commit/pre-commit-hooks 6 | rev: v4.3.0 7 | hooks: 8 | - id: trailing-whitespace 9 | exclude: ^.*\.md$ 10 | - id: end-of-file-fixer 11 | - id: debug-statements 12 | - id: mixed-line-ending 13 | args: [--fix=lf] 14 | - id: detect-private-key 15 | - id: check-merge-conflict 16 | 17 | - repo: https://github.com/jorisroovers/gitlint 18 | rev: v0.17.0 19 | hooks: 20 | - id: gitlint 21 | 22 | - repo: https://github.com/asottile/seed-isort-config 23 | rev: v2.2.0 24 | hooks: 25 | - id: seed-isort-config 26 | 27 | - repo: https://github.com/timothycrosley/isort 28 | rev: 5.10.1 29 | hooks: 30 | - id: isort 31 | args: [ "--profile", "black" ] 32 | # extra dependencies for config in pyproject.toml 33 | additional_dependencies: ["toml"] 34 | 35 | - repo: https://github.com/ambv/black 36 | rev: 22.8.0 37 | hooks: 38 | - id: black 39 | 40 | - repo: https://github.com/pre-commit/mirrors-pylint 41 | rev: v3.0.0a5 42 | hooks: 43 | - id: pylint 44 | exclude: ^(docs/).*$ 45 | args: ["--disable=import-error,no-name-in-module"] 46 | 47 | - repo: https://github.com/PyCQA/pydocstyle 48 | rev: 6.1.1 49 | hooks: 50 | - id: pydocstyle 51 | 52 | - repo: https://github.com/codingjoe/relint 53 | rev: 1.4.0 54 | hooks: 55 | - id: relint 56 | -------------------------------------------------------------------------------- /.pydocstyle: -------------------------------------------------------------------------------- 1 | [pydocstyle] 2 | # D100-D107 ignore missing docstrings 3 | # D203 disabled in favor of D211 4 | # D213 disabled in favor of D212 5 | ignore = D100,D101,D102,D103,D104,D105,D106,D107,D203,D213 6 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MASTER] 2 | # jobs=0 means 'use all CPUs' 3 | jobs=0 4 | 5 | [MESSAGES CONTROL] 6 | disable = 7 | abstract-method, 8 | bare-except, 9 | broad-except, 10 | fixme, 11 | import-error, 12 | invalid-name, 13 | line-too-long, 14 | locally-disabled, 15 | missing-docstring, 16 | no-member, 17 | no-name-in-module, 18 | no-value-for-parameter, 19 | protected-access, 20 | redefined-outer-name, 21 | consider-using-f-string, 22 | too-few-public-methods, 23 | ungrouped-imports, 24 | unused-argument, 25 | unspecified-encoding, 26 | wrong-import-order, 27 | wrong-import-position, 28 | pointless-statement, 29 | chained-comparison 30 | 31 | [REPORTS] 32 | output-format=colorized 33 | 34 | [FORMAT] 35 | logging-modules= 36 | logging, 37 | structlog, 38 | -------------------------------------------------------------------------------- /.relint.yml: -------------------------------------------------------------------------------- 1 | - name: Fix it now 2 | pattern: "[fF][iI][xX][mM][eE]" 3 | filePattern: .*\.py 4 | - name: No sys.path changes 5 | pattern: "sys\\.path\\.append|sys\\.path\\.insert" 6 | filePattern: .*\.py 7 | - name: IPython debug leftover 8 | pattern: "IPython\\.embed\\(\\)" 9 | filePattern: .*\.py 10 | - name: Leftover print 11 | pattern: "print\\(" 12 | filePattern: .*\.py 13 | - name: Unpinned requirement 14 | pattern: "^\\w+\\s" 15 | filePattern: .*requirements\.txt 16 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM
python:3.10-bullseye as spark-base 2 | 3 | ARG SPARK_VERSION=3.3.3 4 | 5 | # Install tools required by the OS 6 | RUN apt-get update && \ 7 | apt-get install -y --no-install-recommends \ 8 | sudo \ 9 | curl \ 10 | vim \ 11 | unzip \ 12 | rsync \ 13 | openjdk-11-jdk \ 14 | build-essential \ 15 | software-properties-common \ 16 | ssh && \ 17 | apt-get clean && \ 18 | rm -rf /var/lib/apt/lists/* 19 | 20 | 21 | # Setup the directories for our Spark and Hadoop installations 22 | ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"} 23 | ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"} 24 | 25 | RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} 26 | WORKDIR ${SPARK_HOME} 27 | 28 | # Download and install Spark 29 | RUN curl https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \ 30 | && tar xvzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \ 31 | && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz 32 | 33 | 34 | FROM spark-base as pyspark 35 | 36 | # Install python deps 37 | COPY requirements/requirements.txt . 38 | RUN pip3 install -r requirements.txt 39 | 40 | # Setup Spark related environment variables 41 | ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}" 42 | ENV SPARK_MASTER="spark://spark-master:7077" 43 | ENV SPARK_MASTER_HOST spark-master 44 | ENV SPARK_MASTER_PORT 7077 45 | ENV PYSPARK_PYTHON python3 46 | 47 | # Copy the default configurations into $SPARK_HOME/conf 48 | COPY conf/spark-defaults.conf "$SPARK_HOME/conf" 49 | 50 | RUN chmod u+x /opt/spark/sbin/* && \ 51 | chmod u+x /opt/spark/bin/* 52 | 53 | ENV PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH 54 | 55 | # Copy appropriate entrypoint script 56 | COPY entrypoint.sh . 57 | RUN chmod +x entrypoint.sh 58 | 59 | ENTRYPOINT ["./entrypoint.sh"] 60 | -------------------------------------------------------------------------------- /Dockerfile-yarn: -------------------------------------------------------------------------------- 1 | FROM python:3.10-bullseye as spark-base 2 | 3 | ARG SPARK_VERSION=3.3.1 4 | ARG HADOOP_VERSION=3.3.4 5 | 6 | 7 | # Install tools required by the OS 8 | RUN apt-get update && \ 9 | apt-get install -y --no-install-recommends \ 10 | sudo \ 11 | curl \ 12 | vim \ 13 | unzip \ 14 | rsync \ 15 | openjdk-11-jdk \ 16 | build-essential \ 17 | software-properties-common \ 18 | ssh && \ 19 | apt-get clean && \ 20 | rm -rf /var/lib/apt/lists/* 21 | 22 | # Setup the directories for our Spark and Hadoop installations 23 | ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"} 24 | ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"} 25 | 26 | RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} 27 | WORKDIR ${SPARK_HOME} 28 | 29 | # Download and install Spark 30 | RUN curl https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \ 31 | && tar xvzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \ 32 | && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz 33 | 34 | # Download and install Hadoop 35 | RUN curl https://dlcdn.apache.org/hadoop/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz -o hadoop-${HADOOP_VERSION}-bin.tar.gz \ 36 | && tar xfz hadoop-${HADOOP_VERSION}-bin.tar.gz --directory /opt/hadoop --strip-components 1 \ 37 | && rm -rf hadoop-${HADOOP_VERSION}-bin.tar.gz 38 | 39 | 40 | FROM spark-base as pyspark 41 | 42 | # Install python deps 43 | COPY requirements/requirements.txt . 
44 | RUN pip3 install -r requirements.txt 45 | 46 | # Set JAVA_HOME environment variable 47 | ENV JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64" 48 | 49 | # Add the Spark and Hadoop bin and sbin to the PATH variable. 50 | # Also add $JAVA_HOME/bin to the PATH 51 | ENV PATH="$SPARK_HOME/sbin:/opt/spark/bin:${PATH}" 52 | ENV PATH="$HADOOP_HOME/bin:$HADOOP_HOME/sbin:${PATH}" 53 | ENV PATH="${PATH}:${JAVA_HOME}/bin" 54 | 55 | # Setup Spark related environment variables 56 | ENV SPARK_MASTER="spark://spark-yarn-master:7077" 57 | ENV SPARK_MASTER_HOST spark-yarn-master 58 | ENV SPARK_MASTER_PORT 7077 59 | ENV PYSPARK_PYTHON python3 60 | ENV HADOOP_CONF_DIR="$HADOOP_HOME/etc/hadoop" 61 | 62 | # Add Hadoop native library path to the dynamic link library path 63 | ENV LD_LIBRARY_PATH="$HADOOP_HOME/lib/native:${LD_LIBRARY_PATH}" 64 | 65 | # Set the user for HDFS and Yarn (running as root is not recommended in production) 66 | ENV HDFS_NAMENODE_USER="root" 67 | ENV HDFS_DATANODE_USER="root" 68 | ENV HDFS_SECONDARYNAMENODE_USER="root" 69 | ENV YARN_RESOURCEMANAGER_USER="root" 70 | ENV YARN_NODEMANAGER_USER="root" 71 | 72 | # Add JAVA_HOME to hadoop-env.sh 73 | RUN echo "export JAVA_HOME=${JAVA_HOME}" >> "$HADOOP_HOME/etc/hadoop/hadoop-env.sh" 74 | 75 | # Copy the configuration files to their appropriate locations 76 | COPY yarn/spark-defaults.conf "$SPARK_HOME/conf/" 77 | COPY yarn/*.xml "$HADOOP_HOME/etc/hadoop/" 78 | 79 | # Make the binaries and scripts executable and set the PYTHONPATH environment variable 80 | RUN chmod u+x /opt/spark/sbin/* && \ 81 | chmod u+x /opt/spark/bin/* 82 | 83 | ENV PYTHONPATH=$SPARK_HOME/python/:$PYTHONPATH 84 | #ENV PYTHONPATH=$SPARK_HOME/python/lib/py4j-0.10.9.5-src.zip:$PYTHONPATH 85 | 86 | RUN ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa && \ 87 | cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \ 88 | chmod 600 ~/.ssh/authorized_keys 89 | 90 | COPY ssh_config /root/.ssh/config 91 | 92 | # Copy appropriate entrypoint script 93 | COPY entrypoint-yarn.sh entrypoint.sh 94 | 95 | EXPOSE 22 96 | 97 | ENTRYPOINT ["./entrypoint.sh"] 98 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Marin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | docker compose build 3 | 4 | build-yarn: 5 | docker compose -f docker-compose.yarn.yml build 6 | 7 | build-yarn-nc: 8 | docker compose -f docker-compose.yarn.yml build --no-cache 9 | 10 | build-nc: 11 | docker compose build --no-cache 12 | 13 | build-progress: 14 | docker compose build --no-cache --progress=plain 15 | 16 | down: 17 | docker compose down --volumes --remove-orphans 18 | 19 | down-yarn: 20 | docker compose -f docker-compose.yarn.yml down --volumes --remove-orphans 21 | 22 | run: 23 | make down && docker compose up 24 | 25 | run-scaled: 26 | make down && docker compose up --scale spark-worker=3 27 | 28 | run-d: 29 | make down && docker compose up -d 30 | 31 | run-yarn: 32 | make down-yarn && docker compose -f docker-compose.yarn.yml up 33 | 34 | run-yarn-scaled: 35 | make down-yarn && docker compose -f docker-compose.yarn.yml up --scale spark-yarn-worker=3 36 | 37 | stop: 38 | docker compose stop 39 | 40 | stop-yarn: 41 | docker compose -f docker-compose.yarn.yml stop 42 | 43 | 44 | submit: 45 | docker exec da-spark-master spark-submit --master spark://spark-master:7077 --deploy-mode client ./apps/$(app) 46 | 47 | submit-da-book: 48 | make submit app=data_analysis_book/$(app) 49 | 50 | submit-yarn-test: 51 | docker exec da-spark-yarn-master spark-submit --master yarn --deploy-mode cluster ./examples/src/main/python/pi.py 52 | 53 | submit-yarn-cluster: 54 | docker exec da-spark-yarn-master spark-submit --master yarn --deploy-mode cluster ./apps/$(app) 55 | 56 | rm-results: 57 | rm -r book_data/results/* 58 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # About the repo 2 | 3 | I started this repo because I wanted to learn PySpark. 4 | However, I didn't want to use a Jupyter notebook, as 5 | is typically the case in the examples I came across. 6 | 7 | Therefore, I started by setting up a Spark cluster 8 | using Docker. 9 | 10 | # Running the code (Spark standalone cluster) 11 | You can run the Spark standalone cluster by running: 12 | ```shell 13 | make run 14 | ``` 15 | or with 3 workers using: 16 | ```shell 17 | make run-scaled 18 | ``` 19 | You can submit Python jobs with the command: 20 | ```shell 21 | make submit app=dir/relative/to/spark_apps/dir 22 | ``` 23 | e.g. 24 | ```shell 25 | make submit app=data_analysis_book/chapter03/word_non_null.py 26 | ``` 27 | 28 | There are a number of commands for building the standalone cluster; 29 | check the Makefile to see them all. The 30 | simplest one is: 31 | ```shell 32 | make build 33 | ``` 34 | 35 | ## Web UIs 36 | The master node's web UI can be accessed at 37 | `localhost:9090`. 38 | The Spark history server is accessible at 39 | `localhost:18080`. 40 | 41 | 42 | # Running the code (Spark on Hadoop Yarn cluster) 43 | Before running, check the virtual disk size that Docker 44 | assigns to the container. In my case, I needed to assign 45 | about 70 GB. 46 | You can run Spark on the Hadoop Yarn cluster by running: 47 | ```shell 48 | make run-yarn 49 | ``` 50 | or with 3 data nodes: 51 | ```shell 52 | make run-yarn-scaled 53 | ``` 54 | You can submit an example job to test the setup: 55 | ```shell 56 | make submit-yarn-test 57 | ``` 58 | which will submit the `pi.py` example in cluster mode.
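Under the hood, the `submit-yarn-test` target simply runs `spark-submit` inside the master container (see the `submit-yarn-test` target in the Makefile):
```shell
docker exec da-spark-yarn-master spark-submit --master yarn --deploy-mode cluster ./examples/src/main/python/pi.py
```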
59 | 60 | You can also submit a custom job: 61 | ```shell 62 | make submit-yarn-cluster app=data_analysis_book/chapter03/word_non_null.py 63 | ``` 64 | 65 | There are a number of commands for building the cluster; 66 | check the Makefile to see them all. The 67 | simplest one is: 68 | ```shell 69 | make build-yarn 70 | ``` 71 | 72 | ## Web UIs 73 | You can access different web UIs. The one I found the most 74 | useful is the NameNode UI: 75 | ```shell 76 | http://localhost:9870 77 | ``` 78 | 79 | Other UIs: 80 | - ResourceManager - `localhost:8088` 81 | - Spark history server - `localhost:18080` 82 | 83 | # About the branch expose-docker-hostnames-to-host 84 | The branch expose-docker-hostnames-to-host contains the 85 | shell scripts, templates, and Makefile modifications 86 | required to expose the worker node web interfaces to the host. To run 87 | the cluster, do the following: 88 | 1. Run 89 | ```shell 90 | make run-ag n=3 91 | ``` 92 | which generates a Docker Compose file with 3 worker 93 | nodes and the appropriate yarn and hdfs site files 94 | in the yarn-generated folder. 95 | 2. Register the Docker hostnames in /etc/hosts: 96 | ```shell 97 | sudo make dns-modify o=true 98 | ``` 99 | which creates a backup folder containing your original 100 | hosts file. 101 | 102 | Once you are done and terminate the cluster, restore 103 | your original hosts file with: 104 | ```shell 105 | sudo make dns-restore 106 | ``` 107 | 108 | For more information, read the story I published on Medium 109 | [here](https://medium.com/@MarinAgli1/using-hostnames-to-access-hadoop-resources-running-on-docker-5860cd7aeec1). 110 | 111 | 112 | # Stories published on Medium 113 | 1. Setting up a standalone Spark cluster can be found [here](https://medium.com/@MarinAgli1/setting-up-a-spark-standalone-cluster-on-docker-in-layman-terms-8cbdc9fdd14b). 114 | 2. Setting up Hadoop Yarn to run Spark applications can be found [here](https://medium.com/@MarinAgli1/setting-up-hadoop-yarn-to-run-spark-applications-6ea1158287af). 115 | 3. Using hostnames to access Hadoop resources can be found [here](https://medium.com/@MarinAgli1/using-hostnames-to-access-hadoop-resources-running-on-docker-5860cd7aeec1). 116 | 117 | # About the book_data directory 118 | The official repo of the book Data Analysis with Python and 119 | PySpark can be found here: https://github.com/jonesberg/DataAnalysisWithPythonAndPySpark. 120 | 121 | I did not include the files from that repo, as some of the 122 | files are larger than 50 MB, which is GitHub's limit. At the 123 | time of writing, the repo contains a link to Dropbox which 124 | contains the files. I suggest you download them. 125 | 126 | The book_data directory contains some files found in 127 | this repo: 128 | https://github.com/maprihoda/data-analysis-with-python-and-pyspark, 129 | which is also a repo of someone who read the mentioned book.
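# A minimal PySpark job
The jobs submitted with `make submit` are plain Python scripts that create their own `SparkSession`; the master URL and deploy mode are supplied by `spark-submit` in the Makefile targets. Below is a minimal sketch in the spirit of the chapter 3 word-count examples; the app name and the input path are assumptions, so adjust the path to wherever the book_data directory is mounted inside your containers:
```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# spark-submit supplies the master URL, so only an app name is set here.
spark = SparkSession.builder.appName("word_count_sketch").getOrCreate()

# Assumed path: point this to wherever book_data is mounted in the container.
lines = spark.read.text("./data/pride-and-prejudice.txt")

# Split each line on spaces, drop empty strings, and count word occurrences.
words = lines.select(F.explode(F.split(F.col("value"), " ")).alias("word"))
counts = (
    words.where(F.col("word") != "")
    .groupBy("word")
    .count()
    .orderBy("count", ascending=False)
)

counts.show(10)
spark.stop()
```
Saved under spark_apps/, such a script can be submitted with `make submit app=relative/path/to/script.py`, or with `make submit-yarn-cluster` on the Yarn setup.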
130 | -------------------------------------------------------------------------------- /book_data/Periodic_Table_Of_Elements.csv: -------------------------------------------------------------------------------- 1 | AtomicNumber,Element,Symbol,AtomicMass,NumberofNeutrons,NumberofProtons,NumberofElectrons,Period,Group,Phase,Radioactive,Natural,Metal,Nonmetal,Metalloid,Type,AtomicRadius,Electronegativity,FirstIonization,Density,MeltingPoint,BoilingPoint,NumberOfIsotopes,Discoverer,Year,SpecificHeat,NumberofShells,NumberofValence 2 | 1,Hydrogen,H,1.007,0,1,1,1,1,gas,,yes,,yes,,Nonmetal,0.79,2.2,13.5984,8.99E-05,14.175,20.28,3,Cavendish,1766,14.304,1,1 3 | 2,Helium,He,4.002,2,2,2,1,18,gas,,yes,,yes,,Noble Gas,0.49,,24.5874,1.79E-04,,4.22,5,Janssen,1868,5.193,1, 4 | 3,Lithium,Li,6.941,4,3,3,2,1,solid,,yes,yes,,,Alkali Metal,2.1,0.98,5.3917,5.34E-01,453.85,1615,5,Arfvedson,1817,3.582,2,1 5 | 4,Beryllium,Be,9.012,5,4,4,2,2,solid,,yes,yes,,,Alkaline Earth Metal,1.4,1.57,9.3227,1.85E+00,1560.15,2742,6,Vaulquelin,1798,1.825,2,2 6 | 5,Boron,B,10.811,6,5,5,2,13,solid,,yes,,,yes,Metalloid,1.2,2.04,8.298,2.34E+00,2573.15,4200,6,Gay-Lussac,1808,1.026,2,3 7 | 6,Carbon,C,12.011,6,6,6,2,14,solid,,yes,,yes,,Nonmetal,0.91,2.55,11.2603,2.27E+00,3948.15,4300,7,Prehistoric,,0.709,2,4 8 | 7,Nitrogen,N,14.007,7,7,7,2,15,gas,,yes,,yes,,Nonmetal,0.75,3.04,14.5341,1.25E-03,63.29,77.36,8,Rutherford,1772,1.04,2,5 9 | 8,Oxygen,O,15.999,8,8,8,2,16,gas,,yes,,yes,,Nonmetal,0.65,3.44,13.6181,1.43E-03,50.5,90.2,8,Priestley/Scheele,1774,0.918,2,6 10 | 9,Fluorine,F,18.998,10,9,9,2,17,gas,,yes,,yes,,Halogen,0.57,3.98,17.4228,1.70E-03,53.63,85.03,6,Moissan,1886,0.824,2,7 11 | 10,Neon,Ne,20.18,10,10,10,2,18,gas,,yes,,yes,,Noble Gas,0.51,,21.5645,9.00E-04,24.703,27.07,8,Ramsay and Travers,1898,1.03,2,8 12 | 11,Sodium,Na,22.99,12,11,11,3,1,solid,,yes,yes,,,Alkali Metal,2.2,0.93,5.1391,9.71E-01,371.15,1156,7,Davy,1807,1.228,3,1 13 | 12,Magnesium,Mg,24.305,12,12,12,3,2,solid,,yes,yes,,,Alkaline Earth Metal,1.7,1.31,7.6462,1.74E+00,923.15,1363,8,Black,1755,1.023,3,2 14 | 13,Aluminum,Al,26.982,14,13,13,3,13,solid,,yes,yes,,,Metal,1.8,1.61,5.9858,2.70E+00,933.4,2792,8,Wshler,1827,0.897,3,3 15 | 14,Silicon,Si,28.086,14,14,14,3,14,solid,,yes,,,yes,Metalloid,1.5,1.9,8.1517,2.33E+00,1683.15,3538,8,Berzelius,1824,0.705,3,4 16 | 15,Phosphorus,P,30.974,16,15,15,3,15,solid,,yes,,yes,,Nonmetal,1.2,2.19,10.4867,1.82E+00,317.25,553,7,BranBrand,1669,0.769,3,5 17 | 16,Sulfur,S,32.065,16,16,16,3,16,solid,,yes,,yes,,Nonmetal,1.1,2.58,10.36,2.07E+00,388.51,717.8,10,Prehistoric,,0.71,3,6 18 | 17,Chlorine,Cl,35.453,18,17,17,3,17,gas,,yes,,yes,,Halogen,0.97,3.16,12.9676,3.21E-03,172.31,239.11,11,Scheele,1774,0.479,3,7 19 | 18,Argon,Ar,39.948,22,18,18,3,18,gas,,yes,,yes,,Noble Gas,0.88,,15.7596,1.78E-03,83.96,87.3,8,Rayleigh and Ramsay,1894,0.52,3,8 20 | 19,Potassium,K,39.098,20,19,19,4,1,solid,,yes,yes,,,Alkali Metal,2.8,0.82,4.3407,8.62E-01,336.5,1032,10,Davy,1807,0.757,4,1 21 | 20,Calcium,Ca,40.078,20,20,20,4,2,solid,,yes,yes,,,Alkaline Earth Metal,2.2,1,6.1132,1.54E+00,1112.15,1757,14,Davy,1808,0.647,4,2 22 | 21,Scandium,Sc,44.956,24,21,21,4,3,solid,,yes,yes,,,Transition Metal,2.1,1.36,6.5615,2.99E+00,1812.15,3109,15,Nilson,1878,0.568,4, 23 | 22,Titanium,Ti,47.867,26,22,22,4,4,solid,,yes,yes,,,Transition Metal,2,1.54,6.8281,4.54E+00,1933.15,3560,9,Gregor,1791,0.523,4, 24 | 23,Vanadium,V,50.942,28,23,23,4,5,solid,,yes,yes,,,Transition Metal,1.9,1.63,6.7462,6.11E+00,2175.15,3680,9, del Rio,1801,0.489,4, 25 | 
24,Chromium,Cr,51.996,28,24,24,4,6,solid,,yes,yes,,,Transition Metal,1.9,1.66,6.7665,7.15E+00,2130.15,2944,9,Vauquelin,1797,0.449,4, 26 | 25,Manganese,Mn,54.938,30,25,25,4,7,solid,,yes,yes,,,Transition Metal,1.8,1.55,7.434,7.44E+00,1519.15,2334,11,"Gahn, Scheele",1774,0.479,4, 27 | 26,Iron,Fe,55.845,30,26,26,4,8,solid,,yes,yes,,,Transition Metal,1.7,1.83,7.9024,7.87E+00,1808.15,3134,10,Prehistoric,,0.449,4, 28 | 27,Cobalt,Co,58.933,32,27,27,4,9,solid,,yes,yes,,,Transition Metal,1.7,1.88,7.881,8.86E+00,1768.15,3200,14,Brandt,1735,0.421,4, 29 | 28,Nickel,Ni,58.693,31,28,28,4,10,solid,,yes,yes,,,Transition Metal,1.6,1.91,7.6398,8.91E+00,1726.15,3186,11,Cronstedt,1751,0.444,4, 30 | 29,Copper,Cu,63.546,35,29,29,4,11,solid,,yes,yes,,,Transition Metal,1.6,1.9,7.7264,8.96E+00,1357.75,2835,11,Prehistoric,,0.385,4, 31 | 30,Zinc,Zn,65.38,35,30,30,4,12,solid,,yes,yes,,,Transition Metal,1.5,1.65,9.3942,7.13E+00,692.88,1180,15,Prehistoric,,0.388,4, 32 | 31,Gallium,Ga,69.723,39,31,31,4,13,solid,,yes,yes,,,Metal,1.8,1.81,5.9993,5.91E+00,302.91,2477,14,de Boisbaudran,1875,0.371,4,3 33 | 32,Germanium,Ge,72.64,41,32,32,4,14,solid,,yes,,,yes,Metalloid,1.5,2.01,7.8994,5.32E+00,1211.45,3106,17,Winkler,1886,0.32,4,4 34 | 33,Arsenic,As,74.922,42,33,33,4,15,solid,,yes,,,yes,Metalloid,1.3,2.18,9.7886,5.78E+00,1090.15,887,14,Albertus Magnus,1250,0.329,4,5 35 | 34,Selenium,Se,78.96,45,34,34,4,16,solid,,yes,,yes,,Nonmetal,1.2,2.55,9.7524,4.81E+00,494.15,958,20,Berzelius,1817,0.321,4,6 36 | 35,Bromine,Br,79.904,45,35,35,4,17,liq,,yes,,yes,,Halogen,1.1,2.96,11.8138,3.12E+00,266.05,332,19,Balard,1826,0.474,4,7 37 | 36,Krypton,Kr,83.798,48,36,36,4,18,gas,,yes,,yes,,Noble Gas,1,,13.9996,3.73E-03,115.93,119.93,23,Ramsay and Travers,1898,0.248,4,8 38 | 37,Rubidium,Rb,85.468,48,37,37,5,1,solid,,yes,yes,,,Alkali Metal,3,0.82,4.1771,1.53E+00,312.79,961,20,Bunsen and Kirchoff,1861,0.363,5,1 39 | 38,Strontium,Sr,87.62,50,38,38,5,2,solid,,yes,yes,,,Alkaline Earth Metal,2.5,0.95,5.6949,2.64E+00,1042.15,1655,18,Davy,1808,0.301,5,2 40 | 39,Yttrium,Y,88.906,50,39,39,5,3,solid,,yes,yes,,,Transition Metal,2.3,1.22,6.2173,4.47E+00,1799.15,3609,21,Gadolin,1794,0.298,5, 41 | 40,Zirconium,Zr,91.224,51,40,40,5,4,solid,,yes,yes,,,Transition Metal,2.2,1.33,6.6339,6.51E+00,2125.15,4682,20,Klaproth,1789,0.278,5, 42 | 41,Niobium,Nb,92.906,52,41,41,5,5,solid,,yes,yes,,,Transition Metal,2.1,1.6,6.7589,8.57E+00,2741.15,5017,24,Hatchett,1801,0.265,5, 43 | 42,Molybdenum,Mo,95.96,54,42,42,5,6,solid,,yes,yes,,,Transition Metal,2,2.16,7.0924,1.02E+01,2890.15,4912,20,Scheele,1778,0.251,5, 44 | 43,Technetium,Tc,98,55,43,43,5,7,artificial,yes,,yes,,,Transition Metal,2,1.9,7.28,1.15E+01,2473.15,5150,23,Perrier and Segrè,1937,,5, 45 | 44,Ruthenium,Ru,101.07,57,44,44,5,8,solid,,yes,yes,,,Transition Metal,1.9,2.2,7.3605,1.24E+01,2523.15,4423,16,Klaus,1844,0.238,5, 46 | 45,Rhodium,Rh,102.906,58,45,45,5,9,solid,,yes,yes,,,Transition Metal,1.8,2.28,7.4589,1.24E+01,2239.15,3968,20,Wollaston,1803,0.243,5, 47 | 46,Palladium,Pd,106.42,60,46,46,5,10,solid,,yes,yes,,,Transition Metal,1.8,2.2,8.3369,1.20E+01,1825.15,3236,21,Wollaston,1803,0.244,5, 48 | 47,Silver,Ag,107.868,61,47,47,5,11,solid,,yes,yes,,,Transition Metal,1.8,1.93,7.5762,1.05E+01,1234.15,2435,27,Prehistoric,,0.235,5, 49 | 48,Cadmium,Cd,112.411,64,48,48,5,12,solid,,yes,yes,,,Transition Metal,1.7,1.69,8.9938,8.69E+00,594.33,1040,22,Stromeyer,1817,0.232,5, 50 | 49,Indium,In,114.818,66,49,49,5,13,solid,,yes,yes,,,Metal,2,1.78,5.7864,7.31E+00,429.91,2345,34,Reich and Richter,1863,0.233,5,3 51 |
50,Tin,Sn,118.71,69,50,50,5,14,solid,,yes,yes,,,Metal,1.7,1.96,7.3439,7.29E+00,505.21,2875,28,Prehistoric,,0.228,5,4 52 | 51,Antimony,Sb,121.76,71,51,51,5,15,solid,,yes,,,yes,Metalloid,1.5,2.05,8.6084,6.69E+00,904.05,1860,29,Early historic times,,0.207,5,5 53 | 52,Tellurium,Te,127.6,76,52,52,5,16,solid,,yes,,,yes,Metalloid,1.4,2.1,9.0096,6.23E+00,722.8,1261,29,von Reichenstein,1782,0.202,5,6 54 | 53,Iodine,I,126.904,74,53,53,5,17,solid,,yes,,yes,,Halogen,1.3,2.66,10.4513,4.93E+00,386.65,457.4,24,Courtois,1811,0.214,5,7 55 | 54,Xenon,Xe,131.293,77,54,54,5,18,gas,,yes,,yes,,Noble Gas,1.2,,12.1298,5.89E-03,161.45,165.03,31,Ramsay and Travers,1898,0.158,5,8 56 | 55,Cesium,Cs,132.905,78,55,55,6,1,solid,,yes,yes,,,Alkali Metal,3.3,0.79,3.8939,1.87E+00,301.7,944,22,Bunsen and Kirchoff,1860,0.242,6,1 57 | 56,Barium,Ba,137.327,81,56,56,6,2,solid,,yes,yes,,,Alkaline Earth Metal,2.8,0.89,5.2117,3.59E+00,1002.15,2170,25,Davy,1808,0.204,6,2 58 | 57,Lanthanum,La,138.905,82,57,57,6,3,solid,,yes,yes,,,Lanthanide,2.7,1.1,5.5769,6.15E+00,1193.15,3737,19,Mosander,1839,0.195,6, 59 | 58,Cerium,Ce,140.116,82,58,58,6,,solid,,yes,yes,,,Lanthanide,2.7,1.12,5.5387,6.77E+00,1071.15,3716,19,Berzelius,1803,0.192,6, 60 | 59,Praseodymium,Pr,140.908,82,59,59,6,,solid,,yes,yes,,,Lanthanide,2.7,1.13,5.473,6.77E+00,1204.15,3793,15,von Welsbach,1885,0.193,6, 61 | 60,Neodymium,Nd,144.242,84,60,60,6,,solid,,yes,yes,,,Lanthanide,2.6,1.14,5.525,7.01E+00,1289.15,3347,16,von Welsbach,1885,0.19,6, 62 | 61,Promethium,Pm,145,84,61,61,6,,artificial,yes,,yes,,,Lanthanide,2.6,1.13,5.582,7.26E+00,1204.15,3273,14,Marinsky et al.,1945,,6, 63 | 62,Samarium,Sm,150.36,88,62,62,6,,solid,,yes,yes,,,Lanthanide,2.6,1.17,5.6437,7.52E+00,1345.15,2067,17,Boisbaudran,1879,0.197,6, 64 | 63,Europium,Eu,151.964,89,63,63,6,,solid,,yes,yes,,,Lanthanide,2.6,1.2,5.6704,5.24E+00,1095.15,1802,21,Demarcay,1901,0.182,6, 65 | 64,Gadolinium,Gd,157.25,93,64,64,6,,solid,,yes,yes,,,Lanthanide,2.5,1.2,6.1501,7.90E+00,1585.15,3546,17,de Marignac,1880,0.236,6, 66 | 65,Terbium,Tb,158.925,94,65,65,6,,solid,,yes,yes,,,Lanthanide,2.5,1.2,5.8638,8.23E+00,1630.15,3503,24,Mosander,1843,0.182,6, 67 | 66,Dysprosium,Dy,162.5,97,66,66,6,,solid,,yes,yes,,,Lanthanide,2.5,1.22,5.9389,8.55E+00,1680.15,2840,21,de Boisbaudran,1886,0.17,6, 68 | 67,Holmium,Ho,164.93,98,67,67,6,,solid,,yes,yes,,,Lanthanide,2.5,1.23,6.0215,8.80E+00,1743.15,2993,29,Delafontaine and Soret,1878,0.165,6, 69 | 68,Erbium,Er,167.259,99,68,68,6,,solid,,yes,yes,,,Lanthanide,2.5,1.24,6.1077,9.07E+00,1795.15,3503,16,Mosander,1843,0.168,6, 70 | 69,Thulium,Tm,168.934,100,69,69,6,,solid,,yes,yes,,,Lanthanide,2.4,1.25,6.1843,9.32E+00,1818.15,2223,18,Cleve,1879,0.16,6, 71 | 70,Ytterbium,Yb,173.054,103,70,70,6,,solid,,yes,yes,,,Lanthanide,2.4,1.1,6.2542,6.97E+00,1097.15,1469,16,Marignac,1878,0.155,6, 72 | 71,Lutetium,Lu,174.967,104,71,71,6,,solid,,yes,yes,,,Lanthanide,2.3,1.27,5.4259,9.84E+00,1936.15,3675,22,Urbain/ von Welsbach,1907,0.154,6, 73 | 72,Hafnium,Hf,178.49,106,72,72,6,4,solid,,yes,yes,,,Transition Metal,2.2,1.3,6.8251,1.33E+01,2500.15,4876,17,Coster and von Hevesy,1923,0.144,6, 74 | 73,Tantalum,Ta,180.948,108,73,73,6,5,solid,,yes,yes,,,Transition Metal,2.1,1.5,7.5496,1.67E+01,3269.15,5731,19,Ekeberg,1801,0.14,6, 75 | 74,Wolfram,W,183.84,110,74,74,6,6,solid,,yes,yes,,,Transition Metal,2,2.36,7.864,1.93E+01,3680.15,5828,22,J. and F. 
d'Elhuyar,1783,0.132,6, 76 | 75,Rhenium,Re,186.207,111,75,75,6,7,solid,,yes,yes,,,Transition Metal,2,1.9,7.8335,2.10E+01,3453.15,5869,21,"Noddack, Berg, and Tacke",1925,0.137,6, 77 | 76,Osmium,Os,190.23,114,76,76,6,8,solid,,yes,yes,,,Transition Metal,1.9,2.2,8.4382,2.26E+01,3300.15,5285,19,Tennant,1803,0.13,6, 78 | 77,Iridium,Ir,192.217,115,77,77,6,9,solid,,yes,yes,,,Transition Metal,1.9,2.2,8.967,2.26E+01,2716.15,4701,25,Tennant,1804,0.131,6, 79 | 78,Platinum,Pt,195.084,117,78,78,6,10,solid,,yes,yes,,,Transition Metal,1.8,2.28,8.9587,2.15E+01,2045.15,4098,32,Ulloa/Wood,1735,0.133,6, 80 | 79,Gold,Au,196.967,118,79,79,6,11,solid,,yes,yes,,,Transition Metal,1.8,2.54,9.2255,1.93E+01,1337.73,3129,21,Prehistoric,,0.129,6, 81 | 80,Mercury,Hg,200.59,121,80,80,6,12,liq,,yes,yes,,,Transition Metal,1.8,2,10.4375,1.35E+01,234.43,630,26,Prehistoric,,0.14,6, 82 | 81,Thallium,Tl,204.383,123,81,81,6,13,solid,,yes,yes,,,Metal,2.1,2.04,6.1082,1.19E+01,577.15,1746,28,Crookes,1861,0.129,6,3 83 | 82,Lead,Pb,207.2,125,82,82,6,14,solid,,yes,yes,,,Metal,1.8,2.33,7.4167,1.13E+01,600.75,2022,29,Prehistoric,,0.129,6,4 84 | 83,Bismuth,Bi,208.98,126,83,83,6,15,solid,,yes,yes,,,Metal,1.6,2.02,7.2856,9.81E+00,544.67,1837,19,Geoffroy the Younger,1753,0.122,6,5 85 | 84,Polonium,Po,210,126,84,84,6,16,solid,yes,yes,,,yes,Metalloid,1.5,2,8.417,9.32E+00,527.15,1235,34,Curie,1898,,6,6 86 | 85,Astatine,At,210,125,85,85,6,17,solid,yes,yes,,yes,,Noble Gas,1.4,2.2,9.3,7.00E+00,575.15,610,21,Corson et al.,1940,,6,7 87 | 86,Radon,Rn,222,136,86,86,6,18,gas,yes,yes,yes,,,Alkali Metal,1.3,,10.7485,9.73E-03,202.15,211.3,20,Dorn,1900,0.094,6,8 88 | 87,Francium,Fr,223,136,87,87,7,1,solid,yes,yes,yes,,,Alkaline Earth Metal,,0.7,4.0727,1.87E+00,300.15,950,21,Perey,1939,,7,1 89 | 88,Radium,Ra,226,138,88,88,7,2,solid,yes,yes,yes,,,Actinide,,0.9,5.2784,5.50E+00,973.15,2010,15,Pierre and Marie Curie,1898,,7,2 90 | 89,Actinium,Ac,227,138,89,89,7,3,solid,yes,yes,yes,,,Actinide,,1.1,5.17,1.01E+01,1323.15,3471,11,Debierne/Giesel,1899,0.12,7, 91 | 90,Thorium,Th,232.038,142,90,90,7,,solid,yes,yes,yes,,,Actinide,,1.3,6.3067,1.17E+01,2028.15,5061,12,Berzelius,1828,0.113,7, 92 | 91,Protactinium,Pa,231.036,140,91,91,7,,solid,yes,yes,yes,,,Actinide,,1.5,5.89,1.54E+01,1873.15,4300,14,Hahn and Meitner,1917,,7, 93 | 92,Uranium,U,238.029,146,92,92,7,,solid,yes,yes,yes,,,Actinide,,1.38,6.1941,1.90E+01,1405.15,4404,15,Peligot,1841,0.116,7, 94 | 93,Neptunium,Np,237,144,93,93,7,,artificial,yes,,yes,,,Actinide,,1.36,6.2657,2.05E+01,913.15,4273,153,McMillan and Abelson,1940,,7, 95 | 94,Plutonium,Pu,244,150,94,94,7,,artificial,yes,,yes,,,Actinide,,1.28,6.0262,1.98E+01,913.15,3501,163,Seaborg et al.,1940,,7, 96 | 95,Americium,Am,243,148,95,95,7,,artificial,yes,,yes,,,Actinide,,1.3,5.9738,1.37E+01,1267.15,2880,133,Seaborg et al.,1944,,7, 97 | 96,Curium,Cm,247,151,96,96,7,,artificial,yes,,yes,,,Actinide,,1.3,5.9915,1.35E+01,1340.15,3383,133,Seaborg et al.,1944,,7, 98 | 97,Berkelium,Bk,247,150,97,97,7,,artificial,yes,,yes,,,Actinide,,1.3,6.1979,1.48E+01,1259.15,983,83,Seaborg et al.,1949,,7, 99 | 98,Californium,Cf,251,153,98,98,7,,artificial,yes,,yes,,,Actinide,,1.3,6.2817,1.51E+01,1925.15,1173,123,Seaborg et al.,1950,,7, 100 | 99,Einsteinium,Es,252,153,99,99,7,,artificial,yes,,yes,,,Actinide,,1.3,6.42,1.35E+01,1133.15,,123,Ghiorso et al.,1952,,7, 101 | 100,Fermium,Fm,257,157,100,100,7,,artificial,yes,,yes,,,Actinide,,1.3,6.5,,,,103,Ghiorso et al.,1953,,7, 102 | 101,Mendelevium,Md,258,157,101,101,7,,artificial,yes,,yes,,,Actinide,,1.3,6.58,,,,33,Ghiorso et al.,1955,,7, 
103 | 102,Nobelium,No,259,157,102,102,7,,artificial,yes,,yes,,,Actinide,,1.3,6.65,,,,73,Ghiorso et al.,1958,,7, 104 | 103,Lawrencium,Lr,262,159,103,103,7,,artificial,yes,,yes,,,Actinide,,,,,,,203,Ghiorso et al.,1961,,7, 105 | 104,Rutherfordium,Rf,261,157,104,104,7,4,artificial,yes,,yes,,,Transactinide,,,,1.81E+01,,,,Ghiorso et al.,1969,,7, 106 | 105,Dubnium,Db,262,157,105,105,7,5,artificial,yes,,yes,,,Transactinide,,,,3.90E+01,,,,Ghiorso et al.,1970,,7, 107 | 106,Seaborgium,Sg,266,160,106,106,7,6,artificial,yes,,yes,,,Transactinide,,,,3.50E+01,,,,Ghiorso et al.,1974,,7, 108 | 107,Bohrium,Bh,264,157,107,107,7,7,artificial,yes,,yes,,,Transactinide,,,,3.70E+01,,,,Armbruster and Münzenberg,1981,,7, 109 | 108,Hassium,Hs,267,159,108,108,7,8,artificial,yes,,yes,,,Transactinide,,,,4.10E+01,,,,Armbruster and Münzenberg,1983,,7, 110 | 109,Meitnerium,Mt,268,159,109,109,7,9,artificial,yes,,yes,,,Transactinide,,,,3.50E+01,,,,"GSI, Darmstadt, West Germany",1982,,7, 111 | 110,Darmstadtium ,Ds ,271,161,110,110,7,10,artificial,yes,,yes,,,Transactinide,,,,,,,,,1994,,7, 112 | 111,Roentgenium ,Rg ,272,161,111,111,7,11,artificial,yes,,yes,,,Transactinide,,,,,,,,,1994,,7, 113 | 112,Copernicium ,Cn ,285,173,112,112,7,12,artificial,yes,,yes,,,Transactinide,,,,,,,,,1996,,7, 114 | 113,Nihonium,Nh,284,171,113,113,7,13,artificial,yes,,yes,,,,,,,,,,,,2004,,7,3 115 | 114,Flerovium,Fl,289,175,114,114,7,14,artificial,yes,,yes,,,Transactinide,,,,,,,,,1999,,7,4 116 | 115,Moscovium,Mc,288,173,115,115,7,15,artificial,yes,,yes,,,,,,,,,,,,2010,,7,5 117 | 116,Livermorium,Lv,292,176,116,116,7,16,artificial,yes,,yes,,,Transactinide,,,,,,,,,2000,,7,6 118 | 117,Tennessine,Ts,295,178,117,117,7,17,artificial,yes,,,yes,,,,,,,,,,,2010,,7,7 119 | 118,Oganesson,Og,294,176,118,118,7,18,artificial,yes,,,yes,,Noble Gas,,,,,,,,,2006,,7,8 120 | -------------------------------------------------------------------------------- /book_data/ReferenceTables/BroadcastProducers.csv: -------------------------------------------------------------------------------- 1 | BroadcastProducersID|BroadcastLogID|ProducerNO 2 | -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_AirLanguage.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/pyspark-playground/7d1a176d939e7558d2160a280448412ee0e65a80/book_data/ReferenceTables/CD_AirLanguage.csv -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_AudienceTargetAge.csv: -------------------------------------------------------------------------------- 1 | AudienceTargetAgeID|AudienceTargetAgeCD|ActiveFG|FilterTvFG|FilterPayFG|EnglishDescription|FrenchDescription 2 | 1|1|1|1|1|CHILDREN - 0 TO 5 YEARS|ENFANTS - 0 A 5 ANS 3 | 2|2|1|1|1|CHILDREN - 6 TO 12 YEARS|ENFANTS - 6 A 12 ANS 4 | 3|3|1|1|1|CHILDREN - 13 TO 17 YEARS|ADOLESCENTS - 13 A 17 ANS 5 | 4|4|1|1|1|ADULTS - 18 YEARS AND OVER|ADULTES - 18 ANS ET PLUS 6 | 5|5|0|0|0|INVALID|INVALID 7 | 6|6|0|0|0|INVALID|INVALID 8 | 7|7|0|0|0|INVALID|INVALID 9 | 8|8|0|0|0|INVALID|INVALID 10 | 9|9|0|0|0|INVALID|INVALID 11 | 10|0|0|0|0|INVALID|INVALID 12 | 11|0|0|0|0|INVALID|INVALID 13 | 12|0|0|0|0|INVALID|INVALID 14 | 342|0|0|0|0|INVALID|INVALID 15 | -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_AudienceTargetEthnic.csv: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/mrn-aglic/pyspark-playground/7d1a176d939e7558d2160a280448412ee0e65a80/book_data/ReferenceTables/CD_AudienceTargetEthnic.csv -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_BroadcastOriginPoint.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/pyspark-playground/7d1a176d939e7558d2160a280448412ee0e65a80/book_data/ReferenceTables/CD_BroadcastOriginPoint.csv -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_Category.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/pyspark-playground/7d1a176d939e7558d2160a280448412ee0e65a80/book_data/ReferenceTables/CD_Category.csv -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_ClosedCaption.csv: -------------------------------------------------------------------------------- 1 | ClosedCaptionID|ClosedCaptionCD|ActiveFG|EnglishDescription|FrenchDescription 2 | 1|CC|1|Closed captioned|FR-Closed captioned 3 | 2|DV|1|Video description|FR-video description 4 | 3|CD|1|Closed captioning and video description|FR-Closed captioning and video description 5 | -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_Composition.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/pyspark-playground/7d1a176d939e7558d2160a280448412ee0e65a80/book_data/ReferenceTables/CD_Composition.csv -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_CountryOfOrigin.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/pyspark-playground/7d1a176d939e7558d2160a280448412ee0e65a80/book_data/ReferenceTables/CD_CountryOfOrigin.csv -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_DubDramaCredit.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/pyspark-playground/7d1a176d939e7558d2160a280448412ee0e65a80/book_data/ReferenceTables/CD_DubDramaCredit.csv -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_EthnicProgram.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/pyspark-playground/7d1a176d939e7558d2160a280448412ee0e65a80/book_data/ReferenceTables/CD_EthnicProgram.csv -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_Exhibition.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/pyspark-playground/7d1a176d939e7558d2160a280448412ee0e65a80/book_data/ReferenceTables/CD_Exhibition.csv -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_FilmClassification.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/pyspark-playground/7d1a176d939e7558d2160a280448412ee0e65a80/book_data/ReferenceTables/CD_FilmClassification.csv 
-------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_NetworkAffiliation.csv: -------------------------------------------------------------------------------- 1 | NetworkAffiliationID|NetworkAffiliationCD|ActiveFG|FilterTvFG|FilterPayFG|EnglishDescription|FrenchDescription 2 | 1|10|1|1|0|CBC OWNED & OPERATED|CBC 3 | 2|20|1|1|0|SRC OWNED & OPERATED|SRC 4 | 3|30|1|1|0|CBC AFFILIATES|CBC 5 | 4|40|1|1|0|SRC AFFILIATES|SRC AFF 6 | 5|50|1|1|0|CTV AFFILIATES|CTV 7 | 6|60|1|1|0|TVA AFFILIATES|TVA 8 | 7|70|1|1|0|RADIO-QUEBEC AFFILIATES|RQ 9 | 8|80|1|1|0|QAUTRE SAISONS|TQS 10 | 9|90|1|1|0|INDEPENDENT|INDEPENDENT 11 | 10|95|1|1|0|TEMPORARY NETWORK|TN 12 | -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_ProductionSource.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/pyspark-playground/7d1a176d939e7558d2160a280448412ee0e65a80/book_data/ReferenceTables/CD_ProductionSource.csv -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_ProgramClass.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/pyspark-playground/7d1a176d939e7558d2160a280448412ee0e65a80/book_data/ReferenceTables/CD_ProgramClass.csv -------------------------------------------------------------------------------- /book_data/ReferenceTables/CD_SpecialAttention.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/pyspark-playground/7d1a176d939e7558d2160a280448412ee0e65a80/book_data/ReferenceTables/CD_SpecialAttention.csv -------------------------------------------------------------------------------- /book_data/ReferenceTables/Call_Signs.csv: -------------------------------------------------------------------------------- 1 | LogIdentifierID,UndertakingNO,Undertaking_Name 2 | BRAVO,315413740,Bravo! 3 | CBET,320495005,"Canadian Broadcasting Corporation, windsor (CBET-DT)" 4 | CFTV3,535418075,"Southshore Broadcasting Inc., leamington (CFTV-DT)" 5 | CHEX,320468002,"591987 B.C. 
Ltd., peterborough (CHEX-DT)" 6 | CICT,430447003,"Corus Television Limited Partnership, calgary (CICT-DT)" 7 | CIMT,220450035,"Télé Inter-Rives ltée, rivière-du-loup (CIMT-DT)" 8 | CKMI,210497004,"Corus Television Limited Partnership, quebec (CKMI-DT)" 9 | FIGHT,535418348,Fight Network 10 | SCSD02,535422406,Super Channel (formerly Allarco Entertainment) 11 | SMITH,535422357,Smithsonian Channel (formerly eqhd) 12 | STJUICE,405426322,Stingray Juicebox 13 | TCN,305417215,The Comedy Network 14 | TMN3,314600354,Crave (The Movie Network) 15 | TRN,305424616,HPItv (formerly The Racing Network Canada) 16 | WARNER,535429791,Hollywood Suite 70s Movies (formerly Warner Films) 17 | 13ST,305423907,Crime + Investigation (formerly Mystery) 18 | ASIDE,535428149,A.Side (formerly AUX TV) 19 | AUX,535428149,A.Side (formerly AUX TV) 20 | CFCNL,430451054,"Bell Media Inc., lethbridge (CFCN-DT-5)" 21 | CKAL,405417909,"Rogers Media Inc., calgary (CKAL-DT)" 22 | CKSH,220516009,"Société Radio-Canada, sherbrooke (CKSH-DT)" 23 | HGTV,305417322,HGTV Canada - Home and Garden Television Canada 24 | SE01,214300535,Super Écran 25 | STLOUD,305426539,Stingray Loud 26 | CBFT,210426003,"Société Radio-Canada, montreal (CBFT-DT)" 27 | DSNYXD,535445648,Disney XD 28 | NICK,535427068,Nickelodeon (formerly YTV OneWorld) 29 | OMON,535445482,OMNI Regional 30 | PRIMETV,305417299,DTOUR (formerly TVtropolis) 31 | STVIBE,305426521,Stingray Vibe 32 | TV5,214301103,TV5 - Unis 33 | ASN,124100280,CTV Two Atlantic (formerly /A\ Atlantic & ASN) 34 | CBAT,134200161,"Canadian Broadcasting Corporation, fredericton (formerly saint john change to main area) (CBAT-DT)" 35 | CBNT,110436003,"Canadian Broadcasting Corporation, st. john's (CBNT-DT)" 36 | CGTV,305428072,GameTV (formerly CGTV Canada) 37 | CHEK,510467004,"0859291 B.C. 
Ltd., victoria (CHEK-DT)" 38 | CHWI,325410488,"Bell Media Inc., chatham/windsor/wheatley (CHWI-DT)" 39 | CITS,305420375,"Crossroads Television System, hamilton (CITS-DT)" 40 | DTOUR,305417299,DTOUR (formerly TVtropolis) 41 | HPITV,305424616,HPItv (formerly The Racing Network Canada) 42 | OMAB,535445482,OMNI Regional 43 | SLVSC,535417803,Silver Screen Classics 44 | ABORIW,535437471,Aboriginal Peoples Television Network (APTN) 45 | CFTV2,535418075,"Southshore Broadcasting Inc., leamington (CFTV-DT)" 46 | CHBC,520465006,"Corus Television Limited Partnership, kelowna (CHBC-DT)" 47 | CHISTO,205421556,Historia 48 | CHLT,220471007,"TVA Group Inc., sherbrooke (CHLT-DT)" 49 | CITL,430456129,"Stingray Radio Inc., lloydminster (CITL-DT)" 50 | CIVM,210963005,"Société de télédiffusion du Québec, montreal (CIVM-DT)" 51 | CKNY,330500000,"Bell Media Inc., north bay (CKNY-TV)" 52 | COOK,305427636,Cooking Channel (formerly W Movies) 53 | COSMO,535425723,Cosmopolitan TV (formerly Cosmopolitan Television) 54 | DSNYEN,535445622,Disney Channel 55 | DSNYFR,535421127,La chaîne Disney (formerly TÉLÉTOON Rétro (Français)) 56 | LCN,205417406,Le Canal Nouvelles (LCN) 57 | MLLE,535430516,Moi&cie (formerly Mlle) 58 | OUTTV,305423973,OUTtv (formerly PrideVision) 59 | PRIDE,305423973,OUTtv (formerly PrideVision) 60 | SE02,214300535,Super Écran 61 | TELENO,314600552,Telelatino 62 | TOON,305417231,TELETOON/TÉLÉTOON 63 | WMOV,305427636,Cooking Channel (formerly W Movies) 64 | BNN,305417273,BNN Bloomberg 65 | CHNM,505428658,"Rogers Media Inc., vancouver (CHNM-DT)" 66 | CJCB,124100041,"Bell Media Inc., sydney (CJCB-TV)" 67 | CRAVE3,314600354,Crave (The Movie Network) 68 | CTVNC,305417223,CTV News Channel (formerly CTV Newsnet) 69 | FCTV,314600560,Fairchild TV 70 | MMAX,205417399,MAX (formerly MUSIMAX) 71 | OMBC,535445482,OMNI Regional 72 | OWN,305417257,OWN: The Oprah Winfrey Network (formerly OWN; formerly VIVA) 73 | SEAS,535435699,The Seasonal Channel 74 | SPTVA2,535429486,TVA Sports 75 | TANG,535433015,NTD Television 76 | AMIFR,535437356,AMI-télé 77 | CARTN,535421135,Cartoon Network (formerly TELETOON Retro (English)) 78 | CFMT,314600024,"Rogers Media Inc., toronto (CFMT-DT)" 79 | CFRS,234502441,"V Interactions inc., saguenay (jonquiere) (CFRS-DT)" 80 | CFTV,535418075,"Southshore Broadcasting Inc., leamington (CFTV-DT)" 81 | CHCH,310466008,"2190015 Ontario Inc, hamilton (CHCH-DT)" 82 | CIVI,505423343,"Bell Media Inc., victoria (CIVI-DT)" 83 | CP24,305417348,CablePulse 24 (CP24) 84 | DEJA,305426050,DejaView 85 | HSTORM,535429808,Hollywood Suite 90s Movies (formerly AXN Movies) 86 | SHOW,315413732,Showcase 87 | SPARK,535434435,ABC Spark (formerly Harmony) 88 | SPTVA3,535429486,TVA Sports 89 | CBKT,420430035,"Canadian Broadcasting Corporation, regina (CBKT-DT)" 90 | CBLFT,535429642,"Société Radio-Canada, (CBLFT-DT)" 91 | CFCN,430451005,"Bell Media Inc., calgary (CFCN-DT)" 92 | CFJP,214300832,"V Interactions inc., montréal (CFJP-DT)" 93 | CFSK,425001799,"Corus Television Limited Partnership, saskatoon (CFSK-DT)" 94 | CFTV1,535418075,"Southshore Broadcasting Inc., leamington (CFTV-DT)" 95 | CITV,430476002,"Corus Television Limited Partnership, edmonton (CITV-DT)" 96 | CKND,411059009,"Corus Television Limited Partnership, winnipeg (CKND-DT)" 97 | CKPR,330503004,"Thunder Bay Electronics Limited, thunder bay (CKPR-DT)" 98 | CPOP,205426720,CINÉPOP (formerly Cinémania) 99 | DIY,535421151,D.I.Y. Network (formerly D.I.Y. 
Television) 100 | ESPN,305424608,ESPN Classic 101 | FACHTV,314600560,Fairchild TV 102 | FASH,305423931,FashionTelevisionChannel (formerly Fashion Television ...) 103 | MMM,305417364,Gusto (formerly M3) 104 | MOVIEP,315413716,STARZ (formerly The Movie Network Encore) 105 | SE03,214300535,Super Écran 106 | WTN,415413806,W Network 107 | 90SM,535429808,Hollywood Suite 90s Movies (formerly AXN Movies) 108 | BBCE,535422323,BBC Earth (formerly radX) 109 | CBWT,410443006,"Canadian Broadcasting Corporation, winnipeg (CBWT-DT)" 110 | CFKS,224401539,"V Interactions inc., sherbrooke (CFKS-DT)" 111 | CHEM,210471017,"TVA Group Inc., trois-rivieres (CHEM-DT)" 112 | CKCO,320493000,"Bell Media Inc., kitchener (CKCO-DT)" 113 | CRAVE2,314600354,Crave (The Movie Network) 114 | E!,305417330,E! (formerly Star! TV) 115 | FPTV,305424179,Festival Portuguese Television 116 | MPIX2,315413716,STARZ (formerly The Movie Network Encore) 117 | MTV1,305417372,MTV (Canada) (formerly known as Talk TV) 118 | RDSINF,205424063,RDS Info (formerly Réseau Info Sports (RIS) 119 | SCSD03,535422406,Super Channel (formerly Allarco Entertainment) 120 | SPACE,305417306,Space (formerly Space: The Imagination Station) 121 | TMN2,314600354,Crave (The Movie Network) 122 | CASA,535421276,Casa - (formerly Les idées de ma maison) 123 | CBLT,310433008,"Canadian Broadcasting Corporation, toronto (CBLT-DT)" 124 | CBWFT,410442008,"Société Radio-Canada, winnipeg (CBWFT-DT)" 125 | CFQC,420455008,"Bell Media Inc., saskatoon (CFQC-DT)" 126 | CFRN6,430456061,"Bell Media Inc., red deer (CFRN-TV-6)" 127 | CFTO,310459003,"Bell Media Inc., toronto (CFTO-DT)" 128 | CFTV4,535418075,"Southshore Broadcasting Inc., leamington (CFTV-DT)" 129 | CHMG,535417530,"Télé-Mag inc., québec (CHMG-TV)" 130 | CHRO,330473000,"Bell Media Inc., pembroke (CHRO-TV)" 131 | CICI,330509001,"Bell Media Inc., sudbury (CICI-TV)" 132 | CITO,330509027,"Bell Media Inc., timmins (CITO-TV)" 133 | CJNT,215415548,"Rogers Media Inc., montreal (CJNT-DT)" 134 | CKLT,130494024,"Bell Media Inc., saint john (CKLT-DT)" 135 | DISSCI,305426191,Discovery Science (formerly Discovery Civilization Channel) 136 | FACHV,314600560,Fairchild TV 137 | W,415413806,W Network 138 | 80SM,535429816,Hollywood Suite 80s Movies (formerly MGM Channel) 139 | BOOK,405423922,Book Television (formerly Book Television - The Channel) 140 | CBCT,120424007,"Canadian Broadcasting Corporation, charlottetown (CBCT-DT)" 141 | CHOT,231317009,"RNC MEDIA Inc., gatineau (CHOT-DT)" 142 | CI,305423907,Crime + Investigation (formerly Mystery) 143 | CICC,420969008,"Bell Media Inc., yorkton (CICC-TV)" 144 | CISA,430485003,"Corus Television Limited Partnership, lethbridge (CISA-DT)" 145 | CJBRT,535429551,"Société Radio-Canada, (CJBR-DT)" 146 | CKES,535425335,"Crossroads Television System, edmonton, ab (CKES-DT)" 147 | CKPG,530502004,"Jim Pattison Broadcast Group Limited Partnership, prince george (CKPG-TV)" 148 | COTT,405423948,Cottage Life (formerly Bold) 149 | DXDCHA,535428214,Family CHRGD (formerly Disney XD) 150 | MAKE,305428600,Makeful TV (formerly BITE Television ) 151 | OMQC,535445482,OMNI Regional 152 | SNOIL,535429527,Sportsnet One (formerly Rogers Sportsnet One) 153 | TCC,535421515,Daystar Canada (formerly Grace TV) 154 | TOONR,535421135,Cartoon Network (formerly TELETOON Retro (English)) 155 | TQS,214300873,V (Network) (formerly - Quatre-Saisons (TQS)) 156 | BLC,535424428,The Beautiful Little Channel (formerly Classical Digital) 157 | CBMT,210434007,"Canadian Broadcasting Corporation, montreal (CBMT-DT)" 158 | 
CBUFT,511152001,"Société Radio-Canada, vancouver (CBUFT-DT)" 159 | CBXFT,430444000,"Société Radio-Canada, edmonton (CBXFT-DT)" 160 | CHAT,430463000,"Jim Pattison Broadcast Group Limited Partnership, medicine hat (CHAT-TV)" 161 | CHMI,414903757,"Rogers Media Inc., portage la prairie/winn., mb (CHMI-DT)" 162 | CIII,320496052,"Corus Television Limited Partnership, paris (CIII-DT)" 163 | CKRT,220507008,"CKRT-TV ltée, rivière-du-loup (CKRT-DT)" 164 | CKTV,230506008,"Société Radio-Canada, saguenay (jonquiere) (CKTV-DT)" 165 | MGM,535429816,Hollywood Suite 80s Movies (formerly MGM Channel) 166 | STARZ1,315413716,STARZ (formerly The Movie Network Encore) 167 | TTV,515408441,Talentvision 168 | WFN,535420393,Sportsman Canada 169 | ANIMAL,305426266,Animal Planet 170 | CAVE,205424055,"H2 (formerly The Cave, Men TV)" 171 | CFRE,425001781,"Corus Television Limited Partnership, regina (CFRE-DT)" 172 | EXP,535432215,ICI EXPLORA (formerly SENS) 173 | MPLU,214301129,MusiquePlus 174 | MSET,535423024,Mediaset Italia (formerly Italian Entertainment TV) 175 | NEWSW,334805116,CBC News Network (Formerly Newsworld) 176 | RAPT,305425440,NBA TV (Canada) - (formerly Raptors NBA TV) 177 | SALT,305428436,Salt & Light (Inner Peace Television Network) 178 | SRC,330952003,French TV Service 179 | THTV,305417281,TreeHouse TV 180 | V,214300873,V (Network) (formerly - Quatre-Saisons (TQS)) 181 | 70SM,535429791,Hollywood Suite 70s Movies (formerly Warner Films) 182 | BOLLY,535429684,Bollywood Times HD TV 183 | CFER,220450019,"TVA Group Inc., rimouski (CFER-DT)" 184 | CICA,310475009,"The Ontario Educational Communications Authority, toronto (CICA-DT)" 185 | CJCH,120480009,"Bell Media Inc., halifax (CJCH-DT)" 186 | CKWS1,535419164,"591987 B.C. Ltd., brighton (CKWS-DT-1)" 187 | CRAVE1,314600354,Crave (The Movie Network) 188 | FAMCHA,314600859,Family Channel (formerly Family) 189 | FNTSY,535435540,Game+ (formerly FNTSY Sports Network) 190 | FOOD,305423329,Food Network Canada 191 | MUCHM,314600545,Much (formerly MuchMusic) 192 | SE04,214300535,Super Écran 193 | TLTOON,305417231,TELETOON/TÉLÉTOON 194 | ZEETV,535435681,Zee TV Canada (formerly Hindi Women’s TV) 195 | BC1,535434906,BC News 1 (formerly Global News Plus BC) 196 | CANALVIE,205417381,Canal Vie 197 | CFAP,214300865,"V Interactions inc., quebec (CFAP-DT)" 198 | CIHF2,134200815,"Corus Television Limited Partnership, saint john (CHNB-DT)" 199 | CIPA,425001864,"Bell Media Inc., prince albert (CIPA-TV)" 200 | CKSA,430508002,"Stingray Radio Inc., lloydminster (CKSA-DT)" 201 | DIVA,305425002,Lifetime (formerly Showcase Diva) 202 | FTV,314600560,Fairchild TV 203 | MVOLA,305425599,Rewind (formerly Movieola) 204 | CFCM,210450003,"TVA Group Inc., quebec (CFCM-DT)" 205 | CFHD,535435516,"4517466 Canada Inc., montréal, qc (CFHD-DT)" 206 | CFTM,210458006,"TVA Group Inc., montreal (CFTM-DT)" 207 | CIIT,405429417,"ZoomerMedia Limited, winnipeg (CIIT-DT)" 208 | CIVT,505418295,"Bell Media Inc., vancouver (CIVT-DT)" 209 | CJIL,435415054,"The Miracle Channel Association, lethbridge (CJIL-DT)" 210 | CKCS,535425327,"Crossroads Television System, calgary, ab (CKCS-DT)" 211 | DOCS,305423965,Documentary (formerly The Canadian Documentary Channel) 212 | EURO,535422000,EuroWorld SPORT (formerly RCS Television) 213 | HISTORY,305417249,History Television 214 | NGWILD,535434584,NatGeo Wild 215 | OUTDR,305417314,Outdoor Life Network (OLN) 216 | PRIS,535421284,Prise 2 (formerly Nostalgie) 217 | TVA,210945002,TVA 218 | WEAT,214301194,The Weather Network / MétéoMédia 219 | AMITV,535425674,AMI-tv (formerly 
The Accessible Channel) 220 | CBXT,430445007,"Canadian Broadcasting Corporation, edmonton (CBXT-DT)" 221 | CEVASI,205421548,Évasion (formerly Canal Évasion) 222 | CHBX,331253005,"Bell Media Inc., sault ste. marie (CHBX-TV)" 223 | CKCK,420492001,"Bell Media Inc., regina (CKCK-DT)" 224 | CKVR,320511009,"Bell Media Inc., barrie (CKVR-DT)" 225 | CMT,435413778,CMT (formerly Country Music Television) 226 | MAGINO,535421862,Télémagino (formerly Disney Junior) 227 | PHDF,535421862,Télémagino (formerly Disney Junior) 228 | STRETR,305426597,Stingray Retro 229 | ABORI,535437471,Aboriginal Peoples Television Network (APTN) 230 | CBRT,431036003,"Canadian Broadcasting Corporation, calgary (CBRT-DT)" 231 | CESS,431048008,CTV Two Alberta (formerly ACCESS) 232 | CFEM,234500064,"RNC MEDIA Inc., rouyn-noranda (CFEM-DT)" 233 | CFJC,530452002,"Jim Pattison Broadcast Group Limited Partnership, kamloops (CFJC-TV)" 234 | CFKM,214300857,"V Interactions inc., trois-rivieres (CFKM-DT)" 235 | CFPL,320454002,"Bell Media Inc., london (CFPL-DT)" 236 | CFTU,214300758,"Canal Savoir, montreal (CFTU-DT)" 237 | CHRGD,535428214,Family CHRGD (formerly Disney XD) 238 | CKEM,405417925,"Rogers Media Inc., edmonton (CKEM-DT)" 239 | CKTM,210510004,"Société Radio-Canada, trois-rivieres (CKTM-DT)" 240 | GAME,305428072,GameTV (formerly CGTV Canada) 241 | GUS,305417364,Gusto (formerly M3) 242 | INVDIS,405425613,Investigation Discovery (formerly Court TV Canada) 243 | INVST,535433411,Investigation (formerly Canal D Investigation) 244 | SNONE,535429527,Sportsnet One (formerly Rogers Sportsnet One) 245 | SWIM,305424997,Adult Swim (formerly ACTION) 246 | TDC,315413765,Discovery Channel 247 | TMN1,314600354,Crave (The Movie Network) 248 | ZTÉLÉ,205421572,Z (formerly Ztélé) 249 | ARTV,205423734,ICI ARTV 250 | CBVT,210441002,"Société Radio-Canada, quebec (CBVT-DT)" 251 | CFGS,234502433,"RNC MEDIA Inc., gatineau (CFGS-DT)" 252 | CHNB2,134200815,"Corus Television Limited Partnership, saint john (CHNB-DT)" 253 | CIHF,124101783,"Corus Television Limited Partnership, halifax (CIHF-DT)" 254 | CJOH,330486002,"Bell Media Inc., ottawa (CJOH-DT)" 255 | CKY,410514004,"Bell Media Inc., winnipeg (CKY-DT)" 256 | HFEST,535429824,Hollywood Suite 2000s Movies (formerly Sony Movie Channel) 257 | LIFENT,315413724,Slice 258 | RDS,214301137,Le Réseau des Sports (RDS) 259 | SCSD01,535422406,Super Channel (formerly Allarco Entertainment) 260 | SPRTVA,535429486,TVA Sports 261 | TMN4,314600354,Crave (The Movie Network) 262 | TOONRF,535421127,La chaîne Disney (formerly TÉLÉTOON Rétro (Français)) 263 | VINTAG,535447107,Vintage TV Canada Limited 264 | 2000SM,535429824,Hollywood Suite 2000s Movies (formerly Sony Movie Channel) 265 | BITE,305428600,Makeful TV (formerly BITE Television ) 266 | CHNU,505423335,"ZoomerMedia Limited, fraser valley (CHNU-DT)" 267 | CJCO,535425294,"Rogers Media Inc., calgary, ab (CJCO-DT)" 268 | CJDC,530482009,"Bell Media Inc. 
(OBCI), dawson creek (CJDC-TV)" 269 | CKCW,130494008,"Bell Media Inc., moncton (CKCW-DT)" 270 | CMET,214301194,The Weather Network / MétéoMédia 271 | CSCN,425405479,Saskatchewan Communications Network 272 | FYI,305424020,Fyi (formerly Twist TV) 273 | MOICIE,535430516,Moi&cie (formerly Mlle) 274 | NGCE,305424294,National Geographic Channel 275 | ONEBMS,305423915,"The Brand New ONE Body, Mind, Spirit, Love Channel" 276 | PHDCHA,314600859,Family Channel (formerly Family) 277 | RADX,535422323,BBC Earth (formerly radX) 278 | RURAL,535426911,The Rural Channel 279 | SATV,305417421,ATN South Asian Television (SATV) 280 | SNVANC,535429527,Sportsnet One (formerly Rogers Sportsnet One) 281 | TRESR,535420898,HIFI (formerly Treasure HD) 282 | ABORIH,535437471,Aboriginal Peoples Television Network (APTN) 283 | BBCCND,305424319,BBC Canada 284 | CBAFT,130423007,"Société Radio-Canada, moncton (CBAFT-DT)" 285 | CFTK,530457001,"Bell Media Inc. (OBCI), terrace (CFTK-TV)" 286 | CHAU,220464002,"CHAU-TV Communications ltée, carleton (CHAU-DT)" 287 | CHLF,314600768,TFO (La Chaîne) 288 | CITY,310477005,"Rogers Media Inc., toronto (CITY-DT)" 289 | DISVEL,535421250,Discovery Velocity (formerly Discovery World HD) 290 | RAZER,405424037,MTV2 (formerly Razer) 291 | RDI,215413782,ICI RDI 292 | RDS2,214301137,Le Réseau des Sports (RDS) 293 | STARZ2,315413716,STARZ (formerly The Movie Network Encore) 294 | TRV,305423999,travel + escape 295 | VISION,314600834,Vision TV 296 | WILD,535417118,Wild tv (The Hunting Channel) 297 | YOOP,535429402,YOOPA (formerly TVA Junior) 298 | ACTION,305424997,Adult Swim (formerly ACTION) 299 | CBC,330953001,English TV Service 300 | CBUT,510440001,"Canadian Broadcasting Corporation, vancouver (CBUT-DT)" 301 | CFTF,205420805,"Télévision MBS inc., rivière-du-loup (CFTF-DT)" 302 | CFVS,234502458,"RNC MEDIA Inc., val-d'or (CFVS-DT)" 303 | CFYK,535432207,"Canadian Broadcasting Corporation, (CFYK-DT)" 304 | CJMT,305428874,"Rogers Media Inc., toronto (CJMT-DT)" 305 | CKNO,515409530,Knowledge (Formerly CKNO-TV Knowledge Network) 306 | HBOCAN,314600354,Crave (The Movie Network) 307 | LEAF,305425531,Leafs TV 308 | MEHD,535429692,Mehndi HD TV 309 | SN360,305417356,Sportsnet 360 (formerly The Score) 310 | TAC,535425674,AMI-tv (formerly The Accessible Channel) 311 | VRAK,214301111,VRAK (formerly Vrak.TV) 312 | ABORIN,535437471,Aboriginal Peoples Television Network (APTN) 313 | CBKFT,425001765,"Société Radio-Canada, regina (CBKFT-DT)" 314 | CBOFT,330437005,"Société Radio-Canada, ottawa (CBOFT-DT)" 315 | CBOT,330438003,"Canadian Broadcasting Corporation, ottawa (CBOT-DT)" 316 | CFCF,210448007,"Bell Media Inc., montreal (CFCF-DT)" 317 | CHAN,510462005,"Corus Television Limited Partnership, vancouver (CHAN-DT)" 318 | CJON,110487006,"Newfoundland Broadcasting Company Limited, st. john's (CJON-DT)" 319 | CKWS01,315405985,"591987 B.C. 
Ltd., brighton (CKWS-TV-1)" 320 | CPTF,305417348,CablePulse 24 (CP24) 321 | DSNYJR,535445630,Disney Junior 322 | FAMJR,314600859,Family Channel (formerly Family) 323 | FXX,535432190,FXX (Canada) (formerly Ampersand) 324 | IFC,105424006,The Independent Film Channel Canada 325 | LCE,205421548,Évasion (formerly Canal Évasion) 326 | LONE,305426000,MovieTime (formerly known as Lonestar) 327 | SCSD04,535422406,Super Channel (formerly Allarco Entertainment) 328 | SNFLAM,535429527,Sportsnet One (formerly Rogers Sportsnet One) 329 | STRETRO,305426597,Stingray Retro 330 | TLNSP,535423082,Univision Canada (formerly TLN en Español) 331 | UNIS,214301103,TV5 - Unis 332 | VINTAGE,535447107,Vintage TV Canada Limited 333 | YTV,314600842,YTV 334 | ZEST,535426961,Zeste (formerly Cuisine) 335 | CANALD,215413790,Canal D 336 | CBHT,120428008,"Canadian Broadcasting Corporation, halifax (CBHT-DT)" 337 | CFEG,505421537,"The B.C. Conference of the Mennonite Brethren Churches, abbotsford (CFEG-TV)" 338 | CFRN,430456004,"Bell Media Inc., edmonton (CFRN-DT)" 339 | CHEX02,315408591,Power Broadcasting Inc. 340 | CHFD,330469008,"Thunder Bay Electronics Limited, thunder bay (CHFD-DT)" 341 | CJEO,535425301,"Rogers Media Inc., edmonton, ab (CJEO-DT)" 342 | CJPM,230489007,"TVA Group Inc., saguenay (chicoutimi) (CJPM-DT)" 343 | CKVU,511116006,"Rogers Media Inc., vancouver (CKVU-DT)" 344 | FXC,535432174,FX (formerly FX Canada) 345 | NATUR,535420880,Love Nature (formerly Oasis HD) 346 | OTN1,305417413,Odyssey (formerly OTN) 347 | SERIEP,205421564,Séries+ 348 | SNW360,305417356,Sportsnet 360 (formerly The Score) 349 | STJUIC,405426322,Stingray Juicebox 350 | TVADIK,205424104,addikTV (formerly Mystère) 351 | WFNHD,535420393,Sportsman Canada 352 | -------------------------------------------------------------------------------- /book_data/ReferenceTables/LogIdentifier.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mrn-aglic/pyspark-playground/7d1a176d939e7558d2160a280448412ee0e65a80/book_data/ReferenceTables/LogIdentifier.csv -------------------------------------------------------------------------------- /book_data/pokedex.dsv: -------------------------------------------------------------------------------- 1 | #001 Bulbasaur Bulbasaur Grass Poison 2 | #002 Ivysaur Ivysaur Grass Poison 3 | #003 Venusaur Venusaur Grass Poison 4 | #004 Charmander Charmander Fire 5 | #005 Charmeleon Charmeleon Fire 6 | #006 Charizard Charizard Fire Flying 7 | #007 Squirtle Squirtle Water 8 | #008 Wartortle Wartortle Water 9 | #009 Blastoise Blastoise Water 10 | #010 Caterpie Caterpie Bug 11 | #011 Metapod Metapod Bug 12 | #012 Butterfree Butterfree Bug Flying 13 | #013 Weedle Weedle Bug Poison 14 | #014 Kakuna Kakuna Bug Poison 15 | #015 Beedrill Beedrill Bug Poison 16 | #016 Pidgey Pidgey Normal Flying 17 | #017 Pidgeotto Pidgeotto Normal Flying 18 | #018 Pidgeot Pidgeot Normal Flying 19 | #019 Rattata Rattata Normal 20 | #020 Raticate Raticate Normal 21 | #021 Spearow Spearow Normal Flying 22 | #022 Fearow Fearow Normal Flying 23 | #023 Ekans Ekans Poison 24 | #024 Arbok Arbok Poison 25 | #025 Pikachu Pikachu Electric 26 | #026 Raichu Raichu Electric 27 | #027 Sandshrew Sandshrew Ground 28 | #028 Sandslash Sandslash Ground 29 | #029 Nidoran♀ Nidoran♀ Poison 30 | #030 Nidorina Nidorina Poison 31 | #031 Nidoqueen Nidoqueen Poison Ground 32 | #032 Nidoran♂ Nidoran♂ Poison 33 | #033 Nidorino Nidorino Poison 34 | #034 Nidoking Nidoking Poison Ground 35 | #035 Clefairy Clefairy 
Fairy 36 | #036 Clefable Clefable Fairy 37 | #037 Vulpix Vulpix Fire 38 | #038 Ninetales Ninetales Fire 39 | #039 Jigglypuff Jigglypuff Normal Fairy 40 | #040 Wigglytuff Wigglytuff Normal Fairy 41 | #041 Zubat Zubat Poison Flying 42 | #042 Golbat Golbat Poison Flying 43 | #043 Oddish Oddish Grass Poison 44 | #044 Gloom Gloom Grass Poison 45 | #045 Vileplume Vileplume Grass Poison 46 | #046 Paras Paras Bug Grass 47 | #047 Parasect Parasect Bug Grass 48 | #048 Venonat Venonat Bug Poison 49 | #049 Venomoth Venomoth Bug Poison 50 | #050 Diglett Diglett Ground 51 | #051 Dugtrio Dugtrio Ground 52 | #052 Meowth Meowth Normal 53 | #053 Persian Persian Normal 54 | #054 Psyduck Psyduck Water 55 | #055 Golduck Golduck Water 56 | #056 Mankey Mankey Fighting 57 | #057 Primeape Primeape Fighting 58 | #058 Growlithe Growlithe Fire 59 | #059 Arcanine Arcanine Fire 60 | #060 Poliwag Poliwag Water 61 | #061 Poliwhirl Poliwhirl Water 62 | #062 Poliwrath Poliwrath Water Fighting 63 | #063 Abra Abra Psychic 64 | #064 Kadabra Kadabra Psychic 65 | #065 Alakazam Alakazam Psychic 66 | #066 Machop Machop Fighting 67 | #067 Machoke Machoke Fighting 68 | #068 Machamp Machamp Fighting 69 | #069 Bellsprout Bellsprout Grass Poison 70 | #070 Weepinbell Weepinbell Grass Poison 71 | #071 Victreebel Victreebel Grass Poison 72 | #072 Tentacool Tentacool Water Poison 73 | #073 Tentacruel Tentacruel Water Poison 74 | #074 Geodude Geodude Rock Ground 75 | #075 Graveler Graveler Rock Ground 76 | #076 Golem Golem Rock Ground 77 | #077 Ponyta Ponyta Fire 78 | #078 Rapidash Rapidash Fire 79 | #079 Slowpoke Slowpoke Water Psychic 80 | #080 Slowbro Slowbro Water Psychic 81 | #081 Magnemite Magnemite Electric Steel 82 | #082 Magneton Magneton Electric Steel 83 | #083 Farfetch'd Farfetch'd Normal Flying 84 | #084 Doduo Doduo Normal Flying 85 | #085 Dodrio Dodrio Normal Flying 86 | #086 Seel Seel Water 87 | #087 Dewgong Dewgong Water Ice 88 | #088 Grimer Grimer Poison 89 | #089 Muk Muk Poison 90 | #090 Shellder Shellder Water 91 | #091 Cloyster Cloyster Water Ice 92 | #092 Gastly Gastly Ghost Poison 93 | #093 Haunter Haunter Ghost Poison 94 | #094 Gengar Gengar Ghost Poison 95 | #095 Onix Onix Rock Ground 96 | #096 Drowzee Drowzee Psychic 97 | #097 Hypno Hypno Psychic 98 | #098 Krabby Krabby Water 99 | #099 Kingler Kingler Water 100 | #100 Voltorb Voltorb Electric 101 | #101 Electrode Electrode Electric 102 | #102 Exeggcute Exeggcute Grass Psychic 103 | #103 Exeggutor Exeggutor Grass Psychic 104 | #104 Cubone Cubone Ground 105 | #105 Marowak Marowak Ground 106 | #106 Hitmonlee Hitmonlee Fighting 107 | #107 Hitmonchan Hitmonchan Fighting 108 | #108 Lickitung Lickitung Normal 109 | #109 Koffing Koffing Poison 110 | #110 Weezing Weezing Poison 111 | #111 Rhyhorn Rhyhorn Ground Rock 112 | #112 Rhydon Rhydon Ground Rock 113 | #113 Chansey Chansey Normal 114 | #114 Tangela Tangela Grass 115 | #115 Kangaskhan Kangaskhan Normal 116 | #116 Horsea Horsea Water 117 | #117 Seadra Seadra Water 118 | #118 Goldeen Goldeen Water 119 | #119 Seaking Seaking Water 120 | #120 Staryu Staryu Water 121 | #121 Starmie Starmie Water Psychic 122 | #122 Mr. Mime Mr. 
Mime Psychic Fairy 123 | #123 Scyther Scyther Bug Flying 124 | #124 Jynx Jynx Ice Psychic 125 | #125 Electabuzz Electabuzz Electric 126 | #126 Magmar Magmar Fire 127 | #127 Pinsir Pinsir Bug 128 | #128 Tauros Tauros Normal 129 | #129 Magikarp Magikarp Water 130 | #130 Gyarados Gyarados Water Flying 131 | #131 Lapras Lapras Water Ice 132 | #132 Ditto Ditto Normal 133 | #133 Eevee Eevee Normal 134 | #134 Vaporeon Vaporeon Water 135 | #135 Jolteon Jolteon Electric 136 | #136 Flareon Flareon Fire 137 | #137 Porygon Porygon Normal 138 | #138 Omanyte Omanyte Rock Water 139 | #139 Omastar Omastar Rock Water 140 | #140 Kabuto Kabuto Rock Water 141 | #141 Kabutops Kabutops Rock Water 142 | #142 Aerodactyl Aerodactyl Rock Flying 143 | #143 Snorlax Snorlax Normal 144 | #144 Articuno Articuno Ice Flying 145 | #145 Zapdos Zapdos Electric Flying 146 | #146 Moltres Moltres Fire Flying 147 | #147 Dratini Dratini Dragon 148 | #148 Dragonair Dragonair Dragon 149 | #149 Dragonite Dragonite Dragon Flying 150 | #150 Mewtwo Mewtwo Psychic 151 | #151 Mew Mew Psychic 152 | #152 Chikorita Chikorita Grass 153 | #153 Bayleef Bayleef Grass 154 | #154 Meganium Meganium Grass 155 | #155 Cyndaquil Cyndaquil Fire 156 | #156 Quilava Quilava Fire 157 | #157 Typhlosion Typhlosion Fire 158 | #158 Totodile Totodile Water 159 | #159 Croconaw Croconaw Water 160 | #160 Feraligatr Feraligatr Water 161 | #161 Sentret Sentret Normal 162 | #162 Furret Furret Normal 163 | #163 Hoothoot Hoothoot Normal Flying 164 | #164 Noctowl Noctowl Normal Flying 165 | #165 Ledyba Ledyba Bug Flying 166 | #166 Ledian Ledian Bug Flying 167 | #167 Spinarak Spinarak Bug Poison 168 | #168 Ariados Ariados Bug Poison 169 | #169 Crobat Crobat Poison Flying 170 | #170 Chinchou Chinchou Water Electric 171 | #171 Lanturn Lanturn Water Electric 172 | #172 Pichu Pichu Electric 173 | #173 Cleffa Cleffa Fairy 174 | #174 Igglybuff Igglybuff Normal Fairy 175 | #175 Togepi Togepi Fairy 176 | #176 Togetic Togetic Fairy Flying 177 | #177 Natu Natu Psychic Flying 178 | #178 Xatu Xatu Psychic Flying 179 | #179 Mareep Mareep Electric 180 | #180 Flaaffy Flaaffy Electric 181 | #181 Ampharos Ampharos Electric 182 | #182 Bellossom Bellossom Grass 183 | #183 Marill Marill Water Fairy 184 | #184 Azumarill Azumarill Water Fairy 185 | #185 Sudowoodo Sudowoodo Rock 186 | #186 Politoed Politoed Water 187 | #187 Hoppip Hoppip Grass Flying 188 | #188 Skiploom Skiploom Grass Flying 189 | #189 Jumpluff Jumpluff Grass Flying 190 | #190 Aipom Aipom Normal 191 | #191 Sunkern Sunkern Grass 192 | #192 Sunflora Sunflora Grass 193 | #193 Yanma Yanma Bug Flying 194 | #194 Wooper Wooper Water Ground 195 | #195 Quagsire Quagsire Water Ground 196 | #196 Espeon Espeon Psychic 197 | #197 Umbreon Umbreon Dark 198 | #198 Murkrow Murkrow Dark Flying 199 | #199 Slowking Slowking Water Psychic 200 | #200 Misdreavus Misdreavus Ghost 201 | #201 Unown Unown Psychic 202 | #202 Wobbuffet Wobbuffet Psychic 203 | #203 Girafarig Girafarig Normal Psychic 204 | #204 Pineco Pineco Bug 205 | #205 Forretress Forretress Bug Steel 206 | #206 Dunsparce Dunsparce Normal 207 | #207 Gligar Gligar Ground Flying 208 | #208 Steelix Steelix Steel Ground 209 | #209 Snubbull Snubbull Fairy 210 | #210 Granbull Granbull Fairy 211 | #211 Qwilfish Qwilfish Water Poison 212 | #212 Scizor Scizor Bug Steel 213 | #213 Shuckle Shuckle Bug Rock 214 | #214 Heracross Heracross Bug Fighting 215 | #215 Sneasel Sneasel Dark Ice 216 | #216 Teddiursa Teddiursa Normal 217 | #217 Ursaring Ursaring Normal 218 | #218 Slugma Slugma Fire 219 | #219 
Magcargo Magcargo Fire Rock 220 | #220 Swinub Swinub Ice Ground 221 | #221 Piloswine Piloswine Ice Ground 222 | #222 Corsola Corsola Water Rock 223 | #223 Remoraid Remoraid Water 224 | #224 Octillery Octillery Water 225 | #225 Delibird Delibird Ice Flying 226 | #226 Mantine Mantine Water Flying 227 | #227 Skarmory Skarmory Steel Flying 228 | #228 Houndour Houndour Dark Fire 229 | #229 Houndoom Houndoom Dark Fire 230 | #230 Kingdra Kingdra Water Dragon 231 | #231 Phanpy Phanpy Ground 232 | #232 Donphan Donphan Ground 233 | #233 Porygon2 Porygon2 Normal 234 | #234 Stantler Stantler Normal 235 | #235 Smeargle Smeargle Normal 236 | #236 Tyrogue Tyrogue Fighting 237 | #237 Hitmontop Hitmontop Fighting 238 | #238 Smoochum Smoochum Ice Psychic 239 | #239 Elekid Elekid Electric 240 | #240 Magby Magby Fire 241 | #241 Miltank Miltank Normal 242 | #242 Blissey Blissey Normal 243 | #243 Raikou Raikou Electric 244 | #244 Entei Entei Fire 245 | #245 Suicune Suicune Water 246 | #246 Larvitar Larvitar Rock Ground 247 | #247 Pupitar Pupitar Rock Ground 248 | #248 Tyranitar Tyranitar Rock Dark 249 | #249 Lugia Lugia Psychic Flying 250 | #250 Ho-Oh Ho-Oh Fire Flying 251 | #251 Celebi Celebi Psychic Grass 252 | #252 Treecko Treecko Grass 253 | #253 Grovyle Grovyle Grass 254 | #254 Sceptile Sceptile Grass 255 | #255 Torchic Torchic Fire 256 | #256 Combusken Combusken Fire Fighting 257 | #257 Blaziken Blaziken Fire Fighting 258 | #258 Mudkip Mudkip Water 259 | #259 Marshtomp Marshtomp Water Ground 260 | #260 Swampert Swampert Water Ground 261 | #261 Poochyena Poochyena Dark 262 | #262 Mightyena Mightyena Dark 263 | #263 Zigzagoon Zigzagoon Normal 264 | #264 Linoone Linoone Normal 265 | #265 Wurmple Wurmple Bug 266 | #266 Silcoon Silcoon Bug 267 | #267 Beautifly Beautifly Bug Flying 268 | #268 Cascoon Cascoon Bug 269 | #269 Dustox Dustox Bug Poison 270 | #270 Lotad Lotad Water Grass 271 | #271 Lombre Lombre Water Grass 272 | #272 Ludicolo Ludicolo Water Grass 273 | #273 Seedot Seedot Grass 274 | #274 Nuzleaf Nuzleaf Grass Dark 275 | #275 Shiftry Shiftry Grass Dark 276 | #276 Taillow Taillow Normal Flying 277 | #277 Swellow Swellow Normal Flying 278 | #278 Wingull Wingull Water Flying 279 | #279 Pelipper Pelipper Water Flying 280 | #280 Ralts Ralts Psychic Fairy 281 | #281 Kirlia Kirlia Psychic Fairy 282 | #282 Gardevoir Gardevoir Psychic Fairy 283 | #283 Surskit Surskit Bug Water 284 | #284 Masquerain Masquerain Bug Flying 285 | #285 Shroomish Shroomish Grass 286 | #286 Breloom Breloom Grass Fighting 287 | #287 Slakoth Slakoth Normal 288 | #288 Vigoroth Vigoroth Normal 289 | #289 Slaking Slaking Normal 290 | #290 Nincada Nincada Bug Ground 291 | #291 Ninjask Ninjask Bug Flying 292 | #292 Shedinja Shedinja Bug Ghost 293 | #293 Whismur Whismur Normal 294 | #294 Loudred Loudred Normal 295 | #295 Exploud Exploud Normal 296 | #296 Makuhita Makuhita Fighting 297 | #297 Hariyama Hariyama Fighting 298 | #298 Azurill Azurill Normal Fairy 299 | #299 Nosepass Nosepass Rock 300 | #300 Skitty Skitty Normal 301 | #301 Delcatty Delcatty Normal 302 | #302 Sableye Sableye Dark Ghost 303 | #303 Mawile Mawile Steel Fairy 304 | #304 Aron Aron Steel Rock 305 | #305 Lairon Lairon Steel Rock 306 | #306 Aggron Aggron Steel Rock 307 | #307 Meditite Meditite Fighting Psychic 308 | #308 Medicham Medicham Fighting Psychic 309 | #309 Electrike Electrike Electric 310 | #310 Manectric Manectric Electric 311 | #311 Plusle Plusle Electric 312 | #312 Minun Minun Electric 313 | #313 Volbeat Volbeat Bug 314 | #314 Illumise Illumise Bug 315 | #315 
Roselia Roselia Grass Poison 316 | #316 Gulpin Gulpin Poison 317 | #317 Swalot Swalot Poison 318 | #318 Carvanha Carvanha Water Dark 319 | #319 Sharpedo Sharpedo Water Dark 320 | #320 Wailmer Wailmer Water 321 | #321 Wailord Wailord Water 322 | #322 Numel Numel Fire Ground 323 | #323 Camerupt Camerupt Fire Ground 324 | #324 Torkoal Torkoal Fire 325 | #325 Spoink Spoink Psychic 326 | #326 Grumpig Grumpig Psychic 327 | #327 Spinda Spinda Normal 328 | #328 Trapinch Trapinch Ground 329 | #329 Vibrava Vibrava Ground Dragon 330 | #330 Flygon Flygon Ground Dragon 331 | #331 Cacnea Cacnea Grass 332 | #332 Cacturne Cacturne Grass Dark 333 | #333 Swablu Swablu Normal Flying 334 | #334 Altaria Altaria Dragon Flying 335 | #335 Zangoose Zangoose Normal 336 | #336 Seviper Seviper Poison 337 | #337 Lunatone Lunatone Rock Psychic 338 | #338 Solrock Solrock Rock Psychic 339 | #339 Barboach Barboach Water Ground 340 | #340 Whiscash Whiscash Water Ground 341 | #341 Corphish Corphish Water 342 | #342 Crawdaunt Crawdaunt Water Dark 343 | #343 Baltoy Baltoy Ground Psychic 344 | #344 Claydol Claydol Ground Psychic 345 | #345 Lileep Lileep Rock Grass 346 | #346 Cradily Cradily Rock Grass 347 | #347 Anorith Anorith Rock Bug 348 | #348 Armaldo Armaldo Rock Bug 349 | #349 Feebas Feebas Water 350 | #350 Milotic Milotic Water 351 | #351 Castform Castform Fire 352 | #351 Castform Castform Ice 353 | #351 Castform Castform Normal 354 | #351 Castform Castform Water 355 | #352 Kecleon Kecleon Normal 356 | #353 Shuppet Shuppet Ghost 357 | #354 Banette Banette Ghost 358 | #355 Duskull Duskull Ghost 359 | #356 Dusclops Dusclops Ghost 360 | #357 Tropius Tropius Grass Flying 361 | #358 Chimecho Chimecho Psychic 362 | #359 Absol Absol Dark 363 | #360 Wynaut Wynaut Psychic 364 | #361 Snorunt Snorunt Ice 365 | #362 Glalie Glalie Ice 366 | #363 Spheal Spheal Ice Water 367 | #364 Sealeo Sealeo Ice Water 368 | #365 Walrein Walrein Ice Water 369 | #366 Clamperl Clamperl Water 370 | #367 Huntail Huntail Water 371 | #368 Gorebyss Gorebyss Water 372 | #369 Relicanth Relicanth Water Rock 373 | #370 Luvdisc Luvdisc Water 374 | #371 Bagon Bagon Dragon 375 | #372 Shelgon Shelgon Dragon 376 | #373 Salamence Salamence Dragon Flying 377 | #374 Beldum Beldum Steel Psychic 378 | #375 Metang Metang Steel Psychic 379 | #376 Metagross Metagross Steel Psychic 380 | #377 Regirock Regirock Rock 381 | #378 Regice Regice Ice 382 | #379 Registeel Registeel Steel 383 | #380 Latias Latias Dragon Psychic 384 | #381 Latios Latios Dragon Psychic 385 | #382 Kyogre Kyogre Water 386 | #383 Groudon Groudon Ground 387 | #384 Rayquaza Rayquaza Dragon Flying 388 | #385 Jirachi Jirachi Steel Psychic 389 | #386 Deoxys Deoxys Psychic 390 | #386 Deoxys Deoxys Psychic 391 | #386 Deoxys Deoxys Psychic 392 | #386 Deoxys Deoxys Psychic 393 | #387 Turtwig Turtwig Grass 394 | #388 Grotle Grotle Grass 395 | #389 Torterra Torterra Grass Ground 396 | #390 Chimchar Chimchar Fire 397 | #391 Monferno Monferno Fire Fighting 398 | #392 Infernape Infernape Fire Fighting 399 | #393 Piplup Piplup Water 400 | #394 Prinplup Prinplup Water 401 | #395 Empoleon Empoleon Water Steel 402 | #396 Starly Starly Normal Flying 403 | #397 Staravia Staravia Normal Flying 404 | #398 Staraptor Staraptor Normal Flying 405 | #399 Bidoof Bidoof Normal 406 | #400 Bibarel Bibarel Normal Water 407 | #401 Kricketot Kricketot Bug 408 | #402 Kricketune Kricketune Bug 409 | #403 Shinx Shinx Electric 410 | #404 Luxio Luxio Electric 411 | #405 Luxray Luxray Electric 412 | #406 Budew Budew Grass Poison 413 | #407 
Roserade Roserade Grass Poison 414 | #408 Cranidos Cranidos Rock 415 | #409 Rampardos Rampardos Rock 416 | #410 Shieldon Shieldon Rock Steel 417 | #411 Bastiodon Bastiodon Rock Steel 418 | #412 Burmy Burmy Bug 419 | #412 Burmy Burmy Bug 420 | #412 Burmy Burmy Bug 421 | #413 Wormadam Wormadam Bug Grass 422 | #413 Wormadam Wormadam Bug Ground 423 | #413 Wormadam Wormadam Bug Steel 424 | #414 Mothim Mothim Bug Flying 425 | #415 Combee Combee Bug Flying 426 | #416 Vespiquen Vespiquen Bug Flying 427 | #417 Pachirisu Pachirisu Electric 428 | #418 Buizel Buizel Water 429 | #419 Floatzel Floatzel Water 430 | #420 Cherubi Cherubi Grass 431 | #421 Cherrim Cherrim Grass 432 | #422 Shellos Shellos Water 433 | #422 Shellos Shellos Water 434 | #423 Gastrodon Gastrodon Water Ground 435 | #423 Gastrodon Gastrodon Water Ground 436 | #424 Ambipom Ambipom Normal 437 | #425 Drifloon Drifloon Ghost Flying 438 | #426 Drifblim Drifblim Ghost Flying 439 | #427 Buneary Buneary Normal 440 | #428 Lopunny Lopunny Normal 441 | #429 Mismagius Mismagius Ghost 442 | #430 Honchkrow Honchkrow Dark Flying 443 | #431 Glameow Glameow Normal 444 | #432 Purugly Purugly Normal 445 | #433 Chingling Chingling Psychic 446 | #434 Stunky Stunky Poison Dark 447 | #435 Skuntank Skuntank Poison Dark 448 | #436 Bronzor Bronzor Steel Psychic 449 | #437 Bronzong Bronzong Steel Psychic 450 | #438 Bonsly Bonsly Rock 451 | #439 Mime Jr. Mime Jr. Psychic Fairy 452 | #440 Happiny Happiny Normal 453 | #441 Chatot Chatot Normal Flying 454 | #442 Spiritomb Spiritomb Ghost Dark 455 | #443 Gible Gible Dragon Ground 456 | #444 Gabite Gabite Dragon Ground 457 | #445 Garchomp Garchomp Dragon Ground 458 | #446 Munchlax Munchlax Normal 459 | #447 Riolu Riolu Fighting 460 | #448 Lucario Lucario Fighting Steel 461 | #449 Hippopotas Hippopotas Ground 462 | #450 Hippowdon Hippowdon Ground 463 | #451 Skorupi Skorupi Poison Bug 464 | #452 Drapion Drapion Poison Dark 465 | #453 Croagunk Croagunk Poison Fighting 466 | #454 Toxicroak Toxicroak Poison Fighting 467 | #455 Carnivine Carnivine Grass 468 | #456 Finneon Finneon Water 469 | #457 Lumineon Lumineon Water 470 | #458 Mantyke Mantyke Water Flying 471 | #459 Snover Snover Grass Ice 472 | #460 Abomasnow Abomasnow Grass Ice 473 | #461 Weavile Weavile Dark Ice 474 | #462 Magnezone Magnezone Electric Steel 475 | #463 Lickilicky Lickilicky Normal 476 | #464 Rhyperior Rhyperior Ground Rock 477 | #465 Tangrowth Tangrowth Grass 478 | #466 Electivire Electivire Electric 479 | #467 Magmortar Magmortar Fire 480 | #468 Togekiss Togekiss Fairy Flying 481 | #469 Yanmega Yanmega Bug Flying 482 | #470 Leafeon Leafeon Grass 483 | #471 Glaceon Glaceon Ice 484 | #472 Gliscor Gliscor Ground Flying 485 | #473 Mamoswine Mamoswine Ice Ground 486 | #474 Porygon-Z Porygon-Z Normal 487 | #475 Gallade Gallade Psychic Fighting 488 | #476 Probopass Probopass Rock Steel 489 | #477 Dusknoir Dusknoir Ghost 490 | #478 Froslass Froslass Ice Ghost 491 | #479 Rotom Rotom Electric Fire 492 | #479 Rotom Rotom Electric Flying 493 | #479 Rotom Rotom Electric Ghost 494 | #479 Rotom Rotom Electric Grass 495 | #479 Rotom Rotom Electric Ice 496 | #479 Rotom Rotom Electric Water 497 | #480 Uxie Uxie Psychic 498 | #481 Mesprit Mesprit Psychic 499 | #482 Azelf Azelf Psychic 500 | #483 Dialga Dialga Steel Dragon 501 | #484 Palkia Palkia Water Dragon 502 | #487 Giratina Giratina Ghost Dragon 503 | #487 Giratina Giratina Ghost Dragon 504 | #490 Manaphy Manaphy Water 505 | #494 Victini Victini Psychic Fire 506 | #495 Snivy Snivy Grass 507 | #496 Servine 
Servine Grass 508 | #497 Serperior Serperior Grass 509 | #498 Tepig Tepig Fire 510 | #499 Pignite Pignite Fire Fighting 511 | #500 Emboar Emboar Fire Fighting 512 | #501 Oshawott Oshawott Water 513 | #502 Dewott Dewott Water 514 | #503 Samurott Samurott Water 515 | #504 Patrat Patrat Normal 516 | #505 Watchog Watchog Normal 517 | #506 Lillipup Lillipup Normal 518 | #507 Herdier Herdier Normal 519 | #508 Stoutland Stoutland Normal 520 | #509 Purrloin Purrloin Dark 521 | #510 Liepard Liepard Dark 522 | #511 Pansage Pansage Grass 523 | #512 Simisage Simisage Grass 524 | #513 Pansear Pansear Fire 525 | #514 Simisear Simisear Fire 526 | #515 Panpour Panpour Water 527 | #516 Simipour Simipour Water 528 | #517 Munna Munna Psychic 529 | #518 Musharna Musharna Psychic 530 | #519 Pidove Pidove Normal Flying 531 | #520 Tranquill Tranquill Normal Flying 532 | #521 Unfezant Unfezant Normal Flying 533 | #521 Unfezant Unfezant Normal Flying 534 | #522 Blitzle Blitzle Electric 535 | #523 Zebstrika Zebstrika Electric 536 | #524 Roggenrola Roggenrola Rock 537 | #525 Boldore Boldore Rock 538 | #526 Gigalith Gigalith Rock 539 | #527 Woobat Woobat Psychic Flying 540 | #528 Swoobat Swoobat Psychic Flying 541 | #529 Drilbur Drilbur Ground 542 | #530 Excadrill Excadrill Ground Steel 543 | #531 Audino Audino Normal 544 | #532 Timburr Timburr Fighting 545 | #533 Gurdurr Gurdurr Fighting 546 | #534 Conkeldurr Conkeldurr Fighting 547 | #535 Tympole Tympole Water 548 | #536 Palpitoad Palpitoad Water Ground 549 | #537 Seismitoad Seismitoad Water Ground 550 | #538 Throh Throh Fighting 551 | #539 Sawk Sawk Fighting 552 | #540 Sewaddle Sewaddle Bug Grass 553 | #541 Swadloon Swadloon Bug Grass 554 | #542 Leavanny Leavanny Bug Grass 555 | #543 Venipede Venipede Bug Poison 556 | #544 Whirlipede Whirlipede Bug Poison 557 | #545 Scolipede Scolipede Bug Poison 558 | #546 Cottonee Cottonee Grass Fairy 559 | #547 Whimsicott Whimsicott Grass Fairy 560 | #548 Petilil Petilil Grass 561 | #549 Lilligant Lilligant Grass 562 | #550 Basculin Basculin Water 563 | #550 Basculin Basculin Water 564 | #551 Sandile Sandile Ground Dark 565 | #552 Krokorok Krokorok Ground Dark 566 | #553 Krookodile Krookodile Ground Dark 567 | #554 Darumaka Darumaka Fire 568 | #555 Darmanitan Darmanitan Fire 569 | #555 Darmanitan Darmanitan Fire Psychic 570 | #556 Maractus Maractus Grass 571 | #557 Dwebble Dwebble Bug Rock 572 | #558 Crustle Crustle Bug Rock 573 | #559 Scraggy Scraggy Dark Fighting 574 | #560 Scrafty Scrafty Dark Fighting 575 | #561 Sigilyph Sigilyph Psychic Flying 576 | #562 Yamask Yamask Ghost 577 | #563 Cofagrigus Cofagrigus Ghost 578 | #564 Tirtouga Tirtouga Water Rock 579 | #565 Carracosta Carracosta Water Rock 580 | #566 Archen Archen Rock Flying 581 | #567 Archeops Archeops Rock Flying 582 | #568 Trubbish Trubbish Poison 583 | #569 Garbodor Garbodor Poison 584 | #570 Zorua Zorua Dark 585 | #571 Zoroark Zoroark Dark 586 | #572 Minccino Minccino Normal 587 | #573 Cinccino Cinccino Normal 588 | #574 Gothita Gothita Psychic 589 | #575 Gothorita Gothorita Psychic 590 | #576 Gothitelle Gothitelle Psychic 591 | #577 Solosis Solosis Psychic 592 | #578 Duosion Duosion Psychic 593 | #579 Reuniclus Reuniclus Psychic 594 | #580 Ducklett Ducklett Water Flying 595 | #581 Swanna Swanna Water Flying 596 | #582 Vanillite Vanillite Ice 597 | #583 Vanillish Vanillish Ice 598 | #584 Vanilluxe Vanilluxe Ice 599 | #585 Deerling Deerling Normal Grass 600 | #586 Sawsbuck Sawsbuck Normal Grass 601 | #587 Emolga Emolga Electric Flying 602 | #588 Karrablast 
Karrablast Bug 603 | #589 Escavalier Escavalier Bug Steel 604 | #590 Foongus Foongus Grass Poison 605 | #591 Amoonguss Amoonguss Grass Poison 606 | #592 Frillish Frillish Water Ghost 607 | #592 Frillish Frillish Water Ghost 608 | #593 Jellicent Jellicent Water Ghost 609 | #593 Jellicent Jellicent Water Ghost 610 | #594 Alomomola Alomomola Water 611 | #595 Joltik Joltik Bug Electric 612 | #596 Galvantula Galvantula Bug Electric 613 | #597 Ferroseed Ferroseed Grass Steel 614 | #598 Ferrothorn Ferrothorn Grass Steel 615 | #599 Klink Klink Steel 616 | #600 Klang Klang Steel 617 | #601 Klinklang Klinklang Steel 618 | #602 Tynamo Tynamo Electric 619 | #603 Eelektrik Eelektrik Electric 620 | #604 Eelektross Eelektross Electric 621 | #605 Elgyem Elgyem Psychic 622 | #606 Beheeyem Beheeyem Psychic 623 | #607 Litwick Litwick Ghost Fire 624 | #608 Lampent Lampent Ghost Fire 625 | #609 Chandelure Chandelure Ghost Fire 626 | #610 Axew Axew Dragon 627 | #611 Fraxure Fraxure Dragon 628 | #612 Haxorus Haxorus Dragon 629 | #613 Cubchoo Cubchoo Ice 630 | #614 Beartic Beartic Ice 631 | #615 Cryogonal Cryogonal Ice 632 | #616 Shelmet Shelmet Bug 633 | #617 Accelgor Accelgor Bug 634 | #618 Stunfisk Stunfisk Ground Electric 635 | #619 Mienfoo Mienfoo Fighting 636 | #620 Mienshao Mienshao Fighting 637 | #621 Druddigon Druddigon Dragon 638 | #622 Golett Golett Ground Ghost 639 | #623 Golurk Golurk Ground Ghost 640 | #624 Pawniard Pawniard Dark Steel 641 | #625 Bisharp Bisharp Dark Steel 642 | #626 Bouffalant Bouffalant Normal 643 | #627 Rufflet Rufflet Normal Flying 644 | #628 Braviary Braviary Normal Flying 645 | #629 Vullaby Vullaby Dark Flying 646 | #630 Mandibuzz Mandibuzz Dark Flying 647 | #631 Heatmor Heatmor Fire 648 | #632 Durant Durant Bug Steel 649 | #633 Deino Deino Dark Dragon 650 | #634 Zweilous Zweilous Dark Dragon 651 | #635 Hydreigon Hydreigon Dark Dragon 652 | #636 Larvesta Larvesta Bug Fire 653 | #637 Volcarona Volcarona Bug Fire 654 | #638 Cobalion Cobalion Steel Fighting 655 | #639 Terrakion Terrakion Rock Fighting 656 | #640 Virizion Virizion Grass Fighting 657 | #641 Tornadus Tornadus Flying 658 | #642 Thundurus Thundurus Electric Flying 659 | #643 Reshiram Reshiram Dragon Fire 660 | #644 Zekrom Zekrom Dragon Electric 661 | #645 Landorus Landorus Ground Flying 662 | #646 Kyurem Kyurem Dragon Ice 663 | #647 Keldeo Keldeo Water Fighting 664 | #648 Meloetta Meloetta Normal Fighting 665 | #648 Meloetta Meloetta Normal Psychic 666 | #649 Genesect Genesect Bug Steel 667 | #650 Chespin Chespin Grass 668 | #651 Quilladin Quilladin Grass 669 | #652 Chesnaught Chesnaught Grass Fighting 670 | #653 Fennekin Fennekin Fire 671 | #654 Braixen Braixen Fire 672 | #655 Delphox Delphox Fire Psychic 673 | #656 Froakie Froakie Water 674 | #657 Frogadier Frogadier Water 675 | #658 Greninja Greninja Water Dark 676 | #659 Bunnelby Bunnelby Normal 677 | #660 Diggersby Diggersby Normal Ground 678 | #661 Fletchling Fletchling Normal Flying 679 | #662 Fletchinder Fletchinder Fire Flying 680 | #663 Talonflame Talonflame Fire Flying 681 | #664 Scatterbug Scatterbug Bug 682 | #665 Spewpa Spewpa Bug 683 | #666 Vivillon Vivillon Bug Flying 684 | #667 Litleo Litleo Fire Normal 685 | #668 Pyroar Pyroar Fire Normal 686 | #669 Flabébé Flabébé Fairy 687 | #670 Floette Floette Fairy 688 | #671 Florges Florges Fairy 689 | #672 Skiddo Skiddo Grass 690 | #673 Gogoat Gogoat Grass 691 | #674 Pancham Pancham Fighting 692 | #675 Pangoro Pangoro Fighting Dark 693 | #676 Furfrou Furfrou Normal 694 | #677 Espurr Espurr Psychic 695 | #678 
Meowstic Meowstic Psychic 696 | #679 Honedge Honedge Steel Ghost 697 | #680 Doublade Doublade Steel Ghost 698 | #681 Aegislash Aegislash Steel Ghost 699 | #682 Spritzee Spritzee Fairy 700 | #683 Aromatisse Aromatisse Fairy 701 | #684 Swirlix Swirlix Fairy 702 | #685 Slurpuff Slurpuff Fairy 703 | #686 Inkay Inkay Dark Psychic 704 | #687 Malamar Malamar Dark Psychic 705 | #688 Binacle Binacle Rock Water 706 | #689 Barbaracle Barbaracle Rock Water 707 | #690 Skrelp Skrelp Poison Water 708 | #691 Dragalge Dragalge Poison Dragon 709 | #692 Clauncher Clauncher Water 710 | #693 Clawitzer Clawitzer Water 711 | #694 Helioptile Helioptile Electric Normal 712 | #695 Heliolisk Heliolisk Electric Normal 713 | #696 Tyrunt Tyrunt Rock Dragon 714 | #697 Tyrantrum Tyrantrum Rock Dragon 715 | #698 Amaura Amaura Rock Ice 716 | #699 Aurorus Aurorus Rock Ice 717 | #700 Sylveon Sylveon Fairy 718 | #701 Hawlucha Hawlucha Fighting Flying 719 | #702 Dedenne Dedenne Electric Fairy 720 | #703 Carbink Carbink Rock Fairy 721 | #704 Goomy Goomy Dragon 722 | #705 Sliggoo Sliggoo Dragon 723 | #706 Goodra Goodra Dragon 724 | #707 Klefki Klefki Steel Fairy 725 | #708 Phantump Phantump Ghost Grass 726 | #709 Trevenant Trevenant Ghost Grass 727 | #710 Pumpkaboo Pumpkaboo Ghost Grass 728 | #711 Gourgeist Gourgeist Ghost Grass 729 | #712 Bergmite Bergmite Ice 730 | #713 Avalugg Avalugg Ice 731 | #714 Noibat Noibat Flying Dragon 732 | #715 Noivern Noivern Flying Dragon 733 | #716 Xerneas Xerneas Fairy 734 | #717 Yveltal Yveltal Dark Flying 735 | #718 Zygarde Zygarde Dragon Ground 736 | #719 Diancie Diancie Rock Fairy 737 | #720 Hoopa Hoopa Psychic Dark 738 | #720 Hoopa Hoopa Psychic Ghost 739 | #721 Volcanion Volcanion Fire Water 740 | #722 Rowlet Rowlet Grass Flying 741 | #723 Dartrix Dartrix Grass Flying 742 | #724 Decidueye Decidueye Grass Ghost 743 | #725 Litten Litten Fire 744 | #726 Torracat Torracat Fire 745 | #727 Incineroar Incineroar Fire Dark 746 | #728 Popplio Popplio Water 747 | #729 Brionne Brionne Water 748 | #730 Primarina Primarina Water Fairy 749 | #731 Pikipek Pikipek Normal Flying 750 | #732 Trumbeak Trumbeak Normal Flying 751 | #733 Toucannon Toucannon Normal Flying 752 | #734 Yungoos Yungoos Normal 753 | #735 Gumshoos Gumshoos Normal 754 | #736 Grubbin Grubbin Bug 755 | #737 Charjabug Charjabug Bug Electric 756 | #738 Vikavolt Vikavolt Bug Electric 757 | #739 Crabrawler Crabrawler Fighting 758 | #740 Crabominable Crabominable Fighting Ice 759 | #741 Oricorio Oricorio Electric Flying 760 | #741 Oricorio Oricorio Fire Flying 761 | #741 Oricorio Oricorio Ghost Flying 762 | #741 Oricorio Oricorio Psychic Flying 763 | #742 Cutiefly Cutiefly Bug Fairy 764 | #743 Ribombee Ribombee Bug Fairy 765 | #744 Rockruff Rockruff Rock 766 | #745 Lycanroc Lycanroc Rock 767 | #746 Wishiwashi Wishiwashi Water 768 | #747 Mareanie Mareanie Poison Water 769 | #748 Toxapex Toxapex Poison Water 770 | #749 Mudbray Mudbray Ground 771 | #750 Mudsdale Mudsdale Ground 772 | #751 Dewpider Dewpider Water Bug 773 | #752 Araquanid Araquanid Water Bug 774 | #753 Fomantis Fomantis Grass 775 | #754 Lurantis Lurantis Grass 776 | #755 Morelull Morelull Grass Fairy 777 | #756 Shiinotic Shiinotic Grass Fairy 778 | #757 Salandit Salandit Poison Fire 779 | #758 Salazzle Salazzle Poison Fire 780 | #759 Stufful Stufful Normal Fighting 781 | #760 Bewear Bewear Normal Fighting 782 | #761 Bounsweet Bounsweet Grass 783 | #762 Steenee Steenee Grass 784 | #763 Tsareena Tsareena Grass 785 | #764 Comfey Comfey Fairy 786 | #765 Oranguru Oranguru Normal Psychic 
787 | #766 Passimian Passimian Fighting 788 | #767 Wimpod Wimpod Bug Water 789 | #768 Golisopod Golisopod Bug Water 790 | #769 Sandygast Sandygast Ghost Ground 791 | #770 Palossand Palossand Ghost Ground 792 | #771 Pyukumuku Pyukumuku Water 793 | #772 Type: Null Type: Null Normal 794 | #773 Silvally Silvally Normal 795 | #774 Minior Minior Rock Flying 796 | #775 Komala Komala Normal 797 | #776 Turtonator Turtonator Fire Dragon 798 | #777 Togedemaru Togedemaru Electric Steel 799 | #778 Mimikyu Mimikyu Ghost Fairy 800 | #779 Bruxish Bruxish Water Psychic 801 | #780 Drampa Drampa Normal Dragon 802 | #781 Dhelmise Dhelmise Ghost Grass 803 | #782 Jangmo-o Jangmo-o Dragon 804 | #783 Hakamo-o Hakamo-o Dragon Fighting 805 | #784 Kommo-o Kommo-o Dragon Fighting 806 | #785 Tapu Koko Tapu Koko Electric Fairy 807 | #786 Tapu Lele Tapu Lele Psychic Fairy 808 | #787 Tapu Bulu Tapu Bulu Grass Fairy 809 | #788 Tapu Fini Tapu Fini Water Fairy 810 | #789 Cosmog Cosmog Psychic 811 | #790 Cosmoem Cosmoem Psychic 812 | #791 Solgaleo Solgaleo Psychic Steel 813 | #792 Lunala Lunala Psychic Ghost 814 | #793 Nihilego Nihilego Rock Poison 815 | #794 Buzzwole Buzzwole Bug Fighting 816 | #795 Pheromosa Pheromosa Bug Fighting 817 | #796 Xurkitree Xurkitree Electric 818 | #797 Celesteela Celesteela Steel Flying 819 | #798 Kartana Kartana Grass Steel 820 | #799 Guzzlord Guzzlord Dark Dragon 821 | #800 Necrozma Necrozma Psychic 822 | #801 Magearna Magearna Steel Fairy 823 | #802 Marshadow Marshadow Fighting Ghost 824 | #803 Poipole Poipole Poison 825 | #804 Naganadel Naganadel Poison Dragon 826 | #805 Stakataka Stakataka Rock Steel 827 | #806 Blacephalon Blacephalon Fire Ghost 828 | #807 Zeraora Zeraora Electric 829 | #808 Meltan Meltan Steel 830 | #809 Melmetal Melmetal Steel 831 | #810 Grookey Grookey Grass 832 | #811 Thwackey Thwackey Grass 833 | #812 Rillaboom Rillaboom Grass 834 | #813 Scorbunny Scorbunny Fire 835 | #814 Raboot Raboot Fire 836 | #815 Cinderace Cinderace Fire 837 | #816 Sobble Sobble Water 838 | #817 Drizzile Drizzile Water 839 | #818 Inteleon Inteleon Water 840 | #819 Skwovet Skwovet Normal 841 | #820 Greedent Greedent Normal 842 | #821 Rookidee Rookidee Flying 843 | #822 Corvisquire Corvisquire Flying 844 | #823 Corviknight Corviknight Flying Steel 845 | #824 Blipbug Blipbug Bug 846 | #825 Dottler Dottler Bug Psychic 847 | #826 Orbeetle Orbeetle Bug Psychic 848 | #827 Nickit Nickit Dark 849 | #828 Thievul Thievul Dark 850 | #829 Gossifleur Gossifleur Grass 851 | #830 Eldegoss Eldegoss Grass 852 | #831 Wooloo Wooloo Normal 853 | #832 Dubwool Dubwool Normal 854 | #833 Chewtle Chewtle Water 855 | #834 Drednaw Drednaw Water Rock 856 | #835 Yamper Yamper Electric 857 | #836 Boltund Boltund Electric 858 | #837 Rolycoly Rolycoly Rock 859 | #838 Carkol Carkol Rock Fire 860 | #839 Coalossal Coalossal Rock Fire 861 | #840 Applin Applin Grass Dragon 862 | #841 Flapple Flapple Grass Dragon 863 | #842 Appletun Appletun Grass Dragon 864 | #843 Silicobra Silicobra Ground 865 | #844 Sandaconda Sandaconda Ground 866 | #845 Cramorant Cramorant Flying Water 867 | #846 Arrokuda Arrokuda Water 868 | #847 Barraskewda Barraskewda Water 869 | #848 Toxel Toxel Electric Poison 870 | #849 Toxtricity Toxtricity Electric Poison 871 | #850 Sizzlipede Sizzlipede Fire Bug 872 | #851 Centiskorch Centiskorch Fire Bug 873 | #852 Clobbopus Clobbopus Fighting 874 | #853 Grapploct Grapploct Fighting 875 | #854 Sinistea Sinistea Ghost 876 | #855 Polteageist Polteageist Ghost 877 | #856 Hatenna Hatenna Psychic 878 | #857 Hattrem Hattrem 
Psychic 879 | #858 Hatterene Hatterene Psychic Fairy 880 | #859 Impidimp Impidimp Dark Fairy 881 | #860 Morgrem Morgrem Dark Fairy 882 | #861 Grimmsnarl Grimmsnarl Dark Fairy 883 | #862 Obstagoon Obstagoon Dark Normal 884 | #863 Perrserker Perrserker Steel 885 | #864 Cursola Cursola Ghost 886 | #865 Sirfetch'd Sirfetch'd Fighting 887 | #866 Mr. Rime Mr. Rime Ice Psychic 888 | #867 Runerigus Runerigus Ground Ghost 889 | #868 Milcery Milcery Fairy 890 | #869 Alcremie Alcremie Fairy 891 | #870 Falinks Falinks Fighting 892 | #871 Pincurchin Pincurchin Electric 893 | #872 Snom Snom Ice Bug 894 | #873 Frosmoth Frosmoth Ice Bug 895 | #874 Stonjourner Stonjourner Rock 896 | #875 Eiscue Eiscue Ice 897 | #876 Indeedee Indeedee Psychic Normal 898 | #877 Morpeko Morpeko Electric Dark 899 | #878 Cufant Cufant Steel 900 | #879 Copperajah Copperajah Steel 901 | #880 Dracozolt Dracozolt Electric Dragon 902 | #881 Arctozolt Arctozolt Electric Ice 903 | #882 Dracovish Dracovish Water Dragon 904 | #883 Arctovish Arctovish Water Ice 905 | #884 Duraludon Duraludon Steel Dragon 906 | #885 Dreepy Dreepy Dragon Ghost 907 | #886 Drakloak Drakloak Dragon Ghost 908 | #887 Dragapult Dragapult Dragon Ghost 909 | #888 Zacian Zacian Fairy 910 | #888 Zacian Zacian Fairy Steel 911 | #889 Zamazenta Zamazenta Fighting 912 | #889 Zamazenta Zamazenta Fighting Steel 913 | #890 Eternatus Eternatus Poison Dragon 914 | -------------------------------------------------------------------------------- /book_data/sample.csv: -------------------------------------------------------------------------------- 1 | Item,Quantity,Price 2 | $Banana, organic$,1,0.99 3 | Pear,7,1.24 4 | $Cake, chocolate$,1,14.50 5 | -------------------------------------------------------------------------------- /book_data/sample_frame.csv: -------------------------------------------------------------------------------- 1 | string_column,integer_column,float_column,date_column 2 | blue,3,2.99,2019-04-01 3 | green,7,6.54,2019-07-18 4 | yellow,10,9.78,1984-07-14 5 | red,5,5.17,2020-01-01 6 | -------------------------------------------------------------------------------- /book_data/shows-silicon-valley.json: -------------------------------------------------------------------------------- 1 | {"id":143,"url":"http://www.tvmaze.com/shows/143/silicon-valley","name":"Silicon Valley","type":"Scripted","language":"English","genres":["Comedy"],"status":"Ended","runtime":30,"premiered":"2014-04-06","officialSite":"http://www.hbo.com/silicon-valley/","schedule":{"time":"22:00","days":["Sunday"]},"rating":{"average":8.5},"weight":94,"network":{"id":8,"name":"HBO","country":{"name":"United States","code":"US","timezone":"America/New_York"}},"webChannel":null,"externals":{"tvrage":33759,"thetvdb":277165,"imdb":"tt2575988"},"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_portrait/215/538434.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/215/538434.jpg"},"summary":"
<p>In the high-tech gold rush of modern Silicon Valley, the people most qualified to succeed are the least capable of handling success. From Mike Judge comes this satire about a programmer whose game-changing algorithm becomes the subject of a valley-wide bidding war.</p>
","updated":1604233967,"_links":{"self":{"href":"http://api.tvmaze.com/shows/143"},"previousepisode":{"href":"http://api.tvmaze.com/episodes/1757884"}},"_embedded":{"episodes":[{"id":10897,"url":"http://www.tvmaze.com/episodes/10897/silicon-valley-1x01-minimum-viable-product","name":"Minimum Viable Product","season":1,"number":1,"type":"regular","airdate":"2014-04-06","airtime":"22:00","airstamp":"2014-04-07T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123633.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123633.jpg"},"summary":"
<p>Attending an elaborate launch party, Richard and his computer programmer friends - Big Head, Dinesh and Gilfoyle - dream of making it big. Instead, they're living in the communal Hacker Hostel owned by former programmer Erlich, who gets to claim ten percent of anything they invent there. When it becomes clear that Richard has developed a powerful compression algorithm for his website, Pied Piper, he finds himself courted by Gavin Belson, his egomaniacal corporate boss, who offers a $10 million buyout by his firm, Hooli. But Richard holds back when well-known investor Peter Gregory makes a counteroffer.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/10897"}}},{"id":10898,"url":"http://www.tvmaze.com/episodes/10898/silicon-valley-1x02-the-cap-table","name":"The Cap Table","season":1,"number":2,"type":"regular","airdate":"2014-04-13","airtime":"22:00","airstamp":"2014-04-14T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123634.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123634.jpg"},"summary":"
<p>After a celebratory party at the Hacker Hostel, Richard and Erlich learn that Peter Gregory won't pay up until they deliver a viable business plan that includes a slimmed-down staff. A desperate Richard hires former Belson underling Jared, and they set about trying to trim the fat. While Gilfoyle and Dinesh prove essential, Big Head's place in the company is less certain.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/10898"}}},{"id":10899,"url":"http://www.tvmaze.com/episodes/10899/silicon-valley-1x03-articles-of-incorporation","name":"Articles of Incorporation","season":1,"number":3,"type":"regular","airdate":"2014-04-20","airtime":"22:00","airstamp":"2014-04-21T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123635.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123635.jpg"},"summary":"
<p>While Gavin Belson begins to hype Nucleus, a competing compression platform, Richard learns that the name Pied Piper is already registered to a sprinkler company, forcing him to negotiate. Meanwhile, Erlich goes on a vision quest for a new company name, and Peter Gregory proves elusive when one of his companies asks for money.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/10899"}}},{"id":10900,"url":"http://www.tvmaze.com/episodes/10900/silicon-valley-1x04-fiduciary-duties","name":"Fiduciary Duties","season":1,"number":4,"type":"regular","airdate":"2014-04-27","airtime":"22:00","airstamp":"2014-04-28T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123636.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123636.jpg"},"summary":"
<p>At Peter's toga party, Richard drunkenly promises to make Erlich a board member, which he regrets the next morning. After being unassigned at Hooli, Big Head finds others like him who have made careers out of doing nothing. Richard struggles to put Pied Piper's vision into words for a presentation without Erlich; later, he discovers an interesting connection between Peter and Gavin Belson.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/10900"}}},{"id":10901,"url":"http://www.tvmaze.com/episodes/10901/silicon-valley-1x05-signaling-risk","name":"Signaling Risk","season":1,"number":5,"type":"regular","airdate":"2014-05-04","airtime":"22:00","airstamp":"2014-05-05T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123637.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123637.jpg"},"summary":"
<p>Erlich convinces a graffiti artist to create Pied Piper's logo, with controversial results. Jared tries to make the company more efficient. After Gavin Belson and Peter Gregory unexpectedly come face-to-face, Richard learns that he only has eight weeks to prepare for a live demo at TechCrunch Disrupt.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/10901"}}},{"id":10902,"url":"http://www.tvmaze.com/episodes/10902/silicon-valley-1x06-third-party-insourcing","name":"Third Party Insourcing","season":1,"number":6,"type":"regular","airdate":"2014-05-11","airtime":"22:00","airstamp":"2014-05-12T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123638.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123638.jpg"},"summary":"
<p>Richard feels threatened when the team hires \"The Carver\", a hacker with a notorious reputation, to help with Pied Piper's cloud. Jared finds himself taken for a ride when he seeks out Peter Gregory's signature. Erlich and Dinesh compete for the attention of Tara, Gilfoyle's visiting girlfriend. Later, Dinesh faces a sexual dilemma.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/10902"}}},{"id":10903,"url":"http://www.tvmaze.com/episodes/10903/silicon-valley-1x07-proof-of-concept","name":"Proof of Concept","season":1,"number":7,"type":"regular","airdate":"2014-05-18","airtime":"22:00","airstamp":"2014-05-19T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123639.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123639.jpg"},"summary":"
<p>At TechCrunch Disrupt, Richard feels the pressure to finish his demo, but finds himself distracted by a girl he dated briefly, who's now spreading rumors about him. Jared worries that Monica is taking his place in the company. Dinesh develops a crush on a girl at a neighboring booth. Erlich's scandalous past connection to one of the judges threatens Pied Piper's chances.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/10903"}}},{"id":10904,"url":"http://www.tvmaze.com/episodes/10904/silicon-valley-1x08-optimal-tip-to-tip-efficiency","name":"Optimal Tip-to-Tip Efficiency","season":1,"number":8,"type":"regular","airdate":"2014-06-01","airtime":"22:00","airstamp":"2014-06-02T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123640.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123640.jpg"},"summary":"
<p>Poised to compete at TechCrunch Disrupt, the guys of Pied Piper become worried after an impressive presentation by Gavin Belson. As Jared tries to pivot the company, Richard is inspired to make big changes at the last minute.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/10904"}}},{"id":117409,"url":"http://www.tvmaze.com/episodes/117409/silicon-valley-2x01-sand-hill-shuffle","name":"Sand Hill Shuffle","season":2,"number":1,"type":"regular","airdate":"2015-04-12","airtime":"22:00","airstamp":"2015-04-13T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123616.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123616.jpg"},"summary":"
<p>Season 2 begins with the Pied Piper guys being wined and dined by every venture capitalist under the sun, while Monica adjusts to a new managing partner at Raviga as the company faces major changes.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/117409"}}},{"id":142992,"url":"http://www.tvmaze.com/episodes/142992/silicon-valley-2x02-runaway-devaluation","name":"Runaway Devaluation","season":2,"number":2,"type":"regular","airdate":"2015-04-19","airtime":"22:00","airstamp":"2015-04-20T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123618.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123618.jpg"},"summary":"
<p>Pied Piper could go under if Richard and the guys can't find legal and financial help in the wake of Hooli's bombshell. Meanwhile, Dinesh tries to thwart a fund-raising campaign for his cousin's new app; and Monica tries to keep her interest in Pied Piper separate from her job.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/142992"}}},{"id":142993,"url":"http://www.tvmaze.com/episodes/142993/silicon-valley-2x03-bad-money","name":"Bad Money","season":2,"number":3,"type":"regular","airdate":"2015-04-26","airtime":"22:00","airstamp":"2015-04-27T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123619.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123619.jpg"},"summary":"
<p>Richard mulls a proposal by Gavin, but also considers a pitch from Russ Hanneman about backing Pied Piper. Meanwhile, Monica learns surprising news about Richard's deal with Hooli; and Gilfoyle and Dinesh go to extremes to get what they want.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/142993"}}},{"id":142994,"url":"http://www.tvmaze.com/episodes/142994/silicon-valley-2x04-the-lady","name":"The Lady","season":2,"number":4,"type":"regular","airdate":"2015-05-03","airtime":"22:00","airstamp":"2015-05-04T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123620.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123620.jpg"},"summary":"
<p>Richard butts heads with Erlich over a prospective hire, while Dinesh and Gilfoyle become suspicious that a new employee they recommended is commanding a higher salary. Big Head gets a surprising promotion at Hooli. At a board meeting, Monica and Richard find themselves outvoted; Jared institutes a workplace harassment policy.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/142994"}}},{"id":153965,"url":"http://www.tvmaze.com/episodes/153965/silicon-valley-2x05-server-space","name":"Server Space","season":2,"number":5,"type":"regular","airdate":"2015-05-10","airtime":"22:00","airstamp":"2015-05-11T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/10/27312.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/10/27312.jpg"},"summary":"
<p>Gavin creates interference that hinders Pied Piper's expansion. Meanwhile, the guys could be threatened by a nosy neighbor; Gilfoyle sets out to build servers; Richard's reluctant to let Jared move in; and Big Head's leadership skills are suspect.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/153965"}}},{"id":154580,"url":"http://www.tvmaze.com/episodes/154580/silicon-valley-2x06-homicide","name":"Homicide","season":2,"number":6,"type":"regular","airdate":"2015-05-17","airtime":"22:00","airstamp":"2015-05-18T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123622.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123622.jpg"},"summary":"
<p>Erlich runs into an old protégé who's now an energy drink billionaire when Monica urges the guys to pursue a livestream opportunity, but Richard learns their friendship isn't what Erlich thinks it is. Meanwhile, Jared wants Carla and Monica to be friends.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/154580"}}},{"id":155129,"url":"http://www.tvmaze.com/episodes/155129/silicon-valley-2x07-adult-content","name":"Adult Content","season":2,"number":7,"type":"regular","airdate":"2015-05-24","airtime":"22:00","airstamp":"2015-05-25T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123623.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123623.jpg"},"summary":"
<p>The team fields job offers and Russ is distracted by financial news, but Richard realizes Pied Piper could fold if it doesn't merge with a hated rival. Meanwhile, Dinesh tries to woo a woman online; and Gavin looks on the bright side of Nucleus' failure.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/155129"}}},{"id":155130,"url":"http://www.tvmaze.com/episodes/155130/silicon-valley-2x08-white-hatblack-hat","name":"White Hat/Black Hat","season":2,"number":8,"type":"regular","airdate":"2015-05-31","airtime":"22:00","airstamp":"2015-06-01T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123624.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123624.jpg"},"summary":"
<p>Richard gets paranoid about security after he takes pity on a competitor and inadvertently starts a feud. Meanwhile, Jared fibs about Pied Piper's size; and Gavin looks for a scapegoat when he feels pressure from board members.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/155130"}}},{"id":155199,"url":"http://www.tvmaze.com/episodes/155199/silicon-valley-2x09-binding-arbitration","name":"Binding Arbitration","season":2,"number":9,"type":"regular","airdate":"2015-06-07","airtime":"22:00","airstamp":"2015-06-08T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123625.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123625.jpg"},"summary":"
<p>Erlich wants to testify when Pied Piper and Hooli enter binding arbitration, but Richard worries that his rival's claims could have merit. Meanwhile, Jared, Dinesh and Gilfoyle debate a philosophical theory; and Big Head gets a boost.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/155199"}}},{"id":155200,"url":"http://www.tvmaze.com/episodes/155200/silicon-valley-2x10-two-days-of-the-condor","name":"Two Days of the Condor","season":2,"number":10,"type":"regular","airdate":"2015-06-14","airtime":"22:00","airstamp":"2015-06-15T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/49/123626.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/49/123626.jpg"},"summary":"
<p>As the guys await the verdict on Pied Piper's fate, an unexpected real-life drama draws a spike in traffic to their livestream and leaves them fighting to hold things together - literally. While Erlich considers his future, Richard scrambles to save Pied Piper's.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/155200"}}},{"id":560883,"url":"http://www.tvmaze.com/episodes/560883/silicon-valley-3x01-founder-friendly","name":"Founder Friendly","season":3,"number":1,"type":"regular","airdate":"2016-04-24","airtime":"22:00","airstamp":"2016-04-25T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/61/154297.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/61/154297.jpg"},"summary":"
<p>After being unceremoniously fired, an angry Richard faces a tough decision: accept the diminished role of CTO, or leave Pied Piper for good. Erlich takes a shine to Jack Barker, Laurie's new choice of CEO, while Dinesh and Gilfoyle weigh their options in Richard's absence. At Hooli, Gavin tries to improve his image by admitting failure, and Big Head gets wind of major changes.</p>
","_links":{"self":{"href":"http://api.tvmaze.com/episodes/560883"}}},{"id":668661,"url":"http://www.tvmaze.com/episodes/668661/silicon-valley-3x02-two-in-the-box","name":"Two in the Box","season":3,"number":2,"type":"regular","airdate":"2016-05-01","airtime":"22:00","airstamp":"2016-05-02T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/55/138938.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/55/138938.jpg"},"summary":"

The new and improved Pied Piper impresses Dinesh and Gilfoyle, but worries Richard; Jared and Erlich both face housing issues; Gavin suggests a controversial move.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/668661"}}},{"id":668662,"url":"http://www.tvmaze.com/episodes/668662/silicon-valley-3x03-meinertzhagens-haversack","name":"Meinertzhagen's Haversack","season":3,"number":3,"type":"regular","airdate":"2016-05-08","airtime":"22:00","airstamp":"2016-05-09T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/56/142406.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/56/142406.jpg"},"summary":"

Richard searches for a way around Jack; Gilfoyle opens himself up to recruiters; Dinesh draws unwanted attention from a recent purchase.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/668662"}}},{"id":670680,"url":"http://www.tvmaze.com/episodes/670680/silicon-valley-3x04-maleant-data-systems-solutions","name":"Maleant Data Systems Solutions","season":3,"number":4,"type":"regular","airdate":"2016-05-15","airtime":"22:00","airstamp":"2016-05-16T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/58/145807.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/58/145807.jpg"},"summary":"

The Pied Piper guys struggle to phone it in; Erlich faces competition; Monica takes a stand; Gavin makes a decision about Nucleus.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/670680"}}},{"id":670682,"url":"http://www.tvmaze.com/episodes/670682/silicon-valley-3x05-the-empty-chair","name":"The Empty Chair","season":3,"number":5,"type":"regular","airdate":"2016-05-22","airtime":"22:00","airstamp":"2016-05-23T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/59/147688.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/59/147688.jpg"},"summary":"

Richard lets his ego get in the way at an interview; Dinesh, Gilfoyle and Jared misplace hardware; Erlich pitches his plans to Big Head.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/670682"}}},{"id":670681,"url":"http://www.tvmaze.com/episodes/670681/silicon-valley-3x06-bachmanity-insanity","name":"Bachmanity Insanity","season":3,"number":6,"type":"regular","airdate":"2016-05-29","airtime":"22:00","airstamp":"2016-05-30T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/60/150310.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/60/150310.jpg"},"summary":"

Richard's new relationship is threatened by neuroses; Big Head and Erlich's launch party has snags; Dinesh falls for a foreign coworker.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/670681"}}},{"id":717453,"url":"http://www.tvmaze.com/episodes/717453/silicon-valley-3x07-to-build-a-better-beta","name":"To Build a Better Beta","season":3,"number":7,"type":"regular","airdate":"2016-06-05","airtime":"22:00","airstamp":"2016-06-06T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/61/152985.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/61/152985.jpg"},"summary":"

When the guys decide to release the beta version of Pied Piper, they receive an unexpected response. With a limited number of beta invites each, Dinesh worries about his lack of friends, while Gilfoyle looks to catch him in a lie. Monica worries about how to deliver criticism. Facing financial woes, Erlich considers a big decision, and Gavin challenges the Nucleus team to do the impossible.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/717453"}}},{"id":729570,"url":"http://www.tvmaze.com/episodes/729570/silicon-valley-3x08-bachmans-earnings-over-ride","name":"Bachman's Earning's Over-ride","season":3,"number":8,"type":"regular","airdate":"2016-06-12","airtime":"22:00","airstamp":"2016-06-13T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/61/154296.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/61/154296.jpg"},"summary":"

Erlich struggles to come clean to Richard, who is forced to make a choice between their friendship and the company's future. Jared's new Pied Piper apparel makes a splash, and divides Dinesh and Gilfoyle. As Gavin faces major life changes, the guys celebrate a rare victory.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/729570"}}},{"id":729571,"url":"http://www.tvmaze.com/episodes/729571/silicon-valley-3x09-daily-active-users","name":"Daily Active Users","season":3,"number":9,"type":"regular","airdate":"2016-06-19","airtime":"22:00","airstamp":"2016-06-20T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/62/155043.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/62/155043.jpg"},"summary":"

Upon discovering surprising stats, Richard attempts to bridge the gap between Pied Piper and its users, leading Jared to take drastic measures to hold everything together. Gavin learns secrets about the competition and decides to bring in a new face to reclaim his former glory.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/729571"}}},{"id":729572,"url":"http://www.tvmaze.com/episodes/729572/silicon-valley-3x10-the-uptick","name":"The Uptick","season":3,"number":10,"type":"regular","airdate":"2016-06-26","airtime":"22:00","airstamp":"2016-06-27T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/64/161596.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/64/161596.jpg"},"summary":"

With Pied Piper's future in question, Erlich's publicity success leaves Richard in a moral quandary, just as Dinesh's video-chat app gathers speed. While Laurie prepares to jump ship, Gavin's comeback at Hooli is threatened by his displays of grandeur.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/729572"}}},{"id":1069871,"url":"http://www.tvmaze.com/episodes/1069871/silicon-valley-4x01-success-failure","name":"Success Failure","season":4,"number":1,"type":"regular","airdate":"2017-04-23","airtime":"22:00","airstamp":"2017-04-24T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/108/271257.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/108/271257.jpg"},"summary":"

In the wake of Pied Piper's clickfarm scandal, the guys struggle to find funding for Pied Piper's video-chat app to keep up with their rapidly growing user base. Erlich faces resistance from Big Head's dad, while Gavin balks after Jack steps on his toes at Hooli. Having a hard time adjusting to his company's pivot, Richard gets sage advice from an unexpected source, leading him to a big idea that could change his future.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1069871"}}},{"id":1093340,"url":"http://www.tvmaze.com/episodes/1093340/silicon-valley-4x02-terms-of-service","name":"Terms of Service","season":4,"number":2,"type":"regular","airdate":"2017-04-30","airtime":"22:00","airstamp":"2017-05-01T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/109/274591.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/109/274591.jpg"},"summary":"

Richard butts heads with Dinesh, whose new position goes to his head. Later, Richard discovers interesting data about PiperChat's users. Erlich makes a play to be involved in Jian-Yang's new app; Jared sets ground rules in his friendship with Richard; Gavin's paranoia over Jack's enthusiasm causes him to make a rash decision.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1093340"}}},{"id":1119032,"url":"http://www.tvmaze.com/episodes/1119032/silicon-valley-4x03-intellectual-property","name":"Intellectual Property","season":4,"number":3,"type":"regular","airdate":"2017-05-07","airtime":"22:00","airstamp":"2017-05-08T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/110/277036.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/110/277036.jpg"},"summary":"

An overtired Richard pushes himself to the brink of sanity while trying to move ahead with his next big idea. Eyeing a comeback, Erlich pressures an uncooperative Jian-Yang. Monica sets a trap at Raviga to improve her standing with Laurie. Dinesh goes on a date; Big Head enters the world of academia; Gavin faces an unknown future.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1119032"}}},{"id":1119033,"url":"http://www.tvmaze.com/episodes/1119033/silicon-valley-4x04-teambuilding-exercise","name":"Teambuilding Exercise","season":4,"number":4,"type":"regular","airdate":"2017-05-14","airtime":"22:00","airstamp":"2017-05-15T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/111/279792.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/111/279792.jpg"},"summary":"

When Richard reaches out to an unlikely ally, Jared worries about the company he's keeping. Gilfoyle gets tough on security in the wake of Dinesh's latest dalliance; later, he considers whether to put his pride aside for a job. Concerned about Jian-Yang's commitment to his app, Erlich takes matters into his own hands.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1119033"}}},{"id":1151634,"url":"http://www.tvmaze.com/episodes/1151634/silicon-valley-4x05-the-blood-boy","name":"The Blood Boy","season":4,"number":5,"type":"regular","airdate":"2017-05-21","airtime":"22:00","airstamp":"2017-05-22T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/112/282428.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/112/282428.jpg"},"summary":"

Cracks in Richard's latest partnership become more apparent when he's forced to deal with an unexpected interloper. As things get more serious, Dinesh scrambles to find a way out of his new relationship. After learning of surprising developments afoot at Raviga, Monica has trouble deciding which horse to back.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1151634"}}},{"id":1151635,"url":"http://www.tvmaze.com/episodes/1151635/silicon-valley-4x06-customer-service","name":"Customer Service","season":4,"number":6,"type":"regular","airdate":"2017-05-28","airtime":"22:00","airstamp":"2017-05-29T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/114/285730.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/114/285730.jpg"},"summary":"

On the hunt for financial support, Richard looks outside the tech bubble, and crosses paths with a contentious figure from Pied Piper's past. Erlich reaches out to Monica and Laurie in search of a new endeavor. A launch gone wrong finds Dinesh and Gilfoyle at war as Jared tries to keep the peace.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1151635"}}},{"id":1151636,"url":"http://www.tvmaze.com/episodes/1151636/silicon-valley-4x07-the-patent-troll","name":"The Patent Troll","season":4,"number":7,"type":"regular","airdate":"2017-06-04","airtime":"22:15","airstamp":"2017-06-05T02:15:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/115/288264.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/115/288264.jpg"},"summary":"

When Richard decides to stand up to a patent troll, his defiance ends up coming back to haunt him. Gilfoyle goes to desperate measures to battle Jian-Yang's new smart fridge. Jared embraces multiple identities in his quest to reduce costs. Erlich tries to hang with the alpha males.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1151636"}}},{"id":1151637,"url":"http://www.tvmaze.com/episodes/1151637/silicon-valley-4x08-the-keenan-vortex","name":"The Keenan Vortex","season":4,"number":8,"type":"regular","airdate":"2017-06-11","airtime":"22:00","airstamp":"2017-06-12T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/116/291044.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/116/291044.jpg"},"summary":"

An unexpected increase in data traffic leads Richard to turn to Erlich for help in garnering a deal with Keenan Feldspar, Silicon Valley's latest \"it\" boy – but when Keenan makes a too-good-to-resist offer, Richard must weigh Pied Piper's future against a potential mutiny. Jack faces setbacks while preparing for Hooli-Con.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1151637"}}},{"id":1168582,"url":"http://www.tvmaze.com/episodes/1168582/silicon-valley-4x09-hooli-con","name":"Hooli-Con","season":4,"number":9,"type":"regular","airdate":"2017-06-18","airtime":"22:00","airstamp":"2017-06-19T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/117/293508.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/117/293508.jpg"},"summary":"

The Pied Piper guys try to pull off a stealth plan at Hooli-Con.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1168582"}}},{"id":1168583,"url":"http://www.tvmaze.com/episodes/1168583/silicon-valley-4x10-server-error","name":"Server Error","season":4,"number":10,"type":"regular","airdate":"2017-06-25","airtime":"22:00","airstamp":"2017-06-26T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/118/295689.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/118/295689.jpg"},"summary":"

Richard finds himself in a web of lies; Jared plans his exit; Jack bets big; Gavin plots a comeback.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1168583"}}},{"id":1387194,"url":"http://www.tvmaze.com/episodes/1387194/silicon-valley-5x01-grow-fast-or-die-slow","name":"Grow Fast or Die Slow","season":5,"number":1,"type":"regular","airdate":"2018-03-25","airtime":"22:00","airstamp":"2018-03-26T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/149/374260.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/149/374260.jpg"},"summary":"

Now that Pied Piper has ample funding and new offices, the pressure to get things right stymies Richard and forces him to grow the company in a way he hadn't planned. A picky Dinesh and Gilfoyle question their ability to make good decisions. After returning to Hooli, Gavin worries about becoming antiquated.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1387194"}}},{"id":1419259,"url":"http://www.tvmaze.com/episodes/1419259/silicon-valley-5x02-reorientation","name":"Reorientation","season":5,"number":2,"type":"regular","airdate":"2018-04-01","airtime":"22:00","airstamp":"2018-04-02T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/151/378576.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/151/378576.jpg"},"summary":"

Panicked by suddenly leading a much larger team, Richard finds himself managing a number of small conflicts in his efforts to unite his new employees. Dinesh celebrates a new purchase that Gilfoyle looks to spoil. Jian-Yang goes to court. Gavin meets pushback over his signature and what it says about him.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1419259"}}},{"id":1419260,"url":"http://www.tvmaze.com/episodes/1419260/silicon-valley-5x03-chief-operating-officer","name":"Chief Operating Officer","season":5,"number":3,"type":"regular","airdate":"2018-04-08","airtime":"22:00","airstamp":"2018-04-09T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/152/380370.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/152/380370.jpg"},"summary":"

Encouraged by Jared to strike up a friendship with Dana, a like-minded CEO, Richard instead finds himself charmed by Dana's COO, who challenges his loyalty to Gilfoyle. Facing limited housing options thanks to his impulse purchase, Dinesh searches for a new roommate.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1419260"}}},{"id":1436933,"url":"http://www.tvmaze.com/episodes/1436933/silicon-valley-5x04-tech-evangelist","name":"Tech Evangelist","season":5,"number":4,"type":"regular","airdate":"2018-04-15","airtime":"22:00","airstamp":"2018-04-16T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/152/381119.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/152/381119.jpg"},"summary":"

Attempting to woo a gaming company to PiperNet, Richard inadvertently angers a prized ally; Dinesh deals with a betrayal; in preparation for a big launch, Gavin leaves his underlings with a cryptic message; Jared gets inside information from Big Head.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1436933"}}},{"id":1436934,"url":"http://www.tvmaze.com/episodes/1436934/silicon-valley-5x05-facial-recognition","name":"Facial Recognition","season":5,"number":5,"type":"regular","airdate":"2018-04-22","airtime":"22:15","airstamp":"2018-04-23T02:15:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/152/381117.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/152/381117.jpg"},"summary":"

Overshadowed by Jared in an on-camera interview, Richard's confidence wavers further when Laurie and Monica force him to work with Eklow, a new artificial-intelligence company. Gilfoyle worries about the prospect of introducing AI into Pied Piper. Dinesh makes Jared self-conscious ahead of a second interview. Gavin questions his future beyond Hooli.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1436934"}}},{"id":1436935,"url":"http://www.tvmaze.com/episodes/1436935/silicon-valley-5x06-artificial-emotional-intelligence","name":"Artificial Emotional Intelligence","season":5,"number":6,"type":"regular","airdate":"2018-04-29","airtime":"22:00","airstamp":"2018-04-30T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/155/389559.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/155/389559.jpg"},"summary":"

Richard decides to help when Laurie gets in a jam, but his lack of emotional discipline threatens to backfire on Pied Piper. Gavin tries to make a deal with a stubborn partner while abroad. Dinesh relishes a rare win. Jared bonds with a surprising figure.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1436935"}}},{"id":1445226,"url":"http://www.tvmaze.com/episodes/1445226/silicon-valley-5x07-initial-coin-offering","name":"Initial Coin Offering","season":5,"number":7,"type":"regular","airdate":"2018-05-06","airtime":"22:00","airstamp":"2018-05-07T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/155/387703.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/155/387703.jpg"},"summary":"

As the Pied Piper guys prepare to close on their Series B funding, Richard receives some unsettling news. Gilfoyle suggests a risky proposition and Monica gets blunt with Richard. Dinesh goes to great lengths to compete with a coworker, while Jared keeps close watch on Richard's new assistant. In search of a better deal, Gavin tries to charm small-town America.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1445226"}}},{"id":1445227,"url":"http://www.tvmaze.com/episodes/1445227/silicon-valley-5x08-fifty-one-percent","name":"Fifty-One Percent","season":5,"number":8,"type":"regular","airdate":"2018-05-13","airtime":"22:15","airstamp":"2018-05-14T02:15:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/155/389554.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/155/389554.jpg"},"summary":"

The launch of PiperNet finds Monica suspicious of an early success, and the team must race against the clock as their future is threatened. Realizing he's made more enemies than friends, Richard makes a surprising move.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1445227"}}},{"id":1698000,"url":"http://www.tvmaze.com/episodes/1698000/silicon-valley-6x01-artificial-lack-of-intelligence","name":"Artificial Lack of Intelligence","season":6,"number":1,"type":"regular","airdate":"2019-10-27","airtime":"22:00","airstamp":"2019-10-28T02:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/217/543706.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/217/543706.jpg"},"summary":"

Richard discovers his promise to keep Pied Piper free from collecting user data is under threat. Jared finds himself missing his role as Richard's go-to guy and revisits the hacker hostel. Gilfoyle devises a creative way to deal with Dinesh's complaining.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1698000"}}},{"id":1730750,"url":"http://www.tvmaze.com/episodes/1730750/silicon-valley-6x02-blood-money","name":"Blood Money","season":6,"number":2,"type":"regular","airdate":"2019-11-03","airtime":"22:00","airstamp":"2019-11-04T03:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/218/545800.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/218/545800.jpg"},"summary":"

When Richard meets an interested investor with a sketchy past, he must weigh his integrity against a tempting financial offer. Jared seeks a new future, putting him at odds with Richard. A defiant Gilfoyle butts heads with HR over his lack of direct reports. Hoover and Denpok worry when Gavin explores alternative solutions to keep a leaner Hooli afloat.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1730750"}}},{"id":1730751,"url":"http://www.tvmaze.com/episodes/1730751/silicon-valley-6x03-hooli-smokes","name":"Hooli Smokes!","season":6,"number":3,"type":"regular","airdate":"2019-11-10","airtime":"22:00","airstamp":"2019-11-11T03:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/218/546788.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/218/546788.jpg"},"summary":"

The Pied Piper team races to close a major deal; Dinesh considers being a better person; an angry Jared reluctantly helps Richard.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1730751"}}},{"id":1732276,"url":"http://www.tvmaze.com/episodes/1732276/silicon-valley-6x04-maximizing-alphaness","name":"Maximizing Alphaness","season":6,"number":4,"type":"regular","airdate":"2019-11-17","airtime":"22:00","airstamp":"2019-11-18T03:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/221/554758.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/221/554758.jpg"},"summary":"

Richard's authority is threatened by his former Hooli manager; Monica tries to prove her support of other women; Gavin sets his sights on the literary world.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1732276"}}},{"id":1757882,"url":"http://www.tvmaze.com/episodes/1757882/silicon-valley-6x05-tethics","name":"Tethics","season":6,"number":5,"type":"regular","airdate":"2019-11-24","airtime":"22:00","airstamp":"2019-11-25T03:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/227/568829.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/227/568829.jpg"},"summary":"

Richard fumes at Gavin's new ethical stance; Dinesh's trip turns into a nightmare; Gilfoyle and Monica work on their peer review scores.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1757882"}}},{"id":1757883,"url":"http://www.tvmaze.com/episodes/1757883/silicon-valley-6x06-russfest","name":"RussFest","season":6,"number":6,"type":"regular","airdate":"2019-12-01","airtime":"22:00","airstamp":"2019-12-02T03:00:00+00:00","runtime":30,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/227/568817.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/227/568817.jpg"},"summary":"

A major event puts PiperNet's capabilities to the test; Monica confronts Jian-Yang over his use of Pied Piper's name.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1757883"}}},{"id":1757884,"url":"http://www.tvmaze.com/episodes/1757884/silicon-valley-6x07-exit-event","name":"Exit Event","season":6,"number":7,"type":"regular","airdate":"2019-12-08","airtime":"22:05","airstamp":"2019-12-09T03:05:00+00:00","runtime":48,"image":{"medium":"http://static.tvmaze.com/uploads/images/medium_landscape/230/576385.jpg","original":"http://static.tvmaze.com/uploads/images/original_untouched/230/576385.jpg"},"summary":"

Richard and the Pied Piper team look to pull off a spectacular feat on the day of a big launch.

","_links":{"self":{"href":"http://api.tvmaze.com/episodes/1757884"}}}]}} 2 | -------------------------------------------------------------------------------- /conf/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.master spark://spark-master:7077 2 | spark.eventLog.enabled true 3 | spark.eventLog.dir /opt/spark/spark-events 4 | spark.history.fs.logDirectory /opt/spark/spark-events 5 | -------------------------------------------------------------------------------- /docker-compose.yarn.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | spark-yarn-master: 5 | container_name: da-spark-yarn-master 6 | build: 7 | dockerfile: Dockerfile-yarn 8 | context: . 9 | image: da-spark-yarn-image 10 | entrypoint: ['./entrypoint.sh', 'master'] 11 | volumes: 12 | - ./book_data:/opt/spark/data 13 | - ./spark_apps:/opt/spark/apps 14 | env_file: 15 | - .env.spark 16 | ports: 17 | - '9090:8080' 18 | - '9870:9870' 19 | - '7077:7077' 20 | - '8088:8088' 21 | 22 | 23 | spark-yarn-worker: 24 | # container_name: da-spark-worker 25 | image: da-spark-yarn-image 26 | entrypoint: ['./entrypoint.sh', 'worker'] 27 | depends_on: 28 | - spark-yarn-master 29 | env_file: 30 | - .env.spark 31 | volumes: 32 | - ./book_data:/opt/spark/data 33 | - ./spark_apps:/opt/spark/apps 34 | 35 | yarn-history-server: 36 | container_name: da-spark-yarn-history 37 | image: da-spark-yarn-image 38 | entrypoint: ['./entrypoint.sh', 'history'] 39 | depends_on: 40 | - spark-yarn-master 41 | env_file: 42 | - .env.spark 43 | ports: 44 | - '18080:18080' 45 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | spark-master: 5 | container_name: da-spark-master 6 | build: . 
7 | image: da-spark-image 8 | entrypoint: ['./entrypoint.sh', 'master'] 9 | healthcheck: 10 | test: [ "CMD", "curl", "-f", "http://localhost:8080" ] 11 | interval: 5s 12 | timeout: 3s 13 | retries: 3 14 | volumes: 15 | - ./book_data:/opt/spark/data 16 | - ./spark_apps:/opt/spark/apps 17 | - spark-logs:/opt/spark/spark-events 18 | env_file: 19 | - .env.spark 20 | ports: 21 | - '9090:8080' 22 | - '7077:7077' 23 | 24 | 25 | spark-history-server: 26 | container_name: da-spark-history 27 | image: da-spark-image 28 | entrypoint: ['./entrypoint.sh', 'history'] 29 | depends_on: 30 | - spark-master 31 | env_file: 32 | - .env.spark 33 | volumes: 34 | - spark-logs:/opt/spark/spark-events 35 | ports: 36 | - '18080:18080' 37 | 38 | spark-worker: 39 | # container_name: da-spark-worker 40 | image: da-spark-image 41 | entrypoint: ['./entrypoint.sh', 'worker'] 42 | depends_on: 43 | - spark-master 44 | env_file: 45 | - .env.spark 46 | volumes: 47 | - ./book_data:/opt/spark/data 48 | - ./spark_apps:/opt/spark/apps 49 | - spark-logs:/opt/spark/spark-events 50 | 51 | volumes: 52 | spark-logs: 53 | -------------------------------------------------------------------------------- /entrypoint-yarn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPARK_WORKLOAD=$1 4 | 5 | echo "SPARK_WORKLOAD: $SPARK_WORKLOAD" 6 | 7 | /etc/init.d/ssh start 8 | 9 | if [ "$SPARK_WORKLOAD" == "master" ]; 10 | then 11 | hdfs namenode -format 12 | 13 | # start the master node processes 14 | hdfs --daemon start namenode 15 | hdfs --daemon start secondarynamenode 16 | yarn --daemon start resourcemanager 17 | 18 | # create required directories 19 | while ! hdfs dfs -mkdir -p /spark-logs; 20 | do 21 | echo "Failed creating /spark-logs hdfs dir" 22 | done 23 | echo "Created /spark-logs hdfs dir" 24 | hdfs dfs -mkdir -p /opt/spark/data 25 | echo "Created /opt/spark/data hdfs dir" 26 | 27 | 28 | # copy the data to the data HDFS directory 29 | hdfs dfs -copyFromLocal /opt/spark/data/* /opt/spark/data 30 | hdfs dfs -ls /opt/spark/data 31 | 32 | elif [ "$SPARK_WORKLOAD" == "worker" ]; 33 | then 34 | # workers run only a datanode and a nodemanager; formatting the namenode belongs on the master alone, so it is omitted here 35 | 36 | # start the worker node processes 37 | hdfs --daemon start datanode 38 | yarn --daemon start nodemanager 39 | elif [ "$SPARK_WORKLOAD" == "history" ]; 40 | then 41 | 42 | while ! hdfs dfs -test -d /spark-logs; 43 | do 44 | echo "spark-logs doesn't exist yet... 
retrying" 45 | sleep 1; 46 | done 47 | echo "Exit loop" 48 | 49 | # start the spark history server 50 | start-history-server.sh 51 | fi 52 | 53 | tail -f /dev/null 54 | -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPARK_WORKLOAD=$1 4 | 5 | echo "SPARK_WORKLOAD: $SPARK_WORKLOAD" 6 | 7 | if [ "$SPARK_WORKLOAD" == "master" ]; 8 | then 9 | start-master.sh -p 7077 10 | elif [ "$SPARK_WORKLOAD" == "worker" ]; 11 | then 12 | start-worker.sh spark://spark-master:7077 13 | elif [ "$SPARK_WORKLOAD" == "history" ] 14 | then 15 | start-history-server.sh 16 | fi 17 | -------------------------------------------------------------------------------- /requirements/requirements.in: -------------------------------------------------------------------------------- 1 | ipython 2 | pandas 3 | pyarrow 4 | numpy 5 | pyspark 6 | -------------------------------------------------------------------------------- /requirements/requirements.txt: -------------------------------------------------------------------------------- 1 | # SHA1:26a1da68e1a978adfc818c1a75e63642e86609a6 2 | # 3 | # This file is autogenerated by pip-compile-multi 4 | # To update, run: 5 | # 6 | # pip-compile-multi 7 | # 8 | appnope==0.1.3 9 | # via ipython 10 | asttokens==2.2.1 11 | # via stack-data 12 | backcall==0.2.0 13 | # via ipython 14 | decorator==5.1.1 15 | # via ipython 16 | executing==1.2.0 17 | # via stack-data 18 | ipython==8.7.0 19 | # via -r requirements/requirements.in 20 | jedi==0.18.2 21 | # via ipython 22 | matplotlib-inline==0.1.6 23 | # via ipython 24 | numpy==1.24.0 25 | # via 26 | # -r requirements/requirements.in 27 | # pandas 28 | # pyarrow 29 | pandas==1.5.2 30 | # via -r requirements/requirements.in 31 | parso==0.8.3 32 | # via jedi 33 | pexpect==4.8.0 34 | # via ipython 35 | pickleshare==0.7.5 36 | # via ipython 37 | prompt-toolkit==3.0.36 38 | # via ipython 39 | ptyprocess==0.7.0 40 | # via pexpect 41 | pure-eval==0.2.2 42 | # via stack-data 43 | py4j==0.10.9.5 44 | # via pyspark 45 | pyarrow==10.0.1 46 | # via -r requirements/requirements.in 47 | pygments==2.13.0 48 | # via ipython 49 | pyspark==3.3.1 50 | # via -r requirements/requirements.in 51 | python-dateutil==2.8.2 52 | # via pandas 53 | pytz==2022.7 54 | # via pandas 55 | six==1.16.0 56 | # via python-dateutil 57 | stack-data==0.6.2 58 | # via ipython 59 | traitlets==5.8.0 60 | # via 61 | # ipython 62 | # matplotlib-inline 63 | wcwidth==0.2.5 64 | # via prompt-toolkit 65 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter02/ex2.py: -------------------------------------------------------------------------------- 1 | # ex 2.3 2 | 3 | # Rewrite: 4 | # exo2_3_df = ( 5 | # spark.read.text("./data/gutenberg_books/1342-0.txt") 6 | # .select(length(col("value"))) 7 | # .withColumnRenamed("length(value)", "number_of_char") 8 | # ) 9 | # 10 | # Solution: 11 | # exo2_3_df = ( 12 | # spark.read.text("./data/gutenberg_books/1342-0.txt") 13 | # .select(length(col("value")).alias("number_of_char")) 14 | # ) 15 | 16 | # ex 2.5 17 | # a) 18 | # words_without_is = words_nonull.where(col("word") != "is") 19 | # b) 20 | # words_more_than_3_char = words_nonull.where(length(col("word")) > 3) 21 | 22 | # ex 2.6 23 | # words_no_is_not_the_if = ( 24 | # words_nonull.where(~col("word").isin( 25 | # ["no", "is", "the", "if"]))) 26 | 
-------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter02/word_non_null.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.functions import col, explode, lower, regexp_extract, split 3 | 4 | spark = SparkSession.builder.appName( 5 | "Ch02 - Analyzing the vocabulary of Pride and Prejudice." 6 | ).getOrCreate() 7 | 8 | book = spark.read.text("/opt/spark/data/pride-and-prejudice.txt") 9 | 10 | lines = book.select(split(col("value"), " ").alias("line")) 11 | 12 | words = lines.select(explode(col("line")).alias("word")) 13 | 14 | words_lower = words.select(lower(col("word")).alias("word_lower")) 15 | words_clean = words_lower.select( 16 | regexp_extract(col("word_lower"), "[a-z]*", 0).alias("word") 17 | ) 18 | words_nonull = words_clean.where(col("word") != "") 19 | 20 | results = words_nonull.groupby(col("word")).count() 21 | 22 | results.orderBy(col("count").desc()).show(10) 23 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter03/ex3_3.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import IntegerType 4 | 5 | spark = SparkSession.builder.appName( 6 | "ex3_3" 7 | ).getOrCreate() 8 | 9 | 10 | def get_distinct_words(filename): 11 | book = spark.read.text(filename) 12 | 13 | return ( 14 | book.select(F.split(F.col("value"), " ").alias("line")) 15 | .select(F.explode(F.col("line")).alias("word")) 16 | .select(F.lower(F.col("word")).alias("word_lower")) 17 | .select(F.regexp_extract(F.col("word_lower"), "[a-z]*", 0).alias("word")) 18 | .where(F.col("word") != "") 19 | .distinct() 20 | .count() 21 | ) 22 | 23 | 24 | result = get_distinct_words("/opt/spark/data/gutenberg_books/*.txt") 25 | 26 | result = spark.createDataFrame([result], IntegerType()).toDF("distinct words") 27 | result.show() 28 | 29 | result.coalesce(1).write.mode("overwrite").csv( 30 | "/opt/spark/data/results/chapter03/simple_count.csv" 31 | ) 32 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter03/ex3_4.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession.builder.appName( 5 | "ex3_4" 6 | ).getOrCreate() 7 | 8 | 9 | def get_distinct_words(filename): 10 | book = spark.read.text(filename) 11 | 12 | return ( 13 | book.select(F.split(F.col("value"), " ").alias("line")) 14 | .select(F.explode(F.col("line")).alias("word")) 15 | .select(F.lower(F.col("word")).alias("word_lower")) 16 | .select(F.regexp_extract(F.col("word_lower"), "[a-z]*", 0).alias("word")) 17 | .where(F.col("word") != "") 18 | .groupby(F.col("word")) 19 | .count() 20 | .where(F.col("count") == 1) 21 | .limit(5) 22 | ) 23 | 24 | 25 | result = get_distinct_words("/opt/spark/data/pride-and-prejudice.txt") 26 | 27 | result.show() 28 | 29 | result.coalesce(1).write.mode("overwrite").csv( 30 | "/opt/spark/data/results/chapter03/simple_count.csv" 31 | ) 32 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter03/ex3_5.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 
2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession.builder.appName( 5 | "Ex3_5" 6 | ).getOrCreate() 7 | 8 | 9 | def get_distinct_words(filename): 10 | book = spark.read.text(filename) 11 | 12 | results = ( 13 | book.select(F.split(F.col("value"), " ").alias("line")) 14 | .select(F.explode(F.col("line")).alias("word")) 15 | .select(F.lower(F.substring(F.col("word"), 1, 1)).alias("letter")) 16 | .where(F.col("letter").rlike("[a-z]")) 17 | .groupby(F.col("letter")) 18 | .count() 19 | ) 20 | 21 | results = results.orderBy(F.col("count").desc()).limit(5) 22 | 23 | return results 24 | 25 | 26 | result = get_distinct_words("/opt/spark/data/pride-and-prejudice.txt") 27 | 28 | result.show() 29 | 30 | result.coalesce(1).write.mode("overwrite").csv( 31 | "/opt/spark/data/results/chapter03/simple_count.csv" 32 | ) 33 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter03/ex3_5_2.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession.builder.appName( 5 | "Ex3_5_2" 6 | ).getOrCreate() 7 | 8 | 9 | def get_distinct_words(filename): 10 | book = spark.read.text(filename) 11 | 12 | results = ( 13 | book.select(F.split(F.col("value"), " ").alias("line")) 14 | .select(F.explode(F.col("line")).alias("word")) 15 | .select(F.lower(F.substring(F.col("word"), 1, 1)).alias("letter")) 16 | .where(F.col("letter").rlike("[a-z]")) 17 | .groupby(F.col("letter").isin(["a", "e", "i", "o", "u"]).alias("is_vowel")) 18 | .count() 19 | ) 20 | 21 | return results 22 | 23 | 24 | result = get_distinct_words("/opt/spark/data/pride-and-prejudice.txt") 25 | 26 | result.show() 27 | 28 | result.coalesce(1).write.mode("overwrite").csv( 29 | "/opt/spark/data/results/chapter03/simple_count.csv" 30 | ) 31 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter03/word_non_null.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession.builder.appName( 5 | "Ch03 - Analyzing the vocabulary of Pride and Prejudice." 6 | ).getOrCreate() 7 | 8 | book = spark.read.text("/opt/spark/data/pride-and-prejudice.txt") 9 | 10 | lines = book.select(F.split(F.col("value"), " ").alias("line")) 11 | 12 | words = lines.select(F.explode(F.col("line")).alias("word")) 13 | 14 | words_lower = words.select(F.lower(F.col("word")).alias("word_lower")) 15 | words_clean = words_lower.select( 16 | F.regexp_extract(F.col("word_lower"), "[a-z]*", 0).alias("word") 17 | ) 18 | words_nonull = words_clean.where(F.col("word") != "") 19 | 20 | results = words_nonull.groupby(F.col("word")).count() 21 | 22 | results.orderBy(F.col("count").desc()).show(10) 23 | 24 | results.coalesce(1).write.mode("overwrite").csv("/opt/spark/data/results/chapter03/simple_count.csv")  # overwrite so reruns don't fail on an existing path 25 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter03/word_non_null_short.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession.builder.appName( 5 | "Ch03 - Analyzing the vocabulary of Pride and Prejudice. 
- short" 6 | ).getOrCreate() 7 | 8 | book = spark.read.text("/opt/spark/data/pride-and-prejudice.txt") 9 | 10 | results = ( 11 | book.select(F.split(F.col("value"), " ").alias("line")) 12 | .select(F.explode(F.col("line")).alias("word")) 13 | .select(F.lower(F.col("word")).alias("word_lower")) 14 | .select(F.regexp_extract(F.col("word_lower"), "[a-z]*", 0).alias("word")) 15 | .where(F.col("word") != "") 16 | .groupby(F.col("word")) 17 | .count() 18 | ) 19 | 20 | results.orderBy(F.col("count").desc()).show(10) 21 | 22 | results.coalesce(1).write.csv("/opt/spark/data/results/chapter03/simple_count.csv") 23 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter03/word_non_null_short_multiple_files.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession.builder.appName( 5 | "Ch03 - Analyzing the vocabulary of Pride and Prejudice. - short, multiple files" 6 | ).getOrCreate() 7 | 8 | book = spark.read.text("/opt/spark/data/gutenberg_books/*.txt") 9 | 10 | results = ( 11 | book.select(F.split(F.col("value"), " ").alias("line")) 12 | .select(F.explode(F.col("line")).alias("word")) 13 | .select(F.lower(F.col("word")).alias("word_lower")) 14 | .select(F.regexp_extract(F.col("word_lower"), "[a-z]*", 0).alias("word")) 15 | .where(F.col("word") != "") 16 | .groupby(F.col("word")) 17 | .count() 18 | ) 19 | 20 | results.orderBy(F.col("count").desc()).show(10) 21 | 22 | results.coalesce(1).write.mode("overwrite").csv( 23 | "/opt/spark/data/results/chapter03/simple_count.csv" 24 | ) 25 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter04/broadcast_logs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | spark = SparkSession.builder.appName("Ch04 - Broadcast logs").getOrCreate() 6 | 7 | DIRECTORY = "/opt/spark/data/broadcast_logs" 8 | 9 | logs = spark.read.csv( 10 | os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.CSV"), 11 | sep="|", 12 | header=True, 13 | inferSchema=True, 14 | timestampFormat="yyyy-MM-dd", 15 | ) 16 | 17 | logs = logs.select("BroadcastLogID", "LogServiceID", "LogDate") 18 | 19 | logs.show(10, False) 20 | 21 | logs.printSchema() 22 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter04/broadcast_logs_new_column.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pyspark.sql.functions as F 4 | from pyspark.sql import SparkSession 5 | 6 | spark = SparkSession.builder.appName("Ch04 - Broadcast logs with new columns").getOrCreate() 7 | 8 | DIRECTORY = "/opt/spark/data/broadcast_logs" 9 | 10 | logs = spark.read.csv( 11 | os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.CSV"), 12 | sep="|", 13 | header=True, 14 | inferSchema=True, 15 | timestampFormat="yyyy-MM-dd", 16 | ) 17 | 18 | logs = logs.select(F.col("Duration")) 19 | 20 | logs.select( 21 | F.col("Duration"), 22 | F.col("Duration").substr(1, 2).cast("int").alias("dur_hours"), 23 | F.col("Duration").substr(4, 2).cast("int").alias("dur_minutes"), 24 | F.col("Duration").substr(7, 2).cast("int").alias("dur_seconds"), 25 | ).distinct().show(5) 26 | 27 | logs = logs.withColumn( 28 | "Duration_seconds", 29 | ( 30 | 
F.col("Duration").substr(1, 2).cast("int") * 60 * 60 31 | + F.col("Duration").substr(4, 2).cast("int") * 60 32 | + F.col("Duration").substr(7, 2).cast("int") 33 | ), 34 | ) 35 | 36 | logs.show(10, False) 37 | 38 | print(logs.dtypes) 39 | 40 | logs.printSchema() 41 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter04/broadcast_logs_stats.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | spark = SparkSession.builder.appName("Ch04 - Broadcast logs stats").getOrCreate() 6 | 7 | DIRECTORY = "/opt/spark/data/broadcast_logs" 8 | 9 | logs = spark.read.csv( 10 | os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.CSV"), 11 | sep="|", 12 | header=True, 13 | inferSchema=True, 14 | timestampFormat="yyyy-MM-dd", 15 | ) 16 | 17 | logs.show(10, False) 18 | 19 | logs.printSchema() 20 | 21 | for col in logs.columns: 22 | logs.describe(col).show() 23 | 24 | for col in logs.columns: 25 | logs.select(col).summary().show() 26 | 27 | for col in logs.columns: 28 | logs.select(col).summary("min", "10%", "90%", "max").show() 29 | 30 | 31 | # WARNING describe() and summary() are two very useful methods, 32 | # but they are not meant to be used for anything other than quickly peeking at data during development. 33 | # The PySpark developers don’t guarantee that the output will look the same from version to version, 34 | # so if you need one of the outputs for your program, 35 | # use the corresponding function in pyspark.sql.functions. 36 | # They’re all there. 37 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter04/broadcast_logs_tidy.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pyspark.sql.functions as F 4 | from pyspark.sql import SparkSession 5 | 6 | spark = SparkSession.builder.appName("Ch04 - Broadcast logs tidy").getOrCreate() 7 | 8 | DIRECTORY = "/opt/spark/data/broadcast_logs" 9 | 10 | logs = spark.read.csv( 11 | os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.CSV"), 12 | sep="|", 13 | header=True, 14 | inferSchema=True, 15 | timestampFormat="yyyy-MM-dd", 16 | ) 17 | 18 | logs = logs.withColumn( 19 | "Duration_seconds", 20 | ( 21 | F.col("Duration").substr(1, 2).cast("int") * 60 * 60 22 | + F.col("Duration").substr(4, 2).cast("int") * 60 23 | + F.col("Duration").substr(7, 2).cast("int") 24 | ), 25 | ) 26 | 27 | logs.toDF(*[x.lower() for x in logs.columns]).printSchema() 28 | logs.select(sorted(logs.columns)).printSchema() 29 | 30 | logs.show(10, False) 31 | 32 | print(logs.dtypes) 33 | 34 | logs.printSchema() 35 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter04/broadcast_logs_unpacking.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | from pyspark.sql import SparkSession 5 | 6 | spark = SparkSession.builder.appName("Ch04 - Broadcast logs unpacking").getOrCreate() 7 | 8 | DIRECTORY = "/opt/spark/data/broadcast_logs" 9 | 10 | logs = spark.read.csv( 11 | os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.CSV"), 12 | sep="|", 13 | header=True, 14 | inferSchema=True, 15 | timestampFormat="yyyy-MM-dd", 16 | ) 17 | 18 | column_split = np.array_split(np.array(logs.columns), len(logs.columns) // 3) 19 | 20 | print(column_split) 21 | 22 | 
for x in column_split: 23 | logs.select(*x).show(5, False) 24 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter04/ex4_1.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | spark = SparkSession.builder.appName("ex4_1").getOrCreate() 6 | 7 | DIRECTORY = "/opt/spark/data/" 8 | 9 | items = spark.read.csv( 10 | os.path.join(DIRECTORY, "sample.csv"), 11 | sep=",", 12 | header=True, 13 | inferSchema=True, 14 | quote="$", 15 | ) 16 | 17 | items.show() 18 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter04/ex4_3.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | spark = SparkSession.builder.appName("ex4_3").getOrCreate() 6 | 7 | DIRECTORY = "/opt/spark/data/broadcast_logs" 8 | 9 | logs = spark.read.csv( 10 | os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.CSV"), 11 | sep="|", 12 | header=True, 13 | inferSchema=True, 14 | timestampFormat="yyyy-MM-dd", 15 | ) 16 | 17 | logs_raw = spark.read.csv( 18 | os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.CSV"), header=True 19 | ) 20 | 21 | logs.printSchema() 22 | logs_raw.printSchema() 23 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter04/ex4_4.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | spark = SparkSession.builder.appName("ex4_4").getOrCreate() 6 | 7 | DIRECTORY = "/opt/spark/data/broadcast_logs" 8 | 9 | logs = spark.read.csv( 10 | os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.CSV"), 11 | sep="|", 12 | header=True, 13 | inferSchema=True, 14 | timestampFormat="yyyy-MM-dd", 15 | ) 16 | 17 | logs_clean = logs.select(*[col for col in logs.columns if not col.endswith("ID")]) 18 | 19 | logs.printSchema() 20 | 21 | logs_clean.printSchema() 22 | 23 | logs_clean.show(10) 24 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter04/tabular_data.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.appName("Ch04 - Tabular data example").getOrCreate() 4 | 5 | my_grocery_list = [ 6 | ["Banana", 2, 1.74], 7 | ["Apple", 4, 2.04], 8 | ["Carrot", 1, 1.09], 9 | ["Cake", 1, 10.99], 10 | ] 11 | 12 | df_grocery_list = spark.createDataFrame(my_grocery_list, ["Item", "Quantity", "Price"]) 13 | 14 | df_grocery_list.printSchema() 15 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter05/broadcast_logs.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pyspark.sql.functions as F 4 | from pyspark.sql import SparkSession 5 | 6 | spark = SparkSession.builder.appName("Ch05 - Broadcast logs script ch05").getOrCreate() 7 | 8 | DIRECTORY = "/opt/spark/data/broadcast_logs" 9 | 10 | logs = spark.read.csv( 11 | os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.CSV"), 12 | sep="|", 13 | header=True, 14 | inferSchema=True, 15 | timestampFormat="yyyy-MM-dd", 16 | ) 17 | 18 | logs = logs.withColumn( 19 | "duration_seconds", 20 | ( 21 | 
F.col("Duration").substr(1, 2).cast("int") * 60 * 60 22 | + F.col("Duration").substr(4, 2).cast("int") * 60 23 | + F.col("Duration").substr(7, 2).cast("int") 24 | ), 25 | ) 26 | 27 | log_identifier = spark.read.csv( 28 | os.path.join(DIRECTORY, "ReferenceTables/LogIdentifier.csv"), 29 | sep="|", 30 | header=True, 31 | inferSchema=True, 32 | ) 33 | 34 | cd_category = spark.read.csv( 35 | os.path.join(DIRECTORY, "ReferenceTables/CD_Category.csv"), 36 | sep="|", 37 | header=True, 38 | inferSchema=True, 39 | ).select( 40 | "CategoryID", 41 | "CategoryCD", 42 | F.col("EnglishDescription").alias("Category_Description"), 43 | ) 44 | 45 | cd_program_class = spark.read.csv( 46 | os.path.join(DIRECTORY, "ReferenceTables/CD_ProgramClass.csv"), 47 | sep="|", 48 | header=True, 49 | inferSchema=True, 50 | ).select( 51 | "ProgramClassID", 52 | "ProgramClassCD", 53 | F.col("EnglishDescription").alias("ProgramClass_Description"), 54 | ) 55 | 56 | # log_identifier.printSchema() 57 | 58 | log_identifier = log_identifier.where(F.col("PrimaryFG") == 1) 59 | 60 | # print(log_identifier.count()) 61 | # log_identifier.show(5) 62 | 63 | joined_logs = logs.join( 64 | log_identifier, 65 | "LogServiceID", # more complex logs["LogServiceID"] == log_identifier["LogServiceID"] 66 | how="inner", 67 | ) 68 | 69 | full_log = joined_logs.join(cd_category, "CategoryID", how="left").join( 70 | cd_program_class, "ProgramClassID", how="left" 71 | ) 72 | 73 | # joined_logs.printSchema() 74 | # joined_logs.show(5) 75 | 76 | full_log.groupby("ProgramClassCD", "ProgramClass_Description").agg( 77 | F.sum("duration_seconds").alias("duration_total") 78 | ).orderBy("duration_total", ascending=False).show(100, False) 79 | 80 | # F.when( 81 | # F.trim(F.col("ProgramClassCD")).isin( 82 | # ["COM", "PRC", "PGI", "PRO", "PSA", "MAG", "LOC", "SPO", "MER", "SOL"] 83 | # ), 84 | # F.col("duration_seconds") # take this value if the ProgramClassCD value is in list 85 | # ).otherwise(0) 86 | 87 | commercial_programs = [ 88 | "COM", 89 | "PRC", 90 | "PGI", 91 | "PRO", 92 | "PSA", 93 | "MAG", 94 | "LOC", 95 | "SPO", 96 | "MER", 97 | "SOL", 98 | ] 99 | 100 | answer = ( 101 | full_log.groupby("ProgramClassCD", "ProgramClass_Description") 102 | .agg( 103 | F.sum( 104 | F.when( 105 | F.trim(F.col("ProgramClassCD")).isin(commercial_programs), 106 | F.col( 107 | "duration_seconds" 108 | ), # take this value if the ProgramClassCD value is in list 109 | ).otherwise(0) 110 | ).alias("duration_commercial"), 111 | F.sum("duration_seconds").alias("duration_total"), 112 | ) 113 | .withColumn( 114 | "commercial_ratio", F.col("duration_commercial") / F.col("duration_total") 115 | ) 116 | ) 117 | 118 | answer.orderBy("commercial_ratio", ascending=False).show(1000, False) 119 | 120 | # Drop null values 121 | answer_no_null = answer.dropna(subset=["commercial_ratio"]) 122 | 123 | answer_no_null.orderBy("commercial_ratio", ascending=False).show(1000, False) 124 | 125 | # Fill null values 126 | answer_filled = answer.fillna(0) 127 | 128 | answer_filled.orderBy("commercial_ratio", ascending=False).show(1000, False) 129 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter05/broadcast_logs_naming.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pyspark.sql.functions as F 4 | from pyspark.sql import SparkSession 5 | 6 | spark = SparkSession.builder.appName("Ch05 - Broadcast logs script naming ch05").getOrCreate() 7 | 8 | DIRECTORY = 
"/opt/spark/data/broadcast_logs" 9 | 10 | logs = spark.read.csv( 11 | os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.CSV"), 12 | sep="|", 13 | header=True, 14 | inferSchema=True, 15 | timestampFormat="yyyy-MM-dd", 16 | ) 17 | 18 | log_identifier = spark.read.csv( 19 | os.path.join(DIRECTORY, "ReferenceTables/LogIdentifier.csv"), 20 | sep="|", 21 | header=True, 22 | inferSchema=True, 23 | ) 24 | 25 | log_identifier.printSchema() 26 | 27 | log_identifier = log_identifier.where(F.col("PrimaryFG") == 1) 28 | print(log_identifier.count()) 29 | 30 | log_identifier.show(5) 31 | 32 | # 1st approach to handling column ambiguity - use the short syntax for join type, e.g. on="LogServiceID" 33 | 34 | # 2nd drop one of the columns 35 | joined_logs = logs.join( 36 | log_identifier, 37 | on=[logs["LogServiceID"] == log_identifier["LogServiceID"]], 38 | ) 39 | 40 | joined_logs = joined_logs.drop(log_identifier["LogServiceID"]) 41 | 42 | # 3rd alias the table 43 | joined_logs = logs.join( 44 | log_identifier.alias("right"), 45 | logs["LogServiceID"] == log_identifier["LogServiceID"], 46 | ) 47 | 48 | joined_logs = joined_logs.drop(F.col("right.LogServiceID")).select("LogServiceID") 49 | 50 | joined_logs.printSchema() 51 | joined_logs.show(5) 52 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter05/ex5.py: -------------------------------------------------------------------------------- 1 | # 5.1. Duplicate all of the rows that do not satisfy join on right table 2 | # 5.2. Inner 3 | # 5.3. Left 4 | # 5.4 5 | # write alternative source code without using left_anti: 6 | # left.join(right, how="left_anti", 7 | # on="my_column").select("my_column").distinct() 8 | 9 | import pyspark.sql.functions as F 10 | from pyspark.sql import SparkSession 11 | 12 | spark = SparkSession.builder.appName("ex5_4").getOrCreate() 13 | 14 | left = [ 15 | ["Banana", 2, 1.74], 16 | ["Apple", 4, 2.04], 17 | ["Carrot", 1, 1.09], 18 | ["Cake", 1, 10.99], 19 | ] 20 | 21 | right = [["Banana"], ["Carrot"]] 22 | 23 | left = spark.createDataFrame(left, ["Item", "Quantity", "Price"]) 24 | right = spark.createDataFrame(right, ["Item"]) 25 | 26 | left.join(right, how="left_anti", on="Item").select("Item").distinct().show() 27 | 28 | left.alias("left").join( 29 | right.alias("right"), how="left", on=[right["Item"] == left["Item"]] 30 | ).where(F.col("right.Item").isNull()).select(F.col("left.Item")).distinct().show() 31 | 32 | # official solution 33 | left.join(right, how="left", on=left["Item"] == right["Item"]).where( 34 | right["Item"].isnull() 35 | ).select(left["Item"]) 36 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter05/ex5_5.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pyspark.sql.functions as F 4 | from pyspark.sql import SparkSession 5 | 6 | spark = SparkSession.builder.appName("Broadcast logs script ch05 (ex5_5)").getOrCreate() 7 | 8 | spark.sparkContext.setLogLevel("WARN") 9 | 10 | DIRECTORY = "/opt/spark/data/broadcast_logs" 11 | 12 | logs = spark.read.csv( 13 | os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.CSV"), 14 | sep="|", 15 | header=True, 16 | inferSchema=True, 17 | timestampFormat="yyyy-MM-dd", 18 | ) 19 | 20 | logs = logs.drop("BroadcastLogID", "SequenceNO") 21 | 22 | logs = logs.withColumn( 23 | "duration_seconds", 24 | ( 25 | F.col("Duration").substr(1, 2).cast("int") * 60 * 60 26 | + 
F.col("Duration").substr(4, 2).cast("int") * 60 27 | + F.col("Duration").substr(7, 2).cast("int") 28 | ), 29 | ) 30 | 31 | log_identifier = spark.read.csv( 32 | os.path.join(DIRECTORY, "ReferenceTables/LogIdentifier.csv"), 33 | sep="|", 34 | header=True, 35 | inferSchema=True, 36 | ) 37 | 38 | cd_category = spark.read.csv( 39 | os.path.join(DIRECTORY, "ReferenceTables/CD_Category.csv"), 40 | sep="|", 41 | header=True, 42 | inferSchema=True, 43 | ).select( 44 | "CategoryID", 45 | "CategoryCD", 46 | F.col("EnglishDescription").alias("Category_Description"), 47 | ) 48 | 49 | cd_program_class = spark.read.csv( 50 | os.path.join(DIRECTORY, "ReferenceTables/CD_ProgramClass.csv"), 51 | sep="|", 52 | header=True, 53 | inferSchema=True, 54 | ).select( 55 | "ProgramClassID", 56 | "ProgramClassCD", 57 | F.col("EnglishDescription").alias("ProgramClass_Description"), 58 | ) 59 | 60 | # ex 5.5 table 61 | program_call_signs = spark.read.csv( 62 | os.path.join(DIRECTORY, "Call_Signs.csv"), 63 | sep=",", 64 | header=True, 65 | inferSchema=True, 66 | ).select( 67 | "LogIdentifierID", 68 | F.col("Undertaking_Name").alias("undertaking_name"), 69 | ) 70 | 71 | 72 | log_identifier = log_identifier.where(F.col("PrimaryFG") == 1) 73 | 74 | joined_logs = logs.join( 75 | log_identifier, 76 | "LogServiceID", # more complex logs["LogServiceID"] == log_identifier["LogServiceID"] 77 | how="inner", 78 | ) 79 | 80 | joined_logs_with_signs = joined_logs.join( 81 | program_call_signs, how="inner", on="LogIdentifierID" 82 | ) 83 | 84 | full_log = joined_logs_with_signs.join(cd_category, "CategoryID", how="left").join( 85 | cd_program_class, "ProgramClassID", how="left" 86 | ) 87 | 88 | commercial_programs = [ 89 | "COM", 90 | "PRC", 91 | "PGI", 92 | "PRO", 93 | "PSA", 94 | "MAG", 95 | "LOC", 96 | "SPO", 97 | "MER", 98 | "SOL", 99 | ] 100 | 101 | answer = ( 102 | full_log.groupby("LogIdentifierID", "undertaking_name") 103 | .agg( 104 | F.sum( 105 | F.when( 106 | F.trim(F.col("ProgramClassCD")).isin(commercial_programs), 107 | F.col("duration_seconds"), 108 | ).otherwise(0) 109 | ).alias("duration_commercial"), 110 | F.sum("duration_seconds").alias("duration_total"), 111 | ) 112 | .withColumn( 113 | "commercial_ratio", F.col("duration_commercial") / F.col("duration_total") 114 | ) 115 | ) 116 | 117 | # Fill null values 118 | answer_filled = answer.fillna(0) 119 | 120 | answer_filled.orderBy("commercial_ratio", ascending=False).show(1000, False) 121 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter05/ex5_6.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pyspark.sql.functions as F 4 | from pyspark.sql import SparkSession 5 | 6 | spark = SparkSession.builder.appName("Broadcast logs script ch05 (ex5_6)").getOrCreate() 7 | 8 | spark.sparkContext.setLogLevel("WARN") 9 | 10 | DIRECTORY = "/opt/spark/data/broadcast_logs" 11 | 12 | logs = spark.read.csv( 13 | os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.CSV"), 14 | sep="|", 15 | header=True, 16 | inferSchema=True, 17 | timestampFormat="yyyy-MM-dd", 18 | ) 19 | 20 | logs = logs.drop("BroadcastLogID", "SequenceNO") 21 | 22 | logs = logs.withColumn( 23 | "duration_seconds", 24 | ( 25 | F.col("Duration").substr(1, 2).cast("int") * 60 * 60 26 | + F.col("Duration").substr(4, 2).cast("int") * 60 27 | + F.col("Duration").substr(7, 2).cast("int") 28 | ), 29 | ) 30 | 31 | log_identifier = spark.read.csv( 32 | os.path.join(DIRECTORY, 
"ReferenceTables/LogIdentifier.csv"), 33 | sep="|", 34 | header=True, 35 | inferSchema=True, 36 | ) 37 | 38 | cd_category = spark.read.csv( 39 | os.path.join(DIRECTORY, "ReferenceTables/CD_Category.csv"), 40 | sep="|", 41 | header=True, 42 | inferSchema=True, 43 | ).select( 44 | "CategoryID", 45 | "CategoryCD", 46 | F.col("EnglishDescription").alias("Category_Description"), 47 | ) 48 | 49 | cd_program_class = spark.read.csv( 50 | os.path.join(DIRECTORY, "ReferenceTables/CD_ProgramClass.csv"), 51 | sep="|", 52 | header=True, 53 | inferSchema=True, 54 | ).select( 55 | "ProgramClassID", 56 | "ProgramClassCD", 57 | F.col("EnglishDescription").alias("ProgramClass_Description"), 58 | ) 59 | 60 | # ex 5.5 table 61 | program_call_signs = spark.read.csv( 62 | os.path.join(DIRECTORY, "Call_Signs.csv"), 63 | sep=",", 64 | header=True, 65 | inferSchema=True, 66 | ).select( 67 | "LogIdentifierID", 68 | F.col("Undertaking_Name").alias("undertaking_name"), 69 | ) 70 | 71 | log_identifier = log_identifier.where(F.col("PrimaryFG") == 1) 72 | 73 | joined_logs = logs.join( 74 | log_identifier, 75 | "LogServiceID", # more complex logs["LogServiceID"] == log_identifier["LogServiceID"] 76 | how="inner", 77 | ) 78 | 79 | joined_logs_with_signs = joined_logs.join( 80 | program_call_signs, how="inner", on="LogIdentifierID" 81 | ) 82 | 83 | full_log = joined_logs_with_signs.join(cd_category, "CategoryID", how="left").join( 84 | cd_program_class, "ProgramClassID", how="left" 85 | ) 86 | 87 | commercial_programs = [ 88 | "COM", 89 | "PRC", 90 | "PGI", 91 | "PRO", 92 | "PSA", 93 | "MAG", 94 | "LOC", 95 | "SPO", 96 | "MER", 97 | "SOL", 98 | ] 99 | 100 | answer = ( 101 | full_log.groupby("LogIdentifierID", "undertaking_name") 102 | .agg( 103 | F.sum( 104 | F.when( 105 | F.trim(F.col("ProgramClassCD")).isin(commercial_programs), 106 | F.col("duration_seconds"), 107 | ) 108 | .when( 109 | F.trim(F.col("ProgramClassCD")) == "PRC", 110 | F.col("duration_seconds") * 0.75, 111 | ) 112 | .otherwise(0) 113 | ).alias("duration_commercial"), 114 | F.sum("duration_seconds").alias("duration_total"), 115 | ) 116 | .withColumn( 117 | "commercial_ratio", F.col("duration_commercial") / F.col("duration_total") 118 | ) 119 | ) 120 | 121 | # Fill null values 122 | answer_filled = answer.fillna(0) 123 | 124 | answer_filled.orderBy("commercial_ratio", ascending=False).show(1000, False) 125 | 126 | # answer_filled.join( 127 | # program_call_signs, 128 | # how="left", 129 | # on=[answer_filled["ProgramClassCD"] == program_call_signs["LogIdentifierID"]], 130 | # ).orderBy("commercial_ratio", ascending=False).show(1000, False) 131 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter05/ex5_7.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pyspark.sql.functions as F 4 | from pyspark.sql import SparkSession 5 | 6 | spark = SparkSession.builder.appName("Broadcast logs script ch05 (ex5_7)").getOrCreate() 7 | 8 | spark.sparkContext.setLogLevel("WARN") 9 | 10 | DIRECTORY = "/opt/spark/data/broadcast_logs" 11 | 12 | logs = spark.read.csv( 13 | os.path.join(DIRECTORY, "BroadcastLogs_2018_Q3_M8_sample.CSV"), 14 | sep="|", 15 | header=True, 16 | inferSchema=True, 17 | timestampFormat="yyyy-MM-dd", 18 | ) 19 | 20 | logs = logs.drop("BroadcastLogID", "SequenceNO") 21 | 22 | logs = logs.withColumn( 23 | "duration_seconds", 24 | ( 25 | F.col("Duration").substr(1, 2).cast("int") * 60 * 60 26 | + F.col("Duration").substr(4, 
2).cast("int") * 60 27 | + F.col("Duration").substr(7, 2).cast("int") 28 | ), 29 | ) 30 | 31 | log_identifier = spark.read.csv( 32 | os.path.join(DIRECTORY, "ReferenceTables/LogIdentifier.csv"), 33 | sep="|", 34 | header=True, 35 | inferSchema=True, 36 | ) 37 | 38 | cd_category = spark.read.csv( 39 | os.path.join(DIRECTORY, "ReferenceTables/CD_Category.csv"), 40 | sep="|", 41 | header=True, 42 | inferSchema=True, 43 | ).select( 44 | "CategoryID", 45 | "CategoryCD", 46 | F.col("EnglishDescription").alias("Category_Description"), 47 | ) 48 | 49 | cd_program_class = spark.read.csv( 50 | os.path.join(DIRECTORY, "ReferenceTables/CD_ProgramClass.csv"), 51 | sep="|", 52 | header=True, 53 | inferSchema=True, 54 | ).select( 55 | "ProgramClassID", 56 | "ProgramClassCD", 57 | F.col("EnglishDescription").alias("ProgramClass_Description"), 58 | ) 59 | 60 | # ex 5.5 table 61 | program_call_signs = spark.read.csv( 62 | os.path.join(DIRECTORY, "Call_Signs.csv"), 63 | sep=",", 64 | header=True, 65 | inferSchema=True, 66 | ).select( 67 | "LogIdentifierID", 68 | F.col("Undertaking_Name").alias("undertaking_name"), 69 | ) 70 | 71 | log_identifier = log_identifier.where(F.col("PrimaryFG") == 1) 72 | 73 | joined_logs = logs.join( 74 | log_identifier, 75 | "LogServiceID", # more complex logs["LogServiceID"] == log_identifier["LogServiceID"] 76 | how="inner", 77 | ) 78 | 79 | joined_logs_with_signs = joined_logs.join( 80 | program_call_signs, how="inner", on="LogIdentifierID" 81 | ) 82 | 83 | full_log = joined_logs_with_signs.join(cd_category, "CategoryID", how="left").join( 84 | cd_program_class, "ProgramClassID", how="left" 85 | ) 86 | 87 | commercial_programs = [ 88 | "COM", 89 | "PRC", 90 | "PGI", 91 | "PRO", 92 | "PSA", 93 | "MAG", 94 | "LOC", 95 | "SPO", 96 | "MER", 97 | "SOL", 98 | ] 99 | 100 | answer = ( 101 | full_log.groupby("LogIdentifierID", "undertaking_name") 102 | .agg( 103 | F.sum( 104 | F.when( 105 | F.trim(F.col("ProgramClassCD")).isin(commercial_programs), 106 | F.col("duration_seconds"), 107 | ) 108 | .when( 109 | F.trim(F.col("ProgramClassCD")) == "PRC", 110 | F.col("duration_seconds") * 0.75, 111 | ) 112 | .otherwise(0) 113 | ).alias("duration_commercial"), 114 | F.sum("duration_seconds").alias("duration_total"), 115 | ) 116 | .withColumn( 117 | "commercial_ratio", F.col("duration_commercial") / F.col("duration_total") 118 | ) 119 | ) 120 | 121 | # Fill null values 122 | answer_filled = answer.fillna(0) 123 | 124 | answer_filled.groupby(F.round("commercial_ratio", 1).alias("commercial_ratio")).agg( 125 | F.count("*").alias("number_of_channels") 126 | ).orderBy("commercial_ratio", ascending=False).show(1000, False) 127 | 128 | # answer_filled.orderBy("commercial_ratio", ascending=False).show(1000, False) 129 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter06/defining_schema.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | import pyspark.sql.types as T 3 | from py4j.protocol import Py4JJavaError 4 | from pyspark.sql import SparkSession 5 | 6 | spark = SparkSession.builder.appName("Ch06 - defining schema").getOrCreate() 7 | 8 | data_dir = "/opt/spark/data" 9 | 10 | three_shows = spark.read.json(f"{data_dir}/shows/shows-*.json", multiLine=True) 11 | 12 | assert three_shows.count() == 3 13 | 14 | episode_links_schema = T.StructType( 15 | [T.StructField("self", T.StructType([T.StructField("href", T.StringType())]))] 16 | ) 17 | 18 | 
episode_image_schema = T.StructType( 19 | [T.StructField("medium", T.StringType()), T.StructField("original", T.StringType())] 20 | ) 21 | 22 | episode_schema = T.StructType( 23 | [ 24 | T.StructField("_links", episode_links_schema), 25 | T.StructField("airdate", T.DateType()), 26 | T.StructField("airstamp", T.TimestampType()), 27 | T.StructField("airtime", T.StringType()), 28 | T.StructField("id", T.StringType()), 29 | T.StructField("image", episode_image_schema), 30 | T.StructField("name", T.StringType()), 31 | T.StructField("number", T.LongType()), 32 | T.StructField("runtime", T.LongType()), 33 | T.StructField("season", T.LongType()), 34 | T.StructField("summary", T.StringType()), 35 | T.StructField("url", T.StringType()), 36 | ] 37 | ) 38 | 39 | embedded_schema = T.StructType( 40 | [ 41 | T.StructField( 42 | "_embedded", 43 | T.StructType([T.StructField("episodes", T.ArrayType(episode_schema))]), 44 | ) 45 | ] 46 | ) 47 | 48 | data_dir = "/opt/spark/data" 49 | 50 | shows_with_schema = spark.read.json( 51 | f"{data_dir}/shows/shows-*.json", 52 | multiLine=True, 53 | schema=embedded_schema, 54 | mode="FAILFAST", 55 | ) 56 | 57 | shows_with_schema.printSchema() 58 | 59 | for column in ["airdate", "airstamp"]: 60 | shows_with_schema.select(f"_embedded.episodes.{column}").select( 61 | F.explode(column) 62 | ).show(5) 63 | 64 | # Wrong schema example 65 | episode_schema_WRONG = T.StructType( 66 | [ 67 | T.StructField("_links", episode_links_schema), 68 | T.StructField("airdate", T.DateType()), 69 | T.StructField("airstamp", T.TimestampType()), 70 | T.StructField("airtime", T.LongType()), 71 | T.StructField("id", T.StringType()), 72 | T.StructField("image", episode_image_schema), 73 | T.StructField("name", T.LongType()), 74 | T.StructField("number", T.LongType()), 75 | T.StructField("runtime", T.LongType()), 76 | T.StructField("season", T.LongType()), 77 | T.StructField("summary", T.StringType()), 78 | T.StructField("url", T.StringType()), 79 | ] 80 | ) 81 | 82 | embedded_schema_WRONG = T.StructType( 83 | [ 84 | T.StructField( 85 | "_embedded", 86 | T.StructType( 87 | [T.StructField("episodes", T.ArrayType(episode_schema_WRONG))] 88 | ), 89 | ) 90 | ] 91 | ) 92 | 93 | shows_with_WRONG_schema = spark.read.json( 94 | f"{data_dir}/shows/shows-*.json", 95 | multiLine=True, 96 | schema=embedded_schema_WRONG, 97 | mode="FAILFAST", 98 | ) 99 | 100 | try: 101 | shows_with_WRONG_schema.show() 102 | except Py4JJavaError: 103 | pass 104 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter06/defining_schema_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pprint 3 | 4 | import pyspark.sql.functions as F 5 | import pyspark.sql.types as T 6 | from pyspark.sql import SparkSession 7 | 8 | spark = SparkSession.builder.appName("Ch06 - defining schema json").getOrCreate() 9 | 10 | data_dir = "/opt/spark/data" 11 | 12 | three_shows = spark.read.json(f"{data_dir}/shows/shows-*.json", multiLine=True) 13 | 14 | assert three_shows.count() == 3 15 | 16 | 17 | episode_links_schema = T.StructType( 18 | [T.StructField("self", T.StructType([T.StructField("href", T.StringType())]))] 19 | ) 20 | 21 | episode_image_schema = T.StructType( 22 | [T.StructField("medium", T.StringType()), T.StructField("original", T.StringType())] 23 | ) 24 | 25 | episode_schema = T.StructType( 26 | [ 27 | T.StructField("_links", episode_links_schema), 28 | T.StructField("airdate", T.DateType()), 29 | 
T.StructField("airstamp", T.TimestampType()), 30 | T.StructField("airtime", T.StringType()), 31 | T.StructField("id", T.StringType()), 32 | T.StructField("image", episode_image_schema), 33 | T.StructField("name", T.StringType()), 34 | T.StructField("number", T.LongType()), 35 | T.StructField("runtime", T.LongType()), 36 | T.StructField("season", T.LongType()), 37 | T.StructField("summary", T.StringType()), 38 | T.StructField("url", T.StringType()), 39 | ] 40 | ) 41 | 42 | embedded_schema = T.StructType( 43 | [ 44 | T.StructField( 45 | "_embedded", 46 | T.StructType([T.StructField("episodes", T.ArrayType(episode_schema))]), 47 | ) 48 | ] 49 | ) 50 | 51 | data_dir = "/opt/spark/data" 52 | 53 | shows_with_schema = spark.read.json( 54 | f"{data_dir}/shows/shows-*.json", 55 | multiLine=True, 56 | schema=embedded_schema, 57 | mode="FAILFAST", 58 | ) 59 | 60 | shows_with_schema.printSchema() 61 | 62 | pprint.pprint( 63 | shows_with_schema.select(F.explode("_embedded.episodes").alias("episode")) 64 | .select("episode.airtime") 65 | .schema.jsonValue() 66 | ) 67 | 68 | # pprint.pprint(T.StructField("array_example", T.ArrayType(T.StringType())).jsonValue()) 69 | # 70 | # pprint.pprint( 71 | # T.StructField("map_example", T.MapType(T.StringType(), T.LongType())).jsonValue() 72 | # ) 73 | 74 | pprint.pprint( 75 | T.StructType( 76 | [ 77 | T.StructField("map_example", T.MapType(T.StringType(), T.LongType())), 78 | T.StructField("array_example", T.ArrayType(T.StringType())), 79 | ] 80 | ).jsonValue() 81 | ) 82 | 83 | other_shows_schema = T.StructType.fromJson(json.loads(shows_with_schema.schema.json())) 84 | 85 | assert other_shows_schema == shows_with_schema.schema 86 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter06/ex6.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.types as T 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession.builder.appName("Ex6").getOrCreate() 5 | sc = spark.sparkContext 6 | 7 | # ex 6.1 8 | data = """{"name": "Sample name", 9 | "keywords": ["PySpark", "Python", "Data"]}""" 10 | 11 | df = spark.read.json(sc.parallelize([data])) 12 | 13 | df.show() 14 | df.printSchema() 15 | 16 | # ex 6.2 17 | data = """{"name": "Sample name", 18 | "keywords": ["PySpark", 3.2, "Data"]}""" 19 | 20 | df = spark.read.json(sc.parallelize([data])) 21 | 22 | df.show() 23 | df.printSchema() 24 | 25 | # ex 6.3 26 | # missing field definition 27 | # A StructType() will take a list of StructField(), not the types directly. 28 | # We need to wrap T.StringType(), T.LongType(), and T.LongType() into a StructField(), giv- ing them an appropriate name. 29 | 30 | # ex 6.4 31 | # If we have a struct with a field that corresponds to that column name. The column will become unreachable. 32 | # E.g. 
info.status -> column, info -> struct with field status 33 | 34 | # ex 6.5 35 | dict_schema = T.StructType( 36 | [ 37 | T.StructField("one", T.IntegerType()), 38 | T.StructField("two", T.ArrayType(T.IntegerType())), 39 | ] 40 | ) 41 | 42 | df = spark.createDataFrame( 43 | [{"one": 1, "two": [1, 2, 3]}], schema=dict_schema, verifySchema=True 44 | ) 45 | 46 | df.show() 47 | df.printSchema() 48 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter06/ex6_6.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession.builder.appName("Ex6_6").getOrCreate() 5 | 6 | data_dir = "/opt/spark/data" 7 | 8 | three_shows = spark.read.json(f"{data_dir}/shows/shows-*.json", multiLine=True) 9 | 10 | print(three_shows.count()) 11 | 12 | assert three_shows.count() == 3 13 | 14 | three_shows.printSchema() 15 | 16 | # From book solution: 17 | # sol6_6 = three_shows.select( 18 | # "name", 19 | # F.array_min("_embedded.episodes.airdate").cast("date").alias("first"), 20 | # F.array_max("_embedded.episodes.airdate").cast("date").alias("last"), ).select("name", ( 21 | # F.col("last") - F.col("first")).alias("tenure")) 22 | # 23 | # sol6_6.show(truncate=50) 24 | 25 | data = three_shows.select( 26 | "id", 27 | "name", 28 | F.to_timestamp(F.col("_embedded.episodes.airstamp").getItem(0)).alias( 29 | "first_ep_airstamp" 30 | ), 31 | F.to_timestamp(F.element_at(F.col("_embedded.episodes.airstamp"), -1)).alias( 32 | "last_ep_airstamp" 33 | ), 34 | ) 35 | 36 | data = data.select( 37 | "id", 38 | "name", 39 | ( 40 | F.unix_timestamp("last_ep_airstamp") - F.unix_timestamp("first_ep_airstamp") 41 | ).alias("duration_seconds"), 42 | ).orderBy("duration_seconds", ascending=False) 43 | 44 | data.show(5) 45 | 46 | data.printSchema() 47 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter06/ex6_7.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.appName("Ex6_7").getOrCreate() 4 | 5 | data_dir = "/opt/spark/data" 6 | 7 | three_shows = spark.read.json(f"{data_dir}/shows/shows-*.json", multiLine=True) 8 | 9 | print(three_shows.count()) 10 | 11 | assert three_shows.count() == 3 12 | 13 | three_shows.printSchema() 14 | 15 | # I don't get this assignment 16 | data = three_shows.select( 17 | "id", "name", "_embedded.episodes.name", "_embedded.episodes.airdate" 18 | ) 19 | 20 | data.show() 21 | 22 | data.printSchema() 23 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter06/ex6_8.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession.builder.appName("Ex6_8").getOrCreate() 5 | 6 | exo6_8 = spark.createDataFrame([[1, 2], [2, 4], [3, 9]], ["one", "square"]) 7 | 8 | exo6_8 = exo6_8.select( 9 | F.map_from_arrays(F.collect_list("one"), F.collect_list("square")).alias("my_map") 10 | ) 11 | 12 | exo6_8.show(truncate=False) 13 | exo6_8.printSchema() 14 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter06/reading_json.py: -------------------------------------------------------------------------------- 
1 | import pyspark.sql.functions as F 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession.builder.appName("Ch06 - reading json").getOrCreate() 5 | 6 | data_dir = "/opt/spark/data" 7 | 8 | shows = spark.read.json(f"{data_dir}/shows/shows-silicon-valley.json") 9 | 10 | print(shows.count()) 11 | 12 | assert shows.count() == 1 13 | 14 | three_shows = spark.read.json(f"{data_dir}/shows/shows-*.json", multiLine=True) 15 | 16 | print(three_shows.count()) 17 | 18 | assert three_shows.count() == 3 19 | 20 | three_shows.printSchema() 21 | 22 | # ARRAY EXAMPLES START HERE 23 | array_subset = three_shows.select("name", "genres") 24 | 25 | array_subset.show(3, False) 26 | 27 | array_subset = array_subset.select( 28 | "name", 29 | array_subset.genres[0].alias("dot_and_index"), 30 | F.col("genres")[0].alias("col_and_index"), 31 | array_subset.genres.getItem(0).alias("dot_and_method"), 32 | F.col("genres").getItem(0).alias("col_and_method"), 33 | ) 34 | 35 | array_subset.show() 36 | 37 | array_subset_repeated = array_subset.select( 38 | "name", 39 | F.lit("Comedy").alias("one"), 40 | F.lit("Horror").alias("two"), 41 | F.lit("Drama").alias("three"), 42 | F.col("dot_and_index"), 43 | ).select( 44 | "name", 45 | F.array("one", "two", "three").alias("Some_Genres"), 46 | F.array_repeat("dot_and_index", 5).alias("Repeated_Genres"), 47 | ) 48 | 49 | array_subset_repeated.show(3, False) 50 | 51 | print("SIZE") 52 | array_subset_repeated.select( 53 | "name", F.size("Some_Genres"), F.size("Repeated_Genres") 54 | ).show(3, False) 55 | 56 | print("ARRAY_DISTINCT") 57 | array_subset_repeated.select( 58 | "name", F.array_distinct("Some_Genres"), F.array_distinct("Repeated_Genres") 59 | ).show(3, False) 60 | 61 | print("ARRAY_INTERSECT") 62 | array_subset_repeated.select( 63 | "name", F.array_intersect("Some_Genres", "Repeated_Genres").alias("Genres") 64 | ).show(3, False) 65 | 66 | # MAP EXAMPLES START HERE 67 | columns = ["name", "language", "type"] 68 | 69 | shows_map = three_shows.select( 70 | *[F.lit(column) for column in columns], F.array(*columns).alias("values") 71 | ) 72 | 73 | shows_map.show(truncate=False) 74 | 75 | shows_map = shows_map.select(F.array(*columns).alias("keys"), "values") 76 | 77 | shows_map.show(truncate=False) 78 | 79 | # Now we create the actual Map 80 | shows_map = shows_map.select(F.map_from_arrays("keys", "values").alias("mapped")) 81 | 82 | shows_map.printSchema() 83 | 84 | shows_map.show(truncate=False) 85 | 86 | shows_map.select( 87 | F.col("mapped.name"), 88 | F.col("mapped")["name"], 89 | shows_map.mapped["name"], 90 | ).show() 91 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter06/reading_json_explode_collect.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | 3 | import pyspark.sql.functions as F 4 | from pyspark.sql import SparkSession 5 | 6 | spark = SparkSession.builder.appName("Ch06 - reading json explode collection").getOrCreate() 7 | 8 | data_dir = "/opt/spark/data" 9 | 10 | three_shows = spark.read.json(f"{data_dir}/shows/shows-*.json", multiLine=True) 11 | 12 | assert three_shows.count() == 3 13 | 14 | three_shows.printSchema() 15 | 16 | # EXPLODE AND COLLECT START HERE 17 | episodes = three_shows.select("id", F.explode("_embedded.episodes").alias("episodes")) 18 | 19 | episodes.show(5, truncate=70) 20 | 21 | pprint(episodes.count()) 22 | 23 | episode_name_id = three_shows.select( 24 | F.map_from_arrays( 25 | 
F.col("_embedded.episodes.id"), F.col("_embedded.episodes.name") 26 | ).alias("name_id") 27 | ) 28 | 29 | episode_name_id.show(5, truncate=70) 30 | 31 | episode_name_id = episode_name_id.select( 32 | F.posexplode("name_id").alias("position", "id", "name") 33 | ) 34 | 35 | episode_name_id.show(5) 36 | 37 | collected = episodes.groupby("id").agg(F.collect_list("episodes").alias("episodes")) 38 | 39 | pprint(collected.count()) 40 | 41 | collected.printSchema() 42 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter06/reading_json_struct.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | from pyspark.sql import SparkSession 3 | 4 | spark = SparkSession.builder.appName("Ch06 - reading json struct").getOrCreate() 5 | 6 | data_dir = "/opt/spark/data" 7 | 8 | three_shows = spark.read.json(f"{data_dir}/shows/shows-*.json", multiLine=True) 9 | 10 | assert three_shows.count() == 3 11 | 12 | three_shows.printSchema() 13 | 14 | # STRUCT STARTS HERE 15 | struct_ex = three_shows.select( 16 | F.struct(F.col("status"), F.col("weight"), F.lit(True).alias("has_watched")).alias( 17 | "info" 18 | ) 19 | ) 20 | 21 | struct_ex.show(3, False) 22 | 23 | struct_ex.printSchema() 24 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter07/backblaze.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | import pyspark.sql.types as T 3 | from pyspark.sql import SparkSession 4 | 5 | spark = SparkSession.builder.appName("Ch07 - Backblaze data").getOrCreate() 6 | 7 | data_dir = "/opt/spark/data" 8 | 9 | # only the minimal example from the book included 10 | q3 = spark.read.csv( 11 | f"{data_dir}/backblaze_data/data_Q3_2019", header=True, inferSchema=True 12 | ) 13 | 14 | backblaze_2019 = q3 15 | 16 | backblaze_2019 = backblaze_2019.select( 17 | [ 18 | F.col(x).cast(T.LongType()) if x.startswith("smart") else F.col(x) 19 | for x in backblaze_2019.columns 20 | ] 21 | ) 22 | 23 | view_name = "backblaze_stats_2019" 24 | backblaze_2019.createOrReplaceTempView(view_name) 25 | 26 | # Querying with SQL 27 | 28 | spark.sql(f"select serial_number from {view_name} where failure = 1").show(5) 29 | 30 | backblaze_2019.where("failure = 1").select(F.col("serial_number")).show(5) 31 | 32 | spark.sql( 33 | f""" 34 | SELECT 35 | model, 36 | min(capacity_bytes / pow(1024, 3)) min_GB, 37 | max(capacity_bytes / pow(1024, 3)) max_GB 38 | FROM {view_name} 39 | GROUP BY 1 40 | HAVING min_GB != max_GB 41 | ORDER BY 3 DESC 42 | """ 43 | ).show(5) 44 | 45 | backblaze_2019.groupby(F.col("model")).agg( 46 | F.min(F.col("capacity_bytes") / F.pow(F.lit(1024), 3)).alias("min_GB"), 47 | F.max(F.col("capacity_bytes") / F.pow(F.lit(1024), 3)).alias("max_GB"), 48 | ).where(F.col("min_GB") != F.col("max_GB")).orderBy( 49 | F.col("max_GB"), ascending=False 50 | ).show( 51 | 5 52 | ) 53 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter07/blending_sql_python.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | 3 | import pyspark.sql.functions as F 4 | from pyspark.sql import SparkSession 5 | 6 | spark = SparkSession.builder.appName( 7 | "Ch07 - Backblaze data - Blending SQL and Python" 8 | ).getOrCreate() 9 | 10 | # 
spark.sparkContext.setLogLevel("WARN") 11 | 12 | data_dir = "/opt/spark/data/backblaze_data" 13 | 14 | DATA_FILES = ["data_Q3_2019"] 15 | 16 | data = [ 17 | spark.read.csv(f"{data_dir}/{file_dir}", header=True, inferSchema=True) 18 | for file_dir in DATA_FILES 19 | ] 20 | 21 | common_columns = list( 22 | reduce( 23 | lambda acc, element: acc.intersection(element), [set(df.columns) for df in data] 24 | ) 25 | ) 26 | 27 | assert {"model", "capacity_bytes", "date", "failure"}.issubset(set(common_columns)) 28 | 29 | full_data = reduce( 30 | lambda acc, df: acc.select(common_columns).union(df.select(common_columns)), data 31 | ) 32 | 33 | # full_data.printSchema() 34 | 35 | # Methods that accept SQL-type statements: 36 | # selectExpr, epxr, where/filter 37 | # selectExpr() is just like the select() method with the exception that it will pro- cess SQL-style operations. 38 | 39 | full_data = full_data.selectExpr( 40 | "model", "(capacity_bytes / pow(1024, 3)) as capacity_GB", "date", "failure" 41 | ) 42 | 43 | # Alternative: 44 | # full_data = full_data.select( 45 | # F.col("model"), 46 | # (F.col("capacity_bytes") / F.pow(F.lit(1024), 3)).alias("capacity_GB"), 47 | # F.col("date"), 48 | # F.col("failure") 49 | # ) 50 | 51 | drive_days = full_data.groupby("model", "capacity_GB").agg( 52 | F.count("*").alias("drive_days") 53 | ) 54 | 55 | failures = ( 56 | full_data.where("failure = 1") 57 | .groupby("model", "capacity_GB") 58 | .agg(F.count("*").alias("failures")) 59 | ) 60 | 61 | # Alternative 62 | # failures = ( 63 | # full_data.where("failure = 1") 64 | # .groupby("model", "capacity_GB") 65 | # .agg(F.expr("count(*) failures")) 66 | # ) 67 | 68 | # failures.show(5) 69 | 70 | summarized_data = ( 71 | drive_days.join(failures, on=["model", "capacity_GB"], how="left") 72 | .fillna(0.0, "failures") 73 | .selectExpr("model", "capacity_GB", "(failures / drive_days) AS failure_rate") 74 | .cache() 75 | ) 76 | 77 | # pprint("Summarized data:") 78 | # summarized_data.show(5) 79 | 80 | 81 | def most_reliable_drive_for_capacity(data, capacity_GB=2048.0, precision=0.25, top_n=3): 82 | """Return the top 3 drive for a given approximate capacity. 
83 |
84 | Given a capacity in GB and a precision as a decimal number, we keep the N
85 | drives where:
86 | - the capacity is between capacity / (1 + precision) and capacity * (1 + precision)
87 | - the failure rate is the lowest
88 |
89 | """
90 | capacity_min = capacity_GB / (1 + precision)
91 | capacity_max = capacity_GB * (1 + precision)
92 |
93 | answer = (
94 | data.where(f"capacity_GB between {capacity_min} and {capacity_max}")
95 | .orderBy("failure_rate", "capacity_GB", ascending=[True, False])
96 | .limit(top_n)
97 | )
98 |
99 | return answer
100 |
101 |
102 | most_reliable_drive_for_capacity(summarized_data, capacity_GB=11176.0).show()
103 |
-------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter07/creating_view.py: --------------------------------------------------------------------------------
1 | import pyspark.sql.functions as F
2 | import pyspark.sql.types as T
3 | from pyspark.sql import SparkSession
4 |
5 | spark = SparkSession.builder.appName("Ch07 - Backblaze data - creating view").getOrCreate()
6 |
7 | data_dir = "/opt/spark/data"
8 |
9 | # only the minimal example from the book included
10 | q3 = spark.read.csv(
11 | f"{data_dir}/backblaze_data/data_Q3_2019", header=True, inferSchema=True
12 | )
13 |
14 | backblaze_2019 = q3
15 |
16 | backblaze_2019 = backblaze_2019.select(
17 | [
18 | F.col(x).cast(T.LongType()) if x.startswith("smart") else F.col(x)
19 | for x in backblaze_2019.columns
20 | ]
21 | )
22 |
23 | backblaze_2019.createOrReplaceTempView("drive_stats")
24 |
25 | spark.sql(
26 | """
27 | CREATE OR REPLACE TEMP VIEW drive_days AS
28 | SELECT model, COUNT(*) as drive_days
29 | FROM drive_stats
30 | GROUP BY model
31 | """
32 | )
33 |
34 | spark.sql(
35 | """
36 | CREATE OR REPLACE TEMP VIEW failures AS
37 | SELECT model, count(*) AS failures
38 | FROM drive_stats
39 | WHERE failure = 1
40 | GROUP BY model"""
41 | )
42 |
43 |
44 | drive_days = backblaze_2019.groupby(F.col("model")).agg(
45 | F.count(F.col("*")).alias("drive_days")
46 | )
47 |
48 | failures = (
49 | backblaze_2019.where(F.col("failure") == 1)
50 | .groupby(F.col("model"))
51 | .agg(F.count(F.col("*")).alias("failures"))
52 | )
53 |
54 | spark.sql(
55 | """
56 | SELECT
57 | drive_days.model,
58 | drive_days,
59 | failures
60 | FROM drive_days
61 | LEFT JOIN failures ON drive_days.model = failures.model
62 | """
63 | ).show(5)
64 |
65 | drive_days.join(failures, on="model", how="left").show(5)
66 |
-------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter07/download_backblaze_data.py: --------------------------------------------------------------------------------
1 | ##download_backblaze_data.py###################################################
2 | #
3 | # Requirements
4 | #
5 | # - wget (`pip install wget`)
6 | #
7 | # How to use:
8 | #
9 | # `python download_backblaze_data.py [parameter]`
10 | #
11 | # Parameters:
12 | #
13 | # - parameter: either `full` or `minimal`
14 | #
15 | # If set to `full` will download the data sets used in Chapter 7 (4 files,
16 | # ~2.3GB compressed, 12.4GB uncompressed).
17 | #
18 | # If set to `minimal` will download only 2019 Q3 (1 file, 574MB compressed,
20 | # 21 | ############################################################################### 22 | 23 | import sys 24 | from pathlib import Path 25 | 26 | import wget 27 | 28 | DATASETS_FULL = [ 29 | "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q1_2019.zip", 30 | "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q2_2019.zip", 31 | "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q3_2019.zip", 32 | "https://f001.backblazeb2.com/file/Backblaze-Hard-Drive-Data/data_Q4_2019.zip", 33 | ] 34 | 35 | DATASETS_MINIMAL = DATASETS_FULL[2:3] # Slice to keep as a list. Simplifies 36 | # the code later. 37 | 38 | if __name__ == "__main__": 39 | 40 | # DON'T RUN THIS FROM DOCKER - THE DIRECTORY NAMES DO NOT MATCH 41 | try: 42 | param = sys.argv[1] 43 | 44 | if param.lower() == "full": 45 | datasets = DATASETS_FULL 46 | elif param.lower() == "minimal": 47 | datasets = DATASETS_MINIMAL 48 | else: 49 | raise AssertionError() 50 | except (AssertionError, IndexError): 51 | print( 52 | "Parameter missing. Refer to the documentation at the top of the source code for more information" 53 | ) 54 | sys.exit(1) 55 | 56 | data_dir = "book_data/backblaze_data" 57 | 58 | Path(data_dir).mkdir(exist_ok=True) 59 | 60 | for dataset in datasets: 61 | print("\n", dataset.rsplit("/", maxsplit=1)[-1]) 62 | wget.download(dataset, out=data_dir) 63 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter07/ex7_2.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | 3 | import pyspark.sql.functions as F 4 | from pyspark.sql import SparkSession 5 | 6 | spark = SparkSession.builder.appName( 7 | "Ex7_2n" 8 | ).getOrCreate() 9 | 10 | data_dir = "/opt/spark/data/backblaze_data" 11 | 12 | DATA_FILES = ["data_Q3_2019"] 13 | 14 | data = [ 15 | spark.read.csv(f"{data_dir}/{file_dir}", header=True, inferSchema=True) 16 | for file_dir in DATA_FILES 17 | ] 18 | 19 | common_columns = list( 20 | reduce( 21 | lambda acc, element: acc.intersection(element), [set(df.columns) for df in data] 22 | ) 23 | ) 24 | 25 | assert {"model", "capacity_bytes", "date", "failure"}.issubset(set(common_columns)) 26 | 27 | full_data = reduce( 28 | lambda acc, df: acc.select(common_columns).union(df.select(common_columns)), data 29 | ) 30 | 31 | # full_data.printSchema() 32 | 33 | # Methods that accept SQL-type statements: 34 | # selectExpr, epxr, where/filter 35 | # selectExpr() is just like the select() method with the exception that it will pro- cess SQL-style operations. 
42 |
43 | full_data = full_data.selectExpr(
44 | "model", "(capacity_bytes / pow(1024, 3)) as capacity_GB", "date", "failure"
45 | )
46 |
47 | # Alternative:
48 | # full_data = full_data.select(
49 | # F.col("model"),
50 | # (F.col("capacity_bytes") / F.pow(F.lit(1024), 3)).alias("capacity_GB"),
51 | # F.col("date"),
52 | # F.col("failure")
53 | # )
54 |
55 |
56 | summarized_data = (
57 | full_data.groupBy("model", "capacity_GB")
58 | .agg(
59 | F.count("*").alias("drive_days"),
60 | F.sum(F.col("failure")).alias("failures"),
61 | )
62 | .selectExpr("model", "capacity_GB", "(failures / drive_days) AS failure_rate")
63 | )
64 |
65 | summarized_data.show(5)
66 |
-------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter07/ex7_3.py: --------------------------------------------------------------------------------
1 | from functools import reduce
2 |
3 | import pyspark.sql.functions as F
4 | from pyspark.sql import SparkSession
5 |
6 | spark = SparkSession.builder.appName(
7 | "Ex7_3"
8 | ).getOrCreate()
9 |
10 | data_dir = "/opt/spark/data/backblaze_data"
11 |
12 | DATA_FILES = ["data_Q3_2019"]
13 |
14 | data = [
15 | spark.read.csv(f"{data_dir}/{file_dir}", header=True, inferSchema=True)
16 | for file_dir in DATA_FILES
17 | ]
18 |
19 | common_columns = list(
20 | reduce(
21 | lambda acc, element: acc.intersection(element), [set(df.columns) for df in data]
22 | )
23 | )
24 |
25 | assert {"model", "capacity_bytes", "date", "failure"}.issubset(set(common_columns))
26 |
27 | full_data = reduce(
28 | lambda acc, df: acc.select(common_columns).union(df.select(common_columns)), data
29 | )
30 |
31 | # Methods that accept SQL-type statements:
32 | # selectExpr, expr, where/filter
33 | # selectExpr() is just like the select() method with the exception that it will process SQL-style operations.
34 |
35 | # Group by serial number, model, and capacity, and use the number of days
36 | # between the first and last report dates as the age of each drive; the
37 | # summary that follows reports the average drive age per model.
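38 | # Illustrative sketch (not part of the original exercise): F.datediff()
39 | # counts whole days between two dates, so a drive first reported on
40 | # 2019-07-01 and last reported on 2019-09-30 gets an age of 91 days:
41 | # spark.createDataFrame([("2019-07-01", "2019-09-30")], ["first", "last"]).select(
42 | #     F.datediff(F.col("last").cast("date"), F.col("first").cast("date")).alias("age")
43 | # ).show()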
44 | full_data = (
45 | full_data.selectExpr(
46 | "serial_number",
47 | "model",
48 | "capacity_bytes / pow(1024, 3) as capacity_GB",
49 | "date",
50 | "failure",
51 | )
52 | .groupby("serial_number", "model", "capacity_GB")
53 | .agg(
54 | F.datediff(F.max("date").cast("date"), F.min("date").cast("date")).alias("age")
55 | )
56 | )
57 |
58 | summarized_data = full_data.groupby("model", "capacity_GB").agg(
59 | F.avg("age").alias("avg_age")
60 | )
61 |
62 | #
63 | summarized_data.orderBy("avg_age", ascending=False).show(20)
64 |
-------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter07/ex7_4.py: --------------------------------------------------------------------------------
1 | from functools import reduce
2 |
3 | from pyspark.sql import SparkSession
4 |
5 | spark = SparkSession.builder.appName(
6 | "Ex7_4"
7 | ).getOrCreate()
8 |
9 | data_dir = "/opt/spark/data/backblaze_data"
10 |
11 | DATA_FILES = ["data_Q3_2019"]
12 |
13 | data = [
14 | spark.read.csv(f"{data_dir}/{file_dir}", header=True, inferSchema=True)
15 | for file_dir in DATA_FILES
16 | ]
17 |
18 | common_columns = list(
19 | reduce(
20 | lambda acc, element: acc.intersection(element), [set(df.columns) for df in data]
21 | )
22 | )
23 |
24 | assert {"model", "capacity_bytes", "date", "failure"}.issubset(set(common_columns))
25 |
26 | full_data = reduce(
27 | lambda acc, df: acc.select(common_columns).union(df.select(common_columns)), data
28 | )
29 |
30 | # Methods that accept SQL-type statements:
31 | # selectExpr, expr, where/filter
32 | # selectExpr() is just like the select() method with the exception that it will process SQL-style operations.
33 |
34 | # Keep only the rows reported on the first day of each month and sum the
35 | # capacity of every drive reporting that day, giving the total capacity
36 | # of the fleet (in TB) month over month.
37 | full_data = (
38 | full_data.selectExpr(
39 | "cast(date as date) as date",
40 | "capacity_bytes / pow(1024, 4) as capacity_TB",
41 | )
42 | .where("extract(day from date) = 1")
43 | .groupby("date")
44 | .sum("capacity_TB")
45 | )
46 |
47 | full_data.orderBy("date").show()
48 |
-------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter07/ex7_5.py: --------------------------------------------------------------------------------
1 | from functools import reduce
2 |
3 | import pyspark.sql.functions as F
4 | from pyspark.sql import SparkSession
5 |
6 | spark = SparkSession.builder.appName(
7 | "Ex7_5"
8 | ).getOrCreate()
9 |
10 | data_dir = "/opt/spark/data/backblaze_data"
11 |
12 | DATA_FILES = ["data_Q3_2019"]
13 |
14 | data = [
15 | spark.read.csv(f"{data_dir}/{file_dir}", header=True, inferSchema=True)
16 | for file_dir in DATA_FILES
17 | ]
18 |
19 | common_columns = list(
20 | reduce(
21 | lambda acc, element: acc.intersection(element), [set(df.columns) for df in data]
22 | )
23 | )
24 |
25 | assert {"model", "capacity_bytes", "date", "failure"}.issubset(set(common_columns))
26 |
27 | full_data = reduce(
28 | lambda acc, df: acc.select(common_columns).union(df.select(common_columns)), data
29 | )
30 |
31 | # Methods that accept SQL-type statements:
32 | # selectExpr, expr, where/filter
33 | # selectExpr() is just like the select() method with the exception that it will process SQL-style operations.
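34 | # Note (illustrative, using a hypothetical frame df with columns a and b):
35 | # when combining Column equality tests with &, wrap each comparison in
36 | # parentheses, because & binds tighter than == in Python:
37 | #   (df["a"] == 1) & (df["b"] == 2)   # what we want
38 | #   df["a"] == 1 & df["b"] == 2       # parsed as df["a"] == (1 & df["b"]) == 2
39 | # The join lower in this file depends on this.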
40 |
41 | # Some models report more than one capacity_bytes value. Count how often each
42 | # (model, capacity) pair occurs, then keep the most frequent capacity per
43 | # model and treat it as the canonical capacity for that model.
44 |
45 | capacity_count = full_data.groupby("model", "capacity_bytes").agg(
46 | F.count("*").alias("capacity_occurrence")
47 | )
48 |
49 | most_common_capacity = capacity_count.groupby("model").agg(
50 | F.max("capacity_occurrence").alias("most_common_capacity_occurrence")
51 | )
52 |
53 | sol = most_common_capacity.join(
54 | capacity_count,
55 | on=(
56 | (capacity_count["model"] == most_common_capacity["model"])
57 | & (
58 | capacity_count["capacity_occurrence"]
59 | == most_common_capacity["most_common_capacity_occurrence"]
60 | )
61 | ),
62 | ).select(most_common_capacity["model"], "capacity_bytes")
63 |
64 | sol.show(5)
65 |
66 | full_data = full_data.drop("capacity_bytes").join(sol, on="model")
67 |
-------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter07/periodic_table.py: --------------------------------------------------------------------------------
1 | import pyspark.sql.functions as F
2 | from pyspark.sql import SparkSession
3 |
4 | spark = SparkSession.builder.appName("Ch07 - Periodic table - PySpark vs SQL").getOrCreate()
5 |
6 | data_dir = "/opt/spark/data"
7 |
8 | elements = spark.read.csv(
9 | f"{data_dir}/elements/Periodic_Table_Of_Elements.csv", header=True, inferSchema=True
10 | )
11 |
12 | elements.where(F.col("phase") == "liq").groupby("period").count().show()
13 |
14 | # SQL:
15 | # SELECT period, COUNT(*)
16 | # FROM elements
17 | # WHERE phase = 'liq'
18 | # GROUP BY period
19 |
-------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter07/spark_catalog.py: --------------------------------------------------------------------------------
1 | from pprint import pprint
2 |
3 | from pyspark.sql import SparkSession
4 | from pyspark.sql.utils import AnalysisException
5 |
6 | # The spark catalog is a way for Spark to manage its SQL namespace
7 | spark = SparkSession.builder.appName("Ch07 - Periodic table - spark catalog").getOrCreate()
8 |
9 | data_dir = "/opt/spark/data"
10 |
11 | elements = spark.read.csv(
12 | f"{data_dir}/elements/Periodic_Table_Of_Elements.csv", header=True, inferSchema=True
13 | )
14 |
15 | elements.createOrReplaceTempView(
16 | "elements"
17 | ) # register the data frame so that we can query it with Spark SQL
18 |
19 | try:
20 | spark.sql(
21 | "select period, count(*) from elements where phase='liq' group by period"
22 | ).show(5)
23 | except AnalysisException as e:
24 | print(e)
25 |
26 | pprint(spark.catalog.listTables())
27 |
28 | spark.catalog.dropTempView("elements")
29 |
30 | pprint(spark.catalog.listTables())
31 |
-------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter07/sql_querying.py: --------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql.utils import AnalysisException
3 |
4 | spark = SparkSession.builder.appName("Ch07 - Periodic table - SQL querying").getOrCreate()
5 |
6 | data_dir = "/opt/spark/data"
7 |
8 | elements = spark.read.csv(
9 | f"{data_dir}/elements/Periodic_Table_Of_Elements.csv", header=True, inferSchema=True
10 | )
11 |
12 | elements.createOrReplaceTempView(
13 | "elements"
14 | ) # register the data frame so that we can
query it with Spark SQL 15 | 16 | try: 17 | spark.sql( 18 | "select period, count(*) from elements where phase='liq' group by period" 19 | ).show(5) 20 | except AnalysisException as e: 21 | print(e) 22 | -------------------------------------------------------------------------------- /spark_apps/data_analysis_book/chapter07/subquery_cte.py: -------------------------------------------------------------------------------- 1 | import pyspark.sql.functions as F 2 | import pyspark.sql.types as T 3 | from pyspark.sql import SparkSession 4 | 5 | spark = SparkSession.builder.appName("Ch07 - Backblaze data - subquery and cte").getOrCreate() 6 | 7 | data_dir = "/opt/spark/data" 8 | 9 | # only the minimal example from the book included 10 | q3 = spark.read.csv( 11 | f"{data_dir}/backblaze_data/data_Q3_2019", header=True, inferSchema=True 12 | ) 13 | 14 | backblaze_2019 = q3 15 | 16 | backblaze_2019 = backblaze_2019.select( 17 | [ 18 | F.col(x).cast(T.LongType()) if x.startswith("smart") else F.col(x) 19 | for x in backblaze_2019.columns 20 | ] 21 | ) 22 | 23 | backblaze_2019.createOrReplaceTempView("drive_stats") 24 | 25 | spark.sql( 26 | """ 27 | SELECT 28 | failures.model, 29 | failures / drive_days failure_rate 30 | FROM ( 31 | SELECT 32 | model, 33 | count(*) as drive_days 34 | FROM drive_stats 35 | GROUP BY model 36 | ) AS drive_days 37 | INNER JOIN ( 38 | SELECT 39 | model, 40 | count(*) as failures 41 | FROM drive_stats 42 | WHERE failure = 1 43 | GROUP BY model 44 | ) AS failures 45 | ON drive_days.model = failures.model 46 | ORDER BY 2 DESC 47 | """ 48 | ).show(5) 49 | 50 | spark.sql( 51 | """ 52 | WITH drive_days AS ( 53 | SELECT 54 | model, 55 | count(*) AS drive_days 56 | FROM drive_stats 57 | GROUP BY model 58 | ), 59 | failures AS ( 60 | SELECT 61 | model, 62 | count(*) AS failures 63 | FROM drive_stats 64 | WHERE failure = 1 65 | GROUP BY model 66 | ) 67 | SELECT 68 | failures.model, 69 | (failures / drive_days) failure_rate 70 | FROM drive_days 71 | INNER JOIN failures 72 | ON drive_days.model = failures.model 73 | ORDER BY 2 DESC 74 | """ 75 | ).show(5) 76 | 77 | 78 | def failure_rate(drive_stats): 79 | drive_days = drive_stats.groupby(F.col("model")).agg( 80 | F.count(F.col("*")).alias("drive_days") 81 | ) 82 | 83 | failures = ( 84 | drive_stats.where(F.col("failure") == 1) 85 | .groupby(F.col("model")) 86 | .agg(F.count(F.col("*")).alias("failures")) 87 | ) 88 | 89 | answer = ( 90 | drive_days.join(failures, on="model", how="inner") 91 | .withColumn("failure_rate", F.col("failures") / F.col("drive_days")) 92 | .orderBy(F.col("failure_rate").desc()) 93 | ) 94 | 95 | return answer 96 | 97 | 98 | failure_rate(backblaze_2019).show(5) 99 | 100 | # We are testing if we have a variable drive_days in scope 101 | # once the function returned confirms that our intermediate 102 | # frames are neatly confined inside the function scope. 
103 | print("drive_days" in dir()) 104 | -------------------------------------------------------------------------------- /ssh_config: -------------------------------------------------------------------------------- 1 | Host * 2 | UserKnownHostsFile /dev/null 3 | StrictHostKeyChecking no 4 | -------------------------------------------------------------------------------- /yarn/capacity-scheduler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | yarn.scheduler.capacity.root.queues 4 | default 5 | 6 | 7 | yarn.scheduler.capacity.root.default.capacity 8 | 100 9 | 10 | 11 | yarn.scheduler.capacity.resource-calculator 12 | org.apache.hadoop.yarn.util.resource.DominantResourceCalculator 13 | 14 | 15 | -------------------------------------------------------------------------------- /yarn/core-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | fs.defaultFS 6 | hdfs://spark-yarn-master:8080 7 | 8 | 9 | -------------------------------------------------------------------------------- /yarn/hdfs-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | dfs.namenode.name.dir 6 | /opt/hadoop/data/nameNode 7 | 8 | 9 | dfs.datanode.data.dir 10 | /opt/hadoop/data/dataNode 11 | 12 | 13 | dfs.replication 14 | 2 15 | 16 | 17 | dfs.permissions.enabled 18 | false 19 | 20 | 21 | -------------------------------------------------------------------------------- /yarn/mapred-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | mapreduce.framework.name 4 | yarn 5 | 6 | 7 | yarn.app.mapreduce.am.env 8 | HADOOP_MAPRED_HOME=$HADOOP_HOME 9 | 10 | 11 | mapreduce.map.env 12 | HADOOP_MAPRED_HOME=$HADOOP_HOME 13 | 14 | 15 | mapreduce.reduce.env 16 | HADOOP_MAPRED_HOME=$HADOOP_HOME 17 | 18 | 19 | yarn.app.mapreduce.am.resource.mb 20 | 1024 21 | 22 | 23 | mapreduce.map.memory.mb 24 | 512 25 | 26 | 27 | mapreduce.reduce.memory.mb 28 | 512 29 | 30 | 31 | -------------------------------------------------------------------------------- /yarn/spark-defaults.conf: -------------------------------------------------------------------------------- 1 | spark.master yarn 2 | spark.submit.deployMode client 3 | spark.driver.memory 512m 4 | spark.executor.memory 512m 5 | spark.yarn.am.memory 1G 6 | spark.eventLog.enabled true 7 | spark.eventLog.dir hdfs://spark-yarn-master:8080/spark-logs 8 | spark.history.provider org.apache.spark.deploy.history.FsHistoryProvider 9 | spark.history.fs.logDirectory hdfs://spark-yarn-master:8080/spark-logs 10 | spark.yarn.historyServer.address localhost:18080 11 | spark.history.fs.update.interval 10s 12 | spark.history.ui.port 18080 13 | -------------------------------------------------------------------------------- /yarn/yarn-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | yarn.resourcemanager.hostname 5 | spark-yarn-master 6 | 7 | 8 | yarn.nodemanager.resource.cpu-vcores 9 | 2 10 | 11 | 12 | yarn.nodemanager.aux-services 13 | mapreduce_shuffle 14 | 15 | 16 | yarn.nodemanager.resource.memory-mb 17 | 2048 18 | 19 | 20 | yarn.scheduler.maximum-allocation-mb 21 | 2048 22 | 23 | 24 | yarn.scheduler.minimum-allocation-mb 25 | 512 26 | 27 | 28 | 29 | yarn.log-aggregation-enable 30 | true 31 | 32 | 33 | --------------------------------------------------------------------------------