├── Chapter01 ├── wordcount-datarame.ipynb ├── wordcount-rdd.ipynb └── wordcount-sql.ipynb ├── Chapter02 ├── batch-data-ingestion.ipynb ├── data-formats.ipynb ├── file-sink.ipynb ├── mysql-sink.ipynb ├── mysql-source.ipynb ├── serving-layer.ipynb ├── streaming-data-ingestion.ipynb └── utils │ ├── kafka-setup.ipynb │ └── mysql-setup.ipynb ├── Chapter03 ├── building-delta-lake.ipynb ├── data-cleansing.ipynb ├── data-integration-cleansing.ipynb ├── reliability-challenges.ipynb └── test ├── Chapter04 ├── change-data-capture.py ├── datalake-source-stream.py ├── lambda-arch-delta.py ├── multi-hop-pipeline.py └── stateful-processing.py ├── Chapter05 └── data-wrangling.py ├── Chapter06 └── feature_enineering.py ├── Chapter07 └── supervised-learning.py ├── Chapter08 └── unsupervised-learning.py ├── Chapter09 └── ml-lifecycle.py ├── Chapter10 └── scale-out-python-ml.py ├── Chapter11 ├── databricks-charts-graphs.py └── python-charts-graphs.py ├── Chapter12 ├── spark-sql.py └── test.txt ├── LICENSE ├── README.md ├── all_chapters ├── README.md ├── ess_pyspark.dbc └── ess_pyspark.zip └── data ├── ConsolidatedCities.csv ├── adult.data ├── countries_codes.csv ├── images ├── DSC_0100.jpg ├── DSC_0101.jpg ├── DSC_0104.jpg ├── DSC_0105.jpg ├── DSC_0106.jpg ├── DSC_0107.jpg ├── DSC_0108.jpg ├── DSC_0109.jpg └── DSC_0110.jpg └── online_retail.zip /Chapter01/wordcount-datarame.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter01/wordcount-datarame.ipynb -------------------------------------------------------------------------------- /Chapter01/wordcount-rdd.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter01/wordcount-rdd.ipynb -------------------------------------------------------------------------------- /Chapter01/wordcount-sql.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter01/wordcount-sql.ipynb -------------------------------------------------------------------------------- /Chapter02/batch-data-ingestion.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter02/batch-data-ingestion.ipynb -------------------------------------------------------------------------------- /Chapter02/data-formats.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter02/data-formats.ipynb -------------------------------------------------------------------------------- /Chapter02/file-sink.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter02/file-sink.ipynb -------------------------------------------------------------------------------- /Chapter02/mysql-sink.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter02/mysql-sink.ipynb -------------------------------------------------------------------------------- /Chapter02/mysql-source.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter02/mysql-source.ipynb -------------------------------------------------------------------------------- /Chapter02/serving-layer.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter02/serving-layer.ipynb -------------------------------------------------------------------------------- /Chapter02/streaming-data-ingestion.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter02/streaming-data-ingestion.ipynb -------------------------------------------------------------------------------- /Chapter02/utils/kafka-setup.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter02/utils/kafka-setup.ipynb -------------------------------------------------------------------------------- /Chapter02/utils/mysql-setup.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter02/utils/mysql-setup.ipynb -------------------------------------------------------------------------------- /Chapter03/building-delta-lake.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter03/building-delta-lake.ipynb -------------------------------------------------------------------------------- /Chapter03/data-cleansing.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter03/data-cleansing.ipynb -------------------------------------------------------------------------------- /Chapter03/data-integration-cleansing.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter03/data-integration-cleansing.ipynb -------------------------------------------------------------------------------- /Chapter03/reliability-challenges.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter03/reliability-challenges.ipynb -------------------------------------------------------------------------------- /Chapter03/test: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Chapter04/change-data-capture.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter04/change-data-capture.py -------------------------------------------------------------------------------- /Chapter04/datalake-source-stream.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter04/datalake-source-stream.py -------------------------------------------------------------------------------- /Chapter04/lambda-arch-delta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter04/lambda-arch-delta.py -------------------------------------------------------------------------------- /Chapter04/multi-hop-pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter04/multi-hop-pipeline.py -------------------------------------------------------------------------------- /Chapter04/stateful-processing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter04/stateful-processing.py -------------------------------------------------------------------------------- /Chapter05/data-wrangling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter05/data-wrangling.py -------------------------------------------------------------------------------- /Chapter06/feature_enineering.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter06/feature_enineering.py -------------------------------------------------------------------------------- /Chapter07/supervised-learning.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter07/supervised-learning.py -------------------------------------------------------------------------------- /Chapter08/unsupervised-learning.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter08/unsupervised-learning.py -------------------------------------------------------------------------------- /Chapter09/ml-lifecycle.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter09/ml-lifecycle.py -------------------------------------------------------------------------------- /Chapter10/scale-out-python-ml.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter10/scale-out-python-ml.py -------------------------------------------------------------------------------- /Chapter11/databricks-charts-graphs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter11/databricks-charts-graphs.py -------------------------------------------------------------------------------- /Chapter11/python-charts-graphs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter11/python-charts-graphs.py -------------------------------------------------------------------------------- /Chapter12/spark-sql.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/Chapter12/spark-sql.py -------------------------------------------------------------------------------- /Chapter12/test.txt: -------------------------------------------------------------------------------- 1 | d 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/README.md -------------------------------------------------------------------------------- /all_chapters/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/all_chapters/README.md -------------------------------------------------------------------------------- /all_chapters/ess_pyspark.dbc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/all_chapters/ess_pyspark.dbc -------------------------------------------------------------------------------- /all_chapters/ess_pyspark.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/all_chapters/ess_pyspark.zip -------------------------------------------------------------------------------- /data/ConsolidatedCities.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/data/ConsolidatedCities.csv -------------------------------------------------------------------------------- /data/adult.data: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/data/adult.data -------------------------------------------------------------------------------- /data/countries_codes.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/data/countries_codes.csv -------------------------------------------------------------------------------- /data/images/DSC_0100.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/data/images/DSC_0100.jpg -------------------------------------------------------------------------------- /data/images/DSC_0101.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/data/images/DSC_0101.jpg -------------------------------------------------------------------------------- /data/images/DSC_0104.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/data/images/DSC_0104.jpg -------------------------------------------------------------------------------- /data/images/DSC_0105.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/data/images/DSC_0105.jpg -------------------------------------------------------------------------------- /data/images/DSC_0106.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/data/images/DSC_0106.jpg -------------------------------------------------------------------------------- /data/images/DSC_0107.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/data/images/DSC_0107.jpg -------------------------------------------------------------------------------- /data/images/DSC_0108.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/data/images/DSC_0108.jpg -------------------------------------------------------------------------------- /data/images/DSC_0109.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/data/images/DSC_0109.jpg -------------------------------------------------------------------------------- /data/images/DSC_0110.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/data/images/DSC_0110.jpg -------------------------------------------------------------------------------- /data/online_retail.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Essential-PySpark-for-Scalable-Data-Analytics/HEAD/data/online_retail.zip --------------------------------------------------------------------------------