├── .gitignore ├── README.md ├── code ├── README.md ├── bonus_chapters │ ├── README.md │ ├── TF-IDF │ │ ├── python │ │ │ ├── README.md │ │ │ ├── TF_IDF.log │ │ │ ├── TF_IDF.py │ │ │ └── data │ │ │ │ ├── doc1 │ │ │ │ ├── doc2 │ │ │ │ ├── doc3 │ │ │ │ └── doc4 │ │ └── scala │ │ │ ├── build.gradle │ │ │ ├── gradle │ │ │ └── wrapper │ │ │ │ ├── gradle-wrapper.jar │ │ │ │ └── gradle-wrapper.properties │ │ │ ├── gradlew │ │ │ ├── gradlew.bat │ │ │ ├── out │ │ │ └── production │ │ │ │ └── resources │ │ │ │ └── log4j.properties │ │ │ ├── settings.gradle │ │ │ └── src │ │ │ └── main │ │ │ ├── data │ │ │ ├── doc1 │ │ │ ├── doc2 │ │ │ ├── doc3 │ │ │ └── doc4 │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ └── scala │ │ │ └── org │ │ │ └── data │ │ │ └── algorithms │ │ │ └── spark │ │ │ └── bonus_chapter │ │ │ └── TFIDF.scala │ ├── Top-N │ │ ├── README.md │ │ ├── python │ │ │ ├── Top_N_movies_Dataframe_using_API.log │ │ │ ├── Top_N_movies_Dataframe_using_API.py │ │ │ ├── Top_N_movies_Dataframe_using_API.sh │ │ │ ├── Top_N_movies_Dataframe_using_SQL.log │ │ │ ├── Top_N_movies_Dataframe_using_SQL.py │ │ │ ├── Top_N_movies_Dataframe_using_SQL.sh │ │ │ ├── Top_N_movies_RDD_using_combineByKey.log │ │ │ ├── Top_N_movies_RDD_using_combineByKey.py │ │ │ ├── Top_N_movies_RDD_using_combineByKey.sh │ │ │ ├── Top_N_movies_RDD_using_groupByKey.log │ │ │ ├── Top_N_movies_RDD_using_groupByKey.py │ │ │ ├── Top_N_movies_RDD_using_groupByKey.sh │ │ │ ├── Top_N_movies_RDD_using_reduceByKey.log │ │ │ ├── Top_N_movies_RDD_using_reduceByKey.py │ │ │ ├── Top_N_movies_RDD_using_reduceByKey.sh │ │ │ ├── Top_N_movies_RDD_using_takeOrdered.log │ │ │ ├── Top_N_movies_RDD_using_takeOrdered.py │ │ │ └── Top_N_movies_RDD_using_takeOrdered.sh │ │ ├── scala │ │ │ └── README.md │ │ └── top-10.jpeg │ ├── UDF │ │ ├── README.md │ │ ├── UDF.pdf │ │ ├── python │ │ │ ├── README.md │ │ │ ├── dataframe_UDF_example.log │ │ │ └── dataframe_UDF_example.py │ │ └── scala │ │ │ └── README.md │ ├── anagrams │ │ ├── python │ │ │ ├── README.md │ │ │ ├── anagrams_by_combinebykey.log │ │ │ ├── anagrams_by_combinebykey.py │ │ │ ├── anagrams_by_groupbykey.log │ │ │ ├── anagrams_by_groupbykey.py │ │ │ ├── anagrams_by_reducebykey.log │ │ │ ├── anagrams_by_reducebykey.py │ │ │ └── sample_document.txt │ │ └── scala │ │ │ ├── README.md │ │ │ ├── build.gradle │ │ │ ├── data │ │ │ └── sample_document.txt │ │ │ ├── gradle │ │ │ └── wrapper │ │ │ │ ├── gradle-wrapper.jar │ │ │ │ └── gradle-wrapper.properties │ │ │ ├── gradlew │ │ │ ├── gradlew.bat │ │ │ ├── run_spark_applications_scripts │ │ │ ├── anagrams_by_combine_by_key.sh │ │ │ ├── anagrams_by_group_by_key.sh │ │ │ └── anagrams_by_reduce_by_key.sh │ │ │ ├── settings.gradle │ │ │ └── src │ │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ └── scala │ │ │ └── org │ │ │ └── data │ │ │ └── algorithms │ │ │ └── spark │ │ │ └── bonuschapter │ │ │ ├── AnagramsByCombineByKey.scala │ │ │ ├── AnagramsByGroupByKey.scala │ │ │ └── AnagramsByReduceByKey.scala │ ├── correlation │ │ ├── python │ │ │ ├── README.md │ │ │ ├── all_versus_all_correlation_dataframe.log │ │ │ ├── all_versus_all_correlation_dataframe.py │ │ │ ├── all_versus_all_correlation_rdd.log │ │ │ ├── all_versus_all_correlation_rdd.py │ │ │ ├── rdd_cartesian_in_action.txt │ │ │ └── sample_input.txt │ │ └── scala │ │ │ ├── build.gradle │ │ │ ├── gradle │ │ │ └── wrapper │ │ │ │ ├── gradle-wrapper.jar │ │ │ │ └── gradle-wrapper.properties │ │ │ ├── gradlew │ │ │ ├── gradlew.bat │ │ │ ├── settings.gradle │ │ │ └── src │ │ │ └── main │ │ │ 
├── resources │ │ │ └── log4j.properties │ │ │ └── scala │ │ │ └── org │ │ │ └── data │ │ │ └── algorithms │ │ │ └── spark │ │ │ └── bonuschapter │ │ │ └── AllVersusAllCorrelationDataframe.scala │ ├── dataframes │ │ ├── README.md │ │ ├── arrays_in_dataframes │ │ │ ├── python │ │ │ │ ├── README.md │ │ │ │ ├── creating_arrays_in_dataframes.log │ │ │ │ └── creating_arrays_in_dataframes.py │ │ │ └── scala │ │ │ │ └── README.md │ │ ├── dataframe_tutorial_from_collection.py.md │ │ ├── dataframe_tutorial_from_text_files.py.md │ │ └── explode_arrays_into_rows │ │ │ ├── python │ │ │ ├── README.md │ │ │ ├── explode_arrays_into_rows.log │ │ │ └── explode_arrays_into_rows.py │ │ │ └── scala │ │ │ └── README.md │ ├── join │ │ ├── README.md │ │ ├── python │ │ │ ├── README.md │ │ │ ├── dataframe_join_cross.log │ │ │ ├── dataframe_join_cross.py │ │ │ ├── dataframe_join_inner.log │ │ │ ├── dataframe_join_inner.py │ │ │ ├── dataframe_join_left.log │ │ │ ├── dataframe_join_left.py │ │ │ ├── dataframe_join_right.py │ │ │ ├── rdd_join_inner.log │ │ │ ├── rdd_join_inner.py │ │ │ ├── rdd_join_left.log │ │ │ ├── rdd_join_left.py │ │ │ ├── rdd_join_right.log │ │ │ └── rdd_join_right.py │ │ └── scala │ │ │ └── README.md │ ├── k-mers │ │ ├── python │ │ │ ├── README.md │ │ │ ├── kmer_fasta.py │ │ │ ├── kmer_fastq.py │ │ │ ├── sample_1.fasta │ │ │ └── sample_1.fastq │ │ └── scala │ │ │ ├── README.md │ │ │ ├── build.gradle │ │ │ ├── data │ │ │ ├── sample_1.fasta │ │ │ └── sample_1.fastq │ │ │ ├── gradle │ │ │ └── wrapper │ │ │ │ ├── gradle-wrapper.jar │ │ │ │ └── gradle-wrapper.properties │ │ │ ├── gradlew │ │ │ ├── gradlew.bat │ │ │ ├── run_spark_applications_scripts │ │ │ ├── kmer_fast_q.sh │ │ │ └── kmer_fasta.sh │ │ │ ├── settings.gradle │ │ │ └── src │ │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ └── scala │ │ │ └── org │ │ │ └── data │ │ │ └── algorithms │ │ │ └── spark │ │ │ └── bonuschapter │ │ │ ├── KMERFastQ.scala │ │ │ └── KMERFasta.scala │ ├── lambda_expressions │ │ ├── Lambda_Expressions.pdf │ │ ├── Lambda_Expressions_basics.md │ │ └── README.md │ ├── mappartitions │ │ ├── README.md │ │ ├── SAMPLE_INPUT_FILES │ │ │ ├── file1.txt │ │ │ ├── file2.txt │ │ │ └── file3.txt │ │ ├── images │ │ │ ├── mappartitions_image_1.drawio.png │ │ │ └── mappartitions_image_2.drawio.png │ │ ├── python │ │ │ ├── README.md │ │ │ ├── find_min_max_by_mappartitions.log │ │ │ ├── find_min_max_by_mappartitions.py │ │ │ └── find_min_max_by_mappartitions.sh │ │ └── scala │ │ │ └── README.md │ ├── physical_partitioning │ │ ├── README.md │ │ ├── continents_countries_temp.csv │ │ ├── data_partitioning.png │ │ ├── partition_by_one_column.py │ │ ├── partition_by_one_column_schema.sql │ │ ├── partition_by_two_columns.py │ │ └── partition_by_two_columns_schema.sql │ ├── pyspark_tutorial │ │ ├── README.md │ │ └── pyspark_tutorial.md │ └── wordcount │ │ ├── README.md │ │ ├── python │ │ ├── README.md │ │ ├── data │ │ │ ├── foxdata.txt │ │ │ └── sample_document.txt │ │ ├── word_count_by_dataframe.log │ │ ├── word_count_by_dataframe.py │ │ ├── word_count_by_dataframe_shorthand.log │ │ ├── word_count_by_dataframe_shorthand.py │ │ ├── wordcount_by_combinebykey.log │ │ ├── wordcount_by_combinebykey.py │ │ ├── wordcount_by_combinebykey.sh │ │ ├── wordcount_by_groupbykey.py │ │ ├── wordcount_by_groupbykey.sh │ │ ├── wordcount_by_groupbykey_shorthand.py │ │ ├── wordcount_by_groupbykey_shorthand.sh │ │ ├── wordcount_by_reducebykey.py │ │ ├── wordcount_by_reducebykey.sh │ │ ├── wordcount_by_reducebykey_shorthand.py │ │ ├── 
wordcount_by_reducebykey_shorthand.sh │ │ ├── wordcount_by_reducebykey_with_filter.py │ │ └── wordcount_by_reducebykey_with_filter.sh │ │ ├── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── data │ │ │ └── sample_document.txt │ │ ├── gradle │ │ │ └── wrapper │ │ │ │ ├── gradle-wrapper.jar │ │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── run_spark_applications_scripts │ │ │ ├── word_count_by_group_by_key.sh │ │ │ ├── word_count_by_group_by_key_shorthand.sh │ │ │ ├── word_count_by_reduce_by_key.sh │ │ │ ├── word_count_by_reduce_by_key_shorthand.sh │ │ │ └── word_count_by_reduce_by_key_with_filter.sh │ │ ├── settings.gradle │ │ └── src │ │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ └── scala │ │ │ └── org │ │ │ └── data │ │ │ └── algorithms │ │ │ └── spark │ │ │ └── bonuschapter │ │ │ ├── WordCountByGroupByKey.scala │ │ │ ├── WordCountByGroupByKeyShorthand.scala │ │ │ ├── WordCountByReduceByKey.scala │ │ │ ├── WordCountByReduceByKeyShorthand.scala │ │ │ └── WordCountByReduceByKeyWithFilter.scala │ │ └── word_count_with_mapreduce.png ├── chap01 │ ├── data │ │ ├── census_2010.json │ │ ├── emps.txt │ │ ├── sample_5_records.txt │ │ └── users.txt │ ├── python │ │ ├── average_by_key_use_aggregatebykey.py │ │ ├── average_by_key_use_aggregatebykey.sh │ │ ├── average_by_key_use_combinebykey.py │ │ ├── average_by_key_use_combinebykey.sh │ │ ├── average_by_key_use_foldbykey.py │ │ ├── average_by_key_use_foldbykey.sh │ │ ├── average_by_key_use_groupbykey.py │ │ ├── average_by_key_use_groupbykey.sh │ │ ├── average_by_key_use_reducebykey.py │ │ ├── average_by_key_use_reducebykey.sh │ │ ├── dataframe_action_describe.py │ │ ├── dataframe_action_describe.sh │ │ ├── dataframe_add_column.py │ │ ├── dataframe_add_column.sh │ │ ├── dataframe_drop_column.py │ │ ├── dataframe_drop_column.sh │ │ ├── dataframe_filter.py │ │ ├── dataframe_filter.sh │ │ ├── dataframe_join_cross.py │ │ ├── dataframe_join_cross.sh │ │ ├── dataframe_join_inner.py │ │ ├── dataframe_join_inner.sh │ │ ├── dataframe_join_left.py │ │ ├── dataframe_join_left.sh │ │ ├── dataframe_join_right.py │ │ ├── dataframe_join_right.sh │ │ ├── dataframe_sql.py │ │ ├── dataframe_sql.sh │ │ ├── rdd_transformation_cartesian.py │ │ ├── rdd_transformation_cartesian.sh │ │ ├── rdd_transformation_combinebykey.py │ │ ├── rdd_transformation_combinebykey.sh │ │ ├── rdd_transformation_filter.py │ │ ├── rdd_transformation_filter.sh │ │ ├── rdd_transformation_flatmap.py │ │ ├── rdd_transformation_flatmap.sh │ │ ├── rdd_transformation_groupbykey.py │ │ ├── rdd_transformation_groupbykey.sh │ │ ├── rdd_transformation_join.py │ │ ├── rdd_transformation_join.sh │ │ ├── rdd_transformation_map.py │ │ ├── rdd_transformation_map.sh │ │ ├── rdd_transformation_mappartitions.py │ │ ├── rdd_transformation_mappartitions.sh │ │ ├── rdd_transformation_mappartitions_handle_empty_partitions.py │ │ ├── rdd_transformation_mappartitions_handle_empty_partitions.sh │ │ ├── rdd_transformation_reducebykey.py │ │ ├── rdd_transformation_reducebykey.sh │ │ ├── rdd_transformation_sortby.py │ │ ├── rdd_transformation_sortby.sh │ │ ├── rdd_transformation_takeordered.py │ │ └── rdd_transformation_takeordered.sh │ └── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── run.sh │ │ ├── run_spark_applications_scripts │ │ ├── aggregate_by_key_use_aggregate_by_key.sh │ │ ├── aggregate_by_key_use_combine_by_key.sh │ │ 
├── average_by_key_use_fold_by_key.sh │ │ ├── average_by_key_use_group_by_key.sh │ │ ├── average_by_key_use_reduce_by_key.sh │ │ ├── dataframe_action_describe.sh │ │ ├── dataframe_add_column.sh │ │ ├── dataframe_drop_column.sh │ │ ├── dataframe_filter.sh │ │ ├── dataframe_join_cross.sh │ │ ├── dataframe_join_inner.sh │ │ ├── dataframe_join_left.sh │ │ ├── dataframe_join_right.sh │ │ ├── dataframe_sql.sh │ │ ├── rdd_action_take_ordered.sh │ │ ├── rdd_transformation_cartesian.sh │ │ ├── rdd_transformation_combine_by_key.sh │ │ ├── rdd_transformation_filter.sh │ │ ├── rdd_transformation_flat_map.sh │ │ ├── rdd_transformation_group_by_key.sh │ │ ├── rdd_transformation_join.sh │ │ ├── rdd_transformation_map.sh │ │ ├── rdd_transformation_map_partition.sh │ │ ├── rdd_transformation_mappartitions_handle_empty_partitions.sh │ │ ├── rdd_transformation_reduce_by_key.sh │ │ └── rdd_transformation_sort_by.sh │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ ├── resources │ │ └── log4j.properties │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch01 │ │ ├── AggregateByKeyUseAggregateByKey.scala │ │ ├── AggregateByKeyUseCombineByKey.scala │ │ ├── AverageByKeyUseFoldByKey.scala │ │ ├── AverageByKeyUseGroupByKey.scala │ │ ├── AverageByKeyUseReduceByKey.scala │ │ ├── DataframeActionDescribe.scala │ │ ├── DataframeAddColumn.scala │ │ ├── DataframeDropColumn.scala │ │ ├── DataframeFilter.scala │ │ ├── DataframeJoinCross.scala │ │ ├── DataframeJoinInner.scala │ │ ├── DataframeJoinLeft.scala │ │ ├── DataframeJoinRight.scala │ │ ├── DataframeSQL.scala │ │ ├── RDDActionTakeOrdered.scala │ │ ├── RDDTransformationCartesian.scala │ │ ├── RDDTransformationCombineByKey.scala │ │ ├── RDDTransformationFilter.scala │ │ ├── RDDTransformationFlatMap.scala │ │ ├── RDDTransformationGroupByKey.scala │ │ ├── RDDTransformationJoin.scala │ │ ├── RDDTransformationMap.scala │ │ ├── RDDTransformationMapPartition.scala │ │ ├── RDDTransformationMappartitionsHandleEmptyPartitions.scala │ │ ├── RDDTransformationReduceByKey.scala │ │ └── RDDTransformationSortBy.scala ├── chap02 │ ├── python │ │ ├── DNA-FASTA-PERFORMANCE │ │ │ └── performance_of_FASTA_versions_1_2_3.txt │ │ ├── DNA-FASTA-V1 │ │ │ ├── dna_base_count_ver_1.py │ │ │ ├── dna_base_count_ver_1.sh │ │ │ ├── dna_base_count_ver_1_1GB.sh │ │ │ └── dna_base_count_ver_1_big.sh │ │ ├── DNA-FASTA-V2 │ │ │ ├── dna_base_count_ver_2.py │ │ │ ├── dna_base_count_ver_2.sh │ │ │ ├── dna_base_count_ver_2_1GB.sh │ │ │ └── dna_base_count_ver_2_big.sh │ │ ├── DNA-FASTA-V3 │ │ │ ├── dna_base_count_ver_3.py │ │ │ ├── dna_base_count_ver_3.sh │ │ │ ├── dna_base_count_ver_3_1GB.sh │ │ │ └── dna_base_count_ver_3_big.sh │ │ ├── DNA-FASTQ │ │ │ ├── dna_base_count_fastq.py │ │ │ └── dna_base_count_fastq.sh │ │ ├── README.md │ │ └── data │ │ │ ├── sample.fasta │ │ │ └── sp1.fastq │ └── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── data │ │ ├── sample.fasta │ │ └── sp1.fastq │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── run_spark_applications_scripts │ │ ├── dna_base_count_fastq.sh │ │ ├── dna_base_count_ver_1.sh │ │ ├── dna_base_count_ver_1_1GB.sh │ │ ├── dna_base_count_ver_1_big.sh │ │ ├── dna_base_count_ver_2.sh │ │ ├── dna_base_count_ver_2_1GB.sh │ │ ├── dna_base_count_ver_2_big.sh │ │ ├── dna_base_count_ver_3.sh │ │ ├── dna_base_count_ver_3_1GB.sh │ │ └── dna_base_count_ver_3_big.sh │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ ├── resources │ │ ├── input.txt │ 
│ ├── log4j.properties │ │ └── sp1.fastq │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch02 │ │ ├── DNABaseCountFastq.scala │ │ ├── DNABaseCountVER1.scala │ │ ├── DNABaseCountVER3.scala │ │ └── DNSBaseCountVER2.scala ├── chap03 │ ├── python │ │ ├── bigrams_input.txt │ │ ├── flatmap_transformation_1_from_collection.py │ │ ├── flatmap_transformation_1_from_file.py │ │ ├── map_transformation_1_from_collection.py │ │ ├── map_transformation_1_from_file.py │ │ ├── mappartitions_transformation_1.py │ │ ├── mapvalues_transformation_1.py │ │ ├── mapvalues_transformation_2.py │ │ ├── mapvalues_transformation_3.py │ │ ├── sample_input │ │ └── sample_numbers.txt │ └── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── data │ │ ├── bigrams_input.txt │ │ ├── sample_input.csv │ │ └── sample_numbers.txt │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── run.sh │ │ ├── run_spark_applications_scripts │ │ ├── flat_map_transformation_1_from_file.sh │ │ ├── flatmap_transformation_1_from_collection.sh │ │ ├── map_partitions_transformation_1.sh │ │ ├── map_transformation_1_from_collection.sh │ │ ├── map_transformation_1_from_file.sh │ │ ├── map_values_transformation_1.sh │ │ ├── map_values_transformation_2.sh │ │ └── map_values_transformation_3.sh │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ ├── resources │ │ ├── log4j.properties │ │ └── sample_numbers.txt │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch03 │ │ ├── FlatMapTransformation1FromFile.scala │ │ ├── FlatmapTransformation1FromCollection.scala │ │ ├── MapPartitionsTransformation1.scala │ │ ├── MapTransformation1FromCollection.scala │ │ ├── MapTransformation1FromFile.scala │ │ ├── MapValuesTransformation1.scala │ │ ├── MapValuesTransformation2.scala │ │ └── MapValuesTransformation3.scala ├── chap04 │ ├── python │ │ ├── README.md │ │ ├── average_by_key_use_aggregatebykey.py │ │ ├── average_by_key_use_aggregatebykey.sh │ │ ├── average_by_key_use_combinebykey.py │ │ ├── average_by_key_use_combinebykey.sh │ │ ├── average_by_key_use_groupbykey.py │ │ ├── average_by_key_use_groupbykey.sh │ │ ├── average_by_key_use_reducebykey.py │ │ ├── average_by_key_use_reducebykey.sh │ │ ├── dataframe_median_approx.py │ │ ├── dataframe_median_exact.py │ │ ├── exact_median_by_key_use_aggregatebykey.py │ │ ├── exact_median_by_key_use_combinebykey.py │ │ ├── exact_median_by_key_use_groupbykey.py │ │ └── exact_median_by_key_use_reducebykey.py │ └── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── run.sh │ │ ├── run_spark_applications_scripts │ │ ├── average_by_key_use_aggregate_by_key.sh │ │ ├── average_by_key_use_combine_by_key.sh │ │ ├── average_by_key_use_group_by_key.sh │ │ ├── average_by_key_use_reduce_by_key.sh │ │ ├── dataframe_median_approx.sh │ │ ├── dataframe_median_exact.sh │ │ ├── exact_median_by_key_use_aggregate_by_key.sh │ │ ├── exact_median_by_key_use_combine_by_key.sh │ │ ├── exact_median_by_key_use_group_by_key.sh │ │ └── exact_median_by_key_use_reduce_by_key.sh │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ ├── resources │ │ └── log4j.properties │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch04 │ │ ├── AverageByKeyUseAggregateByKey.scala │ │ ├── AverageByKeyUseCombineByKey.scala │ │ ├── AverageByKeyUseGroupByKey.scala │ │ 
├── AverageByKeyUseReduceByKey.scala │ │ ├── DataframeMedianApprox.scala │ │ ├── DataframeMedianExact.scala │ │ ├── ExactMedianByKeyUseAggregateByKey.scala │ │ ├── ExactMedianByKeyUseCombineByKey.scala │ │ ├── ExactMedianByKeyUseGroupByKey.scala │ │ └── ExactMedianByKeyUseReduceByKey.scala ├── chap05 │ ├── Partitions_and_Executors.md │ ├── README.md │ ├── python │ │ ├── README.md │ │ ├── customers.RECORD.FORMAT.txt │ │ ├── customers.txt │ │ ├── customers_with_date.RECORD.FORMAT.txt │ │ ├── customers_with_date.txt │ │ ├── partition_data_as_text_by_year_month.log │ │ ├── partition_data_as_text_by_year_month.py │ │ ├── partition_data_by_customer_and_year.log │ │ ├── partition_data_by_customer_and_year.py │ │ ├── partition_data_by_customer_and_year.sh │ │ └── partition_data_by_customer_and_year_single_file.py │ └── scala │ │ └── README.md ├── chap06 │ ├── README.md │ ├── python │ │ ├── airports.json │ │ ├── breadth_first_search_example.log │ │ ├── breadth_first_search_example.py │ │ ├── breadth_first_search_example.sh │ │ ├── connected_component_example.log │ │ ├── connected_component_example.py │ │ ├── connected_component_example.sh │ │ ├── graph_builder.log │ │ ├── graph_builder.py │ │ ├── graph_builder.sh │ │ ├── label_propagation_algorithm_example.log │ │ ├── label_propagation_algorithm_example.py │ │ ├── label_propagation_algorithm_example.sh │ │ ├── pagerank.py │ │ ├── pagerank_data.txt │ │ ├── pagerank_example.log │ │ ├── pagerank_example.py │ │ ├── pagerank_example.sh │ │ ├── sample_graph_edges.txt │ │ ├── sample_graph_vertices.txt │ │ ├── shortest_path_finder.log │ │ ├── shortest_path_finder.py │ │ ├── shortest_path_finder.sh │ │ ├── triangles_counter.log │ │ ├── triangles_counter.py │ │ ├── triangles_counter.sh │ │ ├── unique_triangles_finder.log │ │ ├── unique_triangles_finder.py │ │ └── unique_triangles_finder.sh │ └── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch06 │ │ └── BreadthFirstSearchExample.scala ├── chap07 │ ├── python │ │ ├── cats.no.header.csv │ │ ├── cats.with.header.csv │ │ ├── datasource_csv_reader_no_header.py │ │ ├── datasource_csv_reader_no_header.sh │ │ ├── datasource_csv_reader_with_header.py │ │ ├── datasource_csv_reader_with_header.sh │ │ ├── datasource_csv_writer.py │ │ ├── datasource_csv_writer.sh │ │ ├── datasource_elasticsearch_reader.py │ │ ├── datasource_elasticsearch_reader.sh │ │ ├── datasource_elasticsearch_writer.py │ │ ├── datasource_elasticsearch_writer.sh │ │ ├── datasource_gzip_reader.py │ │ ├── datasource_gzip_reader.sh │ │ ├── datasource_jdbc_reader.py │ │ ├── datasource_jdbc_reader.sh │ │ ├── datasource_jdbc_writer.py │ │ ├── datasource_jdbc_writer.sh │ │ ├── datasource_json_reader_multi_line.py │ │ ├── datasource_json_reader_multi_line.sh │ │ ├── datasource_json_reader_single_line.py │ │ ├── datasource_json_reader_single_line.sh │ │ ├── datasource_mongodb_reader.py │ │ ├── datasource_mongodb_reader.sh │ │ ├── datasource_mongodb_writer.py │ │ ├── datasource_mongodb_writer.sh │ │ ├── datasource_redis_reader.py │ │ ├── datasource_redis_reader.sh │ │ ├── datasource_redis_writer.py │ │ ├── datasource_redis_writer.sh │ │ ├── datasource_textfile_reader.py │ │ ├── datasource_textfile_reader.sh │ │ ├── datasource_textfile_writer.py │ │ ├── datasource_textfile_writer.sh │ │ ├── images │ │ │ ├── cat1.jpg │ │ │ ├── cat2.jpg │ │ │ ├── cat3.jpg │ │ │ ├── cat4.jpg │ │ │ ├── duck1.jpg │ │ │ ├── 
duck2.jpg │ │ │ └── not-image.txt │ │ ├── mongodb_coll44.png │ │ ├── mongodb_coll66.png │ │ ├── name_age_salary.csv │ │ ├── people.txt │ │ ├── sample_multi_line.json │ │ ├── sample_no_header.csv │ │ ├── sample_numbers.txt │ │ ├── sample_single_line.json │ │ ├── sample_with_header.csv │ │ └── twitter.avro │ └── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── data │ │ ├── sample_multi_line.json │ │ ├── sample_no_header.csv │ │ ├── sample_no_header.csv.gz │ │ ├── sample_numbers.txt │ │ ├── sample_single_line.json │ │ └── sample_with_header.csv │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── run.sh │ │ ├── run_spark_applications_scripts │ │ ├── datasource_csv_reader_header.sh │ │ ├── datasource_csv_reader_no_header.sh │ │ ├── datasource_csv_writer.sh │ │ ├── datasource_elasticsearch_reader.sh │ │ ├── datasource_elasticsearch_writer.sh │ │ ├── datasource_gzip_reader.sh │ │ ├── datasource_jdbc_reader.sh │ │ ├── datasource_jdbc_writer.sh │ │ ├── datasource_json_reader_multi_line.sh │ │ ├── datasource_json_reader_single_line.sh │ │ ├── datasource_mongodb_reader.sh │ │ ├── datasource_mongodb_writer.sh │ │ ├── datasource_redis_reader.sh │ │ ├── datasource_redis_writer.sh │ │ ├── datasource_textfile_reader.sh │ │ └── datasource_textfile_writer.sh │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ ├── resources │ │ └── log4j.properties │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch07 │ │ ├── DatasourceCSVReaderHeader.scala │ │ ├── DatasourceCSVReaderNoHeader.scala │ │ ├── DatasourceCSVWriter.scala │ │ ├── DatasourceElasticsearchReader.scala │ │ ├── DatasourceElasticsearchWriter.scala │ │ ├── DatasourceGZIPReader.scala │ │ ├── DatasourceJDBCReader.scala │ │ ├── DatasourceJDBCWriter.scala │ │ ├── DatasourceJSONReaderMultiLine.scala │ │ ├── DatasourceJSONReaderSingleLine.scala │ │ ├── DatasourceMongodbReader.scala │ │ ├── DatasourceMongodbWriter.scala │ │ ├── DatasourceRedisReader.scala │ │ ├── DatasourceRedisWriter.scala │ │ ├── DatasourceTextfileReader.scala │ │ └── DatasourceTextfileWriter.scala ├── chap08 │ ├── python │ │ ├── page_rank │ │ │ ├── page_rank.py │ │ │ └── pagerank_2.py │ │ └── rank_product │ │ │ ├── rank_product_using_combinebykey.py │ │ │ ├── rank_product_using_combinebykey.sh │ │ │ ├── rank_product_using_groupbykey.py │ │ │ ├── rank_product_using_groupbykey.sh │ │ │ └── sample_input │ │ │ ├── rp1.txt │ │ │ ├── rp2.txt │ │ │ └── rp3.txt │ └── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── data │ │ ├── sample_input │ │ │ ├── rp1.txt │ │ │ ├── rp2.txt │ │ │ └── rp3.txt │ │ └── urls.txt │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── run_spark_applications_scripts │ │ ├── page_rank.sh │ │ ├── rank_product_using_combine_by_key.sh │ │ └── rank_product_using_group_by_key.sh │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ ├── resources │ │ └── log4j.properties │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch08 │ │ ├── PageRank.scala │ │ ├── RankProductUsingCombineByKey.scala │ │ └── RankProductUsingGroupByKey.scala ├── chap09 │ ├── python │ │ └── README.md │ └── scala │ │ └── README.md ├── chap10 │ ├── README.md │ ├── data_design_patterns.md │ ├── data_design_patterns.md.save │ ├── data_design_patterns.pdf │ ├── python │ │ ├── average_monoid_use_aggregatebykey.py │ │ ├── average_monoid_use_aggregatebykey.sh │ │ ├── 
average_monoid_use_combinebykey.py │ │ ├── average_monoid_use_combinebykey.sh │ │ ├── average_monoid_use_groupbykey.py │ │ ├── average_monoid_use_groupbykey.sh │ │ ├── average_monoid_use_reducebykey.py │ │ ├── average_monoid_use_reducebykey.sh │ │ ├── dna_base_count_basic_using_combinebykey.py │ │ ├── dna_base_count_basic_using_combinebykey.sh │ │ ├── dna_base_count_basic_using_groupbykey.py │ │ ├── dna_base_count_basic_using_groupbykey.sh │ │ ├── dna_base_count_basic_using_reducebykey.py │ │ ├── dna_base_count_basic_using_reducebykey.sh │ │ ├── dna_base_count_inmapper_combiner_using_combinebykey.py │ │ ├── dna_base_count_inmapper_combiner_using_combinebykey.sh │ │ ├── dna_base_count_inmapper_combiner_using_groupbykey.py │ │ ├── dna_base_count_inmapper_combiner_using_groupbykey.sh │ │ ├── dna_base_count_inmapper_combiner_using_reducebykey.py │ │ ├── dna_base_count_inmapper_combiner_using_reducebykey.sh │ │ ├── dna_base_count_using_mappartitions.py │ │ ├── dna_base_count_using_mappartitions.sh │ │ ├── inmapper_combiner_local_aggregation.py │ │ ├── inmapper_combiner_local_aggregation.sh │ │ ├── inmapper_combiner_use_basic_mapreduce.py │ │ ├── inmapper_combiner_use_basic_mapreduce.sh │ │ ├── inmapper_combiner_use_mappartitions.py │ │ ├── inmapper_combiner_use_mappartitions.sh │ │ ├── minmax_force_empty_partitions.py │ │ ├── minmax_force_empty_partitions.sh │ │ ├── minmax_use_mappartitions.py │ │ ├── minmax_use_mappartitions.sh │ │ ├── minmax_use_mappartitions_v2.py │ │ ├── minmax_use_mappartitions_v2.sh │ │ ├── sample_dna_seq.txt │ │ ├── sample_input.txt │ │ ├── sample_numbers.txt │ │ ├── structured_to_hierarchical_to_xml_dataframe.py │ │ ├── structured_to_hierarchical_to_xml_rdd.py │ │ ├── top_N_use_mappartitions.py │ │ ├── top_N_use_mappartitions.sh │ │ ├── top_N_use_takeordered.py │ │ └── top_N_use_takeordered.sh │ └── scala │ │ ├── .gitignore │ │ ├── README.md │ │ ├── build.gradle │ │ ├── data │ │ ├── sample_dna_seq.txt │ │ ├── sample_input.txt │ │ └── sample_numbers.txt │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── out │ │ └── production │ │ │ └── resources │ │ │ └── log4j.properties │ │ ├── run.sh │ │ ├── run_spark_applications_scripts │ │ ├── average_monoid_use_aggregate_by_key.sh │ │ ├── average_monoid_use_combine_by_key.sh │ │ ├── average_monoid_use_group_by_key.sh │ │ ├── average_monoid_use_reduce_by_key.sh │ │ ├── dna_base_count_basic_in_mapper_combiner_using_combine_by_key.sh │ │ ├── dna_base_count_basic_in_mapper_combiner_using_group_by_key.sh │ │ ├── dna_base_count_basic_in_mapper_combiner_using_reduce_by_key.sh │ │ ├── dna_base_count_basic_using_combine_by_key.sh │ │ ├── dna_base_count_basic_using_group_by_key.sh │ │ ├── dna_base_count_basic_using_mappartitions.sh │ │ ├── dna_base_count_basic_using_reduce_by_key.sh │ │ ├── in_mapper_combiner_use_mappartitions.sh │ │ ├── in_mapper_combiner_using_local_aggregation.sh │ │ ├── in_mapper_combiner_using_map_reduce.sh │ │ ├── min_max_force_empty_partitions.sh │ │ ├── min_max_use_mappartitions.sh │ │ ├── structured_to_hierarchical_to_xml_dataframe.sh │ │ ├── structured_to_hierarchical_to_xml_rdd.sh │ │ ├── top_n_use_map_partitions.sh │ │ └── top_n_use_take_ordered.sh │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ ├── resources │ │ └── log4j.properties │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch10 │ │ ├── AverageMonoidUseAggregateByKey.scala │ │ ├── AverageMonoidUseCombineByKey.scala │ │ ├── 
AverageMonoidUseGroupByKey.scala │ │ ├── AverageMonoidUseReduceByKey.scala │ │ ├── DNABaseCountBasicInMapperCombinerUsingCombineByKey.scala │ │ ├── DNABaseCountBasicInMapperCombinerUsingGroupByKey.scala │ │ ├── DNABaseCountBasicInMapperCombinerUsingReduceByKey.scala │ │ ├── DNABaseCountBasicUsingCombineByKey.scala │ │ ├── DNABaseCountBasicUsingGroupByKey.scala │ │ ├── DNABaseCountBasicUsingMappartitions.scala │ │ ├── DNABaseCountBasicUsingReduceByKey.scala │ │ ├── InMapperCombinerUseMappartitions.scala │ │ ├── InMapperCombinerUsingLocalAggregation.scala │ │ ├── InMapperCombinerUsingMapReduce.scala │ │ ├── MinMaxForceEmptyPartitions.scala │ │ ├── MinMaxUseMappartitions.scala │ │ ├── StructuredToHierarchicalToXmlDataframe.scala │ │ ├── StructuredToHierarchicalToXmlRDD.scala │ │ ├── TopNUseMapPartitions.scala │ │ └── TopNUseTakeOrdered.scala ├── chap11 │ ├── python │ │ ├── README.md │ │ ├── inner_join_dataframe_spark.py │ │ ├── inner_join_in_mapreduce.py │ │ ├── inner_join_rdd_spark.py │ │ ├── left_join_dataframe_spark.py │ │ ├── left_join_in_mapreduce.py │ │ ├── left_join_rdd_spark.py │ │ ├── right_join_dataframe_spark.py │ │ ├── right_join_in_mapreduce.py │ │ └── right_join_rdd_spark.py │ └── scala │ │ └── README.md ├── chap12 │ ├── python │ │ └── README.md │ └── scala │ │ └── README.md └── jars │ ├── avro-mapred-1.7.7-hadoop1.jar │ ├── avro-mapred-1.7.7-hadoop2.jar │ ├── com-cotdp-hadoop-1.0-SNAPSHOT.jar │ ├── elasticsearch-hadoop-6.4.2.jar │ ├── elasticsearch-spark_2.11-2.4.5.jar │ ├── graphframes-0.6.0-spark2.3-s_2.11.jar │ ├── hbase-spark-connector-1.0.0.jar │ ├── htrace-core-3.1.0-incubating.jar │ ├── mongo-java-driver-3.8.2.jar │ ├── mongo-spark-connector_2.11-2.2.5.jar │ ├── mongodb-driver-3.8.2.jar │ ├── mysql-connector-java-5.1.42.jar │ ├── shc-core-1.1.3-2.3-s_2.11.jar │ ├── shc-examples-1.1.3-2.3-s_2.11.jar │ ├── spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar │ └── spark-redis-2.3.1-SNAPSHOT.jar ├── data ├── chap02 │ ├── NC_000907.1.fasta │ ├── README.md │ ├── human_g1k_v37_chr1_59kb.fasta │ ├── sample.fasta │ └── sp1.fastq └── chap06 │ └── flightdata2018.json ├── docs ├── FOREWORD_by_Dr_Matei_Zaharia.md ├── goal_of_book.md └── story_of_book.md ├── images ├── Data-Algorithms-with-Spark_mech2.pdf ├── Data-Algorithms-with-Spark_mech2.png ├── Data_Algorithms_with_Spark_COVER_9781492082385.jpg ├── Data_Algorithms_with_Spark_COVER_9781492082385.png ├── FOREWORD_by_Dr_Matei_Zaharia.md ├── anagram.png ├── book_cover_final.pdf ├── correlation-coefficient.png ├── data-alg-foreword2.docx ├── data-alg-foreword2.pdf ├── data_algorithms_hard_copy_image.jpg ├── data_algorithms_with_spark.jpg ├── data_algorithms_with_spark.pdf ├── data_algorithms_with_spark_amazon.jpg ├── data_algorithms_with_spark_knowledge_is_power.jpeg ├── data_algorithms_with_spark_small.jpeg ├── joins-in-SQL.jpeg ├── kmer.jpg ├── kmer_4.png ├── mappartitions_image_1.drawio.png ├── mappartitions_image_2.drawio.png └── sql_joins.png └── wiki-spark ├── README.md └── docs ├── dataframe_to_rdd.md ├── duplicate_removal_dataframe.md ├── duplicate_removal_rdd.md ├── explode_function.md ├── flatmap_transformation.md ├── how-to-use-UDF-in-spark.md ├── lambda_expressions.adoc ├── lambda_expressions.pdf ├── lambda_expressions_basics.md ├── monoid ├── README.md ├── monoid.md └── monoid_math.png ├── rdd_to_dataframe.md ├── reduce-the-verbosity-of-spark-runtime-output.md ├── spark-explode.png ├── spark-flatmap.png ├── using-graphframes-with-jupyter.demo.png ├── using-graphframes-with-jupyter.md └── wiki.jpeg /.gitignore: 
-------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea/ 3 | build 4 | .gradle 5 | .idea 6 | !gradle-wrapper.jar -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/python/README.md: -------------------------------------------------------------------------------- 1 | # TF-IDF 2 | 3 | TF-IDF (term frequency-inverse document frequency) is a 4 | statistical measure that evaluates how relevant a word is 5 | to a document in a collection of documents. 6 | This is done by multiplying two metrics: how many times a 7 | word appears in a document, and the inverse document 8 | frequency of the word across a set of documents. 9 | 10 | # References 11 | 12 | 1. [Introduction to TF-IDF](https://github.com/mahmoudparsian/machine-learning-course/blob/master/docs/TF-IDF/README.md) 13 | 14 | 2. [TF(Term Frequency)-IDF(Inverse Document Frequency) from scratch in python](https://towardsdatascience.com/tf-term-frequency-idf-inverse-document-frequency-from-scratch-in-python-6c2b61b78558) 15 | 16 | 3. [Understanding TF-ID: A Simple Introduction](https://monkeylearn.com/blog/what-is-tf-idf/) 17 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/python/data/doc1: -------------------------------------------------------------------------------- 1 | a crazy fox jumped over red fox and jumped over fox 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/python/data/doc2: -------------------------------------------------------------------------------- 1 | dogs are the best friend of red fox and I like dogs 2 | dogs are good 3 | 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/python/data/doc3: -------------------------------------------------------------------------------- 1 | I do not like cleaning and cooking but I like dogs 2 | I like playing tennis 3 | 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/python/data/doc4: -------------------------------------------------------------------------------- 1 | computer science is great 2 | fox jumped 3 | 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.bonus_chapter' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/TF-IDF/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-bonus_chapter_tfidf' 2 | 3 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/src/main/data/doc1: -------------------------------------------------------------------------------- 1 | a crazy fox jumped over red fox and jumped over fox 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/src/main/data/doc2: -------------------------------------------------------------------------------- 1 | dogs are the best friend of red fox and I like dogs 2 | dogs are good 3 | 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/src/main/data/doc3: -------------------------------------------------------------------------------- 1 | I do not like cleaning and cooking but I like dogs 2 | I like playing tennis 3 | 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/src/main/data/doc4: -------------------------------------------------------------------------------- 1 | computer science is great 2 | fox jumped 3 | 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/python/Top_N_movies_Dataframe_using_API.sh: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------- 2 | begin=`/bin/date` 3 | echo "begin=${begin}" 4 | START_TIME=$(date +%s) 5 | #command block that takes time to complete... 6 | #--------------------------------------------------------- 7 | # 8 | export SPARK_HOME="/Users/mparsian/spark-3.3.0" 9 | export PROG="Top_N_movies_Dataframe_using_API.py" 10 | export INPUT_PATH="/Users/mparsian/Downloads/ml-25m" 11 | export ratings="${INPUT_PATH}/ratings.csv" 12 | export movies="${INPUT_PATH}/movies.csv" 13 | export rating_threshold="0" 14 | # 15 | # 16 | # for top 10 17 | export N=10 18 | # 19 | # sys.argv[x] 1 2 3 4 20 | $SPARK_HOME/bin/spark-submit ${PROG} ${N} ${ratings} ${movies} ${rating_threshold} 21 | # 22 | #--------------------------------------------------------- 23 | end=`/bin/date` 24 | echo "end=${end}" 25 | END_TIME=$(date +%s) 26 | echo "elapsed time: $((${END_TIME} - ${START_TIME})) seconds to complete this task." 
27 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/python/Top_N_movies_Dataframe_using_SQL.sh: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------- 2 | begin=`/bin/date` 3 | echo "begin=${begin}" 4 | START_TIME=$(date +%s) 5 | #command block that takes time to complete... 6 | #--------------------------------------------------------- 7 | # 8 | export SPARK_HOME="/Users/mparsian/spark-3.3.0" 9 | export PROG="Top_N_movies_Dataframe_using_SQL.py" 10 | export INPUT_PATH="/Users/mparsian/Downloads/ml-25m" 11 | export ratings="${INPUT_PATH}/ratings.csv" 12 | export movies="${INPUT_PATH}/movies.csv" 13 | export rating_threshold="0" 14 | # 15 | # 16 | # for top 10 17 | export N=10 18 | # 19 | # sys.argv[x] 1 2 3 4 20 | $SPARK_HOME/bin/spark-submit ${PROG} ${N} ${ratings} ${movies} ${rating_threshold} 21 | # 22 | #--------------------------------------------------------- 23 | end=`/bin/date` 24 | echo "end=${end}" 25 | END_TIME=$(date +%s) 26 | echo "elapsed time: $((${END_TIME} - ${START_TIME})) seconds to complete this task." 27 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/python/Top_N_movies_RDD_using_combineByKey.sh: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------- 2 | begin=`/bin/date` 3 | echo "begin=${begin}" 4 | START_TIME=$(date +%s) 5 | #command block that takes time to complete... 6 | #--------------------------------------------------------- 7 | # 8 | export SPARK_HOME="/Users/mparsian/spark-3.3.0" 9 | export PROG="Top_N_movies_RDD_using_combineByKey.py" 10 | export INPUT_PATH="/Users/mparsian/Downloads/ml-25m" 11 | export ratings="${INPUT_PATH}/ratings.csv" 12 | export movies="${INPUT_PATH}/movies.csv" 13 | export rating_threshold="0" 14 | # 15 | # for top 10 16 | export N=10 17 | # 18 | # sys.argv[x] 1 2 3 4 19 | $SPARK_HOME/bin/spark-submit ${PROG} ${N} ${ratings} ${movies} ${rating_threshold} 20 | # 21 | #--------------------------------------------------------- 22 | end=`/bin/date` 23 | echo "end=${end}" 24 | END_TIME=$(date +%s) 25 | echo "elapsed time: $((${END_TIME} - ${START_TIME})) seconds to complete this task." 26 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/python/Top_N_movies_RDD_using_groupByKey.sh: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------- 2 | begin=`/bin/date` 3 | echo "begin=${begin}" 4 | START_TIME=$(date +%s) 5 | #command block that takes time to complete... 
6 | #--------------------------------------------------------- 7 | # 8 | export SPARK_HOME="/Users/mparsian/spark-3.3.0" 9 | export PROG="Top_N_movies_RDD_using_groupByKey.py" 10 | export INPUT_PATH="/Users/mparsian/Downloads/ml-25m" 11 | export ratings="${INPUT_PATH}/ratings.csv" 12 | export movies="${INPUT_PATH}/movies.csv" 13 | export rating_threshold="0" 14 | # 15 | # for top 10 16 | export N=10 17 | # 18 | # sys.argv[x] 1 2 3 4 19 | $SPARK_HOME/bin/spark-submit ${PROG} ${N} ${ratings} ${movies} ${rating_threshold} 20 | # 21 | #--------------------------------------------------------- 22 | end=`/bin/date` 23 | echo "end=${end}" 24 | END_TIME=$(date +%s) 25 | echo "elapsed time: $((${END_TIME} - ${START_TIME})) seconds to complete this task." 26 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/python/Top_N_movies_RDD_using_reduceByKey.sh: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------- 2 | begin=`/bin/date` 3 | echo "begin=${begin}" 4 | START_TIME=$(date +%s) 5 | #command block that takes time to complete... 6 | #--------------------------------------------------------- 7 | # 8 | export SPARK_HOME="/Users/mparsian/spark-3.3.0" 9 | export PROG="Top_N_movies_RDD_using_reduceByKey.py" 10 | export INPUT_PATH="/Users/mparsian/Downloads/ml-25m" 11 | export ratings="${INPUT_PATH}/ratings.csv" 12 | export movies="${INPUT_PATH}/movies.csv" 13 | export rating_threshold="0" 14 | # 15 | # for top 10 16 | export N=10 17 | # 18 | # sys.argv[x] 1 2 3 4 19 | $SPARK_HOME/bin/spark-submit ${PROG} ${N} ${ratings} ${movies} ${rating_threshold} 20 | # 21 | #--------------------------------------------------------- 22 | end=`/bin/date` 23 | echo "end=${end}" 24 | END_TIME=$(date +%s) 25 | echo "elapsed time: $((${END_TIME} - ${START_TIME})) seconds to complete this task." 26 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/python/Top_N_movies_RDD_using_takeOrdered.sh: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------- 2 | begin=`/bin/date` 3 | echo "begin=${begin}" 4 | START_TIME=$(date +%s) 5 | #command block that takes time to complete... 6 | #--------------------------------------------------------- 7 | # 8 | export SPARK_HOME="/Users/mparsian/spark-3.3.0" 9 | export PROG="Top_N_movies_RDD_using_takeOrdered.py" 10 | export INPUT_PATH="/Users/mparsian/Downloads/ml-25m" 11 | export ratings="${INPUT_PATH}/ratings.csv" 12 | export movies="${INPUT_PATH}/movies.csv" 13 | export rating_threshold="0" 14 | # 15 | # for top 10 16 | export N=10 17 | # 18 | # sys.argv[x] 1 2 3 4 19 | $SPARK_HOME/bin/spark-submit ${PROG} ${N} ${ratings} ${movies} ${rating_threshold} 20 | # 21 | #--------------------------------------------------------- 22 | end=`/bin/date` 23 | echo "end=${end}" 24 | END_TIME=$(date +%s) 25 | echo "elapsed time: $((${END_TIME} - ${START_TIME})) seconds to complete this task." 
26 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/scala/README.md: -------------------------------------------------------------------------------- 1 | Scala solutions for Top-N 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/top-10.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/Top-N/top-10.jpeg -------------------------------------------------------------------------------- /code/bonus_chapters/UDF/UDF.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/UDF/UDF.pdf -------------------------------------------------------------------------------- /code/bonus_chapters/UDF/python/README.md: -------------------------------------------------------------------------------- 1 | Demo Spark's UDF (user-defined-function) 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/UDF/python/dataframe_UDF_example.log: -------------------------------------------------------------------------------- 1 | % export SPARK_HOME=/home/mparsian/spark-3.2.0 2 | % $SPARK_HOME/bin/spark-submit dataframe_UDF_example.py 3 | 4 | +---+------------+ 5 | |ID |Name | 6 | +---+------------+ 7 | |100|john jones | 8 | |200|tracey smith| 9 | |300|amy sanders | 10 | |400|null | 11 | +---+------------+ 12 | 13 | +---+------------+ 14 | |ID |Name | 15 | +---+------------+ 16 | |100|John Jones | 17 | |200|Tracey Smith| 18 | |300|Amy Sanders | 19 | |400|null | 20 | +---+------------+ 21 | 22 | +---+------------+------------+ 23 | |ID |Name |Upper Name | 24 | +---+------------+------------+ 25 | |100|john jones |JOHN JONES | 26 | |200|tracey smith|TRACEY SMITH| 27 | |300|amy sanders |AMY SANDERS | 28 | |400|null |null | 29 | +---+------------+------------+ 30 | 31 | +---+------------+ 32 | |ID |Name | 33 | +---+------------+ 34 | |100|John Jones | 35 | |200|Tracey Smith| 36 | |300|Amy Sanders | 37 | |400|null | 38 | +---+------------+ 39 | 40 | -------------------------------------------------------------------------------- /code/bonus_chapters/UDF/scala/README.md: -------------------------------------------------------------------------------- 1 | Demo Spark's UDF (user-defined-function) 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/python/sample_document.txt: -------------------------------------------------------------------------------- 1 | fox jumped bowel bowel bowel elbow below bare bear 2 | fox jumped bore bore bore boer robe bears 3 | bears baser saber fox jumped and jumped over bear 4 | fox is silent and listen listen mars rams mars bears 5 | Mary and Elvis lives in Detroit army Easter Listen 6 | silent eaters Death Hated elvis Mary easter Silent 7 | Artist Elvis are in army Listen Silent detroit 8 | artist is here and strait and traits hated 9 | -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier 
= '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.bonus_chapter' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/scala/data/sample_document.txt: -------------------------------------------------------------------------------- 1 | fox jumped bowel bowel bowel elbow below bare bear 2 | fox jumped bore bore bore boer robe bears 3 | bears baser saber fox jumped and jumped over bear 4 | fox is silent and listen listen mars rams mars bears 5 | Mary and Elvis lives in Detroit army Easter Listen 6 | silent eaters Death Hated elvis Mary easter Silent 7 | Artist Elvis are in army Listen Silent detroit 8 | artist is here and strait and traits hated 9 | -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/anagrams/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/scala/run_spark_applications_scripts/anagrams_by_combine_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.AnagramsByCombineByKey "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/scala/run_spark_applications_scripts/anagrams_by_group_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.AnagramsByGroupByKey "--args=$INPUT_PATH" -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/scala/run_spark_applications_scripts/anagrams_by_reduce_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.AnagramsByReduceByKey "--args=$INPUT_PATH" -------------------------------------------------------------------------------- 
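The three run scripts above launch the Scala anagram programs (combineByKey, groupByKey, and reduceByKey variants) against data/sample_document.txt. A common way to group anagrams, and the idea suggested by these program names, is to key each word by its letters sorted alphabetically, so that words such as "listen" and "silent" collide on the same key, and then gather the words per key. Below is a minimal PySpark sketch of the reduceByKey flavor; it is illustrative only, not the repository's exact anagrams_by_reducebykey.py, and the comma-joined output format is an assumption.

```python
# Illustrative sketch of anagram grouping with reduceByKey (not the
# repository's exact code). Each word is keyed by its sorted letters.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("anagrams-sketch").getOrCreate()
sc = spark.sparkContext

input_path = "data/sample_document.txt"

anagrams = (
    sc.textFile(input_path)
      .flatMap(lambda line: line.lower().split())
      # "listen" and "silent" both map to the key "eilnst"
      .map(lambda word: ("".join(sorted(word)), word))
      # concatenate all words that share the same sorted-letter key
      .reduceByKey(lambda left, right: left + "," + right)
      # keep only keys with at least two distinct words (true anagram groups)
      .filter(lambda kv: len(set(kv[1].split(","))) > 1)
)

for key, words in anagrams.collect():
    print(key, "->", words)

spark.stop()
```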
/code/bonus_chapters/anagrams/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-bonus_chapter_anagrams' 2 | 3 | -------------------------------------------------------------------------------- /code/bonus_chapters/correlation/python/rdd_cartesian_in_action.txt: -------------------------------------------------------------------------------- 1 | How does Cartesian work? 2 | 3 | >>> mylist = [('g1', [1, 11]), ('g2', [2, 22]), ('g3', [3, 33])] 4 | >>> rdd = spark.sparkContext.parallelize(mylist) 5 | >>> rdd.collect() 6 | [('g1', [1, 11]), ('g2', [2, 22]), ('g3', [3, 33])] 7 | >>> cart = rdd.cartesian(rdd) 8 | >>> cart.mapValues(lambda v: list(v)).collect() 9 | [ 10 | (('g1', [1, 11]), ['g1', [1, 11]]), 11 | (('g1', [1, 11]), ['g2', [2, 22]]), 12 | (('g1', [1, 11]), ['g3', [3, 33]]), 13 | (('g2', [2, 22]), ['g1', [1, 11]]), 14 | (('g2', [2, 22]), ['g2', [2, 22]]), 15 | (('g2', [2, 22]), ['g3', [3, 33]]), 16 | (('g3', [3, 33]), ['g1', [1, 11]]), 17 | (('g3', [3, 33]), ['g2', [2, 22]]), 18 | (('g3', [3, 33]), ['g3', [3, 33]]) 19 | ] 20 | -------------------------------------------------------------------------------- /code/bonus_chapters/correlation/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.bonus_chapter' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/bonus_chapters/correlation/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/correlation/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/bonus_chapters/correlation/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/bonus_chapters/correlation/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-bonus_chapter_correlation' 2 | 3 | -------------------------------------------------------------------------------- /code/bonus_chapters/correlation/scala/src/main/scala/org/data/algorithms/spark/bonuschapter/AllVersusAllCorrelationDataframe.scala: -------------------------------------------------------------------------------- 1 | package org.data.algorithms.spark.bonuschapter 2 | 3 | object AllVersusAllCorrelationDataframe { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /code/bonus_chapters/dataframes/README.md: -------------------------------------------------------------------------------- 1 | ## Spark DataFrames Tutorial 2 | 3 | #### [1. DataFrames Tutorial: from Python Collections](./dataframe_tutorial_from_collection.py.md) 4 | 5 | #### [2. DataFrames Tutorial: from CSV Text Files](./dataframe_tutorial_from_text_files.py.md) 6 | 7 | #### [3. Arrays in DataFrames](./arrays_in_dataframes/) 8 | 9 | #### [4. 
Exploding Arrays in DataFrames](./explode_arrays_into_rows/) 10 | 11 | -------------------------------------------------------------------------------- /code/bonus_chapters/dataframes/arrays_in_dataframes/python/README.md: -------------------------------------------------------------------------------- 1 | How to use arrays in DataFrames 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/dataframes/arrays_in_dataframes/scala/README.md: -------------------------------------------------------------------------------- 1 | How to use arrays in DataFrames 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/dataframes/explode_arrays_into_rows/python/README.md: -------------------------------------------------------------------------------- 1 | Pyspark – Split multiple array columns into rows 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/dataframes/explode_arrays_into_rows/python/explode_arrays_into_rows.log: -------------------------------------------------------------------------------- 1 | /Users/mparsian/spark-3.2.1/bin/spark-submit explode_arrays_into_rows.py 2 | 3 | root 4 | |-- name: string (nullable = true) 5 | |-- age: string (nullable = true) 6 | |-- languages: array (nullable = true) 7 | | |-- element: string (containsNull = true) 8 | 9 | +-----+---+--------------------+ 10 | | name|age| languages| 11 | +-----+---+--------------------+ 12 | | Rafa| 20| [SQL, NoSQL]| 13 | | Alex| 21| [Ada, SQL, Java]| 14 | | Jane| 22|[Fortran, Cobol, ...| 15 | |Maria| 23| []| 16 | +-----+---+--------------------+ 17 | 18 | root 19 | |-- name: string (nullable = true) 20 | |-- age: string (nullable = true) 21 | |-- col: string (nullable = true) 22 | 23 | +----+---+-------+ 24 | |name|age| col| 25 | +----+---+-------+ 26 | |Rafa| 20| SQL| 27 | |Rafa| 20| NoSQL| 28 | |Alex| 21| Ada| 29 | |Alex| 21| SQL| 30 | |Alex| 21| Java| 31 | |Jane| 22|Fortran| 32 | |Jane| 22| Cobol| 33 | |Jane| 22| R| 34 | |Jane| 22| C++| 35 | +----+---+-------+ 36 | 37 | -------------------------------------------------------------------------------- /code/bonus_chapters/dataframes/explode_arrays_into_rows/scala/README.md: -------------------------------------------------------------------------------- 1 | Pyspark – Split multiple array columns into rows 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/join/README.md: -------------------------------------------------------------------------------- 1 | # Join operation 2 | 3 | ------ 4 | 5 | * In a nutshell, a join is an SQL operation performed 6 | to establish a connection between two (or more) database 7 | tables based on matching columns, thereby creating a 8 | relationship between the tables. 9 | 10 | * Types of joins 11 | * Cross join. A cross join returns all possible combinations 12 | of rows of two tables (also called a Cartesian product). 13 | 14 | * Join/inner join. An inner join, also known as a simple join, 15 | returns rows from joined tables that have matching rows. 16 | 17 | * Left outer join/left join. 18 | 19 | * Right outer join/right join. 20 | 21 | * Full outer join. 
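To make the join types listed above concrete, here is a minimal PySpark sketch; the `emps` and `depts` DataFrames and their columns are made up purely for illustration (the repository's own worked examples live under `join/python/`).

```python
# Hypothetical DataFrames used only to illustrate the join types listed above.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("join-types-sketch").getOrCreate()

emps = spark.createDataFrame(
    [("alex", 10), ("jane", 20), ("rafa", 40)], ["name", "dept_id"])
depts = spark.createDataFrame(
    [(10, "sales"), (20, "hr"), (30, "it")], ["dept_id", "dept_name"])

emps.crossJoin(depts).show()                 # cross join: every emps row paired with every depts row
emps.join(depts, "dept_id", "inner").show()  # inner join: only matching dept_id values
emps.join(depts, "dept_id", "left").show()   # left outer join: all emps rows, nulls where no match
emps.join(depts, "dept_id", "right").show()  # right outer join: all depts rows, nulls where no match
emps.join(depts, "dept_id", "full").show()   # full outer join: all rows from both sides

spark.stop()
```

Left, right, and full outer joins keep the unmatched rows from the left side, the right side, or both sides respectively, filling the missing columns with nulls.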
22 | 23 | ----------- 24 | 25 | ![joins](../../../images/sql_joins.png) 26 | 27 | -------------------------------------------------------------------------------- /code/bonus_chapters/join/python/README.md: -------------------------------------------------------------------------------- 1 | # Join in Spark 2 | 3 | A JOIN clause is used to combine rows from two tables 4 | (expressed as RDDs or DataFrames), based on a related 5 | column between them. 6 | 7 | ![joins](../../../../images/joins-in-SQL.jpeg) 8 | -------------------------------------------------------------------------------- /code/bonus_chapters/join/python/rdd_join_inner.log: -------------------------------------------------------------------------------- 1 | % cat /tmp/A.txt 2 | k1,v1 3 | k1,v2 4 | k2,v3 5 | k2,v4 6 | k2,v5 7 | k3,v6 8 | k3,v7 9 | k4,v8 10 | 11 | % cat /tmp/B.txt 12 | k1,t1 13 | k1,t2 14 | k1,t3 15 | k2,t4 16 | k2,t5 17 | k5,t6 18 | k6,t7 19 | 20 | % A="/tmp/A.txt" 21 | % B="/tmp/B.txt" 22 | % ~/spark-3.2.1/bin/spark-submit rdd_join_inner.py $A $B 23 | 24 | rdd_A= 25 | [ 26 | ('k1', 'v1'), 27 | ('k1', 'v2'), 28 | ('k2', 'v3'), 29 | ('k2', 'v4'), 30 | ('k2', 'v5'), 31 | ('k3', 'v6'), 32 | ('k3', 'v7'), 33 | ('k4', 'v8') 34 | ] 35 | 36 | rdd_B= 37 | [ 38 | ('k1', 't1'), 39 | ('k1', 't2'), 40 | ('k1', 't3'), 41 | ('k2', 't4'), 42 | ('k2', 't5'), 43 | ('k5', 't6'), 44 | ('k6', 't7') 45 | ] 46 | 47 | A_joined_B= 48 | [ 49 | ('k1', ('v1', 't1')), 50 | ('k1', ('v1', 't2')), 51 | ('k1', ('v1', 't3')), 52 | ('k1', ('v2', 't1')), 53 | ('k1', ('v2', 't2')), 54 | ('k1', ('v2', 't3')), 55 | ('k2', ('v3', 't4')), 56 | ('k2', ('v3', 't5')), 57 | ('k2', ('v4', 't4')), 58 | ('k2', ('v4', 't5')), 59 | ('k2', ('v5', 't4')), 60 | ('k2', ('v5', 't5')) 61 | ] 62 | -------------------------------------------------------------------------------- /code/bonus_chapters/join/python/rdd_join_left.log: -------------------------------------------------------------------------------- 1 | % cat /tmp/A.txt 2 | k1,v1 3 | k1,v2 4 | k2,v3 5 | k2,v4 6 | k2,v5 7 | k3,v6 8 | k3,v7 9 | k4,v8 10 | 11 | % cat /tmp/B.txt 12 | k1,t1 13 | k1,t2 14 | k1,t3 15 | k2,t4 16 | k2,t5 17 | k5,t6 18 | k6,t7 19 | 20 | % A="/tmp/A.txt" 21 | % B="/tmp/B.txt" 22 | % ~/spark-3.2.1/bin/spark-submit rdd_join_left.py $A $B 23 | 24 | rdd_A= 25 | [ 26 | ('k1', 'v1'), 27 | ('k1', 'v2'), 28 | ('k2', 'v3'), 29 | ('k2', 'v4'), 30 | ('k2', 'v5'), 31 | ('k3', 'v6'), 32 | ('k3', 'v7'), 33 | ('k4', 'v8') 34 | ] 35 | 36 | rdd_B= 37 | [ 38 | ('k1', 't1'), 39 | ('k1', 't2'), 40 | ('k1', 't3'), 41 | ('k2', 't4'), 42 | ('k2', 't5'), 43 | ('k5', 't6'), 44 | ('k6', 't7') 45 | ] 46 | 47 | A_left_joined_B= 48 | [ 49 | ('k1', ('v1', 't1')), 50 | ('k1', ('v1', 't2')), 51 | ('k1', ('v1', 't3')), 52 | ('k1', ('v2', 't1')), 53 | ('k1', ('v2', 't2')), 54 | ('k1', ('v2', 't3')), 55 | ('k2', ('v3', 't4')), 56 | ('k2', ('v3', 't5')), 57 | ('k2', ('v4', 't4')), 58 | ('k2', ('v4', 't5')), 59 | ('k2', ('v5', 't4')), 60 | ('k2', ('v5', 't5')), 61 | ('k4', ('v8', None)), 62 | ('k3', ('v6', None)), 63 | ('k3', ('v7', None)) 64 | ] 65 | -------------------------------------------------------------------------------- /code/bonus_chapters/join/python/rdd_join_right.log: -------------------------------------------------------------------------------- 1 | % cat /tmp/A.txt 2 | k1,v1 3 | k1,v2 4 | k2,v3 5 | k2,v4 6 | k2,v5 7 | k3,v6 8 | k3,v7 9 | k4,v8 10 | 11 | % cat /tmp/B.txt 12 | k1,t1 13 | k1,t2 14 | k1,t3 15 | k2,t4 16 | k2,t5 17 | k5,t6 18 | k6,t7 19 | 20 | % A="/tmp/A.txt" 21 | % B="/tmp/B.txt" 22 | % 
~/spark-3.2.1/bin/spark-submit rdd_join_right.py $A $B 23 | 24 | rdd_A= 25 | [ 26 | ('k1', 'v1'), 27 | ('k1', 'v2'), 28 | ('k2', 'v3'), 29 | ('k2', 'v4'), 30 | ('k2', 'v5'), 31 | ('k3', 'v6'), 32 | ('k3', 'v7'), 33 | ('k4', 'v8') 34 | ] 35 | 36 | rdd_B= 37 | [ 38 | ('k1', 't1'), 39 | ('k1', 't2'), 40 | ('k1', 't3'), 41 | ('k2', 't4'), 42 | ('k2', 't5'), 43 | ('k5', 't6'), 44 | ('k6', 't7') 45 | ] 46 | 47 | A_right_joined_B= 48 | [ 49 | ('k1', ('v1', 't1')), 50 | ('k1', ('v1', 't2')), 51 | ('k1', ('v1', 't3')), 52 | ('k1', ('v2', 't1')), 53 | ('k1', ('v2', 't2')), 54 | ('k1', ('v2', 't3')), 55 | ('k2', ('v3', 't4')), 56 | ('k2', ('v3', 't5')), 57 | ('k2', ('v4', 't4')), 58 | ('k2', ('v4', 't5')), 59 | ('k2', ('v5', 't4')), 60 | ('k2', ('v5', 't5')), 61 | ('k5', (None, 't6')), 62 | ('k6', (None, 't7')) 63 | ] 64 | -------------------------------------------------------------------------------- /code/bonus_chapters/join/scala/README.md: -------------------------------------------------------------------------------- 1 | Join in Spark 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/python/sample_1.fasta: -------------------------------------------------------------------------------- 1 | >SEQUENCE_1 2 | GATTTGGGGCCCAAAGCAGTATCGATGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTTCAAATAGTGGATCCATTTGTTCAACTCACAGTTTGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTT 3 | >SEQUENCE_2 4 | GATTTGATTTGGGGCCCAAAGCAGTGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTTATCGATCAAATAGTGGATCGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTTCATTTGTTCAACTCACAGTTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTT 5 | -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/python/sample_1.fastq: -------------------------------------------------------------------------------- 1 | @SEQ_ID 2 | GATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTT 3 | + 4 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 5 | @SEQ_ID 6 | GATTTCCCGTTCAAAGCAGTATCGATCTTTTAGTAAATCCATTTGTTCAACTCACAGTTG 7 | + 8 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 9 | @SEQ_ID 10 | GACCCGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGCCT 11 | + 12 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 13 | @SEQ_ID 14 | TCATCATCATCCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGCCT 15 | + 16 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 17 | @SEQ_ID 18 | AGTAAGTAAGTAATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGCCTAGTAAGTA 19 | + 20 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 21 | -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.bonus_chapter' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/scala/data/sample_1.fasta: -------------------------------------------------------------------------------- 1 | >SEQUENCE_1 2 | GATTTGGGGCCCAAAGCAGTATCGATGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTTCAAATAGTGGATCCATTTGTTCAACTCACAGTTTGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTT 3 | >SEQUENCE_2 4 | GATTTGATTTGGGGCCCAAAGCAGTGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTTATCGATCAAATAGTGGATCGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTTCATTTGTTCAACTCACAGTTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTT 5 | -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/scala/data/sample_1.fastq: -------------------------------------------------------------------------------- 1 | @SEQ_ID 2 | GATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTT 3 | + 4 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 5 | @SEQ_ID 6 | GATTTCCCGTTCAAAGCAGTATCGATCTTTTAGTAAATCCATTTGTTCAACTCACAGTTG 7 | + 8 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 9 | @SEQ_ID 10 | GACCCGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGCCT 11 | + 12 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 13 | @SEQ_ID 14 | TCATCATCATCCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGCCT 15 | + 16 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 17 | @SEQ_ID 18 | AGTAAGTAAGTAATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGCCTAGTAAGTA 19 | + 20 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 21 | -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/k-mers/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/scala/run_spark_applications_scripts/kmer_fast_q.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_1.fastq" 3 | K=4 4 | N=3 5 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.KMERFastQ "--args=$INPUT_PATH $K $N" -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/scala/run_spark_applications_scripts/kmer_fasta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_1.fasta" 3 | K=4 4 | N=3 5 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.KMERFasta "--args=$INPUT_PATH $K $N" 6 | -------------------------------------------------------------------------------- 
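The k-mers run scripts above pass three arguments to the Scala programs: an input path, a k-mer length K, and a top-N count N (e.g. K=4, N=3). For orientation, a hedged PySpark sketch of the same idea is shown below; it is not the repository's kmer_fasta.py or KMERFasta.scala, and because it works line by line it ignores k-mers that span wrapped FASTA lines.

```python
# Hedged sketch: count k-mers of length K in a FASTA file and print the N
# most frequent ones. FASTA handling is deliberately simplified.
import sys
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

input_path, K, N = sys.argv[1], int(sys.argv[2]), int(sys.argv[3])

top_n = (
    spark.sparkContext.textFile(input_path)
         # skip FASTA description lines such as ">SEQUENCE_1"
         .filter(lambda line: not line.startswith(">"))
         .map(lambda line: line.strip().upper())
         # emit every substring of length K as a candidate k-mer
         .flatMap(lambda seq: [seq[i:i + K] for i in range(len(seq) - K + 1)])
         .map(lambda kmer: (kmer, 1))
         .reduceByKey(lambda a, b: a + b)
         # keep the N most frequent k-mers
         .takeOrdered(N, key=lambda pair: -pair[1])
)
print(top_n)

spark.stop()
```
--------------------------------------------------------------------------------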
/code/bonus_chapters/k-mers/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-bonus_chapter_k-mers' 2 | 3 | -------------------------------------------------------------------------------- /code/bonus_chapters/lambda_expressions/Lambda_Expressions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/lambda_expressions/Lambda_Expressions.pdf -------------------------------------------------------------------------------- /code/bonus_chapters/lambda_expressions/README.md: -------------------------------------------------------------------------------- 1 | # Lambda Functions/Expressions 2 | 3 | ### [Lambda functions basics](./Lambda_Expressions_basics.md) 4 | 5 | ### [Lambda functions tutorial](./Lambda_Expressions.pdf) 6 | -------------------------------------------------------------------------------- /code/bonus_chapters/mappartitions/SAMPLE_INPUT_FILES/file1.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | -2 5 | 4 6 | 5 7 | 6 8 | -1 9 | 8 10 | 9 11 | 3 12 | 2 13 | -------------------------------------------------------------------------------- /code/bonus_chapters/mappartitions/SAMPLE_INPUT_FILES/file2.txt: -------------------------------------------------------------------------------- 1 | 5 2 | 6 3 | 7 4 | 8 5 | -1 6 | -2 7 | -3 8 | 3 9 | 4 10 | 5 11 | 6 12 | 33 13 | 3 14 | -------------------------------------------------------------------------------- /code/bonus_chapters/mappartitions/SAMPLE_INPUT_FILES/file3.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | -------------------------------------------------------------------------------- /code/bonus_chapters/mappartitions/images/mappartitions_image_1.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/mappartitions/images/mappartitions_image_1.drawio.png -------------------------------------------------------------------------------- /code/bonus_chapters/mappartitions/images/mappartitions_image_2.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/mappartitions/images/mappartitions_image_2.drawio.png -------------------------------------------------------------------------------- /code/bonus_chapters/mappartitions/python/find_min_max_by_mappartitions.sh: -------------------------------------------------------------------------------- 1 | # set SPARK_HOME 2 | SPARK_HOME="/Users/mparsian/spark-3.2.0" 3 | 4 | # define your input path 5 | INPUT_PATH="/book/code/bonus_chapters/mappartitions/SAMPLE_INPUT_FILES/" 6 | 7 | # define your PySpark program 8 | PROG="/book/code/bonus_chapters/mappartitions/python/find_min_max_by_mappartitions.py" 9 | 10 | # run your program 11 | $SPARK_HOME/bin/spark-submit ${PROG} ${INPUT_PATH} 12 | 13 | -------------------------------------------------------------------------------- /code/bonus_chapters/mappartitions/scala/README.md: 
-------------------------------------------------------------------------------- 1 | An example of mapPartitions() is given to find (count, minimum, maximum) 2 | over all of the given numbers. The input is a directory, which may contain 3 | any number of text files; each input file may have any number of records, 4 | and each record holds a single integer. 5 | -------------------------------------------------------------------------------- /code/bonus_chapters/physical_partitioning/data_partitioning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/physical_partitioning/data_partitioning.png -------------------------------------------------------------------------------- /code/bonus_chapters/physical_partitioning/partition_by_one_column.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.sql import SparkSession 3 | 4 | # create a SparkSession object 5 | spark = SparkSession.builder.getOrCreate() 6 | 7 | 8 | # define input path 9 | # input_path= 's3://mybucket/INPUT2/continents_countries_temp.csv' 10 | input_path = sys.argv[1] 11 | 12 | # read data and create a DataFrame 13 | df = spark.read.format("csv")\ 14 | .option("header","true")\ 15 | .option("inferSchema", "true")\ 16 | .load(input_path) 17 | 18 | df.show(10, truncate=False) 19 | df.printSchema() 20 | 21 | # define output path 22 | # output_path = "s3://mybucket/SCU/OUTPUT2/continents_countries1/" 23 | output_path = sys.argv[2] 24 | 25 | # partition the DataFrame by the "continent" column and save it to the output path 26 | df.repartition("continent")\ 27 | .write.mode("append")\ 28 | .partitionBy("continent")\ 29 | .parquet(output_path) 30 | 31 | # done! 
32 | spark.stop() 33 | -------------------------------------------------------------------------------- /code/bonus_chapters/physical_partitioning/partition_by_one_column_schema.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE `continents`( 2 | `country` string, 3 | `city` string, 4 | `temperature` integer 5 | ) 6 | PARTITIONED BY ( 7 | `continent` string 8 | ) 9 | STORED AS PARQUET 10 | LOCATION 's3://mybucket/SCU/OUTPUT2/continents_countries1/' 11 | tblproperties ("parquet.compress"="SNAPPY"); 12 | -------------------------------------------------------------------------------- /code/bonus_chapters/physical_partitioning/partition_by_two_columns.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.sql import SparkSession 3 | 4 | # create a SparkSession object 5 | spark = SparkSession.builder.getOrCreate() 6 | 7 | # define input path 8 | # input_path= 's3://mybucket/INPUT2/continents_countries_temp.csv' 9 | input_path = sys.argv[1] 10 | 11 | # read data and create a DataFrame 12 | df = spark.read.format("csv")\ 13 | .option("header","true")\ 14 | .option("inferSchema", "true")\ 15 | .load(input_path) 16 | 17 | df.show(10, truncate=False) 18 | df.printSchema() 19 | 20 | # define output path 21 | # output_path = "s3://mybucket/SCU/OUTPUT2/continents_countries2/" 22 | output_path = sys.argv[2] 23 | 24 | # partiton DataFrame by the "continent" and "country" columns 25 | # and save it to the output path 26 | df.repartition("continent", "country")\ 27 | .write.mode("append")\ 28 | .partitionBy("continent", "country")\ 29 | .parquet(output_path) 30 | 31 | spark.stop() 32 | -------------------------------------------------------------------------------- /code/bonus_chapters/physical_partitioning/partition_by_two_columns_schema.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE `continents_2`( 2 | `city` string, 3 | `temperature` integer 4 | ) 5 | PARTITIONED BY ( 6 | `continent` string, 7 | `country` string 8 | ) 9 | STORED AS PARQUET 10 | LOCATION 's3://mybucket/SCU/OUTPUT2/continents_countries2/' 11 | tblproperties ("parquet.compress"="SNAPPY"); 12 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/data/foxdata.txt: -------------------------------------------------------------------------------- 1 | a red fox jumped of high 2 | fox jumped over a high fence 3 | red of fox jumped 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/data/sample_document.txt: -------------------------------------------------------------------------------- 1 | a crazy fox jumped and jumped 2 | red fox jumped 3 | gray fox jumped and jumped of the fence 4 | red fox jumped of the fence 5 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/word_count_by_dataframe_shorthand.log: -------------------------------------------------------------------------------- 1 | % export SPARK_HOME= 2 | % $SPARK_HOME/bin/spark-submit word_count_by_dataframe_shorthand.py data/foxdata.txt 3 | 4 | input_path= data/foxdata.txt 5 | 6 | final_word_count: 7 | +------+-----+ 8 | |word |count| 9 | +------+-----+ 10 | |jumped|3 | 11 | |fox |3 | 12 | |red |2 | 13 | |high |2 | 14 | +------+-----+ 15 | 16 | root 17 | |-- word: string (nullable = false) 18 | |-- 
count: long (nullable = false) 19 | 20 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/word_count_by_dataframe_shorthand.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | import pyspark.sql.functions as F 3 | import sys 4 | 5 | #------------------------------------------------------ 6 | # Word Count (shorthand notation) using Spark Dataframe 7 | #------------------------------------------------------ 8 | 9 | # create an instance of SparkSession object 10 | spark = SparkSession.builder.getOrCreate() 11 | 12 | # define your input 13 | input_path = sys.argv[1] 14 | print("input_path=", input_path) 15 | 16 | # read input and create a DataFrame(words: [String]) 17 | # the created df is a single-column table (column name: words) 18 | # where each row is an array of string objects 19 | final_word_count = spark.read\ 20 | .text(input_path)\ 21 | .select(F.split(F.col("value"), " ").alias("words"))\ 22 | .select(F.explode(F.col("words")).alias("word"))\ 23 | .select(F.lower(F.col("word")).alias("word"))\ 24 | .filter(F.length(F.col("word")) > 2)\ 25 | .groupby(F.col("word")).count()\ 26 | .where("count > 1") 27 | 28 | # for debugging purposes 29 | print("final_word_count:") 30 | final_word_count.show(10, truncate=False) 31 | final_word_count.printSchema() 32 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_combinebykey.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | #====================================== 5 | # 6 | # NOTE: print() and collect() are used 7 | # for debugging and educational purposes. 8 | # 9 | # @author Mahmoud Parsian 10 | # 11 | #====================================== 12 | 13 | # create an instance of a SparkSession as spark 14 | spark = SparkSession.builder.getOrCreate() 15 | print("spark.version=", spark.version) 16 | 17 | # set input path 18 | input_path = sys.argv[1] 19 | print("input_path=", input_path) 20 | 21 | # Note that the "combined data type" 22 | # for combineByKey() is an Integer. 23 | frequencies = spark.sparkContext.textFile(input_path)\ 24 | .flatMap(lambda line: line.split(" "))\ 25 | .map(lambda word: (word, 1))\ 26 | .combineByKey(\ 27 | lambda v: 1,\ 28 | lambda C, v: C+1,\ 29 | lambda C, D: C+D\ 30 | ) 31 | # 32 | print(frequencies.collect()) 33 | 34 | # done! 
35 | spark.stop() 36 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_combinebykey.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/home/spark-3.3.1" 3 | # 4 | # define your input path 5 | INPUT_PATH="${SPARK_HOME}/NOTICE" 6 | # 7 | # define your PySpark program 8 | PROG="wordcount_by_combinebykey.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit ${PROG} ${INPUT_PATH} 12 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_groupbykey.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | #====================================== 5 | # 6 | # NOTE: print() and collect() are used for debugging and educational purposes. 7 | # 8 | # @author Mahmoud Parsian 9 | # 10 | #====================================== 11 | def main(): 12 | 13 | # create an instance of a SparkSession as spark 14 | spark = SparkSession.builder.getOrCreate() 15 | 16 | # set input path 17 | input_path = sys.argv[1] 18 | print("input_path=", input_path) 19 | 20 | # create RDD from a text file 21 | records = spark.sparkContext.textFile(input_path) 22 | print(records.collect()) 23 | 24 | words = records.flatMap(lambda line: line.split(" ")) 25 | print(words.collect()) 26 | 27 | pairs = words.map(lambda word: (word, 1)) 28 | print(pairs.collect()) 29 | 30 | frequencies = pairs.groupByKey().mapValues(sum) 31 | print(frequencies.collect()) 32 | 33 | # done! 34 | spark.stop() 35 | #end-def 36 | #====================================== 37 | if __name__ == "__main__": 38 | main() -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_groupbykey.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/book/spark-3.2.0" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/NOTICE" 6 | # 7 | # define your PySpark program 8 | PROG="/book/code/wordcount/wordcount_by_groupbykey.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit ${PROG} ${INPUT_PATH} 12 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_groupbykey_shorthand.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | #====================================== 5 | # 6 | # NOTE: print() and collect() are used for debugging and educational purposes. 7 | # 8 | # @author Mahmoud Parsian 9 | # 10 | #====================================== 11 | def main(): 12 | 13 | # create an instance of a SparkSession as spark 14 | spark = SparkSession.builder.getOrCreate() 15 | 16 | # set input path 17 | input_path = sys.argv[1] 18 | print("input_path=", input_path) 19 | 20 | frequencies = spark.sparkContext.textFile(input_path)\ 21 | .flatMap(lambda line: line.split(" "))\ 22 | .map(lambda word: (word, 1))\ 23 | .groupByKey().mapValues(sum) 24 | # 25 | print(frequencies.collect()) 26 | 27 | # done! 
28 | spark.stop() 29 | #end-def 30 | #====================================== 31 | if __name__ == "__main__": 32 | main() -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_groupbykey_shorthand.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/book/spark-3.2.0" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/NOTICE" 6 | # 7 | # define your PySpark program 8 | PROG="/book/code/wordcount/wordcount_by_groupbykey_shorthand.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit ${PROG} ${INPUT_PATH} 12 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_reducebykey.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | #====================================== 5 | # 6 | # NOTE: print() and collect() are used for debugging and educational purposes. 7 | # 8 | # @author Mahmoud Parsian 9 | # 10 | #====================================== 11 | def main(): 12 | 13 | # create an instance of a SparkSession as spark 14 | spark = SparkSession.builder.getOrCreate() 15 | 16 | # set input path 17 | input_path = sys.argv[1] 18 | print("input_path=", input_path) 19 | 20 | # create RDD from a text file 21 | records = spark.sparkContext.textFile(input_path) 22 | print(records.collect()) 23 | 24 | words = records.flatMap(lambda line: line.split(" ")) 25 | print(words.collect()) 26 | 27 | pairs = words.map(lambda word: (word, 1)) 28 | print(pairs.collect()) 29 | 30 | frequencies = pairs.reduceByKey(lambda a, b: a + b) 31 | print(frequencies.collect()) 32 | 33 | # done! 34 | spark.stop() 35 | #end-def 36 | #====================================== 37 | if __name__ == "__main__": 38 | main() -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_reducebykey.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/book/spark-3.2.0" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/NOTICE" 6 | # 7 | # define your PySpark program 8 | PROG="/book/code/wordcount/wordcount_by_reducebykey.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit ${PROG} ${INPUT_PATH} 12 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_reducebykey_shorthand.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | #====================================== 5 | # 6 | # NOTE: print() and collect() are used for debugging and educational purposes. 
7 | # 8 | # @author Mahmoud Parsian 9 | # 10 | #====================================== 11 | def main(): 12 | 13 | # create an instance of a SparkSession as spark 14 | spark = SparkSession.builder.getOrCreate() 15 | 16 | # set input path 17 | input_path = sys.argv[1] 18 | print("input_path=", input_path) 19 | 20 | frequencies = spark.sparkContext.textFile(input_path)\ 21 | .flatMap(lambda line: line.split(" "))\ 22 | .map(lambda word: (word, 1))\ 23 | .reduceByKey(lambda a, b: a + b) 24 | # 25 | print(frequencies.collect()) 26 | 27 | # done! 28 | spark.stop() 29 | #end-def 30 | #====================================== 31 | if __name__ == "__main__": 32 | main() -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_reducebykey_shorthand.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/book/spark-3.2.0" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/NOTICE" 6 | # 7 | # define your PySpark program 8 | PROG="/book/code/wordcount/wordcount_by_reducebykey_shorthand.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit ${PROG} ${INPUT_PATH} 12 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_reducebykey_with_filter.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/book/spark-3.2.0" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/NOTICE" 6 | # 7 | # define your PySpark program 8 | PROG="/book/code/wordcount/wordcount_by_reducebykey_with_filter.py" 9 | 10 | # drop words whose length is less than 3 11 | WORD_LENGTH_THRESHOLD=3 12 | 13 | # drop words (after reduction) whose frequency is less than 2 14 | FREQUENCY_THRESHOLD=2 15 | 16 | # submit your spark application 17 | $SPARK_HOME/bin/spark-submit ${PROG} ${INPUT_PATH} ${WORD_LENGTH_THRESHOLD} ${FREQUENCY_THRESHOLD} 18 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/README.md: -------------------------------------------------------------------------------- 1 | ```` 2 | The purpose of this folder is to present 3 | multiple solutions to the classic word 4 | count problem. 5 | 6 | Solutions are provided using the reduceByKey() 7 | and groupByKey() reducers. In general, the 8 | reduceByKey() solution scales out better 9 | than the groupByKey() solution. 10 | 11 | 12 | best regards, 13 | Biman Mandal 14 | ```` 15 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.bonus_chapter' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/data/sample_document.txt: -------------------------------------------------------------------------------- 1 | a crazy fox jumped and jumped 2 | red fox jumped 3 | gray fox jumped and jumped of the fence 4 | red fox jumped of the fence 5 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/wordcount/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/run_spark_applications_scripts/word_count_by_group_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.WordCountByGroupByKey "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/run_spark_applications_scripts/word_count_by_group_by_key_shorthand.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.WordCountByGroupByKeyShorthand "--args=$INPUT_PATH" -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/run_spark_applications_scripts/word_count_by_reduce_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.WordCountByReduceByKey "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/run_spark_applications_scripts/word_count_by_reduce_by_key_shorthand.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.WordCountByReduceByKeyShorthand "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/run_spark_applications_scripts/word_count_by_reduce_by_key_with_filter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | # drop words if its length are less than 3 4 | WORD_LENGTH_THRESHOLD=3 5 | # drop words (after reduction) if its frequency 
is less than 2 6 | FREQUENCY_THRESHOLD=2 7 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.WordCountByReduceByKeyWithFilter "--args=$INPUT_PATH $WORD_LENGTH_THRESHOLD $FREQUENCY_THRESHOLD" -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-bonus_chapter_wordcount' 2 | 3 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/src/main/scala/org/data/algorithms/spark/bonuschapter/WordCountByGroupByKeyShorthand.scala: -------------------------------------------------------------------------------- 1 | package org.data.algorithms.spark.bonuschapter 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | *---------------------------------------- 7 | * NOTE: print() and collect() are used for debugging and educational purposes. 8 | * 9 | * @author Biman Mandal 10 | *---------------------------------------- 11 | */ 12 | object WordCountByGroupByKeyShorthand { 13 | def main(args: Array[String]): Unit = { 14 | // create an instance of a SparkSession as spark 15 | val spark = SparkSession.builder.master("local[*]").getOrCreate() 16 | 17 | // set input path 18 | val inputPath = args(0) 19 | println("inputPath=" + inputPath) 20 | 21 | val frequencies = spark.sparkContext.textFile(inputPath) 22 | .flatMap(line => line.split(" ")) 23 | .map(word => (word, 1)) 24 | .groupByKey() 25 | .map(x => (x._1, x._2.toList.sum)) 26 | 27 | println(frequencies.collect().mkString("Array(", ", ", ")")) 28 | 29 | // done! 30 | spark.stop() 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/src/main/scala/org/data/algorithms/spark/bonuschapter/WordCountByReduceByKeyShorthand.scala: -------------------------------------------------------------------------------- 1 | package org.data.algorithms.spark.bonuschapter 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | *---------------------------------------- 7 | * NOTE: print() and collect() are used for debugging and educational purposes. 8 | * 9 | * @author Biman Mandal 10 | *---------------------------------------- 11 | */ 12 | object WordCountByReduceByKeyShorthand { 13 | def main(args: Array[String]): Unit = { 14 | // create an instance of a SparkSession as spark 15 | val spark = SparkSession.builder.master("local[*]").getOrCreate() 16 | 17 | // set input path 18 | val inputPath = args(0) 19 | println("inputPath=" + inputPath) 20 | 21 | val frequencies = spark.sparkContext.textFile(inputPath) 22 | .flatMap(line => line.split(" ")) 23 | .map(word => (word, 1)) 24 | .reduceByKey((x, y) => x + y) 25 | 26 | println(frequencies.collect().mkString("Array(", ", ", ")")) 27 | 28 | // done! 
29 | spark.stop() 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/word_count_with_mapreduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/wordcount/word_count_with_mapreduce.png -------------------------------------------------------------------------------- /code/chap01/data/emps.txt: -------------------------------------------------------------------------------- 1 | 1000,alex,67000 2 | 1001,bob,24000 3 | 1002,jane,69000 4 | 1003,betty,55000 5 | 1004,jeff,59000 6 | -------------------------------------------------------------------------------- /code/chap01/data/sample_5_records.txt: -------------------------------------------------------------------------------- 1 | A,3 2 | A,4 3 | A,5 4 | B,10 5 | B,20 6 | -------------------------------------------------------------------------------- /code/chap01/data/users.txt: -------------------------------------------------------------------------------- 1 | 1,Alex,30,124 2 | 2,Bert,32,234 3 | 3,Curt,28,312 4 | 4,Don,32,180 5 | 5,Mary,30,100 6 | 6,Jane,28,212 7 | 7,Joe,28,128 8 | 8,Al,40,600 9 | -------------------------------------------------------------------------------- /code/chap01/python/average_by_key_use_aggregatebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_aggregatebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/average_by_key_use_aggregatebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/average_by_key_use_combinebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_combinebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/average_by_key_use_combinebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/average_by_key_use_foldbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_foldbykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/average_by_key_use_foldbykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- 
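The chap01 shell scripts in this listing only submit the corresponding PySpark programs (average_by_key_use_*.py), whose sources are not reproduced here. For orientation, a minimal sketch of the average-by-key pattern they exercise, assuming input records shaped like data/sample_5_records.txt (e.g. "A,3"), could use combineByKey() like this:

```python
# Hedged sketch of average-by-key with combineByKey(); this is not the
# repository's average_by_key_use_combinebykey.py, just the general pattern.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

pairs = (
    spark.sparkContext.textFile("data/sample_5_records.txt")
         .map(lambda line: line.split(","))
         .map(lambda tokens: (tokens[0], int(tokens[1])))
)

# combined data type C = (sum, count)
sum_count = pairs.combineByKey(
    lambda v: (v, 1),                         # create a combiner from the first value
    lambda C, v: (C[0] + v, C[1] + 1),        # merge a value into a combiner
    lambda C, D: (C[0] + D[0], C[1] + D[1])   # merge two combiners across partitions
)

averages = sum_count.mapValues(lambda sc: sc[0] / sc[1])
print(averages.collect())   # e.g. [('A', 4.0), ('B', 15.0)]

spark.stop()
```
--------------------------------------------------------------------------------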
/code/chap01/python/average_by_key_use_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_groupbykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/average_by_key_use_groupbykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/average_by_key_use_reducebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_reducebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/average_by_key_use_reducebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_action_describe.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_action_describe.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_action_describe.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_add_column.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_add_column.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_add_column.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_drop_column.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_drop_column.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_drop_column.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_filter.sh: -------------------------------------------------------------------------------- 1 | 
#----------------------------------------------------- 2 | # This is a shell script to run dataframe_filter.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_filter.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_join_cross.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_cross.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_join_cross.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_join_inner.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_inner.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_join_inner.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_join_left.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_left.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_join_left.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_join_right.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_right.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_join_right.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_sql.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_sql.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | 
#----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_sql.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_cartesian.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_cartesian.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_cartesian.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_combinebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_combinebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_combinebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_filter.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_filter.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_filter.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_flatmap.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_flatmap.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_flatmap.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_groupbykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-2.4.3" 7 | 
export SPARK_PROG="/book/code/chap01/rdd_transformation_groupbykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_join.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_join.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.3" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_join.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_map.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_map.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_map.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_mappartitions.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_mappartitions_handle_empty_partitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_mappartitions_handle_empty_partitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_mappartitions_handle_empty_partitions.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_reducebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_reducebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export 
SPARK_PROG="/book/code/chap01/rdd_transformation_reducebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_sortby.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_sortby.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_sortby.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_takeordered.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_takeordered.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_takeordered.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.ch01' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/chap01/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap01/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/chap01/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/chap01/scala/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | update_data_in_sh() { 4 | filename=$1 5 | classname=$2 6 | echo "#!/bin/bash" > $filename 7 | echo "./gradlew clean run -PmainClass="$classname >> $filename 8 | } 9 | 10 | script_folder_name=run_spark_applications_scripts 11 | 12 | if [ ! -d $script_folder_name ] 13 | then 14 | mkdir $script_folder_name 15 | fi 16 | 17 | 18 | for file in `find . -type f -regex ".*\.scala"` 19 | do 20 | filename=$(echo $file | awk -F "/" '{print $NF}' | cut -d "." -f1) 21 | path=$(echo $file | rev | cut -d "/" -f 2- | rev) 22 | packagename=$(echo $file | awk -F "/" ' 23 | BEGIN { ORS="" }; 24 | {for(i=5;iseq1 2 | cGTAaccaataaaaaaacaagcttaacctaattc 3 | >seq2 4 | agcttagTTTGGatctggccgggg 5 | >seq3 6 | gcggatttactcCCCCCAAAAANNaggggagagcccagataaatggagtctgtgcgtccaca 7 | gaattcgcacca 8 | AATAAAACCTCACCCAT 9 | agagcccagaatttactcCCC 10 | >seq4 11 | gcggatttactcaggggagagcccagGGataaatggagtctgtgcgtccaca 12 | gaattcgcacca 13 | -------------------------------------------------------------------------------- /code/chap02/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | 7 | group 'com.spark.algos.data' 8 | version '1.0-SNAPSHOT' 9 | 10 | repositories { 11 | mavenLocal() 12 | mavenCentral() 13 | } 14 | 15 | dependencies { 16 | implementation group: "org.scala-lang", name: "scala-library", version: "2.13.7" 17 | implementation group: "org.apache.spark", name: "spark-core_2.13", version: "3.2.0" 18 | implementation group: "org.apache.spark", name: "spark-sql_2.13", version: "3.2.0" 19 | } 20 | 21 | application { 22 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 23 | } -------------------------------------------------------------------------------- /code/chap02/scala/data/sample.fasta: -------------------------------------------------------------------------------- 1 | >seq1 2 | cGTAaccaataaaaaaacaagcttaacctaattc 3 | >seq2 4 | agcttagTTTGGatctggccgggg 5 | >seq3 6 | gcggatttactcCCCCCAAAAANNaggggagagcccagataaatggagtctgtgcgtccaca 7 | gaattcgcacca 8 | AATAAAACCTCACCCAT 9 | agagcccagaatttactcCCC 10 | >seq4 11 | gcggatttactcaggggagagcccagGGataaatggagtctgtgcgtccaca 12 | gaattcgcacca 13 | -------------------------------------------------------------------------------- /code/chap02/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap02/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/chap02/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_fastq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # define your input path 3 | INPUT_PATH="data/sp1.fastq" 4 | 5 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountFastq "--args=$INPUT_PATH" -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # define your input path 3 | INPUT_PATH="data/sample.fasta" 4 | 5 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER1 "--args=$INPUT_PATH" 6 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_1_1GB.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | #------------------------------------------------------ 5 | # NOTE: define your input path 6 | # Before running your Spark program, 7 | # Download *.fa from this location and place it under 8 | # the following directory: /book/chap02/data/ 9 | # 10 | # Download URL: 11 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 12 | #------------------------------------------------------ 13 | # define your input path 14 | INPUT_PATH="data/*.fasta" 15 | 16 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER1 "--args=$INPUT_PATH" 17 | 18 | # 19 | duration=$SECONDS 20 | echo "" 21 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 
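
A note on the programs these scripts launch: the DNABaseCountVER1/VER2/VER3 classes referenced by the dna_base_count_*.sh scripts live under src/main/scala and are not reproduced in this listing. As a rough orientation only, here is a minimal PySpark sketch of the version-1 idea, counting base frequencies in a FASTA file with a plain map/reduceByKey; the input path and session name are placeholders, and this is not the book's implementation:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("dna-base-count-sketch").getOrCreate()

# read the FASTA file as plain lines of text
records = spark.sparkContext.textFile("data/sample.fasta")

# drop FASTA description lines (those starting with ">") and split the
# remaining sequence lines into single, lower-cased bases
bases = records.filter(lambda line: not line.startswith(">")) \
               .flatMap(lambda seq: list(seq.strip().lower()))

# classic frequency count: (base, 1) pairs reduced by key
counts = bases.map(lambda b: (b, 1)).reduceByKey(lambda x, y: x + y)

print(counts.collect())
spark.stop()
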
22 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_1_big.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # NOTE: define your input path 7 | # Before running your Spark program, 8 | # Download chr1.subst.fa from this location: 9 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 10 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/chr1.subst.fa.gz 11 | # 12 | # define your input path 13 | INPUT_PATH="data/chr1.subst.fa" 14 | 15 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER1 "--args=$INPUT_PATH" 16 | 17 | duration=$SECONDS 18 | echo "" 19 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 20 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # define your input path 3 | INPUT_PATH="data/sample.fasta" 4 | 5 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER2 "--args=$INPUT_PATH" 6 | 7 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_2_1GB.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | #------------------------------------------------------ 5 | # NOTE: define your input path 6 | # Before running your Spark program, 7 | # Download *.fa from this location and place it under 8 | # the following directory: /book/chap02/data/ 9 | # 10 | # Download URL: 11 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 12 | #------------------------------------------------------ 13 | # 14 | # define your input path 15 | INPUT_PATH="data/*.fa" 16 | 17 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER2 "--args=$INPUT_PATH" 18 | 19 | duration=$SECONDS 20 | echo "" 21 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 22 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_2_big.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # NOTE: define your input path 7 | # Before running your Spark program, 8 | # Download chr1.subst.fa from this location: 9 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 10 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/chr1.subst.fa.gz 11 | # 12 | INPUT_PATH="data/chr1.subst.fa" 13 | 14 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER2 "--args=$INPUT_PATH" 15 | 16 | duration=$SECONDS 17 | echo "" 18 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 
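
The VER2 and VER3 variants above are launched the same way; a common optimization for this kind of counting job, and a plausible (but here assumed, not verified against the repository sources) reading of what the later versions change, is to pre-aggregate inside each partition with mapPartitions so that only a handful of (base, count) pairs are shuffled. A minimal PySpark sketch of that technique, again with placeholder paths:

from collections import defaultdict
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("dna-base-count-mappartitions-sketch").getOrCreate()

def count_bases_in_partition(lines):
    # build one small dictionary of counts per partition ("in-mapper combining")
    local_counts = defaultdict(int)
    for line in lines:
        if line.startswith(">"):
            continue
        for base in line.strip().lower():
            local_counts[base] += 1
    return iter(local_counts.items())

records = spark.sparkContext.textFile("data/sample.fasta")

# each partition emits at most a few (base, count) pairs, so the final
# reduceByKey shuffles far less data than one pair per base
counts = records.mapPartitions(count_bases_in_partition) \
                .reduceByKey(lambda x, y: x + y)

print(counts.collect())
spark.stop()
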
19 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # define your input path 3 | INPUT_PATH="data/sample.fasta" 4 | 5 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER3 "--args=$INPUT_PATH" 6 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_3_1GB.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | #------------------------------------------------------ 5 | # NOTE: define your input path 6 | # Before running your Spark program, 7 | # Download *.fa from this location and place it under 8 | # the following directory: /book/chap02/data/ 9 | # 10 | # Download URL: 11 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 12 | #------------------------------------------------------ 13 | INPUT_PATH="data/*.fa" 14 | 15 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER3 "--args=$INPUT_PATH" 16 | 17 | duration=$SECONDS 18 | echo "" 19 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 20 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_3_big.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # NOTE: define your input path 7 | # Before running your Spark program, 8 | # Download chr1.subst.fa from this location: 9 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 10 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/chr1.subst.fa.gz 11 | # 12 | INPUT_PATH="data/chr1.subst.fa" 13 | 14 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER3 "--args=$INPUT_PATH" 15 | 16 | duration=$SECONDS 17 | echo "" 18 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 
19 | -------------------------------------------------------------------------------- /code/chap02/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-ch02' -------------------------------------------------------------------------------- /code/chap02/scala/src/main/resources/input.txt: -------------------------------------------------------------------------------- 1 | >seq1 2 | cGTAaccaataaaaaaacaagcttaacctaattc 3 | >seq2 4 | agcttagTTTGGatctggccgggg 5 | >seq3 6 | gcggatttactcCCCCCAAAAANNaggggagagcccagataaatggagtctgtgcgtccaca 7 | gaattcgcacca 8 | AATAAAACCTCACCCAT 9 | agagcccagaatttactcCCC 10 | >seq4 11 | gcggatttactcaggggagagcccagGGataaatggagtctgtgcgtccaca 12 | gaattcgcacca -------------------------------------------------------------------------------- /code/chap03/python/bigrams_input.txt: -------------------------------------------------------------------------------- 1 | Spark shines_id in data analytics and beyond 2 | this is the 3 | this is the first record 4 | Spark shines in data analytics and beyond 5 | this is the second record 6 | Spark shines again in data analytics and beyond 7 | -------------------------------------------------------------------------------- /code/chap03/python/sample_input: -------------------------------------------------------------------------------- 1 | alex,Sunnyvale,25 2 | alex,Sunnyvale,33 3 | mary,Ames,22 4 | mary,Cupertino,66 5 | mary,Sunnyvale,44 6 | jane,Ames,20 7 | jane,Troy,40 8 | bob,Ames,26 9 | -------------------------------------------------------------------------------- /code/chap03/python/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 9 2 | 5 3 | 33 4 | 66 5 | -21 6 | -33 7 | 1 8 | 2 9 | 3 10 | 44 11 | 55 12 | 66 13 | 1 14 | 2 15 | -1 16 | -2 17 | 0 18 | 5 19 | 6 20 | 7 21 | 8 22 | 0 23 | -8 24 | -9 25 | 0 26 | 0 27 | 6 28 | 7 29 | 8 30 | 9 31 | 0 32 | -1 33 | -------------------------------------------------------------------------------- /code/chap03/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.ch03' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/chap03/scala/data/bigrams_input.txt: -------------------------------------------------------------------------------- 1 | Spark shines_id in data analytics and beyond 2 | this is the 3 | this is the first record 4 | Spark shines in data analytics and beyond 5 | this is the second record 6 | Spark shines again in data analytics and beyond 7 | -------------------------------------------------------------------------------- /code/chap03/scala/data/sample_input.csv: -------------------------------------------------------------------------------- 1 | alex,Sunnyvale,25 2 | alex,Sunnyvale,33 3 | mary,Ames,22 4 | mary,Cupertino,66 5 | mary,Sunnyvale,44 6 | jane,Ames,20 7 | jane,Troy,40 8 | bob,Ames,26 9 | -------------------------------------------------------------------------------- /code/chap03/scala/data/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 9 2 | 5 3 | 33 4 | 66 5 | -21 6 | -33 7 | 1 8 | 2 9 | 3 10 | 44 11 | 55 12 | 66 13 | 1 14 | 2 15 | -1 16 | -2 17 | 0 18 | 5 19 | 6 20 | 7 21 | 8 22 | 0 23 | -8 24 | -9 25 | 0 26 | 0 27 | 6 28 | 7 29 | 8 30 | 9 31 | 0 32 | -1 33 | -------------------------------------------------------------------------------- /code/chap03/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap03/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/chap03/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/chap03/scala/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | update_data_in_sh() { 4 | filename=$1 5 | classname=$2 6 | echo "#!/bin/bash" > $filename 7 | echo "./gradlew clean run -PmainClass="$classname >> $filename 8 | } 9 | 10 | script_folder_name=run_spark_applications_scripts 11 | 12 | if [ ! -d $script_folder_name ] 13 | then 14 | mkdir $script_folder_name 15 | fi 16 | 17 | 18 | for file in `find . -type f -regex ".*\.scala"` 19 | do 20 | filename=$(echo $file | awk -F "/" '{print $NF}' | cut -d "." -f1) 21 | path=$(echo $file | rev | cut -d "/" -f 2- | rev) 22 | packagename=$(echo $file | awk -F "/" ' 23 | BEGIN { ORS="" }; 24 | {for(i=5;i $filename 7 | echo "./gradlew clean run -PmainClass="$classname >> $filename 8 | } 9 | 10 | script_folder_name=run_spark_applications_scripts 11 | 12 | if [ ! -d $script_folder_name ] 13 | then 14 | mkdir $script_folder_name 15 | fi 16 | 17 | 18 | for file in `find . -type f -regex ".*\.scala"` 19 | do 20 | filename=$(echo $file | awk -F "/" '{print $NF}' | cut -d "." 
-f1) 21 | path=$(echo $file | rev | cut -d "/" -f 2- | rev) 22 | packagename=$(echo $file | awk -F "/" ' 23 | BEGIN { ORS="" }; 24 | {for(i=5;i<,><,><,> 4 | -------------------------------------------------------------------------------- /code/chap05/python/customers.txt: -------------------------------------------------------------------------------- 1 | c1,2019,T0011,20.67 2 | c1,2019,T0012,12.34 3 | c1,2019,T0013,44.30 4 | c1,2018,T0001,20.67 5 | c1,2018,T0002,12.34 6 | c1,2018,T0003,44.30 7 | c2,2019,T0017,744.30 8 | c2,2019,T0018,820.67 9 | c2,2018,T0022,182.34 10 | c2,2018,T0033,494.30 11 | -------------------------------------------------------------------------------- /code/chap05/python/customers_with_date.RECORD.FORMAT.txt: -------------------------------------------------------------------------------- 1 | Each record has the following format: 2 | 3 | <,><,><,> 4 | -------------------------------------------------------------------------------- /code/chap05/python/customers_with_date.txt: -------------------------------------------------------------------------------- 1 | c1,2/9/2019,T0011,20 2 | c1,2/9/2019,T0012,12 3 | c1,3/9/2019,T0013,30 4 | c1,3/9/2019,T0014,42 5 | c1,4/12/2019,T0023,48 6 | c1,4/12/2018,T0051,28 7 | c1,4/12/2019,T0043,42 8 | c1,4/12/2018,T0091,29 9 | c1,1/3/2018,T0002,12 10 | c1,4/3/2018,T0003,44 11 | c2,2/10/2019,T0511,20 12 | c2,2/10/2019,T0612,17 13 | c2,2/9/2019,T0061,25 14 | c2,2/9/2019,T0062,78 15 | c2,3/12/2019,T0513,67 16 | c2,3/12/2019,T0014,42 17 | c2,4/10/2019,T0023,48 18 | c2,4/10/2018,T0051,28 19 | c2,4/12/2019,T0043,42 20 | c2,4/12/2018,T0091,29 21 | c2,1/9/2018,T0002,12 22 | c2,4/9/2018,T0003,46 23 | -------------------------------------------------------------------------------- /code/chap05/python/partition_data_by_customer_and_year.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run the following program: 3 | # partition_data_by_customer_and_year.py 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/book/spark-3.2.0" 8 | export INPUT_PATH="/book/code/chap05/customers.txt" 9 | export OUTPUT_PATH="/tmp/partition_demo" 10 | export SPARK_PROG="/book/code/chap05/partition_data_by_customer_and_year.py" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH $OUTPUT_PATH 14 | -------------------------------------------------------------------------------- /code/chap05/scala/README.md: -------------------------------------------------------------------------------- 1 | Scala Solutions 2 | -------------------------------------------------------------------------------- /code/chap06/python/airports.json: -------------------------------------------------------------------------------- 1 | {"id":"ORD","City":"Chicago","State":"IL","Country":"USA"} 2 | {"id":"LGA","City":"New York","State":"NY","Country":"USA"} 3 | {"id":"BOS","City":"Boston","State":"MA","Country":"USA"} 4 | {"id":"IAH","City":"Houston","State":"TX","Country":"USA"} 5 | {"id":"EWR","City":"Newark","State":"NJ","Country":"USA"} 6 | {"id":"DEN","City":"Denver","State":"CO","Country":"USA"} 7 | {"id":"MIA","City":"Miami","State":"FL","Country":"USA"} 8 | {"id":"SFO","City":"San Francisco","State":"CA","Country":"USA"} 9 | {"id":"ATL","City":"Atlanta","State":"GA","Country":"USA"} 10 | 
{"id":"DFW","City":"Dallas","State":"TX","Country":"USA"} 11 | {"id":"CLT","City":"Charlotte","State":"NC","Country":"USA"} 12 | {"id":"LAX","City":"Los Angeles","State":"CA","Country":"USA"} 13 | {"id":"SEA","City":"Seattle","State":"WA","Country":"USA"} 14 | -------------------------------------------------------------------------------- /code/chap06/python/breadth_first_search_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. applying Breadth-first search (BFS) algorithm 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/home/book/spark-3.2.0" 9 | export SPARK_PROG="/home/book/code/chap06/breadth_first_search_example.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap06/python/connected_component_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. finding connected components 5 | # 6 | # Reference: https://en.wikipedia.org/wiki/Connected_component_(graph_theory) 7 | #----------------------------------------------------- 8 | # @author Mahmoud Parsian 9 | #----------------------------------------------------- 10 | export SPARK_HOME="/home/book/spark-3.2.0" 11 | export SPARK_PROG="/home/book/code/chap06/connected_component_example.py" 12 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 13 | # 14 | # run the PySpark program: 15 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 16 | -------------------------------------------------------------------------------- /code/chap06/python/graph_builder.log: -------------------------------------------------------------------------------- 1 | export SPARK_HOME="/home/book/spark-3.2.0" 2 | export SPARK_PROG="/home/book/code/chap06/graph_builder.py" 3 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 4 | # 5 | # run the PySpark program: 6 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 7 | 8 | +---+-------+---+ 9 | | id| name|age| 10 | +---+-------+---+ 11 | | a| Alice| 34| 12 | | b| Bob| 36| 13 | | c|Charlie| 30| 14 | +---+-------+---+ 15 | 16 | +---+---+------------+ 17 | |src|dst|relationship| 18 | +---+---+------------+ 19 | | a| b| friend| 20 | | b| c| follow| 21 | | c| b| follow| 22 | +---+---+------------+ 23 | 24 | graph= GraphFrame( 25 | v:[id: string, name: string ... 1 more field], 26 | e:[src: string, dst: string ... 1 more field] 27 | ) 28 | 29 | +---+--------+ 30 | | id|inDegree| 31 | +---+--------+ 32 | | c| 1| 33 | | b| 2| 34 | +---+--------+ 35 | 36 | count_follow= 2 -------------------------------------------------------------------------------- /code/chap06/python/graph_builder.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building a graph using 3 | # GraphFrames package. 
4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/home/book/spark-3.2.0" 8 | export SPARK_PROG="/home/book/code/chap06/graph_builder.py" 9 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 13 | -------------------------------------------------------------------------------- /code/chap06/python/label_propagation_algorithm_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. applying Label Propagation Algorithm (LPA) 5 | # 6 | # Reference: https://en.wikipedia.org/wiki/Label_Propagation_Algorithm 7 | #----------------------------------------------------- 8 | # @author Mahmoud Parsian 9 | #----------------------------------------------------- 10 | export SPARK_HOME="/home/book/spark-3.2.0" 11 | export SPARK_PROG="/home/book/code/chap06/label_propagation_algorithm_example.py" 12 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 13 | # 14 | # run the PySpark program: 15 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 16 | -------------------------------------------------------------------------------- /code/chap06/python/pagerank_data.txt: -------------------------------------------------------------------------------- 1 | 1,2 2 | 1,3 3 | 1,4 4 | 2,1 5 | 3,1 6 | 4,1 7 | 4,5 8 | 1,5 9 | -------------------------------------------------------------------------------- /code/chap06/python/pagerank_example.log: -------------------------------------------------------------------------------- 1 | export SPARK_HOME="/home/book/spark-3.2.0" 2 | export SPARK_PROG="/home/book/code/chap06/pagerank_example.py" 3 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 4 | # 5 | # run the PySpark program: 6 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 7 | 8 | +---+-------+---+ 9 | | id| name|age| 10 | +---+-------+---+ 11 | | a| Alice| 34| 12 | | b| Bob| 36| 13 | | c|Charlie| 30| 14 | +---+-------+---+ 15 | 16 | +---+---+------------+ 17 | |src|dst|relationship| 18 | +---+---+------------+ 19 | | a| b| friend| 20 | | b| c| follow| 21 | | c| b| follow| 22 | +---+---+------------+ 23 | 24 | graph= GraphFrame( 25 | v:[id: string, name: string ... 1 more field], 26 | e:[src: string, dst: string ... 1 more field] 27 | ) 28 | 29 | +---+------------------+ 30 | | id| pagerank| 31 | +---+------------------+ 32 | | b|1.0905890109440908| 33 | | a| 0.01| 34 | | c|1.8994109890559092| 35 | +---+------------------+ 36 | -------------------------------------------------------------------------------- /code/chap06/python/pagerank_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building 3 | # 1. a graph using GraphFrames package. 4 | # 2. 
applying PageRank algorithm to the built graph 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/home/book/spark-3.2.0" 9 | export SPARK_PROG="/home/book/code/chap06/pagerank_example.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap06/python/sample_graph_edges.txt: -------------------------------------------------------------------------------- 1 | edge_weight,from_id,to_id 2 | 0,5,15 3 | 1,18,8 4 | 2,6,1 5 | 3,0,10 6 | 4,2,4 7 | 5,19,7 8 | 6,9,7 9 | 7,11,9 10 | 8,14,9 11 | 9,16,11 12 | 10,17,8 13 | 1,3,4 14 | 2,12,15 15 | 3,13,2 16 | 4,21,0 17 | 5,22,4 18 | 16,22,8 19 | 17,24,4 20 | 18,28,7 21 | 19,28,13 22 | 20,28,16 23 | 1,29,11 24 | 2,30,16 25 | 3,31,15 26 | 24,32,2 27 | 25,32,30 28 | 6,35,11 29 | 7,35,24 30 | 28,36,16 31 | 29,39,7 32 | 30,39,28 33 | 1,40,7 34 | 2,40,11 35 | 3,41,5 36 | 4,41,16 37 | 5,41,32 38 | 6,42,32 39 | 7,43,36 40 | 8,44,16 41 | 9,46,7 42 | 6,49,3 43 | 1,5,31 44 | 2,30,42 45 | 4,17,22 46 | 4,18,22 47 | 1,50,51 48 | 2,51,52 49 | 3,50,52 50 | 1,71,72 51 | 1,71,73 52 | 1,72,73 53 | -------------------------------------------------------------------------------- /code/chap06/python/sample_graph_vertices.txt: -------------------------------------------------------------------------------- 1 | vertex_id 2 | 0 3 | 1 4 | 2 5 | 3 6 | 4 7 | 5 8 | 6 9 | 7 10 | 8 11 | 9 12 | 10 13 | 11 14 | 12 15 | 13 16 | 14 17 | 15 18 | 16 19 | 17 20 | 18 21 | 19 22 | 20 23 | 21 24 | 22 25 | 23 26 | 24 27 | 25 28 | 26 29 | 27 30 | 28 31 | 29 32 | 30 33 | 31 34 | 32 35 | 33 36 | 34 37 | 35 38 | 36 39 | 37 40 | 38 41 | 39 42 | 40 43 | 41 44 | 42 45 | 43 46 | 44 47 | 45 48 | 46 49 | 47 50 | 48 51 | 49 52 | 50 53 | 51 54 | 52 55 | 71 56 | 72 57 | 73 58 | -------------------------------------------------------------------------------- /code/chap06/python/shortest_path_finder.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. finding shortest paths for given landmarks 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/home/book/spark-3.2.0" 9 | export SPARK_PROG="/home/book/code/chap06/shortest_path_finder.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap06/python/triangles_counter.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building 3 | # 1. a graph using GraphFrames package. 4 | # 2. 
applying Triangles Counting algorithm to the built graph 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/home/book/spark-3.2.0" 9 | export SPARK_PROG="/home/book/code/chap06/triangles_counter.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap06/python/unique_triangles_finder.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building 3 | # 1. a graph using GraphFrames package. 4 | # 2. find unique Triangles from the built graph 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/home/book/spark-3.2.0" 9 | export SPARK_PROG="/home/book/code/chap06/unique_triangles_finder.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 11 | # 12 | export VERTICES_PATH="/home/book/code/chap06/sample_graph_vertices.txt" 13 | export EDGES_PATH="/home/book/code/chap06/sample_graph_edges.txt" 14 | # 15 | # run the PySpark program: 16 | ${SPARK_HOME}/bin/spark-submit --packages ${GRAPH_FRAMES} ${SPARK_PROG} ${VERTICES_PATH} ${EDGES_PATH} 17 | -------------------------------------------------------------------------------- /code/chap06/scala/README.md: -------------------------------------------------------------------------------- 1 | Scala Solutions 2 | -------------------------------------------------------------------------------- /code/chap06/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.12' 5 | ext.scalaVersion = '2.12.15' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.ch03' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | // mavenLocal() 13 | mavenCentral() 14 | maven { 15 | url "https://repos.spark-packages.org" 16 | } 17 | } 18 | 19 | dependencies { 20 | implementation "org.scala-lang:scala-library:$scalaVersion" 21 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 22 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 23 | implementation "org.apache.spark:spark-graphx_$scalaClassifier:$sparkVersion" 24 | implementation 'graphframes:graphframes:0.8.2-spark3.2-s_2.12' 25 | 26 | } 27 | 28 | application { 29 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 30 | } -------------------------------------------------------------------------------- /code/chap06/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-ch06' 2 | 3 | -------------------------------------------------------------------------------- /code/chap07/python/cats.no.header.csv: -------------------------------------------------------------------------------- 1 | cuttie,2,female,6 2 | mono,3,male,9 3 | fuzzy,1,female,4 4 | -------------------------------------------------------------------------------- /code/chap07/python/cats.with.header.csv: -------------------------------------------------------------------------------- 1 | name,age,gender,weight 2 | cuttie,2,female,6 3 | mono,3,male,9 4 | fuzzy,1,female,4 5 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_csv_reader_no_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_csv_reader_no_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_FILE="/book/code/chap07/sample_no_header.csv" 8 | export SPARK_PROG="/book/code/chap07/datasource_csv_reader_no_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_csv_reader_with_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_csv_reader_with_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_FILE="/book/code/chap07/sample_with_header.csv" 8 | export SPARK_PROG="/book/code/chap07/datasource_csv_reader_with_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_csv_writer.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run datasource_csv_writer.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_csv_writer.py" 8 | export OUTPUT_CSV_FILE_PATH="/tmp/output.csv" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG ${OUTPUT_CSV_FILE_PATH} 12 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_elasticsearch_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run 
datasource_elasticsearch_reader.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_elasticsearch_reader.py" 8 | export ELASTIC_SEARCH_HOST="localhost" 9 | export JAR="/book/code/jars/elasticsearch-hadoop-6.4.2.jar" 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit --jars "${JAR}" $SPARK_PROG ${ELASTIC_SEARCH_HOST} 13 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_elasticsearch_writer.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run datasource_elasticsearch_writer.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_elasticsearch_writer.py" 8 | export ELASTIC_SEARCH_HOST="localhost" 9 | export JAR="/book/code/jars/elasticsearch-hadoop-6.4.2.jar" 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit --jars "${JAR}" $SPARK_PROG ${ELASTIC_SEARCH_HOST} 13 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_gzip_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_gzip_reader.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_FILE="/book/code/chap07/sample_no_header.csv.gz" 8 | export SPARK_PROG="/book/code/chap07/datasource_gzip_reader.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_jdbc_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_jdbc_reader.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_jdbc_reader.py" 8 | # 9 | # define the required MySQL database connection parameters 10 | export JDBC_URL="jdbc:mysql://localhost/metadb" 11 | export JDBC_DRIVER="com.mysql.jdbc.Driver" 12 | export JDBC_USER="root" 13 | export JDBC_PASSWORD="mp22_password" 14 | export JDBC_SOURCE_TABLE_NAME="dept" 15 | # 16 | # define the required JAR file for MySQL database access 17 | export JAR="/book/code/jars/mysql-connector-java-5.1.42.jar" 18 | # 19 | # run the PySpark program: 20 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${JDBC_URL} ${JDBC_DRIVER} ${JDBC_USER} ${JDBC_PASSWORD} ${JDBC_SOURCE_TABLE_NAME} 21 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_jdbc_writer.sh:
-------------------------------------------------------------------------------- 1 | #-------------------------------------------------------- 2 | # This is a shell script to run datasource_jdbc_writer.py 3 | #-------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #-------------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_jdbc_writer.py" 8 | # 9 | # define the required MySQL database connection parameters 10 | export JDBC_URL="jdbc:mysql://localhost/metadb" 11 | export JDBC_DRIVER="com.mysql.jdbc.Driver" 12 | export JDBC_USER="root" 13 | export JDBC_PASSWORD="mp22_password" 14 | export JDBC_TARGET_TABLE_NAME="people" 15 | # 16 | # define the required JAR file for MySQL database access 17 | export JAR="/book/code/jars/mysql-connector-java-5.1.42.jar" 18 | # 19 | # run the PySpark program: 20 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${JDBC_URL} ${JDBC_DRIVER} ${JDBC_USER} ${JDBC_PASSWORD} ${JDBC_TARGET_TABLE_NAME} 21 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_json_reader_multi_line.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_json_reader_multi_line.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_FILE="/book/code/chap07/sample_multi_line.json" 8 | export SPARK_PROG="/book/code/chap07/datasource_json_reader_multi_line.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_json_reader_single_line.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_json_reader_single_line.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_FILE="/book/code/chap07/sample_single_line.json" 8 | export SPARK_PROG="/book/code/chap07/datasource_json_reader_single_line.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_mongodb_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_mongodb_reader.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_mongodb_reader.py" 8 | export MONGODB_COLLECTION_URI="mongodb://127.0.0.1/test.coll44" 9 | export JAR1="/book/code/jars/mongo-java-driver-3.8.2.jar" 10 | export JAR2="/book/code/jars/mongo-spark-connector_2.11-2.2.5.jar" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --jars "${JAR1},${JAR2}" 
$SPARK_PROG ${MONGODB_COLLECTION_URI} 14 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_mongodb_writer.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run datasource_mongodb_reader.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_mongodb_writer.py" 8 | export MONGODB_COLLECTION_URI="mongodb://127.0.0.1/test.coll66" 9 | export JAR1="/book/code/jars/mongo-java-driver-3.8.2.jar" 10 | export JAR2="/book/code/jars/mongo-spark-connector_2.11-2.2.5.jar" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --jars "${JAR1},${JAR2}" $SPARK_PROG ${MONGODB_COLLECTION_URI} 14 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_redis_reader.sh: -------------------------------------------------------------------------------- 1 | #-------------------------------------------------------- 2 | # This is a shell script to run datasource_redis_reader.py 3 | #-------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #-------------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_redis_reader.py" 8 | # 9 | # define the required redis database connection parameters 10 | export REDIS_HOST="localhost" 11 | export REDIS_PORT="6379" 12 | # you may add password 13 | #export REDIS_PASSWORD="" 14 | # 15 | # define the required JAR file for redis database access 16 | export JAR="/book/code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar" 17 | # 18 | # run the PySpark program: 19 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${REDIS_HOST} ${REDIS_PORT} 20 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_redis_writer.sh: -------------------------------------------------------------------------------- 1 | #-------------------------------------------------------- 2 | # This is a shell script to run datasource_redis_writer.py 3 | #-------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #-------------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_redis_writer.py" 8 | # 9 | # define the required redis database connection parameters 10 | export REDIS_HOST="localhost" 11 | export REDIS_PORT="6379" 12 | # you may add password 13 | #export REDIS_PASSWORD="" 14 | # 15 | # define the required JAR file for redis database access 16 | export JAR="/book/code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar" 17 | # 18 | # run the PySpark program: 19 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${REDIS_HOST} ${REDIS_PORT} 20 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_textfile_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_textfile_reader.py 3 | 
#----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_FILE="/book/code/chap07/sample_numbers.txt" 8 | export SPARK_PROG="/book/code/chap07/datasource_textfile_reader.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_textfile_writer.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_textfile_writer.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export OUTPUT_PATH="/tmp/zoutput" 8 | export SPARK_PROG="/book/code/chap07/datasource_textfile_writer.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $OUTPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap07/python/images/cat1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/images/cat1.jpg -------------------------------------------------------------------------------- /code/chap07/python/images/cat2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/images/cat2.jpg -------------------------------------------------------------------------------- /code/chap07/python/images/cat3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/images/cat3.jpg -------------------------------------------------------------------------------- /code/chap07/python/images/cat4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/images/cat4.jpg -------------------------------------------------------------------------------- /code/chap07/python/images/duck1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/images/duck1.jpg -------------------------------------------------------------------------------- /code/chap07/python/images/duck2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/images/duck2.jpg -------------------------------------------------------------------------------- /code/chap07/python/images/not-image.txt: -------------------------------------------------------------------------------- 1 | not an image 2 | 
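
The images/ directory listed above (cat*.jpg, duck*.jpg, plus a deliberately non-image not-image.txt) is presumably the input for an image data-source example in this chapter. For orientation, here is a minimal sketch of loading such a directory with Spark's built-in "image" data source, using dropInvalid to skip the file that cannot be decoded; the load path is a placeholder and this is not necessarily the chapter's exact reader program:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("image-reader-sketch").getOrCreate()

# the "image" data source (Spark 2.4+) reads each file into a struct column;
# dropInvalid=true silently skips files that cannot be decoded as images
images = (spark.read.format("image")
          .option("dropInvalid", True)
          .load("code/chap07/python/images"))

images.select("image.origin", "image.width", "image.height").show(truncate=False)

spark.stop()
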
-------------------------------------------------------------------------------- /code/chap07/python/mongodb_coll44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/mongodb_coll44.png -------------------------------------------------------------------------------- /code/chap07/python/mongodb_coll66.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/mongodb_coll66.png -------------------------------------------------------------------------------- /code/chap07/python/name_age_salary.csv: -------------------------------------------------------------------------------- 1 | alex,60,18000 2 | adel,40,45000 3 | adel,50,77000 4 | jane,40,52000 5 | jane,60,81000 6 | alex,50,62000 7 | mary,50,92000 8 | mary,60,63000 9 | mary,40,55000 10 | mary,40,55000 11 | -------------------------------------------------------------------------------- /code/chap07/python/people.txt: -------------------------------------------------------------------------------- 1 | Alex,30,Tennis 2 | Betty,40,Swimming 3 | Dave,20,Walking 4 | Jeff,77,Baseball 5 | -------------------------------------------------------------------------------- /code/chap07/python/sample_multi_line.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"name":"alex","id":100,"scores":[8,1,2,3],"dict": {"key": "value11"}}, 3 | {"name":"jane","id":200,"scores":[4,6],"dict": {"key": "value22"}}, 4 | { 5 | "name": "bob", 6 | "id": 300, 7 | "scores": [ 8 | 3, 9 | 4, 10 | 6, 11 | 9 12 | ], 13 | "dict": { 14 | "key": "value33", 15 | "key2": "value44" 16 | } 17 | }, 18 | { 19 | "name": "bob", 20 | "id": 400, 21 | "scores": [ 22 | 3, 23 | 5, 24 | 6, 25 | 9 26 | ], 27 | "dict": { 28 | "key": "value55", 29 | "key2": "value66" 30 | } 31 | } 32 | ] 33 | -------------------------------------------------------------------------------- /code/chap07/python/sample_no_header.csv: -------------------------------------------------------------------------------- 1 | Alex,Sunnyvale,30 2 | Mary,Cupertino,28 3 | Jane,Stanford,44 4 | Bob,Ames,33 5 | -------------------------------------------------------------------------------- /code/chap07/python/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 123,344,455,6666,2,300 2 | 7777,4444,55 3 | 22,34 4 | 900,901,902,9000,5600,5600,5700,45 5 | 45 6 | 70,71,72 7 | -------------------------------------------------------------------------------- /code/chap07/python/sample_single_line.json: -------------------------------------------------------------------------------- 1 | {"name":"alex","id":200,"scores":[1,2],"dict": {"key1": "value11", "key2": "value12"}} 2 | {"name":"bob","id":300,"scores":[1,2,4,6],"dict": {"key1": "value16"}} 3 | {"name":"jane","id":400,"scores":[2,4,6],"dict": {"key4": "value41"}} 4 | {"name":"mary","id":500,"scores":[5,9],"dict": {"key4": "value77", "key3": "value88"}} 5 | -------------------------------------------------------------------------------- /code/chap07/python/sample_with_header.csv: -------------------------------------------------------------------------------- 1 | name,city,age 2 | Alex,Sunnyvale,30 3 | Mary,Cupertino,28 4 | Jane,Stanford,44 5 | Bob,Ames,33 6 
| -------------------------------------------------------------------------------- /code/chap07/python/twitter.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/twitter.avro -------------------------------------------------------------------------------- /code/chap07/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.12' 5 | ext.scalaVersion = '2.12.15' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.ch07' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | implementation 'com.redislabs:spark-redis_2.12:3.0.0' 21 | implementation 'org.elasticsearch:elasticsearch-hadoop:7.16.3' 22 | implementation 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1' 23 | implementation 'mysql:mysql-connector-java:8.0.27' 24 | } 25 | 26 | application { 27 | mainClass = project.hasProperty("mainClass") ? project.getProperty("mainClass") : "NULL" 28 | } -------------------------------------------------------------------------------- /code/chap07/scala/data/sample_multi_line.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"name":"alex","id":100,"scores":[8,1,2,3],"dict": {"key": "value11"}}, 3 | {"name":"jane","id":200,"scores":[4,6],"dict": {"key": "value22"}}, 4 | { 5 | "name": "bob", 6 | "id": 300, 7 | "scores": [ 8 | 3, 9 | 4, 10 | 6, 11 | 9 12 | ], 13 | "dict": { 14 | "key": "value33", 15 | "key2": "value44" 16 | } 17 | }, 18 | { 19 | "name": "bob", 20 | "id": 400, 21 | "scores": [ 22 | 3, 23 | 5, 24 | 6, 25 | 9 26 | ], 27 | "dict": { 28 | "key": "value55", 29 | "key2": "value66" 30 | } 31 | } 32 | ] 33 | -------------------------------------------------------------------------------- /code/chap07/scala/data/sample_no_header.csv: -------------------------------------------------------------------------------- 1 | Alex,Sunnyvale,30 2 | Mary,Cupertino,28 3 | Jane,Stanford,44 4 | Bob,Ames,33 5 | -------------------------------------------------------------------------------- /code/chap07/scala/data/sample_no_header.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/scala/data/sample_no_header.csv.gz -------------------------------------------------------------------------------- /code/chap07/scala/data/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 123,344,455,6666,2,300 2 | 7777,4444,55 3 | 22,34 4 | 900,901,902,9000,5600,5600,5700,45 5 | 45 6 | 70,71,72 7 | -------------------------------------------------------------------------------- /code/chap07/scala/data/sample_single_line.json: -------------------------------------------------------------------------------- 1 | {"name":"alex","id":200,"scores":[1,2],"dict": {"key1": "value11", "key2": "value12"}} 2 | {"name":"bob","id":300,"scores":[1,2,4,6],"dict": 
{"key1": "value16"}} 3 | {"name":"jane","id":400,"scores":[2,4,6],"dict": {"key4": "value41"}} 4 | {"name":"mary","id":500,"scores":[5,9],"dict": {"key4": "value77", "key3": "value88"}} 5 | -------------------------------------------------------------------------------- /code/chap07/scala/data/sample_with_header.csv: -------------------------------------------------------------------------------- 1 | name,city,age 2 | Alex,Sunnyvale,30 3 | Mary,Cupertino,28 4 | Jane,Stanford,44 5 | Bob,Ames,33 6 | -------------------------------------------------------------------------------- /code/chap07/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/chap07/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_csv_reader_header.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_with_header.csv" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceCSVReaderHeader "--args=$INPUT_PATH" -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_csv_reader_no_header.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | INPUT_PATH="data/sample_no_header.csv" 4 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceCSVReaderNoHeader "--args=$INPUT_PATH" 5 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_csv_writer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export OUTPUT_CSV_FILE_PATH="data/tmp/output" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceCSVWriter "--args=$OUTPUT_CSV_FILE_PATH" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_elasticsearch_reader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ELASTICSEARCH_SERVER="localhost" 3 | ELASTICSEARCH_PORT="9200" 4 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceElasticsearchReader "--args=$ELASTICSEARCH_SERVER $ELASTICSEARCH_PORT" 5 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_elasticsearch_writer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ELASTICSEARCH_SERVER="localhost" 3 | ELASTICSEARCH_PORT="9200" 4 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceElasticsearchWriter "--args=$ELASTICSEARCH_SERVER 
$ELASTICSEARCH_PORT" 5 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_gzip_reader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_no_header.csv.gz" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceGZIPReader "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_jdbc_reader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | JDBC_URL=jdbc:mysql://localhost/metadb 3 | JDBC_DRIVER=com.mysql.cj.jdbc.Driver 4 | JDBC_TARGET_TABLE_NAME=people 5 | JDBC_USER=root 6 | JDBC_PASSWORD=my-secret-pw 7 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceJDBCReader "--args=$JDBC_URL $JDBC_DRIVER $JDBC_USER $JDBC_PASSWORD $JDBC_TARGET_TABLE_NAME" 8 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_jdbc_writer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | JDBC_URL=jdbc:mysql://localhost/metadb 3 | JDBC_DRIVER=com.mysql.cj.jdbc.Driver 4 | JDBC_TARGET_TABLE_NAME=people 5 | JDBC_USER=root 6 | JDBC_PASSWORD=my-secret-pw 7 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceJDBCWriter "--args=$JDBC_URL $JDBC_DRIVER $JDBC_USER $JDBC_PASSWORD $JDBC_TARGET_TABLE_NAME" 8 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_json_reader_multi_line.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_multi_line.json" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceJSONReaderMultiLine "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_json_reader_single_line.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_single_line.json" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceJSONReaderSingleLine "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_mongodb_reader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MONGODB_URI="mongodb://localhost:27017/test.coll66" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceMongodbReader "--args=$MONGODB_URI" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_mongodb_writer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MONGODB_URI="mongodb://localhost:27017/test.coll66" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceMongodbWriter "--args=$MONGODB_URI" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_redis_reader.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | REDIS_SERVER="localhost" 3 | REDIS_PORT="6379" 4 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceRedisReader "--args=$REDIS_SERVER $REDIS_PORT" 5 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_redis_writer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | REDIS_SERVER="localhost" 3 | REDIS_PORT="6379" 4 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceRedisWriter "--args=$REDIS_SERVER $REDIS_PORT" 5 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_textfile_reader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_numbers.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceTextfileReader "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_textfile_writer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUTPUT_PATH="data/tmp/text-file-out" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceTextfileWriter "--args=$OUTPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-ch07' 2 | 3 | -------------------------------------------------------------------------------- /code/chap08/python/rank_product/sample_input/rp1.txt: -------------------------------------------------------------------------------- 1 | K_1,30.0 2 | K_2,60.0 3 | K_3,10.0 4 | K_4,80.0 5 | -------------------------------------------------------------------------------- /code/chap08/python/rank_product/sample_input/rp2.txt: -------------------------------------------------------------------------------- 1 | K_1,90.0 2 | K_2,70.0 3 | K_3,40.0 4 | K_4,50.0 5 | -------------------------------------------------------------------------------- /code/chap08/python/rank_product/sample_input/rp3.txt: -------------------------------------------------------------------------------- 1 | K_1,4.0 2 | K_2,8.0 3 | -------------------------------------------------------------------------------- /code/chap08/scala/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 8 2 | The program covers page rank and rank product algorithms using group by key and combine by key. 
3 | 4 | * ### Page Rank: 5 | * `org.data.algorithms.spark.ch08.PageRank` (Spark program) 6 | * `./run_spark_applications_scripts/page_rank.sh` (shell script to call Spark Application) 7 | 8 | * ### Rank Product using Combine By Key: 9 | * `org.data.algorithms.spark.ch08.RankProductUsingCombineByKey` (Spark program) 10 | * `./run_spark_applications_scripts/rank_product_using_combine_by_key.sh` (shell script to call Spark Application) 11 | 12 | * ### Rank Product using Group By Key: 13 | * `org.data.algorithms.spark.ch08.RankProductUsingGroupByKey` (Spark program) 14 | * `./run_spark_applications_scripts/rank_product_using_group_by_key.sh` (shell script to call Spark Application) 15 | -------------------------------------------------------------------------------- /code/chap08/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.ch08' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/chap08/scala/data/sample_input/rp1.txt: -------------------------------------------------------------------------------- 1 | K_1,30.0 2 | K_2,60.0 3 | K_3,10.0 4 | K_4,80.0 5 | -------------------------------------------------------------------------------- /code/chap08/scala/data/sample_input/rp2.txt: -------------------------------------------------------------------------------- 1 | K_1,90.0 2 | K_2,70.0 3 | K_3,40.0 4 | K_4,50.0 5 | -------------------------------------------------------------------------------- /code/chap08/scala/data/sample_input/rp3.txt: -------------------------------------------------------------------------------- 1 | K_1,4.0 2 | K_2,8.0 3 | -------------------------------------------------------------------------------- /code/chap08/scala/data/urls.txt: -------------------------------------------------------------------------------- 1 | url_1,url_4 2 | url_2,url_1 3 | url_3,url_2 4 | url_3,url_1 5 | url_4,url_3 6 | url_4,url_1 -------------------------------------------------------------------------------- /code/chap08/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap08/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/chap08/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- 
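The Chapter 8 README above pairs the Rank Product pattern with combineByKey(). As a rough illustration of that idea (a sketch only, not the repository's RankProductUsingCombineByKey program; the variable names and the geometric-mean step are assumptions), a minimal PySpark version could look like this:

~~~python
# Hedged sketch of the Rank Product pattern with combineByKey()
# (illustration only; study values are copied from data/sample_input/rp1.txt and rp2.txt above).
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("rank-product-sketch").getOrCreate()
sc = spark.sparkContext

def rank_one_study(study_rdd):
    # sort one study's (key, value) pairs by value, descending, and assign ranks 1..N
    return (study_rdd.sortBy(lambda kv: -kv[1])
                     .zipWithIndex()
                     .map(lambda pair: (pair[0][0], pair[1] + 1)))

study1 = sc.parallelize([("K_1", 30.0), ("K_2", 60.0), ("K_3", 10.0), ("K_4", 80.0)])
study2 = sc.parallelize([("K_1", 90.0), ("K_2", 70.0), ("K_3", 40.0), ("K_4", 50.0)])

all_ranks = rank_one_study(study1).union(rank_one_study(study2))

# per key, accumulate (product_of_ranks, number_of_studies) without grouping all ranks
rank_product = (all_ranks
    .combineByKey(lambda rank: (rank, 1),
                  lambda acc, rank: (acc[0] * rank, acc[1] + 1),
                  lambda a, b: (a[0] * b[0], a[1] + b[1]))
    .mapValues(lambda pc: pc[0] ** (1.0 / pc[1])))  # geometric mean of the ranks

print(rank_product.collect())
~~~

Because combineByKey() keeps only a running (product, count) pair per key, no single node ever has to hold all of a key's ranks, which is the practical difference from the groupByKey() variant listed in the README.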
/code/chap08/scala/run_spark_applications_scripts/page_rank.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/urls.txt" 3 | NUMBER_OF_ITERATIONS=5 4 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch08.PageRank "--args=$INPUT_PATH $NUMBER_OF_ITERATIONS" 5 | -------------------------------------------------------------------------------- /code/chap08/scala/run_spark_applications_scripts/rank_product_using_combine_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUTPUT_PATH="data/tmp/rank-product-combine-by-key" 3 | NUMBER_OF_STUDIES=3 4 | INPUT_PATH_FOR_STUDY_1="data/sample_input/rp1.txt" 5 | INPUT_PATH_FOR_STUDY_2="data/sample_input/rp2.txt" 6 | INPUT_PATH_FOR_STUDY_3="data/sample_input/rp3.txt" 7 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch08.RankProductUsingCombineByKey "--args=$OUTPUT_PATH $NUMBER_OF_STUDIES $INPUT_PATH_FOR_STUDY_1 $INPUT_PATH_FOR_STUDY_2 $INPUT_PATH_FOR_STUDY_3" 8 | -------------------------------------------------------------------------------- /code/chap08/scala/run_spark_applications_scripts/rank_product_using_group_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUTPUT_PATH="data/tmp/rank-product-group-by-key" 3 | NUMBER_OF_STUDIES=3 4 | INPUT_PATH_FOR_STUDY_1="data/sample_input/rp1.txt" 5 | INPUT_PATH_FOR_STUDY_2="data/sample_input/rp2.txt" 6 | INPUT_PATH_FOR_STUDY_3="data/sample_input/rp3.txt" 7 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch08.RankProductUsingGroupByKey "--args=$OUTPUT_PATH $NUMBER_OF_STUDIES $INPUT_PATH_FOR_STUDY_1 $INPUT_PATH_FOR_STUDY_2 $INPUT_PATH_FOR_STUDY_3" 8 | -------------------------------------------------------------------------------- /code/chap08/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-ch08' 2 | 3 | -------------------------------------------------------------------------------- /code/chap09/python/README.md: -------------------------------------------------------------------------------- 1 | # PySpark Solutions for Chapter 9 2 | 3 | Work in Progress... 4 | 5 | Sample Codes for this chapter will be posted by end of March 2023. 6 | 7 | Thanks. 
8 | -------------------------------------------------------------------------------- /code/chap09/scala/README.md: -------------------------------------------------------------------------------- 1 | Scala Solutions 2 | -------------------------------------------------------------------------------- /code/chap10/data_design_patterns.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap10/data_design_patterns.pdf -------------------------------------------------------------------------------- /code/chap10/python/average_monoid_use_aggregatebykey.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/book/code/chap10/average_monoid_use_aggregatebykey.py" 3 | # define your input path 4 | export INPUT="/book/code/chap10/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap10/python/average_monoid_use_combinebykey.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/book/code/chap10/average_monoid_use_combinebykey.py" 3 | # define your input path 4 | export INPUT="/book/code/chap10/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap10/python/average_monoid_use_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #========================================== 2 | # NOTE: 3 | # 4 | # In general, avoid using groupByKey(), and 5 | # instead use reduceByKey() or combineByKey(). 6 | # For details see: 7 | # https://databricks.gitbooks.io/databricks-spark-knowledge-base/content/best_practices/prefer_reducebykey_over_groupbykey.html 8 | # 9 | # The groupByKey() solution is provided for educational 10 | # purposes. If you need all of the values of a key for 11 | # some aggregation such as finding the "median" (which you 12 | # need all of the values per key), then the groupByKey() 13 | # may be used. 
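# For illustration, a minimal PySpark sketch of the trade-off (assuming an
# RDD of (key, number) pairs named rdd; not part of this script):
#
#   sums = rdd.reduceByKey(lambda x, y: x + y)   # partial sums combined map-side
#
#   # groupByKey() ships every value of a key to one reducer; use it only when
#   # the aggregate truly needs all values, e.g. a median:
#   def median(values):
#       ordered = sorted(values)
#       return ordered[len(ordered) // 2]
#   medians = rdd.groupByKey().mapValues(median)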
14 | #========================================== 15 | # 16 | # define PySpark program 17 | export PROG="/book/code/chap10/average_monoid_use_groupbykey.py" 18 | # define your input path 19 | export INPUT="/book/code/chap10/sample_input.txt" 20 | # define your Spark home directory 21 | export SPARK_HOME="/book/spark-3.2.0" 22 | # run the program 23 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 24 | -------------------------------------------------------------------------------- /code/chap10/python/average_monoid_use_reducebykey.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/book/code/chap10/average_monoid_use_reducebykey.py" 3 | # define your input path 4 | export INPUT="/book/code/chap10/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap10/python/dna_base_count_basic_using_combinebykey.sh: -------------------------------------------------------------------------------- 1 | start_time=$(date +%s) 2 | # 3 | INPUT_PATH=~/Downloads/rs_chY.fas 4 | $SPARK_HOME/bin/spark-submit dna_base_count_basic_using_combinebykey.py $INPUT_PATH 5 | # 6 | end_time=$(date +%s) 7 | # elapsed time with second resolution 8 | elapsed=$(( end_time - start_time )) 9 | echo "elapsed time (in seconds): $elapsed" 10 | 11 | -------------------------------------------------------------------------------- /code/chap10/python/dna_base_count_basic_using_groupbykey.sh: -------------------------------------------------------------------------------- 1 | start_time=$(date +%s) 2 | # 3 | INPUT_PATH=~/Downloads/rs_chY.fas 4 | $SPARK_HOME/bin/spark-submit dna_base_count_basic_using_groupbykey.py $INPUT_PATH 5 | # 6 | end_time=$(date +%s) 7 | # elapsed time with second resolution 8 | elapsed=$(( end_time - start_time )) 9 | echo "elapsed time (in seconds): $elapsed" 10 | 11 | -------------------------------------------------------------------------------- /code/chap10/python/dna_base_count_basic_using_reducebykey.sh: -------------------------------------------------------------------------------- 1 | start_time=$(date +%s) 2 | # 3 | INPUT_PATH=~/Downloads/rs_chY.fas 4 | $SPARK_HOME/bin/spark-submit dna_base_count_basic_using_reducebykey.py $INPUT_PATH 5 | # 6 | end_time=$(date +%s) 7 | # elapsed time with second resolution 8 | elapsed=$(( end_time - start_time )) 9 | echo "elapsed time (in seconds): $elapsed" 10 | 11 | 12 | -------------------------------------------------------------------------------- /code/chap10/python/dna_base_count_inmapper_combiner_using_combinebykey.sh: -------------------------------------------------------------------------------- 1 | start_time=$(date +%s) 2 | # 3 | INPUT_PATH=~/Downloads/rs_chY.fas 4 | $SPARK_HOME/bin/spark-submit dna_base_count_inmapper_combiner_using_combinebykey.py $INPUT_PATH 5 | # 6 | end_time=$(date +%s) 7 | # elapsed time with second resolution 8 | elapsed=$(( end_time - start_time )) 9 | echo "elapsed time (in seconds): $elapsed" 10 | -------------------------------------------------------------------------------- /code/chap10/python/dna_base_count_inmapper_combiner_using_groupbykey.sh: -------------------------------------------------------------------------------- 1 | start_time=$(date +%s) 2 | # 3 | INPUT_PATH=~/Downloads/rs_chY.fas 4 | $SPARK_HOME/bin/spark-submit 
dna_base_count_inmapper_combiner_using_groupbykey.py $INPUT_PATH 5 | # 6 | end_time=$(date +%s) 7 | # elapsed time with second resolution 8 | elapsed=$(( end_time - start_time )) 9 | echo "elapsed time (in seconds): $elapsed" 10 | -------------------------------------------------------------------------------- /code/chap10/python/dna_base_count_inmapper_combiner_using_reducebykey.sh: -------------------------------------------------------------------------------- 1 | start_time=$(date +%s) 2 | # 3 | INPUT_PATH=~/Downloads/rs_chY.fas 4 | $SPARK_HOME/bin/spark-submit dna_base_count_inmapper_combiner_using_reducebykey.py $INPUT_PATH 5 | # 6 | end_time=$(date +%s) 7 | # elapsed time with second resolution 8 | elapsed=$(( end_time - start_time )) 9 | echo "elapsed time (in seconds): $elapsed" 10 | -------------------------------------------------------------------------------- /code/chap10/python/dna_base_count_using_mappartitions.sh: -------------------------------------------------------------------------------- 1 | start_time=$(date +%s) 2 | # 3 | INPUT_PATH=~/Downloads/rs_chY.fas 4 | $SPARK_HOME/bin/spark-submit dna_base_count_using_mappartitions.py $INPUT_PATH 5 | # 6 | end_time=$(date +%s) 7 | # elapsed time with second resolution 8 | elapsed=$(( end_time - start_time )) 9 | echo "elapsed time (in seconds): $elapsed" 10 | 11 | -------------------------------------------------------------------------------- /code/chap10/python/inmapper_combiner_local_aggregation.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/book/code/chap10/inmapper_combiner_local_aggregation.py" 3 | # define your input path 4 | export INPUT="/book/code/chap10/sample_dna_seq.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap10/python/inmapper_combiner_use_basic_mapreduce.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/book/code/chap10/inmapper_combiner_use_basic_mapreduce.py" 3 | # define your input path 4 | export INPUT="/book/code/chap10/sample_dna_seq.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap10/python/inmapper_combiner_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/book/code/chap10/inmapper_combiner_use_mappartitions.py" 3 | # define your input path 4 | export INPUT="/book/code/chap10/sample_dna_seq.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap10/python/minmax_force_empty_partitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run minmax_use_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export 
SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_PATH="/book/code/chap10/sample_numbers.txt" 8 | export SPARK_PROG="/book/code/chap10/minmax_force_empty_partitions.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap10/python/minmax_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run minmax_use_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_PATH="/book/code/chap10/sample_numbers.txt" 8 | export SPARK_PROG="/book/code/chap10/minmax_use_mappartitions.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap10/python/minmax_use_mappartitions_v2.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run minmax_use_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.4.0" 7 | export INPUT_PATH="/book/code/chap10/sample_numbers.txt" 8 | export SPARK_PROG="/book/code/chap10/minmax_use_mappartitions_v2.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap10/python/sample_dna_seq.txt: -------------------------------------------------------------------------------- 1 | ATCGGGATCCGGG 2 | ATTCCGGGATTCCCC 3 | ATGGCCCCCGGGATCGGG 4 | CGGTATCCGGGGAAAAA 5 | aaattCCGGAACCGGGGGTTT 6 | CCTTTTATCGGGCAAATTTTCCCGG 7 | attttcccccggaaaAAATTTCCGGG 8 | ACTGACTAGCTAGCTAACTG 9 | GCATCGTAGCTAGCTACGAT 10 | AATTCCCGCATCGATCGTACGTACGTAG 11 | ATCGATCGATCGTACGATCG 12 | -------------------------------------------------------------------------------- /code/chap10/python/sample_input.txt: -------------------------------------------------------------------------------- 1 | a,2 2 | a,3 3 | a,4 4 | a,5 5 | a,7 6 | b,4 7 | b,5 8 | b,6 9 | c,3 10 | c,4 11 | c,5 12 | c,6 13 | -------------------------------------------------------------------------------- /code/chap10/python/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 23,24,22,44,66,77,44,44,555,666 2 | 12,4,555,66,67,68,57,55,56,45,45,45,66,77 3 | 34,35,36,97300,78,79 4 | 120,44,444,445,345,345,555 5 | 11,33,34,35,36,37,47,7777,8888,6666,44,55 6 | 10,11,44,66,77,78,79,80,90,98,99,100,102,103,104,105 7 | 6,7,8,9,10 8 | 8,9,10,12,12 9 | 7777 10 | 222,333,444,555,666,111,112,5,113,114 11 | 5555,4444,24 12 | -------------------------------------------------------------------------------- /code/chap10/python/top_N_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run top_N_use_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | 
#----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap10/top_N_use_mappartitions.py" 8 | # 9 | # run the PySpark program: 10 | # find Top-3 11 | export N=3 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $N 13 | -------------------------------------------------------------------------------- /code/chap10/python/top_N_use_takeordered.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run top_N_use_takeordered.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap10/top_N_use_takeordered.py" 8 | # 9 | # run the PySpark program: 10 | # find Top-3 11 | export N=3 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $N 13 | -------------------------------------------------------------------------------- /code/chap10/scala/.gitignore: -------------------------------------------------------------------------------- 1 | data/*.gz -------------------------------------------------------------------------------- /code/chap10/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.ch10' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ?
project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/chap10/scala/data/sample_dna_seq.txt: -------------------------------------------------------------------------------- 1 | ATCGGGATCCGGG 2 | ATTCCGGGATTCCCC 3 | ATGGCCCCCGGGATCGGG 4 | CGGTATCCGGGGAAAAA 5 | aaattCCGGAACCGGGGGTTT 6 | CCTTTTATCGGGCAAATTTTCCCGG 7 | attttcccccggaaaAAATTTCCGGG 8 | ACTGACTAGCTAGCTAACTG 9 | GCATCGTAGCTAGCTACGAT 10 | AATTCCCGCATCGATCGTACGTACGTAG 11 | ATCGATCGATCGTACGATCG 12 | -------------------------------------------------------------------------------- /code/chap10/scala/data/sample_input.txt: -------------------------------------------------------------------------------- 1 | a,2 2 | a,3 3 | a,4 4 | a,5 5 | a,7 6 | b,4 7 | b,5 8 | b,6 9 | c,3 10 | c,4 11 | c,5 12 | c,6 13 | -------------------------------------------------------------------------------- /code/chap10/scala/data/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 23,24,22,44,66,77,44,44,555,666 2 | 12,4,555,66,67,68,57,55,56,45,45,45,66,77 3 | 34,35,36,97300,78,79 4 | 120,44,444,445,345,345,555 5 | 11,33,34,35,36,37,47,7777,8888,6666,44,55 6 | 10,11,44,66,77,78,79,80,90,98,99,100,102,103,104,105 7 | 6,7,8,9,10 8 | 8,9,10,12,12 9 | 7777 10 | 222,333,444,555,666,111,112,5,113,114 11 | 5555,4444,24 12 | -------------------------------------------------------------------------------- /code/chap10/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap10/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/chap10/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/average_monoid_use_aggregate_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT="data/sample_input.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.AverageMonoidUseAggregateByKey "--args= $INPUT" 4 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/average_monoid_use_combine_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT="data/sample_input.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.AverageMonoidUseCombineByKey "--args= $INPUT" 4 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/average_monoid_use_group_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT="data/sample_input.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.AverageMonoidUseGroupByKey "--args= $INPUT" 4 | 
-------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/average_monoid_use_reduce_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT="data/sample_input.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.AverageMonoidUseReduceByKey "--args=$INPUT" 4 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/dna_base_count_basic_in_mapper_combiner_using_combine_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download the file to data directory 3 | INPUT_PATH="data/rs_chY.fas.gz" 4 | if [ ! -f $INPUT_PATH ]; then 5 | curl -o $INPUT_PATH https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chY.fas.gz -P data 6 | fi 7 | #If file exists run the spark application 8 | if test $INPUT_PATH ; then 9 | start_time=$(date +%s) 10 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.DNABaseCountBasicInMapperCombinerUsingCombineByKey "--args=$INPUT_PATH" 11 | end_time=$(date +%s) 12 | # elapsed time with second resolution 13 | elapsed=$(( end_time - start_time )) 14 | echo "elapsed time (in seconds): $elapsed" 15 | fi -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/dna_base_count_basic_in_mapper_combiner_using_group_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download the file to data directory 3 | INPUT_PATH="data/rs_chY.fas.gz" 4 | if [ ! -f $INPUT_PATH ]; then 5 | curl -o $INPUT_PATH https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chY.fas.gz -P data 6 | fi 7 | #If file exists run the spark application 8 | if test $INPUT_PATH ; then 9 | start_time=$(date +%s) 10 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.DNABaseCountBasicInMapperCombinerUsingGroupByKey "--args=$INPUT_PATH" 11 | end_time=$(date +%s) 12 | # elapsed time with second resolution 13 | elapsed=$(( end_time - start_time )) 14 | echo "elapsed time (in seconds): $elapsed" 15 | fi -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/dna_base_count_basic_in_mapper_combiner_using_reduce_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download the file to data directory 3 | INPUT_PATH="data/rs_chY.fas.gz" 4 | if [ ! 
-f $INPUT_PATH ]; then 5 | curl -o $INPUT_PATH https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chY.fas.gz -P data 6 | fi 7 | #If file exists run the spark application 8 | if test $INPUT_PATH ; then 9 | start_time=$(date +%s) 10 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.DNABaseCountBasicInMapperCombinerUsingReduceByKey "--args=$INPUT_PATH" 11 | end_time=$(date +%s) 12 | # elapsed time with second resolution 13 | elapsed=$(( end_time - start_time )) 14 | echo "elapsed time (in seconds): $elapsed" 15 | fi -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/dna_base_count_basic_using_combine_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download the file to data directory 3 | INPUT_PATH="data/rs_chY.fas.gz" 4 | if [ ! -f $INPUT_PATH ]; then 5 | curl -o $INPUT_PATH https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chY.fas.gz -P data 6 | fi 7 | #If file exists run the spark application 8 | if test $INPUT_PATH ; then 9 | start_time=$(date +%s) 10 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.DNABaseCountBasicUsingCombineByKey "--args=$INPUT_PATH" 11 | end_time=$(date +%s) 12 | # elapsed time with second resolution 13 | elapsed=$(( end_time - start_time )) 14 | echo "elapsed time (in seconds): $elapsed" 15 | fi -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/dna_base_count_basic_using_group_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download the file to data directory 3 | INPUT_PATH="data/rs_chY.fas.gz" 4 | if [ ! -f $INPUT_PATH ]; then 5 | curl -o $INPUT_PATH https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chY.fas.gz -P data 6 | fi 7 | #If file exists run the spark application 8 | if test $INPUT_PATH ; then 9 | start_time=$(date +%s) 10 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.DNABaseCountBasicUsingGroupByKey "--args=$INPUT_PATH" 11 | end_time=$(date +%s) 12 | # elapsed time with second resolution 13 | elapsed=$(( end_time - start_time )) 14 | echo "elapsed time (in seconds): $elapsed" 15 | fi 16 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/dna_base_count_basic_using_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download the file to data directory 3 | INPUT_PATH="data/rs_chY.fas.gz" 4 | if [ ! 
-f $INPUT_PATH ]; then 5 | curl -o $INPUT_PATH https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chY.fas.gz -P data 6 | fi 7 | #If file exists run the spark application 8 | if test $INPUT_PATH ; then 9 | start_time=$(date +%s) 10 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.DNABaseCountBasicUsingMappartitions "--args=$INPUT_PATH" 11 | end_time=$(date +%s) 12 | # elapsed time with second resolution 13 | elapsed=$(( end_time - start_time )) 14 | echo "elapsed time (in seconds): $elapsed" 15 | fi -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/dna_base_count_basic_using_reduce_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download the file to data directory 3 | INPUT_PATH="data/rs_chY.fas.gz" 4 | if [ ! -f $INPUT_PATH ]; then 5 | curl -o $INPUT_PATH https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chY.fas.gz -P data 6 | fi 7 | #If file exists run the spark application 8 | if test $INPUT_PATH ; then 9 | start_time=$(date +%s) 10 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.DNABaseCountBasicUsingReduceByKey "--args=$INPUT_PATH" 11 | end_time=$(date +%s) 12 | # elapsed time with second resolution 13 | elapsed=$(( end_time - start_time )) 14 | echo "elapsed time (in seconds): $elapsed" 15 | fi 16 | 17 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/in_mapper_combiner_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_dna_seq.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.InMapperCombinerUseMappartitions "--args=$INPUT_PATH" -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/in_mapper_combiner_using_local_aggregation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_dna_seq.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.InMapperCombinerUsingLocalAggregation "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/in_mapper_combiner_using_map_reduce.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_dna_seq.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.InMapperCombinerUsingMapReduce "--args=$INPUT_PATH" -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/min_max_force_empty_partitions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_numbers.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.MinMaxForceEmptyPartitions "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/min_max_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_numbers.txt" 3 | ./gradlew clean run 
-PmainClass=org.data.algorithms.spark.ch10.MinMaxUseMappartitions "--args=$INPUT_PATH" -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/structured_to_hierarchical_to_xml_dataframe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.StructuredToHierarchicalToXmlDataframe 3 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/structured_to_hierarchical_to_xml_rdd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.StructuredToHierarchicalToXmlRDD 3 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/top_n_use_map_partitions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | N=3 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.TopNUseMapPartitions "--args=$N" 4 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/top_n_use_take_ordered.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | N=3 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.TopNUseTakeOrdered "--args=$N" 4 | -------------------------------------------------------------------------------- /code/chap10/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-ch10' 2 | 3 | -------------------------------------------------------------------------------- /code/chap11/scala/README.md: -------------------------------------------------------------------------------- 1 | Scala Solutions 2 | -------------------------------------------------------------------------------- /code/chap12/python/README.md: -------------------------------------------------------------------------------- 1 | PySpark Solutions 2 | -------------------------------------------------------------------------------- /code/chap12/scala/README.md: -------------------------------------------------------------------------------- 1 | Scala Solutions 2 | -------------------------------------------------------------------------------- /code/jars/avro-mapred-1.7.7-hadoop1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/avro-mapred-1.7.7-hadoop1.jar -------------------------------------------------------------------------------- /code/jars/avro-mapred-1.7.7-hadoop2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/avro-mapred-1.7.7-hadoop2.jar -------------------------------------------------------------------------------- /code/jars/com-cotdp-hadoop-1.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/com-cotdp-hadoop-1.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /code/jars/elasticsearch-hadoop-6.4.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/elasticsearch-hadoop-6.4.2.jar -------------------------------------------------------------------------------- /code/jars/elasticsearch-spark_2.11-2.4.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/elasticsearch-spark_2.11-2.4.5.jar -------------------------------------------------------------------------------- /code/jars/graphframes-0.6.0-spark2.3-s_2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/graphframes-0.6.0-spark2.3-s_2.11.jar -------------------------------------------------------------------------------- /code/jars/hbase-spark-connector-1.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/hbase-spark-connector-1.0.0.jar -------------------------------------------------------------------------------- /code/jars/htrace-core-3.1.0-incubating.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/htrace-core-3.1.0-incubating.jar -------------------------------------------------------------------------------- /code/jars/mongo-java-driver-3.8.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/mongo-java-driver-3.8.2.jar -------------------------------------------------------------------------------- /code/jars/mongo-spark-connector_2.11-2.2.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/mongo-spark-connector_2.11-2.2.5.jar -------------------------------------------------------------------------------- /code/jars/mongodb-driver-3.8.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/mongodb-driver-3.8.2.jar -------------------------------------------------------------------------------- /code/jars/mysql-connector-java-5.1.42.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/mysql-connector-java-5.1.42.jar -------------------------------------------------------------------------------- 
/code/jars/shc-core-1.1.3-2.3-s_2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/shc-core-1.1.3-2.3-s_2.11.jar -------------------------------------------------------------------------------- /code/jars/shc-examples-1.1.3-2.3-s_2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/shc-examples-1.1.3-2.3-s_2.11.jar -------------------------------------------------------------------------------- /code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar -------------------------------------------------------------------------------- /code/jars/spark-redis-2.3.1-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/spark-redis-2.3.1-SNAPSHOT.jar -------------------------------------------------------------------------------- /data/chap02/README.md: -------------------------------------------------------------------------------- 1 | Sample FASTA and FASTQ Files 2 | -------------------------------------------------------------------------------- /data/chap02/sample.fasta: -------------------------------------------------------------------------------- 1 | >seq1 2 | cGTAaccaataaaaaaacaagcttaacctaattc 3 | >seq2 4 | agcttagTTTGGatctggccgggg 5 | >seq3 6 | gcggatttactcCCCCCAAAAANNaggggagagcccagataaatggagtctgtgcgtccaca 7 | gaattcgcacca 8 | AATAAAACCTCACCCAT 9 | agagcccagaatttactcCCC 10 | >seq4 11 | gcggatttactcaggggagagcccagGGataaatggagtctgtgcgtccaca 12 | gaattcgcacca 13 | -------------------------------------------------------------------------------- /docs/goal_of_book.md: -------------------------------------------------------------------------------- 1 | # Goal of this book: Data Algorithms with Spark 2 | 3 | 1. Keep it SIMPLE! 4 | 5 | 2. Goal of this book: enable writing efficient & 6 | simpler PySpark code for data algorithms using Spark 7 | 8 | 3. A lot of [working PySpark code](../code/) is provided 9 | so that the reader can understand how to use basic 10 | transformations on using RDDs and DataFrames 11 | 12 | 4. As much as possible, I have avoided writing complex 13 | code and functions: keep it simple so that you can 14 | debug easily and your co-workers can understand them. 15 | 16 | 5. 
CUT-and-PASTE: you may take portions of the [code](../code/) 17 | and tailor it to your needs 18 | -------------------------------------------------------------------------------- /images/Data-Algorithms-with-Spark_mech2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/Data-Algorithms-with-Spark_mech2.pdf -------------------------------------------------------------------------------- /images/Data-Algorithms-with-Spark_mech2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/Data-Algorithms-with-Spark_mech2.png -------------------------------------------------------------------------------- /images/Data_Algorithms_with_Spark_COVER_9781492082385.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/Data_Algorithms_with_Spark_COVER_9781492082385.jpg -------------------------------------------------------------------------------- /images/Data_Algorithms_with_Spark_COVER_9781492082385.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/Data_Algorithms_with_Spark_COVER_9781492082385.png -------------------------------------------------------------------------------- /images/anagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/anagram.png -------------------------------------------------------------------------------- /images/book_cover_final.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/book_cover_final.pdf -------------------------------------------------------------------------------- /images/correlation-coefficient.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/correlation-coefficient.png -------------------------------------------------------------------------------- /images/data-alg-foreword2.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data-alg-foreword2.docx -------------------------------------------------------------------------------- /images/data-alg-foreword2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data-alg-foreword2.pdf -------------------------------------------------------------------------------- /images/data_algorithms_hard_copy_image.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data_algorithms_hard_copy_image.jpg -------------------------------------------------------------------------------- /images/data_algorithms_with_spark.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data_algorithms_with_spark.jpg -------------------------------------------------------------------------------- /images/data_algorithms_with_spark.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data_algorithms_with_spark.pdf -------------------------------------------------------------------------------- /images/data_algorithms_with_spark_amazon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data_algorithms_with_spark_amazon.jpg -------------------------------------------------------------------------------- /images/data_algorithms_with_spark_knowledge_is_power.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data_algorithms_with_spark_knowledge_is_power.jpeg -------------------------------------------------------------------------------- /images/data_algorithms_with_spark_small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data_algorithms_with_spark_small.jpeg -------------------------------------------------------------------------------- /images/joins-in-SQL.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/joins-in-SQL.jpeg -------------------------------------------------------------------------------- /images/kmer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/kmer.jpg -------------------------------------------------------------------------------- /images/kmer_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/kmer_4.png -------------------------------------------------------------------------------- /images/mappartitions_image_1.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/mappartitions_image_1.drawio.png -------------------------------------------------------------------------------- /images/mappartitions_image_2.drawio.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/mappartitions_image_2.drawio.png
--------------------------------------------------------------------------------
/images/sql_joins.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/sql_joins.png
--------------------------------------------------------------------------------
/wiki-spark/docs/dataframe_to_rdd.md:
--------------------------------------------------------------------------------
1 | # DataFrame to RDD
2 | 
3 | There are times when you might want to
4 | convert a **DataFrame to an RDD**.
5 | 
6 | ## RDD and DataFrame
7 | 
8 | * Spark's DataFrame (fully qualified name: `pyspark.sql.DataFrame`)
9 | is an immutable, distributed collection of data grouped
10 | into named columns.
11 | 
12 | * Spark's RDD (fully qualified name: `pyspark.RDD`)
13 | is a Resilient Distributed Dataset (`RDD`),
14 | the basic abstraction in Spark. An RDD represents an
15 | immutable, partitioned collection of elements that
16 | can be operated on in parallel.
17 | 
18 | ## DataFrame to RDD Conversion
19 | 
20 | To convert a `DataFrame` to an `RDD`, you just need to
21 | access the `DataFrame.rdd` property.
22 | 
23 | >>> spark.version
24 | '3.3.1'
25 | >>> records = [("alex", 10), ("jane", 20), ("rose", 30)]
26 | >>> df = spark.createDataFrame(records, ["name", "age"])
27 | >>> df.show()
28 | +----+---+
29 | |name|age|
30 | +----+---+
31 | |alex| 10|
32 | |jane| 20|
33 | |rose| 30|
34 | +----+---+
35 | 
36 | >>> # Convert a DataFrame to an RDD
37 | >>> rdd = df.rdd
38 | >>> rdd.collect()
39 | [
40 | Row(name='alex', age=10),
41 | Row(name='jane', age=20),
42 | Row(name='rose', age=30)
43 | ]
44 | 
--------------------------------------------------------------------------------
/wiki-spark/docs/flatmap_transformation.md:
--------------------------------------------------------------------------------
1 | # `RDD.flatMap()`
2 | 
3 | `RDD.flatMap(f)` returns a new RDD by first applying
4 | a function `f()` to all elements of this RDD, and then
5 | flattening the results.
6 | 
7 | 
8 | ![](./spark-flatmap.png)
9 | 
10 | 
11 | A typical use of `flatMap()` is sketched below; in the main
12 | example that follows it, note that the empty elements are dropped.
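A minimal warm-up sketch of a typical `flatMap()` use: splitting lines of text into words. This is only an illustration: it assumes an active `SparkSession` available as `spark`, and the three input lines (one of them empty) are made-up sample data. The empty line splits into an empty list, so it contributes nothing to the flattened result.

~~~python
>>> # made-up sample data: three lines, one of them empty
>>> lines = spark.sparkContext.parallelize(["a red fox", "", "jumped high"])
>>> # split each line into words; empty lines yield no words
>>> words = lines.flatMap(lambda line: line.split())
>>> words.collect()
['a', 'red', 'fox', 'jumped', 'high']
>>> words.count()
5
~~~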
13 | 
14 | 
15 | 
16 | ~~~python
17 | >>> spark.version
18 | '3.3.2'
19 | 
20 | >>> some_elements = [["e0", "e1", "e2", "e3"],
21 | [],
22 | ["e4", "e5"],
23 | ["e6"],
24 | [] ]
25 | >>> len(some_elements)
26 | 5
27 | >>> rdd = spark.sparkContext.parallelize(some_elements)
28 | >>> rdd.collect()
29 | [ ['e0', 'e1', 'e2', 'e3'], [], ['e4', 'e5'], ['e6'], [] ]
30 | 
31 | >>> mapped = rdd.map(lambda x: x)
32 | >>> mapped.collect()
33 | [ ['e0', 'e1', 'e2', 'e3'], [], ['e4', 'e5'], ['e6'], [] ]
34 | >>> mapped.count()
35 | 5
36 | 
37 | >>> flat_mapped = rdd.flatMap(lambda x: x)
38 | >>> flat_mapped.collect()
39 | ['e0', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6']
40 | >>> flat_mapped.count()
41 | 7
42 | ~~~
--------------------------------------------------------------------------------
/wiki-spark/docs/lambda_expressions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/wiki-spark/docs/lambda_expressions.pdf
--------------------------------------------------------------------------------
/wiki-spark/docs/monoid/monoid_math.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/wiki-spark/docs/monoid/monoid_math.png
--------------------------------------------------------------------------------
/wiki-spark/docs/spark-explode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/wiki-spark/docs/spark-explode.png
--------------------------------------------------------------------------------
/wiki-spark/docs/spark-flatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/wiki-spark/docs/spark-flatmap.png
--------------------------------------------------------------------------------
/wiki-spark/docs/using-graphframes-with-jupyter.demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/wiki-spark/docs/using-graphframes-with-jupyter.demo.png
--------------------------------------------------------------------------------
/wiki-spark/docs/wiki.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/wiki-spark/docs/wiki.jpeg
--------------------------------------------------------------------------------