├── .gitignore ├── README.md ├── code ├── README.md ├── bonus_chapters │ ├── README.md │ ├── TF-IDF │ │ ├── python │ │ │ ├── README.md │ │ │ ├── TF_IDF.log │ │ │ ├── TF_IDF.py │ │ │ └── data │ │ │ │ ├── doc1 │ │ │ │ ├── doc2 │ │ │ │ ├── doc3 │ │ │ │ └── doc4 │ │ └── scala │ │ │ ├── build.gradle │ │ │ ├── gradle │ │ │ └── wrapper │ │ │ │ ├── gradle-wrapper.jar │ │ │ │ └── gradle-wrapper.properties │ │ │ ├── gradlew │ │ │ ├── gradlew.bat │ │ │ ├── out │ │ │ └── production │ │ │ │ └── resources │ │ │ │ └── log4j.properties │ │ │ ├── settings.gradle │ │ │ └── src │ │ │ └── main │ │ │ ├── data │ │ │ ├── doc1 │ │ │ ├── doc2 │ │ │ ├── doc3 │ │ │ └── doc4 │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ └── scala │ │ │ └── org │ │ │ └── data │ │ │ └── algorithms │ │ │ └── spark │ │ │ └── bonus_chapter │ │ │ └── TFIDF.scala │ ├── Top-N │ │ ├── README.md │ │ ├── python │ │ │ ├── Top_N_movies_Dataframe_using_API.log │ │ │ ├── Top_N_movies_Dataframe_using_API.py │ │ │ ├── Top_N_movies_Dataframe_using_API.sh │ │ │ ├── Top_N_movies_Dataframe_using_SQL.log │ │ │ ├── Top_N_movies_Dataframe_using_SQL.py │ │ │ ├── Top_N_movies_Dataframe_using_SQL.sh │ │ │ ├── Top_N_movies_RDD_using_combineByKey.log │ │ │ ├── Top_N_movies_RDD_using_combineByKey.py │ │ │ ├── Top_N_movies_RDD_using_combineByKey.sh │ │ │ ├── Top_N_movies_RDD_using_groupByKey.log │ │ │ ├── Top_N_movies_RDD_using_groupByKey.py │ │ │ ├── Top_N_movies_RDD_using_groupByKey.sh │ │ │ ├── Top_N_movies_RDD_using_reduceByKey.log │ │ │ ├── Top_N_movies_RDD_using_reduceByKey.py │ │ │ ├── Top_N_movies_RDD_using_reduceByKey.sh │ │ │ ├── Top_N_movies_RDD_using_takeOrdered.log │ │ │ ├── Top_N_movies_RDD_using_takeOrdered.py │ │ │ └── Top_N_movies_RDD_using_takeOrdered.sh │ │ ├── scala │ │ │ └── README.md │ │ └── top-10.jpeg │ ├── UDF │ │ ├── README.md │ │ ├── UDF.pdf │ │ ├── python │ │ │ ├── README.md │ │ │ ├── dataframe_UDF_example.log │ │ │ └── dataframe_UDF_example.py │ │ └── scala │ │ │ └── README.md │ ├── anagrams │ │ ├── python │ │ │ ├── README.md │ │ │ ├── anagrams_by_combinebykey.log │ │ │ ├── anagrams_by_combinebykey.py │ │ │ ├── anagrams_by_groupbykey.log │ │ │ ├── anagrams_by_groupbykey.py │ │ │ ├── anagrams_by_reducebykey.log │ │ │ ├── anagrams_by_reducebykey.py │ │ │ └── sample_document.txt │ │ └── scala │ │ │ ├── README.md │ │ │ ├── build.gradle │ │ │ ├── data │ │ │ └── sample_document.txt │ │ │ ├── gradle │ │ │ └── wrapper │ │ │ │ ├── gradle-wrapper.jar │ │ │ │ └── gradle-wrapper.properties │ │ │ ├── gradlew │ │ │ ├── gradlew.bat │ │ │ ├── run_spark_applications_scripts │ │ │ ├── anagrams_by_combine_by_key.sh │ │ │ ├── anagrams_by_group_by_key.sh │ │ │ └── anagrams_by_reduce_by_key.sh │ │ │ ├── settings.gradle │ │ │ └── src │ │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ └── scala │ │ │ └── org │ │ │ └── data │ │ │ └── algorithms │ │ │ └── spark │ │ │ └── bonuschapter │ │ │ ├── AnagramsByCombineByKey.scala │ │ │ ├── AnagramsByGroupByKey.scala │ │ │ └── AnagramsByReduceByKey.scala │ ├── correlation │ │ ├── python │ │ │ ├── README.md │ │ │ ├── all_versus_all_correlation_dataframe.log │ │ │ ├── all_versus_all_correlation_dataframe.py │ │ │ ├── all_versus_all_correlation_rdd.log │ │ │ ├── all_versus_all_correlation_rdd.py │ │ │ ├── rdd_cartesian_in_action.txt │ │ │ └── sample_input.txt │ │ └── scala │ │ │ ├── build.gradle │ │ │ ├── gradle │ │ │ └── wrapper │ │ │ │ ├── gradle-wrapper.jar │ │ │ │ └── gradle-wrapper.properties │ │ │ ├── gradlew │ │ │ ├── gradlew.bat │ │ │ ├── settings.gradle │ │ │ └── src │ │ │ └── main │ │ │ 
├── resources │ │ │ └── log4j.properties │ │ │ └── scala │ │ │ └── org │ │ │ └── data │ │ │ └── algorithms │ │ │ └── spark │ │ │ └── bonuschapter │ │ │ └── AllVersusAllCorrelationDataframe.scala │ ├── dataframes │ │ ├── README.md │ │ ├── arrays_in_dataframes │ │ │ ├── python │ │ │ │ ├── README.md │ │ │ │ ├── creating_arrays_in_dataframes.log │ │ │ │ └── creating_arrays_in_dataframes.py │ │ │ └── scala │ │ │ │ └── README.md │ │ ├── dataframe_tutorial_from_collection.py.md │ │ ├── dataframe_tutorial_from_text_files.py.md │ │ └── explode_arrays_into_rows │ │ │ ├── python │ │ │ ├── README.md │ │ │ ├── explode_arrays_into_rows.log │ │ │ └── explode_arrays_into_rows.py │ │ │ └── scala │ │ │ └── README.md │ ├── join │ │ ├── README.md │ │ ├── python │ │ │ ├── README.md │ │ │ ├── dataframe_join_cross.log │ │ │ ├── dataframe_join_cross.py │ │ │ ├── dataframe_join_inner.log │ │ │ ├── dataframe_join_inner.py │ │ │ ├── dataframe_join_left.log │ │ │ ├── dataframe_join_left.py │ │ │ ├── dataframe_join_right.py │ │ │ ├── rdd_join_inner.log │ │ │ ├── rdd_join_inner.py │ │ │ ├── rdd_join_left.log │ │ │ ├── rdd_join_left.py │ │ │ ├── rdd_join_right.log │ │ │ └── rdd_join_right.py │ │ └── scala │ │ │ └── README.md │ ├── k-mers │ │ ├── python │ │ │ ├── README.md │ │ │ ├── kmer_fasta.py │ │ │ ├── kmer_fastq.py │ │ │ ├── sample_1.fasta │ │ │ └── sample_1.fastq │ │ └── scala │ │ │ ├── README.md │ │ │ ├── build.gradle │ │ │ ├── data │ │ │ ├── sample_1.fasta │ │ │ └── sample_1.fastq │ │ │ ├── gradle │ │ │ └── wrapper │ │ │ │ ├── gradle-wrapper.jar │ │ │ │ └── gradle-wrapper.properties │ │ │ ├── gradlew │ │ │ ├── gradlew.bat │ │ │ ├── run_spark_applications_scripts │ │ │ ├── kmer_fast_q.sh │ │ │ └── kmer_fasta.sh │ │ │ ├── settings.gradle │ │ │ └── src │ │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ └── scala │ │ │ └── org │ │ │ └── data │ │ │ └── algorithms │ │ │ └── spark │ │ │ └── bonuschapter │ │ │ ├── KMERFastQ.scala │ │ │ └── KMERFasta.scala │ ├── lambda_expressions │ │ ├── Lambda_Expressions.pdf │ │ ├── Lambda_Expressions_basics.md │ │ └── README.md │ ├── mappartitions │ │ ├── README.md │ │ ├── SAMPLE_INPUT_FILES │ │ │ ├── file1.txt │ │ │ ├── file2.txt │ │ │ └── file3.txt │ │ ├── images │ │ │ ├── mappartitions_image_1.drawio.png │ │ │ └── mappartitions_image_2.drawio.png │ │ ├── python │ │ │ ├── README.md │ │ │ ├── find_min_max_by_mappartitions.log │ │ │ ├── find_min_max_by_mappartitions.py │ │ │ └── find_min_max_by_mappartitions.sh │ │ └── scala │ │ │ └── README.md │ ├── physical_partitioning │ │ ├── README.md │ │ ├── continents_countries_temp.csv │ │ ├── data_partitioning.png │ │ ├── partition_by_one_column.py │ │ ├── partition_by_one_column_schema.sql │ │ ├── partition_by_two_columns.py │ │ └── partition_by_two_columns_schema.sql │ ├── pyspark_tutorial │ │ ├── README.md │ │ └── pyspark_tutorial.md │ └── wordcount │ │ ├── README.md │ │ ├── python │ │ ├── README.md │ │ ├── data │ │ │ ├── foxdata.txt │ │ │ └── sample_document.txt │ │ ├── word_count_by_dataframe.log │ │ ├── word_count_by_dataframe.py │ │ ├── word_count_by_dataframe_shorthand.log │ │ ├── word_count_by_dataframe_shorthand.py │ │ ├── wordcount_by_combinebykey.log │ │ ├── wordcount_by_combinebykey.py │ │ ├── wordcount_by_combinebykey.sh │ │ ├── wordcount_by_groupbykey.py │ │ ├── wordcount_by_groupbykey.sh │ │ ├── wordcount_by_groupbykey_shorthand.py │ │ ├── wordcount_by_groupbykey_shorthand.sh │ │ ├── wordcount_by_reducebykey.py │ │ ├── wordcount_by_reducebykey.sh │ │ ├── wordcount_by_reducebykey_shorthand.py │ │ ├── 
wordcount_by_reducebykey_shorthand.sh │ │ ├── wordcount_by_reducebykey_with_filter.py │ │ └── wordcount_by_reducebykey_with_filter.sh │ │ ├── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── data │ │ │ └── sample_document.txt │ │ ├── gradle │ │ │ └── wrapper │ │ │ │ ├── gradle-wrapper.jar │ │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── run_spark_applications_scripts │ │ │ ├── word_count_by_group_by_key.sh │ │ │ ├── word_count_by_group_by_key_shorthand.sh │ │ │ ├── word_count_by_reduce_by_key.sh │ │ │ ├── word_count_by_reduce_by_key_shorthand.sh │ │ │ └── word_count_by_reduce_by_key_with_filter.sh │ │ ├── settings.gradle │ │ └── src │ │ │ └── main │ │ │ ├── resources │ │ │ └── log4j.properties │ │ │ └── scala │ │ │ └── org │ │ │ └── data │ │ │ └── algorithms │ │ │ └── spark │ │ │ └── bonuschapter │ │ │ ├── WordCountByGroupByKey.scala │ │ │ ├── WordCountByGroupByKeyShorthand.scala │ │ │ ├── WordCountByReduceByKey.scala │ │ │ ├── WordCountByReduceByKeyShorthand.scala │ │ │ └── WordCountByReduceByKeyWithFilter.scala │ │ └── word_count_with_mapreduce.png ├── chap01 │ ├── data │ │ ├── census_2010.json │ │ ├── emps.txt │ │ ├── sample_5_records.txt │ │ └── users.txt │ ├── python │ │ ├── average_by_key_use_aggregatebykey.py │ │ ├── average_by_key_use_aggregatebykey.sh │ │ ├── average_by_key_use_combinebykey.py │ │ ├── average_by_key_use_combinebykey.sh │ │ ├── average_by_key_use_foldbykey.py │ │ ├── average_by_key_use_foldbykey.sh │ │ ├── average_by_key_use_groupbykey.py │ │ ├── average_by_key_use_groupbykey.sh │ │ ├── average_by_key_use_reducebykey.py │ │ ├── average_by_key_use_reducebykey.sh │ │ ├── dataframe_action_describe.py │ │ ├── dataframe_action_describe.sh │ │ ├── dataframe_add_column.py │ │ ├── dataframe_add_column.sh │ │ ├── dataframe_drop_column.py │ │ ├── dataframe_drop_column.sh │ │ ├── dataframe_filter.py │ │ ├── dataframe_filter.sh │ │ ├── dataframe_join_cross.py │ │ ├── dataframe_join_cross.sh │ │ ├── dataframe_join_inner.py │ │ ├── dataframe_join_inner.sh │ │ ├── dataframe_join_left.py │ │ ├── dataframe_join_left.sh │ │ ├── dataframe_join_right.py │ │ ├── dataframe_join_right.sh │ │ ├── dataframe_sql.py │ │ ├── dataframe_sql.sh │ │ ├── rdd_transformation_cartesian.py │ │ ├── rdd_transformation_cartesian.sh │ │ ├── rdd_transformation_combinebykey.py │ │ ├── rdd_transformation_combinebykey.sh │ │ ├── rdd_transformation_filter.py │ │ ├── rdd_transformation_filter.sh │ │ ├── rdd_transformation_flatmap.py │ │ ├── rdd_transformation_flatmap.sh │ │ ├── rdd_transformation_groupbykey.py │ │ ├── rdd_transformation_groupbykey.sh │ │ ├── rdd_transformation_join.py │ │ ├── rdd_transformation_join.sh │ │ ├── rdd_transformation_map.py │ │ ├── rdd_transformation_map.sh │ │ ├── rdd_transformation_mappartitions.py │ │ ├── rdd_transformation_mappartitions.sh │ │ ├── rdd_transformation_mappartitions_handle_empty_partitions.py │ │ ├── rdd_transformation_mappartitions_handle_empty_partitions.sh │ │ ├── rdd_transformation_reducebykey.py │ │ ├── rdd_transformation_reducebykey.sh │ │ ├── rdd_transformation_sortby.py │ │ ├── rdd_transformation_sortby.sh │ │ ├── rdd_transformation_takeordered.py │ │ └── rdd_transformation_takeordered.sh │ └── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── run.sh │ │ ├── run_spark_applications_scripts │ │ ├── aggregate_by_key_use_aggregate_by_key.sh │ │ ├── aggregate_by_key_use_combine_by_key.sh │ │ 
├── average_by_key_use_fold_by_key.sh │ │ ├── average_by_key_use_group_by_key.sh │ │ ├── average_by_key_use_reduce_by_key.sh │ │ ├── dataframe_action_describe.sh │ │ ├── dataframe_add_column.sh │ │ ├── dataframe_drop_column.sh │ │ ├── dataframe_filter.sh │ │ ├── dataframe_join_cross.sh │ │ ├── dataframe_join_inner.sh │ │ ├── dataframe_join_left.sh │ │ ├── dataframe_join_right.sh │ │ ├── dataframe_sql.sh │ │ ├── rdd_action_take_ordered.sh │ │ ├── rdd_transformation_cartesian.sh │ │ ├── rdd_transformation_combine_by_key.sh │ │ ├── rdd_transformation_filter.sh │ │ ├── rdd_transformation_flat_map.sh │ │ ├── rdd_transformation_group_by_key.sh │ │ ├── rdd_transformation_join.sh │ │ ├── rdd_transformation_map.sh │ │ ├── rdd_transformation_map_partition.sh │ │ ├── rdd_transformation_mappartitions_handle_empty_partitions.sh │ │ ├── rdd_transformation_reduce_by_key.sh │ │ └── rdd_transformation_sort_by.sh │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ ├── resources │ │ └── log4j.properties │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch01 │ │ ├── AggregateByKeyUseAggregateByKey.scala │ │ ├── AggregateByKeyUseCombineByKey.scala │ │ ├── AverageByKeyUseFoldByKey.scala │ │ ├── AverageByKeyUseGroupByKey.scala │ │ ├── AverageByKeyUseReduceByKey.scala │ │ ├── DataframeActionDescribe.scala │ │ ├── DataframeAddColumn.scala │ │ ├── DataframeDropColumn.scala │ │ ├── DataframeFilter.scala │ │ ├── DataframeJoinCross.scala │ │ ├── DataframeJoinInner.scala │ │ ├── DataframeJoinLeft.scala │ │ ├── DataframeJoinRight.scala │ │ ├── DataframeSQL.scala │ │ ├── RDDActionTakeOrdered.scala │ │ ├── RDDTransformationCartesian.scala │ │ ├── RDDTransformationCombineByKey.scala │ │ ├── RDDTransformationFilter.scala │ │ ├── RDDTransformationFlatMap.scala │ │ ├── RDDTransformationGroupByKey.scala │ │ ├── RDDTransformationJoin.scala │ │ ├── RDDTransformationMap.scala │ │ ├── RDDTransformationMapPartition.scala │ │ ├── RDDTransformationMappartitionsHandleEmptyPartitions.scala │ │ ├── RDDTransformationReduceByKey.scala │ │ └── RDDTransformationSortBy.scala ├── chap02 │ ├── python │ │ ├── DNA-FASTA-PERFORMANCE │ │ │ └── performance_of_FASTA_versions_1_2_3.txt │ │ ├── DNA-FASTA-V1 │ │ │ ├── dna_base_count_ver_1.py │ │ │ ├── dna_base_count_ver_1.sh │ │ │ ├── dna_base_count_ver_1_1GB.sh │ │ │ └── dna_base_count_ver_1_big.sh │ │ ├── DNA-FASTA-V2 │ │ │ ├── dna_base_count_ver_2.py │ │ │ ├── dna_base_count_ver_2.sh │ │ │ ├── dna_base_count_ver_2_1GB.sh │ │ │ └── dna_base_count_ver_2_big.sh │ │ ├── DNA-FASTA-V3 │ │ │ ├── dna_base_count_ver_3.py │ │ │ ├── dna_base_count_ver_3.sh │ │ │ ├── dna_base_count_ver_3_1GB.sh │ │ │ └── dna_base_count_ver_3_big.sh │ │ ├── DNA-FASTQ │ │ │ ├── dna_base_count_fastq.py │ │ │ └── dna_base_count_fastq.sh │ │ ├── README.md │ │ └── data │ │ │ ├── sample.fasta │ │ │ └── sp1.fastq │ └── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── data │ │ ├── sample.fasta │ │ └── sp1.fastq │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── run_spark_applications_scripts │ │ ├── dna_base_count_fastq.sh │ │ ├── dna_base_count_ver_1.sh │ │ ├── dna_base_count_ver_1_1GB.sh │ │ ├── dna_base_count_ver_1_big.sh │ │ ├── dna_base_count_ver_2.sh │ │ ├── dna_base_count_ver_2_1GB.sh │ │ ├── dna_base_count_ver_2_big.sh │ │ ├── dna_base_count_ver_3.sh │ │ ├── dna_base_count_ver_3_1GB.sh │ │ └── dna_base_count_ver_3_big.sh │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ ├── resources │ │ ├── input.txt │ 
│ ├── log4j.properties │ │ └── sp1.fastq │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch02 │ │ ├── DNABaseCountFastq.scala │ │ ├── DNABaseCountVER1.scala │ │ ├── DNABaseCountVER3.scala │ │ └── DNSBaseCountVER2.scala ├── chap03 │ ├── python │ │ ├── bigrams_input.txt │ │ ├── flatmap_transformation_1_from_collection.py │ │ ├── flatmap_transformation_1_from_file.py │ │ ├── map_transformation_1_from_collection.py │ │ ├── map_transformation_1_from_file.py │ │ ├── mappartitions_transformation_1.py │ │ ├── mapvalues_transformation_1.py │ │ ├── mapvalues_transformation_2.py │ │ ├── mapvalues_transformation_3.py │ │ ├── sample_input │ │ └── sample_numbers.txt │ └── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── data │ │ ├── bigrams_input.txt │ │ ├── sample_input.csv │ │ └── sample_numbers.txt │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── run.sh │ │ ├── run_spark_applications_scripts │ │ ├── flat_map_transformation_1_from_file.sh │ │ ├── flatmap_transformation_1_from_collection.sh │ │ ├── map_partitions_transformation_1.sh │ │ ├── map_transformation_1_from_collection.sh │ │ ├── map_transformation_1_from_file.sh │ │ ├── map_values_transformation_1.sh │ │ ├── map_values_transformation_2.sh │ │ └── map_values_transformation_3.sh │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ ├── resources │ │ ├── log4j.properties │ │ └── sample_numbers.txt │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch03 │ │ ├── FlatMapTransformation1FromFile.scala │ │ ├── FlatmapTransformation1FromCollection.scala │ │ ├── MapPartitionsTransformation1.scala │ │ ├── MapTransformation1FromCollection.scala │ │ ├── MapTransformation1FromFile.scala │ │ ├── MapValuesTransformation1.scala │ │ ├── MapValuesTransformation2.scala │ │ └── MapValuesTransformation3.scala ├── chap04 │ ├── python │ │ ├── README.md │ │ ├── average_by_key_use_aggregatebykey.py │ │ ├── average_by_key_use_aggregatebykey.sh │ │ ├── average_by_key_use_combinebykey.py │ │ ├── average_by_key_use_combinebykey.sh │ │ ├── average_by_key_use_groupbykey.py │ │ ├── average_by_key_use_groupbykey.sh │ │ ├── average_by_key_use_reducebykey.py │ │ ├── average_by_key_use_reducebykey.sh │ │ ├── dataframe_median_approx.py │ │ ├── dataframe_median_exact.py │ │ ├── exact_median_by_key_use_aggregatebykey.py │ │ ├── exact_median_by_key_use_combinebykey.py │ │ ├── exact_median_by_key_use_groupbykey.py │ │ └── exact_median_by_key_use_reducebykey.py │ └── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── run.sh │ │ ├── run_spark_applications_scripts │ │ ├── average_by_key_use_aggregate_by_key.sh │ │ ├── average_by_key_use_combine_by_key.sh │ │ ├── average_by_key_use_group_by_key.sh │ │ ├── average_by_key_use_reduce_by_key.sh │ │ ├── dataframe_median_approx.sh │ │ ├── dataframe_median_exact.sh │ │ ├── exact_median_by_key_use_aggregate_by_key.sh │ │ ├── exact_median_by_key_use_combine_by_key.sh │ │ ├── exact_median_by_key_use_group_by_key.sh │ │ └── exact_median_by_key_use_reduce_by_key.sh │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ ├── resources │ │ └── log4j.properties │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch04 │ │ ├── AverageByKeyUseAggregateByKey.scala │ │ ├── AverageByKeyUseCombineByKey.scala │ │ ├── AverageByKeyUseGroupByKey.scala │ │ 
├── AverageByKeyUseReduceByKey.scala │ │ ├── DataframeMedianApprox.scala │ │ ├── DataframeMedianExact.scala │ │ ├── ExactMedianByKeyUseAggregateByKey.scala │ │ ├── ExactMedianByKeyUseCombineByKey.scala │ │ ├── ExactMedianByKeyUseGroupByKey.scala │ │ └── ExactMedianByKeyUseReduceByKey.scala ├── chap05 │ ├── Partitions_and_Executors.md │ ├── README.md │ ├── python │ │ ├── README.md │ │ ├── customers.RECORD.FORMAT.txt │ │ ├── customers.txt │ │ ├── customers_with_date.RECORD.FORMAT.txt │ │ ├── customers_with_date.txt │ │ ├── partition_data_as_text_by_year_month.log │ │ ├── partition_data_as_text_by_year_month.py │ │ ├── partition_data_by_customer_and_year.log │ │ ├── partition_data_by_customer_and_year.py │ │ ├── partition_data_by_customer_and_year.sh │ │ └── partition_data_by_customer_and_year_single_file.py │ └── scala │ │ └── README.md ├── chap06 │ ├── README.md │ ├── python │ │ ├── airports.json │ │ ├── breadth_first_search_example.log │ │ ├── breadth_first_search_example.py │ │ ├── breadth_first_search_example.sh │ │ ├── connected_component_example.log │ │ ├── connected_component_example.py │ │ ├── connected_component_example.sh │ │ ├── graph_builder.log │ │ ├── graph_builder.py │ │ ├── graph_builder.sh │ │ ├── label_propagation_algorithm_example.log │ │ ├── label_propagation_algorithm_example.py │ │ ├── label_propagation_algorithm_example.sh │ │ ├── pagerank.py │ │ ├── pagerank_data.txt │ │ ├── pagerank_example.log │ │ ├── pagerank_example.py │ │ ├── pagerank_example.sh │ │ ├── sample_graph_edges.txt │ │ ├── sample_graph_vertices.txt │ │ ├── shortest_path_finder.log │ │ ├── shortest_path_finder.py │ │ ├── shortest_path_finder.sh │ │ ├── triangles_counter.log │ │ ├── triangles_counter.py │ │ ├── triangles_counter.sh │ │ ├── unique_triangles_finder.log │ │ ├── unique_triangles_finder.py │ │ └── unique_triangles_finder.sh │ └── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch06 │ │ └── BreadthFirstSearchExample.scala ├── chap07 │ ├── python │ │ ├── cats.no.header.csv │ │ ├── cats.with.header.csv │ │ ├── datasource_csv_reader_no_header.py │ │ ├── datasource_csv_reader_no_header.sh │ │ ├── datasource_csv_reader_with_header.py │ │ ├── datasource_csv_reader_with_header.sh │ │ ├── datasource_csv_writer.py │ │ ├── datasource_csv_writer.sh │ │ ├── datasource_elasticsearch_reader.py │ │ ├── datasource_elasticsearch_reader.sh │ │ ├── datasource_elasticsearch_writer.py │ │ ├── datasource_elasticsearch_writer.sh │ │ ├── datasource_gzip_reader.py │ │ ├── datasource_gzip_reader.sh │ │ ├── datasource_jdbc_reader.py │ │ ├── datasource_jdbc_reader.sh │ │ ├── datasource_jdbc_writer.py │ │ ├── datasource_jdbc_writer.sh │ │ ├── datasource_json_reader_multi_line.py │ │ ├── datasource_json_reader_multi_line.sh │ │ ├── datasource_json_reader_single_line.py │ │ ├── datasource_json_reader_single_line.sh │ │ ├── datasource_mongodb_reader.py │ │ ├── datasource_mongodb_reader.sh │ │ ├── datasource_mongodb_writer.py │ │ ├── datasource_mongodb_writer.sh │ │ ├── datasource_redis_reader.py │ │ ├── datasource_redis_reader.sh │ │ ├── datasource_redis_writer.py │ │ ├── datasource_redis_writer.sh │ │ ├── datasource_textfile_reader.py │ │ ├── datasource_textfile_reader.sh │ │ ├── datasource_textfile_writer.py │ │ ├── datasource_textfile_writer.sh │ │ ├── images │ │ │ ├── cat1.jpg │ │ │ ├── cat2.jpg │ │ │ ├── cat3.jpg │ │ │ ├── cat4.jpg │ │ │ ├── duck1.jpg │ │ │ ├── 
duck2.jpg │ │ │ └── not-image.txt │ │ ├── mongodb_coll44.png │ │ ├── mongodb_coll66.png │ │ ├── name_age_salary.csv │ │ ├── people.txt │ │ ├── sample_multi_line.json │ │ ├── sample_no_header.csv │ │ ├── sample_numbers.txt │ │ ├── sample_single_line.json │ │ ├── sample_with_header.csv │ │ └── twitter.avro │ └── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── data │ │ ├── sample_multi_line.json │ │ ├── sample_no_header.csv │ │ ├── sample_no_header.csv.gz │ │ ├── sample_numbers.txt │ │ ├── sample_single_line.json │ │ └── sample_with_header.csv │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── run.sh │ │ ├── run_spark_applications_scripts │ │ ├── datasource_csv_reader_header.sh │ │ ├── datasource_csv_reader_no_header.sh │ │ ├── datasource_csv_writer.sh │ │ ├── datasource_elasticsearch_reader.sh │ │ ├── datasource_elasticsearch_writer.sh │ │ ├── datasource_gzip_reader.sh │ │ ├── datasource_jdbc_reader.sh │ │ ├── datasource_jdbc_writer.sh │ │ ├── datasource_json_reader_multi_line.sh │ │ ├── datasource_json_reader_single_line.sh │ │ ├── datasource_mongodb_reader.sh │ │ ├── datasource_mongodb_writer.sh │ │ ├── datasource_redis_reader.sh │ │ ├── datasource_redis_writer.sh │ │ ├── datasource_textfile_reader.sh │ │ └── datasource_textfile_writer.sh │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ ├── resources │ │ └── log4j.properties │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch07 │ │ ├── DatasourceCSVReaderHeader.scala │ │ ├── DatasourceCSVReaderNoHeader.scala │ │ ├── DatasourceCSVWriter.scala │ │ ├── DatasourceElasticsearchReader.scala │ │ ├── DatasourceElasticsearchWriter.scala │ │ ├── DatasourceGZIPReader.scala │ │ ├── DatasourceJDBCReader.scala │ │ ├── DatasourceJDBCWriter.scala │ │ ├── DatasourceJSONReaderMultiLine.scala │ │ ├── DatasourceJSONReaderSingleLine.scala │ │ ├── DatasourceMongodbReader.scala │ │ ├── DatasourceMongodbWriter.scala │ │ ├── DatasourceRedisReader.scala │ │ ├── DatasourceRedisWriter.scala │ │ ├── DatasourceTextfileReader.scala │ │ └── DatasourceTextfileWriter.scala ├── chap08 │ ├── python │ │ ├── page_rank │ │ │ ├── page_rank.py │ │ │ └── pagerank_2.py │ │ └── rank_product │ │ │ ├── rank_product_using_combinebykey.py │ │ │ ├── rank_product_using_combinebykey.sh │ │ │ ├── rank_product_using_groupbykey.py │ │ │ ├── rank_product_using_groupbykey.sh │ │ │ └── sample_input │ │ │ ├── rp1.txt │ │ │ ├── rp2.txt │ │ │ └── rp3.txt │ └── scala │ │ ├── README.md │ │ ├── build.gradle │ │ ├── data │ │ ├── sample_input │ │ │ ├── rp1.txt │ │ │ ├── rp2.txt │ │ │ └── rp3.txt │ │ └── urls.txt │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── run_spark_applications_scripts │ │ ├── page_rank.sh │ │ ├── rank_product_using_combine_by_key.sh │ │ └── rank_product_using_group_by_key.sh │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ ├── resources │ │ └── log4j.properties │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch08 │ │ ├── PageRank.scala │ │ ├── RankProductUsingCombineByKey.scala │ │ └── RankProductUsingGroupByKey.scala ├── chap09 │ ├── python │ │ └── README.md │ └── scala │ │ └── README.md ├── chap10 │ ├── README.md │ ├── data_design_patterns.md │ ├── data_design_patterns.md.save │ ├── data_design_patterns.pdf │ ├── python │ │ ├── average_monoid_use_aggregatebykey.py │ │ ├── average_monoid_use_aggregatebykey.sh │ │ ├── 
average_monoid_use_combinebykey.py │ │ ├── average_monoid_use_combinebykey.sh │ │ ├── average_monoid_use_groupbykey.py │ │ ├── average_monoid_use_groupbykey.sh │ │ ├── average_monoid_use_reducebykey.py │ │ ├── average_monoid_use_reducebykey.sh │ │ ├── dna_base_count_basic_using_combinebykey.py │ │ ├── dna_base_count_basic_using_combinebykey.sh │ │ ├── dna_base_count_basic_using_groupbykey.py │ │ ├── dna_base_count_basic_using_groupbykey.sh │ │ ├── dna_base_count_basic_using_reducebykey.py │ │ ├── dna_base_count_basic_using_reducebykey.sh │ │ ├── dna_base_count_inmapper_combiner_using_combinebykey.py │ │ ├── dna_base_count_inmapper_combiner_using_combinebykey.sh │ │ ├── dna_base_count_inmapper_combiner_using_groupbykey.py │ │ ├── dna_base_count_inmapper_combiner_using_groupbykey.sh │ │ ├── dna_base_count_inmapper_combiner_using_reducebykey.py │ │ ├── dna_base_count_inmapper_combiner_using_reducebykey.sh │ │ ├── dna_base_count_using_mappartitions.py │ │ ├── dna_base_count_using_mappartitions.sh │ │ ├── inmapper_combiner_local_aggregation.py │ │ ├── inmapper_combiner_local_aggregation.sh │ │ ├── inmapper_combiner_use_basic_mapreduce.py │ │ ├── inmapper_combiner_use_basic_mapreduce.sh │ │ ├── inmapper_combiner_use_mappartitions.py │ │ ├── inmapper_combiner_use_mappartitions.sh │ │ ├── minmax_force_empty_partitions.py │ │ ├── minmax_force_empty_partitions.sh │ │ ├── minmax_use_mappartitions.py │ │ ├── minmax_use_mappartitions.sh │ │ ├── minmax_use_mappartitions_v2.py │ │ ├── minmax_use_mappartitions_v2.sh │ │ ├── sample_dna_seq.txt │ │ ├── sample_input.txt │ │ ├── sample_numbers.txt │ │ ├── structured_to_hierarchical_to_xml_dataframe.py │ │ ├── structured_to_hierarchical_to_xml_rdd.py │ │ ├── top_N_use_mappartitions.py │ │ ├── top_N_use_mappartitions.sh │ │ ├── top_N_use_takeordered.py │ │ └── top_N_use_takeordered.sh │ └── scala │ │ ├── .gitignore │ │ ├── README.md │ │ ├── build.gradle │ │ ├── data │ │ ├── sample_dna_seq.txt │ │ ├── sample_input.txt │ │ └── sample_numbers.txt │ │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ │ ├── gradlew │ │ ├── gradlew.bat │ │ ├── out │ │ └── production │ │ │ └── resources │ │ │ └── log4j.properties │ │ ├── run.sh │ │ ├── run_spark_applications_scripts │ │ ├── average_monoid_use_aggregate_by_key.sh │ │ ├── average_monoid_use_combine_by_key.sh │ │ ├── average_monoid_use_group_by_key.sh │ │ ├── average_monoid_use_reduce_by_key.sh │ │ ├── dna_base_count_basic_in_mapper_combiner_using_combine_by_key.sh │ │ ├── dna_base_count_basic_in_mapper_combiner_using_group_by_key.sh │ │ ├── dna_base_count_basic_in_mapper_combiner_using_reduce_by_key.sh │ │ ├── dna_base_count_basic_using_combine_by_key.sh │ │ ├── dna_base_count_basic_using_group_by_key.sh │ │ ├── dna_base_count_basic_using_mappartitions.sh │ │ ├── dna_base_count_basic_using_reduce_by_key.sh │ │ ├── in_mapper_combiner_use_mappartitions.sh │ │ ├── in_mapper_combiner_using_local_aggregation.sh │ │ ├── in_mapper_combiner_using_map_reduce.sh │ │ ├── min_max_force_empty_partitions.sh │ │ ├── min_max_use_mappartitions.sh │ │ ├── structured_to_hierarchical_to_xml_dataframe.sh │ │ ├── structured_to_hierarchical_to_xml_rdd.sh │ │ ├── top_n_use_map_partitions.sh │ │ └── top_n_use_take_ordered.sh │ │ ├── settings.gradle │ │ └── src │ │ └── main │ │ ├── resources │ │ └── log4j.properties │ │ └── scala │ │ └── org │ │ └── data │ │ └── algorithms │ │ └── spark │ │ └── ch10 │ │ ├── AverageMonoidUseAggregateByKey.scala │ │ ├── AverageMonoidUseCombineByKey.scala │ │ ├── 
AverageMonoidUseGroupByKey.scala │ │ ├── AverageMonoidUseReduceByKey.scala │ │ ├── DNABaseCountBasicInMapperCombinerUsingCombineByKey.scala │ │ ├── DNABaseCountBasicInMapperCombinerUsingGroupByKey.scala │ │ ├── DNABaseCountBasicInMapperCombinerUsingReduceByKey.scala │ │ ├── DNABaseCountBasicUsingCombineByKey.scala │ │ ├── DNABaseCountBasicUsingGroupByKey.scala │ │ ├── DNABaseCountBasicUsingMappartitions.scala │ │ ├── DNABaseCountBasicUsingReduceByKey.scala │ │ ├── InMapperCombinerUseMappartitions.scala │ │ ├── InMapperCombinerUsingLocalAggregation.scala │ │ ├── InMapperCombinerUsingMapReduce.scala │ │ ├── MinMaxForceEmptyPartitions.scala │ │ ├── MinMaxUseMappartitions.scala │ │ ├── StructuredToHierarchicalToXmlDataframe.scala │ │ ├── StructuredToHierarchicalToXmlRDD.scala │ │ ├── TopNUseMapPartitions.scala │ │ └── TopNUseTakeOrdered.scala ├── chap11 │ ├── python │ │ ├── README.md │ │ ├── inner_join_dataframe_spark.py │ │ ├── inner_join_in_mapreduce.py │ │ ├── inner_join_rdd_spark.py │ │ ├── left_join_dataframe_spark.py │ │ ├── left_join_in_mapreduce.py │ │ ├── left_join_rdd_spark.py │ │ ├── right_join_dataframe_spark.py │ │ ├── right_join_in_mapreduce.py │ │ └── right_join_rdd_spark.py │ └── scala │ │ └── README.md ├── chap12 │ ├── python │ │ └── README.md │ └── scala │ │ └── README.md └── jars │ ├── avro-mapred-1.7.7-hadoop1.jar │ ├── avro-mapred-1.7.7-hadoop2.jar │ ├── com-cotdp-hadoop-1.0-SNAPSHOT.jar │ ├── elasticsearch-hadoop-6.4.2.jar │ ├── elasticsearch-spark_2.11-2.4.5.jar │ ├── graphframes-0.6.0-spark2.3-s_2.11.jar │ ├── hbase-spark-connector-1.0.0.jar │ ├── htrace-core-3.1.0-incubating.jar │ ├── mongo-java-driver-3.8.2.jar │ ├── mongo-spark-connector_2.11-2.2.5.jar │ ├── mongodb-driver-3.8.2.jar │ ├── mysql-connector-java-5.1.42.jar │ ├── shc-core-1.1.3-2.3-s_2.11.jar │ ├── shc-examples-1.1.3-2.3-s_2.11.jar │ ├── spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar │ └── spark-redis-2.3.1-SNAPSHOT.jar ├── data ├── chap02 │ ├── NC_000907.1.fasta │ ├── README.md │ ├── human_g1k_v37_chr1_59kb.fasta │ ├── sample.fasta │ └── sp1.fastq └── chap06 │ └── flightdata2018.json ├── docs ├── FOREWORD_by_Dr_Matei_Zaharia.md ├── goal_of_book.md └── story_of_book.md ├── images ├── Data-Algorithms-with-Spark_mech2.pdf ├── Data-Algorithms-with-Spark_mech2.png ├── Data_Algorithms_with_Spark_COVER_9781492082385.jpg ├── Data_Algorithms_with_Spark_COVER_9781492082385.png ├── FOREWORD_by_Dr_Matei_Zaharia.md ├── anagram.png ├── book_cover_final.pdf ├── correlation-coefficient.png ├── data-alg-foreword2.docx ├── data-alg-foreword2.pdf ├── data_algorithms_hard_copy_image.jpg ├── data_algorithms_with_spark.jpg ├── data_algorithms_with_spark.pdf ├── data_algorithms_with_spark_amazon.jpg ├── data_algorithms_with_spark_knowledge_is_power.jpeg ├── data_algorithms_with_spark_small.jpeg ├── joins-in-SQL.jpeg ├── kmer.jpg ├── kmer_4.png ├── mappartitions_image_1.drawio.png ├── mappartitions_image_2.drawio.png └── sql_joins.png └── wiki-spark ├── README.md └── docs ├── dataframe_to_rdd.md ├── duplicate_removal_dataframe.md ├── duplicate_removal_rdd.md ├── explode_function.md ├── flatmap_transformation.md ├── how-to-use-UDF-in-spark.md ├── lambda_expressions.adoc ├── lambda_expressions.pdf ├── lambda_expressions_basics.md ├── monoid ├── README.md ├── monoid.md └── monoid_math.png ├── rdd_to_dataframe.md ├── reduce-the-verbosity-of-spark-runtime-output.md ├── spark-explode.png ├── spark-flatmap.png ├── using-graphframes-with-jupyter.demo.png ├── using-graphframes-with-jupyter.md └── wiki.jpeg /.gitignore: 
-------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea/ 3 | build 4 | .gradle 5 | .idea 6 | !gradle-wrapper.jar -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/python/README.md: -------------------------------------------------------------------------------- 1 | # TF-IDF 2 | 3 | TF-IDF (term frequency-inverse document frequency) is a 4 | statistical measure that evaluates how relevant a word is 5 | to a document in a collection of documents. 6 | This is done by multiplying two metrics: how many times a 7 | word appears in a document, and the inverse document 8 | frequency of the word across a set of documents. 9 | 10 | # References 11 | 12 | 1. [Introduction to TF-IDF](https://github.com/mahmoudparsian/machine-learning-course/blob/master/docs/TF-IDF/README.md) 13 | 14 | 2. [TF(Term Frequency)-IDF(Inverse Document Frequency) from scratch in python](https://towardsdatascience.com/tf-term-frequency-idf-inverse-document-frequency-from-scratch-in-python-6c2b61b78558) 15 | 16 | 3. [Understanding TF-ID: A Simple Introduction](https://monkeylearn.com/blog/what-is-tf-idf/) 17 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/python/data/doc1: -------------------------------------------------------------------------------- 1 | a crazy fox jumped over red fox and jumped over fox 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/python/data/doc2: -------------------------------------------------------------------------------- 1 | dogs are the best friend of red fox and I like dogs 2 | dogs are good 3 | 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/python/data/doc3: -------------------------------------------------------------------------------- 1 | I do not like cleaning and cooking but I like dogs 2 | I like playing tennis 3 | 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/python/data/doc4: -------------------------------------------------------------------------------- 1 | computer science is great 2 | fox jumped 3 | 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.bonus_chapter' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/TF-IDF/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-bonus_chapter_tfidf' 2 | 3 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/src/main/data/doc1: -------------------------------------------------------------------------------- 1 | a crazy fox jumped over red fox and jumped over fox 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/src/main/data/doc2: -------------------------------------------------------------------------------- 1 | dogs are the best friend of red fox and I like dogs 2 | dogs are good 3 | 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/src/main/data/doc3: -------------------------------------------------------------------------------- 1 | I do not like cleaning and cooking but I like dogs 2 | I like playing tennis 3 | 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/TF-IDF/scala/src/main/data/doc4: -------------------------------------------------------------------------------- 1 | computer science is great 2 | fox jumped 3 | 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/python/Top_N_movies_Dataframe_using_API.sh: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------- 2 | begin=`/bin/date` 3 | echo "begin=${begin}" 4 | START_TIME=$(date +%s) 5 | #command block that takes time to complete... 6 | #--------------------------------------------------------- 7 | # 8 | export SPARK_HOME="/Users/mparsian/spark-3.3.0" 9 | export PROG="Top_N_movies_Dataframe_using_API.py" 10 | export INPUT_PATH="/Users/mparsian/Downloads/ml-25m" 11 | export ratings="${INPUT_PATH}/ratings.csv" 12 | export movies="${INPUT_PATH}/movies.csv" 13 | export rating_threshold="0" 14 | # 15 | # 16 | # for top 10 17 | export N=10 18 | # 19 | # sys.argv[x] 1 2 3 4 20 | $SPARK_HOME/bin/spark-submit ${PROG} ${N} ${ratings} ${movies} ${rating_threshold} 21 | # 22 | #--------------------------------------------------------- 23 | end=`/bin/date` 24 | echo "end=${end}" 25 | END_TIME=$(date +%s) 26 | echo "elapsed time: $((${END_TIME} - ${START_TIME})) seconds to complete this task." 
27 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/python/Top_N_movies_Dataframe_using_SQL.sh: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------- 2 | begin=`/bin/date` 3 | echo "begin=${begin}" 4 | START_TIME=$(date +%s) 5 | #command block that takes time to complete... 6 | #--------------------------------------------------------- 7 | # 8 | export SPARK_HOME="/Users/mparsian/spark-3.3.0" 9 | export PROG="Top_N_movies_Dataframe_using_SQL.py" 10 | export INPUT_PATH="/Users/mparsian/Downloads/ml-25m" 11 | export ratings="${INPUT_PATH}/ratings.csv" 12 | export movies="${INPUT_PATH}/movies.csv" 13 | export rating_threshold="0" 14 | # 15 | # 16 | # for top 10 17 | export N=10 18 | # 19 | # sys.argv[x] 1 2 3 4 20 | $SPARK_HOME/bin/spark-submit ${PROG} ${N} ${ratings} ${movies} ${rating_threshold} 21 | # 22 | #--------------------------------------------------------- 23 | end=`/bin/date` 24 | echo "end=${end}" 25 | END_TIME=$(date +%s) 26 | echo "elapsed time: $((${END_TIME} - ${START_TIME})) seconds to complete this task." 27 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/python/Top_N_movies_RDD_using_combineByKey.sh: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------- 2 | begin=`/bin/date` 3 | echo "begin=${begin}" 4 | START_TIME=$(date +%s) 5 | #command block that takes time to complete... 6 | #--------------------------------------------------------- 7 | # 8 | export SPARK_HOME="/Users/mparsian/spark-3.3.0" 9 | export PROG="Top_N_movies_RDD_using_combineByKey.py" 10 | export INPUT_PATH="/Users/mparsian/Downloads/ml-25m" 11 | export ratings="${INPUT_PATH}/ratings.csv" 12 | export movies="${INPUT_PATH}/movies.csv" 13 | export rating_threshold="0" 14 | # 15 | # for top 10 16 | export N=10 17 | # 18 | # sys.argv[x] 1 2 3 4 19 | $SPARK_HOME/bin/spark-submit ${PROG} ${N} ${ratings} ${movies} ${rating_threshold} 20 | # 21 | #--------------------------------------------------------- 22 | end=`/bin/date` 23 | echo "end=${end}" 24 | END_TIME=$(date +%s) 25 | echo "elapsed time: $((${END_TIME} - ${START_TIME})) seconds to complete this task." 26 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/python/Top_N_movies_RDD_using_groupByKey.sh: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------- 2 | begin=`/bin/date` 3 | echo "begin=${begin}" 4 | START_TIME=$(date +%s) 5 | #command block that takes time to complete... 
6 | #--------------------------------------------------------- 7 | # 8 | export SPARK_HOME="/Users/mparsian/spark-3.3.0" 9 | export PROG="Top_N_movies_RDD_using_groupByKey.py" 10 | export INPUT_PATH="/Users/mparsian/Downloads/ml-25m" 11 | export ratings="${INPUT_PATH}/ratings.csv" 12 | export movies="${INPUT_PATH}/movies.csv" 13 | export rating_threshold="0" 14 | # 15 | # for top 10 16 | export N=10 17 | # 18 | # sys.argv[x] 1 2 3 4 19 | $SPARK_HOME/bin/spark-submit ${PROG} ${N} ${ratings} ${movies} ${rating_threshold} 20 | # 21 | #--------------------------------------------------------- 22 | end=`/bin/date` 23 | echo "end=${end}" 24 | END_TIME=$(date +%s) 25 | echo "elapsed time: $((${END_TIME} - ${START_TIME})) seconds to complete this task." 26 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/python/Top_N_movies_RDD_using_reduceByKey.sh: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------- 2 | begin=`/bin/date` 3 | echo "begin=${begin}" 4 | START_TIME=$(date +%s) 5 | #command block that takes time to complete... 6 | #--------------------------------------------------------- 7 | # 8 | export SPARK_HOME="/Users/mparsian/spark-3.3.0" 9 | export PROG="Top_N_movies_RDD_using_reduceByKey.py" 10 | export INPUT_PATH="/Users/mparsian/Downloads/ml-25m" 11 | export ratings="${INPUT_PATH}/ratings.csv" 12 | export movies="${INPUT_PATH}/movies.csv" 13 | export rating_threshold="0" 14 | # 15 | # for top 10 16 | export N=10 17 | # 18 | # sys.argv[x] 1 2 3 4 19 | $SPARK_HOME/bin/spark-submit ${PROG} ${N} ${ratings} ${movies} ${rating_threshold} 20 | # 21 | #--------------------------------------------------------- 22 | end=`/bin/date` 23 | echo "end=${end}" 24 | END_TIME=$(date +%s) 25 | echo "elapsed time: $((${END_TIME} - ${START_TIME})) seconds to complete this task." 26 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/python/Top_N_movies_RDD_using_takeOrdered.sh: -------------------------------------------------------------------------------- 1 | #--------------------------------------------------------- 2 | begin=`/bin/date` 3 | echo "begin=${begin}" 4 | START_TIME=$(date +%s) 5 | #command block that takes time to complete... 6 | #--------------------------------------------------------- 7 | # 8 | export SPARK_HOME="/Users/mparsian/spark-3.3.0" 9 | export PROG="Top_N_movies_RDD_using_takeOrdered.py" 10 | export INPUT_PATH="/Users/mparsian/Downloads/ml-25m" 11 | export ratings="${INPUT_PATH}/ratings.csv" 12 | export movies="${INPUT_PATH}/movies.csv" 13 | export rating_threshold="0" 14 | # 15 | # for top 10 16 | export N=10 17 | # 18 | # sys.argv[x] 1 2 3 4 19 | $SPARK_HOME/bin/spark-submit ${PROG} ${N} ${ratings} ${movies} ${rating_threshold} 20 | # 21 | #--------------------------------------------------------- 22 | end=`/bin/date` 23 | echo "end=${end}" 24 | END_TIME=$(date +%s) 25 | echo "elapsed time: $((${END_TIME} - ${START_TIME})) seconds to complete this task." 
26 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/scala/README.md: -------------------------------------------------------------------------------- 1 | Scala solutions for Top-N 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/Top-N/top-10.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/Top-N/top-10.jpeg -------------------------------------------------------------------------------- /code/bonus_chapters/UDF/UDF.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/UDF/UDF.pdf -------------------------------------------------------------------------------- /code/bonus_chapters/UDF/python/README.md: -------------------------------------------------------------------------------- 1 | Demo Spark's UDF (user-defined-function) 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/UDF/python/dataframe_UDF_example.log: -------------------------------------------------------------------------------- 1 | % export SPARK_HOME=/home/mparsian/spark-3.2.0 2 | % $SPARK_HOME/bin/spark-submit dataframe_UDF_example.py 3 | 4 | +---+------------+ 5 | |ID |Name | 6 | +---+------------+ 7 | |100|john jones | 8 | |200|tracey smith| 9 | |300|amy sanders | 10 | |400|null | 11 | +---+------------+ 12 | 13 | +---+------------+ 14 | |ID |Name | 15 | +---+------------+ 16 | |100|John Jones | 17 | |200|Tracey Smith| 18 | |300|Amy Sanders | 19 | |400|null | 20 | +---+------------+ 21 | 22 | +---+------------+------------+ 23 | |ID |Name |Upper Name | 24 | +---+------------+------------+ 25 | |100|john jones |JOHN JONES | 26 | |200|tracey smith|TRACEY SMITH| 27 | |300|amy sanders |AMY SANDERS | 28 | |400|null |null | 29 | +---+------------+------------+ 30 | 31 | +---+------------+ 32 | |ID |Name | 33 | +---+------------+ 34 | |100|John Jones | 35 | |200|Tracey Smith| 36 | |300|Amy Sanders | 37 | |400|null | 38 | +---+------------+ 39 | 40 | -------------------------------------------------------------------------------- /code/bonus_chapters/UDF/scala/README.md: -------------------------------------------------------------------------------- 1 | Demo Spark's UDF (user-defined-function) 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/python/sample_document.txt: -------------------------------------------------------------------------------- 1 | fox jumped bowel bowel bowel elbow below bare bear 2 | fox jumped bore bore bore boer robe bears 3 | bears baser saber fox jumped and jumped over bear 4 | fox is silent and listen listen mars rams mars bears 5 | Mary and Elvis lives in Detroit army Easter Listen 6 | silent eaters Death Hated elvis Mary easter Silent 7 | Artist Elvis are in army Listen Silent detroit 8 | artist is here and strait and traits hated 9 | -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier 
= '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.bonus_chapter' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/scala/data/sample_document.txt: -------------------------------------------------------------------------------- 1 | fox jumped bowel bowel bowel elbow below bare bear 2 | fox jumped bore bore bore boer robe bears 3 | bears baser saber fox jumped and jumped over bear 4 | fox is silent and listen listen mars rams mars bears 5 | Mary and Elvis lives in Detroit army Easter Listen 6 | silent eaters Death Hated elvis Mary easter Silent 7 | Artist Elvis are in army Listen Silent detroit 8 | artist is here and strait and traits hated 9 | -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/anagrams/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/scala/run_spark_applications_scripts/anagrams_by_combine_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.AnagramsByCombineByKey "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/scala/run_spark_applications_scripts/anagrams_by_group_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.AnagramsByGroupByKey "--args=$INPUT_PATH" -------------------------------------------------------------------------------- /code/bonus_chapters/anagrams/scala/run_spark_applications_scripts/anagrams_by_reduce_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.AnagramsByReduceByKey "--args=$INPUT_PATH" -------------------------------------------------------------------------------- 
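The three run scripts above launch the Scala anagram programs (combineByKey, groupByKey, and reduceByKey variants) against data/sample_document.txt. A common way to group anagrams, and the idea suggested by these program names, is to key each word by its letters sorted alphabetically, so that words such as "listen" and "silent" collide on the same key, and then gather the words per key. Below is a minimal PySpark sketch of the reduceByKey flavor; it is illustrative only, not the repository's exact anagrams_by_reducebykey.py, and the comma-joined output format is an assumption.

```python
# Illustrative sketch of anagram grouping with reduceByKey (not the
# repository's exact code). Each word is keyed by its sorted letters.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("anagrams-sketch").getOrCreate()
sc = spark.sparkContext

input_path = "data/sample_document.txt"

anagrams = (
    sc.textFile(input_path)
      .flatMap(lambda line: line.lower().split())
      # "listen" and "silent" both map to the key "eilnst"
      .map(lambda word: ("".join(sorted(word)), word))
      # concatenate all words that share the same sorted-letter key
      .reduceByKey(lambda left, right: left + "," + right)
      # keep only keys with at least two distinct words (true anagram groups)
      .filter(lambda kv: len(set(kv[1].split(","))) > 1)
)

for key, words in anagrams.collect():
    print(key, "->", words)

spark.stop()
```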
/code/bonus_chapters/anagrams/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-bonus_chapter_anagrams' 2 | 3 | -------------------------------------------------------------------------------- /code/bonus_chapters/correlation/python/rdd_cartesian_in_action.txt: -------------------------------------------------------------------------------- 1 | How does Cartesian work? 2 | 3 | >>> mylist = [('g1', [1, 11]), ('g2', [2, 22]), ('g3', [3, 33])] 4 | >>> rdd = spark.sparkContext.parallelize(mylist) 5 | >>> rdd.collect() 6 | [('g1', [1, 11]), ('g2', [2, 22]), ('g3', [3, 33])] 7 | >>> cart = rdd.cartesian(rdd) 8 | >>> cart.mapValues(lambda v: list(v)).collect() 9 | [ 10 | (('g1', [1, 11]), ['g1', [1, 11]]), 11 | (('g1', [1, 11]), ['g2', [2, 22]]), 12 | (('g1', [1, 11]), ['g3', [3, 33]]), 13 | (('g2', [2, 22]), ['g1', [1, 11]]), 14 | (('g2', [2, 22]), ['g2', [2, 22]]), 15 | (('g2', [2, 22]), ['g3', [3, 33]]), 16 | (('g3', [3, 33]), ['g1', [1, 11]]), 17 | (('g3', [3, 33]), ['g2', [2, 22]]), 18 | (('g3', [3, 33]), ['g3', [3, 33]]) 19 | ] 20 | -------------------------------------------------------------------------------- /code/bonus_chapters/correlation/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.bonus_chapter' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/bonus_chapters/correlation/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/correlation/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/bonus_chapters/correlation/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/bonus_chapters/correlation/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-bonus_chapter_correlation' 2 | 3 | -------------------------------------------------------------------------------- /code/bonus_chapters/correlation/scala/src/main/scala/org/data/algorithms/spark/bonuschapter/AllVersusAllCorrelationDataframe.scala: -------------------------------------------------------------------------------- 1 | package org.data.algorithms.spark.bonuschapter 2 | 3 | object AllVersusAllCorrelationDataframe { 4 | 5 | } 6 | -------------------------------------------------------------------------------- /code/bonus_chapters/dataframes/README.md: -------------------------------------------------------------------------------- 1 | ## Spark DataFrames Tutorial 2 | 3 | #### [1. DataFrames Tutorial: from Python Collections](./dataframe_tutorial_from_collection.py.md) 4 | 5 | #### [2. DataFrames Tutorial: from CSV Text Files](./dataframe_tutorial_from_text_files.py.md) 6 | 7 | #### [3. Arrays in DataFrames](./arrays_in_dataframes/) 8 | 9 | #### [4. 
Exploding Arrays in DataFrames](./explode_arrays_into_rows/) 10 | 11 | -------------------------------------------------------------------------------- /code/bonus_chapters/dataframes/arrays_in_dataframes/python/README.md: -------------------------------------------------------------------------------- 1 | How to use arrays in DataFrames 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/dataframes/arrays_in_dataframes/scala/README.md: -------------------------------------------------------------------------------- 1 | How to use arrays in DataFrames 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/dataframes/explode_arrays_into_rows/python/README.md: -------------------------------------------------------------------------------- 1 | Pyspark – Split multiple array columns into rows 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/dataframes/explode_arrays_into_rows/python/explode_arrays_into_rows.log: -------------------------------------------------------------------------------- 1 | /Users/mparsian/spark-3.2.1/bin/spark-submit explode_arrays_into_rows.py 2 | 3 | root 4 | |-- name: string (nullable = true) 5 | |-- age: string (nullable = true) 6 | |-- languages: array (nullable = true) 7 | | |-- element: string (containsNull = true) 8 | 9 | +-----+---+--------------------+ 10 | | name|age| languages| 11 | +-----+---+--------------------+ 12 | | Rafa| 20| [SQL, NoSQL]| 13 | | Alex| 21| [Ada, SQL, Java]| 14 | | Jane| 22|[Fortran, Cobol, ...| 15 | |Maria| 23| []| 16 | +-----+---+--------------------+ 17 | 18 | root 19 | |-- name: string (nullable = true) 20 | |-- age: string (nullable = true) 21 | |-- col: string (nullable = true) 22 | 23 | +----+---+-------+ 24 | |name|age| col| 25 | +----+---+-------+ 26 | |Rafa| 20| SQL| 27 | |Rafa| 20| NoSQL| 28 | |Alex| 21| Ada| 29 | |Alex| 21| SQL| 30 | |Alex| 21| Java| 31 | |Jane| 22|Fortran| 32 | |Jane| 22| Cobol| 33 | |Jane| 22| R| 34 | |Jane| 22| C++| 35 | +----+---+-------+ 36 | 37 | -------------------------------------------------------------------------------- /code/bonus_chapters/dataframes/explode_arrays_into_rows/scala/README.md: -------------------------------------------------------------------------------- 1 | Pyspark – Split multiple array columns into rows 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/join/README.md: -------------------------------------------------------------------------------- 1 | # Join operation 2 | 3 | ------ 4 | 5 | * In a nutshell, a join is an SQL operation performed 6 | to establish a connection between two (or more) database 7 | tables based on matching columns, thereby creating a 8 | relationship between the tables. 9 | 10 | * Types of joins 11 | * Cross join. A cross join returns all possible combinations 12 | of rows of two tables (also called a Cartesian product). 13 | 14 | * Join/inner join. An inner join, also known as a simple join, 15 | returns rows from joined tables that have matching rows. 16 | 17 | * Left outer join/left join. 18 | 19 | * Right outer join/right join. 20 | 21 | * Full outer join. 
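To make the join types listed above concrete, here is a minimal PySpark sketch; the `emps` and `depts` DataFrames and their columns are made up purely for illustration (the repository's own worked examples live under `join/python/`).

```python
# Hypothetical DataFrames used only to illustrate the join types listed above.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("join-types-sketch").getOrCreate()

emps = spark.createDataFrame(
    [("alex", 10), ("jane", 20), ("rafa", 40)], ["name", "dept_id"])
depts = spark.createDataFrame(
    [(10, "sales"), (20, "hr"), (30, "it")], ["dept_id", "dept_name"])

emps.crossJoin(depts).show()                 # cross join: every emps row paired with every depts row
emps.join(depts, "dept_id", "inner").show()  # inner join: only matching dept_id values
emps.join(depts, "dept_id", "left").show()   # left outer join: all emps rows, nulls where no match
emps.join(depts, "dept_id", "right").show()  # right outer join: all depts rows, nulls where no match
emps.join(depts, "dept_id", "full").show()   # full outer join: all rows from both sides

spark.stop()
```

Left, right, and full outer joins keep the unmatched rows from the left side, the right side, or both sides respectively, filling the missing columns with nulls.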
22 | 23 | ----------- 24 | 25 | ![joins](../../../images/sql_joins.png) 26 | 27 | -------------------------------------------------------------------------------- /code/bonus_chapters/join/python/README.md: -------------------------------------------------------------------------------- 1 | # Join in Spark 2 | 3 | A JOIN clause is used to combine rows from two tables 4 | (expressed as RDDs or DataFrames), based on a related 5 | column between them. 6 | 7 | ![joins](../../../../images/joins-in-SQL.jpeg) 8 | -------------------------------------------------------------------------------- /code/bonus_chapters/join/python/rdd_join_inner.log: -------------------------------------------------------------------------------- 1 | % cat /tmp/A.txt 2 | k1,v1 3 | k1,v2 4 | k2,v3 5 | k2,v4 6 | k2,v5 7 | k3,v6 8 | k3,v7 9 | k4,v8 10 | 11 | % cat /tmp/B.txt 12 | k1,t1 13 | k1,t2 14 | k1,t3 15 | k2,t4 16 | k2,t5 17 | k5,t6 18 | k6,t7 19 | 20 | % A="/tmp/A.txt" 21 | % B="/tmp/B.txt" 22 | % ~/spark-3.2.1/bin/spark-submit rdd_join_inner.py $A $B 23 | 24 | rdd_A= 25 | [ 26 | ('k1', 'v1'), 27 | ('k1', 'v2'), 28 | ('k2', 'v3'), 29 | ('k2', 'v4'), 30 | ('k2', 'v5'), 31 | ('k3', 'v6'), 32 | ('k3', 'v7'), 33 | ('k4', 'v8') 34 | ] 35 | 36 | rdd_B= 37 | [ 38 | ('k1', 't1'), 39 | ('k1', 't2'), 40 | ('k1', 't3'), 41 | ('k2', 't4'), 42 | ('k2', 't5'), 43 | ('k5', 't6'), 44 | ('k6', 't7') 45 | ] 46 | 47 | A_joined_B= 48 | [ 49 | ('k1', ('v1', 't1')), 50 | ('k1', ('v1', 't2')), 51 | ('k1', ('v1', 't3')), 52 | ('k1', ('v2', 't1')), 53 | ('k1', ('v2', 't2')), 54 | ('k1', ('v2', 't3')), 55 | ('k2', ('v3', 't4')), 56 | ('k2', ('v3', 't5')), 57 | ('k2', ('v4', 't4')), 58 | ('k2', ('v4', 't5')), 59 | ('k2', ('v5', 't4')), 60 | ('k2', ('v5', 't5')) 61 | ] 62 | -------------------------------------------------------------------------------- /code/bonus_chapters/join/python/rdd_join_left.log: -------------------------------------------------------------------------------- 1 | % cat /tmp/A.txt 2 | k1,v1 3 | k1,v2 4 | k2,v3 5 | k2,v4 6 | k2,v5 7 | k3,v6 8 | k3,v7 9 | k4,v8 10 | 11 | % cat /tmp/B.txt 12 | k1,t1 13 | k1,t2 14 | k1,t3 15 | k2,t4 16 | k2,t5 17 | k5,t6 18 | k6,t7 19 | 20 | % A="/tmp/A.txt" 21 | % B="/tmp/B.txt" 22 | % ~/spark-3.2.1/bin/spark-submit rdd_join_left.py $A $B 23 | 24 | rdd_A= 25 | [ 26 | ('k1', 'v1'), 27 | ('k1', 'v2'), 28 | ('k2', 'v3'), 29 | ('k2', 'v4'), 30 | ('k2', 'v5'), 31 | ('k3', 'v6'), 32 | ('k3', 'v7'), 33 | ('k4', 'v8') 34 | ] 35 | 36 | rdd_B= 37 | [ 38 | ('k1', 't1'), 39 | ('k1', 't2'), 40 | ('k1', 't3'), 41 | ('k2', 't4'), 42 | ('k2', 't5'), 43 | ('k5', 't6'), 44 | ('k6', 't7') 45 | ] 46 | 47 | A_left_joined_B= 48 | [ 49 | ('k1', ('v1', 't1')), 50 | ('k1', ('v1', 't2')), 51 | ('k1', ('v1', 't3')), 52 | ('k1', ('v2', 't1')), 53 | ('k1', ('v2', 't2')), 54 | ('k1', ('v2', 't3')), 55 | ('k2', ('v3', 't4')), 56 | ('k2', ('v3', 't5')), 57 | ('k2', ('v4', 't4')), 58 | ('k2', ('v4', 't5')), 59 | ('k2', ('v5', 't4')), 60 | ('k2', ('v5', 't5')), 61 | ('k4', ('v8', None)), 62 | ('k3', ('v6', None)), 63 | ('k3', ('v7', None)) 64 | ] 65 | -------------------------------------------------------------------------------- /code/bonus_chapters/join/python/rdd_join_right.log: -------------------------------------------------------------------------------- 1 | % cat /tmp/A.txt 2 | k1,v1 3 | k1,v2 4 | k2,v3 5 | k2,v4 6 | k2,v5 7 | k3,v6 8 | k3,v7 9 | k4,v8 10 | 11 | % cat /tmp/B.txt 12 | k1,t1 13 | k1,t2 14 | k1,t3 15 | k2,t4 16 | k2,t5 17 | k5,t6 18 | k6,t7 19 | 20 | % A="/tmp/A.txt" 21 | % B="/tmp/B.txt" 22 | % 
~/spark-3.2.1/bin/spark-submit rdd_join_right.py $A $B 23 | 24 | rdd_A= 25 | [ 26 | ('k1', 'v1'), 27 | ('k1', 'v2'), 28 | ('k2', 'v3'), 29 | ('k2', 'v4'), 30 | ('k2', 'v5'), 31 | ('k3', 'v6'), 32 | ('k3', 'v7'), 33 | ('k4', 'v8') 34 | ] 35 | 36 | rdd_B= 37 | [ 38 | ('k1', 't1'), 39 | ('k1', 't2'), 40 | ('k1', 't3'), 41 | ('k2', 't4'), 42 | ('k2', 't5'), 43 | ('k5', 't6'), 44 | ('k6', 't7') 45 | ] 46 | 47 | A_right_joined_B= 48 | [ 49 | ('k1', ('v1', 't1')), 50 | ('k1', ('v1', 't2')), 51 | ('k1', ('v1', 't3')), 52 | ('k1', ('v2', 't1')), 53 | ('k1', ('v2', 't2')), 54 | ('k1', ('v2', 't3')), 55 | ('k2', ('v3', 't4')), 56 | ('k2', ('v3', 't5')), 57 | ('k2', ('v4', 't4')), 58 | ('k2', ('v4', 't5')), 59 | ('k2', ('v5', 't4')), 60 | ('k2', ('v5', 't5')), 61 | ('k5', (None, 't6')), 62 | ('k6', (None, 't7')) 63 | ] 64 | -------------------------------------------------------------------------------- /code/bonus_chapters/join/scala/README.md: -------------------------------------------------------------------------------- 1 | Join in Spark 2 | -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/python/sample_1.fasta: -------------------------------------------------------------------------------- 1 | >SEQUENCE_1 2 | GATTTGGGGCCCAAAGCAGTATCGATGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTTCAAATAGTGGATCCATTTGTTCAACTCACAGTTTGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTT 3 | >SEQUENCE_2 4 | GATTTGATTTGGGGCCCAAAGCAGTGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTTATCGATCAAATAGTGGATCGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTTCATTTGTTCAACTCACAGTTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTT 5 | -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/python/sample_1.fastq: -------------------------------------------------------------------------------- 1 | @SEQ_ID 2 | GATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTT 3 | + 4 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 5 | @SEQ_ID 6 | GATTTCCCGTTCAAAGCAGTATCGATCTTTTAGTAAATCCATTTGTTCAACTCACAGTTG 7 | + 8 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 9 | @SEQ_ID 10 | GACCCGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGCCT 11 | + 12 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 13 | @SEQ_ID 14 | TCATCATCATCCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGCCT 15 | + 16 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 17 | @SEQ_ID 18 | AGTAAGTAAGTAATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGCCTAGTAAGTA 19 | + 20 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 21 | -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.bonus_chapter' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/scala/data/sample_1.fasta: -------------------------------------------------------------------------------- 1 | >SEQUENCE_1 2 | GATTTGGGGCCCAAAGCAGTATCGATGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTTCAAATAGTGGATCCATTTGTTCAACTCACAGTTTGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTT 3 | >SEQUENCE_2 4 | GATTTGATTTGGGGCCCAAAGCAGTGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTTATCGATCAAATAGTGGATCGATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTTCATTTGTTCAACTCACAGTTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTT 5 | -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/scala/data/sample_1.fastq: -------------------------------------------------------------------------------- 1 | @SEQ_ID 2 | GATTTGGGGCCCAAAGCAGTATCGATCAAATAGTGGATCCATTTGTTCAACTCACAGTTT 3 | + 4 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 5 | @SEQ_ID 6 | GATTTCCCGTTCAAAGCAGTATCGATCTTTTAGTAAATCCATTTGTTCAACTCACAGTTG 7 | + 8 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 9 | @SEQ_ID 10 | GACCCGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGCCT 11 | + 12 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 13 | @SEQ_ID 14 | TCATCATCATCCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGCCT 15 | + 16 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 17 | @SEQ_ID 18 | AGTAAGTAAGTAATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGCCTAGTAAGTA 19 | + 20 | !''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65 21 | -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/k-mers/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/scala/run_spark_applications_scripts/kmer_fast_q.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_1.fastq" 3 | K=4 4 | N=3 5 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.KMERFastQ "--args=$INPUT_PATH $K $N" -------------------------------------------------------------------------------- /code/bonus_chapters/k-mers/scala/run_spark_applications_scripts/kmer_fasta.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_1.fasta" 3 | K=4 4 | N=3 5 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.KMERFasta "--args=$INPUT_PATH $K $N" 6 | -------------------------------------------------------------------------------- 
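The k-mers run scripts above pass three arguments to the Scala programs: an input path, a k-mer length K, and a top-N count N (e.g. K=4, N=3). For orientation, a hedged PySpark sketch of the same idea is shown below; it is not the repository's kmer_fasta.py or KMERFasta.scala, and because it works line by line it ignores k-mers that span wrapped FASTA lines.

```python
# Hedged sketch: count k-mers of length K in a FASTA file and print the N
# most frequent ones. FASTA handling is deliberately simplified.
import sys
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

input_path, K, N = sys.argv[1], int(sys.argv[2]), int(sys.argv[3])

top_n = (
    spark.sparkContext.textFile(input_path)
         # skip FASTA description lines such as ">SEQUENCE_1"
         .filter(lambda line: not line.startswith(">"))
         .map(lambda line: line.strip().upper())
         # emit every substring of length K as a candidate k-mer
         .flatMap(lambda seq: [seq[i:i + K] for i in range(len(seq) - K + 1)])
         .map(lambda kmer: (kmer, 1))
         .reduceByKey(lambda a, b: a + b)
         # keep the N most frequent k-mers
         .takeOrdered(N, key=lambda pair: -pair[1])
)
print(top_n)

spark.stop()
```
--------------------------------------------------------------------------------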
/code/bonus_chapters/k-mers/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-bonus_chapter_k-mers' 2 | 3 | -------------------------------------------------------------------------------- /code/bonus_chapters/lambda_expressions/Lambda_Expressions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/lambda_expressions/Lambda_Expressions.pdf -------------------------------------------------------------------------------- /code/bonus_chapters/lambda_expressions/README.md: -------------------------------------------------------------------------------- 1 | # Lambda Functions/Expressions 2 | 3 | ### [Lambda functions basics](./Lambda_Expressions_basics.md) 4 | 5 | ### [Lambda functions tutorial](./Lambda_Expressions.pdf) 6 | -------------------------------------------------------------------------------- /code/bonus_chapters/mappartitions/SAMPLE_INPUT_FILES/file1.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | -2 5 | 4 6 | 5 7 | 6 8 | -1 9 | 8 10 | 9 11 | 3 12 | 2 13 | -------------------------------------------------------------------------------- /code/bonus_chapters/mappartitions/SAMPLE_INPUT_FILES/file2.txt: -------------------------------------------------------------------------------- 1 | 5 2 | 6 3 | 7 4 | 8 5 | -1 6 | -2 7 | -3 8 | 3 9 | 4 10 | 5 11 | 6 12 | 33 13 | 3 14 | -------------------------------------------------------------------------------- /code/bonus_chapters/mappartitions/SAMPLE_INPUT_FILES/file3.txt: -------------------------------------------------------------------------------- 1 | 1 2 | 2 3 | 3 4 | 4 5 | 5 6 | 6 7 | 7 8 | 8 9 | 9 10 | -------------------------------------------------------------------------------- /code/bonus_chapters/mappartitions/images/mappartitions_image_1.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/mappartitions/images/mappartitions_image_1.drawio.png -------------------------------------------------------------------------------- /code/bonus_chapters/mappartitions/images/mappartitions_image_2.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/mappartitions/images/mappartitions_image_2.drawio.png -------------------------------------------------------------------------------- /code/bonus_chapters/mappartitions/python/find_min_max_by_mappartitions.sh: -------------------------------------------------------------------------------- 1 | # set SPARK_HOME 2 | SPARK_HOME="/Users/mparsian/spark-3.2.0" 3 | 4 | # define your input path 5 | INPUT_PATH="/book/code/bonus_chapters/mappartitions/SAMPLE_INPUT_FILES/" 6 | 7 | # define your PySpark program 8 | PROG="/book/code/bonus_chapters/mappartitions/python/find_min_max_by_mappartitions.py" 9 | 10 | # run your program 11 | $SPARK_HOME/bin/spark-submit ${PROG} ${INPUT_PATH} 12 | 13 | -------------------------------------------------------------------------------- /code/bonus_chapters/mappartitions/scala/README.md: 
-------------------------------------------------------------------------------- 1 | An example of mapPartitions() is given to find (count, minimum, maximum) 2 | over all of the given numbers. The input is a directory, which may contain 3 | any number of text files; each input file may have any number of records, 4 | and each record holds a single integer. 5 | -------------------------------------------------------------------------------- /code/bonus_chapters/physical_partitioning/data_partitioning.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/physical_partitioning/data_partitioning.png -------------------------------------------------------------------------------- /code/bonus_chapters/physical_partitioning/partition_by_one_column.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.sql import SparkSession 3 | 4 | # create a SparkSession object 5 | spark = SparkSession.builder.getOrCreate() 6 | 7 | 8 | # define input path 9 | # input_path= 's3://mybucket/INPUT2/continents_countries_temp.csv' 10 | input_path = sys.argv[1] 11 | 12 | # read data and create a DataFrame 13 | df = spark.read.format("csv")\ 14 | .option("header","true")\ 15 | .option("inferSchema", "true")\ 16 | .load(input_path) 17 | 18 | df.show(10, truncate=False) 19 | df.printSchema() 20 | 21 | # define output path 22 | # output_path = "s3://mybucket/SCU/OUTPUT2/continents_countries1/" 23 | output_path = sys.argv[2] 24 | 25 | # partition the DataFrame by the "continent" column and save it to the output path 26 | df.repartition("continent")\ 27 | .write.mode("append")\ 28 | .partitionBy("continent")\ 29 | .parquet(output_path) 30 | 31 | # done! 
32 | spark.stop() 33 | -------------------------------------------------------------------------------- /code/bonus_chapters/physical_partitioning/partition_by_one_column_schema.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE `continents`( 2 | `country` string, 3 | `city` string, 4 | `temperature` integer 5 | ) 6 | PARTITIONED BY ( 7 | `continent` string 8 | ) 9 | STORED AS PARQUET 10 | LOCATION 's3://mybucket/SCU/OUTPUT2/continents_countries1/' 11 | tblproperties ("parquet.compress"="SNAPPY"); 12 | -------------------------------------------------------------------------------- /code/bonus_chapters/physical_partitioning/partition_by_two_columns.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.sql import SparkSession 3 | 4 | # create a SparkSession object 5 | spark = SparkSession.builder.getOrCreate() 6 | 7 | # define input path 8 | # input_path= 's3://mybucket/INPUT2/continents_countries_temp.csv' 9 | input_path = sys.argv[1] 10 | 11 | # read data and create a DataFrame 12 | df = spark.read.format("csv")\ 13 | .option("header","true")\ 14 | .option("inferSchema", "true")\ 15 | .load(input_path) 16 | 17 | df.show(10, truncate=False) 18 | df.printSchema() 19 | 20 | # define output path 21 | # output_path = "s3://mybucket/SCU/OUTPUT2/continents_countries2/" 22 | output_path = sys.argv[2] 23 | 24 | # partiton DataFrame by the "continent" and "country" columns 25 | # and save it to the output path 26 | df.repartition("continent", "country")\ 27 | .write.mode("append")\ 28 | .partitionBy("continent", "country")\ 29 | .parquet(output_path) 30 | 31 | spark.stop() 32 | -------------------------------------------------------------------------------- /code/bonus_chapters/physical_partitioning/partition_by_two_columns_schema.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE `continents_2`( 2 | `city` string, 3 | `temperature` integer 4 | ) 5 | PARTITIONED BY ( 6 | `continent` string, 7 | `country` string 8 | ) 9 | STORED AS PARQUET 10 | LOCATION 's3://mybucket/SCU/OUTPUT2/continents_countries2/' 11 | tblproperties ("parquet.compress"="SNAPPY"); 12 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/data/foxdata.txt: -------------------------------------------------------------------------------- 1 | a red fox jumped of high 2 | fox jumped over a high fence 3 | red of fox jumped 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/data/sample_document.txt: -------------------------------------------------------------------------------- 1 | a crazy fox jumped and jumped 2 | red fox jumped 3 | gray fox jumped and jumped of the fence 4 | red fox jumped of the fence 5 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/word_count_by_dataframe_shorthand.log: -------------------------------------------------------------------------------- 1 | % export SPARK_HOME= 2 | % $SPARK_HOME/bin/spark-submit word_count_by_dataframe_shorthand.py data/foxdata.txt 3 | 4 | input_path= data/foxdata.txt 5 | 6 | final_word_count: 7 | +------+-----+ 8 | |word |count| 9 | +------+-----+ 10 | |jumped|3 | 11 | |fox |3 | 12 | |red |2 | 13 | |high |2 | 14 | +------+-----+ 15 | 16 | root 17 | |-- word: string (nullable = false) 18 | |-- 
count: long (nullable = false) 19 | 20 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/word_count_by_dataframe_shorthand.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | import pyspark.sql.functions as F 3 | import sys 4 | 5 | #------------------------------------------------------ 6 | # Word Count (shorthand notation) using Spark Dataframe 7 | #------------------------------------------------------ 8 | 9 | # create an instance of SparkSession object 10 | spark = SparkSession.builder.getOrCreate() 11 | 12 | # define your input 13 | input_path = sys.argv[1] 14 | print("input_path=", input_path) 15 | 16 | # read input and create a DataFrame(words: [String]) 17 | # the created df is a single-column table (column name: words) 18 | # where each row is an array of string objects 19 | final_word_count = spark.read\ 20 | .text(input_path)\ 21 | .select(F.split(F.col("value"), " ").alias("words"))\ 22 | .select(F.explode(F.col("words")).alias("word"))\ 23 | .select(F.lower(F.col("word")).alias("word"))\ 24 | .filter(F.length(F.col("word")) > 2)\ 25 | .groupby(F.col("word")).count()\ 26 | .where("count > 1") 27 | 28 | # for debugging purposes 29 | print("final_word_count:") 30 | final_word_count.show(10, truncate=False) 31 | final_word_count.printSchema() 32 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_combinebykey.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | #====================================== 5 | # 6 | # NOTE: print() and collect() are used 7 | # for debugging and educational purposes. 8 | # 9 | # @author Mahmoud Parsian 10 | # 11 | #====================================== 12 | 13 | # create an instance of a SparkSession as spark 14 | spark = SparkSession.builder.getOrCreate() 15 | print("spark.version=", spark.version) 16 | 17 | # set input path 18 | input_path = sys.argv[1] 19 | print("input_path=", input_path) 20 | 21 | # Note that the "combined data type" 22 | # for combineByKey() is an Integer. 23 | frequencies = spark.sparkContext.textFile(input_path)\ 24 | .flatMap(lambda line: line.split(" "))\ 25 | .map(lambda word: (word, 1))\ 26 | .combineByKey(\ 27 | lambda v: 1,\ 28 | lambda C, v: C+1,\ 29 | lambda C, D: C+D\ 30 | ) 31 | # 32 | print(frequencies.collect()) 33 | 34 | # done! 
35 | spark.stop() 36 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_combinebykey.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/home/spark-3.3.1" 3 | # 4 | # define your input path 5 | INPUT_PATH="${SPARK_HOME}/NOTICE" 6 | # 7 | # define your PySpark program 8 | PROG="wordcount_by_combinebykey.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit ${PROG} ${INPUT_PATH} 12 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_groupbykey.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | #====================================== 5 | # 6 | # NOTE: print() and collect() are used for debugging and educational purposes. 7 | # 8 | # @author Mahmoud Parsian 9 | # 10 | #====================================== 11 | def main(): 12 | 13 | # create an instance of a SparkSession as spark 14 | spark = SparkSession.builder.getOrCreate() 15 | 16 | # set input path 17 | input_path = sys.argv[1] 18 | print("input_path=", input_path) 19 | 20 | # create RDD from a text file 21 | records = spark.sparkContext.textFile(input_path) 22 | print(records.collect()) 23 | 24 | words = records.flatMap(lambda line: line.split(" ")) 25 | print(words.collect()) 26 | 27 | pairs = words.map(lambda word: (word, 1)) 28 | print(pairs.collect()) 29 | 30 | frequencies = pairs.groupByKey().mapValues(sum) 31 | print(frequencies.collect()) 32 | 33 | # done! 34 | spark.stop() 35 | #end-def 36 | #====================================== 37 | if __name__ == "__main__": 38 | main() -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_groupbykey.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/book/spark-3.2.0" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/NOTICE" 6 | # 7 | # define your PySpark program 8 | PROG="/book/code/wordcount/wordcount_by_groupbykey.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit ${PROG} ${INPUT_PATH} 12 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_groupbykey_shorthand.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | #====================================== 5 | # 6 | # NOTE: print() and collect() are used for debugging and educational purposes. 7 | # 8 | # @author Mahmoud Parsian 9 | # 10 | #====================================== 11 | def main(): 12 | 13 | # create an instance of a SparkSession as spark 14 | spark = SparkSession.builder.getOrCreate() 15 | 16 | # set input path 17 | input_path = sys.argv[1] 18 | print("input_path=", input_path) 19 | 20 | frequencies = spark.sparkContext.textFile(input_path)\ 21 | .flatMap(lambda line: line.split(" "))\ 22 | .map(lambda word: (word, 1))\ 23 | .groupByKey().mapValues(sum) 24 | # 25 | print(frequencies.collect()) 26 | 27 | # done! 
28 | spark.stop() 29 | #end-def 30 | #====================================== 31 | if __name__ == "__main__": 32 | main() -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_groupbykey_shorthand.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/book/spark-3.2.0" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/NOTICE" 6 | # 7 | # define your PySpark program 8 | PROG="/book/code/wordcount/wordcount_by_groupbykey_shorthand.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit ${PROG} ${INPUT_PATH} 12 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_reducebykey.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | #====================================== 5 | # 6 | # NOTE: print() and collect() are used for debugging and educational purposes. 7 | # 8 | # @author Mahmoud Parsian 9 | # 10 | #====================================== 11 | def main(): 12 | 13 | # create an instance of a SparkSession as spark 14 | spark = SparkSession.builder.getOrCreate() 15 | 16 | # set input path 17 | input_path = sys.argv[1] 18 | print("input_path=", input_path) 19 | 20 | # create RDD from a text file 21 | records = spark.sparkContext.textFile(input_path) 22 | print(records.collect()) 23 | 24 | words = records.flatMap(lambda line: line.split(" ")) 25 | print(words.collect()) 26 | 27 | pairs = words.map(lambda word: (word, 1)) 28 | print(pairs.collect()) 29 | 30 | frequencies = pairs.reduceByKey(lambda a, b: a + b) 31 | print(frequencies.collect()) 32 | 33 | # done! 34 | spark.stop() 35 | #end-def 36 | #====================================== 37 | if __name__ == "__main__": 38 | main() -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_reducebykey.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/book/spark-3.2.0" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/NOTICE" 6 | # 7 | # define your PySpark program 8 | PROG="/book/code/wordcount/wordcount_by_reducebykey.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit ${PROG} ${INPUT_PATH} 12 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_reducebykey_shorthand.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | #====================================== 5 | # 6 | # NOTE: print() and collect() are used for debugging and educational purposes. 
7 | # 8 | # @author Mahmoud Parsian 9 | # 10 | #====================================== 11 | def main(): 12 | 13 | # create an instance of a SparkSession as spark 14 | spark = SparkSession.builder.getOrCreate() 15 | 16 | # set input path 17 | input_path = sys.argv[1] 18 | print("input_path=", input_path) 19 | 20 | frequencies = spark.sparkContext.textFile(input_path)\ 21 | .flatMap(lambda line: line.split(" "))\ 22 | .map(lambda word: (word, 1))\ 23 | .reduceByKey(lambda a, b: a + b) 24 | # 25 | print(frequencies.collect()) 26 | 27 | # done! 28 | spark.stop() 29 | #end-def 30 | #====================================== 31 | if __name__ == "__main__": 32 | main() -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_reducebykey_shorthand.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/book/spark-3.2.0" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/NOTICE" 6 | # 7 | # define your PySpark program 8 | PROG="/book/code/wordcount/wordcount_by_reducebykey_shorthand.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit ${PROG} ${INPUT_PATH} 12 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/python/wordcount_by_reducebykey_with_filter.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/book/spark-3.2.0" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/NOTICE" 6 | # 7 | # define your PySpark program 8 | PROG="/book/code/wordcount/wordcount_by_reducebykey_with_filter.py" 9 | 10 | # drop words whose length is less than 3 11 | WORD_LENGTH_THRESHOLD=3 12 | 13 | # drop words (after reduction) whose frequency is less than 2 14 | FREQUENCY_THRESHOLD=2 15 | 16 | # submit your spark application 17 | $SPARK_HOME/bin/spark-submit ${PROG} ${INPUT_PATH} ${WORD_LENGTH_THRESHOLD} ${FREQUENCY_THRESHOLD} 18 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/README.md: -------------------------------------------------------------------------------- 1 | ```` 2 | The purpose of this folder is to present 3 | multiple solutions to the classic word 4 | count problem. 5 | 6 | Solutions are provided using the reduceByKey() 7 | and groupByKey() reducers. In general, the 8 | reduceByKey() solution scales out better 9 | than the groupByKey() solution. 10 | 11 | 12 | best regards, 13 | Biman Mandal 14 | ```` 15 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.bonus_chapter' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/data/sample_document.txt: -------------------------------------------------------------------------------- 1 | a crazy fox jumped and jumped 2 | red fox jumped 3 | gray fox jumped and jumped of the fence 4 | red fox jumped of the fence 5 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/wordcount/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/run_spark_applications_scripts/word_count_by_group_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.WordCountByGroupByKey "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/run_spark_applications_scripts/word_count_by_group_by_key_shorthand.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.WordCountByGroupByKeyShorthand "--args=$INPUT_PATH" -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/run_spark_applications_scripts/word_count_by_reduce_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.WordCountByReduceByKey "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/run_spark_applications_scripts/word_count_by_reduce_by_key_shorthand.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.WordCountByReduceByKeyShorthand "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/run_spark_applications_scripts/word_count_by_reduce_by_key_with_filter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_document.txt" 3 | # drop words if its length are less than 3 4 | WORD_LENGTH_THRESHOLD=3 5 | # drop words (after reduction) if its frequency 
is less than 2 6 | FREQUENCY_THRESHOLD=2 7 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.bonuschapter.WordCountByReduceByKeyWithFilter "--args=$INPUT_PATH $WORD_LENGTH_THRESHOLD $FREQUENCY_THRESHOLD" -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-bonus_chapter_wordcount' 2 | 3 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/src/main/scala/org/data/algorithms/spark/bonuschapter/WordCountByGroupByKeyShorthand.scala: -------------------------------------------------------------------------------- 1 | package org.data.algorithms.spark.bonuschapter 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | *---------------------------------------- 7 | * NOTE: print() and collect() are used for debugging and educational purposes. 8 | * 9 | * @author Biman Mandal 10 | *---------------------------------------- 11 | */ 12 | object WordCountByGroupByKeyShorthand { 13 | def main(args: Array[String]): Unit = { 14 | // create an instance of a SparkSession as spark 15 | val spark = SparkSession.builder.master("local[*]").getOrCreate() 16 | 17 | // set input path 18 | val inputPath = args(0) 19 | println("inputPath=" + inputPath) 20 | 21 | val frequencies = spark.sparkContext.textFile(inputPath) 22 | .flatMap(line => line.split(" ")) 23 | .map(word => (word, 1)) 24 | .groupByKey() 25 | .map(x => (x._1, x._2.toList.sum)) 26 | 27 | println(frequencies.collect().mkString("Array(", ", ", ")")) 28 | 29 | // done! 30 | spark.stop() 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/scala/src/main/scala/org/data/algorithms/spark/bonuschapter/WordCountByReduceByKeyShorthand.scala: -------------------------------------------------------------------------------- 1 | package org.data.algorithms.spark.bonuschapter 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | *---------------------------------------- 7 | * NOTE: print() and collect() are used for debugging and educational purposes. 8 | * 9 | * @author Biman Mandal 10 | *---------------------------------------- 11 | */ 12 | object WordCountByReduceByKeyShorthand { 13 | def main(args: Array[String]): Unit = { 14 | // create an instance of a SparkSession as spark 15 | val spark = SparkSession.builder.master("local[*]").getOrCreate() 16 | 17 | // set input path 18 | val inputPath = args(0) 19 | println("inputPath=" + inputPath) 20 | 21 | val frequencies = spark.sparkContext.textFile(inputPath) 22 | .flatMap(line => line.split(" ")) 23 | .map(word => (word, 1)) 24 | .reduceByKey((x, y) => x + y) 25 | 26 | println(frequencies.collect().mkString("Array(", ", ", ")")) 27 | 28 | // done! 
29 | spark.stop() 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /code/bonus_chapters/wordcount/word_count_with_mapreduce.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/bonus_chapters/wordcount/word_count_with_mapreduce.png -------------------------------------------------------------------------------- /code/chap01/data/emps.txt: -------------------------------------------------------------------------------- 1 | 1000,alex,67000 2 | 1001,bob,24000 3 | 1002,jane,69000 4 | 1003,betty,55000 5 | 1004,jeff,59000 6 | -------------------------------------------------------------------------------- /code/chap01/data/sample_5_records.txt: -------------------------------------------------------------------------------- 1 | A,3 2 | A,4 3 | A,5 4 | B,10 5 | B,20 6 | -------------------------------------------------------------------------------- /code/chap01/data/users.txt: -------------------------------------------------------------------------------- 1 | 1,Alex,30,124 2 | 2,Bert,32,234 3 | 3,Curt,28,312 4 | 4,Don,32,180 5 | 5,Mary,30,100 6 | 6,Jane,28,212 7 | 7,Joe,28,128 8 | 8,Al,40,600 9 | -------------------------------------------------------------------------------- /code/chap01/python/average_by_key_use_aggregatebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_aggregatebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/average_by_key_use_aggregatebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/average_by_key_use_combinebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_combinebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/average_by_key_use_combinebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/average_by_key_use_foldbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_foldbykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/average_by_key_use_foldbykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- 
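The chap01 shell scripts in this listing only submit the corresponding PySpark programs (average_by_key_use_*.py), whose sources are not reproduced here. For orientation, a minimal sketch of the average-by-key pattern they exercise, assuming input records shaped like data/sample_5_records.txt (e.g. "A,3"), could use combineByKey() like this:

```python
# Hedged sketch of average-by-key with combineByKey(); this is not the
# repository's average_by_key_use_combinebykey.py, just the general pattern.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

pairs = (
    spark.sparkContext.textFile("data/sample_5_records.txt")
         .map(lambda line: line.split(","))
         .map(lambda tokens: (tokens[0], int(tokens[1])))
)

# combined data type C = (sum, count)
sum_count = pairs.combineByKey(
    lambda v: (v, 1),                         # create a combiner from the first value
    lambda C, v: (C[0] + v, C[1] + 1),        # merge a value into a combiner
    lambda C, D: (C[0] + D[0], C[1] + D[1])   # merge two combiners across partitions
)

averages = sum_count.mapValues(lambda sc: sc[0] / sc[1])
print(averages.collect())   # e.g. [('A', 4.0), ('B', 15.0)]

spark.stop()
```
--------------------------------------------------------------------------------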
/code/chap01/python/average_by_key_use_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_groupbykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/average_by_key_use_groupbykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/average_by_key_use_reducebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_reducebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/average_by_key_use_reducebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_action_describe.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_action_describe.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_action_describe.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_add_column.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_add_column.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_add_column.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_drop_column.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_drop_column.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_drop_column.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_filter.sh: -------------------------------------------------------------------------------- 1 | 
#----------------------------------------------------- 2 | # This is a shell script to run dataframe_filter.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_filter.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_join_cross.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_cross.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_join_cross.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_join_inner.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_inner.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_join_inner.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_join_left.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_left.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_join_left.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_join_right.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_right.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_join_right.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/dataframe_sql.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_sql.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | 
#----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/dataframe_sql.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_cartesian.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_cartesian.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_cartesian.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_combinebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_combinebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_combinebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_filter.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_filter.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_filter.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_flatmap.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_flatmap.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_flatmap.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_groupbykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-2.4.3" 7 | 
export SPARK_PROG="/book/code/chap01/rdd_transformation_groupbykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_join.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_join.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.3" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_join.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_map.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_map.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_map.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_mappartitions.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_mappartitions_handle_empty_partitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_mappartitions_handle_empty_partitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_mappartitions_handle_empty_partitions.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_reducebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_reducebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export 
SPARK_PROG="/book/code/chap01/rdd_transformation_reducebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_sortby.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_sortby.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_sortby.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/python/rdd_transformation_takeordered.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_takeordered.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap01/rdd_transformation_takeordered.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap01/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.ch01' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/chap01/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap01/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/chap01/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/chap01/scala/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | update_data_in_sh() { 4 | filename=$1 5 | classname=$2 6 | echo "#!/bin/bash" > $filename 7 | echo "./gradlew clean run -PmainClass="$classname >> $filename 8 | } 9 | 10 | script_folder_name=run_spark_applications_scripts 11 | 12 | if [ ! -d $script_folder_name ] 13 | then 14 | mkdir $script_folder_name 15 | fi 16 | 17 | 18 | for file in `find . -type f -regex ".*\.scala"` 19 | do 20 | filename=$(echo $file | awk -F "/" '{print $NF}' | cut -d "." -f1) 21 | path=$(echo $file | rev | cut -d "/" -f 2- | rev) 22 | packagename=$(echo $file | awk -F "/" ' 23 | BEGIN { ORS="" }; 24 | {for(i=5;iseq1 2 | cGTAaccaataaaaaaacaagcttaacctaattc 3 | >seq2 4 | agcttagTTTGGatctggccgggg 5 | >seq3 6 | gcggatttactcCCCCCAAAAANNaggggagagcccagataaatggagtctgtgcgtccaca 7 | gaattcgcacca 8 | AATAAAACCTCACCCAT 9 | agagcccagaatttactcCCC 10 | >seq4 11 | gcggatttactcaggggagagcccagGGataaatggagtctgtgcgtccaca 12 | gaattcgcacca 13 | -------------------------------------------------------------------------------- /code/chap02/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | 7 | group 'com.spark.algos.data' 8 | version '1.0-SNAPSHOT' 9 | 10 | repositories { 11 | mavenLocal() 12 | mavenCentral() 13 | } 14 | 15 | dependencies { 16 | implementation group: "org.scala-lang", name: "scala-library", version: "2.13.7" 17 | implementation group: "org.apache.spark", name: "spark-core_2.13", version: "3.2.0" 18 | implementation group: "org.apache.spark", name: "spark-sql_2.13", version: "3.2.0" 19 | } 20 | 21 | application { 22 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 23 | } -------------------------------------------------------------------------------- /code/chap02/scala/data/sample.fasta: -------------------------------------------------------------------------------- 1 | >seq1 2 | cGTAaccaataaaaaaacaagcttaacctaattc 3 | >seq2 4 | agcttagTTTGGatctggccgggg 5 | >seq3 6 | gcggatttactcCCCCCAAAAANNaggggagagcccagataaatggagtctgtgcgtccaca 7 | gaattcgcacca 8 | AATAAAACCTCACCCAT 9 | agagcccagaatttactcCCC 10 | >seq4 11 | gcggatttactcaggggagagcccagGGataaatggagtctgtgcgtccaca 12 | gaattcgcacca 13 | -------------------------------------------------------------------------------- /code/chap02/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap02/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/chap02/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_fastq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # define your input path 3 | INPUT_PATH="data/sp1.fastq" 4 | 5 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountFastq "--args=$INPUT_PATH" -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # define your input path 3 | INPUT_PATH="data/sample.fasta" 4 | 5 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER1 "--args=$INPUT_PATH" 6 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_1_1GB.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | #------------------------------------------------------ 5 | # NOTE: define your input path 6 | # Before running your Spark program, 7 | # Download *.fa from this location and place it under 8 | # the following directory: /book/chap02/data/ 9 | # 10 | # Download URL: 11 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 12 | #------------------------------------------------------ 13 | # define your input path 14 | INPUT_PATH="data/*.fasta" 15 | 16 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER1 "--args=$INPUT_PATH" 17 | 18 | # 19 | duration=$SECONDS 20 | echo "" 21 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 
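
A note on the programs these scripts launch: the DNABaseCountVER1/VER2/VER3 classes referenced by the dna_base_count_*.sh scripts live under src/main/scala and are not reproduced in this listing. As a rough orientation only, here is a minimal PySpark sketch of the version-1 idea, counting base frequencies in a FASTA file with a plain map/reduceByKey; the input path and session name are placeholders, and this is not the book's implementation:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("dna-base-count-sketch").getOrCreate()

# read the FASTA file as plain lines of text
records = spark.sparkContext.textFile("data/sample.fasta")

# drop FASTA description lines (those starting with ">") and split the
# remaining sequence lines into single, lower-cased bases
bases = records.filter(lambda line: not line.startswith(">")) \
               .flatMap(lambda seq: list(seq.strip().lower()))

# classic frequency count: (base, 1) pairs reduced by key
counts = bases.map(lambda b: (b, 1)).reduceByKey(lambda x, y: x + y)

print(counts.collect())
spark.stop()
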
22 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_1_big.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # NOTE: define your input path 7 | # Before running your Spark program, 8 | # Download chr1.subst.fa from this location: 9 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 10 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/chr1.subst.fa.gz 11 | # 12 | # define your input path 13 | INPUT_PATH="data/chr1.subst.fa" 14 | 15 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER1 "--args=$INPUT_PATH" 16 | 17 | duration=$SECONDS 18 | echo "" 19 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 20 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # define your input path 3 | INPUT_PATH="data/sample.fasta" 4 | 5 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER2 "--args=$INPUT_PATH" 6 | 7 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_2_1GB.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | #------------------------------------------------------ 5 | # NOTE: define your input path 6 | # Before running your Spark program, 7 | # Download *.fa from this location and place it under 8 | # the following directory: /book/chap02/data/ 9 | # 10 | # Download URL: 11 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 12 | #------------------------------------------------------ 13 | # 14 | # define your input path 15 | INPUT_PATH="data/*.fa" 16 | 17 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER2 "--args=$INPUT_PATH" 18 | 19 | duration=$SECONDS 20 | echo "" 21 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 22 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_2_big.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # NOTE: define your input path 7 | # Before running your Spark program, 8 | # Download chr1.subst.fa from this location: 9 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 10 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/chr1.subst.fa.gz 11 | # 12 | INPUT_PATH="data/chr1.subst.fa" 13 | 14 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER2 "--args=$INPUT_PATH" 15 | 16 | duration=$SECONDS 17 | echo "" 18 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 
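
The VER2 and VER3 variants above are launched the same way; a common optimization for this kind of counting job, and a plausible (but here assumed, not verified against the repository sources) reading of what the later versions change, is to pre-aggregate inside each partition with mapPartitions so that only a handful of (base, count) pairs are shuffled. A minimal PySpark sketch of that technique, again with placeholder paths:

from collections import defaultdict
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("dna-base-count-mappartitions-sketch").getOrCreate()

def count_bases_in_partition(lines):
    # build one small dictionary of counts per partition ("in-mapper combining")
    local_counts = defaultdict(int)
    for line in lines:
        if line.startswith(">"):
            continue
        for base in line.strip().lower():
            local_counts[base] += 1
    return iter(local_counts.items())

records = spark.sparkContext.textFile("data/sample.fasta")

# each partition emits at most a few (base, count) pairs, so the final
# reduceByKey shuffles far less data than one pair per base
counts = records.mapPartitions(count_bases_in_partition) \
                .reduceByKey(lambda x, y: x + y)

print(counts.collect())
spark.stop()
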
19 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # define your input path 3 | INPUT_PATH="data/sample.fasta" 4 | 5 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER3 "--args=$INPUT_PATH" 6 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_3_1GB.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | #------------------------------------------------------ 5 | # NOTE: define your input path 6 | # Before running your Spark program, 7 | # Download *.fa from this location and place it under 8 | # the following directory: /book/chap02/data/ 9 | # 10 | # Download URL: 11 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 12 | #------------------------------------------------------ 13 | INPUT_PATH="data/*.fa" 14 | 15 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER3 "--args=$INPUT_PATH" 16 | 17 | duration=$SECONDS 18 | echo "" 19 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 20 | -------------------------------------------------------------------------------- /code/chap02/scala/run_spark_applications_scripts/dna_base_count_ver_3_big.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # NOTE: define your input path 7 | # Before running your Spark program, 8 | # Download chr1.subst.fa from this location: 9 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 10 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/chr1.subst.fa.gz 11 | # 12 | INPUT_PATH="data/chr1.subst.fa" 13 | 14 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch02.DNABaseCountVER3 "--args=$INPUT_PATH" 15 | 16 | duration=$SECONDS 17 | echo "" 18 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 
19 | -------------------------------------------------------------------------------- /code/chap02/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-ch02' -------------------------------------------------------------------------------- /code/chap02/scala/src/main/resources/input.txt: -------------------------------------------------------------------------------- 1 | >seq1 2 | cGTAaccaataaaaaaacaagcttaacctaattc 3 | >seq2 4 | agcttagTTTGGatctggccgggg 5 | >seq3 6 | gcggatttactcCCCCCAAAAANNaggggagagcccagataaatggagtctgtgcgtccaca 7 | gaattcgcacca 8 | AATAAAACCTCACCCAT 9 | agagcccagaatttactcCCC 10 | >seq4 11 | gcggatttactcaggggagagcccagGGataaatggagtctgtgcgtccaca 12 | gaattcgcacca -------------------------------------------------------------------------------- /code/chap03/python/bigrams_input.txt: -------------------------------------------------------------------------------- 1 | Spark shines_id in data analytics and beyond 2 | this is the 3 | this is the first record 4 | Spark shines in data analytics and beyond 5 | this is the second record 6 | Spark shines again in data analytics and beyond 7 | -------------------------------------------------------------------------------- /code/chap03/python/sample_input: -------------------------------------------------------------------------------- 1 | alex,Sunnyvale,25 2 | alex,Sunnyvale,33 3 | mary,Ames,22 4 | mary,Cupertino,66 5 | mary,Sunnyvale,44 6 | jane,Ames,20 7 | jane,Troy,40 8 | bob,Ames,26 9 | -------------------------------------------------------------------------------- /code/chap03/python/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 9 2 | 5 3 | 33 4 | 66 5 | -21 6 | -33 7 | 1 8 | 2 9 | 3 10 | 44 11 | 55 12 | 66 13 | 1 14 | 2 15 | -1 16 | -2 17 | 0 18 | 5 19 | 6 20 | 7 21 | 8 22 | 0 23 | -8 24 | -9 25 | 0 26 | 0 27 | 6 28 | 7 29 | 8 30 | 9 31 | 0 32 | -1 33 | -------------------------------------------------------------------------------- /code/chap03/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.ch03' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/chap03/scala/data/bigrams_input.txt: -------------------------------------------------------------------------------- 1 | Spark shines_id in data analytics and beyond 2 | this is the 3 | this is the first record 4 | Spark shines in data analytics and beyond 5 | this is the second record 6 | Spark shines again in data analytics and beyond 7 | -------------------------------------------------------------------------------- /code/chap03/scala/data/sample_input.csv: -------------------------------------------------------------------------------- 1 | alex,Sunnyvale,25 2 | alex,Sunnyvale,33 3 | mary,Ames,22 4 | mary,Cupertino,66 5 | mary,Sunnyvale,44 6 | jane,Ames,20 7 | jane,Troy,40 8 | bob,Ames,26 9 | -------------------------------------------------------------------------------- /code/chap03/scala/data/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 9 2 | 5 3 | 33 4 | 66 5 | -21 6 | -33 7 | 1 8 | 2 9 | 3 10 | 44 11 | 55 12 | 66 13 | 1 14 | 2 15 | -1 16 | -2 17 | 0 18 | 5 19 | 6 20 | 7 21 | 8 22 | 0 23 | -8 24 | -9 25 | 0 26 | 0 27 | 6 28 | 7 29 | 8 30 | 9 31 | 0 32 | -1 33 | -------------------------------------------------------------------------------- /code/chap03/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap03/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/chap03/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/chap03/scala/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | update_data_in_sh() { 4 | filename=$1 5 | classname=$2 6 | echo "#!/bin/bash" > $filename 7 | echo "./gradlew clean run -PmainClass="$classname >> $filename 8 | } 9 | 10 | script_folder_name=run_spark_applications_scripts 11 | 12 | if [ ! -d $script_folder_name ] 13 | then 14 | mkdir $script_folder_name 15 | fi 16 | 17 | 18 | for file in `find . -type f -regex ".*\.scala"` 19 | do 20 | filename=$(echo $file | awk -F "/" '{print $NF}' | cut -d "." -f1) 21 | path=$(echo $file | rev | cut -d "/" -f 2- | rev) 22 | packagename=$(echo $file | awk -F "/" ' 23 | BEGIN { ORS="" }; 24 | {for(i=5;i $filename 7 | echo "./gradlew clean run -PmainClass="$classname >> $filename 8 | } 9 | 10 | script_folder_name=run_spark_applications_scripts 11 | 12 | if [ ! -d $script_folder_name ] 13 | then 14 | mkdir $script_folder_name 15 | fi 16 | 17 | 18 | for file in `find . -type f -regex ".*\.scala"` 19 | do 20 | filename=$(echo $file | awk -F "/" '{print $NF}' | cut -d "." 
-f1) 21 | path=$(echo $file | rev | cut -d "/" -f 2- | rev) 22 | packagename=$(echo $file | awk -F "/" ' 23 | BEGIN { ORS="" }; 24 | {for(i=5;i<,><,><,> 4 | -------------------------------------------------------------------------------- /code/chap05/python/customers.txt: -------------------------------------------------------------------------------- 1 | c1,2019,T0011,20.67 2 | c1,2019,T0012,12.34 3 | c1,2019,T0013,44.30 4 | c1,2018,T0001,20.67 5 | c1,2018,T0002,12.34 6 | c1,2018,T0003,44.30 7 | c2,2019,T0017,744.30 8 | c2,2019,T0018,820.67 9 | c2,2018,T0022,182.34 10 | c2,2018,T0033,494.30 11 | -------------------------------------------------------------------------------- /code/chap05/python/customers_with_date.RECORD.FORMAT.txt: -------------------------------------------------------------------------------- 1 | Each record has the following format: 2 | 3 | <,><,><,> 4 | -------------------------------------------------------------------------------- /code/chap05/python/customers_with_date.txt: -------------------------------------------------------------------------------- 1 | c1,2/9/2019,T0011,20 2 | c1,2/9/2019,T0012,12 3 | c1,3/9/2019,T0013,30 4 | c1,3/9/2019,T0014,42 5 | c1,4/12/2019,T0023,48 6 | c1,4/12/2018,T0051,28 7 | c1,4/12/2019,T0043,42 8 | c1,4/12/2018,T0091,29 9 | c1,1/3/2018,T0002,12 10 | c1,4/3/2018,T0003,44 11 | c2,2/10/2019,T0511,20 12 | c2,2/10/2019,T0612,17 13 | c2,2/9/2019,T0061,25 14 | c2,2/9/2019,T0062,78 15 | c2,3/12/2019,T0513,67 16 | c2,3/12/2019,T0014,42 17 | c2,4/10/2019,T0023,48 18 | c2,4/10/2018,T0051,28 19 | c2,4/12/2019,T0043,42 20 | c2,4/12/2018,T0091,29 21 | c2,1/9/2018,T0002,12 22 | c2,4/9/2018,T0003,46 23 | -------------------------------------------------------------------------------- /code/chap05/python/partition_data_by_customer_and_year.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run the following program: 3 | # partition_data_by_customer_and_year.py 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/book/spark-3.2.0" 8 | export INPUT_PATH="/book/code/chap05/customers.txt" 9 | export OUTPUT_PATH="/tmp/partition_demo" 10 | export SPARK_PROG="/book/code/chap05/partition_data_by_customer_and_year.py" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH $OUTPUT_PATH 14 | -------------------------------------------------------------------------------- /code/chap05/scala/README.md: -------------------------------------------------------------------------------- 1 | Scala Solutions 2 | -------------------------------------------------------------------------------- /code/chap06/python/airports.json: -------------------------------------------------------------------------------- 1 | {"id":"ORD","City":"Chicago","State":"IL","Country":"USA"} 2 | {"id":"LGA","City":"New York","State":"NY","Country":"USA"} 3 | {"id":"BOS","City":"Boston","State":"MA","Country":"USA"} 4 | {"id":"IAH","City":"Houston","State":"TX","Country":"USA"} 5 | {"id":"EWR","City":"Newark","State":"NJ","Country":"USA"} 6 | {"id":"DEN","City":"Denver","State":"CO","Country":"USA"} 7 | {"id":"MIA","City":"Miami","State":"FL","Country":"USA"} 8 | {"id":"SFO","City":"San Francisco","State":"CA","Country":"USA"} 9 | {"id":"ATL","City":"Atlanta","State":"GA","Country":"USA"} 10 | 
{"id":"DFW","City":"Dallas","State":"TX","Country":"USA"} 11 | {"id":"CLT","City":"Charlotte","State":"NC","Country":"USA"} 12 | {"id":"LAX","City":"Los Angeles","State":"CA","Country":"USA"} 13 | {"id":"SEA","City":"Seattle","State":"WA","Country":"USA"} 14 | -------------------------------------------------------------------------------- /code/chap06/python/breadth_first_search_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. applying Breadth-first search (BFS) algorithm 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/home/book/spark-3.2.0" 9 | export SPARK_PROG="/home/book/code/chap06/breadth_first_search_example.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap06/python/connected_component_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. finding connected components 5 | # 6 | # Reference: https://en.wikipedia.org/wiki/Connected_component_(graph_theory) 7 | #----------------------------------------------------- 8 | # @author Mahmoud Parsian 9 | #----------------------------------------------------- 10 | export SPARK_HOME="/home/book/spark-3.2.0" 11 | export SPARK_PROG="/home/book/code/chap06/connected_component_example.py" 12 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 13 | # 14 | # run the PySpark program: 15 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 16 | -------------------------------------------------------------------------------- /code/chap06/python/graph_builder.log: -------------------------------------------------------------------------------- 1 | export SPARK_HOME="/home/book/spark-3.2.0" 2 | export SPARK_PROG="/home/book/code/chap06/graph_builder.py" 3 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 4 | # 5 | # run the PySpark program: 6 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 7 | 8 | +---+-------+---+ 9 | | id| name|age| 10 | +---+-------+---+ 11 | | a| Alice| 34| 12 | | b| Bob| 36| 13 | | c|Charlie| 30| 14 | +---+-------+---+ 15 | 16 | +---+---+------------+ 17 | |src|dst|relationship| 18 | +---+---+------------+ 19 | | a| b| friend| 20 | | b| c| follow| 21 | | c| b| follow| 22 | +---+---+------------+ 23 | 24 | graph= GraphFrame( 25 | v:[id: string, name: string ... 1 more field], 26 | e:[src: string, dst: string ... 1 more field] 27 | ) 28 | 29 | +---+--------+ 30 | | id|inDegree| 31 | +---+--------+ 32 | | c| 1| 33 | | b| 2| 34 | +---+--------+ 35 | 36 | count_follow= 2 -------------------------------------------------------------------------------- /code/chap06/python/graph_builder.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building a graph using 3 | # GraphFrames package. 
4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/home/book/spark-3.2.0" 8 | export SPARK_PROG="/home/book/code/chap06/graph_builder.py" 9 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 13 | -------------------------------------------------------------------------------- /code/chap06/python/label_propagation_algorithm_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. applying Label Propagation Algorithm (LPA) 5 | # 6 | # Reference: https://en.wikipedia.org/wiki/Label_Propagation_Algorithm 7 | #----------------------------------------------------- 8 | # @author Mahmoud Parsian 9 | #----------------------------------------------------- 10 | export SPARK_HOME="/home/book/spark-3.2.0" 11 | export SPARK_PROG="/home/book/code/chap06/label_propagation_algorithm_example.py" 12 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 13 | # 14 | # run the PySpark program: 15 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 16 | -------------------------------------------------------------------------------- /code/chap06/python/pagerank_data.txt: -------------------------------------------------------------------------------- 1 | 1,2 2 | 1,3 3 | 1,4 4 | 2,1 5 | 3,1 6 | 4,1 7 | 4,5 8 | 1,5 9 | -------------------------------------------------------------------------------- /code/chap06/python/pagerank_example.log: -------------------------------------------------------------------------------- 1 | export SPARK_HOME="/home/book/spark-3.2.0" 2 | export SPARK_PROG="/home/book/code/chap06/pagerank_example.py" 3 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 4 | # 5 | # run the PySpark program: 6 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 7 | 8 | +---+-------+---+ 9 | | id| name|age| 10 | +---+-------+---+ 11 | | a| Alice| 34| 12 | | b| Bob| 36| 13 | | c|Charlie| 30| 14 | +---+-------+---+ 15 | 16 | +---+---+------------+ 17 | |src|dst|relationship| 18 | +---+---+------------+ 19 | | a| b| friend| 20 | | b| c| follow| 21 | | c| b| follow| 22 | +---+---+------------+ 23 | 24 | graph= GraphFrame( 25 | v:[id: string, name: string ... 1 more field], 26 | e:[src: string, dst: string ... 1 more field] 27 | ) 28 | 29 | +---+------------------+ 30 | | id| pagerank| 31 | +---+------------------+ 32 | | b|1.0905890109440908| 33 | | a| 0.01| 34 | | c|1.8994109890559092| 35 | +---+------------------+ 36 | -------------------------------------------------------------------------------- /code/chap06/python/pagerank_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building 3 | # 1. a graph using GraphFrames package. 4 | # 2. 
applying PageRank algorithm to the built graph 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/home/book/spark-3.2.0" 9 | export SPARK_PROG="/home/book/code/chap06/pagerank_example.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap06/python/sample_graph_edges.txt: -------------------------------------------------------------------------------- 1 | edge_weight,from_id,to_id 2 | 0,5,15 3 | 1,18,8 4 | 2,6,1 5 | 3,0,10 6 | 4,2,4 7 | 5,19,7 8 | 6,9,7 9 | 7,11,9 10 | 8,14,9 11 | 9,16,11 12 | 10,17,8 13 | 1,3,4 14 | 2,12,15 15 | 3,13,2 16 | 4,21,0 17 | 5,22,4 18 | 16,22,8 19 | 17,24,4 20 | 18,28,7 21 | 19,28,13 22 | 20,28,16 23 | 1,29,11 24 | 2,30,16 25 | 3,31,15 26 | 24,32,2 27 | 25,32,30 28 | 6,35,11 29 | 7,35,24 30 | 28,36,16 31 | 29,39,7 32 | 30,39,28 33 | 1,40,7 34 | 2,40,11 35 | 3,41,5 36 | 4,41,16 37 | 5,41,32 38 | 6,42,32 39 | 7,43,36 40 | 8,44,16 41 | 9,46,7 42 | 6,49,3 43 | 1,5,31 44 | 2,30,42 45 | 4,17,22 46 | 4,18,22 47 | 1,50,51 48 | 2,51,52 49 | 3,50,52 50 | 1,71,72 51 | 1,71,73 52 | 1,72,73 53 | -------------------------------------------------------------------------------- /code/chap06/python/sample_graph_vertices.txt: -------------------------------------------------------------------------------- 1 | vertex_id 2 | 0 3 | 1 4 | 2 5 | 3 6 | 4 7 | 5 8 | 6 9 | 7 10 | 8 11 | 9 12 | 10 13 | 11 14 | 12 15 | 13 16 | 14 17 | 15 18 | 16 19 | 17 20 | 18 21 | 19 22 | 20 23 | 21 24 | 22 25 | 23 26 | 24 27 | 25 28 | 26 29 | 27 30 | 28 31 | 29 32 | 30 33 | 31 34 | 32 35 | 33 36 | 34 37 | 35 38 | 36 39 | 37 40 | 38 41 | 39 42 | 40 43 | 41 44 | 42 45 | 43 46 | 44 47 | 45 48 | 46 49 | 47 50 | 48 51 | 49 52 | 50 53 | 51 54 | 52 55 | 71 56 | 72 57 | 73 58 | -------------------------------------------------------------------------------- /code/chap06/python/shortest_path_finder.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. finding shortest paths for given landmarks 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/home/book/spark-3.2.0" 9 | export SPARK_PROG="/home/book/code/chap06/shortest_path_finder.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap06/python/triangles_counter.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building 3 | # 1. a graph using GraphFrames package. 4 | # 2. 
applying Triangles Counting algorithm to the built graph 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/home/book/spark-3.2.0" 9 | export SPARK_PROG="/home/book/code/chap06/triangles_counter.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap06/python/unique_triangles_finder.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building 3 | # 1. a graph using GraphFrames package. 4 | # 2. find unique Triangles from the built graph 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/home/book/spark-3.2.0" 9 | export SPARK_PROG="/home/book/code/chap06/unique_triangles_finder.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.8.2-spark3.2-s_2.12" 11 | # 12 | export VERTICES_PATH="/home/book/code/chap06/sample_graph_vertices.txt" 13 | export EDGES_PATH="/home/book/code/chap06/sample_graph_edges.txt" 14 | # 15 | # run the PySpark program: 16 | ${SPARK_HOME}/bin/spark-submit --packages ${GRAPH_FRAMES} ${SPARK_PROG} ${VERTICES_PATH} ${EDGES_PATH} 17 | -------------------------------------------------------------------------------- /code/chap06/scala/README.md: -------------------------------------------------------------------------------- 1 | Scala Solutions 2 | -------------------------------------------------------------------------------- /code/chap06/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.12' 5 | ext.scalaVersion = '2.12.15' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.ch03' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | // mavenLocal() 13 | mavenCentral() 14 | maven { 15 | url "https://repos.spark-packages.org" 16 | } 17 | } 18 | 19 | dependencies { 20 | implementation "org.scala-lang:scala-library:$scalaVersion" 21 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 22 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 23 | implementation "org.apache.spark:spark-graphx_$scalaClassifier:$sparkVersion" 24 | implementation 'graphframes:graphframes:0.8.2-spark3.2-s_2.12' 25 | 26 | } 27 | 28 | application { 29 | mainClass = project.hasProperty("mainClass") ? 
project.getProperty("mainClass") : "NULL" 30 | } -------------------------------------------------------------------------------- /code/chap06/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-ch06' 2 | 3 | -------------------------------------------------------------------------------- /code/chap07/python/cats.no.header.csv: -------------------------------------------------------------------------------- 1 | cuttie,2,female,6 2 | mono,3,male,9 3 | fuzzy,1,female,4 4 | -------------------------------------------------------------------------------- /code/chap07/python/cats.with.header.csv: -------------------------------------------------------------------------------- 1 | name,age,gender,weight 2 | cuttie,2,female,6 3 | mono,3,male,9 4 | fuzzy,1,female,4 5 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_csv_reader_no_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_csv_reader_no_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_FILE="/book/code/chap07/sample_no_header.csv" 8 | export SPARK_PROG="/book/code/chap07/datasource_csv_reader_no_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_csv_reader_with_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_csv_reader_with_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_FILE="/book/code/chap07/sample_with_header.csv" 8 | export SPARK_PROG="/book/code/chap07/datasource_csv_reader_with_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_csv_writer.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run datasource_csv_writer.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_csv_writer.py" 8 | export OUTPUT_CSV_FILE_PATH="/tmp/output.csv" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG ${OUTPUT_CSV_FILE_PATH} 12 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_elasticsearch_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run 
datasource_elasticsearch_reader.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_elasticsearch_reader.py" 8 | export ELASTIC_SEARCH_HOST="localhost" 9 | export JAR="/book/code/jars/elasticsearch-hadoop-6.4.2.jar" 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit --jars "${JAR}" $SPARK_PROG ${ELASTIC_SEARCH_HOST} 13 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_elasticsearch_writer.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run datasource_elasticsearch_writer.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_elasticsearch_writer.py" 8 | export ELASTIC_SEARCH_HOST="localhost" 9 | export JAR="/book/code/jars/elasticsearch-hadoop-6.4.2.jar" 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit --jars "${JAR}" $SPARK_PROG ${ELASTIC_SEARCH_HOST} 13 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_gzip_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_gzip_reader.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_FILE="/book/code/chap07/sample_no_header.csv.gz" 8 | export SPARK_PROG="/book/code/chap07/datasource_gzip_reader.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_jdbc_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_jdbc_reader.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_jdbc_reader.py" 8 | # 9 | # define the required MySQL database connection parameters 10 | export JDBC_URL="jdbc:mysql://localhost/metadb" 11 | export JDBC_DRIVER="com.mysql.jdbc.Driver" 12 | export JDBC_USER="root" 13 | export JDBC_PASSWORD="mp22_password" 14 | export JDBC_SOURCE_TABLE_NAME="dept" 15 | # 16 | # define the required JAR file for MySQL database access 17 | export JAR="/book/code/jars/mysql-connector-java-5.1.42.jar" 18 | # 19 | # run the PySpark program: 20 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${JDBC_URL} ${JDBC_DRIVER} ${JDBC_USER} ${JDBC_PASSWORD} ${JDBC_SOURCE_TABLE_NAME} 21 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_jdbc_writer.sh:
-------------------------------------------------------------------------------- 1 | #-------------------------------------------------------- 2 | # This is a shell script to run datasource_jdbc_writer.py 3 | #-------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #-------------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_jdbc_writer.py" 8 | # 9 | # define the required MySQL database connection parameters 10 | export JDBC_URL="jdbc:mysql://localhost/metadb" 11 | export JDBC_DRIVER="com.mysql.jdbc.Driver" 12 | export JDBC_USER="root" 13 | export JDBC_PASSWORD="mp22_password" 14 | export JDBC_TARGET_TABLE_NAME="people" 15 | # 16 | # define the required JAR file for MySQL database access 17 | export JAR="/book/code/jars/mysql-connector-java-5.1.42.jar" 18 | # 19 | # run the PySpark program: 20 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${JDBC_URL} ${JDBC_DRIVER} ${JDBC_USER} ${JDBC_PASSWORD} ${JDBC_TARGET_TABLE_NAME} 21 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_json_reader_multi_line.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_json_reader_multi_line.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_FILE="/book/code/chap07/sample_multi_line.json" 8 | export SPARK_PROG="/book/code/chap07/datasource_json_reader_multi_line.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_json_reader_single_line.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_json_reader_single_line.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_FILE="/book/code/chap07/sample_single_line.json" 8 | export SPARK_PROG="/book/code/chap07/datasource_json_reader_single_line.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_mongodb_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_mongodb_reader.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_mongodb_reader.py" 8 | export MONGODB_COLLECTION_URI="mongodb://127.0.0.1/test.coll44" 9 | export JAR1="/book/code/jars/mongo-java-driver-3.8.2.jar" 10 | export JAR2="/book/code/jars/mongo-spark-connector_2.11-2.2.5.jar" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --jars "${JAR1},${JAR2}" 
$SPARK_PROG ${MONGODB_COLLECTION_URI} 14 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_mongodb_writer.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run datasource_mongodb_reader.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_mongodb_writer.py" 8 | export MONGODB_COLLECTION_URI="mongodb://127.0.0.1/test.coll66" 9 | export JAR1="/book/code/jars/mongo-java-driver-3.8.2.jar" 10 | export JAR2="/book/code/jars/mongo-spark-connector_2.11-2.2.5.jar" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --jars "${JAR1},${JAR2}" $SPARK_PROG ${MONGODB_COLLECTION_URI} 14 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_redis_reader.sh: -------------------------------------------------------------------------------- 1 | #-------------------------------------------------------- 2 | # This is a shell script to run datasource_redis_reader.py 3 | #-------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #-------------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_redis_reader.py" 8 | # 9 | # define the required redis database connection parameters 10 | export REDIS_HOST="localhost" 11 | export REDIS_PORT="6379" 12 | # you may add password 13 | #export REDIS_PASSWORD="" 14 | # 15 | # define the required JAR file for redis database access 16 | export JAR="/book/code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar" 17 | # 18 | # run the PySpark program: 19 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${REDIS_HOST} ${REDIS_PORT} 20 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_redis_writer.sh: -------------------------------------------------------------------------------- 1 | #-------------------------------------------------------- 2 | # This is a shell script to run datasource_redis_writer.py 3 | #-------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #-------------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap07/datasource_redis_writer.py" 8 | # 9 | # define the required redis database connection parameters 10 | export REDIS_HOST="localhost" 11 | export REDIS_PORT="6379" 12 | # you may add password 13 | #export REDIS_PASSWORD="" 14 | # 15 | # define the required JAR file for redis database access 16 | export JAR="/book/code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar" 17 | # 18 | # run the PySpark program: 19 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${REDIS_HOST} ${REDIS_PORT} 20 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_textfile_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_textfile_reader.py 3 | 
#----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_FILE="/book/code/chap07/sample_numbers.txt" 8 | export SPARK_PROG="/book/code/chap07/datasource_textfile_reader.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/python/datasource_textfile_writer.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_textfile_writer.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export OUTPUT_PATH="/tmp/zoutput" 8 | export SPARK_PROG="/book/code/chap07/datasource_textfile_writer.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $OUTPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap07/python/images/cat1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/images/cat1.jpg -------------------------------------------------------------------------------- /code/chap07/python/images/cat2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/images/cat2.jpg -------------------------------------------------------------------------------- /code/chap07/python/images/cat3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/images/cat3.jpg -------------------------------------------------------------------------------- /code/chap07/python/images/cat4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/images/cat4.jpg -------------------------------------------------------------------------------- /code/chap07/python/images/duck1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/images/duck1.jpg -------------------------------------------------------------------------------- /code/chap07/python/images/duck2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/images/duck2.jpg -------------------------------------------------------------------------------- /code/chap07/python/images/not-image.txt: -------------------------------------------------------------------------------- 1 | not an image 2 | 
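
The images/ directory listed above (cat*.jpg, duck*.jpg, plus a deliberately non-image not-image.txt) is presumably the input for an image data-source example in this chapter. For orientation, here is a minimal sketch of loading such a directory with Spark's built-in "image" data source, using dropInvalid to skip the file that cannot be decoded; the load path is a placeholder and this is not necessarily the chapter's exact reader program:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("image-reader-sketch").getOrCreate()

# the "image" data source (Spark 2.4+) reads each file into a struct column;
# dropInvalid=true silently skips files that cannot be decoded as images
images = (spark.read.format("image")
          .option("dropInvalid", True)
          .load("code/chap07/python/images"))

images.select("image.origin", "image.width", "image.height").show(truncate=False)

spark.stop()
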
-------------------------------------------------------------------------------- /code/chap07/python/mongodb_coll44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/mongodb_coll44.png -------------------------------------------------------------------------------- /code/chap07/python/mongodb_coll66.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/mongodb_coll66.png -------------------------------------------------------------------------------- /code/chap07/python/name_age_salary.csv: -------------------------------------------------------------------------------- 1 | alex,60,18000 2 | adel,40,45000 3 | adel,50,77000 4 | jane,40,52000 5 | jane,60,81000 6 | alex,50,62000 7 | mary,50,92000 8 | mary,60,63000 9 | mary,40,55000 10 | mary,40,55000 11 | -------------------------------------------------------------------------------- /code/chap07/python/people.txt: -------------------------------------------------------------------------------- 1 | Alex,30,Tennis 2 | Betty,40,Swimming 3 | Dave,20,Walking 4 | Jeff,77,Baseball 5 | -------------------------------------------------------------------------------- /code/chap07/python/sample_multi_line.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"name":"alex","id":100,"scores":[8,1,2,3],"dict": {"key": "value11"}}, 3 | {"name":"jane","id":200,"scores":[4,6],"dict": {"key": "value22"}}, 4 | { 5 | "name": "bob", 6 | "id": 300, 7 | "scores": [ 8 | 3, 9 | 4, 10 | 6, 11 | 9 12 | ], 13 | "dict": { 14 | "key": "value33", 15 | "key2": "value44" 16 | } 17 | }, 18 | { 19 | "name": "bob", 20 | "id": 400, 21 | "scores": [ 22 | 3, 23 | 5, 24 | 6, 25 | 9 26 | ], 27 | "dict": { 28 | "key": "value55", 29 | "key2": "value66" 30 | } 31 | } 32 | ] 33 | -------------------------------------------------------------------------------- /code/chap07/python/sample_no_header.csv: -------------------------------------------------------------------------------- 1 | Alex,Sunnyvale,30 2 | Mary,Cupertino,28 3 | Jane,Stanford,44 4 | Bob,Ames,33 5 | -------------------------------------------------------------------------------- /code/chap07/python/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 123,344,455,6666,2,300 2 | 7777,4444,55 3 | 22,34 4 | 900,901,902,9000,5600,5600,5700,45 5 | 45 6 | 70,71,72 7 | -------------------------------------------------------------------------------- /code/chap07/python/sample_single_line.json: -------------------------------------------------------------------------------- 1 | {"name":"alex","id":200,"scores":[1,2],"dict": {"key1": "value11", "key2": "value12"}} 2 | {"name":"bob","id":300,"scores":[1,2,4,6],"dict": {"key1": "value16"}} 3 | {"name":"jane","id":400,"scores":[2,4,6],"dict": {"key4": "value41"}} 4 | {"name":"mary","id":500,"scores":[5,9],"dict": {"key4": "value77", "key3": "value88"}} 5 | -------------------------------------------------------------------------------- /code/chap07/python/sample_with_header.csv: -------------------------------------------------------------------------------- 1 | name,city,age 2 | Alex,Sunnyvale,30 3 | Mary,Cupertino,28 4 | Jane,Stanford,44 5 | Bob,Ames,33 6 
| -------------------------------------------------------------------------------- /code/chap07/python/twitter.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/python/twitter.avro -------------------------------------------------------------------------------- /code/chap07/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.12' 5 | ext.scalaVersion = '2.12.15' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.ch07' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | implementation 'com.redislabs:spark-redis_2.12:3.0.0' 21 | implementation 'org.elasticsearch:elasticsearch-hadoop:7.16.3' 22 | implementation 'org.mongodb.spark:mongo-spark-connector_2.12:3.0.1' 23 | implementation 'mysql:mysql-connector-java:8.0.27' 24 | } 25 | 26 | application { 27 | mainClass = project.hasProperty("mainClass") ? project.getProperty("mainClass") : "NULL" 28 | } -------------------------------------------------------------------------------- /code/chap07/scala/data/sample_multi_line.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"name":"alex","id":100,"scores":[8,1,2,3],"dict": {"key": "value11"}}, 3 | {"name":"jane","id":200,"scores":[4,6],"dict": {"key": "value22"}}, 4 | { 5 | "name": "bob", 6 | "id": 300, 7 | "scores": [ 8 | 3, 9 | 4, 10 | 6, 11 | 9 12 | ], 13 | "dict": { 14 | "key": "value33", 15 | "key2": "value44" 16 | } 17 | }, 18 | { 19 | "name": "bob", 20 | "id": 400, 21 | "scores": [ 22 | 3, 23 | 5, 24 | 6, 25 | 9 26 | ], 27 | "dict": { 28 | "key": "value55", 29 | "key2": "value66" 30 | } 31 | } 32 | ] 33 | -------------------------------------------------------------------------------- /code/chap07/scala/data/sample_no_header.csv: -------------------------------------------------------------------------------- 1 | Alex,Sunnyvale,30 2 | Mary,Cupertino,28 3 | Jane,Stanford,44 4 | Bob,Ames,33 5 | -------------------------------------------------------------------------------- /code/chap07/scala/data/sample_no_header.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/scala/data/sample_no_header.csv.gz -------------------------------------------------------------------------------- /code/chap07/scala/data/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 123,344,455,6666,2,300 2 | 7777,4444,55 3 | 22,34 4 | 900,901,902,9000,5600,5600,5700,45 5 | 45 6 | 70,71,72 7 | -------------------------------------------------------------------------------- /code/chap07/scala/data/sample_single_line.json: -------------------------------------------------------------------------------- 1 | {"name":"alex","id":200,"scores":[1,2],"dict": {"key1": "value11", "key2": "value12"}} 2 | {"name":"bob","id":300,"scores":[1,2,4,6],"dict": 
{"key1": "value16"}} 3 | {"name":"jane","id":400,"scores":[2,4,6],"dict": {"key4": "value41"}} 4 | {"name":"mary","id":500,"scores":[5,9],"dict": {"key4": "value77", "key3": "value88"}} 5 | -------------------------------------------------------------------------------- /code/chap07/scala/data/sample_with_header.csv: -------------------------------------------------------------------------------- 1 | name,city,age 2 | Alex,Sunnyvale,30 3 | Mary,Cupertino,28 4 | Jane,Stanford,44 5 | Bob,Ames,33 6 | -------------------------------------------------------------------------------- /code/chap07/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap07/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/chap07/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_csv_reader_header.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_with_header.csv" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceCSVReaderHeader "--args=$INPUT_PATH" -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_csv_reader_no_header.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | INPUT_PATH="data/sample_no_header.csv" 4 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceCSVReaderNoHeader "--args=$INPUT_PATH" 5 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_csv_writer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export OUTPUT_CSV_FILE_PATH="data/tmp/output" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceCSVWriter "--args=$OUTPUT_CSV_FILE_PATH" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_elasticsearch_reader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ELASTICSEARCH_SERVER="localhost" 3 | ELASTICSEARCH_PORT="9200" 4 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceElasticsearchReader "--args=$ELASTICSEARCH_SERVER $ELASTICSEARCH_PORT" 5 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_elasticsearch_writer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ELASTICSEARCH_SERVER="localhost" 3 | ELASTICSEARCH_PORT="9200" 4 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceElasticsearchWriter "--args=$ELASTICSEARCH_SERVER 
$ELASTICSEARCH_PORT" 5 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_gzip_reader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_no_header.csv.gz" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceGZIPReader "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_jdbc_reader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | JDBC_URL=jdbc:mysql://localhost/metadb 3 | JDBC_DRIVER=com.mysql.cj.jdbc.Driver 4 | JDBC_TARGET_TABLE_NAME=people 5 | JDBC_USER=root 6 | JDBC_PASSWORD=my-secret-pw 7 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceJDBCReader "--args=$JDBC_URL $JDBC_DRIVER $JDBC_USER $JDBC_PASSWORD $JDBC_TARGET_TABLE_NAME" 8 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_jdbc_writer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | JDBC_URL=jdbc:mysql://localhost/metadb 3 | JDBC_DRIVER=com.mysql.cj.jdbc.Driver 4 | JDBC_TARGET_TABLE_NAME=people 5 | JDBC_USER=root 6 | JDBC_PASSWORD=my-secret-pw 7 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceJDBCWriter "--args=$JDBC_URL $JDBC_DRIVER $JDBC_USER $JDBC_PASSWORD $JDBC_TARGET_TABLE_NAME" 8 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_json_reader_multi_line.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_multi_line.json" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceJSONReaderMultiLine "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_json_reader_single_line.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_single_line.json" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceJSONReaderSingleLine "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_mongodb_reader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MONGODB_URI="mongodb://localhost:27017/test.coll66" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceMongodbReader "--args=$MONGODB_URI" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_mongodb_writer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | MONGODB_URI="mongodb://localhost:27017/test.coll66" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceMongodbWriter "--args=$MONGODB_URI" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_redis_reader.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | REDIS_SERVER="localhost" 3 | REDIS_PORT="6379" 4 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceRedisReader "--args=$REDIS_SERVER $REDIS_PORT" 5 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_redis_writer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | REDIS_SERVER="localhost" 3 | REDIS_PORT="6379" 4 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceRedisWriter "--args=$REDIS_SERVER $REDIS_PORT" 5 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_textfile_reader.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_numbers.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceTextfileReader "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/run_spark_applications_scripts/datasource_textfile_writer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUTPUT_PATH="data/tmp/text-file-out" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch07.DatasourceTextfileWriter "--args=$OUTPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/chap07/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-ch07' 2 | 3 | -------------------------------------------------------------------------------- /code/chap08/python/rank_product/sample_input/rp1.txt: -------------------------------------------------------------------------------- 1 | K_1,30.0 2 | K_2,60.0 3 | K_3,10.0 4 | K_4,80.0 5 | -------------------------------------------------------------------------------- /code/chap08/python/rank_product/sample_input/rp2.txt: -------------------------------------------------------------------------------- 1 | K_1,90.0 2 | K_2,70.0 3 | K_3,40.0 4 | K_4,50.0 5 | -------------------------------------------------------------------------------- /code/chap08/python/rank_product/sample_input/rp3.txt: -------------------------------------------------------------------------------- 1 | K_1,4.0 2 | K_2,8.0 3 | -------------------------------------------------------------------------------- /code/chap08/scala/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 8 2 | The program covers page rank and rank product algorithms using group by key and combine by key. 
3 | 4 | * ### Page Rank: 5 | * `org.data.algorithms.spark.ch08.PageRank` (Spark program) 6 | * `./run_spark_applications_scripts/page_rank.sh` (shell script to call Spark Application) 7 | 8 | * ### Rank Product using Combine By Key: 9 | * `org.data.algorithms.spark.ch08.RankProductUsingCombineByKey` (Spark program) 10 | * `./run_spark_applications_scripts/rank_product_using_combine_by_key.sh` (shell script to call Spark Application) 11 | 12 | * ### Rank Product using Group By Key: 13 | * `org.data.algorithms.spark.ch08.RankProductUsingGroupByKey` (Spark program) 14 | * `./run_spark_applications_scripts/rank_product_using_group_by_key.sh` (shell script to call Spark Application) 15 | -------------------------------------------------------------------------------- /code/chap08/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.ch08' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ? project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/chap08/scala/data/sample_input/rp1.txt: -------------------------------------------------------------------------------- 1 | K_1,30.0 2 | K_2,60.0 3 | K_3,10.0 4 | K_4,80.0 5 | -------------------------------------------------------------------------------- /code/chap08/scala/data/sample_input/rp2.txt: -------------------------------------------------------------------------------- 1 | K_1,90.0 2 | K_2,70.0 3 | K_3,40.0 4 | K_4,50.0 5 | -------------------------------------------------------------------------------- /code/chap08/scala/data/sample_input/rp3.txt: -------------------------------------------------------------------------------- 1 | K_1,4.0 2 | K_2,8.0 3 | -------------------------------------------------------------------------------- /code/chap08/scala/data/urls.txt: -------------------------------------------------------------------------------- 1 | url_1,url_4 2 | url_2,url_1 3 | url_3,url_2 4 | url_3,url_1 5 | url_4,url_3 6 | url_4,url_1 -------------------------------------------------------------------------------- /code/chap08/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap08/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/chap08/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- 
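The Chapter 8 README above pairs the Rank Product pattern with combineByKey(). As a rough illustration of that idea (a sketch only, not the repository's RankProductUsingCombineByKey program; the variable names and the geometric-mean step are assumptions), a minimal PySpark version could look like this:

~~~python
# Hedged sketch of the Rank Product pattern with combineByKey()
# (illustration only; study values are copied from data/sample_input/rp1.txt and rp2.txt above).
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("rank-product-sketch").getOrCreate()
sc = spark.sparkContext

def rank_one_study(study_rdd):
    # sort one study's (key, value) pairs by value, descending, and assign ranks 1..N
    return (study_rdd.sortBy(lambda kv: -kv[1])
                     .zipWithIndex()
                     .map(lambda pair: (pair[0][0], pair[1] + 1)))

study1 = sc.parallelize([("K_1", 30.0), ("K_2", 60.0), ("K_3", 10.0), ("K_4", 80.0)])
study2 = sc.parallelize([("K_1", 90.0), ("K_2", 70.0), ("K_3", 40.0), ("K_4", 50.0)])

all_ranks = rank_one_study(study1).union(rank_one_study(study2))

# per key, accumulate (product_of_ranks, number_of_studies) without grouping all ranks
rank_product = (all_ranks
    .combineByKey(lambda rank: (rank, 1),
                  lambda acc, rank: (acc[0] * rank, acc[1] + 1),
                  lambda a, b: (a[0] * b[0], a[1] + b[1]))
    .mapValues(lambda pc: pc[0] ** (1.0 / pc[1])))  # geometric mean of the ranks

print(rank_product.collect())
~~~

Because combineByKey() keeps only a running (product, count) pair per key, no single node ever has to hold all of a key's ranks, which is the practical difference from the groupByKey() variant listed in the README.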
/code/chap08/scala/run_spark_applications_scripts/page_rank.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/urls.txt" 3 | NUMBER_OF_ITERATIONS=5 4 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch08.PageRank "--args=$INPUT_PATH $NUMBER_OF_ITERATIONS" 5 | -------------------------------------------------------------------------------- /code/chap08/scala/run_spark_applications_scripts/rank_product_using_combine_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUTPUT_PATH="data/tmp/rank-product-combine-by-key" 3 | NUMBER_OF_STUDIES=3 4 | INPUT_PATH_FOR_STUDY_1="data/sample_input/rp1.txt" 5 | INPUT_PATH_FOR_STUDY_2="data/sample_input/rp2.txt" 6 | INPUT_PATH_FOR_STUDY_3="data/sample_input/rp3.txt" 7 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch08.RankProductUsingCombineByKey "--args=$OUTPUT_PATH $NUMBER_OF_STUDIES $INPUT_PATH_FOR_STUDY_1 $INPUT_PATH_FOR_STUDY_2 $INPUT_PATH_FOR_STUDY_3" 8 | -------------------------------------------------------------------------------- /code/chap08/scala/run_spark_applications_scripts/rank_product_using_group_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | OUTPUT_PATH="data/tmp/rank-product-group-by-key" 3 | NUMBER_OF_STUDIES=3 4 | INPUT_PATH_FOR_STUDY_1="data/sample_input/rp1.txt" 5 | INPUT_PATH_FOR_STUDY_2="data/sample_input/rp2.txt" 6 | INPUT_PATH_FOR_STUDY_3="data/sample_input/rp3.txt" 7 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch08.RankProductUsingGroupByKey "--args=$OUTPUT_PATH $NUMBER_OF_STUDIES $INPUT_PATH_FOR_STUDY_1 $INPUT_PATH_FOR_STUDY_2 $INPUT_PATH_FOR_STUDY_3" 8 | -------------------------------------------------------------------------------- /code/chap08/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-ch08' 2 | 3 | -------------------------------------------------------------------------------- /code/chap09/python/README.md: -------------------------------------------------------------------------------- 1 | # PySpark Solutions for Chapter 9 2 | 3 | Work in Progress... 4 | 5 | Sample Codes for this chapter will be posted by end of March 2023. 6 | 7 | Thanks. 
8 | -------------------------------------------------------------------------------- /code/chap09/scala/README.md: -------------------------------------------------------------------------------- 1 | Scala Solutions 2 | -------------------------------------------------------------------------------- /code/chap10/data_design_patterns.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap10/data_design_patterns.pdf -------------------------------------------------------------------------------- /code/chap10/python/average_monoid_use_aggregatebykey.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/book/code/chap10/average_monoid_use_aggregatebykey.py" 3 | # define your input path 4 | export INPUT="/book/code/chap10/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap10/python/average_monoid_use_combinebykey.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/book/code/chap10/average_monoid_use_combinebykey.py" 3 | # define your input path 4 | export INPUT="/book/code/chap10/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap10/python/average_monoid_use_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #========================================== 2 | # NOTE: 3 | # 4 | # In general, avoid using groupByKey(), and 5 | # instead use reduceByKey() or combineByKey(). 6 | # For details see: 7 | # https://databricks.gitbooks.io/databricks-spark-knowledge-base/content/best_practices/prefer_reducebykey_over_groupbykey.html 8 | # 9 | # The groupByKey() solution is provided for educational 10 | # purposes. If you need all of the values of a key for 11 | # some aggregation such as finding the "median" (which you 12 | # need all of the values per key), then the groupByKey() 13 | # may be used. 
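# For illustration, a minimal PySpark sketch of the trade-off (assuming an
# RDD of (key, number) pairs named rdd; not part of this script):
#
#   sums = rdd.reduceByKey(lambda x, y: x + y)   # partial sums combined map-side
#
#   # groupByKey() ships every value of a key to one reducer; use it only when
#   # the aggregate truly needs all values, e.g. a median:
#   def median(values):
#       ordered = sorted(values)
#       return ordered[len(ordered) // 2]
#   medians = rdd.groupByKey().mapValues(median)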
14 | #========================================== 15 | # 16 | # define PySpark program 17 | export PROG="/book/code/chap10/average_monoid_use_groupbykey.py" 18 | # define your input path 19 | export INPUT="/book/code/chap10/sample_input.txt" 20 | # define your Spark home directory 21 | export SPARK_HOME="/book/spark-3.2.0" 22 | # run the program 23 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 24 | -------------------------------------------------------------------------------- /code/chap10/python/average_monoid_use_reducebykey.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/book/code/chap10/average_monoid_use_reducebykey.py" 3 | # define your input path 4 | export INPUT="/book/code/chap10/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap10/python/dna_base_count_basic_using_combinebykey.sh: -------------------------------------------------------------------------------- 1 | start_time=$(date +%s) 2 | # 3 | INPUT_PATH=~/Downloads/rs_chY.fas 4 | $SPARK_HOME/bin/spark-submit dna_base_count_basic_using_combinebykey.py $INPUT_PATH 5 | # 6 | end_time=$(date +%s) 7 | # elapsed time with second resolution 8 | elapsed=$(( end_time - start_time )) 9 | echo "elapsed time (in seconds): $elapsed" 10 | 11 | -------------------------------------------------------------------------------- /code/chap10/python/dna_base_count_basic_using_groupbykey.sh: -------------------------------------------------------------------------------- 1 | start_time=$(date +%s) 2 | # 3 | INPUT_PATH=~/Downloads/rs_chY.fas 4 | $SPARK_HOME/bin/spark-submit dna_base_count_basic_using_groupbykey.py $INPUT_PATH 5 | # 6 | end_time=$(date +%s) 7 | # elapsed time with second resolution 8 | elapsed=$(( end_time - start_time )) 9 | echo "elapsed time (in seconds): $elapsed" 10 | 11 | -------------------------------------------------------------------------------- /code/chap10/python/dna_base_count_basic_using_reducebykey.sh: -------------------------------------------------------------------------------- 1 | start_time=$(date +%s) 2 | # 3 | INPUT_PATH=~/Downloads/rs_chY.fas 4 | $SPARK_HOME/bin/spark-submit dna_base_count_basic_using_reducebykey.py $INPUT_PATH 5 | # 6 | end_time=$(date +%s) 7 | # elapsed time with second resolution 8 | elapsed=$(( end_time - start_time )) 9 | echo "elapsed time (in seconds): $elapsed" 10 | 11 | 12 | -------------------------------------------------------------------------------- /code/chap10/python/dna_base_count_inmapper_combiner_using_combinebykey.sh: -------------------------------------------------------------------------------- 1 | start_time=$(date +%s) 2 | # 3 | INPUT_PATH=~/Downloads/rs_chY.fas 4 | $SPARK_HOME/bin/spark-submit dna_base_count_inmapper_combiner_using_combinebykey.py $INPUT_PATH 5 | # 6 | end_time=$(date +%s) 7 | # elapsed time with second resolution 8 | elapsed=$(( end_time - start_time )) 9 | echo "elapsed time (in seconds): $elapsed" 10 | -------------------------------------------------------------------------------- /code/chap10/python/dna_base_count_inmapper_combiner_using_groupbykey.sh: -------------------------------------------------------------------------------- 1 | start_time=$(date +%s) 2 | # 3 | INPUT_PATH=~/Downloads/rs_chY.fas 4 | $SPARK_HOME/bin/spark-submit 
dna_base_count_inmapper_combiner_using_groupbykey.py $INPUT_PATH 5 | # 6 | end_time=$(date +%s) 7 | # elapsed time with second resolution 8 | elapsed=$(( end_time - start_time )) 9 | echo "elapsed time (in seconds): $elapsed" 10 | -------------------------------------------------------------------------------- /code/chap10/python/dna_base_count_inmapper_combiner_using_reducebykey.sh: -------------------------------------------------------------------------------- 1 | start_time=$(date +%s) 2 | # 3 | INPUT_PATH=~/Downloads/rs_chY.fas 4 | $SPARK_HOME/bin/spark-submit dna_base_count_inmapper_combiner_using_reducebykey.py $INPUT_PATH 5 | # 6 | end_time=$(date +%s) 7 | # elapsed time with second resolution 8 | elapsed=$(( end_time - start_time )) 9 | echo "elapsed time (in seconds): $elapsed" 10 | -------------------------------------------------------------------------------- /code/chap10/python/dna_base_count_using_mappartitions.sh: -------------------------------------------------------------------------------- 1 | start_time=$(date +%s) 2 | # 3 | INPUT_PATH=~/Downloads/rs_chY.fas 4 | $SPARK_HOME/bin/spark-submit dna_base_count_using_mappartitions.py $INPUT_PATH 5 | # 6 | end_time=$(date +%s) 7 | # elapsed time with second resolution 8 | elapsed=$(( end_time - start_time )) 9 | echo "elapsed time (in seconds): $elapsed" 10 | 11 | -------------------------------------------------------------------------------- /code/chap10/python/inmapper_combiner_local_aggregation.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/book/code/chap10/inmapper_combiner_local_aggregation.py" 3 | # define your input path 4 | export INPUT="/book/code/chap10/sample_dna_seq.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap10/python/inmapper_combiner_use_basic_mapreduce.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/book/code/chap10/inmapper_combiner_use_basic_mapreduce.py" 3 | # define your input path 4 | export INPUT="/book/code/chap10/sample_dna_seq.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap10/python/inmapper_combiner_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/book/code/chap10/inmapper_combiner_use_mappartitions.py" 3 | # define your input path 4 | export INPUT="/book/code/chap10/sample_dna_seq.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap10/python/minmax_force_empty_partitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run minmax_use_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export 
SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_PATH="/book/code/chap10/sample_numbers.txt" 8 | export SPARK_PROG="/book/code/chap10/minmax_force_empty_partitions.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap10/python/minmax_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run minmax_use_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export INPUT_PATH="/book/code/chap10/sample_numbers.txt" 8 | export SPARK_PROG="/book/code/chap10/minmax_use_mappartitions.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap10/python/minmax_use_mappartitions_v2.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run minmax_use_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.4.0" 7 | export INPUT_PATH="/book/code/chap10/sample_numbers.txt" 8 | export SPARK_PROG="/book/code/chap10/minmax_use_mappartitions_v2.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap10/python/sample_dna_seq.txt: -------------------------------------------------------------------------------- 1 | ATCGGGATCCGGG 2 | ATTCCGGGATTCCCC 3 | ATGGCCCCCGGGATCGGG 4 | CGGTATCCGGGGAAAAA 5 | aaattCCGGAACCGGGGGTTT 6 | CCTTTTATCGGGCAAATTTTCCCGG 7 | attttcccccggaaaAAATTTCCGGG 8 | ACTGACTAGCTAGCTAACTG 9 | GCATCGTAGCTAGCTACGAT 10 | AATTCCCGCATCGATCGTACGTACGTAG 11 | ATCGATCGATCGTACGATCG 12 | -------------------------------------------------------------------------------- /code/chap10/python/sample_input.txt: -------------------------------------------------------------------------------- 1 | a,2 2 | a,3 3 | a,4 4 | a,5 5 | a,7 6 | b,4 7 | b,5 8 | b,6 9 | c,3 10 | c,4 11 | c,5 12 | c,6 13 | -------------------------------------------------------------------------------- /code/chap10/python/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 23,24,22,44,66,77,44,44,555,666 2 | 12,4,555,66,67,68,57,55,56,45,45,45,66,77 3 | 34,35,36,97300,78,79 4 | 120,44,444,445,345,345,555 5 | 11,33,34,35,36,37,47,7777,8888,6666,44,55 6 | 10,11,44,66,77,78,79,80,90,98,99,100,102,103,104,105 7 | 6,7,8,9,10 8 | 8,9,10,12,12 9 | 7777 10 | 222,333,444,555,666,111,112,5,113,114 11 | 5555,4444,24 12 | -------------------------------------------------------------------------------- /code/chap10/python/top_N_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run top_N_use_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | 
#----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap10/top_N_use_mappartitions.py" 8 | # 9 | # run the PySpark program: 10 | # find Top-3 11 | export N=3 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $N 13 | -------------------------------------------------------------------------------- /code/chap10/python/top_N_use_takeordered.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run top_N_use_takeordered.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/book/spark-3.2.0" 7 | export SPARK_PROG="/book/code/chap10/top_N_use_takeordered.py" 8 | # 9 | # run the PySpark program: 10 | # find Top-3 11 | export N=3 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $N 13 | -------------------------------------------------------------------------------- /code/chap10/scala/.gitignore: -------------------------------------------------------------------------------- 1 | data/*.gz -------------------------------------------------------------------------------- /code/chap10/scala/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'scala' 2 | apply plugin: 'application' 3 | 4 | ext.scalaClassifier = '2.13' 5 | ext.scalaVersion = '2.13.7' 6 | ext.sparkVersion = '3.2.0' 7 | 8 | group 'org.data.algorithms.spark.ch10' 9 | version '1.0-SNAPSHOT' 10 | 11 | repositories { 12 | mavenLocal() 13 | mavenCentral() 14 | } 15 | 16 | dependencies { 17 | implementation "org.scala-lang:scala-library:$scalaVersion" 18 | implementation "org.apache.spark:spark-core_$scalaClassifier:$sparkVersion" 19 | implementation "org.apache.spark:spark-sql_$scalaClassifier:$sparkVersion" 20 | } 21 | 22 | application { 23 | mainClass = project.hasProperty("mainClass") ?
project.getProperty("mainClass") : "NULL" 24 | } -------------------------------------------------------------------------------- /code/chap10/scala/data/sample_dna_seq.txt: -------------------------------------------------------------------------------- 1 | ATCGGGATCCGGG 2 | ATTCCGGGATTCCCC 3 | ATGGCCCCCGGGATCGGG 4 | CGGTATCCGGGGAAAAA 5 | aaattCCGGAACCGGGGGTTT 6 | CCTTTTATCGGGCAAATTTTCCCGG 7 | attttcccccggaaaAAATTTCCGGG 8 | ACTGACTAGCTAGCTAACTG 9 | GCATCGTAGCTAGCTACGAT 10 | AATTCCCGCATCGATCGTACGTACGTAG 11 | ATCGATCGATCGTACGATCG 12 | -------------------------------------------------------------------------------- /code/chap10/scala/data/sample_input.txt: -------------------------------------------------------------------------------- 1 | a,2 2 | a,3 3 | a,4 4 | a,5 5 | a,7 6 | b,4 7 | b,5 8 | b,6 9 | c,3 10 | c,4 11 | c,5 12 | c,6 13 | -------------------------------------------------------------------------------- /code/chap10/scala/data/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 23,24,22,44,66,77,44,44,555,666 2 | 12,4,555,66,67,68,57,55,56,45,45,45,66,77 3 | 34,35,36,97300,78,79 4 | 120,44,444,445,345,345,555 5 | 11,33,34,35,36,37,47,7777,8888,6666,44,55 6 | 10,11,44,66,77,78,79,80,90,98,99,100,102,103,104,105 7 | 6,7,8,9,10 8 | 8,9,10,12,12 9 | 7777 10 | 222,333,444,555,666,111,112,5,113,114 11 | 5555,4444,24 12 | -------------------------------------------------------------------------------- /code/chap10/scala/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/chap10/scala/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /code/chap10/scala/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-6.8-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/average_monoid_use_aggregate_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT="data/sample_input.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.AverageMonoidUseAggregateByKey "--args= $INPUT" 4 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/average_monoid_use_combine_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT="data/sample_input.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.AverageMonoidUseCombineByKey "--args= $INPUT" 4 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/average_monoid_use_group_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT="data/sample_input.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.AverageMonoidUseGroupByKey "--args= $INPUT" 4 | 
-------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/average_monoid_use_reduce_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT="data/sample_input.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.AverageMonoidUseReduceByKey "--args=$INPUT" 4 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/dna_base_count_basic_in_mapper_combiner_using_combine_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download the file to data directory 3 | INPUT_PATH="data/rs_chY.fas.gz" 4 | if [ ! -f $INPUT_PATH ]; then 5 | curl -o $INPUT_PATH https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chY.fas.gz -P data 6 | fi 7 | #If file exists run the spark application 8 | if test $INPUT_PATH ; then 9 | start_time=$(date +%s) 10 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.DNABaseCountBasicInMapperCombinerUsingCombineByKey "--args=$INPUT_PATH" 11 | end_time=$(date +%s) 12 | # elapsed time with second resolution 13 | elapsed=$(( end_time - start_time )) 14 | echo "elapsed time (in seconds): $elapsed" 15 | fi -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/dna_base_count_basic_in_mapper_combiner_using_group_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download the file to data directory 3 | INPUT_PATH="data/rs_chY.fas.gz" 4 | if [ ! -f $INPUT_PATH ]; then 5 | curl -o $INPUT_PATH https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chY.fas.gz -P data 6 | fi 7 | #If file exists run the spark application 8 | if test $INPUT_PATH ; then 9 | start_time=$(date +%s) 10 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.DNABaseCountBasicInMapperCombinerUsingGroupByKey "--args=$INPUT_PATH" 11 | end_time=$(date +%s) 12 | # elapsed time with second resolution 13 | elapsed=$(( end_time - start_time )) 14 | echo "elapsed time (in seconds): $elapsed" 15 | fi -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/dna_base_count_basic_in_mapper_combiner_using_reduce_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download the file to data directory 3 | INPUT_PATH="data/rs_chY.fas.gz" 4 | if [ ! 
-f $INPUT_PATH ]; then 5 | curl -o $INPUT_PATH https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chY.fas.gz -P data 6 | fi 7 | #If file exists run the spark application 8 | if test $INPUT_PATH ; then 9 | start_time=$(date +%s) 10 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.DNABaseCountBasicInMapperCombinerUsingReduceByKey "--args=$INPUT_PATH" 11 | end_time=$(date +%s) 12 | # elapsed time with second resolution 13 | elapsed=$(( end_time - start_time )) 14 | echo "elapsed time (in seconds): $elapsed" 15 | fi -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/dna_base_count_basic_using_combine_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download the file to data directory 3 | INPUT_PATH="data/rs_chY.fas.gz" 4 | if [ ! -f $INPUT_PATH ]; then 5 | curl -o $INPUT_PATH https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chY.fas.gz -P data 6 | fi 7 | #If file exists run the spark application 8 | if test $INPUT_PATH ; then 9 | start_time=$(date +%s) 10 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.DNABaseCountBasicUsingCombineByKey "--args=$INPUT_PATH" 11 | end_time=$(date +%s) 12 | # elapsed time with second resolution 13 | elapsed=$(( end_time - start_time )) 14 | echo "elapsed time (in seconds): $elapsed" 15 | fi -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/dna_base_count_basic_using_group_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download the file to data directory 3 | INPUT_PATH="data/rs_chY.fas.gz" 4 | if [ ! -f $INPUT_PATH ]; then 5 | curl -o $INPUT_PATH https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chY.fas.gz -P data 6 | fi 7 | #If file exists run the spark application 8 | if test $INPUT_PATH ; then 9 | start_time=$(date +%s) 10 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.DNABaseCountBasicUsingGroupByKey "--args=$INPUT_PATH" 11 | end_time=$(date +%s) 12 | # elapsed time with second resolution 13 | elapsed=$(( end_time - start_time )) 14 | echo "elapsed time (in seconds): $elapsed" 15 | fi 16 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/dna_base_count_basic_using_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download the file to data directory 3 | INPUT_PATH="data/rs_chY.fas.gz" 4 | if [ ! 
-f $INPUT_PATH ]; then 5 | curl -o $INPUT_PATH https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chY.fas.gz -P data 6 | fi 7 | #If file exists run the spark application 8 | if test $INPUT_PATH ; then 9 | start_time=$(date +%s) 10 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.DNABaseCountBasicUsingMappartitions "--args=$INPUT_PATH" 11 | end_time=$(date +%s) 12 | # elapsed time with second resolution 13 | elapsed=$(( end_time - start_time )) 14 | echo "elapsed time (in seconds): $elapsed" 15 | fi -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/dna_base_count_basic_using_reduce_by_key.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #Download the file to data directory 3 | INPUT_PATH="data/rs_chY.fas.gz" 4 | if [ ! -f $INPUT_PATH ]; then 5 | curl -o $INPUT_PATH https://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606/rs_fasta/rs_chY.fas.gz -P data 6 | fi 7 | #If file exists run the spark application 8 | if test $INPUT_PATH ; then 9 | start_time=$(date +%s) 10 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.DNABaseCountBasicUsingReduceByKey "--args=$INPUT_PATH" 11 | end_time=$(date +%s) 12 | # elapsed time with second resolution 13 | elapsed=$(( end_time - start_time )) 14 | echo "elapsed time (in seconds): $elapsed" 15 | fi 16 | 17 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/in_mapper_combiner_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_dna_seq.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.InMapperCombinerUseMappartitions "--args=$INPUT_PATH" -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/in_mapper_combiner_using_local_aggregation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_dna_seq.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.InMapperCombinerUsingLocalAggregation "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/in_mapper_combiner_using_map_reduce.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_dna_seq.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.InMapperCombinerUsingMapReduce "--args=$INPUT_PATH" -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/min_max_force_empty_partitions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_numbers.txt" 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.MinMaxForceEmptyPartitions "--args=$INPUT_PATH" 4 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/min_max_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | INPUT_PATH="data/sample_numbers.txt" 3 | ./gradlew clean run 
-PmainClass=org.data.algorithms.spark.ch10.MinMaxUseMappartitions "--args=$INPUT_PATH" -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/structured_to_hierarchical_to_xml_dataframe.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.StructuredToHierarchicalToXmlDataframe 3 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/structured_to_hierarchical_to_xml_rdd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.StructuredToHierarchicalToXmlRDD 3 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/top_n_use_map_partitions.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | N=3 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.TopNUseMapPartitions "--args=$N" 4 | -------------------------------------------------------------------------------- /code/chap10/scala/run_spark_applications_scripts/top_n_use_take_ordered.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | N=3 3 | ./gradlew clean run -PmainClass=org.data.algorithms.spark.ch10.TopNUseTakeOrdered "--args=$N" 4 | -------------------------------------------------------------------------------- /code/chap10/scala/settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'data-algos-with-spark-ch10' 2 | 3 | -------------------------------------------------------------------------------- /code/chap11/scala/README.md: -------------------------------------------------------------------------------- 1 | Scala Solutions 2 | -------------------------------------------------------------------------------- /code/chap12/python/README.md: -------------------------------------------------------------------------------- 1 | PySpark Solutions 2 | -------------------------------------------------------------------------------- /code/chap12/scala/README.md: -------------------------------------------------------------------------------- 1 | Scala Solutions 2 | -------------------------------------------------------------------------------- /code/jars/avro-mapred-1.7.7-hadoop1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/avro-mapred-1.7.7-hadoop1.jar -------------------------------------------------------------------------------- /code/jars/avro-mapred-1.7.7-hadoop2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/avro-mapred-1.7.7-hadoop2.jar -------------------------------------------------------------------------------- /code/jars/com-cotdp-hadoop-1.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/com-cotdp-hadoop-1.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /code/jars/elasticsearch-hadoop-6.4.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/elasticsearch-hadoop-6.4.2.jar -------------------------------------------------------------------------------- /code/jars/elasticsearch-spark_2.11-2.4.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/elasticsearch-spark_2.11-2.4.5.jar -------------------------------------------------------------------------------- /code/jars/graphframes-0.6.0-spark2.3-s_2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/graphframes-0.6.0-spark2.3-s_2.11.jar -------------------------------------------------------------------------------- /code/jars/hbase-spark-connector-1.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/hbase-spark-connector-1.0.0.jar -------------------------------------------------------------------------------- /code/jars/htrace-core-3.1.0-incubating.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/htrace-core-3.1.0-incubating.jar -------------------------------------------------------------------------------- /code/jars/mongo-java-driver-3.8.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/mongo-java-driver-3.8.2.jar -------------------------------------------------------------------------------- /code/jars/mongo-spark-connector_2.11-2.2.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/mongo-spark-connector_2.11-2.2.5.jar -------------------------------------------------------------------------------- /code/jars/mongodb-driver-3.8.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/mongodb-driver-3.8.2.jar -------------------------------------------------------------------------------- /code/jars/mysql-connector-java-5.1.42.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/mysql-connector-java-5.1.42.jar -------------------------------------------------------------------------------- 
/code/jars/shc-core-1.1.3-2.3-s_2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/shc-core-1.1.3-2.3-s_2.11.jar -------------------------------------------------------------------------------- /code/jars/shc-examples-1.1.3-2.3-s_2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/shc-examples-1.1.3-2.3-s_2.11.jar -------------------------------------------------------------------------------- /code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar -------------------------------------------------------------------------------- /code/jars/spark-redis-2.3.1-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/code/jars/spark-redis-2.3.1-SNAPSHOT.jar -------------------------------------------------------------------------------- /data/chap02/README.md: -------------------------------------------------------------------------------- 1 | Sample FASTA and FASTQ Files 2 | -------------------------------------------------------------------------------- /data/chap02/sample.fasta: -------------------------------------------------------------------------------- 1 | >seq1 2 | cGTAaccaataaaaaaacaagcttaacctaattc 3 | >seq2 4 | agcttagTTTGGatctggccgggg 5 | >seq3 6 | gcggatttactcCCCCCAAAAANNaggggagagcccagataaatggagtctgtgcgtccaca 7 | gaattcgcacca 8 | AATAAAACCTCACCCAT 9 | agagcccagaatttactcCCC 10 | >seq4 11 | gcggatttactcaggggagagcccagGGataaatggagtctgtgcgtccaca 12 | gaattcgcacca 13 | -------------------------------------------------------------------------------- /docs/goal_of_book.md: -------------------------------------------------------------------------------- 1 | # Goal of this book: Data Algorithms with Spark 2 | 3 | 1. Keep it SIMPLE! 4 | 5 | 2. Goal of this book: enable writing efficient & 6 | simpler PySpark code for data algorithms using Spark 7 | 8 | 3. A lot of [working PySpark code](../code/) is provided 9 | so that the reader can understand how to use basic 10 | transformations on using RDDs and DataFrames 11 | 12 | 4. As much as possible, I have avoided writing complex 13 | code and functions: keep it simple so that you can 14 | debug easily and your co-workers can understand them. 15 | 16 | 5. 
CUT-and-PASTE: you may take portions of the [code](../code/) 17 | and tailor it to your needs 18 | -------------------------------------------------------------------------------- /images/Data-Algorithms-with-Spark_mech2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/Data-Algorithms-with-Spark_mech2.pdf -------------------------------------------------------------------------------- /images/Data-Algorithms-with-Spark_mech2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/Data-Algorithms-with-Spark_mech2.png -------------------------------------------------------------------------------- /images/Data_Algorithms_with_Spark_COVER_9781492082385.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/Data_Algorithms_with_Spark_COVER_9781492082385.jpg -------------------------------------------------------------------------------- /images/Data_Algorithms_with_Spark_COVER_9781492082385.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/Data_Algorithms_with_Spark_COVER_9781492082385.png -------------------------------------------------------------------------------- /images/anagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/anagram.png -------------------------------------------------------------------------------- /images/book_cover_final.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/book_cover_final.pdf -------------------------------------------------------------------------------- /images/correlation-coefficient.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/correlation-coefficient.png -------------------------------------------------------------------------------- /images/data-alg-foreword2.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data-alg-foreword2.docx -------------------------------------------------------------------------------- /images/data-alg-foreword2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data-alg-foreword2.pdf -------------------------------------------------------------------------------- /images/data_algorithms_hard_copy_image.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data_algorithms_hard_copy_image.jpg -------------------------------------------------------------------------------- /images/data_algorithms_with_spark.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data_algorithms_with_spark.jpg -------------------------------------------------------------------------------- /images/data_algorithms_with_spark.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data_algorithms_with_spark.pdf -------------------------------------------------------------------------------- /images/data_algorithms_with_spark_amazon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data_algorithms_with_spark_amazon.jpg -------------------------------------------------------------------------------- /images/data_algorithms_with_spark_knowledge_is_power.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data_algorithms_with_spark_knowledge_is_power.jpeg -------------------------------------------------------------------------------- /images/data_algorithms_with_spark_small.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/data_algorithms_with_spark_small.jpeg -------------------------------------------------------------------------------- /images/joins-in-SQL.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/joins-in-SQL.jpeg -------------------------------------------------------------------------------- /images/kmer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/kmer.jpg -------------------------------------------------------------------------------- /images/kmer_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/kmer_4.png -------------------------------------------------------------------------------- /images/mappartitions_image_1.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/mappartitions_image_1.drawio.png -------------------------------------------------------------------------------- /images/mappartitions_image_2.drawio.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/mappartitions_image_2.drawio.png
--------------------------------------------------------------------------------
/images/sql_joins.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/images/sql_joins.png
--------------------------------------------------------------------------------
/wiki-spark/docs/dataframe_to_rdd.md:
--------------------------------------------------------------------------------
1 | # DataFrame to RDD
2 | 
3 | There are times when you might want to
4 | convert a **DataFrame to an RDD**.
5 | 
6 | ## RDD and DataFrame
7 | 
8 | * Spark's DataFrame (fully qualified name: `pyspark.sql.DataFrame`)
9 | is an immutable, distributed collection of data grouped
10 | into named columns.
11 | 
12 | * Spark's RDD (fully qualified name: `pyspark.RDD`)
13 | is a Resilient Distributed Dataset (`RDD`),
14 | the basic abstraction in Spark. An RDD represents an
15 | immutable, partitioned collection of elements that
16 | can be operated on in parallel.
17 | 
18 | ## DataFrame to RDD Conversion
19 | 
20 | To convert a `DataFrame` to an `RDD`, you just need to
21 | access the `DataFrame.rdd` property.
22 | 
23 | >>> spark.version
24 | '3.3.1'
25 | >>> records = [("alex", 10), ("jane", 20), ("rose", 30)]
26 | >>> df = spark.createDataFrame(records, ["name", "age"])
27 | >>> df.show()
28 | +----+---+
29 | |name|age|
30 | +----+---+
31 | |alex| 10|
32 | |jane| 20|
33 | |rose| 30|
34 | +----+---+
35 | 
36 | >>> # Convert a DataFrame to an RDD
37 | >>> rdd = df.rdd
38 | >>> rdd.collect()
39 | [
40 | Row(name='alex', age=10),
41 | Row(name='jane', age=20),
42 | Row(name='rose', age=30)
43 | ]
44 | 
--------------------------------------------------------------------------------
/wiki-spark/docs/flatmap_transformation.md:
--------------------------------------------------------------------------------
1 | # `RDD.flatMap()`
2 | 
3 | `RDD.flatMap(f)` returns a new RDD by first applying
4 | a function `f()` to all elements of this RDD, and then
5 | flattening the results.
6 | 
7 | 
8 | ![](./spark-flatmap.png)
9 | 
10 | 
11 | A typical use of `flatMap()` is sketched below; in the main
12 | example that follows it, note that the empty elements are dropped.
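A minimal warm-up sketch of a typical `flatMap()` use: splitting lines of text into words. This is only an illustration: it assumes an active `SparkSession` available as `spark`, and the three input lines (one of them empty) are made-up sample data. The empty line splits into an empty list, so it contributes nothing to the flattened result.

~~~python
>>> # made-up sample data: three lines, one of them empty
>>> lines = spark.sparkContext.parallelize(["a red fox", "", "jumped high"])
>>> # split each line into words; empty lines yield no words
>>> words = lines.flatMap(lambda line: line.split())
>>> words.collect()
['a', 'red', 'fox', 'jumped', 'high']
>>> words.count()
5
~~~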
13 | 
14 | 
15 | 
16 | ~~~python
17 | >>> spark.version
18 | '3.3.2'
19 | 
20 | >>> some_elements = [["e0", "e1", "e2", "e3"],
21 | [],
22 | ["e4", "e5"],
23 | ["e6"],
24 | [] ]
25 | >>> len(some_elements)
26 | 5
27 | >>> rdd = spark.sparkContext.parallelize(some_elements)
28 | >>> rdd.collect()
29 | [ ['e0', 'e1', 'e2', 'e3'], [], ['e4', 'e5'], ['e6'], [] ]
30 | 
31 | >>> mapped = rdd.map(lambda x: x)
32 | >>> mapped.collect()
33 | [ ['e0', 'e1', 'e2', 'e3'], [], ['e4', 'e5'], ['e6'], [] ]
34 | >>> mapped.count()
35 | 5
36 | 
37 | >>> flat_mapped = rdd.flatMap(lambda x: x)
38 | >>> flat_mapped.collect()
39 | ['e0', 'e1', 'e2', 'e3', 'e4', 'e5', 'e6']
40 | >>> flat_mapped.count()
41 | 7
42 | ~~~
--------------------------------------------------------------------------------
/wiki-spark/docs/lambda_expressions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/wiki-spark/docs/lambda_expressions.pdf
--------------------------------------------------------------------------------
/wiki-spark/docs/monoid/monoid_math.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/wiki-spark/docs/monoid/monoid_math.png
--------------------------------------------------------------------------------
/wiki-spark/docs/spark-explode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/wiki-spark/docs/spark-explode.png
--------------------------------------------------------------------------------
/wiki-spark/docs/spark-flatmap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/wiki-spark/docs/spark-flatmap.png
--------------------------------------------------------------------------------
/wiki-spark/docs/using-graphframes-with-jupyter.demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/wiki-spark/docs/using-graphframes-with-jupyter.demo.png
--------------------------------------------------------------------------------
/wiki-spark/docs/wiki.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mahmoudparsian/data-algorithms-with-spark/4c049703105e4ea5e3d635edaac303a6b210da41/wiki-spark/docs/wiki.jpeg
--------------------------------------------------------------------------------