├── LICENSE.md ├── README.md ├── code ├── README.md ├── chap01 │ ├── README.md │ ├── basic_dataframe_example.log │ ├── basic_dataframe_example.py │ ├── basic_dataframe_example.sh │ ├── compute_stats.log │ ├── compute_stats.py │ ├── compute_stats.sh │ ├── compute_stats_detailed.log │ ├── compute_stats_detailed.py │ ├── compute_stats_detailed.sh │ ├── compute_stats_with_threshold.log │ ├── compute_stats_with_threshold.py │ ├── compute_stats_with_threshold.sh │ ├── compute_stats_with_threshold_and_filter.log │ ├── compute_stats_with_threshold_and_filter.py │ ├── compute_stats_with_threshold_and_filter.sh │ ├── dataframe_creation_from_csv.log │ ├── dataframe_creation_from_csv.py │ ├── dataframe_creation_from_csv.sh │ ├── fox_data.txt │ ├── name_city_age.csv │ ├── rdd_creation_from_csv.log │ ├── rdd_creation_from_csv.py │ ├── rdd_creation_from_csv.sh │ ├── sample.txt │ ├── sample_numbers.txt │ ├── sample_people.json │ ├── sort_numbers.log │ ├── sort_numbers.py │ ├── sort_numbers.sh │ ├── url_frequencies.txt │ ├── word_count.log │ ├── word_count.py │ ├── word_count.sh │ ├── word_count_with_params.py │ ├── word_count_with_params.sh │ ├── word_count_with_threshold.py │ └── word_count_with_threshold.sh ├── chap02 │ ├── README.md │ ├── generate_key_value_pairs.py │ ├── sample_file.txt │ ├── sample_file_extra.txt │ ├── sum_by_groupbykey.log │ ├── sum_by_groupbykey.py │ ├── sum_by_groupbykey.sh │ ├── sum_by_reducebykey.log │ ├── sum_by_reducebykey.py │ ├── sum_by_reducebykey.sh │ ├── word_count_driver.log │ ├── word_count_driver.py │ ├── word_count_driver.sh │ ├── word_count_driver_by_groupbykey.log │ ├── word_count_driver_by_groupbykey.py │ ├── word_count_driver_by_groupbykey.sh │ ├── word_count_driver_shorthand.log │ ├── word_count_driver_shorthand.py │ ├── word_count_driver_shorthand.sh │ ├── word_count_driver_shorthand_by_groupbykey.log │ ├── word_count_driver_shorthand_by_groupbykey.py │ ├── word_count_driver_shorthand_by_groupbykey.sh │ ├── word_count_driver_shorthand_sorted.log │ ├── word_count_driver_shorthand_sorted.py │ ├── word_count_driver_shorthand_sorted.sh │ ├── word_count_driver_with_filter.log │ ├── word_count_driver_with_filter.py │ ├── word_count_driver_with_filter.sh │ ├── word_count_driver_with_filter_and_threshold.log │ ├── word_count_driver_with_filter_and_threshold.py │ ├── word_count_driver_with_filter_and_threshold.sh │ ├── word_count_python.py │ └── word_count_python_shorthand.py ├── chap03 │ ├── datadir │ │ ├── file1 │ │ └── file2 │ ├── dataframe_creation_from_collection.log │ ├── dataframe_creation_from_collection.py │ ├── dataframe_creation_from_collection.sh │ ├── dataframe_creation_from_csv_no_header.log │ ├── dataframe_creation_from_csv_no_header.py │ ├── dataframe_creation_from_csv_no_header.sh │ ├── dataframe_creation_from_csv_with_header.log │ ├── dataframe_creation_from_csv_with_header.py │ ├── dataframe_creation_from_csv_with_header.sh │ ├── dataframe_creation_from_dictionary.log │ ├── dataframe_creation_from_dictionary.py │ ├── dataframe_creation_from_dictionary.sh │ ├── dataframe_creation_from_directory.log │ ├── dataframe_creation_from_directory.py │ ├── dataframe_creation_from_directory.sh │ ├── dataframe_creation_from_rdd.log │ ├── dataframe_creation_from_rdd.py │ ├── dataframe_creation_from_rdd.sh │ ├── kv.txt │ ├── kv_no_header.txt │ ├── kv_with_header.txt │ ├── rdd_creation_from_collection.log │ ├── rdd_creation_from_collection.py │ ├── rdd_creation_from_collection.sh │ ├── rdd_creation_from_dataframe.log │ ├── rdd_creation_from_dataframe.py │ ├── 
rdd_creation_from_dataframe.sh │ ├── rdd_creation_from_dictionary.log │ ├── rdd_creation_from_dictionary.py │ ├── rdd_creation_from_dictionary.sh │ ├── rdd_creation_from_directory.log │ ├── rdd_creation_from_directory.py │ ├── rdd_creation_from_directory.sh │ ├── rdd_creation_from_file.log │ ├── rdd_creation_from_file.py │ ├── rdd_creation_from_file.sh │ ├── sample.txt │ ├── sample_dir │ │ ├── file1.txt │ │ └── file2.txt │ ├── sample_dir2 │ │ ├── file1.txt │ │ ├── file2.txt │ │ ├── file3.csv │ │ └── file4.csv │ ├── word_count.py │ ├── word_count.sh │ ├── word_count_with_params.py │ ├── word_count_with_params.sh │ ├── word_count_with_threshold.py │ └── word_count_with_threshold.sh ├── chap04 │ ├── DNA-FASTA-PERFORMANCE │ │ └── performance_of_FASTA_versions_1_2_3.txt │ ├── DNA-FASTA-V1 │ │ ├── run_dna_base_count_ver_1.py │ │ ├── run_dna_base_count_ver_1.sh │ │ ├── run_dna_base_count_ver_1_1GB.sh │ │ └── run_dna_base_count_ver_1_big.sh │ ├── DNA-FASTA-V2 │ │ ├── dna_base_count_ver_2.py │ │ ├── run_dna_base_count_ver_2.sh │ │ ├── run_dna_base_count_ver_2_1GB.sh │ │ └── run_dna_base_count_ver_2_big.sh │ ├── DNA-FASTA-V3 │ │ ├── dna_base_count_ver_3.py │ │ ├── run_dna_base_count_ver_3.sh │ │ ├── run_dna_base_count_ver_3_1GB.sh │ │ └── run_dna_base_count_ver_3_big.sh │ ├── DNA-FASTQ │ │ ├── dna_base_count_fastq.py │ │ └── run_dna_base_count_fastq.sh │ ├── README.md │ └── data │ │ ├── sample.fasta │ │ └── sp1.fastq ├── chap05 │ ├── average_by_key_use_aggregatebykey.log │ ├── average_by_key_use_aggregatebykey.py │ ├── average_by_key_use_aggregatebykey.sh │ ├── average_by_key_use_combinebykey.log │ ├── average_by_key_use_combinebykey.py │ ├── average_by_key_use_combinebykey.sh │ ├── average_by_key_use_foldbykey.log │ ├── average_by_key_use_foldbykey.py │ ├── average_by_key_use_foldbykey.sh │ ├── average_by_key_use_groupbykey.log │ ├── average_by_key_use_groupbykey.py │ ├── average_by_key_use_groupbykey.sh │ ├── average_by_key_use_reducebykey.log │ ├── average_by_key_use_reducebykey.py │ ├── average_by_key_use_reducebykey.sh │ ├── dataframe_action_describe.log │ ├── dataframe_action_describe.py │ ├── dataframe_action_describe.sh │ ├── dataframe_drop.log │ ├── dataframe_drop.py │ ├── dataframe_drop.sh │ ├── dataframe_filter.log │ ├── dataframe_filter.py │ ├── dataframe_filter.sh │ ├── dataframe_join_cross.log │ ├── dataframe_join_cross.py │ ├── dataframe_join_cross.sh │ ├── dataframe_join_inner.log │ ├── dataframe_join_inner.py │ ├── dataframe_join_inner.sh │ ├── dataframe_join_left.log │ ├── dataframe_join_left.py │ ├── dataframe_join_left.sh │ ├── dataframe_join_right.log │ ├── dataframe_join_right.py │ ├── dataframe_join_right.sh │ ├── dataframe_sql.log │ ├── dataframe_sql.py │ ├── dataframe_sql.sh │ ├── dataframe_withcolumn.log │ ├── dataframe_withcolumn.py │ ├── dataframe_withcolumn.sh │ ├── emps.txt │ ├── rdd_transformation_cartesian.log │ ├── rdd_transformation_cartesian.py │ ├── rdd_transformation_cartesian.sh │ ├── rdd_transformation_combinebykey.log │ ├── rdd_transformation_combinebykey.py │ ├── rdd_transformation_combinebykey.sh │ ├── rdd_transformation_filter.log │ ├── rdd_transformation_filter.py │ ├── rdd_transformation_filter.sh │ ├── rdd_transformation_flatmap.log │ ├── rdd_transformation_flatmap.py │ ├── rdd_transformation_flatmap.sh │ ├── rdd_transformation_groupbykey.log │ ├── rdd_transformation_groupbykey.py │ ├── rdd_transformation_groupbykey.sh │ ├── rdd_transformation_join.log │ ├── rdd_transformation_join.py │ ├── rdd_transformation_join.sh │ ├── rdd_transformation_map.log │ 
├── rdd_transformation_map.py │ ├── rdd_transformation_map.sh │ ├── rdd_transformation_mappartitions.log │ ├── rdd_transformation_mappartitions.py │ ├── rdd_transformation_mappartitions.sh │ ├── rdd_transformation_mappartitions_handle_empty_partitions.log │ ├── rdd_transformation_mappartitions_handle_empty_partitions.py │ ├── rdd_transformation_mappartitions_handle_empty_partitions.sh │ ├── rdd_transformation_reducebykey.log │ ├── rdd_transformation_reducebykey.py │ ├── rdd_transformation_reducebykey.sh │ ├── rdd_transformation_sortby.log │ ├── rdd_transformation_sortby.py │ ├── rdd_transformation_sortby.sh │ ├── rdd_transformation_takeordered.log │ ├── rdd_transformation_takeordered.py │ ├── rdd_transformation_takeordered.sh │ ├── sample_5_records.txt │ └── users.txt ├── chap06 │ ├── README.md │ ├── average_by_key_use_aggregatebykey.py │ ├── average_by_key_use_aggregatebykey.sh │ ├── average_by_key_use_combinebykey.py │ ├── average_by_key_use_combinebykey.sh │ ├── average_by_key_use_groupbykey.py │ ├── average_by_key_use_groupbykey.sh │ ├── average_by_key_use_reducebykey.py │ └── average_by_key_use_reducebykey.sh ├── chap07 │ ├── WorldCupPlayers.csv │ ├── WorldCupPlayers.csv.data.source │ ├── WorldCupPlayers.csv.zip │ ├── customers.RECORD.FORMAT.txt │ ├── customers.txt │ ├── dataframe_creation_add_columns.log │ ├── dataframe_creation_add_columns.py │ ├── dataframe_creation_add_columns.sh │ ├── dataframe_creation_aggregate_multiple_columns.log │ ├── dataframe_creation_aggregate_multiple_columns.py │ ├── dataframe_creation_aggregate_multiple_columns.sh │ ├── dataframe_creation_aggregate_single_column.log │ ├── dataframe_creation_aggregate_single_column.py │ ├── dataframe_creation_aggregate_single_column.sh │ ├── dataframe_creation_call_udf.log │ ├── dataframe_creation_call_udf.py │ ├── dataframe_creation_call_udf.sh │ ├── dataframe_creation_cvs_no_header.log │ ├── dataframe_creation_cvs_no_header.py │ ├── dataframe_creation_cvs_no_header.sh │ ├── dataframe_creation_cvs_with_header.log │ ├── dataframe_creation_cvs_with_header.py │ ├── dataframe_creation_cvs_with_header.sh │ ├── dataframe_creation_from_collections.log │ ├── dataframe_creation_from_collections.py │ ├── dataframe_creation_from_collections.sh │ ├── dataframe_creation_from_pandas.log │ ├── dataframe_creation_from_pandas.py │ ├── dataframe_creation_from_pandas.sh │ ├── dataframe_creation_from_rows.log │ ├── dataframe_creation_from_rows.py │ ├── dataframe_creation_from_rows.sh │ ├── dataframe_creation_order_by.log │ ├── dataframe_creation_order_by.py │ ├── dataframe_creation_order_by.sh │ ├── dataframe_creation_with_explicit_schema.log │ ├── dataframe_creation_with_explicit_schema.py │ ├── dataframe_creation_with_explicit_schema.sh │ ├── dataframe_crosstab.log │ ├── dataframe_crosstab.py │ ├── dataframe_crosstab.sh │ ├── dataframe_drop_column.log │ ├── dataframe_drop_column.py │ ├── dataframe_drop_column.sh │ ├── dataframe_drop_duplicates.log │ ├── dataframe_drop_duplicates.py │ ├── dataframe_drop_duplicates.sh │ ├── dataframe_multi_dim_agg_groupby.log │ ├── dataframe_multi_dim_agg_groupby.py │ ├── dataframe_multi_dim_agg_groupby.sh │ ├── dataframe_multi_dim_agg_rollup.log │ ├── dataframe_multi_dim_agg_rollup.py │ ├── dataframe_multi_dim_agg_rollup.sh │ ├── dataframe_tutorial_with_worldcup.log │ ├── dataframe_tutorial_with_worldcup.py │ ├── dataframe_tutorial_with_worldcup.sh │ ├── dataframe_with_statistical_data.log │ ├── dataframe_with_statistical_data.py │ ├── dataframe_with_statistical_data.sh │ ├── emps_no_header.txt │ ├── 
emps_with_header.txt │ ├── life_expentancy.txt │ ├── partition_data_by_customer_and_year.log │ ├── partition_data_by_customer_and_year.py │ ├── partition_data_by_customer_and_year.sh │ ├── partition_data_by_customer_and_year_single_file.py │ ├── strings-2.parquet │ ├── users.parquet │ └── users4.parquet ├── chap08 │ ├── cats.no.header.csv │ ├── cats.with.header.csv │ ├── datasource_csv_reader_no_header.log │ ├── datasource_csv_reader_no_header.py │ ├── datasource_csv_reader_no_header.sh │ ├── datasource_csv_reader_with_header.log │ ├── datasource_csv_reader_with_header.py │ ├── datasource_csv_reader_with_header.sh │ ├── datasource_csv_writer.log │ ├── datasource_csv_writer.py │ ├── datasource_csv_writer.sh │ ├── datasource_elasticsearch_reader.log │ ├── datasource_elasticsearch_reader.py │ ├── datasource_elasticsearch_reader.sh │ ├── datasource_elasticsearch_writer.log │ ├── datasource_elasticsearch_writer.py │ ├── datasource_elasticsearch_writer.sh │ ├── datasource_gzip_reader.log │ ├── datasource_gzip_reader.py │ ├── datasource_gzip_reader.sh │ ├── datasource_jdbc_reader.log │ ├── datasource_jdbc_reader.py │ ├── datasource_jdbc_reader.sh │ ├── datasource_jdbc_writer.log │ ├── datasource_jdbc_writer.py │ ├── datasource_jdbc_writer.sh │ ├── datasource_json_reader_multi_line.log │ ├── datasource_json_reader_multi_line.py │ ├── datasource_json_reader_multi_line.sh │ ├── datasource_json_reader_single_line.log │ ├── datasource_json_reader_single_line.py │ ├── datasource_json_reader_single_line.sh │ ├── datasource_mongodb_reader.log │ ├── datasource_mongodb_reader.py │ ├── datasource_mongodb_reader.sh │ ├── datasource_mongodb_writer.log │ ├── datasource_mongodb_writer.py │ ├── datasource_mongodb_writer.sh │ ├── datasource_redis_reader.log │ ├── datasource_redis_reader.py │ ├── datasource_redis_reader.sh │ ├── datasource_redis_writer.log │ ├── datasource_redis_writer.py │ ├── datasource_redis_writer.sh │ ├── datasource_textfile_reader.log │ ├── datasource_textfile_reader.py │ ├── datasource_textfile_reader.sh │ ├── datasource_textfile_writer.log │ ├── datasource_textfile_writer.py │ ├── datasource_textfile_writer.sh │ ├── images │ │ ├── cat1.jpg │ │ ├── cat2.jpg │ │ ├── cat3.jpg │ │ ├── cat4.jpg │ │ ├── duck1.jpg │ │ ├── duck2.jpg │ │ └── not-image.txt │ ├── mongodb_coll44.png │ ├── mongodb_coll66.png │ ├── name_age_salary.csv │ ├── people.txt │ ├── sample_multi_line.json │ ├── sample_no_header.csv │ ├── sample_numbers.txt │ ├── sample_single_line.json │ ├── sample_with_header.csv │ └── twitter.avro ├── chap09 │ ├── logistic_regression_builder.log │ ├── logistic_regression_builder.py │ ├── logistic_regression_builder.sh │ ├── logistic_regression_predictor.log │ ├── logistic_regression_predictor.py │ ├── logistic_regression_predictor.sh │ ├── model │ │ ├── data │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000-1d219005-0cb4-4a77-98bf-2f69a69655a1-c000.snappy.parquet.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000-1d219005-0cb4-4a77-98bf-2f69a69655a1-c000.snappy.parquet │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ ├── new_emails.txt │ ├── training_emails_nospam.txt │ └── training_emails_spam.txt ├── chap10 │ ├── recommendation_example.py │ └── test.data ├── chap11 │ ├── airports.json │ ├── breadth_first_search_example.log │ ├── breadth_first_search_example.py │ ├── breadth_first_search_example.sh │ ├── connected_component_example.log │ ├── connected_component_example.py │ ├── connected_component_example.sh │ ├── flightdata2018.json │ ├── 
graph_builder.log │ ├── graph_builder.py │ ├── graph_builder.sh │ ├── label_propagation_algorithm_example.log │ ├── label_propagation_algorithm_example.py │ ├── label_propagation_algorithm_example.sh │ ├── pagerank.py │ ├── pagerank_data.txt │ ├── pagerank_example.log │ ├── pagerank_example.py │ ├── pagerank_example.sh │ ├── sample_graph_edges.txt │ ├── sample_graph_vertices.txt │ ├── shortest_path_finder.log │ ├── shortest_path_finder.py │ ├── shortest_path_finder.sh │ ├── triangles_counter.log │ ├── triangles_counter.py │ ├── triangles_counter.sh │ ├── unique_triangles_finder.log │ ├── unique_triangles_finder.py │ └── unique_triangles_finder.sh ├── chap12 │ ├── average_monoid_use_aggregatebykey.log │ ├── average_monoid_use_aggregatebykey.py │ ├── average_monoid_use_aggregatebykey.sh │ ├── average_monoid_use_combinebykey.log │ ├── average_monoid_use_combinebykey.py │ ├── average_monoid_use_combinebykey.sh │ ├── average_monoid_use_groupbykey.log │ ├── average_monoid_use_groupbykey.py │ ├── average_monoid_use_groupbykey.sh │ ├── average_monoid_use_reducebykey.log │ ├── average_monoid_use_reducebykey.py │ ├── average_monoid_use_reducebykey.sh │ ├── inmapper_combiner_local_aggregation.log │ ├── inmapper_combiner_local_aggregation.py │ ├── inmapper_combiner_local_aggregation.sh │ ├── inmapper_combiner_use_basic_mapreduce.log │ ├── inmapper_combiner_use_basic_mapreduce.py │ ├── inmapper_combiner_use_basic_mapreduce.sh │ ├── inmapper_combiner_use_mappartitions.log │ ├── inmapper_combiner_use_mappartitions.py │ ├── inmapper_combiner_use_mappartitions.sh │ ├── minmax_force_empty_partitions.log │ ├── minmax_force_empty_partitions.py │ ├── minmax_force_empty_partitions.sh │ ├── minmax_use_mappartitions.log │ ├── minmax_use_mappartitions.py │ ├── minmax_use_mappartitions.sh │ ├── sample_dna_seq.txt │ ├── sample_input.txt │ ├── sample_numbers.txt │ ├── top_N_use_mappartitions.log │ ├── top_N_use_mappartitions.py │ ├── top_N_use_mappartitions.sh │ ├── top_N_use_takeordered.log │ ├── top_N_use_takeordered.py │ └── top_N_use_takeordered.sh ├── examples │ └── wordcount │ │ ├── foxdata.txt │ │ ├── wordcount.py │ │ └── wordcount.py.usage └── jars │ ├── avro-mapred-1.7.7-hadoop1.jar │ ├── avro-mapred-1.7.7-hadoop2.jar │ ├── com-cotdp-hadoop-1.0-SNAPSHOT.jar │ ├── elasticsearch-hadoop-6.4.2.jar │ ├── elasticsearch-spark_2.11-2.4.5.jar │ ├── graphframes-0.6.0-spark2.3-s_2.11.jar │ ├── hbase-spark-connector-1.0.0.jar │ ├── htrace-core-3.1.0-incubating.jar │ ├── mongo-java-driver-3.8.2.jar │ ├── mongo-spark-connector_2.11-2.2.5.jar │ ├── mongodb-driver-3.8.2.jar │ ├── mysql-connector-java-5.1.42.jar │ ├── shc-core-1.1.3-2.3-s_2.11.jar │ ├── shc-examples-1.1.3-2.3-s_2.11.jar │ ├── spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar │ └── spark-redis-2.3.1-SNAPSHOT.jar ├── images ├── pyspark_algorithms.jpg ├── pyspark_algorithms0.jpg ├── pyspark_algorithms2.jpg └── pyspark_algorithms3.jpg ├── sample_chapters ├── Appendix_Questions_and_Answers.epub ├── Appendix_Questions_and_Answers.pdf ├── README.md ├── chap04_Getting_Started_with_PySpark.epub └── chap04_Getting_Started_with_PySpark.pdf └── where_to_buy_book └── README.md /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright [2019] [Mahmoud Parsian] 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /code/chap01/basic_dataframe_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run basic_dataframe_example.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap01/sample_people.json" 8 | export SPARK_PROG="/pyspark_book/code/chap01/basic_dataframe_example.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap01/compute_stats.log: -------------------------------------------------------------------------------- 1 | # define python3: find out where python3 is installed? 2 | $ type python3 3 | python3 is /usr/local/bin/python3 4 | $ /usr/local/bin/python3 --version 5 | Python 3.7.1 6 | export PYSPARK_PYTHON=/usr/local/bin/python3 7 | # 8 | export PROG="/pyspark_book/code/chap01/compute_stats.py" 9 | # define your input path 10 | export INPUT="/pyspark_book/code/chap01/url_frequencies.txt" 11 | # define your Spark home directory 12 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 13 | 14 | # run the program 15 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 16 | 17 | inputPath : /pyspark_book/code/chap01/url_frequencies.txt 18 | 19 | results = [ 20 | ('url3', (21.857142857142858, 20, 18.97743020387263)), 21 | ('url1', (6.8, 8, 4.324349662087931)), 22 | ('url2', (6, 6.0, 3.265986323710904)), 23 | ('url4', (1.5, 1.5, 0.7071067811865476)) 24 | ] 25 | -------------------------------------------------------------------------------- /code/chap01/compute_stats.sh: -------------------------------------------------------------------------------- 1 | # NOTE: 2 | # Since statistics functions are defined in 3 | # Python3, I use Python3 instead of Python2 4 | # 5 | # define python3: find out where python3 is installed? 6 | #$ type python3 7 | #python3 is /usr/local/bin/python3 8 | #$ /usr/local/bin/python3 --version 9 | #Python 3.7.1 10 | export PYSPARK_PYTHON=/usr/local/bin/python3 11 | # 12 | # define PySpark program 13 | export PROG="/pyspark_book/code/chap01/compute_stats.py" 14 | # 15 | # define your input path 16 | export INPUT="/pyspark_book/code/chap01/url_frequencies.txt" 17 | # 18 | # define your Spark home directory 19 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 20 | # 21 | # run the program 22 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 23 | -------------------------------------------------------------------------------- /code/chap01/compute_stats_detailed.sh: -------------------------------------------------------------------------------- 1 | # NOTE: 2 | # Since statistics functions are defined in 3 | # Python3, we use Python3 instead of Python2 4 | # 5 | # define python3: find out where python3 is installed? 
6 | #$ type python3 7 | #python3 is /usr/local/bin/python3 8 | #$ /usr/local/bin/python3 --version 9 | #Python 3.7.1 10 | export PYSPARK_PYTHON=/usr/local/bin/python3 11 | # 12 | # define PySpark program 13 | export PROG="/pyspark_book/code/chap01/compute_stats_detailed.py" 14 | # define your input path 15 | export INPUT="/pyspark_book/code/chap01/url_frequencies.txt" 16 | # define your Spark home directory 17 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 18 | # run the program 19 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 20 | -------------------------------------------------------------------------------- /code/chap01/compute_stats_with_threshold.log: -------------------------------------------------------------------------------- 1 | # define python3: find out where python3 is installed? 2 | $ type python3 3 | python3 is /usr/local/bin/python3 4 | $ /usr/local/bin/python3 --version 5 | Python 3.7.1 6 | export PYSPARK_PYTHON=/usr/local/bin/python3 7 | # 8 | # define PySpark program 9 | export PROG="/pyspark_book/code/chap01/compute_stats_with_threshold.py" 10 | # 11 | # define your input path 12 | export INPUT="/pyspark_book/code/chap01/url_frequencies.txt" 13 | # 14 | # define your Spark home directory 15 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 16 | # 17 | # define the length threshold 18 | export THRESHOLD_RECORD_LENGTH=5 19 | 20 | # run the program 21 | $SPARK_HOME/bin/spark-submit $PROG $INPUT $THRESHOLD_RECORD_LENGTH 22 | 23 | inputPath : /pyspark_book/code/chap01/url_frequencies.txt 24 | 25 | THRESHOLD_RECORD_LENGTH : 5 26 | 27 | results = [ 28 | ('url3', (21.857142857142858, 20, 18.97743020387263)), 29 | ('url1', (6.8, 8, 4.324349662087931)), 30 | ('url2', (6, 6.0, 3.265986323710904)), 31 | ('url4', (1.5, 1.5, 0.7071067811865476)) 32 | ] 33 | -------------------------------------------------------------------------------- /code/chap01/compute_stats_with_threshold.sh: -------------------------------------------------------------------------------- 1 | # define python3: find out where python3 is installed? 2 | #$ type python3 3 | #python3 is /usr/local/bin/python3 4 | #$ /usr/local/bin/python3 --version 5 | #Python 3.7.1 6 | export PYSPARK_PYTHON=/usr/local/bin/python3 7 | # 8 | # define PySpark program 9 | export PROG="/pyspark_book/code/chap01/compute_stats_with_threshold.py" 10 | # 11 | # define your input path 12 | export INPUT="/pyspark_book/code/chap01/url_frequencies.txt" 13 | # 14 | # define your Spark home directory 15 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 16 | # 17 | # define the length threshold 18 | export THRESHOLD_RECORD_LENGTH=5 19 | # 20 | # run the program 21 | $SPARK_HOME/bin/spark-submit $PROG $INPUT $THRESHOLD_RECORD_LENGTH 22 | -------------------------------------------------------------------------------- /code/chap01/compute_stats_with_threshold_and_filter.log: -------------------------------------------------------------------------------- 1 | # define python3: find out where python3 is installed? 
2 | $ type python3 3 | python3 is /usr/local/bin/python3 4 | $ /usr/local/bin/python3 --version 5 | Python 3.7.1 6 | export PYSPARK_PYTHON=/usr/local/bin/python3 7 | # 8 | # define PySpark program 9 | export PROG="/pyspark_book/code/chap01/compute_stats_with_threshold_and_filter.py" 10 | # 11 | # define your input path 12 | export INPUT="/pyspark_book/code/chap01/url_frequencies.txt" 13 | # 14 | # define your Spark home directory 15 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 16 | # 17 | # define the length threshold 18 | export THRESHOLD_RECORD_LENGTH=5 19 | 20 | # define the mean threshold 21 | export THRESHOLD_MEAN=2 22 | 23 | # run the program 24 | $SPARK_HOME/bin/spark-submit $PROG $INPUT ${THRESHOLD_RECORD_LENGTH} ${THRESHOLD_MEAN} 25 | 26 | inputPath : /pyspark_book/code/chap01/url_frequencies.txt 27 | 28 | THRESHOLD_RECORD_LENGTH : 5 29 | 30 | THRESHOLD_MEAN : 2.0 31 | 32 | results = 33 | [ 34 | ('url3', (21.857142857142858, 20, 18.97743020387263)), 35 | ('url1', (6.8, 8, 4.324349662087931)), 36 | ('url2', (6, 6.0, 3.265986323710904)), 37 | ('url4', (1.5, 1.5, 0.7071067811865476)) 38 | ] 39 | 40 | final_results = 41 | [ 42 | ('url3', (21.857142857142858, 20, 18.97743020387263)), 43 | ('url1', (6.8, 8, 4.324349662087931)), 44 | ('url2', (6, 6.0, 3.265986323710904)) 45 | ] -------------------------------------------------------------------------------- /code/chap01/compute_stats_with_threshold_and_filter.sh: -------------------------------------------------------------------------------- 1 | # NOTE: 2 | # Since statistics functions are defined in 3 | # Python3, we use Python3 instead of Python2 4 | # 5 | # define python3: find out where python3 is installed? 6 | #$ type python3 7 | #python3 is /usr/local/bin/python3 8 | #$ /usr/local/bin/python3 --version 9 | #Python 3.7.1 10 | export PYSPARK_PYTHON=/usr/local/bin/python3 11 | # 12 | # define PySpark program 13 | export PROG="/pyspark_book/code/chap01/compute_stats_with_threshold_and_filter.py" 14 | # 15 | # define your input path 16 | export INPUT="/pyspark_book/code/chap01/url_frequencies.txt" 17 | # 18 | # define your Spark home directory 19 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 20 | # 21 | # define the length threshold 22 | export THRESHOLD_RECORD_LENGTH=5 23 | # 24 | # define the mean threshold 25 | export THRESHOLD_MEAN=2 26 | # 27 | # run the program 28 | $SPARK_HOME/bin/spark-submit $PROG $INPUT ${THRESHOLD_RECORD_LENGTH} ${THRESHOLD_MEAN} 29 | -------------------------------------------------------------------------------- /code/chap01/dataframe_creation_from_csv.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_csv.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap01/name_city_age.csv" 8 | export SPARK_PROG="/pyspark_book/code/chap01/dataframe_creation_from_csv.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap01/fox_data.txt: -------------------------------------------------------------------------------- 1 | a cute fox jumped and juped high 2 | a red cute fox jumped high and high 3 | a red fox jumped 4 | 
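
The compute_stats*.sh scripts and compute_stats*.log files above drive compute_stats.py and its variants, whose Python sources are not reproduced in this excerpt. A minimal sketch of the core logic is shown below, assuming the "url,frequency" record format of url_frequencies.txt and the per-URL (mean, median, stdev) output seen in the logs; the function names, app name, and argument handling here are illustrative, not the book's exact source.

import sys
import statistics
from pyspark.sql import SparkSession

def create_pair(record):
    # record format: "url,frequency", e.g. "url1,9"
    tokens = record.split(',')
    return (tokens[0], int(tokens[1]))

def compute_url_stats(frequencies):
    # frequencies: all frequency values observed for one URL
    values = list(frequencies)
    return (statistics.mean(values), statistics.median(values), statistics.stdev(values))

if __name__ == '__main__':
    # argv[1]: input path; argv[2] (optional): minimum record length to keep
    input_path = sys.argv[1]
    threshold_record_length = int(sys.argv[2]) if len(sys.argv) > 2 else 0

    spark = SparkSession.builder.appName("compute_stats_sketch").getOrCreate()

    # keep sufficiently long records, group frequencies per URL,
    # then reduce each group to (mean, median, stdev)
    results = spark.sparkContext.textFile(input_path)\
        .filter(lambda record: len(record) >= threshold_record_length)\
        .map(create_pair)\
        .groupByKey()\
        .mapValues(compute_url_stats)

    print("results = ", results.collect())
    spark.stop()

The statistics module is Python 3 only, which is why the shell scripts above export PYSPARK_PYTHON to point at a python3 installation.
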
-------------------------------------------------------------------------------- /code/chap01/name_city_age.csv: -------------------------------------------------------------------------------- 1 | Alex,Ames,40 2 | Betty,Ames,33 3 | Alex,Ames,50 4 | Betty,Stanford,45 5 | Jeff,Sunnyvale,55 6 | Bob,Sunnyvale,60 7 | Terry,Stanford,75 8 | David,Stanford,90 9 | Don,Stanford,80 10 | -------------------------------------------------------------------------------- /code/chap01/rdd_creation_from_csv.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_creation_from_csv.py name_city_age.csv 2 | 3 | spark= 4 | 5 | input path : name_city_age.csv 6 | file_contents = 7 | Alex,Ames,40 8 | Betty,Ames,33 9 | Alex,Ames,50 10 | Betty,Stanford,45 11 | Jeff,Sunnyvale,55 12 | Bob,Sunnyvale,60 13 | Terry,Stanford,75 14 | David,Stanford,90 15 | Don,Stanford,80 16 | 17 | rdd = name_city_age.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 18 | rdd.count = 9 19 | rdd.collect() = 20 | [ 21 | 'Alex,Ames,40', 22 | 'Betty,Ames,33', 23 | 'Alex,Ames,50', 24 | 'Betty,Stanford,45', 25 | 'Jeff,Sunnyvale,55', 26 | 'Bob,Sunnyvale,60', 27 | 'Terry,Stanford,75', 28 | 'David,Stanford,90', 29 | 'Don,Stanford,80' 30 | ] 31 | 32 | pairs = PythonRDD[3] at RDD at PythonRDD.scala:48 33 | pairs.count = 9 34 | pairs.collect() = 35 | [ 36 | ('Ames', (40, 1)), 37 | ('Ames', (33, 1)), 38 | ('Ames', (50, 1)), 39 | ('Stanford', (45, 1)), 40 | ('Sunnyvale', (55, 1)), 41 | ('Sunnyvale', (60, 1)), 42 | ('Stanford', (75, 1)), 43 | ('Stanford', (90, 1)), 44 | ('Stanford', (80, 1)) 45 | ] 46 | 47 | sum_and_count = PythonRDD[9] at RDD at PythonRDD.scala:48 48 | sum_and_count.count = 3 49 | sum_and_count.collect() = 50 | [ 51 | ('Stanford', (290, 4)), 52 | ('Ames', (123, 3)), 53 | ('Sunnyvale', (115, 2)) 54 | ] 55 | 56 | average_per_city = PythonRDD[11] at RDD at PythonRDD.scala:48 57 | average_per_city.count = 3 58 | average_per_city.collect() = 59 | [ 60 | ('Stanford', 72.5), 61 | ('Ames', 41.0), 62 | ('Sunnyvale', 57.5) 63 | ] -------------------------------------------------------------------------------- /code/chap01/rdd_creation_from_csv.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_creation_from_csv.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap01/name_city_age.csv" 8 | export SPARK_PROG="/pyspark_book/code/chap01/rdd_creation_from_csv.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap01/sample.txt: -------------------------------------------------------------------------------- 1 | red fox jumped high 2 | fox jumped over high fence 3 | red fox jumped 4 | -------------------------------------------------------------------------------- /code/chap01/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 23 24 12 11 2 | 2 8 9 30 40 50 33 31 3 | 2 9 33 40 70 51 52 4 | 10 11 12 5 | -------------------------------------------------------------------------------- /code/chap01/sample_people.json: 
-------------------------------------------------------------------------------- 1 | {"name":"Alex", "city":"Cupertino"} 2 | {"name":"Bob", "city":"Sunnyvale"} 3 | {"name":"Betty", "city":"Sunnyvale", "age":30} 4 | {"name":"Max", "city":"Stanford", "age":30} 5 | {"name":"Martina", "city":"Stanford", "age":40} 6 | {"name":"Jane", "city":"Cupertino", "age":19} 7 | -------------------------------------------------------------------------------- /code/chap01/sort_numbers.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit sort_numbers.py sample_numbers.txt 2 | 3 | spark= 4 | 5 | input path : sample_numbers.txt 6 | 7 | file_contents = 8 | 23 24 12 11 9 | 2 8 9 30 40 50 33 31 10 | 2 9 33 40 70 51 52 11 | 10 11 12 12 | 13 | records = sample_numbers.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 14 | records.count = 4 15 | records.collect() = 16 | [ 17 | '23 24 12 11', 18 | '2 8 9 30 40 50 33 31', 19 | '2 9 33 40 70 51 52', 20 | '10 11 12' 21 | ] 22 | 23 | sorted numbers: 24 | 2 25 | 2 26 | 8 27 | 9 28 | 9 29 | 10 30 | 11 31 | 11 32 | 12 33 | 12 34 | 23 35 | 24 36 | 30 37 | 31 38 | 33 39 | 33 40 | 40 41 | 40 42 | 50 43 | 51 44 | 52 45 | 70 -------------------------------------------------------------------------------- /code/chap01/sort_numbers.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run sort_numbers.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap01/sample_numbers.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap01/sort_numbers.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap01/url_frequencies.txt: -------------------------------------------------------------------------------- 1 | url1,1 2 | url1,9 3 | url1,4 4 | ur,5 5 | url1,8 6 | url1,12 7 | url2,2 8 | url2,6 9 | ur,2 10 | url2,10 11 | url2,6 12 | url3,1 13 | url3,10 14 | url3,20 15 | url3,30 16 | url3,40 17 | url3,50 18 | url3,2 19 | url4,1 20 | url4,2 21 | 22 | -------------------------------------------------------------------------------- /code/chap01/word_count.log: -------------------------------------------------------------------------------- 1 | $ ./bin/spark-submit zmp/word_count.py 2 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 3 | 4 | [ 5 | u'red fox jumped high', 6 | u'fox jumped over high fence', 7 | u'red fox jumped' 8 | ] 9 | 10 | 11 | [ 12 | u'red', 13 | u'fox', 14 | u'jumped', 15 | u'high', 16 | u'fox', 17 | u'jumped', 18 | u'over', 19 | u'high', 20 | u'fence', 21 | u'red', 22 | u'fox', 23 | u'jumped' 24 | ] 25 | 26 | 27 | [ 28 | (u'red', 1), 29 | (u'fox', 1), 30 | (u'jumped', 1), 31 | (u'high', 1), 32 | (u'fox', 1), 33 | (u'jumped', 1), 34 | (u'over', 1), 35 | (u'high', 1), 36 | (u'fence', 1), 37 | (u'red', 1), (u'fox', 1), 38 | (u'jumped', 1) 39 | ] 40 | 41 | 42 | [ 43 | (u'high', 2), 44 | (u'over', 1), 45 | (u'fox', 3), 46 | (u'red', 2), 47 | (u'fence', 1), 48 | (u'jumped', 3) 49 | ] 50 | -------------------------------------------------------------------------------- /code/chap01/word_count.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark import SparkConf 3 | from pyspark import SparkContext 4 | 5 | def wordcount(sc, input_path): 6 | 7 | records_rdd = sc.textFile(input_path) 8 | print(records_rdd.collect()) 9 | 10 | words_rdd = records_rdd.flatMap(lambda line: line.split(" ")) 11 | print(words_rdd.collect()) 12 | 13 | pairs_rdd = words_rdd.map(lambda word: (word, 1)) 14 | print(pairs_rdd.collect()) 15 | 16 | frequencies_rdd = pairs_rdd.reduceByKey(lambda a, b: a + b) 17 | print(frequencies_rdd.collect()) 18 | 19 | 20 | if __name__ == '__main__': 21 | 22 | conf = SparkConf() 23 | conf.setAppName("WordCount") 24 | conf.set('spark.executor.memory', '500M') 25 | conf.set('spark.cores.max', 4) 26 | try: 27 | sc = SparkContext(conf=conf) 28 | # hard coded input path, for DEMO only 29 | # never hard code 30 | input_path = "/tmp/sample.txt" 31 | 32 | except: 33 | print ("Failed to connect!") 34 | print(sys.exc_info()[0]) 35 | 36 | # Execute word count 37 | wordcount(sc, input_path) 38 | -------------------------------------------------------------------------------- /code/chap01/word_count.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt" 6 | # 7 | # define your PySpark program 8 | PROG=/pyspark_book/code/chap01/word_count_with_params.py 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap01/word_count_with_params.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt" 6 | # 7 | # define your PySpark program 8 | PROG=/pyspark_book/code/chap01/word_count_with_params.py 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap01/word_count_with_threshold.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark import SparkConf 3 | from pyspark import SparkContext 4 | 5 | def wordcount(sc, input_path, threshold): 6 | 7 | records_rdd = sc.textFile(input_path) 8 | print(records_rdd.collect()) 9 | 10 | words_rdd = records_rdd.flatMap(lambda line: line.split(" ")) 11 | print(words_rdd.collect()) 12 | 13 | pairs_rdd = words_rdd.map(lambda word: (word, 1)) 14 | print(pairs_rdd.collect()) 15 | 16 | frequencies_rdd = pairs_rdd.reduceByKey(lambda a, b: a + b) 17 | print(frequencies_rdd.collect()) 18 | 19 | # filter out words with fewer than threshold occurrences 20 | filtered_rdd = frequencies_rdd.filter(lambda word_count: word_count[1] >= threshold) # word_count is a (word, count) pair 21 | print(filtered_rdd.collect()) 22 | 23 | if __name__ == '__main__': 24 | 25 | conf = SparkConf() 26 | conf.setAppName("WordCount") 27 | conf.set('spark.executor.memory', '500M') 28 | conf.set('spark.cores.max', 4) 29 | try: 30 | sc = SparkContext(conf=conf) 31 | except: 32 | print ("Failed to connect!") 33 | print(sys.exc_info()[0]) 34 | 35 | # sys.argv[0] is the name of the script. 
36 | # sys.argv[1] is the first parameter: filename 37 | # sys.argv[2] is the second parameter: threshold 38 | input_path = sys.argv[1] # "file:///Users/mparsian/sample.txt" 39 | print("input_path: {}".format(input_path)) 40 | 41 | # get threshold 42 | threshold = int(sys.argv[2]) 43 | 44 | # Execute word count 45 | wordcount(sc, input_path, threshold) 46 | -------------------------------------------------------------------------------- /code/chap01/word_count_with_threshold.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt" 6 | # 7 | # define your PySpark program 8 | PROG=/pyspark_book/code/chap01/word_count_with_threshold.py 9 | # 10 | # define the word frequency threshold (example value) 11 | THRESHOLD=2 12 | # 13 | # submit your spark application 14 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH $THRESHOLD 15 | -------------------------------------------------------------------------------- /code/chap02/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 2 Programs 2 | 3 | This chapter presents the "Hello World!" program 4 | in PySpark. 5 | 6 | I have presented the "word count" problem and provided 7 | several solutions to it using `reduceByKey()` and 8 | `groupByKey()` transformations. 9 | 10 | Note that the `reduceByKey()` transformation is more 11 | efficient than the `groupByKey()`. When possible, 12 | we should avoid using the `groupByKey()` transformation 13 | and replace it with `reduceByKey()`, `aggregateByKey()`, 14 | or `combineByKey()`. 15 | 16 | Examples are provided to show how to use the 17 | `filter()` transformation. 18 | -------------------------------------------------------------------------------- /code/chap02/generate_key_value_pairs.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | #----------------------------------------------------- 3 | # @author Mahmoud Parsian 4 | #----------------------------------------------------- 5 | import random 6 | #--------------------------------------------------------- 7 | # Create 1000,000,000 "<,>" pairs such that 8 | # key is a random number in range of 1 to 10,000 9 | # value is a random number in range of 1 to 5 10 | #--------------------------------------------------------- 11 | for x in range(1000000000): 12 | print(str(random.randint(1,10000)) + "," + str(random.randint(1,5))) 13 | -------------------------------------------------------------------------------- /code/chap02/sample_file.txt: -------------------------------------------------------------------------------- 1 | red fox jumped high and high 2 | red fox jumped high fence 3 | fox jumped 4 | -------------------------------------------------------------------------------- /code/chap02/sample_file_extra.txt: -------------------------------------------------------------------------------- 1 | a red fox jumped high and of high 2 | red fox jumped of high fence 3 | a fox jumped 4 | -------------------------------------------------------------------------------- /code/chap02/sum_by_groupbykey.log: -------------------------------------------------------------------------------- 1 | $ cat generate_key_value_pairs.py 2 | from __future__ import print_function 3 | import random 4 | #--------------------------------------------------------- 5 | # Create 1000,000,000 "<,>" pairs such that 6 | # key is a random number in range of 1 to 
10,000 7 | # value is a random number in range of 1 to 5 8 | #--------------------------------------------------------- 9 | for x in range(1000000000): 10 | print(str(random.randint(1,10000)) + "," + str(random.randint(1,5))) 11 | 12 | # create one billion (key, value) pairs 13 | $ python generate_key_value_pairs.py > kv.txt 14 | 15 | $ ls -l kv.txt 16 | -rw-r--r-- 1 mparsian dev 6889378545 Mar 10 10:17 kv.txt 17 | 18 | $ wc -l kv.txt 19 | 1000000000 kv.txt 20 | 21 | $ head kv.txt 22 | 2122,3 23 | 3147,4 24 | 8281,4 25 | 5390,5 26 | 4549,4 27 | 2901,3 28 | 288,5 29 | 2878,4 30 | 2250,2 31 | 3452,3 32 | 33 | 34 | $ ./sum_by_groupbykey.sh 35 | Mar 10 10:36:16 PST 2019 36 | input path : .../code/chap02/kv.txt 37 | rdd.getNumPartitions() = 206 38 | 39 | results = 40 | [ 41 | ('8079', 299950), 42 | ('9263', 299717), 43 | ('8095', 300566), 44 | ... 45 | ] 46 | 47 | 33 minutes and 53 seconds elapsed. 48 | -------------------------------------------------------------------------------- /code/chap02/sum_by_groupbykey.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | #----------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------- 7 | # 8 | # Create (key, value) pair from given input record: 9 | # record: <,> 10 | # <1> accept a record of the form "key,value" 11 | # <2> tokenize input record, 12 | # tokens[0]: key, 13 | # tokens[1]: value 14 | # <3> return a pair of (key, value) 15 | # 16 | def create_pair(record): # <1> 17 | tokens = record.split(',') # <2> 18 | key = str(tokens[0]) 19 | value = int(tokens[1]) 20 | return (key, value) # <3> 21 | #end-def 22 | #----------------------------------- 23 | 24 | 25 | if __name__ == "__main__": 26 | if len(sys.argv) != 2: 27 | print("Usage: ", __file__, " ", file=sys.stderr) 28 | exit(-1) 29 | 30 | #------------------------------------------ 31 | # create an instance of SparkSession object 32 | #------------------------------------------ 33 | spark = SparkSession\ 34 | .builder\ 35 | .appName("test:groupBykey()")\ 36 | .getOrCreate() 37 | 38 | input_path = sys.argv[1] 39 | print("input path : ", input_path) 40 | 41 | # 42 | rdd = spark.sparkContext.textFile(input_path) 43 | print("rdd.getNumPartitions() = ", rdd.getNumPartitions()) 44 | # 45 | results = rdd.map(create_pair)\ 46 | .groupByKey()\ 47 | .mapValues(lambda values: sum(values)) 48 | 49 | # display final results 50 | print("results = ", results.collect()) 51 | 52 | spark.stop() 53 | -------------------------------------------------------------------------------- /code/chap02/sum_by_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #----------------------------------------------------- 3 | # @author Mahmoud Parsian 4 | #----------------------------------------------------- 5 | SECONDS=0 6 | /bin/date 7 | # do some work 8 | # 9 | # define Spark's installed directory 10 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 11 | # 12 | # create one billion (key, value) pairs 13 | python generate_key_value_pairs.py > kv.txt 14 | # 15 | # NOTE: define your input path 16 | INPUT_PATH="file:///pyspark_book/code/chap02/kv.txt" 17 | # 18 | # define your PySpark program 19 | PROG="/pyspark_book/code/chap02/sum_by_groupbykey.py" 20 | # 21 | # submit your spark application 22 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 23 | # 24 | # 25 | duration=$SECONDS 26 | echo "" 
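# NOTE: SECONDS is a bash builtin counter; it was reset to 0 at the top of this script, so $duration (captured above) holds the total elapsed seconds, which the next line converts to minutes and seconds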
27 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 28 | -------------------------------------------------------------------------------- /code/chap02/sum_by_reducebykey.log: -------------------------------------------------------------------------------- 1 | $ cat generate_key_value_pairs.py 2 | from __future__ import print_function 3 | import random 4 | #--------------------------------------------------------- 5 | # Create 1000,000,000 "<,>" pairs such that 6 | # key is a random number in range of 1 to 10,000 7 | # value is a random number in range of 1 to 5 8 | #--------------------------------------------------------- 9 | for x in range(1000000000): 10 | print(str(random.randint(1,10000)) + "," + str(random.randint(1,5))) 11 | 12 | # create one billion (key, value) pairs 13 | $ python generate_key_value_pairs.py > kv.txt 14 | 15 | $ ls -l kv.txt 16 | -rw-r--r-- 1 mparsian dev 6889378545 Mar 10 10:17 kv.txt 17 | 18 | $ wc -l kv.txt 19 | 1000000000 kv.txt 20 | 21 | $ head kv.txt 22 | 2122,3 23 | 3147,4 24 | 8281,4 25 | 5390,5 26 | 4549,4 27 | 2901,3 28 | 288,5 29 | 2878,4 30 | 2250,2 31 | 3452,3 32 | 33 | $ ./sum_by_reducebykey.sh 34 | Mar 10 11:27:19 PST 2019 35 | input path : .../code/chap02/kv.txt 36 | rdd.getNumPartitions() = 206 37 | 38 | results = 39 | [ 40 | ('8079', 299950), 41 | ('9263', 299717), 42 | ('8095', 300566), 43 | ... 44 | ] 45 | 46 | 32 minutes and 47 seconds elapsed. 47 | 48 | -------------------------------------------------------------------------------- /code/chap02/sum_by_reducebykey.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | #----------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------- 7 | # 8 | # Create (key, value) pair from given input record: 9 | # record: <,> 10 | # <1> accept a record of the form "key,value" 11 | # <2> tokenize input record, 12 | # tokens[0]: key, 13 | # tokens[1]: value 14 | # <3> return a pair of (key, value) 15 | # 16 | def create_pair(record): # <1> 17 | tokens = record.split(',') # <2> 18 | key = str(tokens[0]) 19 | value = int(tokens[1]) 20 | return (key, value) # <3> 21 | #end-def 22 | #----------------------------------- 23 | 24 | 25 | if __name__ == "__main__": 26 | if len(sys.argv) != 2: 27 | print("Usage: ", __file__, " ", file=sys.stderr) 28 | exit(-1) 29 | 30 | #------------------------------------------ 31 | # create an instance of SparkSession object 32 | #------------------------------------------ 33 | spark = SparkSession\ 34 | .builder\ 35 | .appName("test:reduceBykey()")\ 36 | .getOrCreate() 37 | 38 | input_path = sys.argv[1] 39 | print("input path : ", input_path) 40 | # 41 | rdd = spark.sparkContext.textFile(input_path) 42 | print("rdd.getNumPartitions() = ", rdd.getNumPartitions()) 43 | # 44 | results = rdd.map(create_pair)\ 45 | .reduceByKey(lambda a, b: a+b) 46 | 47 | # display final results 48 | print("results = ", results.collect()) 49 | 50 | spark.stop() 51 | -------------------------------------------------------------------------------- /code/chap02/sum_by_reducebykey.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #----------------------------------------------------- 3 | # @author Mahmoud Parsian 4 | #----------------------------------------------------- 5 | SECONDS=0 6 | /bin/date 7 | # do some work 8 | # 9 | # define Spark's installed 
directory 10 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 11 | # 12 | # create one billion (key, value) pairs 13 | python generate_key_value_pairs.py > kv.txt 14 | # 15 | # NOTE: define your input path 16 | INPUT_PATH="file:///pyspark_book/code/chap02/kv.txt" 17 | # 18 | # define your PySpark program 19 | PROG="/pyspark_book/code/chap02/sum_by_reducebykey.py" 20 | # 21 | # submit your spark application 22 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 23 | # 24 | # 25 | duration=$SECONDS 26 | echo "" 27 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 28 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit word_count_driver.py sample_file.txt 2 | 3 | input_path: sample_file.txt 4 | 5 | records.count(): 3 6 | 7 | records.collect(): 8 | [ 9 | u'red fox jumped high and high', 10 | u'red fox jumped high fence', 11 | u'fox jumped' 12 | ] 13 | 14 | non_empty_records.count(): 3 15 | 16 | non_empty_records.collect(): 17 | [ 18 | u'red fox jumped high and high', 19 | u'red fox jumped high fence', 20 | u'fox jumped' 21 | ] 22 | 23 | words.count(): 13 24 | 25 | words.collect(): 26 | [ 27 | u'red', 28 | u'fox', 29 | u'jumped', 30 | u'high', 31 | u'and', 32 | u'high', 33 | u'red', 34 | u'fox', 35 | u'jumped', 36 | u'high', 37 | u'fence', 38 | u'fox', 39 | u'jumped' 40 | ] 41 | 42 | pairs.count(): 13 43 | 44 | pairs.collect(): 45 | [ 46 | (u'red', 1), 47 | (u'fox', 1), 48 | (u'jumped', 1), 49 | (u'high', 1), 50 | (u'and', 1), 51 | (u'high', 1), 52 | (u'red', 1), 53 | (u'fox', 1), 54 | (u'jumped', 1), 55 | (u'high', 1), 56 | (u'fence', 1), 57 | (u'fox', 1), 58 | (u'jumped', 1) 59 | ] 60 | 61 | frequencies.count(): 6 62 | 63 | frequencies.collect(): 64 | [ 65 | (u'and', 1), 66 | (u'high', 3), 67 | (u'fox', 3), 68 | (u'red', 2), 69 | (u'fence', 1), 70 | (u'jumped', 3) 71 | ] -------------------------------------------------------------------------------- /code/chap02/word_count_driver.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for word count in PySpark. 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap02/sample_file.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap02/word_count_driver.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_by_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for word count in PySpark. 
3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap02/sample_file.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap02/word_count_driver_by_groupbykey.py" 9 | # 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 13 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit word_count_driver_shorthand.py sample_file.txt 2 | 3 | input_path: sample_file.txt 4 | 5 | frequencies.count(): 6 6 | frequencies.collect(): 7 | [ 8 | (u'and', 1), 9 | (u'high', 3), 10 | (u'fox', 3), 11 | (u'red', 2), 12 | (u'fence', 1), 13 | (u'jumped', 3) 14 | ] 15 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #----------------------------------------------------- 3 | # This is a word count in PySpark. 4 | # The goal is to show how "word count" works. 5 | # Here we write transformations in a shorthand! 6 | #------------------------------------------------------ 7 | # Input Parameters: 8 | # argv[1]: String, input path 9 | #------------------------------------------------------- 10 | # @author Mahmoud Parsian 11 | #------------------------------------------------------- 12 | from __future__ import print_function 13 | import sys 14 | from pyspark.sql import SparkSession 15 | 16 | if __name__ == '__main__': 17 | 18 | if len(sys.argv) != 2: 19 | print("Usage: word_count_driver_shorthand.py ", file=sys.stderr) 20 | exit(-1) 21 | 22 | spark = SparkSession\ 23 | .builder\ 24 | .appName("Word-Count-App")\ 25 | .getOrCreate() 26 | 27 | # sys.argv[0] is the name of the script. 28 | # sys.argv[1] is the first parameter 29 | input_path = sys.argv[1] 30 | print("input_path: {}".format(input_path)) 31 | 32 | # create frequencies as RDD 33 | frequencies = spark.sparkContext.textFile(input_path)\ 34 | .filter(lambda line: len(line) > 0)\ 35 | .flatMap(lambda line: line.lower().split(" "))\ 36 | .map(lambda word: (word, 1))\ 37 | .reduceByKey(lambda a, b: a + b) 38 | # 39 | print("frequencies.count(): ", frequencies.count()) 40 | print("frequencies.collect(): ", frequencies.collect()) 41 | 42 | # done! 43 | spark.stop() 44 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for word count in PySpark. 
3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap02/sample_file.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap02/word_count_driver_shorthand.py" 9 | # 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 13 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand_by_groupbykey.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit word_count_driver_shorthand_by_groupbykey.py sample_file.txt 2 | 3 | input_path: sample_file.txt 4 | 5 | frequencies.count(): 6 6 | 7 | frequencies.collect(): 8 | [ 9 | (u'and', 1), 10 | (u'high', 3), 11 | (u'fox', 3), 12 | (u'red', 2), 13 | (u'fence', 1), 14 | (u'jumped', 3) 15 | ] 16 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand_by_groupbykey.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #----------------------------------------------------- 3 | # This is a word count in PySpark. 4 | # The goal is to show how "word count" works. 5 | # Here we write transformations in a shorthand! 6 | #------------------------------------------------------ 7 | # Input Parameters: 8 | # argv[1]: String, input path 9 | #------------------------------------------------------- 10 | # @author Mahmoud Parsian 11 | #------------------------------------------------------- 12 | from __future__ import print_function 13 | import sys 14 | from pyspark.sql import SparkSession 15 | 16 | if __name__ == '__main__': 17 | 18 | if len(sys.argv) != 2: 19 | print("Usage: word_count_driver.py ", file=sys.stderr) 20 | exit(-1) 21 | 22 | spark = SparkSession\ 23 | .builder\ 24 | .appName("Word-Count-App")\ 25 | .getOrCreate() 26 | 27 | # sys.argv[0] is the name of the script. 28 | # sys.argv[1] is the first parameter 29 | input_path = sys.argv[1] 30 | print("input_path: {}".format(input_path)) 31 | 32 | # create frequencies as RDD 33 | frequencies = spark.sparkContext.textFile(input_path)\ 34 | .filter(lambda line: len(line) > 0)\ 35 | .flatMap(lambda line: line.lower().split(" "))\ 36 | .map(lambda word: (word, 1))\ 37 | .groupByKey()\ 38 | .mapValues(lambda counts : sum(counts)) 39 | # 40 | print("frequencies.count(): ", frequencies.count()) 41 | print("frequencies.collect(): ", frequencies.collect()) 42 | 43 | # done! 44 | spark.stop() 45 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand_by_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for word count in PySpark. 
3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap02/sample_file.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap02/word_count_driver_shorthand_by_groupbykey.py" 9 | # 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 13 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand_sorted.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit word_count_driver_shorthand_sorted.py sample_file.txt 2 | 3 | input_path: sample_file.txt 4 | 5 | frequencies.count(): 6 6 | frequencies.collect(): 7 | [ 8 | (u'and', 1), 9 | (u'high', 3), 10 | (u'fox', 3), 11 | (u'red', 2), 12 | (u'fence', 1), 13 | (u'jumped', 3) 14 | ] 15 | 16 | sorted_by_key.count(): 6 17 | sorted_by_key.collect(): 18 | [ 19 | (u'and', 1), 20 | (u'fence', 1), 21 | (u'fox', 3), 22 | (u'high', 3), 23 | (u'jumped', 3), 24 | (u'red', 2) 25 | ] 26 | 27 | sorted_by_value.count(): 6 28 | sorted_by_value.collect(): 29 | [ 30 | (u'and', 1), 31 | (u'fence', 1), 32 | (u'red', 2), 33 | (u'high', 3), 34 | (u'fox', 3), 35 | (u'jumped', 3) 36 | ] 37 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand_sorted.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for word count in PySpark. 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap02/sample_file.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap02/word_count_driver_shorthand_sorted.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_with_filter.log: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a word count in PySpark. 3 | # The goal is to show how "word count" works. 4 | # Here we write transformations in a shorthand! 5 | # 6 | # RULES: 7 | # RULE-1: 8 | # Here I introduce the RDD.filter() transformation 9 | # to ignore the words if their length is less than 3.
10 | # This is implemented by: 11 | # .filter(lambda word : len(word) > 2) 12 | # RULE-2: 13 | # If the total frequency of any unique word is less 14 | # than 2, then ignore that word from the final output 15 | # This is implemented by: 16 | # .filter(lambda (k, v) : v > 1) 17 | # 18 | #------------------------------------------------------ 19 | 20 | ./bin/spark-submit word_count_driver_with_filter.py sample_file_extra.txt 21 | 22 | input_path: sample_file_extra.txt 23 | 24 | file_contents = 25 | a red fox jumped high and of high 26 | red fox jumped of high fence 27 | a fox jumped 28 | 29 | frequencies.count(): 6 30 | frequencies.collect(): 31 | [ 32 | (u'and', 1), 33 | (u'high', 3), 34 | (u'fox', 3), 35 | (u'red', 2), 36 | (u'fence', 1), 37 | (u'jumped', 3) 38 | ] 39 | 40 | filtered.count(): 4 41 | filtered.collect(): 42 | [ 43 | (u'high', 3), 44 | (u'fox', 3), 45 | (u'red', 2), 46 | (u'jumped', 3) 47 | ] -------------------------------------------------------------------------------- /code/chap02/word_count_driver_with_filter.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for word count in PySpark. 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap02/sample_file_extra.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap02/word_count_driver_with_filter.py" 9 | # 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 13 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_with_filter_and_threshold.log: -------------------------------------------------------------------------------- 1 | export THRESHOLD_WORD_LENGTH=2 2 | export THRESHOLD_FREQUENCY=1 3 | ./bin/spark-submit word_count_driver_with_filter_and_threshold.py sample_file_extra.txt ${THRESHOLD_WORD_LENGTH} ${THRESHOLD_FREQUENCY} 4 | 5 | len(sys.argv) = 4 6 | script: sys.argv[0] = /pyspark_book/code/chap02/word_count_driver_with_filter_and_threshold.py 7 | p1: sys.argv[1] = sample_file_extra.txt 8 | p2: sys.argv[2] = 2 9 | p3: sys.argv[3] = 1 10 | 11 | script: /pyspark_book/git-manning/code/chap02/word_count_driver_with_filter_and_threshold.py 12 | 13 | input_path: sample_file_extra.txt 14 | 15 | file_contents = 16 | a red fox jumped high and of high 17 | red fox jumped of high fence 18 | a fox jumped 19 | 20 | THRESHOLD_WORD_LENGTH = 2 21 | THRESHOLD_FREQUENCY = 1 22 | 23 | frequencies.count(): 6 24 | frequencies.collect(): 25 | [ 26 | (u'and', 1), 27 | (u'high', 3), 28 | (u'fox', 3), 29 | (u'red', 2), 30 | (u'fence', 1), 31 | (u'jumped', 3) 32 | ] 33 | 34 | filtered.count(): 4 35 | filtered.collect(): 36 | [ 37 | (u'high', 3), 38 | (u'fox', 3), 39 | (u'red', 2), 40 | (u'jumped', 3) 41 | ] 42 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_with_filter_and_threshold.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for word count in PySpark. 
3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap02/sample_file_extra.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap02/word_count_driver_with_filter_and_threshold.py" 9 | # 10 | # define thresholds 11 | export THRESHOLD_WORD_LENGTH=2 12 | export THRESHOLD_FREQUENCY=1 13 | # 14 | # run the PySpark program: 15 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE ${THRESHOLD_WORD_LENGTH} ${THRESHOLD_FREQUENCY} 16 | -------------------------------------------------------------------------------- /code/chap02/word_count_python.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #----------------------------------------------------- 3 | # This is a word count in Python programming language. 4 | # The goal is to show how "word count" works. 5 | #------------------------------------------------------ 6 | # Input Parameters: 7 | # argv[1]: String, input path 8 | #------------------------------------------------------- 9 | # @author Mahmoud Parsian 10 | #------------------------------------------------------- 11 | import sys 12 | import collections 13 | # 14 | input_path = sys.argv[1] 15 | # 16 | file = open(input_path, "r") 17 | wordcount = collections.Counter() 18 | # 19 | for word in file.read().split(): 20 | wordcount[word] += 1 21 | #for-done 22 | print (wordcount) 23 | file.close() 24 | -------------------------------------------------------------------------------- /code/chap02/word_count_python_shorthand.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #----------------------------------------------------- 3 | # This is a word count in Python programming language. 4 | # The goal is to show how "word count" works. 
5 | #------------------------------------------------------ 6 | # Input Parameters: 7 | # argv[1]: String, input path 8 | #------------------------------------------------------- 9 | # @author a book reviewer (anonymous) 10 | #------------------------------------------------------- 11 | import sys 12 | import collections 13 | # 14 | input_path = sys.argv[1] 15 | # 16 | with open(input_path) as input_file: 17 | word_count = collections.Counter(input_file.read().split()) 18 | # 19 | print (word_count) 20 | 21 | 22 | -------------------------------------------------------------------------------- /code/chap03/datadir/file1: -------------------------------------------------------------------------------- 1 | record 1 of file1 2 | record 2 of file1 3 | record 3 of file1 4 | -------------------------------------------------------------------------------- /code/chap03/datadir/file2: -------------------------------------------------------------------------------- 1 | record 1 of file2 2 | record 2 of file2 3 | record 3 of file2 4 | record 4 of file2 5 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_collection.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_collection.py 3 | # Run this using Python3 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export SPARK_PROG="/pyspark_book/code/chap03/dataframe_creation_from_collection.py" 9 | export PYSPARK_PYTHON=python3 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 13 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_csv_no_header.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_from_csv_no_header.py kv_no_header.txt 2 | 3 | spark= 4 | 5 | input path : kv_no_header.txt 6 | 7 | file_contents = 8 | alex,200 9 | alex,300 10 | bob,100 11 | bob,400 12 | bob,500 13 | mary,700 14 | mary,200 15 | mary,300 16 | jane,300 17 | adel,200 18 | adel,400 19 | adel,600 20 | adel,800 21 | 22 | df = 23 | [ 24 | Row(_c0=u'alex', _c1=200), 25 | Row(_c0=u'alex', _c1=300), 26 | Row(_c0=u'bob', _c1=100), 27 | Row(_c0=u'bob', _c1=400), 28 | Row(_c0=u'bob', _c1=500), 29 | Row(_c0=u'mary', _c1=700), 30 | Row(_c0=u'mary', _c1=200), 31 | Row(_c0=u'mary', _c1=300), 32 | Row(_c0=u'jane', _c1=300), 33 | Row(_c0=u'adel', _c1=200), 34 | Row(_c0=u'adel', _c1=400), 35 | Row(_c0=u'adel', _c1=600), 36 | Row(_c0=u'adel', _c1=800) 37 | ] 38 | 39 | +----+---+ 40 | | _c0|_c1| 41 | +----+---+ 42 | |alex|200| 43 | |alex|300| 44 | | bob|100| 45 | | bob|400| 46 | | bob|500| 47 | |mary|700| 48 | |mary|200| 49 | |mary|300| 50 | |jane|300| 51 | |adel|200| 52 | |adel|400| 53 | |adel|600| 54 | |adel|800| 55 | +----+---+ 56 | 57 | root 58 | |-- _c0: string (nullable = true) 59 | |-- _c1: integer (nullable = true) 60 | 61 | 62 | +----+-----+ 63 | |name|value| 64 | +----+-----+ 65 | |alex| 200| 66 | |alex| 300| 67 | | bob| 100| 68 | | bob| 400| 69 | | bob| 500| 70 | |mary| 700| 71 | |mary| 200| 72 | |mary| 300| 73 | |jane| 300| 74 | |adel| 200| 75 | |adel| 400| 76 | |adel| 600| 77 | |adel| 800| 78 | +----+-----+ 79 | 80 | root 81 | |-- name: string 
(nullable = true) 82 | |-- value: integer (nullable = true) 83 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_csv_no_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_csv_no_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap03/kv_no_header.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap03/dataframe_creation_from_csv_no_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_csv_with_header.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_from_csv_with_header.py kv_with_header.txt 2 | 3 | spark= 4 | 5 | input path : kv_with_header.txt 6 | 7 | file_contents = 8 | name,value 9 | alex,200 10 | alex,300 11 | bob,100 12 | bob,400 13 | bob,500 14 | mary,700 15 | mary,200 16 | mary,300 17 | jane,300 18 | adel,200 19 | adel,400 20 | adel,600 21 | adel,800 22 | 23 | df = 24 | [ 25 | Row(name=u'alex', value=200), 26 | Row(name=u'alex', value=300), 27 | Row(name=u'bob', value=100), 28 | Row(name=u'bob', value=400), 29 | Row(name=u'bob', value=500), 30 | Row(name=u'mary', value=700), 31 | Row(name=u'mary', value=200), 32 | Row(name=u'mary', value=300), 33 | Row(name=u'jane', value=300), 34 | Row(name=u'adel', value=200), 35 | Row(name=u'adel', value=400), 36 | Row(name=u'adel', value=600), 37 | Row(name=u'adel', value=800) 38 | ] 39 | 40 | +----+-----+ 41 | |name|value| 42 | +----+-----+ 43 | |alex| 200| 44 | |alex| 300| 45 | | bob| 100| 46 | | bob| 400| 47 | | bob| 500| 48 | |mary| 700| 49 | |mary| 200| 50 | |mary| 300| 51 | |jane| 300| 52 | |adel| 200| 53 | |adel| 400| 54 | |adel| 600| 55 | |adel| 800| 56 | +----+-----+ 57 | 58 | root 59 | |-- name: string (nullable = true) 60 | |-- value: integer (nullable = true) 61 | 62 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_csv_with_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_csv_with_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap03/kv_with_header.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap03/dataframe_creation_from_csv_with_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_dictionary.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_from_dictionary.py 2 | 3 | spark= 4 | 5 | mydict= 6 | { 7 | 'A': '1', 8 | 'B': '2', 9 | 'E': '99', 10 | 'D': '8' 11 | } 12 | 13 | df = 
DataFrame[key: string, value: string] 14 | df.count = 4 15 | df.collect() = 16 | [ 17 | Row(key=u'A', value=u'1'), 18 | Row(key=u'B', value=u'2'), 19 | Row(key=u'E', value=u'99'), 20 | Row(key=u'D', value=u'8') 21 | ] 22 | 23 | +---+-----+ 24 | |key|value| 25 | +---+-----+ 26 | | A| 1| 27 | | B| 2| 28 | | E| 99| 29 | | D| 8| 30 | +---+-----+ 31 | 32 | root 33 | |-- key: string (nullable = true) 34 | |-- value: string (nullable = true) 35 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_dictionary.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_dictionary.py 3 | # Run this using Python3 by setting PYSPARK_PYTHON 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export SPARK_PROG="/pyspark_book/code/chap03/dataframe_creation_from_dictionary.py" 9 | export PYSPARK_PYTHON=python3 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 13 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_directory.log: -------------------------------------------------------------------------------- 1 | $ ls -l sample_dir2 2 | -rw-r--r-- 1 mparsian dev 54 Nov 11 19:53 file1.txt 3 | -rw-r--r-- 1 mparsian dev 90 Nov 11 19:53 file2.txt 4 | -rw-r--r--@ 1 mparsian dev 31 Nov 11 19:54 file3.csv 5 | -rw-r--r-- 1 mparsian dev 19 Nov 11 19:55 file4.csv 6 | 7 | $ cat file3.csv 8 | alex,33 9 | bob,45 10 | mary,25 11 | jeff,10 12 | 13 | $ cat file4.csv 14 | amanda,44 15 | terry,64 16 | 17 | ./bin/spark-submit dataframe_creation_from_directory.py sample_dir2 18 | 19 | spark= 20 | 21 | input_dir : sample_dir2 22 | 23 | dir_listing = ['file2.txt', 'file1.txt', 'file3.csv', 'file4.csv'] 24 | 25 | df = 26 | [ 27 | Row(_c0=u'alex', _c1=33), 28 | Row(_c0=u'bob', _c1=45), 29 | Row(_c0=u'mary', _c1=25), 30 | Row(_c0=u'jeff', _c1=10), 31 | Row(_c0=u'amanda', _c1=44), 32 | Row(_c0=u'terry', _c1=64) 33 | ] 34 | 35 | +------+---+ 36 | | _c0|_c1| 37 | +------+---+ 38 | | alex| 33| 39 | | bob| 45| 40 | | mary| 25| 41 | | jeff| 10| 42 | |amanda| 44| 43 | | terry| 64| 44 | +------+---+ 45 | 46 | root 47 | |-- _c0: string (nullable = true) 48 | |-- _c1: integer (nullable = true) -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_directory.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_directory.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_DIR="/pyspark_book/code/chap03/sample_dir2" 8 | export SPARK_PROG="/pyspark_book/code/chap03/dataframe_creation_from_directory.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_DIR 12 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_rdd.log: -------------------------------------------------------------------------------- 1 |
./bin/spark-submit dataframe_creation_from_rdd.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | ('alex', 'Sunnyvale', 25), 8 | ('mary', 'Cupertino', 22), 9 | ('jane', 'Ames', 20), 10 | ('bob', 'Stanford', 26) 11 | ] 12 | 13 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 14 | rdd.count() = 4 15 | rdd.collect() = 16 | [ 17 | ('alex', 'Sunnyvale', 25), 18 | ('mary', 'Cupertino', 22), 19 | ('jane', 'Ames', 20), 20 | ('bob', 'Stanford', 26) 21 | ] 22 | 23 | people = PythonRDD[2] at RDD at PythonRDD.scala:48 24 | people.count() = 4 25 | people.collect() = 26 | [ 27 | Row(age=25, city='Sunnyvale', name='alex'), 28 | Row(age=22, city='Cupertino', name='mary'), 29 | Row(age=20, city='Ames', name='jane'), 30 | Row(age=26, city='Stanford', name='bob') 31 | ] 32 | 33 | df = DataFrame[age: bigint, city: string, name: string] 34 | df.count() = 4 35 | df.collect() = 36 | [ 37 | Row(age=25, city=u'Sunnyvale', name=u'alex'), 38 | Row(age=22, city=u'Cupertino', name=u'mary'), 39 | Row(age=20, city=u'Ames', name=u'jane'), 40 | Row(age=26, city=u'Stanford', name=u'bob') 41 | ] 42 | 43 | +---+---------+----+ 44 | |age| city|name| 45 | +---+---------+----+ 46 | | 25|Sunnyvale|alex| 47 | | 22|Cupertino|mary| 48 | | 20| Ames|jane| 49 | | 26| Stanford| bob| 50 | +---+---------+----+ 51 | 52 | root 53 | |-- age: long (nullable = true) 54 | |-- city: string (nullable = true) 55 | |-- name: string (nullable = true) -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_rdd.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_rdd.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap03/dataframe_creation_from_rdd.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap03/kv.txt: -------------------------------------------------------------------------------- 1 | alex,200 2 | bob,100 3 | mary,700 4 | jane,300 5 | adel,900 6 | -------------------------------------------------------------------------------- /code/chap03/kv_no_header.txt: -------------------------------------------------------------------------------- 1 | alex,200 2 | alex,300 3 | bob,100 4 | bob,400 5 | bob,500 6 | mary,700 7 | mary,200 8 | mary,300 9 | jane,300 10 | adel,200 11 | adel,400 12 | adel,600 13 | adel,800 14 | -------------------------------------------------------------------------------- /code/chap03/kv_with_header.txt: -------------------------------------------------------------------------------- 1 | name,value 2 | alex,200 3 | alex,300 4 | bob,100 5 | bob,400 6 | bob,500 7 | mary,700 8 | mary,200 9 | mary,300 10 | jane,300 11 | adel,200 12 | adel,400 13 | adel,600 14 | adel,800 15 | -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_collection.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_creation_from_collection.py 2 | 3 | 4 | spark= 5 | 6 | list_of_strings= ['alex', 'bob', 'jane', 'mary', 'adel'] 7 | rdd1= ParallelCollectionRDD[0] at parallelize at 
PythonRDD.scala:175 8 | rdd1.count= 5 9 | rdd1.collect()= ['alex', 'bob', 'jane', 'mary', 'adel'] 10 | 11 | list_of_pairs = [('alex', 1), ('alex', 3), ('alex', 9), ('alex', 10), ('bob', 4), ('bob', 8)] 12 | rdd2 = ParallelCollectionRDD[2] at parallelize at PythonRDD.scala:175 13 | rdd2.count = 6 14 | rdd2.collect() = [('alex', 1), ('alex', 3), ('alex', 9), ('alex', 10), ('bob', 4), ('bob', 8)] 15 | 16 | rdd2_added = PythonRDD[8] at RDD at PythonRDD.scala:48 17 | rdd2_added.count = 2 18 | rdd2_added.collect() = [('bob', 12), ('alex', 23)] 19 | 20 | rdd2_grouped = PythonRDD[8] at RDD at PythonRDD.scala:48 21 | rdd2_grouped.count = 2 22 | rdd2_grouped.collect() = [('bob', ), ('alex', )] 23 | rdd2_grouped.collect() = (as a list) = [('bob', [4, 8]), ('alex', [1, 3, 9, 10])] 24 | 25 | d = {'key3': 'value3', 'key2': 'value2', 'key1': 'value1'} 26 | d.items()= [('key3', 'value3'), ('key2', 'value2'), ('key1', 'value1')] 27 | rdd_from_dict = ParallelCollectionRDD[17] at parallelize at PythonRDD.scala:175 28 | rdd_from_dict.collect() = [('key3', 'value3'), ('key2', 'value2'), ('key1', 'value1')] 29 | rdd_from_dict.count = 3 -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_collection.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_creation_from_collection.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap03/rdd_creation_from_collection.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_dataframe.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_creation_from_dataframe.py 2 | 3 | spark= 4 | 5 | list_of_pairs= 6 | [ 7 | ('alex', 1), 8 | ('alex', 5), 9 | ('bob', 2), 10 | ('bob', 40), 11 | ('jane', 60), 12 | ('mary', 700), 13 | ('adel', 800) 14 | ] 15 | 16 | df = DataFrame[_1: string, _2: bigint] 17 | df.count = 7 18 | df.collect() = 19 | [ 20 | Row(_1=u'alex', _2=1), 21 | Row(_1=u'alex', _2=5), 22 | Row(_1=u'bob', _2=2), 23 | Row(_1=u'bob', _2=40), 24 | Row(_1=u'jane', _2=60), 25 | Row(_1=u'mary', _2=700), 26 | Row(_1=u'adel', _2=800) 27 | ] 28 | 29 | +----+---+ 30 | | _1| _2| 31 | +----+---+ 32 | |alex| 1| 33 | |alex| 5| 34 | | bob| 2| 35 | | bob| 40| 36 | |jane| 60| 37 | |mary|700| 38 | |adel|800| 39 | +----+---+ 40 | 41 | root 42 | |-- _1: string (nullable = true) 43 | |-- _2: long (nullable = true) 44 | 45 | rdd = DataFrame[_1: string, _2: bigint] 46 | rdd.count = 7 47 | rdd.collect() = 48 | [ 49 | Row(_1=u'alex', _2=1), 50 | Row(_1=u'alex', _2=5), 51 | Row(_1=u'bob', _2=2), 52 | Row(_1=u'bob', _2=40), 53 | Row(_1=u'jane', _2=60), 54 | Row(_1=u'mary', _2=700), 55 | Row(_1=u'adel', _2=800) 56 | ] -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_dataframe.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_creation_from_dataframe.py 3 | #----------------------------------------------------- 4 | # @author 
Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap03/rdd_creation_from_dataframe.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_dictionary.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_creation_from_dictionary.py 2 | 3 | spark= 4 | 5 | mydict= 6 | { 7 | 'A': '1', 8 | 'B': '2', 9 | 'E': '99', 10 | 'D': '8' 11 | } 12 | 13 | 14 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 15 | rdd.count = 4 16 | rdd.collect() = 17 | [ 18 | ('A', '1'), 19 | ('B', '2'), 20 | ('E', '99'), 21 | ('D', '8') 22 | ] -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_dictionary.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_creation_from_dictionary.py 3 | # Run this using Python3 by setting PYSPARK_PYTHON 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export SPARK_PROG="/pyspark_book/code/chap03/rdd_creation_from_dictionary.py" 9 | export PYSPARK_PYTHON=python3 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 13 | -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_directory.log: -------------------------------------------------------------------------------- 1 | $ ls -l sample_dir/ 2 | total 16 3 | -rw-r--r-- 1 mparsian dev 54 Nov 11 18:59 file1.txt 4 | -rw-r--r-- 1 mparsian dev 90 Nov 11 19:00 file2.txt 5 | 6 | $ cat file1.txt 7 | record 1 of file1 8 | record 2 of file1 9 | record 3 of file1 10 | 11 | $ cat file2.txt 12 | record 1 of file2 13 | record 2 of file2 14 | record 3 of file2 15 | record 4 of file2 16 | record 5 of file2 17 | 18 | ./bin/spark-submit rdd_creation_from_directory.py sample_dir 19 | 20 | spark= 21 | 22 | dir path : sample_dir 23 | 24 | dir_listing = ['file2.txt', 'file1.txt'] 25 | 26 | rdd = sample_dir/ MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 27 | rdd.count = 8 28 | rdd.collect() = 29 | [ 30 | u'record 1 of file2', 31 | u'record 2 of file2', 32 | u'record 3 of file2', 33 | u'record 4 of file2', 34 | u'record 5 of file2', 35 | u'record 1 of file1', 36 | u'record 2 of file1', 37 | u'record 3 of file1' 38 | ] 39 | 40 | filtered = PythonRDD[3] at RDD at PythonRDD.scala:48 41 | filtered.count = 2 42 | filtered.collect() = 43 | [ 44 | u'record 3 of file2', 45 | u'record 3 of file1' 46 | ] -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_directory.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_creation_from_directory.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | # 7 | # $ cd sample_dir/ 8 | # $ ls -l 9 | # -rw-r--r-- 1 mparsian dev 54 Nov 11 18:59 file1.txt 10 | # 
-rw-r--r-- 1 mparsian dev 90 Nov 11 19:00 file2.txt 11 | # 12 | # $ cat file1.txt 13 | # record 1 of file1 14 | # record 2 of file1 15 | # record 3 of file1 16 | # 17 | # $ cat file2.txt 18 | # record 1 of file2 19 | # record 2 of file2 20 | # record 3 of file2 21 | # record 4 of file2 22 | # record 5 of file2 23 | # 24 | #------------------------------------------------------ 25 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 26 | export INPUT_DIR="/pyspark_book/code/chap03/sample_dir" 27 | export SPARK_PROG="/pyspark_book/code/chap03/rdd_creation_from_directory.py" 28 | # 29 | # run the PySpark program: 30 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_DIR 31 | -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_file.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_creation_from_file.py kv.txt 2 | 3 | spark= 4 | 5 | input path : kv.txt 6 | 7 | file_contents = 8 | alex,200 9 | bob,100 10 | mary,700 11 | jane,300 12 | adel,900 13 | 14 | rdd = kv.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 15 | rdd.count = 5 16 | rdd.collect() = 17 | [ 18 | 'alex,200', 19 | 'bob,100', 20 | 'mary,700', 21 | 'jane,300', 22 | 'adel,900' 23 | ] 24 | 25 | pairs = PythonRDD[3] at RDD at PythonRDD.scala:48 26 | pairs.count = 5 27 | pairs.collect() = 28 | [ 29 | ('alex', 200), 30 | ('bob', 100), 31 | ('mary', 700), 32 | ('jane', 300), 33 | ('adel', 900) 34 | ] 35 | 36 | filtered = PythonRDD[5] at RDD at PythonRDD.scala:48 37 | filtered.count = 2 38 | filtered.collect() = 39 | [ 40 | ('mary', 700), 41 | ('adel', 900) 42 | ] -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_file.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_creation_from_file.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap03/kv.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap03/rdd_creation_from_file.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap03/sample.txt: -------------------------------------------------------------------------------- 1 | red fox jumped high 2 | fox jumped over high fence 3 | red fox jumped 4 | -------------------------------------------------------------------------------- /code/chap03/sample_dir/file1.txt: -------------------------------------------------------------------------------- 1 | record 1 of file1 2 | record 2 of file1 3 | record 3 of file1 4 | -------------------------------------------------------------------------------- /code/chap03/sample_dir/file2.txt: -------------------------------------------------------------------------------- 1 | record 1 of file2 2 | record 2 of file2 3 | record 3 of file2 4 | record 4 of file2 5 | record 5 of file2 6 | -------------------------------------------------------------------------------- /code/chap03/sample_dir2/file1.txt: -------------------------------------------------------------------------------- 1 | record 1 of file1 2 | record 2 of file1 3 | record 3 of file1 4 | 
-------------------------------------------------------------------------------- /code/chap03/sample_dir2/file2.txt: -------------------------------------------------------------------------------- 1 | record 1 of file2 2 | record 2 of file2 3 | record 3 of file2 4 | record 4 of file2 5 | record 5 of file2 6 | -------------------------------------------------------------------------------- /code/chap03/sample_dir2/file3.csv: -------------------------------------------------------------------------------- 1 | alex,33 2 | bob,45 3 | mary,25 4 | jeff,10 5 | -------------------------------------------------------------------------------- /code/chap03/sample_dir2/file4.csv: -------------------------------------------------------------------------------- 1 | amanda,44 2 | terry,64 3 | -------------------------------------------------------------------------------- /code/chap03/word_count.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark import SparkConf 3 | from pyspark import SparkContext 4 | 5 | def wordcount(sc, input_path): 6 | 7 | records_rdd = sc.textFile(input_path) 8 | print(records_rdd.collect()) 9 | 10 | words_rdd = records_rdd.flatMap(lambda line: line.split(" ")) 11 | print(words_rdd.collect()) 12 | 13 | pairs_rdd = words_rdd.map(lambda word: (word, 1)) 14 | print(pairs_rdd.collect()) 15 | 16 | frequencies_rdd = pairs_rdd.reduceByKey(lambda a, b: a + b) 17 | print(frequencies_rdd.collect()) 18 | 19 | 20 | if __name__ == '__main__': 21 | 22 | conf = SparkConf() 23 | conf.setAppName("WordCount") 24 | conf.set('spark.executor.memory', '500M') 25 | conf.set('spark.cores.max', 4) 26 | try: 27 | sc = SparkContext(conf=conf) 28 | # hard coded input path, for DEMO only 29 | # never hard code 30 | input_path = "/tmp/sample.txt" 31 | 32 | except: 33 | print ("Failed to connect!") 34 | print(sys.exc_info()[0]) 35 | 36 | # Execute word count 37 | wordcount(sc, input_path) 38 | -------------------------------------------------------------------------------- /code/chap03/word_count.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt" 6 | # 7 | # define your PySpark program 8 | PROG=/pyspark_book/code/chap01/word_count_with_params.py 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap03/word_count_with_params.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt" 6 | # 7 | # define your PySpark program 8 | PROG=/pyspark_book/code/chap03/word_count_with_params.py 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap03/word_count_with_threshold.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark import SparkConf 3 | from pyspark import SparkContext 4 | 5 | def wordcount(sc, input_path, threshold): 6 | 7 | records_rdd = sc.textFile(input_path) 8 | print(records_rdd.collect()) 9 | 
10 | words_rdd = records_rdd.flatMap(lambda line: line.split(" ")) 11 | print(words_rdd.collect()) 12 | 13 | pairs_rdd = words_rdd.map(lambda word: (word, 1)) 14 | print(pairs_rdd.collect()) 15 | 16 | frequencies_rdd = pairs_rdd.reduceByKey(lambda a, b: a + b) 17 | print(frequencies_rdd.collect()) 18 | 19 | # filter out words with fewer than threshold occurrences 20 | filtered_rdd = frequencies_rdd.filter(lambda word_count: word_count[1] >= threshold) 21 | print(filtered_rdd.collect()) 22 | 23 | if __name__ == '__main__': 24 | 25 | conf = SparkConf() 26 | conf.setAppName("WordCount") 27 | conf.set('spark.executor.memory', '500M') 28 | conf.set('spark.cores.max', 4) 29 | try: 30 | sc = SparkContext(conf=conf) 31 | except: 32 | print ("Failed to connect!") 33 | print(sys.exc_info()[0]) 34 | 35 | # sys.argv[0] is the name of the script. 36 | # sys.argv[1] is the first parameter: filename 37 | # sys.argv[2] is the second parameter: threshold 38 | input_path = sys.argv[1] # "file:///Users/mparsian/sample.txt" 39 | print("input_path: {}".format(input_path)) 40 | 41 | # get threshold 42 | threshold = int(sys.argv[2]) 43 | 44 | # Execute word count 45 | wordcount(sc, input_path, threshold) 46 | -------------------------------------------------------------------------------- /code/chap03/word_count_with_threshold.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt" 6 | # 7 | # define your PySpark program 8 | PROG=/pyspark_book/code/chap03/word_count_with_threshold.py 9 | # 10 | # submit your spark application (the second argument is the frequency threshold) 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 2 12 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V1/run_dna_base_count_ver_1.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="file:///pyspark_book/code/chap04/data/sample.fasta" 6 | # 7 | # define your PySpark program 8 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V1/dna_base_count_ver_1.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V1/run_dna_base_count_ver_1_1GB.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # define Spark's installed directory 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | # 9 | # define your input path 10 | INPUT_PATH="file:///pyspark_book/code/chap04/data/*.fasta" 11 | # 12 | # define your PySpark program 13 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V1/dna_base_count_ver_1.py" 14 | # 15 | # submit your spark application 16 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 17 | # 18 | duration=$SECONDS 19 | echo "" 20 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed."
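The Version 1 run scripts above submit `dna_base_count_ver_1.py`, which is not reproduced in this listing. As a rough guide only, here is a minimal sketch of the basic map-and-reduce approach that the chapter 4 README attributes to Version 1: every character of a non-description FASTA line becomes a `(base, 1)` pair, and the pairs are summed with `reduceByKey()`. The names and structure below are illustrative assumptions, not the book's actual program.

````
# Illustrative sketch only -- NOT the book's dna_base_count_ver_1.py.
# Version 1 idea: emit one (base, 1) pair per character, sum with reduceByKey().
import sys
from pyspark.sql import SparkSession

if __name__ == '__main__':
    input_path = sys.argv[1]
    spark = SparkSession.builder.appName("dna-base-count-v1-sketch").getOrCreate()

    counts = spark.sparkContext.textFile(input_path)\
        .filter(lambda line: not line.startswith(">"))\
        .flatMap(lambda line: [(c, 1) for c in line.lower()])\
        .reduceByKey(lambda a, b: a + b)

    print(counts.collect())
    spark.stop()
````

Emitting one pair per character is simple, but it shuffles a pair for every base in the input; Versions 2 and 3 cut that shuffle volume by combining counts locally before the shuffle.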
21 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V1/run_dna_base_count_ver_1_big.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # define Spark's installed directory 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | # 9 | # NOTE: define your input path 10 | # Before running your PySpark program, 11 | # Download chr1.subst.fa from this location: 12 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 13 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/chr1.subst.fa.gz 14 | # 15 | INPUT_PATH="file:///pyspark_book/code/chap04/chr1.subst.fa" 16 | # 17 | # define your PySpark program 18 | PROG=/pyspark_book/code/chap04/DNA-FASTA-V1/dna_base_count_ver_1.py 19 | # 20 | # submit your spark application 21 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 22 | # 23 | duration=$SECONDS 24 | echo "" 25 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 26 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V2/run_dna_base_count_ver_2.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="file:///pyspark_book/code/chap04/data/sample.fasta" 6 | # 7 | # define your PySpark program 8 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V2/dna_base_count_ver_2.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V2/run_dna_base_count_ver_2_1GB.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # define Spark's installed directory 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | # 9 | # define your input path 10 | INPUT_PATH="file:///pyspark_book/code/chap04/data/*.fa" 11 | # 12 | # define your PySpark program 13 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V2/dna_base_count_ver_2.py" 14 | # 15 | # submit your spark application 16 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 17 | # 18 | duration=$SECONDS 19 | echo "" 20 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 
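The Version 2 run scripts above submit `dna_base_count_ver_2.py`, which is likewise not reproduced in this listing. Below is a minimal sketch of the in-mapper combiner design pattern that the chapter 4 README attributes to Version 2, assuming the combining is done per input line with a local `Counter` so that only aggregated `(base, count)` pairs reach the shuffle; all names and structure are illustrative assumptions only.

````
# Illustrative sketch only -- NOT the book's dna_base_count_ver_2.py.
# Version 2 idea: combine counts inside the mapper (per line) before shuffling.
import sys
from collections import Counter
from pyspark.sql import SparkSession

def count_bases(line):
    # local (in-mapper) aggregation for one FASTA sequence line
    return list(Counter(line.lower()).items())

if __name__ == '__main__':
    input_path = sys.argv[1]
    spark = SparkSession.builder.appName("dna-base-count-v2-sketch").getOrCreate()

    counts = spark.sparkContext.textFile(input_path)\
        .filter(lambda line: not line.startswith(">"))\
        .flatMap(count_bases)\
        .reduceByKey(lambda a, b: a + b)

    print(counts.collect())
    spark.stop()
````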
21 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V2/run_dna_base_count_ver_2_big.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # define Spark's installed directory 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | # 9 | # NOTE: define your input path 10 | # Before running your PySpark program, 11 | # Download chr1.subst.fa from this location: 12 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 13 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/chr1.subst.fa.gz 14 | # 15 | INPUT_PATH="file:///pyspark_book/code/chap04/data/chr1.subst.fa" 16 | # 17 | # define your PySpark program 18 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V2/dna_base_count_ver_2.py" 19 | # 20 | # submit your spark application 21 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 22 | # 23 | duration=$SECONDS 24 | echo "" 25 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 26 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V3/run_dna_base_count_ver_3.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="file:///pyspark_book/code/chap04/data/sample.fasta" 6 | # 7 | # define your PySpark program 8 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V3/dna_base_count_ver_3.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V3/run_dna_base_count_ver_3_1GB.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # define Spark's installed directory 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | # 9 | # define your input path 10 | INPUT_PATH="file:///pyspark_book/code/chap04/data/*.fa" 11 | # 12 | # define your PySpark program 13 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V3/dna_base_count_ver_3.py" 14 | # 15 | # submit your spark application 16 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 17 | # 18 | # 19 | duration=$SECONDS 20 | echo "" 21 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 
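The Version 3 run scripts above submit `dna_base_count_ver_3.py`, also not reproduced in this listing. According to the chapter 4 README, Version 3 applies the in-mapper combiner pattern with `mapPartitions()`; the sketch below is an illustrative assumption rather than the book's actual code: it builds one dictionary per partition, so the shuffle carries at most one `(base, count)` pair per distinct character per partition.

````
# Illustrative sketch only -- NOT the book's dna_base_count_ver_3.py.
# Version 3 idea: combine counts per partition with mapPartitions(), then reduceByKey().
import sys
from collections import defaultdict
from pyspark.sql import SparkSession

def count_partition(iterator):
    # aggregate all lines of one partition into a single dictionary
    counts = defaultdict(int)
    for line in iterator:
        if line.startswith(">"):
            continue  # skip FASTA description lines
        for c in line.lower():
            counts[c] += 1
    return counts.items()

if __name__ == '__main__':
    input_path = sys.argv[1]
    spark = SparkSession.builder.appName("dna-base-count-v3-sketch").getOrCreate()

    counts = spark.sparkContext.textFile(input_path)\
        .mapPartitions(count_partition)\
        .reduceByKey(lambda a, b: a + b)

    print(counts.collect())
    spark.stop()
````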
22 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V3/run_dna_base_count_ver_3_big.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # define Spark's installed directory 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | # 9 | # NOTE: define your input path 10 | # Before running your PySpark program, 11 | # Download chr1.subst.fa from this location: 12 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 13 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/chr1.subst.fa.gz 14 | # 15 | INPUT_PATH="file:///pyspark_book/code/chap04/data/chr1.subst.fa" 16 | # 17 | # define your PySpark program 18 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V3/dna_base_count_ver_3.py" 19 | # 20 | # submit your spark application 21 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 22 | # 23 | # 24 | duration=$SECONDS 25 | echo "" 26 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 27 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTQ/run_dna_base_count_fastq.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="file:///pyspark_book/code/chap04/data/sp1.fastq" 6 | # 7 | # define your PySpark program 8 | PROG="/pyspark_book/code/chap04/DNA-FASTQ/dna_base_count_fastq.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap04/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 4 2 | 3 | ## DNA-Base-Count Programs using FASTA Input Format 4 | 5 | Using FASTA input files, there are 3 versions of DNA-Base-Count 6 | 7 | * Version-1: 8 | * Uses basic MapReduce programs 9 | * Using PySpark (`chap04/DNA-FASTA-V1/dna_base_count_ver_1.py`) 10 | 11 | * Version-2: 12 | * Uses InMapper Combiner design pattern 13 | * Using PySpark (`chap04/DNA-FASTA-V2/dna_base_count_ver_2.py`) 14 | 15 | * Version-3: 16 | * Uses InMapper Combiner design pattern (by using mapPartitions() transformations) 17 | * Using PySpark (`chap04/DNA-FASTA-V3/dna_base_count_ver_3.py`) 18 | 19 | 20 | ## DNA-Base-Count Programs using FASTQ Input Format 21 | 22 | Using FASTQ input files, the following solution is available: 23 | 24 | * Uses InMapper Combiner design pattern (by using mapPartitions() transformations) 25 | * Using PySpark (`chap04/DNA-FASTQ/dna_base_count_fastq.py`) 26 | 27 | 28 | ## FASTA Files to Test DNA-Base-Count 29 | 30 | * A small sample FASTA file (`chap04/data/sample.fasta`) is provided. 
31 | 32 | * To test DNA-Base-Count programs with large size FASTA files, 33 | you may download them from here: 34 | 35 | 36 | ```` 37 | ftp://ftp.ensembl.org/pub/release-91/fasta/homo_sapiens/dna/ 38 | 39 | ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/ 40 | 41 | ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz 42 | 43 | ```` 44 | -------------------------------------------------------------------------------- /code/chap04/data/sample.fasta: -------------------------------------------------------------------------------- 1 | >seq1 2 | cGTAaccaataaaaaaacaagcttaacctaattc 3 | >seq2 4 | agcttagTTTGGatctggccgggg 5 | >seq3 6 | gcggatttactcCCCCCAAAAANNaggggagagcccagataaatggagtctgtgcgtccaca 7 | gaattcgcacca 8 | AATAAAACCTCACCCAT 9 | agagcccagaatttactcCCC 10 | >seq4 11 | gcggatttactcaggggagagcccagGGataaatggagtctgtgcgtccaca 12 | gaattcgcacca 13 | -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_aggregatebykey.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit average_by_key_use_aggregatebykey.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | ('alex', 'Sunnyvale', 25), 8 | ('alex', 'Sunnyvale', 33), 9 | ('alex', 'Sunnyvale', 45), 10 | ('alex', 'Sunnyvale', 63), 11 | ('mary', 'Ames', 22), 12 | ('mary', 'Cupertino', 66), 13 | ('mary', 'Ames', 20), 14 | ('bob', 'Ames', 26) 15 | ] 16 | 17 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 18 | rdd.count() = 8 19 | rdd.collect() = 20 | [ 21 | ('alex', 'Sunnyvale', 25), 22 | ('alex', 'Sunnyvale', 33), 23 | ('alex', 'Sunnyvale', 45), 24 | ('alex', 'Sunnyvale', 63), 25 | ('mary', 'Ames', 22), 26 | ('mary', 'Cupertino', 66), 27 | ('mary', 'Ames', 20), 28 | ('bob', 'Ames', 26) 29 | ] 30 | 31 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 32 | rdd2.count() = 8 33 | rdd2.collect() = 34 | [ 35 | ('alex', 25), 36 | ('alex', 33), 37 | ('alex', 45), 38 | ('alex', 63), 39 | ('mary', 22), 40 | ('mary', 66), 41 | ('mary', 20), 42 | ('bob', 26) 43 | ] 44 | 45 | sum_count = PythonRDD[8] at RDD at PythonRDD.scala:48 46 | sum_count.count() = 3 47 | sum_count.collect() = 48 | [ 49 | ('bob', (26, 1)), 50 | ('alex', (166, 4)), 51 | ('mary', (108, 3)) 52 | ] 53 | 54 | averages = PythonRDD[10] at RDD at PythonRDD.scala:48 55 | averages.count() = 3 56 | averages.collect() = 57 | [ 58 | ('bob', 26.0), 59 | ('alex', 41.5), 60 | ('mary', 36.0) 61 | ] -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_aggregatebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_aggregatebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/average_by_key_use_aggregatebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_combinebykey.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit average_by_key_use_combinebykey.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | 
('alex', 'Sunnyvale', 25), 8 | ('alex', 'Sunnyvale', 33), 9 | ('alex', 'Sunnyvale', 45), 10 | ('alex', 'Sunnyvale', 63), 11 | ('mary', 'Ames', 22), 12 | ('mary', 'Cupertino', 66), 13 | ('mary', 'Ames', 20), 14 | ('bob', 'Ames', 26) 15 | ] 16 | 17 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 18 | rdd.count() = 8 19 | rdd.collect() = 20 | [ 21 | ('alex', 'Sunnyvale', 25), 22 | ('alex', 'Sunnyvale', 33), 23 | ('alex', 'Sunnyvale', 45), 24 | ('alex', 'Sunnyvale', 63), 25 | ('mary', 'Ames', 22), 26 | ('mary', 'Cupertino', 66), 27 | ('mary', 'Ames', 20), 28 | ('bob', 'Ames', 26) 29 | ] 30 | 31 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 32 | rdd2.count() = 8 33 | rdd2.collect() = 34 | [ 35 | ('alex', 25), 36 | ('alex', 33), 37 | ('alex', 45), 38 | ('alex', 63), 39 | ('mary', 22), 40 | ('mary', 66), 41 | ('mary', 20), 42 | ('bob', 26) 43 | ] 44 | 45 | sum_count = PythonRDD[8] at RDD at PythonRDD.scala:48 46 | sum_count.count() = 3 47 | sum_count.collect() = 48 | [ 49 | ('bob', (26, 1)), 50 | ('alex', (166, 4)), 51 | ('mary', (108, 3)) 52 | ] 53 | 54 | averages = PythonRDD[10] at RDD at PythonRDD.scala:48 55 | averages.count() = 3 56 | averages.collect() = 57 | [ 58 | ('bob', 26.0), 59 | ('alex', 41.5), 60 | ('mary', 36.0) 61 | ] -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_combinebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_combinebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/average_by_key_use_combinebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_foldbykey.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit average_by_key_use_foldbykey.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | ('alex', 'Sunnyvale', 25), 8 | ('alex', 'Sunnyvale', 33), 9 | ('alex', 'Sunnyvale', 45), 10 | ('alex', 'Sunnyvale', 63), 11 | ('mary', 'Ames', 22), 12 | ('mary', 'Cupertino', 66), 13 | ('mary', 'Ames', 20), 14 | ('bob', 'Ames', 26) 15 | ] 16 | 17 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 18 | rdd.count() = 8 19 | rdd.collect() = 20 | [ 21 | ('alex', 'Sunnyvale', 25), 22 | ('alex', 'Sunnyvale', 33), 23 | ('alex', 'Sunnyvale', 45), 24 | ('alex', 'Sunnyvale', 63), 25 | ('mary', 'Ames', 22), 26 | ('mary', 'Cupertino', 66), 27 | ('mary', 'Ames', 20), 28 | ('bob', 'Ames', 26) 29 | ] 30 | 31 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 32 | rdd2.count() = 8 33 | rdd2.collect() = 34 | [ 35 | ('alex', (25, 1)), 36 | ('alex', (33, 1)), 37 | ('alex', (45, 1)), 38 | ('alex', (63, 1)), 39 | ('mary', (22, 1)), 40 | ('mary', (66, 1)), 41 | ('mary', (20, 1)), 42 | ('bob', (26, 1)) 43 | ] 44 | 45 | sum_count = PythonRDD[8] at RDD at PythonRDD.scala:48 46 | sum_count.count() = 3 47 | sum_count.collect() = 48 | [ 49 | ('bob', (26, 1)), 50 | ('alex', (166, 4)), 51 | ('mary', (108, 3)) 52 | ] 53 | 54 | averages = PythonRDD[10] at RDD at PythonRDD.scala:48 55 | averages.count() = 3 56 | averages.collect() = 57 | [ 58 | 
('bob', 26.0), 59 | ('alex', 41.5), 60 | ('mary', 36.0) 61 | ] -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_foldbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_foldbykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/average_by_key_use_foldbykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_groupbykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/average_by_key_use_groupbykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_reducebykey.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit average_by_key_use_reducebykey.py 2 | 3 | spark= 4 | 5 | list_of_tuples = [('alex', 'Sunnyvale', 25), ('alex', 'Sunnyvale', 33), ('alex', 'Sunnyvale', 45), ('alex', 'Sunnyvale', 63), ('mary', 'Ames', 22), ('mary', 'Cupertino', 66), ('mary', 'Ames', 20), ('bob', 'Ames', 26)] 6 | 7 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 8 | rdd.count() = 8 9 | rdd.collect() = 10 | [ 11 | ('alex', 'Sunnyvale', 25), 12 | ('alex', 'Sunnyvale', 33), 13 | ('alex', 'Sunnyvale', 45), 14 | ('alex', 'Sunnyvale', 63), 15 | ('mary', 'Ames', 22), 16 | ('mary', 'Cupertino', 66), 17 | ('mary', 'Ames', 20), 18 | ('bob', 'Ames', 26) 19 | ] 20 | 21 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 22 | rdd2.count() = 8 23 | rdd2.collect() = 24 | [ 25 | ('alex', (25, 1)), 26 | ('alex', (33, 1)), 27 | ('alex', (45, 1)), 28 | ('alex', (63, 1)), 29 | ('mary', (22, 1)), 30 | ('mary', (66, 1)), 31 | ('mary', (20, 1)), 32 | ('bob', (26, 1)) 33 | ] 34 | 35 | sum_count = PythonRDD[8] at RDD at PythonRDD.scala:48 36 | sum_count.count() = 3 37 | sum_count.collect() = 38 | [ 39 | ('bob', (26, 1)), 40 | ('alex', (166, 4)), 41 | ('mary', (108, 3)) 42 | ] 43 | 44 | averages = PythonRDD[10] at RDD at PythonRDD.scala:48 45 | averages.count() = 3 46 | averages.collect() = 47 | [ 48 | ('bob', 26.0), 49 | ('alex', 41.5), 50 | ('mary', 36.0) 51 | ] -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_reducebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_reducebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export 
SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/average_by_key_use_reducebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_action_describe.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_action_describe.py 2 | 3 | spark= 4 | 5 | pairs = 6 | [ 7 | (10, 'z1'), 8 | (1, 'z2'), 9 | (2, 'z3'), 10 | (9, 'z4'), 11 | (3, 'z5'), 12 | (4, 'z6'), 13 | (5, 'z7'), 14 | (6, 'z8'), 15 | (7, 'z9') 16 | ] 17 | 18 | df.count(): 9 19 | df.collect(): 20 | [ 21 | Row(number=10, name=u'z1'), 22 | Row(number=1, name=u'z2'), 23 | Row(number=2, name=u'z3'), 24 | Row(number=9, name=u'z4'), 25 | Row(number=3, name=u'z5'), 26 | Row(number=4, name=u'z6'), 27 | Row(number=5, name=u'z7'), 28 | Row(number=6, name=u'z8'), 29 | Row(number=7, name=u'z9') 30 | ] 31 | 32 | +------+----+ 33 | |number|name| 34 | +------+----+ 35 | | 10| z1| 36 | | 1| z2| 37 | | 2| z3| 38 | | 9| z4| 39 | | 3| z5| 40 | | 4| z6| 41 | | 5| z7| 42 | | 6| z8| 43 | | 7| z9| 44 | +------+----+ 45 | 46 | +-------+------------------+ 47 | |summary| number| 48 | +-------+------------------+ 49 | | count| 9| 50 | | mean| 5.222222222222222| 51 | | stddev|3.0731814857642954| 52 | | min| 1| 53 | | max| 10| 54 | +-------+------------------+ 55 | 56 | +-------+------------------+----+ 57 | |summary| number|name| 58 | +-------+------------------+----+ 59 | | count| 9| 9| 60 | | mean| 5.222222222222222|null| 61 | | stddev|3.0731814857642954|null| 62 | | min| 1| z1| 63 | | max| 10| z9| 64 | +-------+------------------+----+ 65 | -------------------------------------------------------------------------------- /code/chap05/dataframe_action_describe.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_action_describe.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_action_describe.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_drop.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_drop.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_drop.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_filter.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_filter.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 
6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_filter.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_join_cross.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_cross.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_join_cross.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_join_inner.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_inner.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_join_inner.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_join_left.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_left.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_join_left.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_join_right.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_right.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_join_right.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_sql.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_sql.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_sql.py" 8 | # 9 | # run the PySpark program: 10 | 
$SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_withcolumn.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_withcolumn.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_withcolumn.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/emps.txt: -------------------------------------------------------------------------------- 1 | 1000,alex,67000 2 | 1001,bob,24000 3 | 1002,jane,69000 4 | 1003,betty,55000 5 | 1004,jeff,59000 6 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_cartesian.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_cartesian.py 2 | 3 | spark= 4 | 5 | a = [('a', 2), ('b', 3), ('c', 4)] 6 | 7 | b = [('p', 50), ('x', 60), ('y', 70), ('z', 80)] 8 | 9 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 10 | rdd.count() = 3 11 | rdd.collect() = 12 | [ 13 | ('a', 2), 14 | ('b', 3), 15 | ('c', 4) 16 | ] 17 | 18 | rdd2 = ParallelCollectionRDD[2] at parallelize at PythonRDD.scala:175 19 | rdd2.count() = 4 20 | rdd2.collect() = 21 | [ 22 | ('p', 50), 23 | ('x', 60), 24 | ('y', 70), 25 | ('z', 80) 26 | ] 27 | 28 | cart = org.apache.spark.api.java.JavaPairRDD@4a2f7b2c 29 | cart.count() = 12 30 | cart.collect() = 31 | [ 32 | (('a', 2), ('p', 50)), 33 | (('a', 2), ('x', 60)), 34 | (('a', 2), ('y', 70)), 35 | (('a', 2), ('z', 80)), 36 | (('b', 3), ('p', 50)), 37 | (('b', 3), ('x', 60)), 38 | (('b', 3), ('y', 70)), 39 | (('b', 3), ('z', 80)), 40 | (('c', 4), ('p', 50)), 41 | (('c', 4), ('x', 60)), 42 | (('c', 4), ('y', 70)), 43 | (('c', 4), ('z', 80)) 44 | ] -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_cartesian.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_cartesian.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_cartesian.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_combinebykey.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_combinebykey.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | ('alex', 'Sunnyvale', 25), 8 | ('alex', 'Sunnyvale', 33), 9 | ('alex', 'Sunnyvale', 45), 10 | ('alex', 'Sunnyvale', 63), 11 | ('mary', 'Ames', 22), 12 | ('mary', 'Cupertino', 66), 13 | ('mary', 'Ames', 20), 14 | ('bob', 'Ames', 26) 15 | ] 16 | 17 | rdd = ParallelCollectionRDD[0] at parallelize at 
PythonRDD.scala:175 18 | rdd.count() = 8 19 | rdd.collect() = 20 | [ 21 | ('alex', 'Sunnyvale', 25), 22 | ('alex', 'Sunnyvale', 33), 23 | ('alex', 'Sunnyvale', 45), 24 | ('alex', 'Sunnyvale', 63), 25 | ('mary', 'Ames', 22), 26 | ('mary', 'Cupertino', 66), 27 | ('mary', 'Ames', 20), 28 | ('bob', 'Ames', 26) 29 | ] 30 | 31 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 32 | rdd2.count() = 8 33 | rdd2.collect() = 34 | [ 35 | ('alex', 25), 36 | ('alex', 33), 37 | ('alex', 45), 38 | ('alex', 63), 39 | ('mary', 22), 40 | ('mary', 66), 41 | ('mary', 20), 42 | ('bob', 26) 43 | ] 44 | 45 | combined = PythonRDD[8] at RDD at PythonRDD.scala:48 46 | combined.count() = 3 47 | combined.collect() = 48 | [ 49 | ('bob', (26, 26, 1)), 50 | ('alex', (25, 63, 4)), 51 | ('mary', (20, 66, 3)) 52 | ] 53 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_combinebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_combinebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_combinebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_filter.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_filter.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | ('alex', 'Sunnyvale', 25), 8 | ('alex', 'Sunnyvale', 33), 9 | ('mary', 'Ames', 22), 10 | ('mary', 'Cupertino', 66), 11 | ('jane', 'Ames', 20), 12 | ('bob', 'Ames', 26) 13 | ] 14 | 15 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 16 | rdd.count() = 6 17 | rdd.collect() = 18 | [ 19 | ('alex', 'Sunnyvale', 25), 20 | ('alex', 'Sunnyvale', 33), 21 | ('mary', 'Ames', 22), 22 | ('mary', 'Cupertino', 66), 23 | ('jane', 'Ames', 20), 24 | ('bob', 'Ames', 26) 25 | ] 26 | 27 | filtered_by_lambda = PythonRDD[2] at RDD at PythonRDD.scala:48 28 | filtered_by_lambda.count() = 2 29 | filtered_by_lambda.collect() = 30 | [ 31 | ('alex', 'Sunnyvale', 33), 32 | ('mary', 'Cupertino', 66) 33 | ] 34 | 35 | filtered_by_function = PythonRDD[4] at RDD at PythonRDD.scala:48 36 | filtered_by_function.count() = 2 37 | filtered_by_function.collect() = 38 | [ 39 | ('alex', 'Sunnyvale', 33), 40 | ('mary', 'Cupertino', 66) 41 | ] -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_filter.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_filter.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_filter.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- 
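Note: the listing above includes only rdd_transformation_filter.log and rdd_transformation_filter.sh, not the PySpark program itself. A minimal sketch of the filter logic implied by that log (keep records whose age, the third tuple element, is greater than 30, once with a lambda and once with a named function) might look like the following; the variable and function names are assumptions, not the repository's actual code.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("rdd_transformation_filter").getOrCreate()
sc = spark.sparkContext

list_of_tuples = [('alex', 'Sunnyvale', 25), ('alex', 'Sunnyvale', 33),
                  ('mary', 'Ames', 22), ('mary', 'Cupertino', 66),
                  ('jane', 'Ames', 20), ('bob', 'Ames', 26)]
rdd = sc.parallelize(list_of_tuples)

# keep only the records whose age (third tuple element) is greater than 30
filtered_by_lambda = rdd.filter(lambda t: t[2] > 30)

def is_older_than_30(t):
    return t[2] > 30

filtered_by_function = rdd.filter(is_older_than_30)

# both yield [('alex', 'Sunnyvale', 33), ('mary', 'Cupertino', 66)]
print(filtered_by_lambda.collect())
print(filtered_by_function.collect())

spark.stop()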
/code/chap05/rdd_transformation_flatmap.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_flatmap.py 2 | 3 | spark= 4 | 5 | list_of_strings = 6 | [ 7 | 'of', 8 | 'a fox jumped', 9 | 'fox jumped of fence', 10 | 'a foxy fox jumped high' 11 | ] 12 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 13 | rdd.count() = 4 14 | rdd.collect() = 15 | [ 16 | 'of', 17 | 'a fox jumped', 18 | 'fox jumped of fence', 19 | 'a foxy fox jumped high' 20 | ] 21 | 22 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 23 | rdd2.count() = 9 24 | rdd2.collect() = 25 | [ 26 | 'fox', 27 | 'jumped', 28 | 'fox', 29 | 'jumped', 30 | 'fence', 31 | 'foxy', 32 | 'fox', 33 | 'jumped', 34 | 'high' 35 | ] 36 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_flatmap.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_flatmap.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_flatmap.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_groupbykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_groupbykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_join.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_join.py 2 | 3 | spark= 4 | 5 | source_pairs = 6 | [ 7 | (1, 'u'), 8 | (1, 'v'), 9 | (2, 'a'), 10 | (3, 'b'), 11 | (4, 'z1') 12 | ] 13 | 14 | source.count(): 5 15 | source.collect(): 16 | [ 17 | (1, 'u'), 18 | (1, 'v'), 19 | (2, 'a'), 20 | (3, 'b'), 21 | (4, 'z1') 22 | ] 23 | 24 | other_pairs = 25 | [ 26 | (1, 'x'), 27 | (1, 'y'), 28 | (2, 'c'), 29 | (2, 'd'), 30 | (3, 'm'), 31 | (8, 'z2') 32 | ] 33 | 34 | other.count(): 6 35 | other.collect(): 36 | [ 37 | (1, 'x'), 38 | (1, 'y'), 39 | (2, 'c'), 40 | (2, 'd'), 41 | (3, 'm'), 42 | (8, 'z2') 43 | ] 44 | 45 | joined.count(): 7 46 | joined.collect(): 47 | [ 48 | (1, ('u', 'x')), 49 | (1, ('u', 'y')), 50 | (1, ('v', 'x')), 51 | (1, ('v', 'y')), 52 | (2, ('a', 'c')), 53 | (2, ('a', 'd')), 54 | (3, ('b', 'm')) 55 | ] -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_join.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_join.py 3 | 
#----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_join.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_map.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_map.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | ('alex', 'Sunnyvale', 25), 8 | ('alex', 'Sunnyvale', 33), 9 | ('mary', 'Ames', 22), 10 | ('mary', 'Cupertino', 66), 11 | ('jane', 'Ames', 20), 12 | ('bob', 'Ames', 26) 13 | ] 14 | 15 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 16 | rdd.count() = 6 17 | rdd.collect() = 18 | [ 19 | ('alex', 'Sunnyvale', 25), 20 | ('alex', 'Sunnyvale', 33), 21 | ('mary', 'Ames', 22), 22 | ('mary', 'Cupertino', 66), 23 | ('jane', 'Ames', 20), 24 | ('bob', 'Ames', 26) 25 | ] 26 | 27 | 28 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 29 | rdd2.count() = 6 30 | rdd2.collect() = 31 | [ 32 | ('alex', 25), 33 | ('alex', 33), 34 | ('mary', 22), 35 | ('mary', 66), 36 | ('jane', 20), 37 | ('bob', 26) 38 | ] 39 | 40 | rdd3 = PythonRDD[4] at RDD at PythonRDD.scala:48 41 | rdd3.count() = 6 42 | rdd3.collect() = 43 | [ 44 | ('Sunnyvale', ('alex', 'Sunnyvale', 25)), 45 | ('Sunnyvale', ('alex', 'Sunnyvale', 33)), 46 | ('Ames', ('mary', 'Ames', 22)), 47 | ('Cupertino', ('mary', 'Cupertino', 66)), 48 | ('Ames', ('jane', 'Ames', 20)), 49 | ('Ames', ('bob', 'Ames', 26)) 50 | ] 51 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_map.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_map.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_map.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_mappartitions.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_mappartitions.py 2 | 3 | spark= 4 | 5 | numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] 6 | 7 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 8 | rdd.count() = 13 9 | rdd.collect() = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] 10 | rdd.getNumPartitions() = 3 11 | 12 | ==begin-partition= 13 | 1 14 | 2 15 | 3 16 | 4 17 | ==end-partition= 18 | 19 | ==begin-partition= 20 | 9 21 | 10 22 | 11 23 | 12 24 | 13 25 | ==end-partition= 26 | 27 | ==begin-partition= 28 | 5 29 | 6 30 | 7 31 | 8 32 | ==end-partition= 33 | 34 | minmax_rdd = PythonRDD[3] at RDD at PythonRDD.scala:48 35 | minmax_rdd.count() = 6 36 | minmax_rdd.collect() = [1, 4, 5, 8, 9, 13] 37 | 38 | minmax_list = [1, 4, 5, 8, 9, 13] 39 | min(minmax_list) = 1 40 | max(minmax_list) = 13 -------------------------------------------------------------------------------- 
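Note: rdd_transformation_mappartitions.log above shows only the program's output. A minimal sketch of the mapPartitions() pattern it illustrates (compute a local minimum and maximum per partition, then reduce those few values on the driver) is given below; the helper name min_max and the other identifiers are assumptions, since the .py source is not reproduced in this listing.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("rdd_transformation_mappartitions").getOrCreate()
sc = spark.sparkContext

numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
rdd = sc.parallelize(numbers, 3)    # 3 partitions, as in the log above

def min_max(iterator):
    # materialize the partition once, then emit its local (min, max)
    elements = list(iterator)
    if not elements:
        return []                   # guard against empty partitions
    return [min(elements), max(elements)]

minmax_rdd = rdd.mapPartitions(min_max)
minmax_list = minmax_rdd.collect()  # e.g. [1, 4, 5, 8, 9, 13]

print("min =", min(minmax_list))    # 1
print("max =", max(minmax_list))    # 13

spark.stop()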
/code/chap05/rdd_transformation_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_mappartitions.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_mappartitions_handle_empty_partitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_mappartitions_handle_empty_partitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_mappartitions_handle_empty_partitions.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_reducebykey.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_reducebykey.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | ('alex', 'Sunnyvale', 25), 8 | ('alex', 'Sunnyvale', 33), 9 | ('alex', 'Sunnyvale', 45), 10 | ('alex', 'Sunnyvale', 63), 11 | ('mary', 'Ames', 22), 12 | ('mary', 'Cupertino', 66), 13 | ('mary', 'Ames', 20), 14 | ('bob', 'Ames', 26) 15 | ] 16 | 17 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 18 | rdd.count() = 8 19 | rdd.collect() = 20 | [ 21 | ('alex', 'Sunnyvale', 25), 22 | ('alex', 'Sunnyvale', 33), 23 | ('alex', 'Sunnyvale', 45), 24 | ('alex', 'Sunnyvale', 63), 25 | ('mary', 'Ames', 22), 26 | ('mary', 'Cupertino', 66), 27 | ('mary', 'Ames', 20), 28 | ('bob', 'Ames', 26) 29 | ] 30 | 31 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 32 | rdd2.count() = 8 33 | rdd2.collect() = 34 | [ 35 | ('alex', 25), 36 | ('alex', 33), 37 | ('alex', 45), 38 | ('alex', 63), 39 | ('mary', 22), 40 | ('mary', 66), 41 | ('mary', 20), 42 | ('bob', 26) 43 | ] 44 | 45 | rdd3 = PythonRDD[8] at RDD at PythonRDD.scala:48 46 | rdd3.count() = 3 47 | rdd3.collect() = 48 | [ 49 | ('bob', 26), 50 | ('alex', 166), 51 | ('mary', 108) 52 | ] 53 | 54 | rdd4 = PythonRDD[14] at RDD at PythonRDD.scala:48 55 | rdd4.count() = 3 56 | rdd4.collect() = 57 | [ 58 | ('bob', 26), 59 | ('alex', 63), 60 | ('mary', 66) 61 | ] 62 | 63 | 64 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_reducebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_reducebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export 
SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_reducebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_sortby.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_sortby.py 2 | 3 | spark= 4 | 5 | pairs = 6 | [ 7 | (10, 'z1'), 8 | (1, 'z2'), 9 | (2, 'z3'), 10 | (9, 'z4'), 11 | (3, 'z5'), 12 | (4, 'z6'), 13 | (5, 'z7'), 14 | (6, 'z8'), 15 | (7, 'z9') 16 | ] 17 | 18 | rdd.count(): 9 19 | rdd.collect(): 20 | [ 21 | (10, 'z1'), 22 | (1, 'z2'), 23 | (2, 'z3'), 24 | (9, 'z4'), 25 | (3, 'z5'), 26 | (4, 'z6'), 27 | (5, 'z7'), 28 | (6, 'z8'), 29 | (7, 'z9') 30 | ] 31 | 32 | sorted_by_key_ascending.count(): 9 33 | sorted_by_key_ascending.collect(): 34 | [ 35 | (1, 'z2'), 36 | (2, 'z3'), 37 | (3, 'z5'), 38 | (4, 'z6'), 39 | (5, 'z7'), 40 | (6, 'z8'), 41 | (7, 'z9'), 42 | (9, 'z4'), 43 | (10, 'z1') 44 | ] 45 | 46 | sorted_by_key_descending.count(): 9 47 | sorted_by_key_descending.collect(): 48 | [ 49 | (10, 'z1'), 50 | (9, 'z4'), 51 | (7, 'z9'), 52 | (6, 'z8'), 53 | (5, 'z7'), 54 | (4, 'z6'), 55 | (3, 'z5'), 56 | (2, 'z3'), 57 | (1, 'z2') 58 | ] 59 | 60 | sorted_by_value_ascending.count(): 9 61 | sorted_by_value_ascending.collect(): 62 | [ 63 | (10, 'z1'), 64 | (1, 'z2'), 65 | (2, 'z3'), 66 | (9, 'z4'), 67 | (3, 'z5'), 68 | (4, 'z6'), 69 | (5, 'z7'), 70 | (6, 'z8'), 71 | (7, 'z9') 72 | ] 73 | 74 | sorted_by_value_descending.count(): 9 75 | sorted_by_value_descending.collect(): 76 | [ 77 | (7, 'z9'), 78 | (6, 'z8'), 79 | (5, 'z7'), 80 | (4, 'z6'), 81 | (3, 'z5'), 82 | (9, 'z4'), 83 | (2, 'z3'), 84 | (1, 'z2'), 85 | (10, 'z1') 86 | ] -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_sortby.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_sortby.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_sortby.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_takeordered.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_takeordered.py 2 | 3 | spark = 4 | 5 | sc = 6 | 7 | numbers = [8, 10, 1, 2, 9, 3, 4, 5, 6, 7] 8 | 9 | top3 = [1, 2, 3] 10 | 11 | bottom3 = [10, 9, 8] 12 | 13 | pairs = 14 | [ 15 | (10, 'z1'), 16 | (1, 'z2'), 17 | (2, 'z3'), 18 | (9, 'z4'), 19 | (3, 'z5'), 20 | (4, 'z6'), 21 | (5, 'z7'), 22 | (6, 'z8'), 23 | (7, 'z9') 24 | ] 25 | 26 | top3_pairs = 27 | [ 28 | (1, 'z2'), 29 | (2, 'z3'), 30 | (3, 'z5') 31 | ] 32 | 33 | bottom3_pairs = 34 | [ 35 | (10, 'z1'), 36 | (9, 'z4'), 37 | (7, 'z9') 38 | ] -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_takeordered.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run 
rdd_transformation_takeordered.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_takeordered.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/sample_5_records.txt: -------------------------------------------------------------------------------- 1 | A,3 2 | A,4 3 | A,5 4 | B,10 5 | B,20 6 | -------------------------------------------------------------------------------- /code/chap05/users.txt: -------------------------------------------------------------------------------- 1 | 1,Alex,30,124 2 | 2,Bert,32,234 3 | 3,Curt,28,312 4 | 4,Don,32,180 5 | 5,Mary,30,100 6 | 6,Jane,28,212 7 | 7,Joe,28,128 8 | 8,Al,40,600 9 | -------------------------------------------------------------------------------- /code/chap06/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 6 2 | 3 | ## Programs 4 | 5 | The goal of the programs in this chapter is 6 | to show some of the important reductions 7 | in Spark. Some of the reductions by key 8 | are: 9 | 10 | * reduceByKey() 11 | * combineByKey() 12 | * groupByKey() 13 | * aggregateByKey() 14 | 15 | 16 | We want to find the average per key in PySpark. 17 | The solutions are presented in this chapter as: 18 | 19 | * combineByKey(): 20 | * `average_by_key_use_combinebykey.py` (PySpark program) 21 | * `average_by_key_use_combinebykey.sh` (shell script to call PySpark) 22 | 23 | * groupByKey(): 24 | * `average_by_key_use_groupbykey.py` (PySpark program) 25 | * `average_by_key_use_groupbykey.sh` (shell script to call PySpark) 26 | 27 | * reduceByKey(): 28 | * `average_by_key_use_reducebykey.py` (PySpark program) 29 | * `average_by_key_use_reducebykey.sh` (shell script to call PySpark) 30 | 31 | * aggregateByKey(): 32 | * `average_by_key_use_aggregatebykey.py` (PySpark program) 33 | * `average_by_key_use_aggregatebykey.sh` (shell script to call PySpark) 34 | -------------------------------------------------------------------------------- /code/chap06/average_by_key_use_aggregatebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for finding averages per 3 | # key by using the aggregateByKey() transformation 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export SPARK_PROG="/pyspark_book/code/chap06/average_by_key_use_aggregatebykey.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 12 | 13 | -------------------------------------------------------------------------------- /code/chap06/average_by_key_use_combinebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for finding averages per 3 | # key by using the combineByKey() transformation 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8
| export SPARK_PROG="/pyspark_book/code/chap06/average_by_key_use_combinebykey.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 12 | 13 | -------------------------------------------------------------------------------- /code/chap06/average_by_key_use_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for finding averages per 3 | # key by using the groupByKey() transformation 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export SPARK_PROG="/pyspark_book/code/chap06/average_by_key_use_groupbykey.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 12 | 13 | -------------------------------------------------------------------------------- /code/chap06/average_by_key_use_reducebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for finding averages per 3 | # key by using the reduceByKey() transformation 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export SPARK_PROG="/pyspark_book/code/chap06/average_by_key_use_reducebykey.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 12 | 13 | -------------------------------------------------------------------------------- /code/chap07/WorldCupPlayers.csv.data.source: -------------------------------------------------------------------------------- 1 | https://www.kaggle.com/abecklas/fifa-world-cup/downloads/WorldCupPlayers.csv/5 2 | -------------------------------------------------------------------------------- /code/chap07/WorldCupPlayers.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap07/WorldCupPlayers.csv.zip -------------------------------------------------------------------------------- /code/chap07/customers.RECORD.FORMAT.txt: -------------------------------------------------------------------------------- 1 | Each record has the following format: 2 | 3 | <customer_id><,><year><,><transaction_id><,><transaction_amount> 4 | -------------------------------------------------------------------------------- /code/chap07/customers.txt: -------------------------------------------------------------------------------- 1 | c1,2019,T0011,20.67 2 | c1,2019,T0012,12.34 3 | c1,2019,T0013,44.30 4 | c1,2018,T0001,20.67 5 | c1,2018,T0002,12.34 6 | c1,2018,T0003,44.30 7 | c2,2019,T0017,744.30 8 | c2,2019,T0018,820.67 9 | c2,2018,T0022,182.34 10 | c2,2018,T0033,494.30 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_add_columns.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_add_columns.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export
SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_add_columns.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_aggregate_multiple_columns.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_aggregate_multiple_columns.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_aggregate_multiple_columns.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_aggregate_single_column.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_aggregate_single_column.py 2 | 3 | +----+------+-----+ 4 | |name| food|price| 5 | +----+------+-----+ 6 | |mary| lemon| 2.0| 7 | |adam| grape| 1.22| 8 | |adam|carrot| 2.44| 9 | |adam|orange| 1.99| 10 | |john|tomato| 1.99| 11 | |john|carrot| 0.45| 12 | |john|banana| 1.29| 13 | |bill| apple| 0.99| 14 | |bill| taco| 2.59| 15 | +----+------+-----+ 16 | 17 | +------------------+ 18 | | sum(price)| 19 | +------------------+ 20 | |14.959999999999999| 21 | +------------------+ 22 | 23 | +-----------------+ 24 | | avg(price)| 25 | +-----------------+ 26 | |1.662222222222222| 27 | +-----------------+ 28 | 29 | +----------+ 30 | |max(price)| 31 | +----------+ 32 | | 2.59| 33 | +----------+ 34 | 35 | +----------+ 36 | |min(price)| 37 | +----------+ 38 | | 0.45| 39 | +----------+ 40 | 41 | +----+------------------+ 42 | |name| avg(price)| 43 | +----+------------------+ 44 | |adam|1.8833333333333335| 45 | |mary| 2.0| 46 | |john|1.2433333333333334| 47 | |bill| 1.79| 48 | +----+------------------+ 49 | 50 | +----+----------+ 51 | |name|max(price)| 52 | +----+----------+ 53 | |adam| 2.44| 54 | |mary| 2.0| 55 | |john| 1.99| 56 | |bill| 2.59| 57 | +----+----------+ 58 | 59 | +----+----------+ 60 | |name|min(price)| 61 | +----+----------+ 62 | |adam| 1.22| 63 | |mary| 2.0| 64 | |john| 0.45| 65 | |bill| 0.99| 66 | +----+----------+ 67 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_aggregate_single_column.sh: -------------------------------------------------------------------------------- 1 | #---------------------------------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_aggregate_single_column.py 3 | #---------------------------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #---------------------------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_aggregate_single_column.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_call_udf.log: 
-------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_call_udf.py 2 | 3 | squareof3= 9 4 | 5 | squareof7= 49 6 | 7 | data= [('alex', 5), ('jane', 7), ('bob', 9)] 8 | 9 | +----+---+ 10 | |name|age| 11 | +----+---+ 12 | |alex| 5| 13 | |jane| 7| 14 | | bob| 9| 15 | +----+---+ 16 | 17 | +----+---+-----------+ 18 | |name|age|age_squared| 19 | +----+---+-----------+ 20 | |alex| 5| 25| 21 | |jane| 7| 49| 22 | | bob| 9| 81| 23 | +----+---+-----------+ 24 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_call_udf.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_call_udf.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_call_udf.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_cvs_no_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_cvs_no_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap07/emps_no_header.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_cvs_no_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_cvs_with_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_cvs_with_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap07/emps_with_header.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_cvs_with_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_from_collections.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_from_collections.py 2 | data= [('k1', 2), ('k1', 3), ('k1', 5), ('k2', 7), ('k2', 9), ('k3', 8)] 3 | 4 | +---+---+ 5 | | _1| _2| 6 | +---+---+ 7 | | k1| 2| 8 | | k1| 3| 9 | | k1| 5| 10 | | k2| 7| 11 | | k2| 9| 12 | | k3| 8| 13 | +---+---+ 14 | 15 | +------+--------+ 16 | |my_key|my_value| 17 | +------+--------+ 18 | | k1| 2| 19 | | k1| 3| 20 | | k1| 5| 21 | | k2| 7| 22 | | k2| 9| 23 | | k3| 8| 24 | +------+--------+ 25 | 26 | +------+--------+ 27 | |my_key|my_value| 
28 | +------+--------+ 29 | | k2| 7| 30 | | k2| 9| 31 | | k3| 8| 32 | +------+--------+ 33 | 34 | +------+----+ 35 | |my_key|size| 36 | +------+----+ 37 | | k2| 2| 38 | | k1| 3| 39 | | k3| 1| 40 | +------+----+ 41 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_from_collections.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_collections.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | #export INPUT_FILE="/pyspark_book/code/chap07/emps_no_header.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_from_collections.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 12 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_from_pandas.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_from_pandas.py 2 | 3 | panda_dataframe = 4 | integers floats int_arrays 5 | 0 2 1.2 [6] 6 | 1 5 -2.0 [1, 2] 7 | 2 7 1.5 [3, 4, 5] 8 | 3 8 2.7 [6, 7, 8, 9] 9 | 4 9 3.6 [10, 11, 12] 10 | 11 | spark_df = 12 | DataFrame 13 | [ 14 | integers: bigint, 15 | floats: double, 16 | int_arrays: array 17 | ] 18 | 19 | spark_df.show(): 20 | +--------+------+------------+ 21 | |integers|floats| int_arrays| 22 | +--------+------+------------+ 23 | | 2| 1.2| [6]| 24 | | 5| -2.0| [1, 2]| 25 | | 7| 1.5| [3, 4, 5]| 26 | | 8| 2.7|[6, 7, 8, 9]| 27 | | 9| 3.6|[10, 11, 12]| 28 | +--------+------+------------+ 29 | 30 | pandas_df = 31 | integers floats int_arrays 32 | 0 2 1.2 [6] 33 | 1 5 -2.0 [1, 2] 34 | 2 7 1.5 [3, 4, 5] 35 | 3 8 2.7 [6, 7, 8, 9] 36 | 4 9 3.6 [10, 11, 12] -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_from_pandas.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_pandas.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_from_pandas.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_from_rows.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_rows.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_from_rows.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- 
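Note: dataframe_creation_from_pandas.log above shows the result of converting a pandas DataFrame to a Spark DataFrame and back. A minimal sketch of that round trip is given below; the column values mirror the log, but the variable names are assumptions since the .py source is not included in this listing.

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("dataframe_creation_from_pandas").getOrCreate()

# build a pandas DataFrame with the same columns shown in the log
panda_dataframe = pd.DataFrame({
    'integers': [2, 5, 7, 8, 9],
    'floats': [1.2, -2.0, 1.5, 2.7, 3.6],
    'int_arrays': [[6], [1, 2], [3, 4, 5], [6, 7, 8, 9], [10, 11, 12]]
})

# pandas -> Spark: the schema (bigint, double, array<bigint>) is inferred
spark_df = spark.createDataFrame(panda_dataframe)
spark_df.show()

# Spark -> pandas: collects the rows back to the driver
pandas_df = spark_df.toPandas()
print(pandas_df)

spark.stop()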
/code/chap07/dataframe_creation_order_by.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_order_by.py 2 | 3 | data= [('A', 8), ('B', 3), ('A', 4), ('B', 2), ('Z', 7)] 4 | 5 | +---+-----+ 6 | | id|value| 7 | +---+-----+ 8 | | A| 8| 9 | | B| 3| 10 | | A| 4| 11 | | B| 2| 12 | | Z| 7| 13 | +---+-----+ 14 | 15 | +---+-----+ 16 | | id|value| 17 | +---+-----+ 18 | | A| 8| 19 | | A| 4| 20 | | B| 3| 21 | | B| 2| 22 | | Z| 7| 23 | +---+-----+ 24 | 25 | +---+-----+ 26 | | id|value| 27 | +---+-----+ 28 | | Z| 7| 29 | | B| 3| 30 | | B| 2| 31 | | A| 8| 32 | | A| 4| 33 | +---+-----+ 34 | 35 | +---+-----+ 36 | | id|value| 37 | +---+-----+ 38 | | B| 2| 39 | | B| 3| 40 | | A| 4| 41 | | Z| 7| 42 | | A| 8| 43 | +---+-----+ 44 | 45 | +---+-----+ 46 | | id|value| 47 | +---+-----+ 48 | | A| 8| 49 | | Z| 7| 50 | | A| 4| 51 | | B| 3| 52 | | B| 2| 53 | +---+-----+ 54 | 55 | +---+-----+ 56 | | id|value| 57 | +---+-----+ 58 | | A| 4| 59 | | A| 8| 60 | | B| 2| 61 | | B| 3| 62 | | Z| 7| 63 | +---+-----+ 64 | 65 | +---+-----+ 66 | | id|value| 67 | +---+-----+ 68 | | A| 8| 69 | | A| 4| 70 | | B| 3| 71 | | B| 2| 72 | | Z| 7| 73 | +---+-----+ 74 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_order_by.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_order_by.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_order_by.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_with_explicit_schema.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_with_explicit_schema.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap07/emps_no_header.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_with_explicit_schema.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/dataframe_crosstab.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_crosstab.py 2 | 3 | +---+-----+ 4 | |key|value| 5 | +---+-----+ 6 | | 1| 1| 7 | | 1| 2| 8 | | 2| 1| 9 | | 2| 1| 10 | | 2| 3| 11 | | 3| 2| 12 | | 3| 3| 13 | | 4| 4| 14 | +---+-----+ 15 | 16 | +---------+---+---+---+---+ 17 | |key_value| 1| 2| 3| 4| 18 | +---------+---+---+---+---+ 19 | | 2| 2| 0| 1| 0| 20 | | 4| 0| 0| 0| 1| 21 | | 1| 1| 1| 0| 0| 22 | | 3| 0| 1| 1| 0| 23 | +---------+---+---+---+---+ 24 | -------------------------------------------------------------------------------- /code/chap07/dataframe_crosstab.sh: 
-------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_crosstab.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_crosstab.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_drop_column.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_drop_column.py 2 | 3 | data = 4 | [ 5 | (100, 'a', 1.0), 6 | (200, 'b', 2.0), 7 | (300, 'c', 3.0), 8 | (400, 'd', 4.0) 9 | ] 10 | 11 | columns = ('id', 'code', 'scale') 12 | 13 | +---+----+-----+ 14 | | id|code|scale| 15 | +---+----+-----+ 16 | |100| a| 1.0| 17 | |200| b| 2.0| 18 | |300| c| 3.0| 19 | |400| d| 4.0| 20 | +---+----+-----+ 21 | 22 | +---+----+ 23 | | id|code| 24 | +---+----+ 25 | |100| a| 26 | |200| b| 27 | |300| c| 28 | |400| d| 29 | +---+----+ 30 | 31 | +---+-----+ 32 | | id|scale| 33 | +---+-----+ 34 | |100| 1.0| 35 | |200| 2.0| 36 | |300| 3.0| 37 | |400| 4.0| 38 | +---+-----+ -------------------------------------------------------------------------------- /code/chap07/dataframe_drop_column.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_drop_column.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_drop_column.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_drop_duplicates.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_drop_duplicates.py 2 | 3 | data = 4 | [ 5 | (100, 'a', 1.0), 6 | (100, 'a', 1.0), 7 | (200, 'b', 2.0), 8 | (300, 'c', 3.0), 9 | (300, 'c', 3.0), 10 | (400, 'd', 4.0) 11 | ] 12 | 13 | columns = ('id', 'code', 'scale') 14 | 15 | +---+----+-----+ 16 | | id|code|scale| 17 | +---+----+-----+ 18 | |100| a| 1.0| 19 | |100| a| 1.0| 20 | |200| b| 2.0| 21 | |300| c| 3.0| 22 | |300| c| 3.0| 23 | |400| d| 4.0| 24 | +---+----+-----+ 25 | 26 | +---+----+-----+ 27 | | id|code|scale| 28 | +---+----+-----+ 29 | |200| b| 2.0| 30 | |300| c| 3.0| 31 | |400| d| 4.0| 32 | |100| a| 1.0| 33 | +---+----+-----+ 34 | -------------------------------------------------------------------------------- /code/chap07/dataframe_drop_duplicates.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_drop_duplicates.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_drop_duplicates.py" 8 | # 9 | # run the PySpark program: 
10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_multi_dim_agg_groupby.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_multi_dim_agg_groupby.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_multi_dim_agg_groupby.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_multi_dim_agg_rollup.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_multi_dim_agg_rollup.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_multi_dim_agg_rollup.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_tutorial_with_worldcup.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_tutorial_with_worldcup.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # 8 | export INPUT_PATH="/pyspark_book/code/chap07/WorldCupPlayers.csv" 9 | # source of input data: 10 | # https://www.kaggle.com/abecklas/fifa-world-cup/downloads/WorldCupPlayers.csv/5 11 | # 12 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_tutorial_with_worldcup.py" 13 | # 14 | # run the PySpark program: 15 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH 16 | -------------------------------------------------------------------------------- /code/chap07/dataframe_with_statistical_data.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_with_statistical_data.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap07/life_expentancy.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_with_statistical_data.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/emps_no_header.txt: -------------------------------------------------------------------------------- 1 | 1001,alex,67000,SALES 2 | 1002,bob,24000,SALES 3 | 1003,boby,24000,SALES 4 | 1004,jane,69000,SOFTWARE 5 | 1005,betty,55000,SOFTWARE 6 | 1006,jeff,59000,SOFTWARE 
7 | 1007,dara,72000,SOFTWARE 8 | -------------------------------------------------------------------------------- /code/chap07/emps_with_header.txt: -------------------------------------------------------------------------------- 1 | id,name,salary,dept 2 | 1001,alex,67000,SALES 3 | 1002,bob,24000,SALES 4 | 1003,boby,24000,SALES 5 | 1004,jane,69000,SOFTWARE 6 | 1005,betty,55000,SOFTWARE 7 | 1006,jeff,59000,SOFTWARE 8 | 1007,dara,72000,SOFTWARE 9 | -------------------------------------------------------------------------------- /code/chap07/partition_data_by_customer_and_year.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run the following program: 3 | # partition_data_by_customer_and_year.py 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export INPUT_PATH="/pyspark_book/code/chap07/customers.txt" 9 | export OUTPUT_PATH="/tmp/partition_demo" 10 | export SPARK_PROG="/pyspark_book/code/chap07/partition_data_by_customer_and_year.py" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH $OUTPUT_PATH 14 | -------------------------------------------------------------------------------- /code/chap07/strings-2.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap07/strings-2.parquet -------------------------------------------------------------------------------- /code/chap07/users.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap07/users.parquet -------------------------------------------------------------------------------- /code/chap07/users4.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap07/users4.parquet -------------------------------------------------------------------------------- /code/chap08/cats.no.header.csv: -------------------------------------------------------------------------------- 1 | cuttie,2,female,6 2 | mono,3,male,9 3 | fuzzy,1,female,4 4 | -------------------------------------------------------------------------------- /code/chap08/cats.with.header.csv: -------------------------------------------------------------------------------- 1 | name,age,gender,weight 2 | cuttie,2,female,6 3 | mono,3,male,9 4 | fuzzy,1,female,4 5 | -------------------------------------------------------------------------------- /code/chap08/datasource_csv_reader_no_header.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit datasource_csv_reader_no_header.py sample_no_header.csv 2 | 3 | spark= 4 | 5 | input path : sample_no_header.csv 6 | 7 | file_contents = 8 | Alex,Sunnyvale,30 9 | Mary,Cupertino,28 10 | Jane,Stanford,44 11 | Bob,Ames,33 12 | 13 | df = 14 | [ 15 | Row(_c0=u'Alex', _c1=u'Sunnyvale', _c2=30), 16 | Row(_c0=u'Mary', _c1=u'Cupertino', _c2=28), 17 | Row(_c0=u'Jane', _c1=u'Stanford', _c2=44), 18 | Row(_c0=u'Bob', _c1=u'Ames', _c2=33) 19 | ] 20 | 
21 | +----+---------+---+ 22 | | _c0| _c1|_c2| 23 | +----+---------+---+ 24 | |Alex|Sunnyvale| 30| 25 | |Mary|Cupertino| 28| 26 | |Jane| Stanford| 44| 27 | | Bob| Ames| 33| 28 | +----+---------+---+ 29 | 30 | root 31 | |-- _c0: string (nullable = true) 32 | |-- _c1: string (nullable = true) 33 | |-- _c2: integer (nullable = true) 34 | 35 | +----+---------+---+ 36 | |name| city|age| 37 | +----+---------+---+ 38 | |Alex|Sunnyvale| 30| 39 | |Mary|Cupertino| 28| 40 | |Jane| Stanford| 44| 41 | | Bob| Ames| 33| 42 | +----+---------+---+ 43 | 44 | root 45 | |-- name: string (nullable = true) 46 | |-- city: string (nullable = true) 47 | |-- age: integer (nullable = true) 48 | -------------------------------------------------------------------------------- /code/chap08/datasource_csv_reader_no_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_csv_reader_no_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap08/sample_no_header.csv" 8 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_csv_reader_no_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap08/datasource_csv_reader_with_header.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit datasource_csv_reader_with_header.py sample_with_header.csv 2 | 3 | spark= 4 | 5 | input path : sample_with_header.csv 6 | 7 | file_contents = 8 | name,city,age 9 | Alex,Sunnyvale,30 10 | Mary,Cupertino,28 11 | Jane,Stanford,44 12 | Bob,Ames,33 13 | 14 | df.count() = 4 15 | df.collect() = 16 | [ 17 | Row(name=u'Alex', city=u'Sunnyvale', age=30), 18 | Row(name=u'Mary', city=u'Cupertino', age=28), 19 | Row(name=u'Jane', city=u'Stanford', age=44), 20 | Row(name=u'Bob', city=u'Ames', age=33) 21 | ] 22 | 23 | +----+---------+---+ 24 | |name| city|age| 25 | +----+---------+---+ 26 | |Alex|Sunnyvale| 30| 27 | |Mary|Cupertino| 28| 28 | |Jane| Stanford| 44| 29 | | Bob| Ames| 33| 30 | +----+---------+---+ 31 | 32 | root 33 | |-- name: string (nullable = true) 34 | |-- city: string (nullable = true) 35 | |-- age: integer (nullable = true) 36 | -------------------------------------------------------------------------------- /code/chap08/datasource_csv_reader_with_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_csv_reader_with_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap08/sample_with_header.csv" 8 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_csv_reader_with_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap08/datasource_csv_writer.sh: 
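The two logs above show datasource_csv_reader_no_header.py (columns arrive as _c0, _c1, _c2 and are renamed afterwards) and datasource_csv_reader_with_header.py (column names taken from the header row), and the shell script below drives datasource_csv_writer.py, which writes a DataFrame out to /tmp/output.csv. A minimal PySpark sketch of these read and write paths, assuming the column names and paths shown in the logs and script (the program structure itself is an assumption, not the book's exact code):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("csv_read_write_sketch").getOrCreate()

# no header: Spark assigns _c0, _c1, _c2; rename the columns afterwards
df_no_header = spark.read.csv("sample_no_header.csv", header=False, inferSchema=True)
df_named = df_no_header.toDF("name", "city", "age")
df_named.show()
df_named.printSchema()

# with header: column names come straight from the first row
df_with_header = spark.read.csv("sample_with_header.csv", header=True, inferSchema=True)
df_with_header.show()

# write a DataFrame back out as CSV (one part file per partition under the target directory)
df_with_header.write.mode("overwrite").csv("/tmp/output.csv", header=True)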
-------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run datasource_csv_writer.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_csv_writer.py" 8 | export OUTPUT_CSV_FILE_PATH="/tmp/output.csv" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG ${OUTPUT_CSV_FILE_PATH} 12 | -------------------------------------------------------------------------------- /code/chap08/datasource_elasticsearch_reader.log: -------------------------------------------------------------------------------- 1 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 2 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_elasticsearch_reader.py" 3 | export ELASTIC_SEARCH_HOST="localhost" 4 | export JAR="/pyspark_book/jars/elasticsearch-hadoop-6.4.2.jar" 5 | # 6 | # run the PySpark program: 7 | $SPARK_HOME/bin/spark-submit --jars "${JAR}" $SPARK_PROG ${ELASTIC_SEARCH_HOST} 8 | 9 | es_hostname : localhost 10 | 11 | spark= 12 | 13 | rs_rdd : MapPartitionsRDD[2] at mapPartitions at SerDeUtil.scala:244 14 | 15 | es_rdd.count() : 4 16 | 17 | es_rdd.collect(): 18 | [ 19 | ('100', {'key1': 'some_value1', 'doc_id': 100}), 20 | ('200', {'key2': 'some_value2', 'doc_id': 200}), 21 | ('300', {'key3': 'some_value3', 'doc_id': 300}), 22 | ('400', {'key4': 'some_value4', 'doc_id': 400}) 23 | ] -------------------------------------------------------------------------------- /code/chap08/datasource_elasticsearch_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run datasource_elasticsearch_reader.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_elasticsearch_reader.py" 8 | export ELASTIC_SEARCH_HOST="localhost" 9 | export JAR="/pyspark_book/jars/elasticsearch-hadoop-6.4.2.jar" 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit --jars "${JAR}" $SPARK_PROG ${ELASTIC_SEARCH_HOST} 13 | -------------------------------------------------------------------------------- /code/chap08/datasource_elasticsearch_writer.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run datasource_elasticsearch_writer.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_elasticsearch_writer.py" 8 | export ELASTIC_SEARCH_HOST="localhost" 9 | export JAR="/pyspark_book/jars/elasticsearch-hadoop-6.4.2.jar" 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit --jars "${JAR}" $SPARK_PROG ${ELASTIC_SEARCH_HOST} 13 | -------------------------------------------------------------------------------- /code/chap08/datasource_gzip_reader.log: 
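The log that follows shows datasource_gzip_reader.py reading two gzipped text files in one pass. Spark's textFile() decompresses .gz input transparently and accepts a comma-separated list of paths, so a minimal sketch of the program might look like this (variable names mirror the log; everything else is an assumption):

import sys
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("gzip_reader_sketch").getOrCreate()

# e.g. "z11.file.txt.gz,z22.file.txt.gz" -- a comma-separated list of input files
gz_input_path = sys.argv[1]

# .gz files are decompressed on the fly; note that gzip is not a splittable format,
# so each .gz file is read by a single task
gzip_rdd = spark.sparkContext.textFile(gz_input_path)
print("gzip_rdd.count() = %d" % gzip_rdd.count())
print(gzip_rdd.collect())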
-------------------------------------------------------------------------------- 1 | $ cat z1.file.txt 2 | z1: record1 3 | z1: record2 4 | z1: record3 5 | 6 | $ cat z2.file.txt 7 | z2: record1 8 | z2: record2 9 | z2: record3 10 | z2: record4 11 | 12 | # gzip the files: z11.file.txt and z22.file.txt 13 | $ cp z1.file.txt z11.file.txt 14 | $ cp z2.file.txt z22.file.txt 15 | $ gzip z11.file.txt 16 | $ gzip z22.file.txt 17 | 18 | $ ls -l z*gz 19 | -rw-r--r-- 1 52 z11.file.txt.gz 20 | -rw-r--r-- 1 55 z22.file.txt.gz 21 | 22 | $ export INPUT_PATH="z11.file.txt.gz,z22.file.txt.gz" 23 | 24 | $ ./bin/spark-submit datasource_gzip_reader.py $INPUT_PATH 25 | 26 | spark= 27 | 28 | gz_input_path : z11.file.txt.gz,z22.file.txt.gz 29 | 30 | gzip_rdd = z11.file.txt.gz,z22.file.txt.gz MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 31 | gzip_rdd.count() = 7 32 | gzip_rdd.collect() = 33 | [ 34 | u'z1: record1', 35 | u'z1: record2', 36 | u'z1: record3', 37 | u'z2: record1', 38 | u'z2: record2', 39 | u'z2: record3', 40 | u'z2: record4' 41 | ] -------------------------------------------------------------------------------- /code/chap08/datasource_gzip_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_gzip_reader.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | FILE1=z11.file.txt.gz 8 | FILE2=z22.file.txt.gz 9 | export INPUT_FILE="/pyspark_book/code/chap08/${FILE1},/pyspark_book/code/chap08/${FILE2}" 10 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_gzip_reader.py" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 14 | -------------------------------------------------------------------------------- /code/chap08/datasource_jdbc_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_jdbc_reader.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_jdbc_reader.py" 8 | # 9 | # define the required MySQL database connection parameters 10 | export JDBC_URL="jdbc:mysql://localhost/metadb" 11 | export JDBC_DRIVER="com.mysql.jdbc.Driver" 12 | export JDBC_USER="root" 13 | export JDBC_PASSWORD="mp22_pass" 14 | export JDBC_SOURCE_TABLE_NAME="dept" 15 | # 16 | # define the required JAR file for MySQL database access 17 | export JAR="/pyspark_book/code/jars/mysql-connector-java-5.1.42.jar" 18 | # 19 | # run the PySpark program: 20 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${JDBC_URL} ${JDBC_DRIVER} ${JDBC_USER} ${JDBC_PASSWORD} ${JDBC_SOURCE_TABLE_NAME} 21 | -------------------------------------------------------------------------------- /code/chap08/datasource_jdbc_writer.sh: -------------------------------------------------------------------------------- 1 | #-------------------------------------------------------- 2 | # This is a shell script to run datasource_jdbc_writer.py 3 | #-------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 |
#-------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_jdbc_writer.py" 8 | # 9 | # define the required MySQL database connection parameters 10 | export JDBC_URL="jdbc:mysql://localhost/metadb" 11 | export JDBC_DRIVER="com.mysql.jdbc.Driver" 12 | export JDBC_USER="root" 13 | export JDBC_PASSWORD="mp22_pass" 14 | export JDBC_TARGET_TABLE_NAME="people" 15 | # 16 | # define the required JAR file for MySQL database access 17 | export JAR="/pyspark_book/code/jars/mysql-connector-java-5.1.42.jar" 18 | # 19 | # run the PySpark program: 20 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${JDBC_URL} ${JDBC_DRIVER} ${JDBC_USER} ${JDBC_PASSWORD} ${JDBC_TARGET_TABLE_NAME} 21 | -------------------------------------------------------------------------------- /code/chap08/datasource_json_reader_multi_line.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_json_reader_multi_line.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap08/sample_multi_line.json" 8 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_json_reader_multi_line.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap08/datasource_json_reader_single_line.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_json_reader_single_line.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap08/sample_single_line.json" 8 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_json_reader_single_line.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap08/datasource_mongodb_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_mongodb_reader.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_mongodb_reader.py" 8 | export MONGODB_COLLECTION_URI="mongodb://127.0.0.1/test.coll44" 9 | export JAR1="/pyspark_book/code/jars/mongo-java-driver-3.8.2.jar" 10 | export JAR2="/pyspark_book/code/jars/mongo-spark-connector_2.11-2.2.5.jar" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --jars "${JAR1},${JAR2}" $SPARK_PROG ${MONGODB_COLLECTION_URI} 14 | -------------------------------------------------------------------------------- /code/chap08/datasource_mongodb_writer.log: 
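The log that follows shows datasource_mongodb_writer.py building a small people DataFrame and saving it to the MongoDB collection given by the URI argument. A minimal sketch, assuming the mongo-spark-connector DataFrame API shipped with the JARs listed in the scripts (the rows come from the log; the program structure and option usage are assumptions):

import sys
from pyspark.sql import SparkSession

mongodb_collection_uri = sys.argv[1]   # e.g. mongodb://127.0.0.1/test.coll66

spark = SparkSession.builder \
    .appName("mongodb_writer_sketch") \
    .config("spark.mongodb.output.uri", mongodb_collection_uri) \
    .getOrCreate()

people = spark.createDataFrame(
    [("Alex", "Ames", 50), ("Gandalf", "Cupertino", 60), ("Thorin", "Sunnyvale", 95),
     ("Betty", "Ames", 78), ("Brian", "Stanford", 77)],
    ["name", "city", "age"])
people.show(truncate=False)

# write the DataFrame to MongoDB through the mongo-spark-connector data source
people.write.format("com.mongodb.spark.sql.DefaultSource") \
    .mode("append") \
    .option("uri", mongodb_collection_uri) \
    .save()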
-------------------------------------------------------------------------------- 1 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 2 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_mongodb_writer.py" 3 | export MONGODB_COLLECTION_URI="mongodb://127.0.0.1/test.coll66" 4 | export JAR1="/pyspark_book/code/jars/mongo-java-driver-3.8.2.jar" 5 | export JAR2="/pyspark_book/code/jars/mongo-spark-connector_2.11-2.2.5.jar" 6 | # 7 | # run the PySpark program: 8 | $SPARK_HOME/bin/spark-submit --jars "${JAR1},${JAR2}" $SPARK_PROG ${MONGODB_COLLECTION_URI} 9 | 10 | mongodb_collection_uri : mongodb://127.0.0.1/test.coll66 11 | 12 | spark= 13 | 14 | +-------+---------+---+ 15 | |name |city |age| 16 | +-------+---------+---+ 17 | |Alex |Ames |50 | 18 | |Gandalf|Cupertino|60 | 19 | |Thorin |Sunnyvale|95 | 20 | |Betty |Ames |78 | 21 | |Brian |Stanford |77 | 22 | +-------+---------+---+ 23 | 24 | people.count() = 5 25 | people.collect() = 26 | [ 27 | Row(name=u'Alex', city=u'Ames', age=50), 28 | Row(name=u'Gandalf', city=u'Cupertino', age=60), 29 | Row(name=u'Thorin', city=u'Sunnyvale', age=95), 30 | Row(name=u'Betty', city=u'Ames', age=78), 31 | Row(name=u'Brian', city=u'Stanford', age=77) 32 | ] 33 | 34 | root 35 | |-- name: string (nullable = true) 36 | |-- city: string (nullable = true) 37 | |-- age: long (nullable = true) -------------------------------------------------------------------------------- /code/chap08/datasource_mongodb_writer.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run datasource_mongodb_reader.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_mongodb_writer.py" 8 | export MONGODB_COLLECTION_URI="mongodb://127.0.0.1/test.coll66" 9 | export JAR1="/pyspark_book/code/jars/mongo-java-driver-3.8.2.jar" 10 | export JAR2="/pyspark_book/code/jars/mongo-spark-connector_2.11-2.2.5.jar" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --jars "${JAR1},${JAR2}" $SPARK_PROG ${MONGODB_COLLECTION_URI} 14 | -------------------------------------------------------------------------------- /code/chap08/datasource_redis_reader.log: -------------------------------------------------------------------------------- 1 | JAR="/pyspark_book/code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar" 2 | ./bin/spark-submit --jars ${JAR} datasource_redis_reader.py localhost 6379 3 | 4 | REDIS_HOST = localhost 5 | 6 | REDIS_PORT = 6379 7 | 8 | spark= 9 | 10 | loaded_df = 11 | DataFrame[name: string, city: string, age: bigint] 12 | loaded_df.count(): 5 13 | loaded_df.collect(): 14 | [ 15 | Row(name=u'Brian', city=u'Stanford', age=77), 16 | Row(name=u'Alex', city=u'Ames', age=50), 17 | Row(name=u'Gandalf', city=u'Cupertino', age=60), 18 | Row(name=u'Thorin', city=u'Sunnyvale', age=95), 19 | Row(name=u'Betty', city=u'Ames', age=78) 20 | ] 21 | 22 | +-------+---------+---+ 23 | | name| city|age| 24 | +-------+---------+---+ 25 | | Brian| Stanford| 77| 26 | | Alex| Ames| 50| 27 | |Gandalf|Cupertino| 60| 28 | | Thorin|Sunnyvale| 95| 29 | | Betty| Ames| 78| 30 | +-------+---------+---+ 31 | 32 | root 33 | |-- name: string (nullable = true) 34 | |-- city: string (nullable = true) 35 | |-- age: long (nullable = true) 36 | 
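The datasource_redis_reader.log above shows a DataFrame of (name, city, age) rows being loaded back from Redis through the spark-redis connector JAR. A minimal sketch, assuming spark-redis's DataFrame source name org.apache.spark.sql.redis and a Redis table name of "people" (both assumptions; only the host/port arguments and the printed output come from the log):

import sys
from pyspark.sql import SparkSession

redis_host = sys.argv[1]   # e.g. localhost
redis_port = sys.argv[2]   # e.g. 6379

spark = SparkSession.builder \
    .appName("redis_reader_sketch") \
    .config("spark.redis.host", redis_host) \
    .config("spark.redis.port", redis_port) \
    .getOrCreate()

# load rows previously written under the (assumed) Redis table name "people"
loaded_df = spark.read \
    .format("org.apache.spark.sql.redis") \
    .option("table", "people") \
    .load()

print("loaded_df.count() = %d" % loaded_df.count())
loaded_df.show()
loaded_df.printSchema()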
-------------------------------------------------------------------------------- /code/chap08/datasource_redis_reader.sh: -------------------------------------------------------------------------------- 1 | #-------------------------------------------------------- 2 | # This is a shell script to run datasource_redis_reader.py 3 | #-------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #-------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_redis_reader.py" 8 | # 9 | # define the required redis database connection parameters 10 | export REDIS_HOST="localhost" 11 | export REDIS_PORT="6379" 12 | # you may add password 13 | #export REDIS_PASSWORD="" 14 | # 15 | # define the required JAR file for redis database access 16 | export JAR="/pyspark_book/code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar" 17 | # 18 | # run the PySpark program: 19 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${REDIS_HOST} ${REDIS_PORT} 20 | -------------------------------------------------------------------------------- /code/chap08/datasource_redis_writer.sh: -------------------------------------------------------------------------------- 1 | #-------------------------------------------------------- 2 | # This is a shell script to run datasource_redis_writer.py 3 | #-------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #-------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_redis_writer.py" 8 | # 9 | # define the required redis database connection parameters 10 | export REDIS_HOST="localhost" 11 | export REDIS_PORT="6379" 12 | # you may add password 13 | #export REDIS_PASSWORD="" 14 | # 15 | # define the required JAR file for redis database access 16 | export JAR="/pyspark_book/code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar" 17 | # 18 | # run the PySpark program: 19 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${REDIS_HOST} ${REDIS_PORT} 20 | -------------------------------------------------------------------------------- /code/chap08/datasource_textfile_reader.log: -------------------------------------------------------------------------------- 1 | $ cat sample_numbers.txt 2 | 123,344,455,6666,2,300 3 | 7777,4444,55 4 | 22,34 5 | 900,901,902,9000,5600,5600,5700,45 6 | 45 7 | 70,71,72 8 | 9 | $ export INPUT_PATH="sample_numbers.txt" 10 | $ ./bin/spark-submit datasource_textfile_reader.py ${INPUT_PATH} 11 | 12 | spark= 13 | 14 | input_path : sample_numbers.txt 15 | 16 | file_contents = 17 | 123,344,455,6666,2,300 18 | 7777,4444,55 19 | 22,34 20 | 900,901,902,9000,5600,5600,5700,45 21 | 45 22 | 70,71,72 23 | 24 | records = sample_numbers.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 25 | records.count() = 6 26 | records.collect() = 27 | [ 28 | u'123,344,455,6666,2,300', 29 | u'7777,4444,55', 30 | u'22,34', 31 | u'900,901,902,9000,5600,5600,5700,45', 32 | u'45', 33 | u'70,71,72' 34 | ] 35 | numbers = PythonRDD[3] at RDD at PythonRDD.scala:48 36 | numbers.count() = 23 37 | numbers.collect() = 38 | [ 39 | 123, 40 | 344, 41 | 455, 42 | 6666, 43 | 2, 44 | 300, 45 | 7777, 46 | 4444, 47 | 55, 48 | 22, 49 | 34, 50 | 900, 51 | 901, 52 | 902, 53 | 9000, 54 | 5600, 55 | 5600, 56 | 5700, 57 | 45, 58 | 45, 59 | 70, 60 | 71, 61 | 72 62 | ] 
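The datasource_textfile_reader.log above reads sample_numbers.txt as plain text (6 records) and then flattens the comma-separated values into 23 individual numbers. A minimal sketch of that flow (variable names follow the log; the rest is an assumption):

import sys
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("textfile_reader_sketch").getOrCreate()
input_path = sys.argv[1]   # e.g. sample_numbers.txt

# one RDD element per line of the text file
records = spark.sparkContext.textFile(input_path)
print("records.count() = %d" % records.count())

# split each line on "," and flatten into a single RDD of integers
numbers = records.flatMap(lambda rec: [int(n) for n in rec.split(",")])
print("numbers.count() = %d" % numbers.count())
print(numbers.collect())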
-------------------------------------------------------------------------------- /code/chap08/datasource_textfile_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_textfile_reader.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap08/sample_numbers.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_textfile_reader.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap08/datasource_textfile_writer.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit datasource_textfile_writer.py /tmp/zoutput 2 | 3 | spark= 4 | 5 | output_path : /tmp/zoutput 6 | 7 | data = 8 | [ 9 | 'data element 1', 10 | 'data element 2', 11 | 'data element 3', 12 | 'data element 4' 13 | ] 14 | records = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 15 | records.count() = 4 16 | records.collect() = 17 | [ 18 | 'data element 1', 19 | 'data element 2', 20 | 'data element 3', 21 | 'data element 4' 22 | ] 23 | loaded_records = /tmp/zoutput MapPartitionsRDD[6] at textFile at NativeMethodAccessorImpl.java:0 24 | loaded_records.count() = 4 25 | loaded_records.collect() = 26 | [ 27 | u'data element 3', 28 | u'data element 2', 29 | u'data element 1', 30 | u'data element 4' 31 | ] 32 | 33 | $ ls -l /tmp/zoutput/ 34 | total 32 35 | -rw-r--r-- 1 mparsian wheel 0 Nov 7 20:50 _SUCCESS 36 | -rw-r--r-- 1 mparsian wheel 0 Nov 7 20:50 part-00000 37 | -rw-r--r-- 1 mparsian wheel 15 Nov 7 20:50 part-00001 38 | -rw-r--r-- 1 mparsian wheel 0 Nov 7 20:50 part-00002 39 | -rw-r--r-- 1 mparsian wheel 15 Nov 7 20:50 part-00003 40 | -rw-r--r-- 1 mparsian wheel 0 Nov 7 20:50 part-00004 41 | -rw-r--r-- 1 mparsian wheel 15 Nov 7 20:50 part-00005 42 | -rw-r--r-- 1 mparsian wheel 0 Nov 7 20:50 part-00006 43 | -rw-r--r-- 1 mparsian wheel 15 Nov 7 20:50 part-00007 44 | 45 | $ cat /tmp/zoutput/part* 46 | data element 1 47 | data element 2 48 | data element 3 49 | data element 4 50 | -------------------------------------------------------------------------------- /code/chap08/datasource_textfile_writer.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_textfile_writer.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export OUTPUT_PATH="/tmp/zoutput" 8 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_textfile_writer.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $OUTPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap08/images/cat1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/images/cat1.jpg 
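The datasource_textfile_writer.log above writes a four-element RDD with saveAsTextFile() and reads it back, which is why the output directory contains several part-xxxxx files (one per partition) and why the reloaded order differs from the original. A minimal sketch of that round trip (the element strings come from the log; everything else is an assumption):

import sys
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("textfile_writer_sketch").getOrCreate()
output_path = sys.argv[1]   # e.g. /tmp/zoutput

data = ["data element 1", "data element 2", "data element 3", "data element 4"]
records = spark.sparkContext.parallelize(data)

# each partition becomes one part-xxxxx file under output_path
records.saveAsTextFile(output_path)

# read the saved records back; ordering across part files is not guaranteed
loaded_records = spark.sparkContext.textFile(output_path)
print("loaded_records.count() = %d" % loaded_records.count())
print(loaded_records.collect())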
-------------------------------------------------------------------------------- /code/chap08/images/cat2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/images/cat2.jpg -------------------------------------------------------------------------------- /code/chap08/images/cat3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/images/cat3.jpg -------------------------------------------------------------------------------- /code/chap08/images/cat4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/images/cat4.jpg -------------------------------------------------------------------------------- /code/chap08/images/duck1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/images/duck1.jpg -------------------------------------------------------------------------------- /code/chap08/images/duck2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/images/duck2.jpg -------------------------------------------------------------------------------- /code/chap08/images/not-image.txt: -------------------------------------------------------------------------------- 1 | not an image 2 | -------------------------------------------------------------------------------- /code/chap08/mongodb_coll44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/mongodb_coll44.png -------------------------------------------------------------------------------- /code/chap08/mongodb_coll66.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/mongodb_coll66.png -------------------------------------------------------------------------------- /code/chap08/name_age_salary.csv: -------------------------------------------------------------------------------- 1 | alex,60,18000 2 | adel,40,45000 3 | adel,50,77000 4 | jane,40,52000 5 | jane,60,81000 6 | alex,50,62000 7 | mary,50,92000 8 | mary,60,63000 9 | mary,40,55000 10 | mary,40,55000 11 | -------------------------------------------------------------------------------- /code/chap08/people.txt: -------------------------------------------------------------------------------- 1 | Alex,30,Tennis 2 | Betty,40,Swimming 3 | Dave,20,Walking 4 | Jeff,77,Baseball 5 | -------------------------------------------------------------------------------- /code/chap08/sample_multi_line.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"name":"alex","id":100,"scores":[8,1,2,3],"dict": {"key": "value11"}}, 3 | {"name":"jane","id":200,"scores":[4,6],"dict": {"key": "value22"}}, 4 | { 5 | 
"name": "bob", 6 | "id": 300, 7 | "scores": [ 8 | 3, 9 | 4, 10 | 6, 11 | 9 12 | ], 13 | "dict": { 14 | "key": "value33", 15 | "key2": "value44" 16 | } 17 | }, 18 | { 19 | "name": "bob", 20 | "id": 400, 21 | "scores": [ 22 | 3, 23 | 5, 24 | 6, 25 | 9 26 | ], 27 | "dict": { 28 | "key": "value55", 29 | "key2": "value66" 30 | } 31 | } 32 | ] 33 | -------------------------------------------------------------------------------- /code/chap08/sample_no_header.csv: -------------------------------------------------------------------------------- 1 | Alex,Sunnyvale,30 2 | Mary,Cupertino,28 3 | Jane,Stanford,44 4 | Bob,Ames,33 5 | -------------------------------------------------------------------------------- /code/chap08/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 123,344,455,6666,2,300 2 | 7777,4444,55 3 | 22,34 4 | 900,901,902,9000,5600,5600,5700,45 5 | 45 6 | 70,71,72 7 | -------------------------------------------------------------------------------- /code/chap08/sample_single_line.json: -------------------------------------------------------------------------------- 1 | {"name":"alex","id":200,"scores":[1,2],"dict": {"key1": "value11", "key2": "value12"}} 2 | {"name":"bob","id":300,"scores":[1,2,4,6],"dict": {"key1": "value16"}} 3 | {"name":"jane","id":400,"scores":[2,4,6],"dict": {"key4": "value41"}} 4 | {"name":"mary","id":500,"scores":[5,9],"dict": {"key4": "value77", "key3": "value88"}} 5 | -------------------------------------------------------------------------------- /code/chap08/sample_with_header.csv: -------------------------------------------------------------------------------- 1 | name,city,age 2 | Alex,Sunnyvale,30 3 | Mary,Cupertino,28 4 | Jane,Stanford,44 5 | Bob,Ames,33 6 | -------------------------------------------------------------------------------- /code/chap08/twitter.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/twitter.avro -------------------------------------------------------------------------------- /code/chap09/logistic_regression_builder.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to build and save an LR model 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export TRAINING_DATA_SPAM="/pyspark_book/code/chap09/training_emails_spam.txt" 8 | export TRAINING_DATA_NOSPAM="/pyspark_book/code/chap09/training_emails_nospam.txt" 9 | export BUILT_MOLDEL_OUTPUT_PATH="/pyspark_book/code/chap09/model" 10 | export SPARK_PROG="/pyspark_book/code/chap09/logistic_regression_builder.py" 11 | # 12 | # Make sure there are no files under output path 13 | rm -fr ${BUILT_MOLDEL_OUTPUT_PATH}/* 14 | # 15 | # run the PySpark program: 16 | $SPARK_HOME/bin/spark-submit ${SPARK_PROG} ${TRAINING_DATA_NOSPAM} ${TRAINING_DATA_SPAM} ${BUILT_MOLDEL_OUTPUT_PATH} 17 | -------------------------------------------------------------------------------- /code/chap09/logistic_regression_predictor.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to load the built LR 3 | # model and to 
predict new emails (new_emails.txt) 4 | # into spam or nospam 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 9 | export NEW_EMAILS="/pyspark_book/code/chap09/new_emails.txt" 10 | export BUILT_MOLDEL_OUTPUT_PATH="/pyspark_book/code/chap09/model" 11 | export SPARK_PROG="/pyspark_book/code/chap09/logistic_regression_predictor.py" 12 | # 13 | # run the PySpark program: 14 | $SPARK_HOME/bin/spark-submit ${SPARK_PROG} ${BUILT_MOLDEL_OUTPUT_PATH} ${NEW_EMAILS} 15 | -------------------------------------------------------------------------------- /code/chap09/model/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /code/chap09/model/data/.part-00000-1d219005-0cb4-4a77-98bf-2f69a69655a1-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap09/model/data/.part-00000-1d219005-0cb4-4a77-98bf-2f69a69655a1-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /code/chap09/model/data/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap09/model/data/_SUCCESS -------------------------------------------------------------------------------- /code/chap09/model/data/part-00000-1d219005-0cb4-4a77-98bf-2f69a69655a1-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap09/model/data/part-00000-1d219005-0cb4-4a77-98bf-2f69a69655a1-c000.snappy.parquet -------------------------------------------------------------------------------- /code/chap09/model/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /code/chap09/model/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap09/model/metadata/.part-00000.crc -------------------------------------------------------------------------------- /code/chap09/model/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap09/model/metadata/_SUCCESS -------------------------------------------------------------------------------- /code/chap09/model/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.mllib.classification.LogisticRegressionModel","version":"1.0","numFeatures":128,"numClasses":2} 2 | -------------------------------------------------------------------------------- /code/chap10/test.data: -------------------------------------------------------------------------------- 1 | 1,1,5.0 2 | 1,2,1.0 3 | 1,3,5.0 4 
| 1,4,1.0 5 | 2,1,5.0 6 | 2,2,1.0 7 | 2,3,5.0 8 | 2,4,1.0 9 | 3,1,1.0 10 | 3,2,5.0 11 | 3,3,1.0 12 | 3,4,5.0 13 | 4,1,1.0 14 | 4,2,5.0 15 | 4,3,1.0 16 | 4,4,5.0 17 | -------------------------------------------------------------------------------- /code/chap11/airports.json: -------------------------------------------------------------------------------- 1 | {"id":"ORD","City":"Chicago","State":"IL","Country":"USA"} 2 | {"id":"LGA","City":"New York","State":"NY","Country":"USA"} 3 | {"id":"BOS","City":"Boston","State":"MA","Country":"USA"} 4 | {"id":"IAH","City":"Houston","State":"TX","Country":"USA"} 5 | {"id":"EWR","City":"Newark","State":"NJ","Country":"USA"} 6 | {"id":"DEN","City":"Denver","State":"CO","Country":"USA"} 7 | {"id":"MIA","City":"Miami","State":"FL","Country":"USA"} 8 | {"id":"SFO","City":"San Francisco","State":"CA","Country":"USA"} 9 | {"id":"ATL","City":"Atlanta","State":"GA","Country":"USA"} 10 | {"id":"DFW","City":"Dallas","State":"TX","Country":"USA"} 11 | {"id":"CLT","City":"Charlotte","State":"NC","Country":"USA"} 12 | {"id":"LAX","City":"Los Angeles","State":"CA","Country":"USA"} 13 | {"id":"SEA","City":"Seattle","State":"WA","Country":"USA"} 14 | -------------------------------------------------------------------------------- /code/chap11/breadth_first_search_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. applying Breadth-first search (BFS) algorithm 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 9 | export SPARK_PROG="/pyspark_book/code/chap11/breadth_first_search_example.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap11/connected_component_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. 
finding connected components 5 | # 6 | # Reference: https://en.wikipedia.org/wiki/Connected_component_(graph_theory) 7 | #----------------------------------------------------- 8 | # @author Mahmoud Parsian 9 | #----------------------------------------------------- 10 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 11 | export SPARK_PROG="/pyspark_book/code/chap11/connected_component_example.py" 12 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 13 | # 14 | # run the PySpark program: 15 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 16 | -------------------------------------------------------------------------------- /code/chap11/graph_builder.log: -------------------------------------------------------------------------------- 1 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 2 | export SPARK_PROG="/pyspark_book/code/chap11/graph_builder.py" 3 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 4 | # 5 | # run the PySpark program: 6 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 7 | 8 | +---+-------+---+ 9 | | id| name|age| 10 | +---+-------+---+ 11 | | a| Alice| 34| 12 | | b| Bob| 36| 13 | | c|Charlie| 30| 14 | +---+-------+---+ 15 | 16 | +---+---+------------+ 17 | |src|dst|relationship| 18 | +---+---+------------+ 19 | | a| b| friend| 20 | | b| c| follow| 21 | | c| b| follow| 22 | +---+---+------------+ 23 | 24 | graph= GraphFrame( 25 | v:[id: string, name: string ... 1 more field], 26 | e:[src: string, dst: string ... 1 more field] 27 | ) 28 | 29 | +---+--------+ 30 | | id|inDegree| 31 | +---+--------+ 32 | | c| 1| 33 | | b| 2| 34 | +---+--------+ 35 | 36 | count_follow= 2 -------------------------------------------------------------------------------- /code/chap11/graph_builder.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building a graph using 3 | # GraphFrames package. 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export SPARK_PROG="/pyspark_book/code/chap11/graph_builder.py" 9 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 13 | -------------------------------------------------------------------------------- /code/chap11/label_propagation_algorithm_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. 
applying Label Propagation Algorithm (LPA) 5 | # 6 | # Reference: https://en.wikipedia.org/wiki/Label_Propagation_Algorithm 7 | #----------------------------------------------------- 8 | # @author Mahmoud Parsian 9 | #----------------------------------------------------- 10 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 11 | export SPARK_PROG="/pyspark_book/code/chap11/label_propagation_algorithm_example.py" 12 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 13 | # 14 | # run the PySpark program: 15 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 16 | -------------------------------------------------------------------------------- /code/chap11/pagerank_data.txt: -------------------------------------------------------------------------------- 1 | 1,2 2 | 1,3 3 | 1,4 4 | 2,1 5 | 3,1 6 | 4,1 7 | 4,5 8 | 1,5 9 | -------------------------------------------------------------------------------- /code/chap11/pagerank_example.log: -------------------------------------------------------------------------------- 1 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 2 | export SPARK_PROG="/pyspark_book/code/chap11/pagerank_example.py" 3 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 4 | # 5 | # run the PySpark program: 6 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 7 | 8 | +---+-------+---+ 9 | | id| name|age| 10 | +---+-------+---+ 11 | | a| Alice| 34| 12 | | b| Bob| 36| 13 | | c|Charlie| 30| 14 | +---+-------+---+ 15 | 16 | +---+---+------------+ 17 | |src|dst|relationship| 18 | +---+---+------------+ 19 | | a| b| friend| 20 | | b| c| follow| 21 | | c| b| follow| 22 | +---+---+------------+ 23 | 24 | graph= GraphFrame( 25 | v:[id: string, name: string ... 1 more field], 26 | e:[src: string, dst: string ... 1 more field] 27 | ) 28 | 29 | +---+------------------+ 30 | | id| pagerank| 31 | +---+------------------+ 32 | | b|1.0905890109440908| 33 | | a| 0.01| 34 | | c|1.8994109890559092| 35 | +---+------------------+ 36 | -------------------------------------------------------------------------------- /code/chap11/pagerank_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building 3 | # 1. a graph using GraphFrames package. 4 | # 2. 
applying PageRank algorithm to the built graph 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 9 | export SPARK_PROG="/pyspark_book/code/chap11/pagerank_example.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap11/sample_graph_edges.txt: -------------------------------------------------------------------------------- 1 | edge_weight,from_id,to_id 2 | 0,5,15 3 | 1,18,8 4 | 2,6,1 5 | 3,0,10 6 | 4,2,4 7 | 5,19,7 8 | 6,9,7 9 | 7,11,9 10 | 8,14,9 11 | 9,16,11 12 | 10,17,8 13 | 1,3,4 14 | 2,12,15 15 | 3,13,2 16 | 4,21,0 17 | 5,22,4 18 | 16,22,8 19 | 17,24,4 20 | 18,28,7 21 | 19,28,13 22 | 20,28,16 23 | 1,29,11 24 | 2,30,16 25 | 3,31,15 26 | 24,32,2 27 | 25,32,30 28 | 6,35,11 29 | 7,35,24 30 | 28,36,16 31 | 29,39,7 32 | 30,39,28 33 | 1,40,7 34 | 2,40,11 35 | 3,41,5 36 | 4,41,16 37 | 5,41,32 38 | 6,42,32 39 | 7,43,36 40 | 8,44,16 41 | 9,46,7 42 | 6,49,3 43 | 1,5,31 44 | 2,30,42 45 | 4,17,22 46 | 4,18,22 47 | 1,50,51 48 | 2,51,52 49 | 3,50,52 50 | 1,71,72 51 | 1,71,73 52 | 1,72,73 53 | -------------------------------------------------------------------------------- /code/chap11/sample_graph_vertices.txt: -------------------------------------------------------------------------------- 1 | vertex_id 2 | 0 3 | 1 4 | 2 5 | 3 6 | 4 7 | 5 8 | 6 9 | 7 10 | 8 11 | 9 12 | 10 13 | 11 14 | 12 15 | 13 16 | 14 17 | 15 18 | 16 19 | 17 20 | 18 21 | 19 22 | 20 23 | 21 24 | 22 25 | 23 26 | 24 27 | 25 28 | 26 29 | 27 30 | 28 31 | 29 32 | 30 33 | 31 34 | 32 35 | 33 36 | 34 37 | 35 38 | 36 39 | 37 40 | 38 41 | 39 42 | 40 43 | 41 44 | 42 45 | 43 46 | 44 47 | 45 48 | 46 49 | 47 50 | 48 51 | 49 52 | 50 53 | 51 54 | 52 55 | 71 56 | 72 57 | 73 58 | -------------------------------------------------------------------------------- /code/chap11/shortest_path_finder.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. finding shortest paths for given landmarks 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 9 | export SPARK_PROG="/pyspark_book/code/chap11/shortest_path_finder.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap11/triangles_counter.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building 3 | # 1. a graph using GraphFrames package. 4 | # 2. 
applying Triangles Counting algorithm to the built graph 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 9 | export SPARK_PROG="/pyspark_book/code/chap11/triangles_counter.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap11/unique_triangles_finder.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building 3 | # 1. a graph using GraphFrames package. 4 | # 2. find unique Triangles from the built graph 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 9 | export SPARK_PROG="/pyspark_book/code/code/chap11/unique_triangles_finder.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 11 | # 12 | export VERTICES_PATH="/pyspark_book/code/chap11/sample_graph_vertices.txt" 13 | export EDGES_PATH="/pyspark_book/code/chap11/sample_graph_edges.txt" 14 | # 15 | # run the PySpark program: 16 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG $VERTICES_PATH $EDGES_PATH 17 | -------------------------------------------------------------------------------- /code/chap12/average_monoid_use_aggregatebykey.log: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/average_monoid_use_aggregatebykey.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | 10 | input_path: sample_input.txt 11 | 12 | records.count(): 12 13 | 14 | records.collect(): 15 | [ 16 | u'a,2', 17 | u'a,3', 18 | u'a,4', 19 | u'a,5', 20 | u'a,7', 21 | u'b,4', 22 | u'b,5', 23 | u'b,6', 24 | u'c,3', 25 | u'c,4', 26 | u'c,5', 27 | u'c,6' 28 | ] 29 | 30 | pairs.count(): 12 31 | 32 | pairs.collect(): 33 | [ 34 | (u'a', 2), 35 | (u'a', 3), 36 | (u'a', 4), 37 | (u'a', 5), 38 | (u'a', 7), 39 | (u'b', 4), 40 | (u'b', 5), 41 | (u'b', 6), 42 | (u'c', 3), 43 | (u'c', 4), 44 | (u'c', 5), 45 | (u'c', 6) 46 | ] 47 | 48 | sum_count.count(): 3 49 | 50 | sum_count.collect(): 51 | [ 52 | (u'a', (21, 5)), 53 | (u'c', (18, 4)), 54 | (u'b', (15, 3)) 55 | ] 56 | 57 | averages.count(): 3 58 | 59 | averages.collect(): 60 | [ 61 | (u'a', 4.2), 62 | (u'c', 4.5), 63 | (u'b', 5.0) 64 | ] 65 | -------------------------------------------------------------------------------- /code/chap12/average_monoid_use_aggregatebykey.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/average_monoid_use_aggregatebykey.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | 
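The average_monoid_use_aggregatebykey log and script above compute a per-key average by first reducing each key to a (sum, count) pair, which is a monoid, and only dividing at the end. A minimal sketch of the aggregateByKey() step (the data and results match the log; names and structure are assumptions):

import sys
from pyspark.sql import SparkSession

def to_pair(rec):
    # "a,2" -> ("a", 2)
    tokens = rec.split(",")
    return (tokens[0], int(tokens[1]))

spark = SparkSession.builder.appName("average_by_aggregatebykey_sketch").getOrCreate()
input_path = sys.argv[1]   # e.g. sample_input.txt

records = spark.sparkContext.textFile(input_path)
pairs = records.map(to_pair)

# zero value (0, 0) = (sum, count); both merge functions are associative and commutative
sum_count = pairs.aggregateByKey(
    (0, 0),
    lambda C, v: (C[0] + v, C[1] + 1),                 # fold one value into a partition's (sum, count)
    lambda C1, C2: (C1[0] + C2[0], C1[1] + C2[1]))     # merge (sum, count) pairs across partitions

averages = sum_count.mapValues(lambda s_c: float(s_c[0]) / s_c[1])
print(averages.collect())   # e.g. [('a', 4.2), ('c', 4.5), ('b', 5.0)]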
-------------------------------------------------------------------------------- /code/chap12/average_monoid_use_combinebykey.log: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/average_monoid_use_combinebykey.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | 10 | input_path: sample_input.txt 11 | 12 | records.count(): 12 13 | 14 | records.collect(): 15 | [ 16 | u'a,2', 17 | u'a,3', 18 | u'a,4', 19 | u'a,5', 20 | u'a,7', 21 | u'b,4', 22 | u'b,5', 23 | u'b,6', 24 | u'c,3', 25 | u'c,4', 26 | u'c,5', 27 | u'c,6' 28 | ] 29 | 30 | pairs.count(): 12 31 | 32 | pairs.collect(): 33 | [ 34 | (u'a', 2), 35 | (u'a', 3), 36 | (u'a', 4), 37 | (u'a', 5), 38 | (u'a', 7), 39 | (u'b', 4), 40 | (u'b', 5), 41 | (u'b', 6), 42 | (u'c', 3), 43 | (u'c', 4), 44 | (u'c', 5), 45 | (u'c', 6) 46 | ] 47 | 48 | sum_count.count(): 3 49 | 50 | sum_count.collect(): 51 | [ 52 | (u'a', (21, 5)), 53 | (u'c', (18, 4)), 54 | (u'b', (15, 3)) 55 | ] 56 | 57 | averages.count(): 3 58 | 59 | averages.collect(): 60 | [ 61 | (u'a', 4.2), 62 | (u'c', 4.5), 63 | (u'b', 5.0) 64 | ] -------------------------------------------------------------------------------- /code/chap12/average_monoid_use_combinebykey.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/average_monoid_use_combinebykey.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap12/average_monoid_use_groupbykey.log: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/average_monoid_use_groupbykey.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | 10 | input_path: sample_input.txt 11 | 12 | records : sample_input.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 13 | 14 | records.count(): 12 15 | 16 | records.collect(): 17 | [ 18 | u'a,2', 19 | u'a,3', 20 | u'a,4', 21 | u'a,5', 22 | u'a,7', 23 | u'b,4', 24 | u'b,5', 25 | u'b,6', 26 | u'c,3', 27 | u'c,4', 28 | u'c,5', 29 | u'c,6' 30 | ] 31 | 32 | grouped_by_key : PythonRDD[9] at RDD at PythonRDD.scala:48 33 | 34 | grouped_by_key.count(): 3 35 | 36 | grouped_by_key.collect(): 37 | [ 38 | (u'a', ), 39 | (u'c', ), 40 | (u'b', ) 41 | ] 42 | 43 | grouped_by_key.mapValues(lambda values : list(values)).collect(): 44 | [ 45 | (u'a', [2, 3, 4, 5, 7]), 46 | (u'c', [3, 4, 5, 6]), 47 | (u'b', [4, 5, 6]) 48 | ] 49 | 50 | averages.count(): 3 51 | averages.collect(): 52 | [ 53 | (u'a', 4.2), 54 | (u'c', 4.5), 55 | (u'b', 5.0) 56 | ] -------------------------------------------------------------------------------- /code/chap12/average_monoid_use_groupbykey.sh: 
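For comparison, the groupByKey() version whose log appears above (and whose shell script follows, including a note on why reduceByKey() or combineByKey() is usually preferable) materializes every value of a key before averaging. A minimal sketch of that approach, under the same assumptions as the previous sketch:

import sys
from pyspark.sql import SparkSession

def to_pair(rec):
    tokens = rec.split(",")
    return (tokens[0], int(tokens[1]))

def average(values):
    vals = list(values)
    return float(sum(vals)) / len(vals)

spark = SparkSession.builder.appName("average_by_groupbykey_sketch").getOrCreate()
input_path = sys.argv[1]   # e.g. sample_input.txt

records = spark.sparkContext.textFile(input_path)
pairs = records.map(to_pair)

# groupByKey() ships ALL values of a key to one reducer; fine for small data, but it
# does no map-side aggregation, unlike reduceByKey()/combineByKey()/aggregateByKey()
grouped_by_key = pairs.groupByKey()
averages = grouped_by_key.mapValues(average)
print(averages.collect())   # e.g. [('a', 4.2), ('c', 4.5), ('b', 5.0)]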
-------------------------------------------------------------------------------- 1 | #========================================== 2 | # NOTE: 3 | # 4 | # In general, avoid using groupByKey(), and 5 | # instead use reduceByKey() or combineByKey(). 6 | # For details see: 7 | # https://databricks.gitbooks.io/databricks-spark-knowledge-base/content/best_practices/prefer_reducebykey_over_groupbykey.html 8 | # 9 | # The groupByKey() solution is provided for educational 10 | # purposes. If you need all of the values of a key for 11 | # some aggregation such as finding the "median" (which you 12 | # need all of the values per key), then the groupByKey() 13 | # may be used. 14 | #========================================== 15 | # 16 | # define PySpark program 17 | export PROG="/pyspark_book/code/chap12/average_monoid_use_groupbykey.py" 18 | # define your input path 19 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 20 | # define your Spark home directory 21 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 22 | # run the program 23 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 24 | -------------------------------------------------------------------------------- /code/chap12/average_monoid_use_reducebykey.log: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/average_monoid_use_reducebykey.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | input_path: /pyspark_book/code/chap12/sample_input.txt 10 | 11 | records.count(): 12 12 | 13 | records.collect(): 14 | [ 15 | u'a,2', 16 | u'a,3', 17 | u'a,4', 18 | u'a,5', 19 | u'a,7', 20 | u'b,4', 21 | u'b,5', 22 | u'b,6', 23 | u'c,3', 24 | u'c,4', 25 | u'c,5', 26 | u'c,6' 27 | ] 28 | 29 | sum_and_freq.count(): 12 30 | 31 | sum_and_freq.collect(): 32 | [ 33 | (u'a', (2, 1)), 34 | (u'a', (3, 1)), 35 | (u'a', (4, 1)), 36 | (u'a', (5, 1)), 37 | (u'a', (7, 1)), 38 | (u'b', (4, 1)), 39 | (u'b', (5, 1)), 40 | (u'b', (6, 1)), 41 | (u'c', (3, 1)), 42 | (u'c', (4, 1)), 43 | (u'c', (5, 1)), 44 | (u'c', (6, 1)) 45 | ] 46 | 47 | sum_count.count(): 3 48 | 49 | sum_count.collect(): 50 | [ 51 | (u'a', (21, 5)), 52 | (u'c', (18, 4)), 53 | (u'b', (15, 3)) 54 | ] 55 | 56 | averages.count(): 3 57 | 58 | averages.collect(): 59 | [ 60 | (u'a', 4.2), 61 | (u'c', 4.5), 62 | (u'b', 5.0) 63 | ] 64 | -------------------------------------------------------------------------------- /code/chap12/average_monoid_use_reducebykey.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/average_monoid_use_reducebykey.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap12/inmapper_combiner_local_aggregation.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/inmapper_combiner_local_aggregation.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_dna_seq.txt" 5 | # 
define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap12/inmapper_combiner_use_basic_mapreduce.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/inmapper_combiner_use_basic_mapreduce.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_dna_seq.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap12/inmapper_combiner_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/inmapper_combiner_use_mappartitions.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_dna_seq.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap12/minmax_force_empty_partitions.log: -------------------------------------------------------------------------------- 1 | #-------------------------------------------------- 2 | # NOTE: 3 | # when you view min_max_count.collect(), 4 | # the triplets (1, -1, 0) denotes the result 5 | # of an empty partition, which is filtered out 6 | #-------------------------------------------------- 7 | ./bin/spark-submit minmax_force_empty_partitions.py sample_numbers.txt 8 | 9 | spark= 10 | 11 | input_path= sample_numbers.txt 12 | 13 | rdd= sample_numbers.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 14 | rdd.count= 11 15 | rdd.collect()= [u'23,24,22,44,66,77,44,44,555,666', u'12,4,555,66,67,68,57,55,56,45,45,45,66,77', u'34,35,36,97300,78,79', u'120,44,444,445,345,345,555', u'11,33,34,35,36,37,47,7777,8888,6666,44,55', u'10,11,44,66,77,78,79,80,90,98,99,100,102,103,104,105', u'6,7,8,9,10', u'8,9,10,12,12', u'7777', u'222,333,444,555,666,111,112,5,113,114', u'5555,4444,24'] 16 | rdd.getNumPartitions()= 17 17 | 18 | min_max_count= PythonRDD[3] at RDD at PythonRDD.scala:48 19 | min_max_count.count= 17 20 | min_max_count.collect()= 21 | [ 22 | (22, 666, 10), 23 | (4, 555, 14), 24 | (1, -1, 0), 25 | (1, -1, 0), 26 | (34, 97300, 6), 27 | (44, 555, 7), 28 | (11, 8888, 12), 29 | (1, -1, 0), 30 | (1, -1, 0), 31 | (10, 105, 16), 32 | (1, -1, 0), 33 | (1, -1, 0), 34 | (6, 12, 10), 35 | (5, 7777, 11), 36 | (1, -1, 0), 37 | (24, 5555, 3), 38 | (1, -1, 0) 39 | ] 40 | 41 | final: (min, max, count)= ( 4 , 97300 , 89 ) 42 | -------------------------------------------------------------------------------- /code/chap12/minmax_force_empty_partitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run minmax_use_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export 
INPUT_PATH="/pyspark_book/code/chap12/sample_numbers.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap12/minmax_force_empty_partitions.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap12/minmax_use_mappartitions.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit minmax_use_mappartitions.py sample_numbers.txt 2 | 3 | spark= 4 | 5 | input_path= sample_numbers.txt 6 | 7 | rdd= sample_numbers.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 8 | rdd.count= 11 9 | rdd.collect()= 10 | [ 11 | u'23,24,22,44,66,77,44,44,555,666', 12 | u'12,4,555,66,67,68,57,55,56,45,45,45,66,77', 13 | u'34,35,36,97300,78,79', 14 | u'120,44,444,445,345,345,555', 15 | u'11,33,34,35,36,37,47,7777,8888,6666,44,55', 16 | u'10,11,44,66,77,78,79,80,90,98,99,100,102,103,104,105', 17 | u'6,7,8,9,10', 18 | u'8,9,10,12,12', 19 | u'7777', 20 | u'222,333,444,555,666,111,112,5,113,114', 21 | u'5555,4444,24' 22 | ] 23 | 24 | rdd.getNumPartitions()= 2 25 | 26 | min_max_count= PythonRDD[3] at RDD at PythonRDD.scala:48 27 | 28 | type(partition_iterator)= 29 | first_record= 23,24,22,44,66,77,44,44,555,666 30 | 31 | type(partition_iterator)= 32 | first_record= 10,11,44,66,77,78,79,80,90,98,99,100,102,103,104,105 33 | 34 | min_max_count.count= 2 35 | 36 | min_max_count.collect()= [(4, 97300, 49), (5, 7777, 40)] 37 | 38 | final: (min, max, count)= ( 4 , 97300 , 89 ) -------------------------------------------------------------------------------- /code/chap12/minmax_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run minmax_use_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_PATH="/pyspark_book/code/chap12/sample_numbers.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap12/minmax_use_mappartitions.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap12/sample_dna_seq.txt: -------------------------------------------------------------------------------- 1 | ATCGGGATCCGGG 2 | ATTCCGGGATTCCCC 3 | ATGGCCCCCGGGATCGGG 4 | CGGTATCCGGGGAAAAA 5 | aaattCCGGAACCGGGGGTTT 6 | CCTTTTATCGGGCAAATTTTCCCGG 7 | attttcccccggaaaAAATTTCCGGG 8 | ACTGACTAGCTAGCTAACTG 9 | GCATCGTAGCTAGCTACGAT 10 | AATTCCCGCATCGATCGTACGTACGTAG 11 | ATCGATCGATCGTACGATCG 12 | -------------------------------------------------------------------------------- /code/chap12/sample_input.txt: -------------------------------------------------------------------------------- 1 | a,2 2 | a,3 3 | a,4 4 | a,5 5 | a,7 6 | b,4 7 | b,5 8 | b,6 9 | c,3 10 | c,4 11 | c,5 12 | c,6 13 | -------------------------------------------------------------------------------- /code/chap12/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 23,24,22,44,66,77,44,44,555,666 2 | 12,4,555,66,67,68,57,55,56,45,45,45,66,77 3 | 34,35,36,97300,78,79 4 | 120,44,444,445,345,345,555 5 | 11,33,34,35,36,37,47,7777,8888,6666,44,55 6 | 10,11,44,66,77,78,79,80,90,98,99,100,102,103,104,105 
7 | 6,7,8,9,10 8 | 8,9,10,12,12 9 | 7777 10 | 222,333,444,555,666,111,112,5,113,114 11 | 5555,4444,24 12 | -------------------------------------------------------------------------------- /code/chap12/top_N_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run top_N_use_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap12/top_N_use_mappartitions.py" 8 | # 9 | # run the PySpark program: 10 | # find Top-3 11 | export N=3 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $N 13 | -------------------------------------------------------------------------------- /code/chap12/top_N_use_takeordered.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit top_N_use_takeordered.py 3 2 | 3 | spark= 4 | 5 | N : 3 6 | 7 | list_of_key_value = 8 | [ 9 | ('a', 1), ('a', 7), ('a', 2), ('a', 3), 10 | ('b', 2), ('b', 4), 11 | ('c', 10), ('c', 50), ('c', 60), ('c', 70), 12 | ('d', 5), ('d', 15), ('d', 25), 13 | ('e', 1), ('e', 2), 14 | ('f', 9), ('f', 2), 15 | ('g', 22) 16 | ] 17 | 18 | rdd= ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 19 | rdd.count= 18 20 | rdd.collect()= 21 | [ 22 | ('a', 1), ('a', 7), ('a', 2), ('a', 3), 23 | ('b', 2), ('b', 4), 24 | ('c', 10), ('c', 50), ('c', 60), ('c', 70), 25 | ('d', 5), ('d', 15), ('d', 25), 26 | ('e', 1), ('e', 2), 27 | ('f', 9), ('f', 2), 28 | ('g', 22) 29 | ] 30 | 31 | combined= PythonRDD[6] at RDD at PythonRDD.scala:48 32 | combined.count= 7 33 | combined.collect()= 34 | [ 35 | ('a', 13), 36 | ('c', 190), 37 | ('b', 6), 38 | ('e', 3), 39 | ('d', 45), 40 | ('g', 22), 41 | ('f', 11) 42 | ] 43 | 44 | topN = [('c', 190), ('d', 45), ('g', 22)] 45 | 46 | bottomN = [('e', 3), ('b', 6), ('f', 11)] -------------------------------------------------------------------------------- /code/chap12/top_N_use_takeordered.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run top_N_use_takeordered.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap12/top_N_use_takeordered.py" 8 | # 9 | # run the PySpark program: 10 | # find Top-3 11 | export N=3 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $N 13 |
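
top_N_use_takeordered.py is not reproduced in this listing. A minimal sketch of the takeOrdered() approach traced by the log above follows; the names list_of_key_value, combined, topN, and bottomN come from the log, and the rest of the scaffolding is assumed:

# minimal sketch (assumed reconstruction): Top-N and Bottom-N with takeOrdered()
import sys
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("top_N_use_takeordered").getOrCreate()
N = int(sys.argv[1])   # e.g. 3

list_of_key_value = [
    ('a', 1), ('a', 7), ('a', 2), ('a', 3),
    ('b', 2), ('b', 4),
    ('c', 10), ('c', 50), ('c', 60), ('c', 70),
    ('d', 5), ('d', 15), ('d', 25),
    ('e', 1), ('e', 2),
    ('f', 9), ('f', 2),
    ('g', 22)
]

rdd = spark.sparkContext.parallelize(list_of_key_value)

# combined: RDD[(key, sum_of_values)]
combined = rdd.reduceByKey(lambda a, b: a + b)

# Top-N by value: order descending by negating the value in the key function
topN = combined.takeOrdered(N, key=lambda kv: -kv[1])
# Bottom-N by value: natural ascending order on the value
bottomN = combined.takeOrdered(N, key=lambda kv: kv[1])

print("topN =", topN)
print("bottomN =", bottomN)
spark.stop()

Since takeOrdered() is an action that returns at most N elements to the driver, it avoids a full global sort of the combined RDD.
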
-------------------------------------------------------------------------------- /code/examples/wordcount/foxdata.txt: -------------------------------------------------------------------------------- 1 | red fox jumped high 2 | fox jumped over high fence 3 | red fox jumped 4 | -------------------------------------------------------------------------------- /code/examples/wordcount/wordcount.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | 5 | # 6 | print ("This is the name of the script: ", sys.argv[0]) 7 | print ("Number of arguments: ", len(sys.argv)) 8 | print ("The arguments are: " , str(sys.argv)) 9 | # 10 | if len(sys.argv) != 3: 11 | print("Usage: wordcount.py <input-path>, <output-path>", file=sys.stderr) 12 | sys.exit(-1) 13 | 14 | # DEFINE your input path 15 | input_path = sys.argv[1] 16 | print("input_path: ", input_path) 17 | 18 | # DEFINE your output path 19 | output_path = sys.argv[2] 20 | print("output_path: ", output_path) 21 | 22 | # CREATE an instance of a SparkSession object 23 | spark = SparkSession\ 24 | .builder\ 25 | .appName("PythonWordCount")\ 26 | .getOrCreate() 27 | 28 | # CREATE a new RDD[String] 29 | lines = spark.sparkContext.textFile(input_path) 30 | print("lines=", lines.collect()) 31 | 32 | # APPLY a SET of TRANSFORMATIONS... 33 | # counts: RDD[(String, Integer)] 34 | counts = lines.flatMap(lambda x: x.split(' ')) \ 35 | .map(lambda x: (x, 1)) \ 36 | .reduceByKey(lambda a,b : a+b) 37 | 38 | # output = [(word1, count1), (word2, count2), ...] 39 | output = counts.collect() 40 | for (word, count) in output: 41 | print("%s: %i" % (word, count)) 42 | 43 | # save output 44 | counts.saveAsTextFile(output_path) 45 | 46 | # DONE! 47 | spark.stop() 48 | -------------------------------------------------------------------------------- /code/examples/wordcount/wordcount.py.usage: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit wordcount.py foxdata.txt /tmp/output 2 | -------------------------------------------------------------------------------- /code/jars/avro-mapred-1.7.7-hadoop1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/avro-mapred-1.7.7-hadoop1.jar -------------------------------------------------------------------------------- /code/jars/avro-mapred-1.7.7-hadoop2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/avro-mapred-1.7.7-hadoop2.jar -------------------------------------------------------------------------------- /code/jars/com-cotdp-hadoop-1.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/com-cotdp-hadoop-1.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /code/jars/elasticsearch-hadoop-6.4.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/elasticsearch-hadoop-6.4.2.jar -------------------------------------------------------------------------------- /code/jars/elasticsearch-spark_2.11-2.4.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/elasticsearch-spark_2.11-2.4.5.jar -------------------------------------------------------------------------------- /code/jars/graphframes-0.6.0-spark2.3-s_2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/graphframes-0.6.0-spark2.3-s_2.11.jar
-------------------------------------------------------------------------------- /code/jars/hbase-spark-connector-1.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/hbase-spark-connector-1.0.0.jar -------------------------------------------------------------------------------- /code/jars/htrace-core-3.1.0-incubating.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/htrace-core-3.1.0-incubating.jar -------------------------------------------------------------------------------- /code/jars/mongo-java-driver-3.8.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/mongo-java-driver-3.8.2.jar -------------------------------------------------------------------------------- /code/jars/mongo-spark-connector_2.11-2.2.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/mongo-spark-connector_2.11-2.2.5.jar -------------------------------------------------------------------------------- /code/jars/mongodb-driver-3.8.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/mongodb-driver-3.8.2.jar -------------------------------------------------------------------------------- /code/jars/mysql-connector-java-5.1.42.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/mysql-connector-java-5.1.42.jar -------------------------------------------------------------------------------- /code/jars/shc-core-1.1.3-2.3-s_2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/shc-core-1.1.3-2.3-s_2.11.jar -------------------------------------------------------------------------------- /code/jars/shc-examples-1.1.3-2.3-s_2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/shc-examples-1.1.3-2.3-s_2.11.jar -------------------------------------------------------------------------------- /code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar -------------------------------------------------------------------------------- /code/jars/spark-redis-2.3.1-SNAPSHOT.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/spark-redis-2.3.1-SNAPSHOT.jar -------------------------------------------------------------------------------- /images/pyspark_algorithms.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/images/pyspark_algorithms.jpg -------------------------------------------------------------------------------- /images/pyspark_algorithms0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/images/pyspark_algorithms0.jpg -------------------------------------------------------------------------------- /images/pyspark_algorithms2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/images/pyspark_algorithms2.jpg -------------------------------------------------------------------------------- /images/pyspark_algorithms3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/images/pyspark_algorithms3.jpg -------------------------------------------------------------------------------- /sample_chapters/Appendix_Questions_and_Answers.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/sample_chapters/Appendix_Questions_and_Answers.epub -------------------------------------------------------------------------------- /sample_chapters/Appendix_Questions_and_Answers.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/sample_chapters/Appendix_Questions_and_Answers.pdf -------------------------------------------------------------------------------- /sample_chapters/README.md: -------------------------------------------------------------------------------- 1 | ## Sample Chapters of PySpark Algorithms Book 2 | -------------------------------------------------------------------------------- /sample_chapters/chap04_Getting_Started_with_PySpark.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/sample_chapters/chap04_Getting_Started_with_PySpark.epub -------------------------------------------------------------------------------- /sample_chapters/chap04_Getting_Started_with_PySpark.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/sample_chapters/chap04_Getting_Started_with_PySpark.pdf -------------------------------------------------------------------------------- /where_to_buy_book/README.md: -------------------------------------------------------------------------------- 1 | # PySpark Algorithms Book from Amazon.com 2 | 3 | * Author: Mahmoud Parsian 4 | * Published Date: 
August 2019 5 | 6 | ## Purchase [PySpark Algorithms Book → PDF Version (.pdf)](https://www.amazon.com/PySpark-Algorithms-Mahmoud-Parsian-ebook/dp/B07WQHTVCJ/) 7 | 8 | ## Purchase [PySpark Algorithms Book → Kindle Version (.kpf)](https://www.amazon.com/dp/B07X4B2218/ref=sr_1_2) 9 | 10 | --------------------------------------------------------------------------------