├── LICENSE.md ├── README.md ├── code ├── README.md ├── chap01 │ ├── README.md │ ├── basic_dataframe_example.log │ ├── basic_dataframe_example.py │ ├── basic_dataframe_example.sh │ ├── compute_stats.log │ ├── compute_stats.py │ ├── compute_stats.sh │ ├── compute_stats_detailed.log │ ├── compute_stats_detailed.py │ ├── compute_stats_detailed.sh │ ├── compute_stats_with_threshold.log │ ├── compute_stats_with_threshold.py │ ├── compute_stats_with_threshold.sh │ ├── compute_stats_with_threshold_and_filter.log │ ├── compute_stats_with_threshold_and_filter.py │ ├── compute_stats_with_threshold_and_filter.sh │ ├── dataframe_creation_from_csv.log │ ├── dataframe_creation_from_csv.py │ ├── dataframe_creation_from_csv.sh │ ├── fox_data.txt │ ├── name_city_age.csv │ ├── rdd_creation_from_csv.log │ ├── rdd_creation_from_csv.py │ ├── rdd_creation_from_csv.sh │ ├── sample.txt │ ├── sample_numbers.txt │ ├── sample_people.json │ ├── sort_numbers.log │ ├── sort_numbers.py │ ├── sort_numbers.sh │ ├── url_frequencies.txt │ ├── word_count.log │ ├── word_count.py │ ├── word_count.sh │ ├── word_count_with_params.py │ ├── word_count_with_params.sh │ ├── word_count_with_threshold.py │ └── word_count_with_threshold.sh ├── chap02 │ ├── README.md │ ├── generate_key_value_pairs.py │ ├── sample_file.txt │ ├── sample_file_extra.txt │ ├── sum_by_groupbykey.log │ ├── sum_by_groupbykey.py │ ├── sum_by_groupbykey.sh │ ├── sum_by_reducebykey.log │ ├── sum_by_reducebykey.py │ ├── sum_by_reducebykey.sh │ ├── word_count_driver.log │ ├── word_count_driver.py │ ├── word_count_driver.sh │ ├── word_count_driver_by_groupbykey.log │ ├── word_count_driver_by_groupbykey.py │ ├── word_count_driver_by_groupbykey.sh │ ├── word_count_driver_shorthand.log │ ├── word_count_driver_shorthand.py │ ├── word_count_driver_shorthand.sh │ ├── word_count_driver_shorthand_by_groupbykey.log │ ├── word_count_driver_shorthand_by_groupbykey.py │ ├── word_count_driver_shorthand_by_groupbykey.sh │ ├── word_count_driver_shorthand_sorted.log │ ├── word_count_driver_shorthand_sorted.py │ ├── word_count_driver_shorthand_sorted.sh │ ├── word_count_driver_with_filter.log │ ├── word_count_driver_with_filter.py │ ├── word_count_driver_with_filter.sh │ ├── word_count_driver_with_filter_and_threshold.log │ ├── word_count_driver_with_filter_and_threshold.py │ ├── word_count_driver_with_filter_and_threshold.sh │ ├── word_count_python.py │ └── word_count_python_shorthand.py ├── chap03 │ ├── datadir │ │ ├── file1 │ │ └── file2 │ ├── dataframe_creation_from_collection.log │ ├── dataframe_creation_from_collection.py │ ├── dataframe_creation_from_collection.sh │ ├── dataframe_creation_from_csv_no_header.log │ ├── dataframe_creation_from_csv_no_header.py │ ├── dataframe_creation_from_csv_no_header.sh │ ├── dataframe_creation_from_csv_with_header.log │ ├── dataframe_creation_from_csv_with_header.py │ ├── dataframe_creation_from_csv_with_header.sh │ ├── dataframe_creation_from_dictionary.log │ ├── dataframe_creation_from_dictionary.py │ ├── dataframe_creation_from_dictionary.sh │ ├── dataframe_creation_from_directory.log │ ├── dataframe_creation_from_directory.py │ ├── dataframe_creation_from_directory.sh │ ├── dataframe_creation_from_rdd.log │ ├── dataframe_creation_from_rdd.py │ ├── dataframe_creation_from_rdd.sh │ ├── kv.txt │ ├── kv_no_header.txt │ ├── kv_with_header.txt │ ├── rdd_creation_from_collection.log │ ├── rdd_creation_from_collection.py │ ├── rdd_creation_from_collection.sh │ ├── rdd_creation_from_dataframe.log │ ├── rdd_creation_from_dataframe.py │ ├── 
rdd_creation_from_dataframe.sh │ ├── rdd_creation_from_dictionary.log │ ├── rdd_creation_from_dictionary.py │ ├── rdd_creation_from_dictionary.sh │ ├── rdd_creation_from_directory.log │ ├── rdd_creation_from_directory.py │ ├── rdd_creation_from_directory.sh │ ├── rdd_creation_from_file.log │ ├── rdd_creation_from_file.py │ ├── rdd_creation_from_file.sh │ ├── sample.txt │ ├── sample_dir │ │ ├── file1.txt │ │ └── file2.txt │ ├── sample_dir2 │ │ ├── file1.txt │ │ ├── file2.txt │ │ ├── file3.csv │ │ └── file4.csv │ ├── word_count.py │ ├── word_count.sh │ ├── word_count_with_params.py │ ├── word_count_with_params.sh │ ├── word_count_with_threshold.py │ └── word_count_with_threshold.sh ├── chap04 │ ├── DNA-FASTA-PERFORMANCE │ │ └── performance_of_FASTA_versions_1_2_3.txt │ ├── DNA-FASTA-V1 │ │ ├── run_dna_base_count_ver_1.py │ │ ├── run_dna_base_count_ver_1.sh │ │ ├── run_dna_base_count_ver_1_1GB.sh │ │ └── run_dna_base_count_ver_1_big.sh │ ├── DNA-FASTA-V2 │ │ ├── dna_base_count_ver_2.py │ │ ├── run_dna_base_count_ver_2.sh │ │ ├── run_dna_base_count_ver_2_1GB.sh │ │ └── run_dna_base_count_ver_2_big.sh │ ├── DNA-FASTA-V3 │ │ ├── dna_base_count_ver_3.py │ │ ├── run_dna_base_count_ver_3.sh │ │ ├── run_dna_base_count_ver_3_1GB.sh │ │ └── run_dna_base_count_ver_3_big.sh │ ├── DNA-FASTQ │ │ ├── dna_base_count_fastq.py │ │ └── run_dna_base_count_fastq.sh │ ├── README.md │ └── data │ │ ├── sample.fasta │ │ └── sp1.fastq ├── chap05 │ ├── average_by_key_use_aggregatebykey.log │ ├── average_by_key_use_aggregatebykey.py │ ├── average_by_key_use_aggregatebykey.sh │ ├── average_by_key_use_combinebykey.log │ ├── average_by_key_use_combinebykey.py │ ├── average_by_key_use_combinebykey.sh │ ├── average_by_key_use_foldbykey.log │ ├── average_by_key_use_foldbykey.py │ ├── average_by_key_use_foldbykey.sh │ ├── average_by_key_use_groupbykey.log │ ├── average_by_key_use_groupbykey.py │ ├── average_by_key_use_groupbykey.sh │ ├── average_by_key_use_reducebykey.log │ ├── average_by_key_use_reducebykey.py │ ├── average_by_key_use_reducebykey.sh │ ├── dataframe_action_describe.log │ ├── dataframe_action_describe.py │ ├── dataframe_action_describe.sh │ ├── dataframe_drop.log │ ├── dataframe_drop.py │ ├── dataframe_drop.sh │ ├── dataframe_filter.log │ ├── dataframe_filter.py │ ├── dataframe_filter.sh │ ├── dataframe_join_cross.log │ ├── dataframe_join_cross.py │ ├── dataframe_join_cross.sh │ ├── dataframe_join_inner.log │ ├── dataframe_join_inner.py │ ├── dataframe_join_inner.sh │ ├── dataframe_join_left.log │ ├── dataframe_join_left.py │ ├── dataframe_join_left.sh │ ├── dataframe_join_right.log │ ├── dataframe_join_right.py │ ├── dataframe_join_right.sh │ ├── dataframe_sql.log │ ├── dataframe_sql.py │ ├── dataframe_sql.sh │ ├── dataframe_withcolumn.log │ ├── dataframe_withcolumn.py │ ├── dataframe_withcolumn.sh │ ├── emps.txt │ ├── rdd_transformation_cartesian.log │ ├── rdd_transformation_cartesian.py │ ├── rdd_transformation_cartesian.sh │ ├── rdd_transformation_combinebykey.log │ ├── rdd_transformation_combinebykey.py │ ├── rdd_transformation_combinebykey.sh │ ├── rdd_transformation_filter.log │ ├── rdd_transformation_filter.py │ ├── rdd_transformation_filter.sh │ ├── rdd_transformation_flatmap.log │ ├── rdd_transformation_flatmap.py │ ├── rdd_transformation_flatmap.sh │ ├── rdd_transformation_groupbykey.log │ ├── rdd_transformation_groupbykey.py │ ├── rdd_transformation_groupbykey.sh │ ├── rdd_transformation_join.log │ ├── rdd_transformation_join.py │ ├── rdd_transformation_join.sh │ ├── rdd_transformation_map.log │ 
├── rdd_transformation_map.py │ ├── rdd_transformation_map.sh │ ├── rdd_transformation_mappartitions.log │ ├── rdd_transformation_mappartitions.py │ ├── rdd_transformation_mappartitions.sh │ ├── rdd_transformation_mappartitions_handle_empty_partitions.log │ ├── rdd_transformation_mappartitions_handle_empty_partitions.py │ ├── rdd_transformation_mappartitions_handle_empty_partitions.sh │ ├── rdd_transformation_reducebykey.log │ ├── rdd_transformation_reducebykey.py │ ├── rdd_transformation_reducebykey.sh │ ├── rdd_transformation_sortby.log │ ├── rdd_transformation_sortby.py │ ├── rdd_transformation_sortby.sh │ ├── rdd_transformation_takeordered.log │ ├── rdd_transformation_takeordered.py │ ├── rdd_transformation_takeordered.sh │ ├── sample_5_records.txt │ └── users.txt ├── chap06 │ ├── README.md │ ├── average_by_key_use_aggregatebykey.py │ ├── average_by_key_use_aggregatebykey.sh │ ├── average_by_key_use_combinebykey.py │ ├── average_by_key_use_combinebykey.sh │ ├── average_by_key_use_groupbykey.py │ ├── average_by_key_use_groupbykey.sh │ ├── average_by_key_use_reducebykey.py │ └── average_by_key_use_reducebykey.sh ├── chap07 │ ├── WorldCupPlayers.csv │ ├── WorldCupPlayers.csv.data.source │ ├── WorldCupPlayers.csv.zip │ ├── customers.RECORD.FORMAT.txt │ ├── customers.txt │ ├── dataframe_creation_add_columns.log │ ├── dataframe_creation_add_columns.py │ ├── dataframe_creation_add_columns.sh │ ├── dataframe_creation_aggregate_multiple_columns.log │ ├── dataframe_creation_aggregate_multiple_columns.py │ ├── dataframe_creation_aggregate_multiple_columns.sh │ ├── dataframe_creation_aggregate_single_column.log │ ├── dataframe_creation_aggregate_single_column.py │ ├── dataframe_creation_aggregate_single_column.sh │ ├── dataframe_creation_call_udf.log │ ├── dataframe_creation_call_udf.py │ ├── dataframe_creation_call_udf.sh │ ├── dataframe_creation_cvs_no_header.log │ ├── dataframe_creation_cvs_no_header.py │ ├── dataframe_creation_cvs_no_header.sh │ ├── dataframe_creation_cvs_with_header.log │ ├── dataframe_creation_cvs_with_header.py │ ├── dataframe_creation_cvs_with_header.sh │ ├── dataframe_creation_from_collections.log │ ├── dataframe_creation_from_collections.py │ ├── dataframe_creation_from_collections.sh │ ├── dataframe_creation_from_pandas.log │ ├── dataframe_creation_from_pandas.py │ ├── dataframe_creation_from_pandas.sh │ ├── dataframe_creation_from_rows.log │ ├── dataframe_creation_from_rows.py │ ├── dataframe_creation_from_rows.sh │ ├── dataframe_creation_order_by.log │ ├── dataframe_creation_order_by.py │ ├── dataframe_creation_order_by.sh │ ├── dataframe_creation_with_explicit_schema.log │ ├── dataframe_creation_with_explicit_schema.py │ ├── dataframe_creation_with_explicit_schema.sh │ ├── dataframe_crosstab.log │ ├── dataframe_crosstab.py │ ├── dataframe_crosstab.sh │ ├── dataframe_drop_column.log │ ├── dataframe_drop_column.py │ ├── dataframe_drop_column.sh │ ├── dataframe_drop_duplicates.log │ ├── dataframe_drop_duplicates.py │ ├── dataframe_drop_duplicates.sh │ ├── dataframe_multi_dim_agg_groupby.log │ ├── dataframe_multi_dim_agg_groupby.py │ ├── dataframe_multi_dim_agg_groupby.sh │ ├── dataframe_multi_dim_agg_rollup.log │ ├── dataframe_multi_dim_agg_rollup.py │ ├── dataframe_multi_dim_agg_rollup.sh │ ├── dataframe_tutorial_with_worldcup.log │ ├── dataframe_tutorial_with_worldcup.py │ ├── dataframe_tutorial_with_worldcup.sh │ ├── dataframe_with_statistical_data.log │ ├── dataframe_with_statistical_data.py │ ├── dataframe_with_statistical_data.sh │ ├── emps_no_header.txt │ ├── 
emps_with_header.txt │ ├── life_expentancy.txt │ ├── partition_data_by_customer_and_year.log │ ├── partition_data_by_customer_and_year.py │ ├── partition_data_by_customer_and_year.sh │ ├── partition_data_by_customer_and_year_single_file.py │ ├── strings-2.parquet │ ├── users.parquet │ └── users4.parquet ├── chap08 │ ├── cats.no.header.csv │ ├── cats.with.header.csv │ ├── datasource_csv_reader_no_header.log │ ├── datasource_csv_reader_no_header.py │ ├── datasource_csv_reader_no_header.sh │ ├── datasource_csv_reader_with_header.log │ ├── datasource_csv_reader_with_header.py │ ├── datasource_csv_reader_with_header.sh │ ├── datasource_csv_writer.log │ ├── datasource_csv_writer.py │ ├── datasource_csv_writer.sh │ ├── datasource_elasticsearch_reader.log │ ├── datasource_elasticsearch_reader.py │ ├── datasource_elasticsearch_reader.sh │ ├── datasource_elasticsearch_writer.log │ ├── datasource_elasticsearch_writer.py │ ├── datasource_elasticsearch_writer.sh │ ├── datasource_gzip_reader.log │ ├── datasource_gzip_reader.py │ ├── datasource_gzip_reader.sh │ ├── datasource_jdbc_reader.log │ ├── datasource_jdbc_reader.py │ ├── datasource_jdbc_reader.sh │ ├── datasource_jdbc_writer.log │ ├── datasource_jdbc_writer.py │ ├── datasource_jdbc_writer.sh │ ├── datasource_json_reader_multi_line.log │ ├── datasource_json_reader_multi_line.py │ ├── datasource_json_reader_multi_line.sh │ ├── datasource_json_reader_single_line.log │ ├── datasource_json_reader_single_line.py │ ├── datasource_json_reader_single_line.sh │ ├── datasource_mongodb_reader.log │ ├── datasource_mongodb_reader.py │ ├── datasource_mongodb_reader.sh │ ├── datasource_mongodb_writer.log │ ├── datasource_mongodb_writer.py │ ├── datasource_mongodb_writer.sh │ ├── datasource_redis_reader.log │ ├── datasource_redis_reader.py │ ├── datasource_redis_reader.sh │ ├── datasource_redis_writer.log │ ├── datasource_redis_writer.py │ ├── datasource_redis_writer.sh │ ├── datasource_textfile_reader.log │ ├── datasource_textfile_reader.py │ ├── datasource_textfile_reader.sh │ ├── datasource_textfile_writer.log │ ├── datasource_textfile_writer.py │ ├── datasource_textfile_writer.sh │ ├── images │ │ ├── cat1.jpg │ │ ├── cat2.jpg │ │ ├── cat3.jpg │ │ ├── cat4.jpg │ │ ├── duck1.jpg │ │ ├── duck2.jpg │ │ └── not-image.txt │ ├── mongodb_coll44.png │ ├── mongodb_coll66.png │ ├── name_age_salary.csv │ ├── people.txt │ ├── sample_multi_line.json │ ├── sample_no_header.csv │ ├── sample_numbers.txt │ ├── sample_single_line.json │ ├── sample_with_header.csv │ └── twitter.avro ├── chap09 │ ├── logistic_regression_builder.log │ ├── logistic_regression_builder.py │ ├── logistic_regression_builder.sh │ ├── logistic_regression_predictor.log │ ├── logistic_regression_predictor.py │ ├── logistic_regression_predictor.sh │ ├── model │ │ ├── data │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000-1d219005-0cb4-4a77-98bf-2f69a69655a1-c000.snappy.parquet.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000-1d219005-0cb4-4a77-98bf-2f69a69655a1-c000.snappy.parquet │ │ └── metadata │ │ │ ├── ._SUCCESS.crc │ │ │ ├── .part-00000.crc │ │ │ ├── _SUCCESS │ │ │ └── part-00000 │ ├── new_emails.txt │ ├── training_emails_nospam.txt │ └── training_emails_spam.txt ├── chap10 │ ├── recommendation_example.py │ └── test.data ├── chap11 │ ├── airports.json │ ├── breadth_first_search_example.log │ ├── breadth_first_search_example.py │ ├── breadth_first_search_example.sh │ ├── connected_component_example.log │ ├── connected_component_example.py │ ├── connected_component_example.sh │ ├── flightdata2018.json │ ├── 
graph_builder.log │ ├── graph_builder.py │ ├── graph_builder.sh │ ├── label_propagation_algorithm_example.log │ ├── label_propagation_algorithm_example.py │ ├── label_propagation_algorithm_example.sh │ ├── pagerank.py │ ├── pagerank_data.txt │ ├── pagerank_example.log │ ├── pagerank_example.py │ ├── pagerank_example.sh │ ├── sample_graph_edges.txt │ ├── sample_graph_vertices.txt │ ├── shortest_path_finder.log │ ├── shortest_path_finder.py │ ├── shortest_path_finder.sh │ ├── triangles_counter.log │ ├── triangles_counter.py │ ├── triangles_counter.sh │ ├── unique_triangles_finder.log │ ├── unique_triangles_finder.py │ └── unique_triangles_finder.sh ├── chap12 │ ├── average_monoid_use_aggregatebykey.log │ ├── average_monoid_use_aggregatebykey.py │ ├── average_monoid_use_aggregatebykey.sh │ ├── average_monoid_use_combinebykey.log │ ├── average_monoid_use_combinebykey.py │ ├── average_monoid_use_combinebykey.sh │ ├── average_monoid_use_groupbykey.log │ ├── average_monoid_use_groupbykey.py │ ├── average_monoid_use_groupbykey.sh │ ├── average_monoid_use_reducebykey.log │ ├── average_monoid_use_reducebykey.py │ ├── average_monoid_use_reducebykey.sh │ ├── inmapper_combiner_local_aggregation.log │ ├── inmapper_combiner_local_aggregation.py │ ├── inmapper_combiner_local_aggregation.sh │ ├── inmapper_combiner_use_basic_mapreduce.log │ ├── inmapper_combiner_use_basic_mapreduce.py │ ├── inmapper_combiner_use_basic_mapreduce.sh │ ├── inmapper_combiner_use_mappartitions.log │ ├── inmapper_combiner_use_mappartitions.py │ ├── inmapper_combiner_use_mappartitions.sh │ ├── minmax_force_empty_partitions.log │ ├── minmax_force_empty_partitions.py │ ├── minmax_force_empty_partitions.sh │ ├── minmax_use_mappartitions.log │ ├── minmax_use_mappartitions.py │ ├── minmax_use_mappartitions.sh │ ├── sample_dna_seq.txt │ ├── sample_input.txt │ ├── sample_numbers.txt │ ├── top_N_use_mappartitions.log │ ├── top_N_use_mappartitions.py │ ├── top_N_use_mappartitions.sh │ ├── top_N_use_takeordered.log │ ├── top_N_use_takeordered.py │ └── top_N_use_takeordered.sh ├── examples │ └── wordcount │ │ ├── foxdata.txt │ │ ├── wordcount.py │ │ └── wordcount.py.usage └── jars │ ├── avro-mapred-1.7.7-hadoop1.jar │ ├── avro-mapred-1.7.7-hadoop2.jar │ ├── com-cotdp-hadoop-1.0-SNAPSHOT.jar │ ├── elasticsearch-hadoop-6.4.2.jar │ ├── elasticsearch-spark_2.11-2.4.5.jar │ ├── graphframes-0.6.0-spark2.3-s_2.11.jar │ ├── hbase-spark-connector-1.0.0.jar │ ├── htrace-core-3.1.0-incubating.jar │ ├── mongo-java-driver-3.8.2.jar │ ├── mongo-spark-connector_2.11-2.2.5.jar │ ├── mongodb-driver-3.8.2.jar │ ├── mysql-connector-java-5.1.42.jar │ ├── shc-core-1.1.3-2.3-s_2.11.jar │ ├── shc-examples-1.1.3-2.3-s_2.11.jar │ ├── spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar │ └── spark-redis-2.3.1-SNAPSHOT.jar ├── images ├── pyspark_algorithms.jpg ├── pyspark_algorithms0.jpg ├── pyspark_algorithms2.jpg └── pyspark_algorithms3.jpg ├── sample_chapters ├── Appendix_Questions_and_Answers.epub ├── Appendix_Questions_and_Answers.pdf ├── README.md ├── chap04_Getting_Started_with_PySpark.epub └── chap04_Getting_Started_with_PySpark.pdf └── where_to_buy_book └── README.md /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright [2019] [Mahmoud Parsian] 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 
5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /code/chap01/basic_dataframe_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run basic_dataframe_example.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap01/sample_people.json" 8 | export SPARK_PROG="/pyspark_book/code/chap01/basic_dataframe_example.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap01/compute_stats.log: -------------------------------------------------------------------------------- 1 | # define python3: find out where python3 is installed? 2 | $ type python3 3 | python3 is /usr/local/bin/python3 4 | $ /usr/local/bin/python3 --version 5 | Python 3.7.1 6 | export PYSPARK_PYTHON=/usr/local/bin/python3 7 | # 8 | export PROG="/pyspark_book/code/chap01/compute_stats.py" 9 | # define your input path 10 | export INPUT="/pyspark_book/code/chap01/url_frequencies.txt" 11 | # define your Spark home directory 12 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 13 | 14 | # run the program 15 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 16 | 17 | inputPath : /pyspark_book/code/chap01/url_frequencies.txt 18 | 19 | results = [ 20 | ('url3', (21.857142857142858, 20, 18.97743020387263)), 21 | ('url1', (6.8, 8, 4.324349662087931)), 22 | ('url2', (6, 6.0, 3.265986323710904)), 23 | ('url4', (1.5, 1.5, 0.7071067811865476)) 24 | ] 25 | -------------------------------------------------------------------------------- /code/chap01/compute_stats.sh: -------------------------------------------------------------------------------- 1 | # NOTE: 2 | # Since statistics functions are defined in 3 | # Python3, I use Python3 instead of Python2 4 | # 5 | # define python3: find out where python3 is installed? 6 | #$ type python3 7 | #python3 is /usr/local/bin/python3 8 | #$ /usr/local/bin/python3 --version 9 | #Python 3.7.1 10 | export PYSPARK_PYTHON=/usr/local/bin/python3 11 | # 12 | # define PySpark program 13 | export PROG="/pyspark_book/code/chap01/compute_stats.py" 14 | # 15 | # define your input path 16 | export INPUT="/pyspark_book/code/chap01/url_frequencies.txt" 17 | # 18 | # define your Spark home directory 19 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 20 | # 21 | # run the program 22 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 23 | -------------------------------------------------------------------------------- /code/chap01/compute_stats_detailed.sh: -------------------------------------------------------------------------------- 1 | # NOTE: 2 | # Since statistics functions are defined in 3 | # Python3, we use Python3 instead of Python2 4 | # 5 | # define python3: find out where python3 is installed? 
6 | #$ type python3 7 | #python3 is /usr/local/bin/python3 8 | #$ /usr/local/bin/python3 --version 9 | #Python 3.7.1 10 | export PYSPARK_PYTHON=/usr/local/bin/python3 11 | # 12 | # define PySpark program 13 | export PROG="/pyspark_book/code/chap01/compute_stats_detailed.py" 14 | # define your input path 15 | export INPUT="/pyspark_book/code/chap01/url_frequencies.txt" 16 | # define your Spark home directory 17 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 18 | # run the program 19 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 20 | -------------------------------------------------------------------------------- /code/chap01/compute_stats_with_threshold.log: -------------------------------------------------------------------------------- 1 | # define python3: find out where python3 is installed? 2 | $ type python3 3 | python3 is /usr/local/bin/python3 4 | $ /usr/local/bin/python3 --version 5 | Python 3.7.1 6 | export PYSPARK_PYTHON=/usr/local/bin/python3 7 | # 8 | # define PySpark program 9 | export PROG="/pyspark_book/code/chap01/compute_stats_with_threshold.py" 10 | # 11 | # define your input path 12 | export INPUT="/pyspark_book/code/chap01/url_frequencies.txt" 13 | # 14 | # define your Spark home directory 15 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 16 | # 17 | # define the length threshold 18 | export THRESHOLD_RECORD_LENGTH=5 19 | 20 | # run the program 21 | $SPARK_HOME/bin/spark-submit $PROG $INPUT $THRESHOLD_RECORD_LENGTH 22 | 23 | inputPath : /pyspark_book/code/chap01/url_frequencies.txt 24 | 25 | THRESHOLD_RECORD_LENGTH : 5 26 | 27 | results = [ 28 | ('url3', (21.857142857142858, 20, 18.97743020387263)), 29 | ('url1', (6.8, 8, 4.324349662087931)), 30 | ('url2', (6, 6.0, 3.265986323710904)), 31 | ('url4', (1.5, 1.5, 0.7071067811865476)) 32 | ] 33 | -------------------------------------------------------------------------------- /code/chap01/compute_stats_with_threshold.sh: -------------------------------------------------------------------------------- 1 | # define python3: find out where python3 is installed? 2 | #$ type python3 3 | #python3 is /usr/local/bin/python3 4 | #$ /usr/local/bin/python3 --version 5 | #Python 3.7.1 6 | export PYSPARK_PYTHON=/usr/local/bin/python3 7 | # 8 | # define PySpark program 9 | export PROG="/pyspark_book/code/chap01/compute_stats_with_threshold.py" 10 | # 11 | # define your input path 12 | export INPUT="/pyspark_book/code/chap01/url_frequencies.txt" 13 | # 14 | # define your Spark home directory 15 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 16 | # 17 | # define the length threshold 18 | export THRESHOLD_RECORD_LENGTH=5 19 | # 20 | # run the program 21 | $SPARK_HOME/bin/spark-submit $PROG $INPUT $THRESHOLD_RECORD_LENGTH 22 | -------------------------------------------------------------------------------- /code/chap01/compute_stats_with_threshold_and_filter.log: -------------------------------------------------------------------------------- 1 | # define python3: find out where python3 is installed? 
2 | $ type python3 3 | python3 is /usr/local/bin/python3 4 | $ /usr/local/bin/python3 --version 5 | Python 3.7.1 6 | export PYSPARK_PYTHON=/usr/local/bin/python3 7 | # 8 | # define PySpark program 9 | export PROG="/pyspark_book/code/chap01/compute_stats_with_threshold_and_filter.py" 10 | # 11 | # define your input path 12 | export INPUT="/pyspark_book/code/chap01/url_frequencies.txt" 13 | # 14 | # define your Spark home directory 15 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 16 | # 17 | # define the length threshold 18 | export THRESHOLD_RECORD_LENGTH=5 19 | 20 | # define the mean threshold 21 | export THRESHOLD_MEAN=2 22 | 23 | # run the program 24 | $SPARK_HOME/bin/spark-submit $PROG $INPUT ${THRESHOLD_RECORD_LENGTH} ${THRESHOLD_MEAN} 25 | 26 | inputPath : /pyspark_book/code/chap01/url_frequencies.txt 27 | 28 | THRESHOLD_RECORD_LENGTH : 5 29 | 30 | THRESHOLD_MEAN : 2.0 31 | 32 | results = 33 | [ 34 | ('url3', (21.857142857142858, 20, 18.97743020387263)), 35 | ('url1', (6.8, 8, 4.324349662087931)), 36 | ('url2', (6, 6.0, 3.265986323710904)), 37 | ('url4', (1.5, 1.5, 0.7071067811865476)) 38 | ] 39 | 40 | final_results = 41 | [ 42 | ('url3', (21.857142857142858, 20, 18.97743020387263)), 43 | ('url1', (6.8, 8, 4.324349662087931)), 44 | ('url2', (6, 6.0, 3.265986323710904)) 45 | ] -------------------------------------------------------------------------------- /code/chap01/compute_stats_with_threshold_and_filter.sh: -------------------------------------------------------------------------------- 1 | # NOTE: 2 | # Since statistics functions are defined in 3 | # Python3, we use Python3 instead of Python2 4 | # 5 | # define python3: find out where python3 is installed? 6 | #$ type python3 7 | #python3 is /usr/local/bin/python3 8 | #$ /usr/local/bin/python3 --version 9 | #Python 3.7.1 10 | export PYSPARK_PYTHON=/usr/local/bin/python3 11 | # 12 | # define PySpark program 13 | export PROG="/pyspark_book/code/chap01/compute_stats_with_threshold_and_filter.py" 14 | # 15 | # define your input path 16 | export INPUT="/pyspark_book/code/chap01/url_frequencies.txt" 17 | # 18 | # define your Spark home directory 19 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 20 | # 21 | # define the length threshold 22 | export THRESHOLD_RECORD_LENGTH=5 23 | # 24 | # define the mean threshold 25 | export THRESHOLD_MEAN=2 26 | # 27 | # run the program 28 | $SPARK_HOME/bin/spark-submit $PROG $INPUT ${THRESHOLD_RECORD_LENGTH} ${THRESHOLD_MEAN} 29 | -------------------------------------------------------------------------------- /code/chap01/dataframe_creation_from_csv.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_csv.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap01/name_city_age.csv" 8 | export SPARK_PROG="/pyspark_book/code/chap01/dataframe_creation_from_csv.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap01/fox_data.txt: -------------------------------------------------------------------------------- 1 | a cute fox jumped and juped high 2 | a red cute fox jumped high and high 3 | a red fox jumped 4 | 
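
The compute_stats*.sh scripts and compute_stats*.log files above drive compute_stats.py and its variants, whose Python sources are not reproduced in this excerpt. A minimal sketch of the core logic is shown below, assuming the "url,frequency" record format of url_frequencies.txt and the per-URL (mean, median, stdev) output seen in the logs; the function names, app name, and argument handling here are illustrative, not the book's exact source.

import sys
import statistics
from pyspark.sql import SparkSession

def create_pair(record):
    # record format: "url,frequency", e.g. "url1,9"
    tokens = record.split(',')
    return (tokens[0], int(tokens[1]))

def compute_url_stats(frequencies):
    # frequencies: all frequency values observed for one URL
    values = list(frequencies)
    return (statistics.mean(values), statistics.median(values), statistics.stdev(values))

if __name__ == '__main__':
    # argv[1]: input path; argv[2] (optional): minimum record length to keep
    input_path = sys.argv[1]
    threshold_record_length = int(sys.argv[2]) if len(sys.argv) > 2 else 0

    spark = SparkSession.builder.appName("compute_stats_sketch").getOrCreate()

    # keep sufficiently long records, group frequencies per URL,
    # then reduce each group to (mean, median, stdev)
    results = spark.sparkContext.textFile(input_path)\
        .filter(lambda record: len(record) >= threshold_record_length)\
        .map(create_pair)\
        .groupByKey()\
        .mapValues(compute_url_stats)

    print("results = ", results.collect())
    spark.stop()

The statistics module is Python 3 only, which is why the shell scripts above export PYSPARK_PYTHON to point at a python3 installation.
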
-------------------------------------------------------------------------------- /code/chap01/name_city_age.csv: -------------------------------------------------------------------------------- 1 | Alex,Ames,40 2 | Betty,Ames,33 3 | Alex,Ames,50 4 | Betty,Stanford,45 5 | Jeff,Sunnyvale,55 6 | Bob,Sunnyvale,60 7 | Terry,Stanford,75 8 | David,Stanford,90 9 | Don,Stanford,80 10 | -------------------------------------------------------------------------------- /code/chap01/rdd_creation_from_csv.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_creation_from_csv.py name_city_age.csv 2 | 3 | spark= 4 | 5 | input path : name_city_age.csv 6 | file_contents = 7 | Alex,Ames,40 8 | Betty,Ames,33 9 | Alex,Ames,50 10 | Betty,Stanford,45 11 | Jeff,Sunnyvale,55 12 | Bob,Sunnyvale,60 13 | Terry,Stanford,75 14 | David,Stanford,90 15 | Don,Stanford,80 16 | 17 | rdd = name_city_age.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 18 | rdd.count = 9 19 | rdd.collect() = 20 | [ 21 | 'Alex,Ames,40', 22 | 'Betty,Ames,33', 23 | 'Alex,Ames,50', 24 | 'Betty,Stanford,45', 25 | 'Jeff,Sunnyvale,55', 26 | 'Bob,Sunnyvale,60', 27 | 'Terry,Stanford,75', 28 | 'David,Stanford,90', 29 | 'Don,Stanford,80' 30 | ] 31 | 32 | pairs = PythonRDD[3] at RDD at PythonRDD.scala:48 33 | pairs.count = 9 34 | pairs.collect() = 35 | [ 36 | ('Ames', (40, 1)), 37 | ('Ames', (33, 1)), 38 | ('Ames', (50, 1)), 39 | ('Stanford', (45, 1)), 40 | ('Sunnyvale', (55, 1)), 41 | ('Sunnyvale', (60, 1)), 42 | ('Stanford', (75, 1)), 43 | ('Stanford', (90, 1)), 44 | ('Stanford', (80, 1)) 45 | ] 46 | 47 | sum_and_count = PythonRDD[9] at RDD at PythonRDD.scala:48 48 | sum_and_count.count = 3 49 | sum_and_count.collect() = 50 | [ 51 | ('Stanford', (290, 4)), 52 | ('Ames', (123, 3)), 53 | ('Sunnyvale', (115, 2)) 54 | ] 55 | 56 | average_per_city = PythonRDD[11] at RDD at PythonRDD.scala:48 57 | average_per_city.count = 3 58 | average_per_city.collect() = 59 | [ 60 | ('Stanford', 72.5), 61 | ('Ames', 41.0), 62 | ('Sunnyvale', 57.5) 63 | ] -------------------------------------------------------------------------------- /code/chap01/rdd_creation_from_csv.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_creation_from_csv.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap01/name_city_age.csv" 8 | export SPARK_PROG="/pyspark_book/code/chap01/rdd_creation_from_csv.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap01/sample.txt: -------------------------------------------------------------------------------- 1 | red fox jumped high 2 | fox jumped over high fence 3 | red fox jumped 4 | -------------------------------------------------------------------------------- /code/chap01/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 23 24 12 11 2 | 2 8 9 30 40 50 33 31 3 | 2 9 33 40 70 51 52 4 | 10 11 12 5 | -------------------------------------------------------------------------------- /code/chap01/sample_people.json: 
-------------------------------------------------------------------------------- 1 | {"name":"Alex", "city":"Cupertino"} 2 | {"name":"Bob", "city":"Sunnyvale"} 3 | {"name":"Betty", "city":"Sunnyvale", "age":30} 4 | {"name":"Max", "city":"Stanford", "age":30} 5 | {"name":"Martina", "city":"Stanford", "age":40} 6 | {"name":"Jane", "city":"Cupertino", "age":19} 7 | -------------------------------------------------------------------------------- /code/chap01/sort_numbers.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit sort_numbers.py sample_numbers.txt 2 | 3 | spark= 4 | 5 | input path : sample_numbers.txt 6 | 7 | file_contents = 8 | 23 24 12 11 9 | 2 8 9 30 40 50 33 31 10 | 2 9 33 40 70 51 52 11 | 10 11 12 12 | 13 | records = sample_numbers.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 14 | records.count = 4 15 | records.collect() = 16 | [ 17 | '23 24 12 11', 18 | '2 8 9 30 40 50 33 31', 19 | '2 9 33 40 70 51 52', 20 | '10 11 12' 21 | ] 22 | 23 | sorted numbers: 24 | 2 25 | 2 26 | 8 27 | 9 28 | 9 29 | 10 30 | 11 31 | 11 32 | 12 33 | 12 34 | 23 35 | 24 36 | 30 37 | 31 38 | 33 39 | 33 40 | 40 41 | 40 42 | 50 43 | 51 44 | 52 45 | 70 -------------------------------------------------------------------------------- /code/chap01/sort_numbers.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run sort_numbers.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap01/sample_numbers.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap01/sort_numbers.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap01/url_frequencies.txt: -------------------------------------------------------------------------------- 1 | url1,1 2 | url1,9 3 | url1,4 4 | ur,5 5 | url1,8 6 | url1,12 7 | url2,2 8 | url2,6 9 | ur,2 10 | url2,10 11 | url2,6 12 | url3,1 13 | url3,10 14 | url3,20 15 | url3,30 16 | url3,40 17 | url3,50 18 | url3,2 19 | url4,1 20 | url4,2 21 | 22 | -------------------------------------------------------------------------------- /code/chap01/word_count.log: -------------------------------------------------------------------------------- 1 | $ ./bin/spark-submit zmp/word_count.py 2 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 3 | 4 | [ 5 | u'red fox jumped high', 6 | u'fox jumped over high fence', 7 | u'red fox jumped' 8 | ] 9 | 10 | 11 | [ 12 | u'red', 13 | u'fox', 14 | u'jumped', 15 | u'high', 16 | u'fox', 17 | u'jumped', 18 | u'over', 19 | u'high', 20 | u'fence', 21 | u'red', 22 | u'fox', 23 | u'jumped' 24 | ] 25 | 26 | 27 | [ 28 | (u'red', 1), 29 | (u'fox', 1), 30 | (u'jumped', 1), 31 | (u'high', 1), 32 | (u'fox', 1), 33 | (u'jumped', 1), 34 | (u'over', 1), 35 | (u'high', 1), 36 | (u'fence', 1), 37 | (u'red', 1), (u'fox', 1), 38 | (u'jumped', 1) 39 | ] 40 | 41 | 42 | [ 43 | (u'high', 2), 44 | (u'over', 1), 45 | (u'fox', 3), 46 | (u'red', 2), 47 | (u'fence', 1), 48 | (u'jumped', 3) 49 | ] 50 | -------------------------------------------------------------------------------- /code/chap01/word_count.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark import SparkConf 3 | from pyspark import SparkContext 4 | 5 | def wordcount(sc, input_path): 6 | 7 | records_rdd = sc.textFile(input_path) 8 | print(records_rdd.collect()) 9 | 10 | words_rdd = records_rdd.flatMap(lambda line: line.split(" ")) 11 | print(words_rdd.collect()) 12 | 13 | pairs_rdd = words_rdd.map(lambda word: (word, 1)) 14 | print(pairs_rdd.collect()) 15 | 16 | frequencies_rdd = pairs_rdd.reduceByKey(lambda a, b: a + b) 17 | print(frequencies_rdd.collect()) 18 | 19 | 20 | if __name__ == '__main__': 21 | 22 | conf = SparkConf() 23 | conf.setAppName("WordCount") 24 | conf.set('spark.executor.memory', '500M') 25 | conf.set('spark.cores.max', 4) 26 | try: 27 | sc = SparkContext(conf=conf) 28 | # hard coded input path, for DEMO only 29 | # never hard code 30 | input_path = "/tmp/sample.txt" 31 | 32 | except: 33 | print ("Failed to connect!") 34 | print(sys.exc_info()[0]) 35 | 36 | # Execute word count 37 | wordcount(sc, input_path) 38 | -------------------------------------------------------------------------------- /code/chap01/word_count.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt" 6 | # 7 | # define your PySpark program 8 | PROG=/pyspark_book/code/chap01/word_count_with_params.py 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap01/word_count_with_params.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt" 6 | # 7 | # define your PySpark program 8 | PROG=/pyspark_book/code/chap01/word_count_with_params.py 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap01/word_count_with_threshold.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark import SparkConf 3 | from pyspark import SparkContext 4 | 5 | def wordcount(sc, input_path, threshold): 6 | 7 | records_rdd = sc.textFile(input_path) 8 | print(records_rdd.collect()) 9 | 10 | words_rdd = records_rdd.flatMap(lambda line: line.split(" ")) 11 | print(words_rdd.collect()) 12 | 13 | pairs_rdd = words_rdd.map(lambda word: (word, 1)) 14 | print(pairs_rdd.collect()) 15 | 16 | frequencies_rdd = pairs_rdd.reduceByKey(lambda a, b: a + b) 17 | print(frequencies_rdd.collect()) 18 | 19 | # filter out words with fewer than threshold occurrences 20 | filtered_rdd = frequencies_rdd.filter(lambda word_count: word_count[1] >= threshold) # word_count is a (word, count) pair 21 | print(filtered_rdd.collect()) 22 | 23 | if __name__ == '__main__': 24 | 25 | conf = SparkConf() 26 | conf.setAppName("WordCount") 27 | conf.set('spark.executor.memory', '500M') 28 | conf.set('spark.cores.max', 4) 29 | try: 30 | sc = SparkContext(conf=conf) 31 | except: 32 | print ("Failed to connect!") 33 | print(sys.exc_info()[0]) 34 | 35 | # sys.argv[0] is the name of the script. 
36 | # sys.argv[1] is the first parameter: filename 37 | # sys.argv[2] is the second parameter: threshold 38 | input_path = sys.argv[1] # "file:///Users/mparsian/sample.txt" 39 | print("input_path: {}".format(input_path)) 40 | 41 | # get threshold 42 | threshold = int(sys.argv[2]) 43 | 44 | # Execute word count 45 | wordcount(sc, input_path, threshold) 46 | -------------------------------------------------------------------------------- /code/chap01/word_count_with_threshold.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt" 6 | # 7 | # define your PySpark program 8 | PROG=/pyspark_book/code/chap01/word_count_with_threshold.py 9 | # 10 | # define the word frequency threshold (example value) 11 | THRESHOLD=2 12 | # 13 | # submit your spark application 14 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH $THRESHOLD 15 | -------------------------------------------------------------------------------- /code/chap02/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 2 Programs 2 | 3 | This chapter presents the "Hello World!" program 4 | in PySpark. 5 | 6 | I have presented the "word count" problem and provided 7 | several solutions to it using `reduceByKey()` and 8 | `groupByKey()` transformations. 9 | 10 | Note that the `reduceByKey()` transformation is more 11 | efficient than the `groupByKey()`. When possible, 12 | we should avoid using the `groupByKey()` transformation 13 | and replace it with `reduceByKey()`, `aggregateByKey()`, 14 | or `combineByKey()`. 15 | 16 | Examples are provided to show how to use the 17 | `filter()` transformation. 18 | -------------------------------------------------------------------------------- /code/chap02/generate_key_value_pairs.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | #----------------------------------------------------- 3 | # @author Mahmoud Parsian 4 | #----------------------------------------------------- 5 | import random 6 | #--------------------------------------------------------- 7 | # Create 1000,000,000 "<,>" pairs such that 8 | # key is a random number in range of 1 to 10,000 9 | # value is a random number in range of 1 to 5 10 | #--------------------------------------------------------- 11 | for x in range(1000000000): 12 | print(str(random.randint(1,10000)) + "," + str(random.randint(1,5))) 13 | -------------------------------------------------------------------------------- /code/chap02/sample_file.txt: -------------------------------------------------------------------------------- 1 | red fox jumped high and high 2 | red fox jumped high fence 3 | fox jumped 4 | -------------------------------------------------------------------------------- /code/chap02/sample_file_extra.txt: -------------------------------------------------------------------------------- 1 | a red fox jumped high and of high 2 | red fox jumped of high fence 3 | a fox jumped 4 | -------------------------------------------------------------------------------- /code/chap02/sum_by_groupbykey.log: -------------------------------------------------------------------------------- 1 | $ cat generate_key_value_pairs.py 2 | from __future__ import print_function 3 | import random 4 | #--------------------------------------------------------- 5 | # Create 1000,000,000 "<,>" pairs such that 6 | # key is a random number in range of 1 to 
10,000 7 | # value is a random number in range of 1 to 5 8 | #--------------------------------------------------------- 9 | for x in range(1000000000): 10 | print(str(random.randint(1,10000)) + "," + str(random.randint(1,5))) 11 | 12 | # create one billion (key, value) pairs 13 | $ python generate_key_value_pairs.py > kv.txt 14 | 15 | $ ls -l kv.txt 16 | -rw-r--r-- 1 mparsian dev 6889378545 Mar 10 10:17 kv.txt 17 | 18 | $ wc -l kv.txt 19 | 1000000000 kv.txt 20 | 21 | $ head kv.txt 22 | 2122,3 23 | 3147,4 24 | 8281,4 25 | 5390,5 26 | 4549,4 27 | 2901,3 28 | 288,5 29 | 2878,4 30 | 2250,2 31 | 3452,3 32 | 33 | 34 | $ ./sum_by_groupbykey.sh 35 | Mar 10 10:36:16 PST 2019 36 | input path : .../code/chap02/kv.txt 37 | rdd.getNumPartitions() = 206 38 | 39 | results = 40 | [ 41 | ('8079', 299950), 42 | ('9263', 299717), 43 | ('8095', 300566), 44 | ... 45 | ] 46 | 47 | 33 minutes and 53 seconds elapsed. 48 | -------------------------------------------------------------------------------- /code/chap02/sum_by_groupbykey.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | #----------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------- 7 | # 8 | # Create (key, value) pair from given input record: 9 | # record: <,> 10 | # <1> accept a record of the form "key,value" 11 | # <2> tokenize input record, 12 | # tokens[0]: key, 13 | # tokens[1]: value 14 | # <3> return a pair of (key, value) 15 | # 16 | def create_pair(record): # <1> 17 | tokens = record.split(',') # <2> 18 | key = str(tokens[0]) 19 | value = int(tokens[1]) 20 | return (key, value) # <3> 21 | #end-def 22 | #----------------------------------- 23 | 24 | 25 | if __name__ == "__main__": 26 | if len(sys.argv) != 2: 27 | print("Usage: ", __file__, " ", file=sys.stderr) 28 | exit(-1) 29 | 30 | #------------------------------------------ 31 | # create an instance of SparkSession object 32 | #------------------------------------------ 33 | spark = SparkSession\ 34 | .builder\ 35 | .appName("test:groupBykey()")\ 36 | .getOrCreate() 37 | 38 | input_path = sys.argv[1] 39 | print("input path : ", input_path) 40 | 41 | # 42 | rdd = spark.sparkContext.textFile(input_path) 43 | print("rdd.getNumPartitions() = ", rdd.getNumPartitions()) 44 | # 45 | results = rdd.map(create_pair)\ 46 | .groupByKey()\ 47 | .mapValues(lambda values: sum(values)) 48 | 49 | # display final results 50 | print("results = ", results.collect()) 51 | 52 | spark.stop() 53 | -------------------------------------------------------------------------------- /code/chap02/sum_by_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #----------------------------------------------------- 3 | # @author Mahmoud Parsian 4 | #----------------------------------------------------- 5 | SECONDS=0 6 | /bin/date 7 | # do some work 8 | # 9 | # define Spark's installed directory 10 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 11 | # 12 | # create one billion (key, value) pairs 13 | python generate_key_value_pairs.py > kv.txt 14 | # 15 | # NOTE: define your input path 16 | INPUT_PATH="file:///pyspark_book/code/chap02/kv.txt" 17 | # 18 | # define your PySpark program 19 | PROG="/pyspark_book/code/chap02/sum_by_groupbykey.py" 20 | # 21 | # submit your spark application 22 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 23 | # 24 | # 25 | duration=$SECONDS 26 | echo "" 
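# NOTE: SECONDS is a bash builtin counter; it was reset to 0 at the top of this script, so $duration (captured above) holds the total elapsed seconds, which the next line converts to minutes and seconds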
27 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 28 | -------------------------------------------------------------------------------- /code/chap02/sum_by_reducebykey.log: -------------------------------------------------------------------------------- 1 | $ cat generate_key_value_pairs.py 2 | from __future__ import print_function 3 | import random 4 | #--------------------------------------------------------- 5 | # Create 1000,000,000 "<,>" pairs such that 6 | # key is a random number in range of 1 to 10,000 7 | # value is a random number in range of 1 to 5 8 | #--------------------------------------------------------- 9 | for x in range(1000000000): 10 | print(str(random.randint(1,10000)) + "," + str(random.randint(1,5))) 11 | 12 | # create one billion (key, value) pairs 13 | $ python generate_key_value_pairs.py > kv.txt 14 | 15 | $ ls -l kv.txt 16 | -rw-r--r-- 1 mparsian dev 6889378545 Mar 10 10:17 kv.txt 17 | 18 | $ wc -l kv.txt 19 | 1000000000 kv.txt 20 | 21 | $ head kv.txt 22 | 2122,3 23 | 3147,4 24 | 8281,4 25 | 5390,5 26 | 4549,4 27 | 2901,3 28 | 288,5 29 | 2878,4 30 | 2250,2 31 | 3452,3 32 | 33 | $ ./sum_by_reducebykey.sh 34 | Mar 10 11:27:19 PST 2019 35 | input path : .../code/chap02/kv.txt 36 | rdd.getNumPartitions() = 206 37 | 38 | results = 39 | [ 40 | ('8079', 299950), 41 | ('9263', 299717), 42 | ('8095', 300566), 43 | ... 44 | ] 45 | 46 | 32 minutes and 47 seconds elapsed. 47 | 48 | -------------------------------------------------------------------------------- /code/chap02/sum_by_reducebykey.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | #----------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------- 7 | # 8 | # Create (key, value) pair from given input record: 9 | # record: <,> 10 | # <1> accept a record of the form "key,value" 11 | # <2> tokenize input record, 12 | # tokens[0]: key, 13 | # tokens[1]: value 14 | # <3> return a pair of (key, value) 15 | # 16 | def create_pair(record): # <1> 17 | tokens = record.split(',') # <2> 18 | key = str(tokens[0]) 19 | value = int(tokens[1]) 20 | return (key, value) # <3> 21 | #end-def 22 | #----------------------------------- 23 | 24 | 25 | if __name__ == "__main__": 26 | if len(sys.argv) != 2: 27 | print("Usage: ", __file__, " ", file=sys.stderr) 28 | exit(-1) 29 | 30 | #------------------------------------------ 31 | # create an instance of SparkSession object 32 | #------------------------------------------ 33 | spark = SparkSession\ 34 | .builder\ 35 | .appName("test:reduceBykey()")\ 36 | .getOrCreate() 37 | 38 | input_path = sys.argv[1] 39 | print("input path : ", input_path) 40 | # 41 | rdd = spark.sparkContext.textFile(input_path) 42 | print("rdd.getNumPartitions() = ", rdd.getNumPartitions()) 43 | # 44 | results = rdd.map(create_pair)\ 45 | .reduceByKey(lambda a, b: a+b) 46 | 47 | # display final results 48 | print("results = ", results.collect()) 49 | 50 | spark.stop() 51 | -------------------------------------------------------------------------------- /code/chap02/sum_by_reducebykey.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #----------------------------------------------------- 3 | # @author Mahmoud Parsian 4 | #----------------------------------------------------- 5 | SECONDS=0 6 | /bin/date 7 | # do some work 8 | # 9 | # define Spark's installed 
directory 10 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 11 | # 12 | # create one billion (key, value) pairs 13 | python generate_key_value_pairs.py > kv.txt 14 | # 15 | # NOTE: define your input path 16 | INPUT_PATH="file:///pyspark_book/code/chap02/kv.txt" 17 | # 18 | # define your PySpark program 19 | PROG="/pyspark_book/code/chap02/sum_by_reducebykey.py" 20 | # 21 | # submit your spark application 22 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 23 | # 24 | # 25 | duration=$SECONDS 26 | echo "" 27 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 28 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit word_count_driver.py sample_file.txt 2 | 3 | input_path: sample_file.txt 4 | 5 | records.count(): 3 6 | 7 | records.collect(): 8 | [ 9 | u'red fox jumped high and high', 10 | u'red fox jumped high fence', 11 | u'fox jumped' 12 | ] 13 | 14 | non_empty_records.count(): 3 15 | 16 | non_empty_records.collect(): 17 | [ 18 | u'red fox jumped high and high', 19 | u'red fox jumped high fence', 20 | u'fox jumped' 21 | ] 22 | 23 | words.count(): 13 24 | 25 | words.collect(): 26 | [ 27 | u'red', 28 | u'fox', 29 | u'jumped', 30 | u'high', 31 | u'and', 32 | u'high', 33 | u'red', 34 | u'fox', 35 | u'jumped', 36 | u'high', 37 | u'fence', 38 | u'fox', 39 | u'jumped' 40 | ] 41 | 42 | pairs.count(): 13 43 | 44 | pairs.collect(): 45 | [ 46 | (u'red', 1), 47 | (u'fox', 1), 48 | (u'jumped', 1), 49 | (u'high', 1), 50 | (u'and', 1), 51 | (u'high', 1), 52 | (u'red', 1), 53 | (u'fox', 1), 54 | (u'jumped', 1), 55 | (u'high', 1), 56 | (u'fence', 1), 57 | (u'fox', 1), 58 | (u'jumped', 1) 59 | ] 60 | 61 | frequencies.count(): 6 62 | 63 | frequencies.collect(): 64 | [ 65 | (u'and', 1), 66 | (u'high', 3), 67 | (u'fox', 3), 68 | (u'red', 2), 69 | (u'fence', 1), 70 | (u'jumped', 3) 71 | ] -------------------------------------------------------------------------------- /code/chap02/word_count_driver.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for word count in PySpark. 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap02/sample_file.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap02/word_count_driver.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_by_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for word count in PySpark. 
3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap02/sample_file.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap02/word_count_driver_by_groupbykey.py" 9 | # 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 13 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit word_count_driver_shorthand.py sample_file.txt 2 | 3 | input_path: sample_file.txt 4 | 5 | frequencies.count(): 6 6 | frequencies.collect(): 7 | [ 8 | (u'and', 1), 9 | (u'high', 3), 10 | (u'fox', 3), 11 | (u'red', 2), 12 | (u'fence', 1), 13 | (u'jumped', 3) 14 | ] 15 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #----------------------------------------------------- 3 | # This is a word count in PySpark. 4 | # The goal is to show how "word count" works. 5 | # Here we write transformations in a shorthand! 6 | #------------------------------------------------------ 7 | # Input Parameters: 8 | # argv[1]: String, input path 9 | #------------------------------------------------------- 10 | # @author Mahmoud Parsian 11 | #------------------------------------------------------- 12 | from __future__ import print_function 13 | import sys 14 | from pyspark.sql import SparkSession 15 | 16 | if __name__ == '__main__': 17 | 18 | if len(sys.argv) != 2: 19 | print("Usage: word_count_driver_shorthand.py ", file=sys.stderr) 20 | exit(-1) 21 | 22 | spark = SparkSession\ 23 | .builder\ 24 | .appName("Word-Count-App")\ 25 | .getOrCreate() 26 | 27 | # sys.argv[0] is the name of the script. 28 | # sys.argv[1] is the first parameter 29 | input_path = sys.argv[1] 30 | print("input_path: {}".format(input_path)) 31 | 32 | # create frequencies as RDD 33 | frequencies = spark.sparkContext.textFile(input_path)\ 34 | .filter(lambda line: len(line) > 0)\ 35 | .flatMap(lambda line: line.lower().split(" "))\ 36 | .map(lambda word: (word, 1))\ 37 | .reduceByKey(lambda a, b: a + b) 38 | # 39 | print("frequencies.count(): ", frequencies.count()) 40 | print("frequencies.collect(): ", frequencies.collect()) 41 | 42 | # done! 43 | spark.stop() 44 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for word count in PySpark. 
3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap02/sample_file.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap02/word_count_driver_shorthand.py" 9 | # 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 13 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand_by_groupbykey.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit word_count_driver_shorthand_by_groupbykey.py sample_file.txt 2 | 3 | input_path: sample_file.txt 4 | 5 | frequencies.count(): 6 6 | 7 | frequencies.collect(): 8 | [ 9 | (u'and', 1), 10 | (u'high', 3), 11 | (u'fox', 3), 12 | (u'red', 2), 13 | (u'fence', 1), 14 | (u'jumped', 3) 15 | ] 16 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand_by_groupbykey.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #----------------------------------------------------- 3 | # This is a word count in PySpark. 4 | # The goal is to show how "word count" works. 5 | # Here we write transformations in a shorthand! 6 | #------------------------------------------------------ 7 | # Input Parameters: 8 | # argv[1]: String, input path 9 | #------------------------------------------------------- 10 | # @author Mahmoud Parsian 11 | #------------------------------------------------------- 12 | from __future__ import print_function 13 | import sys 14 | from pyspark.sql import SparkSession 15 | 16 | if __name__ == '__main__': 17 | 18 | if len(sys.argv) != 2: 19 | print("Usage: word_count_driver.py ", file=sys.stderr) 20 | exit(-1) 21 | 22 | spark = SparkSession\ 23 | .builder\ 24 | .appName("Word-Count-App")\ 25 | .getOrCreate() 26 | 27 | # sys.argv[0] is the name of the script. 28 | # sys.argv[1] is the first parameter 29 | input_path = sys.argv[1] 30 | print("input_path: {}".format(input_path)) 31 | 32 | # create frequencies as RDD 33 | frequencies = spark.sparkContext.textFile(input_path)\ 34 | .filter(lambda line: len(line) > 0)\ 35 | .flatMap(lambda line: line.lower().split(" "))\ 36 | .map(lambda word: (word, 1))\ 37 | .groupByKey()\ 38 | .mapValues(lambda counts : sum(counts)) 39 | # 40 | print("frequencies.count(): ", frequencies.count()) 41 | print("frequencies.collect(): ", frequencies.collect()) 42 | 43 | # done! 44 | spark.stop() 45 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand_by_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for word count in PySpark. 
3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap02/sample_file.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap02/word_count_driver_shorthand_by_groupbykey.py" 9 | # 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 13 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand_sorted.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit word_count_driver_shorthand_sorted.py sample_file.txt 2 | 3 | input_path: sample_file.txt 4 | 5 | frequencies.count(): 6 6 | frequencies.collect(): 7 | [ 8 | (u'and', 1), 9 | (u'high', 3), 10 | (u'fox', 3), 11 | (u'red', 2), 12 | (u'fence', 1), 13 | (u'jumped', 3) 14 | ] 15 | 16 | sorted_by_key.count(): 6 17 | sorted_by_key.collect(): 18 | [ 19 | (u'and', 1), 20 | (u'fence', 1), 21 | (u'fox', 3), 22 | (u'high', 3), 23 | (u'jumped', 3), 24 | (u'red', 2) 25 | ] 26 | 27 | sorted_by_value.count(): 6 28 | sorted_by_value.collect(): 29 | [ 30 | (u'and', 1), 31 | (u'fence', 1), 32 | (u'red', 2), 33 | (u'high', 3), 34 | (u'fox', 3), 35 | (u'jumped', 3) 36 | ] 37 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_shorthand_sorted.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for word count in PySpark. 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap02/sample_file.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap02/word_count_driver_shorthand_sorted.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_with_filter.log: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a word count in PySpark. 3 | # The goal is to show how "word count" works. 4 | # Here we write transformations in a shorthand! 5 | # 6 | # RULES: 7 | # RULE-1: 8 | # Here I introduce the RDD.filter() transformation 9 | # to ignore the words if their length is less than 3.
10 | # This is implemented by: 11 | # .filter(lambda word : len(word) > 2) 12 | # RULE-2: 13 | # If the total frequency of any unique word is less 14 | # than 2, then ignore that word from the final output 15 | # This is implemented by: 16 | # .filter(lambda (k, v) : v > 1) 17 | # 18 | #------------------------------------------------------ 19 | 20 | ./bin/spark-submit word_count_driver_with_filter.py sample_file_extra.txt 21 | 22 | input_path: sample_file_extra.txt 23 | 24 | file_contents = 25 | a red fox jumped high and of high 26 | red fox jumped of high fence 27 | a fox jumped 28 | 29 | frequencies.count(): 6 30 | frequencies.collect(): 31 | [ 32 | (u'and', 1), 33 | (u'high', 3), 34 | (u'fox', 3), 35 | (u'red', 2), 36 | (u'fence', 1), 37 | (u'jumped', 3) 38 | ] 39 | 40 | filtered.count(): 4 41 | filtered.collect(): 42 | [ 43 | (u'high', 3), 44 | (u'fox', 3), 45 | (u'red', 2), 46 | (u'jumped', 3) 47 | ] -------------------------------------------------------------------------------- /code/chap02/word_count_driver_with_filter.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for word count in PySpark. 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap02/sample_file_extra.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap02/word_count_driver_with_filter.py" 9 | # 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 13 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_with_filter_and_threshold.log: -------------------------------------------------------------------------------- 1 | export THRESHOLD_WORD_LENGTH=2 2 | export THRESHOLD_FREQUENCY=1 3 | ./bin/spark-submit word_count_driver_with_filter_and_threshold.py sample_file_extra.txt ${THRESHOLD_WORD_LENGTH} ${THRESHOLD_FREQUENCY} 4 | 5 | len(sys.argv) = 4 6 | script: sys.argv[0] = /pyspark_book/code/chap02/word_count_driver_with_filter_and_threshold.py 7 | p1: sys.argv[1] = sample_file_extra.txt 8 | p2: sys.argv[2] = 2 9 | p3: sys.argv[3] = 1 10 | 11 | script: /pyspark_book/git-manning/code/chap02/word_count_driver_with_filter_and_threshold.py 12 | 13 | input_path: sample_file_extra.txt 14 | 15 | file_contents = 16 | a red fox jumped high and of high 17 | red fox jumped of high fence 18 | a fox jumped 19 | 20 | THRESHOLD_WORD_LENGTH = 2 21 | THRESHOLD_FREQUENCY = 1 22 | 23 | frequencies.count(): 6 24 | frequencies.collect(): 25 | [ 26 | (u'and', 1), 27 | (u'high', 3), 28 | (u'fox', 3), 29 | (u'red', 2), 30 | (u'fence', 1), 31 | (u'jumped', 3) 32 | ] 33 | 34 | filtered.count(): 4 35 | filtered.collect(): 36 | [ 37 | (u'high', 3), 38 | (u'fox', 3), 39 | (u'red', 2), 40 | (u'jumped', 3) 41 | ] 42 | -------------------------------------------------------------------------------- /code/chap02/word_count_driver_with_filter_and_threshold.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for word count in PySpark. 
3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap02/sample_file_extra.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap02/word_count_driver_with_filter_and_threshold.py" 9 | # 10 | # define thresholds 11 | export THRESHOLD_WORD_LENGTH=2 12 | export THRESHOLD_FREQUENCY=1 13 | # 14 | # run the PySpark program: 15 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE ${THRESHOLD_WORD_LENGTH} ${THRESHOLD_FREQUENCY} 16 | -------------------------------------------------------------------------------- /code/chap02/word_count_python.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #----------------------------------------------------- 3 | # This is a word count in Python programming language. 4 | # The goal is to show how "word count" works. 5 | #------------------------------------------------------ 6 | # Input Parameters: 7 | # argv[1]: String, input path 8 | #------------------------------------------------------- 9 | # @author Mahmoud Parsian 10 | #------------------------------------------------------- 11 | import sys 12 | import collections 13 | # 14 | input_path = sys.argv[1] 15 | # 16 | file = open(input_path, "r") 17 | wordcount = collections.Counter() 18 | # 19 | for word in file.read().split(): 20 | wordcount[word] += 1 21 | #for-done 22 | print (wordcount) 23 | file.close() 24 | -------------------------------------------------------------------------------- /code/chap02/word_count_python_shorthand.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #----------------------------------------------------- 3 | # This is a word count in Python programming language. 4 | # The goal is to show how "word count" works. 
5 | #------------------------------------------------------ 6 | # Input Parameters: 7 | # argv[1]: String, input path 8 | #------------------------------------------------------- 9 | # @author a book reviewer (anonymous) 10 | #------------------------------------------------------- 11 | import sys 12 | import collections 13 | # 14 | input_path = sys.argv[1] 15 | # 16 | with open(input_path) as input_file: 17 | word_count = collections.Counter(input_file.read().split()) 18 | # 19 | print (word_count) 20 | 21 | 22 | -------------------------------------------------------------------------------- /code/chap03/datadir/file1: -------------------------------------------------------------------------------- 1 | record 1 of file1 2 | record 2 of file1 3 | record 3 of file1 4 | -------------------------------------------------------------------------------- /code/chap03/datadir/file2: -------------------------------------------------------------------------------- 1 | record 1 of file2 2 | record 2 of file2 3 | record 3 of file2 4 | record 4 of file2 5 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_collection.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_collection.py 3 | # Run this using Python3 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export SPARK_PROG="/pyspark_book/code/chap03/dataframe_creation_from_collection.py" 9 | export PYSPARK_PYTHON=python3 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 13 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_csv_no_header.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_from_csv_no_header.py kv_no_header.txt 2 | 3 | spark= 4 | 5 | input path : kv_no_header.txt 6 | 7 | file_contents = 8 | alex,200 9 | alex,300 10 | bob,100 11 | bob,400 12 | bob,500 13 | mary,700 14 | mary,200 15 | mary,300 16 | jane,300 17 | adel,200 18 | adel,400 19 | adel,600 20 | adel,800 21 | 22 | df = 23 | [ 24 | Row(_c0=u'alex', _c1=200), 25 | Row(_c0=u'alex', _c1=300), 26 | Row(_c0=u'bob', _c1=100), 27 | Row(_c0=u'bob', _c1=400), 28 | Row(_c0=u'bob', _c1=500), 29 | Row(_c0=u'mary', _c1=700), 30 | Row(_c0=u'mary', _c1=200), 31 | Row(_c0=u'mary', _c1=300), 32 | Row(_c0=u'jane', _c1=300), 33 | Row(_c0=u'adel', _c1=200), 34 | Row(_c0=u'adel', _c1=400), 35 | Row(_c0=u'adel', _c1=600), 36 | Row(_c0=u'adel', _c1=800) 37 | ] 38 | 39 | +----+---+ 40 | | _c0|_c1| 41 | +----+---+ 42 | |alex|200| 43 | |alex|300| 44 | | bob|100| 45 | | bob|400| 46 | | bob|500| 47 | |mary|700| 48 | |mary|200| 49 | |mary|300| 50 | |jane|300| 51 | |adel|200| 52 | |adel|400| 53 | |adel|600| 54 | |adel|800| 55 | +----+---+ 56 | 57 | root 58 | |-- _c0: string (nullable = true) 59 | |-- _c1: integer (nullable = true) 60 | 61 | 62 | +----+-----+ 63 | |name|value| 64 | +----+-----+ 65 | |alex| 200| 66 | |alex| 300| 67 | | bob| 100| 68 | | bob| 400| 69 | | bob| 500| 70 | |mary| 700| 71 | |mary| 200| 72 | |mary| 300| 73 | |jane| 300| 74 | |adel| 200| 75 | |adel| 400| 76 | |adel| 600| 77 | |adel| 800| 78 | +----+-----+ 79 | 80 | root 81 | |-- name: string 
(nullable = true) 82 | |-- value: integer (nullable = true) 83 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_csv_no_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_csv_no_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap03/kv_no_header.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap03/dataframe_creation_from_csv_no_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_csv_with_header.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_from_csv_with_header.py kv_with_header.txt 2 | 3 | spark= 4 | 5 | input path : kv_with_header.txt 6 | 7 | file_contents = 8 | name,value 9 | alex,200 10 | alex,300 11 | bob,100 12 | bob,400 13 | bob,500 14 | mary,700 15 | mary,200 16 | mary,300 17 | jane,300 18 | adel,200 19 | adel,400 20 | adel,600 21 | adel,800 22 | 23 | df = 24 | [ 25 | Row(name=u'alex', value=200), 26 | Row(name=u'alex', value=300), 27 | Row(name=u'bob', value=100), 28 | Row(name=u'bob', value=400), 29 | Row(name=u'bob', value=500), 30 | Row(name=u'mary', value=700), 31 | Row(name=u'mary', value=200), 32 | Row(name=u'mary', value=300), 33 | Row(name=u'jane', value=300), 34 | Row(name=u'adel', value=200), 35 | Row(name=u'adel', value=400), 36 | Row(name=u'adel', value=600), 37 | Row(name=u'adel', value=800) 38 | ] 39 | 40 | +----+-----+ 41 | |name|value| 42 | +----+-----+ 43 | |alex| 200| 44 | |alex| 300| 45 | | bob| 100| 46 | | bob| 400| 47 | | bob| 500| 48 | |mary| 700| 49 | |mary| 200| 50 | |mary| 300| 51 | |jane| 300| 52 | |adel| 200| 53 | |adel| 400| 54 | |adel| 600| 55 | |adel| 800| 56 | +----+-----+ 57 | 58 | root 59 | |-- name: string (nullable = true) 60 | |-- value: integer (nullable = true) 61 | 62 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_csv_with_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_csv_with_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap03/kv_with_header.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap03/dataframe_creation_from_csv_with_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_dictionary.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_from_dictionary.py 2 | 3 | spark= 4 | 5 | mydict= 6 | { 7 | 'A': '1', 8 | 'B': '2', 9 | 'E': '99', 10 | 'D': '8' 11 | } 12 | 13 | df = 
DataFrame[key: string, value: string] 14 | df.count = 4 15 | df.collect() = 16 | [ 17 | Row(key=u'A', value=u'1'), 18 | Row(key=u'B', value=u'2'), 19 | Row(key=u'E', value=u'99'), 20 | Row(key=u'D', value=u'8') 21 | ] 22 | 23 | +---+-----+ 24 | |key|value| 25 | +---+-----+ 26 | | A| 1| 27 | | B| 2| 28 | | E| 99| 29 | | D| 8| 30 | +---+-----+ 31 | 32 | root 33 | |-- key: string (nullable = true) 34 | |-- value: string (nullable = true) 35 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_dictionary.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_dictionary.py 3 | # Run this using Python3 by setting PYSPARK_PYTHON 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export SPARK_PROG="/pyspark_book/code/chap03/dataframe_creation_from_dictionary.py" 9 | export PYSPARK_PYTHON=python3 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 13 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_directory.log: -------------------------------------------------------------------------------- 1 | $ ls -l sample_dir2 2 | -rw-r--r-- 1 mparsian dev 54 Nov 11 19:53 file1.txt 3 | -rw-r--r-- 1 mparsian dev 90 Nov 11 19:53 file2.txt 4 | -rw-r--r--@ 1 mparsian dev 31 Nov 11 19:54 file3.csv 5 | -rw-r--r-- 1 mparsian dev 19 Nov 11 19:55 file4.csv 6 | 7 | $ cat file3.csv 8 | alex,33 9 | bob,45 10 | mary,25 11 | jeff,10 12 | 13 | $ cat file4.csv 14 | amanda,44 15 | terry,64 16 | 17 | ./bin/spark-submit dataframe_creation_from_directory.py sample_dir2 18 | 19 | spark= 20 | 21 | input_dir : sample_dir2 22 | 23 | dir_listing = ['file2.txt', 'file1.txt', 'file3.csv', 'file4.csv'] 24 | 25 | df = 26 | [ 27 | Row(_c0=u'alex', _c1=33), 28 | Row(_c0=u'bob', _c1=45), 29 | Row(_c0=u'mary', _c1=25), 30 | Row(_c0=u'jeff', _c1=10), 31 | Row(_c0=u'amanda', _c1=44), 32 | Row(_c0=u'terry', _c1=64) 33 | ] 34 | 35 | +------+---+ 36 | | _c0|_c1| 37 | +------+---+ 38 | | alex| 33| 39 | | bob| 45| 40 | | mary| 25| 41 | | jeff| 10| 42 | |amanda| 44| 43 | | terry| 64| 44 | +------+---+ 45 | 46 | root 47 | |-- _c0: string (nullable = true) 48 | |-- _c1: integer (nullable = true) -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_directory.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_directory.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_DIR="/pyspark_book/code/chap03/sample_dir2" 8 | export SPARK_PROG="/pyspark_book/code/chap03/dataframe_creation_from_directory.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_DIR 12 | -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_rdd.log: -------------------------------------------------------------------------------- 1 |
./bin/spark-submit dataframe_creation_from_rdd.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | ('alex', 'Sunnyvale', 25), 8 | ('mary', 'Cupertino', 22), 9 | ('jane', 'Ames', 20), 10 | ('bob', 'Stanford', 26) 11 | ] 12 | 13 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 14 | rdd.count() = 4 15 | rdd.collect() = 16 | [ 17 | ('alex', 'Sunnyvale', 25), 18 | ('mary', 'Cupertino', 22), 19 | ('jane', 'Ames', 20), 20 | ('bob', 'Stanford', 26) 21 | ] 22 | 23 | people = PythonRDD[2] at RDD at PythonRDD.scala:48 24 | people.count() = 4 25 | people.collect() = 26 | [ 27 | Row(age=25, city='Sunnyvale', name='alex'), 28 | Row(age=22, city='Cupertino', name='mary'), 29 | Row(age=20, city='Ames', name='jane'), 30 | Row(age=26, city='Stanford', name='bob') 31 | ] 32 | 33 | df = DataFrame[age: bigint, city: string, name: string] 34 | df.count() = 4 35 | df.collect() = 36 | [ 37 | Row(age=25, city=u'Sunnyvale', name=u'alex'), 38 | Row(age=22, city=u'Cupertino', name=u'mary'), 39 | Row(age=20, city=u'Ames', name=u'jane'), 40 | Row(age=26, city=u'Stanford', name=u'bob') 41 | ] 42 | 43 | +---+---------+----+ 44 | |age| city|name| 45 | +---+---------+----+ 46 | | 25|Sunnyvale|alex| 47 | | 22|Cupertino|mary| 48 | | 20| Ames|jane| 49 | | 26| Stanford| bob| 50 | +---+---------+----+ 51 | 52 | root 53 | |-- age: long (nullable = true) 54 | |-- city: string (nullable = true) 55 | |-- name: string (nullable = true) -------------------------------------------------------------------------------- /code/chap03/dataframe_creation_from_rdd.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_rdd.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap03/dataframe_creation_from_rdd.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap03/kv.txt: -------------------------------------------------------------------------------- 1 | alex,200 2 | bob,100 3 | mary,700 4 | jane,300 5 | adel,900 6 | -------------------------------------------------------------------------------- /code/chap03/kv_no_header.txt: -------------------------------------------------------------------------------- 1 | alex,200 2 | alex,300 3 | bob,100 4 | bob,400 5 | bob,500 6 | mary,700 7 | mary,200 8 | mary,300 9 | jane,300 10 | adel,200 11 | adel,400 12 | adel,600 13 | adel,800 14 | -------------------------------------------------------------------------------- /code/chap03/kv_with_header.txt: -------------------------------------------------------------------------------- 1 | name,value 2 | alex,200 3 | alex,300 4 | bob,100 5 | bob,400 6 | bob,500 7 | mary,700 8 | mary,200 9 | mary,300 10 | jane,300 11 | adel,200 12 | adel,400 13 | adel,600 14 | adel,800 15 | -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_collection.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_creation_from_collection.py 2 | 3 | 4 | spark= 5 | 6 | list_of_strings= ['alex', 'bob', 'jane', 'mary', 'adel'] 7 | rdd1= ParallelCollectionRDD[0] at parallelize at 
PythonRDD.scala:175 8 | rdd1.count= 5 9 | rdd1.collect()= ['alex', 'bob', 'jane', 'mary', 'adel'] 10 | 11 | list_of_pairs = [('alex', 1), ('alex', 3), ('alex', 9), ('alex', 10), ('bob', 4), ('bob', 8)] 12 | rdd2 = ParallelCollectionRDD[2] at parallelize at PythonRDD.scala:175 13 | rdd2.count = 6 14 | rdd2.collect() = [('alex', 1), ('alex', 3), ('alex', 9), ('alex', 10), ('bob', 4), ('bob', 8)] 15 | 16 | rdd2_added = PythonRDD[8] at RDD at PythonRDD.scala:48 17 | rdd2_added.count = 2 18 | rdd2_added.collect() = [('bob', 12), ('alex', 23)] 19 | 20 | rdd2_grouped = PythonRDD[8] at RDD at PythonRDD.scala:48 21 | rdd2_grouped.count = 2 22 | rdd2_grouped.collect() = [('bob', ), ('alex', )] 23 | rdd2_grouped.collect() = (as a list) = [('bob', [4, 8]), ('alex', [1, 3, 9, 10])] 24 | 25 | d = {'key3': 'value3', 'key2': 'value2', 'key1': 'value1'} 26 | d.items()= [('key3', 'value3'), ('key2', 'value2'), ('key1', 'value1')] 27 | rdd_from_dict = ParallelCollectionRDD[17] at parallelize at PythonRDD.scala:175 28 | rdd_from_dict.collect() = [('key3', 'value3'), ('key2', 'value2'), ('key1', 'value1')] 29 | rdd_from_dict.count = 3 -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_collection.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_creation_from_collection.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap03/rdd_creation_from_collection.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_dataframe.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_creation_from_dataframe.py 2 | 3 | spark= 4 | 5 | list_of_pairs= 6 | [ 7 | ('alex', 1), 8 | ('alex', 5), 9 | ('bob', 2), 10 | ('bob', 40), 11 | ('jane', 60), 12 | ('mary', 700), 13 | ('adel', 800) 14 | ] 15 | 16 | df = DataFrame[_1: string, _2: bigint] 17 | df.count = 7 18 | df.collect() = 19 | [ 20 | Row(_1=u'alex', _2=1), 21 | Row(_1=u'alex', _2=5), 22 | Row(_1=u'bob', _2=2), 23 | Row(_1=u'bob', _2=40), 24 | Row(_1=u'jane', _2=60), 25 | Row(_1=u'mary', _2=700), 26 | Row(_1=u'adel', _2=800) 27 | ] 28 | 29 | +----+---+ 30 | | _1| _2| 31 | +----+---+ 32 | |alex| 1| 33 | |alex| 5| 34 | | bob| 2| 35 | | bob| 40| 36 | |jane| 60| 37 | |mary|700| 38 | |adel|800| 39 | +----+---+ 40 | 41 | root 42 | |-- _1: string (nullable = true) 43 | |-- _2: long (nullable = true) 44 | 45 | rdd = DataFrame[_1: string, _2: bigint] 46 | rdd.count = 7 47 | rdd.collect() = 48 | [ 49 | Row(_1=u'alex', _2=1), 50 | Row(_1=u'alex', _2=5), 51 | Row(_1=u'bob', _2=2), 52 | Row(_1=u'bob', _2=40), 53 | Row(_1=u'jane', _2=60), 54 | Row(_1=u'mary', _2=700), 55 | Row(_1=u'adel', _2=800) 56 | ] -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_dataframe.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_creation_from_dataframe.py 3 | #----------------------------------------------------- 4 | # @author 
Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap03/rdd_creation_from_dataframe.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_dictionary.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_creation_from_dictionary.py 2 | 3 | spark= 4 | 5 | mydict= 6 | { 7 | 'A': '1', 8 | 'B': '2', 9 | 'E': '99', 10 | 'D': '8' 11 | } 12 | 13 | 14 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 15 | rdd.count = 4 16 | rdd.collect() = 17 | [ 18 | ('A', '1'), 19 | ('B', '2'), 20 | ('E', '99'), 21 | ('D', '8') 22 | ] -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_dictionary.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_creation_from_dictionary.py 3 | # Run this using Python3 by setting PYSPARK_PYTHON 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export SPARK_PROG="/pyspark_book/code/chap03/rdd_creation_from_dictionary.py" 9 | export PYSPARK_PYTHON=python3 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 13 | -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_directory.log: -------------------------------------------------------------------------------- 1 | $ ls -l sample_dir/ 2 | total 16 3 | -rw-r--r-- 1 mparsian dev 54 Nov 11 18:59 file1.txt 4 | -rw-r--r-- 1 mparsian dev 90 Nov 11 19:00 file2.txt 5 | 6 | $ cat file1.txt 7 | record 1 of file1 8 | record 2 of file1 9 | record 3 of file1 10 | 11 | $ cat file2.txt 12 | record 1 of file2 13 | record 2 of file2 14 | record 3 of file2 15 | record 4 of file2 16 | record 5 of file2 17 | 18 | ./bin/spark-submit rdd_creation_from_directory.py sample_dir 19 | 20 | spark= 21 | 22 | dir path : sample_dir 23 | 24 | dir_listing = ['file2.txt', 'file1.txt'] 25 | 26 | rdd = sample_dir/ MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 27 | rdd.count = 8 28 | rdd.collect() = 29 | [ 30 | u'record 1 of file2', 31 | u'record 2 of file2', 32 | u'record 3 of file2', 33 | u'record 4 of file2', 34 | u'record 5 of file2', 35 | u'record 1 of file1', 36 | u'record 2 of file1', 37 | u'record 3 of file1' 38 | ] 39 | 40 | filtered = PythonRDD[3] at RDD at PythonRDD.scala:48 41 | filtered.count = 2 42 | filtered.collect() = 43 | [ 44 | u'record 3 of file2', 45 | u'record 3 of file1' 46 | ] -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_directory.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_creation_from_directory.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | # 7 | # $ cd sample_dir/ 8 | # $ ls -l 9 | # -rw-r--r-- 1 mparsian dev 54 Nov 11 18:59 file1.txt 10 | # 
-rw-r--r-- 1 mparsian dev 90 Nov 11 19:00 file2.txt 11 | # 12 | # $ cat file1.txt 13 | # record 1 of file1 14 | # record 2 of file1 15 | # record 3 of file1 16 | # 17 | # $ cat file2.txt 18 | # record 1 of file2 19 | # record 2 of file2 20 | # record 3 of file2 21 | # record 4 of file2 22 | # record 5 of file2 23 | # 24 | #------------------------------------------------------ 25 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 26 | export INPUT_DIR="/pyspark_book/code/chap03/sample_dir" 27 | export SPARK_PROG="/pyspark_book/code/chap03/rdd_creation_from_directory.py" 28 | # 29 | # run the PySpark program: 30 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_DIR 31 | -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_file.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_creation_from_file.py kv.txt 2 | 3 | spark= 4 | 5 | input path : kv.txt 6 | 7 | file_contents = 8 | alex,200 9 | bob,100 10 | mary,700 11 | jane,300 12 | adel,900 13 | 14 | rdd = kv.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 15 | rdd.count = 5 16 | rdd.collect() = 17 | [ 18 | 'alex,200', 19 | 'bob,100', 20 | 'mary,700', 21 | 'jane,300', 22 | 'adel,900' 23 | ] 24 | 25 | pairs = PythonRDD[3] at RDD at PythonRDD.scala:48 26 | pairs.count = 5 27 | pairs.collect() = 28 | [ 29 | ('alex', 200), 30 | ('bob', 100), 31 | ('mary', 700), 32 | ('jane', 300), 33 | ('adel', 900) 34 | ] 35 | 36 | filtered = PythonRDD[5] at RDD at PythonRDD.scala:48 37 | filtered.count = 2 38 | filtered.collect() = 39 | [ 40 | ('mary', 700), 41 | ('adel', 900) 42 | ] -------------------------------------------------------------------------------- /code/chap03/rdd_creation_from_file.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_creation_from_file.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap03/kv.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap03/rdd_creation_from_file.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap03/sample.txt: -------------------------------------------------------------------------------- 1 | red fox jumped high 2 | fox jumped over high fence 3 | red fox jumped 4 | -------------------------------------------------------------------------------- /code/chap03/sample_dir/file1.txt: -------------------------------------------------------------------------------- 1 | record 1 of file1 2 | record 2 of file1 3 | record 3 of file1 4 | -------------------------------------------------------------------------------- /code/chap03/sample_dir/file2.txt: -------------------------------------------------------------------------------- 1 | record 1 of file2 2 | record 2 of file2 3 | record 3 of file2 4 | record 4 of file2 5 | record 5 of file2 6 | -------------------------------------------------------------------------------- /code/chap03/sample_dir2/file1.txt: -------------------------------------------------------------------------------- 1 | record 1 of file1 2 | record 2 of file1 3 | record 3 of file1 4 | 
-------------------------------------------------------------------------------- /code/chap03/sample_dir2/file2.txt: -------------------------------------------------------------------------------- 1 | record 1 of file2 2 | record 2 of file2 3 | record 3 of file2 4 | record 4 of file2 5 | record 5 of file2 6 | -------------------------------------------------------------------------------- /code/chap03/sample_dir2/file3.csv: -------------------------------------------------------------------------------- 1 | alex,33 2 | bob,45 3 | mary,25 4 | jeff,10 5 | -------------------------------------------------------------------------------- /code/chap03/sample_dir2/file4.csv: -------------------------------------------------------------------------------- 1 | amanda,44 2 | terry,64 3 | -------------------------------------------------------------------------------- /code/chap03/word_count.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark import SparkConf 3 | from pyspark import SparkContext 4 | 5 | def wordcount(sc, input_path): 6 | 7 | records_rdd = sc.textFile(input_path) 8 | print(records_rdd.collect()) 9 | 10 | words_rdd = records_rdd.flatMap(lambda line: line.split(" ")) 11 | print(words_rdd.collect()) 12 | 13 | pairs_rdd = words_rdd.map(lambda word: (word, 1)) 14 | print(pairs_rdd.collect()) 15 | 16 | frequencies_rdd = pairs_rdd.reduceByKey(lambda a, b: a + b) 17 | print(frequencies_rdd.collect()) 18 | 19 | 20 | if __name__ == '__main__': 21 | 22 | conf = SparkConf() 23 | conf.setAppName("WordCount") 24 | conf.set('spark.executor.memory', '500M') 25 | conf.set('spark.cores.max', 4) 26 | try: 27 | sc = SparkContext(conf=conf) 28 | # hard coded input path, for DEMO only 29 | # never hard code 30 | input_path = "/tmp/sample.txt" 31 | 32 | except: 33 | print ("Failed to connect!") 34 | print(sys.exc_info()[0]) 35 | 36 | # Execute word count 37 | wordcount(sc, input_path) 38 | -------------------------------------------------------------------------------- /code/chap03/word_count.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt" 6 | # 7 | # define your PySpark program 8 | PROG=/pyspark_book/code/chap01/word_count_with_params.py 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap03/word_count_with_params.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt" 6 | # 7 | # define your PySpark program 8 | PROG=/pyspark_book/code/chap03/word_count_with_params.py 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap03/word_count_with_threshold.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark import SparkConf 3 | from pyspark import SparkContext 4 | 5 | def wordcount(sc, input_path, threshold): 6 | 7 | records_rdd = sc.textFile(input_path) 8 | print(records_rdd.collect()) 9 | 
10 | words_rdd = records_rdd.flatMap(lambda line: line.split(" ")) 11 | print(words_rdd.collect()) 12 | 13 | pairs_rdd = words_rdd.map(lambda word: (word, 1)) 14 | print(pairs_rdd.collect()) 15 | 16 | frequencies_rdd = pairs_rdd.reduceByKey(lambda a, b: a + b) 17 | print(frequencies_rdd.collect()) 18 | 19 | # filter out words with fewer than threshold occurrences 20 | filtered_rdd = frequencies_rdd.filter(lambda word_count: word_count[1] >= threshold) 21 | print(filtered_rdd.collect()) 22 | 23 | if __name__ == '__main__': 24 | 25 | conf = SparkConf() 26 | conf.setAppName("WordCount") 27 | conf.set('spark.executor.memory', '500M') 28 | conf.set('spark.cores.max', 4) 29 | try: 30 | sc = SparkContext(conf=conf) 31 | except: 32 | print ("Failed to connect!") 33 | print(sys.exc_info()[0]) 34 | 35 | # sys.argv[0] is the name of the script. 36 | # sys.argv[1] is the first parameter: filename 37 | # sys.argv[2] is the second parameter: threshold 38 | input_path = sys.argv[1] # "file:///Users/mparsian/sample.txt" 39 | print("input_path: {}".format(input_path)) 40 | 41 | # get threshold 42 | threshold = int(sys.argv[2]) 43 | 44 | # Execute word count 45 | wordcount(sc, input_path, threshold) 46 | -------------------------------------------------------------------------------- /code/chap03/word_count_with_threshold.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="$SPARK_HOME/licenses/LICENSE-heapq.txt" 6 | # 7 | # define your PySpark program 8 | PROG=/pyspark_book/code/chap03/word_count_with_threshold.py 9 | # 10 | # submit your spark application (the second argument is the frequency threshold) 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 2 12 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V1/run_dna_base_count_ver_1.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="file:///pyspark_book/code/chap04/data/sample.fasta" 6 | # 7 | # define your PySpark program 8 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V1/dna_base_count_ver_1.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V1/run_dna_base_count_ver_1_1GB.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # define Spark's installed directory 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | # 9 | # define your input path 10 | INPUT_PATH="file:///pyspark_book/code/chap04/data/*.fasta" 11 | # 12 | # define your PySpark program 13 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V1/dna_base_count_ver_1.py" 14 | # 15 | # submit your spark application 16 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 17 | # 18 | duration=$SECONDS 19 | echo "" 20 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed."
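The Version 1 run scripts above submit `dna_base_count_ver_1.py`, which is not reproduced in this listing. As a rough guide only, here is a minimal sketch of the basic map-and-reduce approach that the chapter 4 README attributes to Version 1: every character of a non-description FASTA line becomes a `(base, 1)` pair, and the pairs are summed with `reduceByKey()`. The names and structure below are illustrative assumptions, not the book's actual program.

````
# Illustrative sketch only -- NOT the book's dna_base_count_ver_1.py.
# Version 1 idea: emit one (base, 1) pair per character, sum with reduceByKey().
import sys
from pyspark.sql import SparkSession

if __name__ == '__main__':
    input_path = sys.argv[1]
    spark = SparkSession.builder.appName("dna-base-count-v1-sketch").getOrCreate()

    counts = spark.sparkContext.textFile(input_path)\
        .filter(lambda line: not line.startswith(">"))\
        .flatMap(lambda line: [(c, 1) for c in line.lower()])\
        .reduceByKey(lambda a, b: a + b)

    print(counts.collect())
    spark.stop()
````

Emitting one pair per character is simple, but it shuffles a pair for every base in the input; Versions 2 and 3 cut that shuffle volume by combining counts locally before the shuffle.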
21 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V1/run_dna_base_count_ver_1_big.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # define Spark's installed directory 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | # 9 | # NOTE: define your input path 10 | # Before running your PySpark program, 11 | # Download chr1.subst.fa from this location: 12 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 13 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/chr1.subst.fa.gz 14 | # 15 | INPUT_PATH="file:///pyspark_book/code/chap04/chr1.subst.fa" 16 | # 17 | # define your PySpark program 18 | PROG=/pyspark_book/code/chap04/DNA-FASTA-V1/dna_base_count_ver_1.py 19 | # 20 | # submit your spark application 21 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 22 | # 23 | duration=$SECONDS 24 | echo "" 25 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 26 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V2/run_dna_base_count_ver_2.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="file:///pyspark_book/code/chap04/data/sample.fasta" 6 | # 7 | # define your PySpark program 8 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V2/dna_base_count_ver_2.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V2/run_dna_base_count_ver_2_1GB.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # define Spark's installed directory 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | # 9 | # define your input path 10 | INPUT_PATH="file:///pyspark_book/code/chap04/data/*.fa" 11 | # 12 | # define your PySpark program 13 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V2/dna_base_count_ver_2.py" 14 | # 15 | # submit your spark application 16 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 17 | # 18 | duration=$SECONDS 19 | echo "" 20 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 
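The Version 2 run scripts above submit `dna_base_count_ver_2.py`, which is likewise not reproduced in this listing. Below is a minimal sketch of the in-mapper combiner design pattern that the chapter 4 README attributes to Version 2, assuming the combining is done per input line with a local `Counter` so that only aggregated `(base, count)` pairs reach the shuffle; all names and structure are illustrative assumptions only.

````
# Illustrative sketch only -- NOT the book's dna_base_count_ver_2.py.
# Version 2 idea: combine counts inside the mapper (per line) before shuffling.
import sys
from collections import Counter
from pyspark.sql import SparkSession

def count_bases(line):
    # local (in-mapper) aggregation for one FASTA sequence line
    return list(Counter(line.lower()).items())

if __name__ == '__main__':
    input_path = sys.argv[1]
    spark = SparkSession.builder.appName("dna-base-count-v2-sketch").getOrCreate()

    counts = spark.sparkContext.textFile(input_path)\
        .filter(lambda line: not line.startswith(">"))\
        .flatMap(count_bases)\
        .reduceByKey(lambda a, b: a + b)

    print(counts.collect())
    spark.stop()
````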
21 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V2/run_dna_base_count_ver_2_big.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # define Spark's installed directory 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | # 9 | # NOTE: define your input path 10 | # Before running your PySpark program, 11 | # Download chr1.subst.fa from this location: 12 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 13 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/chr1.subst.fa.gz 14 | # 15 | INPUT_PATH="file:///pyspark_book/code/chap04/data/chr1.subst.fa" 16 | # 17 | # define your PySpark program 18 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V2/dna_base_count_ver_2.py" 19 | # 20 | # submit your spark application 21 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 22 | # 23 | duration=$SECONDS 24 | echo "" 25 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 26 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V3/run_dna_base_count_ver_3.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="file:///pyspark_book/code/chap04/data/sample.fasta" 6 | # 7 | # define your PySpark program 8 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V3/dna_base_count_ver_3.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V3/run_dna_base_count_ver_3_1GB.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # define Spark's installed directory 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | # 9 | # define your input path 10 | INPUT_PATH="file:///pyspark_book/code/chap04/data/*.fa" 11 | # 12 | # define your PySpark program 13 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V3/dna_base_count_ver_3.py" 14 | # 15 | # submit your spark application 16 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 17 | # 18 | # 19 | duration=$SECONDS 20 | echo "" 21 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 
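The Version 3 run scripts above submit `dna_base_count_ver_3.py`, also not reproduced in this listing. According to the chapter 4 README, Version 3 applies the in-mapper combiner pattern with `mapPartitions()`; the sketch below is an illustrative assumption rather than the book's actual code: it builds one dictionary per partition, so the shuffle carries at most one `(base, count)` pair per distinct character per partition.

````
# Illustrative sketch only -- NOT the book's dna_base_count_ver_3.py.
# Version 3 idea: combine counts per partition with mapPartitions(), then reduceByKey().
import sys
from collections import defaultdict
from pyspark.sql import SparkSession

def count_partition(iterator):
    # aggregate all lines of one partition into a single dictionary
    counts = defaultdict(int)
    for line in iterator:
        if line.startswith(">"):
            continue  # skip FASTA description lines
        for c in line.lower():
            counts[c] += 1
    return counts.items()

if __name__ == '__main__':
    input_path = sys.argv[1]
    spark = SparkSession.builder.appName("dna-base-count-v3-sketch").getOrCreate()

    counts = spark.sparkContext.textFile(input_path)\
        .mapPartitions(count_partition)\
        .reduceByKey(lambda a, b: a + b)

    print(counts.collect())
    spark.stop()
````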
22 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTA-V3/run_dna_base_count_ver_3_big.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | SECONDS=0 3 | /bin/date 4 | # do some work 5 | # 6 | # define Spark's installed directory 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | # 9 | # NOTE: define your input path 10 | # Before running your PySpark program, 11 | # Download chr1.subst.fa from this location: 12 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/ 13 | # http://hgdownload.cse.ucsc.edu/goldenpath/hg19/snp137Mask/chr1.subst.fa.gz 14 | # 15 | INPUT_PATH="file:///pyspark_book/code/chap04/data/chr1.subst.fa" 16 | # 17 | # define your PySpark program 18 | PROG="/pyspark_book/code/chap04/DNA-FASTA-V3/dna_base_count_ver_3.py" 19 | # 20 | # submit your spark application 21 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 22 | # 23 | # 24 | duration=$SECONDS 25 | echo "" 26 | echo "$(($duration / 60)) minutes and $(($duration % 60)) seconds elapsed." 27 | -------------------------------------------------------------------------------- /code/chap04/DNA-FASTQ/run_dna_base_count_fastq.sh: -------------------------------------------------------------------------------- 1 | # define Spark's installed directory 2 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 3 | # 4 | # define your input path 5 | INPUT_PATH="file:///pyspark_book/code/chap04/data/sp1.fastq" 6 | # 7 | # define your PySpark program 8 | PROG="/pyspark_book/code/chap04/DNA-FASTQ/dna_base_count_fastq.py" 9 | # 10 | # submit your spark application 11 | $SPARK_HOME/bin/spark-submit $PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap04/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 4 2 | 3 | ## DNA-Base-Count Programs using FASTA Input Format 4 | 5 | Using FASTA input files, there are 3 versions of DNA-Base-Count 6 | 7 | * Version-1: 8 | * Uses basic MapReduce programs 9 | * Using PySpark (`chap04/DNA-FASTA-V1/dna_base_count_ver_1.py`) 10 | 11 | * Version-2: 12 | * Uses InMapper Combiner design pattern 13 | * Using PySpark (`chap04/DNA-FASTA-V2/dna_base_count_ver_2.py`) 14 | 15 | * Version-3: 16 | * Uses InMapper Combiner design pattern (by using mapPartitions() transformations) 17 | * Using PySpark (`chap04/DNA-FASTA-V3/dna_base_count_ver_3.py`) 18 | 19 | 20 | ## DNA-Base-Count Programs using FASTQ Input Format 21 | 22 | Using FASTQ input files, the following solution is available: 23 | 24 | * Uses InMapper Combiner design pattern (by using mapPartitions() transformations) 25 | * Using PySpark (`chap04/DNA-FASTQ/dna_base_count_fastq.py`) 26 | 27 | 28 | ## FASTA Files to Test DNA-Base-Count 29 | 30 | * A small sample FASTA file (`chap04/data/sample.fasta`) is provided. 
31 | 32 | * To test DNA-Base-Count programs with large size FASTA files, 33 | you may download them from here: 34 | 35 | 36 | ```` 37 | ftp://ftp.ensembl.org/pub/release-91/fasta/homo_sapiens/dna/ 38 | 39 | ftp://ftp.ncbi.nih.gov/snp/organisms/human_9606/rs_fasta/ 40 | 41 | ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz 42 | 43 | ```` 44 | -------------------------------------------------------------------------------- /code/chap04/data/sample.fasta: -------------------------------------------------------------------------------- 1 | >seq1 2 | cGTAaccaataaaaaaacaagcttaacctaattc 3 | >seq2 4 | agcttagTTTGGatctggccgggg 5 | >seq3 6 | gcggatttactcCCCCCAAAAANNaggggagagcccagataaatggagtctgtgcgtccaca 7 | gaattcgcacca 8 | AATAAAACCTCACCCAT 9 | agagcccagaatttactcCCC 10 | >seq4 11 | gcggatttactcaggggagagcccagGGataaatggagtctgtgcgtccaca 12 | gaattcgcacca 13 | -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_aggregatebykey.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit average_by_key_use_aggregatebykey.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | ('alex', 'Sunnyvale', 25), 8 | ('alex', 'Sunnyvale', 33), 9 | ('alex', 'Sunnyvale', 45), 10 | ('alex', 'Sunnyvale', 63), 11 | ('mary', 'Ames', 22), 12 | ('mary', 'Cupertino', 66), 13 | ('mary', 'Ames', 20), 14 | ('bob', 'Ames', 26) 15 | ] 16 | 17 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 18 | rdd.count() = 8 19 | rdd.collect() = 20 | [ 21 | ('alex', 'Sunnyvale', 25), 22 | ('alex', 'Sunnyvale', 33), 23 | ('alex', 'Sunnyvale', 45), 24 | ('alex', 'Sunnyvale', 63), 25 | ('mary', 'Ames', 22), 26 | ('mary', 'Cupertino', 66), 27 | ('mary', 'Ames', 20), 28 | ('bob', 'Ames', 26) 29 | ] 30 | 31 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 32 | rdd2.count() = 8 33 | rdd2.collect() = 34 | [ 35 | ('alex', 25), 36 | ('alex', 33), 37 | ('alex', 45), 38 | ('alex', 63), 39 | ('mary', 22), 40 | ('mary', 66), 41 | ('mary', 20), 42 | ('bob', 26) 43 | ] 44 | 45 | sum_count = PythonRDD[8] at RDD at PythonRDD.scala:48 46 | sum_count.count() = 3 47 | sum_count.collect() = 48 | [ 49 | ('bob', (26, 1)), 50 | ('alex', (166, 4)), 51 | ('mary', (108, 3)) 52 | ] 53 | 54 | averages = PythonRDD[10] at RDD at PythonRDD.scala:48 55 | averages.count() = 3 56 | averages.collect() = 57 | [ 58 | ('bob', 26.0), 59 | ('alex', 41.5), 60 | ('mary', 36.0) 61 | ] -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_aggregatebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_aggregatebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/average_by_key_use_aggregatebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_combinebykey.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit average_by_key_use_combinebykey.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | 
('alex', 'Sunnyvale', 25), 8 | ('alex', 'Sunnyvale', 33), 9 | ('alex', 'Sunnyvale', 45), 10 | ('alex', 'Sunnyvale', 63), 11 | ('mary', 'Ames', 22), 12 | ('mary', 'Cupertino', 66), 13 | ('mary', 'Ames', 20), 14 | ('bob', 'Ames', 26) 15 | ] 16 | 17 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 18 | rdd.count() = 8 19 | rdd.collect() = 20 | [ 21 | ('alex', 'Sunnyvale', 25), 22 | ('alex', 'Sunnyvale', 33), 23 | ('alex', 'Sunnyvale', 45), 24 | ('alex', 'Sunnyvale', 63), 25 | ('mary', 'Ames', 22), 26 | ('mary', 'Cupertino', 66), 27 | ('mary', 'Ames', 20), 28 | ('bob', 'Ames', 26) 29 | ] 30 | 31 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 32 | rdd2.count() = 8 33 | rdd2.collect() = 34 | [ 35 | ('alex', 25), 36 | ('alex', 33), 37 | ('alex', 45), 38 | ('alex', 63), 39 | ('mary', 22), 40 | ('mary', 66), 41 | ('mary', 20), 42 | ('bob', 26) 43 | ] 44 | 45 | sum_count = PythonRDD[8] at RDD at PythonRDD.scala:48 46 | sum_count.count() = 3 47 | sum_count.collect() = 48 | [ 49 | ('bob', (26, 1)), 50 | ('alex', (166, 4)), 51 | ('mary', (108, 3)) 52 | ] 53 | 54 | averages = PythonRDD[10] at RDD at PythonRDD.scala:48 55 | averages.count() = 3 56 | averages.collect() = 57 | [ 58 | ('bob', 26.0), 59 | ('alex', 41.5), 60 | ('mary', 36.0) 61 | ] -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_combinebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_combinebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/average_by_key_use_combinebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_foldbykey.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit average_by_key_use_foldbykey.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | ('alex', 'Sunnyvale', 25), 8 | ('alex', 'Sunnyvale', 33), 9 | ('alex', 'Sunnyvale', 45), 10 | ('alex', 'Sunnyvale', 63), 11 | ('mary', 'Ames', 22), 12 | ('mary', 'Cupertino', 66), 13 | ('mary', 'Ames', 20), 14 | ('bob', 'Ames', 26) 15 | ] 16 | 17 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 18 | rdd.count() = 8 19 | rdd.collect() = 20 | [ 21 | ('alex', 'Sunnyvale', 25), 22 | ('alex', 'Sunnyvale', 33), 23 | ('alex', 'Sunnyvale', 45), 24 | ('alex', 'Sunnyvale', 63), 25 | ('mary', 'Ames', 22), 26 | ('mary', 'Cupertino', 66), 27 | ('mary', 'Ames', 20), 28 | ('bob', 'Ames', 26) 29 | ] 30 | 31 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 32 | rdd2.count() = 8 33 | rdd2.collect() = 34 | [ 35 | ('alex', (25, 1)), 36 | ('alex', (33, 1)), 37 | ('alex', (45, 1)), 38 | ('alex', (63, 1)), 39 | ('mary', (22, 1)), 40 | ('mary', (66, 1)), 41 | ('mary', (20, 1)), 42 | ('bob', (26, 1)) 43 | ] 44 | 45 | sum_count = PythonRDD[8] at RDD at PythonRDD.scala:48 46 | sum_count.count() = 3 47 | sum_count.collect() = 48 | [ 49 | ('bob', (26, 1)), 50 | ('alex', (166, 4)), 51 | ('mary', (108, 3)) 52 | ] 53 | 54 | averages = PythonRDD[10] at RDD at PythonRDD.scala:48 55 | averages.count() = 3 56 | averages.collect() = 57 | [ 58 | 
('bob', 26.0), 59 | ('alex', 41.5), 60 | ('mary', 36.0) 61 | ] -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_foldbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_foldbykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/average_by_key_use_foldbykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_groupbykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/average_by_key_use_groupbykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_reducebykey.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit average_by_key_use_reducebykey.py 2 | 3 | spark= 4 | 5 | list_of_tuples = [('alex', 'Sunnyvale', 25), ('alex', 'Sunnyvale', 33), ('alex', 'Sunnyvale', 45), ('alex', 'Sunnyvale', 63), ('mary', 'Ames', 22), ('mary', 'Cupertino', 66), ('mary', 'Ames', 20), ('bob', 'Ames', 26)] 6 | 7 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 8 | rdd.count() = 8 9 | rdd.collect() = 10 | [ 11 | ('alex', 'Sunnyvale', 25), 12 | ('alex', 'Sunnyvale', 33), 13 | ('alex', 'Sunnyvale', 45), 14 | ('alex', 'Sunnyvale', 63), 15 | ('mary', 'Ames', 22), 16 | ('mary', 'Cupertino', 66), 17 | ('mary', 'Ames', 20), 18 | ('bob', 'Ames', 26) 19 | ] 20 | 21 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 22 | rdd2.count() = 8 23 | rdd2.collect() = 24 | [ 25 | ('alex', (25, 1)), 26 | ('alex', (33, 1)), 27 | ('alex', (45, 1)), 28 | ('alex', (63, 1)), 29 | ('mary', (22, 1)), 30 | ('mary', (66, 1)), 31 | ('mary', (20, 1)), 32 | ('bob', (26, 1)) 33 | ] 34 | 35 | sum_count = PythonRDD[8] at RDD at PythonRDD.scala:48 36 | sum_count.count() = 3 37 | sum_count.collect() = 38 | [ 39 | ('bob', (26, 1)), 40 | ('alex', (166, 4)), 41 | ('mary', (108, 3)) 42 | ] 43 | 44 | averages = PythonRDD[10] at RDD at PythonRDD.scala:48 45 | averages.count() = 3 46 | averages.collect() = 47 | [ 48 | ('bob', 26.0), 49 | ('alex', 41.5), 50 | ('mary', 36.0) 51 | ] -------------------------------------------------------------------------------- /code/chap05/average_by_key_use_reducebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run average_by_key_use_reducebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export 
SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/average_by_key_use_reducebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_action_describe.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_action_describe.py 2 | 3 | spark= 4 | 5 | pairs = 6 | [ 7 | (10, 'z1'), 8 | (1, 'z2'), 9 | (2, 'z3'), 10 | (9, 'z4'), 11 | (3, 'z5'), 12 | (4, 'z6'), 13 | (5, 'z7'), 14 | (6, 'z8'), 15 | (7, 'z9') 16 | ] 17 | 18 | df.count(): 9 19 | df.collect(): 20 | [ 21 | Row(number=10, name=u'z1'), 22 | Row(number=1, name=u'z2'), 23 | Row(number=2, name=u'z3'), 24 | Row(number=9, name=u'z4'), 25 | Row(number=3, name=u'z5'), 26 | Row(number=4, name=u'z6'), 27 | Row(number=5, name=u'z7'), 28 | Row(number=6, name=u'z8'), 29 | Row(number=7, name=u'z9') 30 | ] 31 | 32 | +------+----+ 33 | |number|name| 34 | +------+----+ 35 | | 10| z1| 36 | | 1| z2| 37 | | 2| z3| 38 | | 9| z4| 39 | | 3| z5| 40 | | 4| z6| 41 | | 5| z7| 42 | | 6| z8| 43 | | 7| z9| 44 | +------+----+ 45 | 46 | +-------+------------------+ 47 | |summary| number| 48 | +-------+------------------+ 49 | | count| 9| 50 | | mean| 5.222222222222222| 51 | | stddev|3.0731814857642954| 52 | | min| 1| 53 | | max| 10| 54 | +-------+------------------+ 55 | 56 | +-------+------------------+----+ 57 | |summary| number|name| 58 | +-------+------------------+----+ 59 | | count| 9| 9| 60 | | mean| 5.222222222222222|null| 61 | | stddev|3.0731814857642954|null| 62 | | min| 1| z1| 63 | | max| 10| z9| 64 | +-------+------------------+----+ 65 | -------------------------------------------------------------------------------- /code/chap05/dataframe_action_describe.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_action_describe.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_action_describe.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_drop.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_drop.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_drop.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_filter.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_filter.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 
6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_filter.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_join_cross.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_cross.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_join_cross.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_join_inner.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_inner.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_join_inner.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_join_left.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_left.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_join_left.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_join_right.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_join_right.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_join_right.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_sql.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_sql.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_sql.py" 8 | # 9 | # run the PySpark program: 10 | 
$SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/dataframe_withcolumn.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_withcolumn.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/dataframe_withcolumn.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/emps.txt: -------------------------------------------------------------------------------- 1 | 1000,alex,67000 2 | 1001,bob,24000 3 | 1002,jane,69000 4 | 1003,betty,55000 5 | 1004,jeff,59000 6 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_cartesian.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_cartesian.py 2 | 3 | spark= 4 | 5 | a = [('a', 2), ('b', 3), ('c', 4)] 6 | 7 | b = [('p', 50), ('x', 60), ('y', 70), ('z', 80)] 8 | 9 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 10 | rdd.count() = 3 11 | rdd.collect() = 12 | [ 13 | ('a', 2), 14 | ('b', 3), 15 | ('c', 4) 16 | ] 17 | 18 | rdd2 = ParallelCollectionRDD[2] at parallelize at PythonRDD.scala:175 19 | rdd2.count() = 4 20 | rdd2.collect() = 21 | [ 22 | ('p', 50), 23 | ('x', 60), 24 | ('y', 70), 25 | ('z', 80) 26 | ] 27 | 28 | cart = org.apache.spark.api.java.JavaPairRDD@4a2f7b2c 29 | cart.count() = 12 30 | cart.collect() = 31 | [ 32 | (('a', 2), ('p', 50)), 33 | (('a', 2), ('x', 60)), 34 | (('a', 2), ('y', 70)), 35 | (('a', 2), ('z', 80)), 36 | (('b', 3), ('p', 50)), 37 | (('b', 3), ('x', 60)), 38 | (('b', 3), ('y', 70)), 39 | (('b', 3), ('z', 80)), 40 | (('c', 4), ('p', 50)), 41 | (('c', 4), ('x', 60)), 42 | (('c', 4), ('y', 70)), 43 | (('c', 4), ('z', 80)) 44 | ] -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_cartesian.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_cartesian.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_cartesian.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_combinebykey.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_combinebykey.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | ('alex', 'Sunnyvale', 25), 8 | ('alex', 'Sunnyvale', 33), 9 | ('alex', 'Sunnyvale', 45), 10 | ('alex', 'Sunnyvale', 63), 11 | ('mary', 'Ames', 22), 12 | ('mary', 'Cupertino', 66), 13 | ('mary', 'Ames', 20), 14 | ('bob', 'Ames', 26) 15 | ] 16 | 17 | rdd = ParallelCollectionRDD[0] at parallelize at 
PythonRDD.scala:175 18 | rdd.count() = 8 19 | rdd.collect() = 20 | [ 21 | ('alex', 'Sunnyvale', 25), 22 | ('alex', 'Sunnyvale', 33), 23 | ('alex', 'Sunnyvale', 45), 24 | ('alex', 'Sunnyvale', 63), 25 | ('mary', 'Ames', 22), 26 | ('mary', 'Cupertino', 66), 27 | ('mary', 'Ames', 20), 28 | ('bob', 'Ames', 26) 29 | ] 30 | 31 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 32 | rdd2.count() = 8 33 | rdd2.collect() = 34 | [ 35 | ('alex', 25), 36 | ('alex', 33), 37 | ('alex', 45), 38 | ('alex', 63), 39 | ('mary', 22), 40 | ('mary', 66), 41 | ('mary', 20), 42 | ('bob', 26) 43 | ] 44 | 45 | combined = PythonRDD[8] at RDD at PythonRDD.scala:48 46 | combined.count() = 3 47 | combined.collect() = 48 | [ 49 | ('bob', (26, 26, 1)), 50 | ('alex', (25, 63, 4)), 51 | ('mary', (20, 66, 3)) 52 | ] 53 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_combinebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_combinebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_combinebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_filter.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_filter.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | ('alex', 'Sunnyvale', 25), 8 | ('alex', 'Sunnyvale', 33), 9 | ('mary', 'Ames', 22), 10 | ('mary', 'Cupertino', 66), 11 | ('jane', 'Ames', 20), 12 | ('bob', 'Ames', 26) 13 | ] 14 | 15 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 16 | rdd.count() = 6 17 | rdd.collect() = 18 | [ 19 | ('alex', 'Sunnyvale', 25), 20 | ('alex', 'Sunnyvale', 33), 21 | ('mary', 'Ames', 22), 22 | ('mary', 'Cupertino', 66), 23 | ('jane', 'Ames', 20), 24 | ('bob', 'Ames', 26) 25 | ] 26 | 27 | filtered_by_lambda = PythonRDD[2] at RDD at PythonRDD.scala:48 28 | filtered_by_lambda.count() = 2 29 | filtered_by_lambda.collect() = 30 | [ 31 | ('alex', 'Sunnyvale', 33), 32 | ('mary', 'Cupertino', 66) 33 | ] 34 | 35 | filtered_by_function = PythonRDD[4] at RDD at PythonRDD.scala:48 36 | filtered_by_function.count() = 2 37 | filtered_by_function.collect() = 38 | [ 39 | ('alex', 'Sunnyvale', 33), 40 | ('mary', 'Cupertino', 66) 41 | ] -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_filter.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_filter.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_filter.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- 
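Note: the listing above includes only rdd_transformation_filter.log and rdd_transformation_filter.sh, not the PySpark program itself. A minimal sketch of the filter logic implied by that log (keep records whose age, the third tuple element, is greater than 30, once with a lambda and once with a named function) might look like the following; the variable and function names are assumptions, not the repository's actual code.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("rdd_transformation_filter").getOrCreate()
sc = spark.sparkContext

list_of_tuples = [('alex', 'Sunnyvale', 25), ('alex', 'Sunnyvale', 33),
                  ('mary', 'Ames', 22), ('mary', 'Cupertino', 66),
                  ('jane', 'Ames', 20), ('bob', 'Ames', 26)]
rdd = sc.parallelize(list_of_tuples)

# keep only the records whose age (third tuple element) is greater than 30
filtered_by_lambda = rdd.filter(lambda t: t[2] > 30)

def is_older_than_30(t):
    return t[2] > 30

filtered_by_function = rdd.filter(is_older_than_30)

# both yield [('alex', 'Sunnyvale', 33), ('mary', 'Cupertino', 66)]
print(filtered_by_lambda.collect())
print(filtered_by_function.collect())

spark.stop()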
/code/chap05/rdd_transformation_flatmap.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_flatmap.py 2 | 3 | spark= 4 | 5 | list_of_strings = 6 | [ 7 | 'of', 8 | 'a fox jumped', 9 | 'fox jumped of fence', 10 | 'a foxy fox jumped high' 11 | ] 12 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 13 | rdd.count() = 4 14 | rdd.collect() = 15 | [ 16 | 'of', 17 | 'a fox jumped', 18 | 'fox jumped of fence', 19 | 'a foxy fox jumped high' 20 | ] 21 | 22 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 23 | rdd2.count() = 9 24 | rdd2.collect() = 25 | [ 26 | 'fox', 27 | 'jumped', 28 | 'fox', 29 | 'jumped', 30 | 'fence', 31 | 'foxy', 32 | 'fox', 33 | 'jumped', 34 | 'high' 35 | ] 36 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_flatmap.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_flatmap.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_flatmap.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_groupbykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_groupbykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_join.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_join.py 2 | 3 | spark= 4 | 5 | source_pairs = 6 | [ 7 | (1, 'u'), 8 | (1, 'v'), 9 | (2, 'a'), 10 | (3, 'b'), 11 | (4, 'z1') 12 | ] 13 | 14 | source.count(): 5 15 | source.collect(): 16 | [ 17 | (1, 'u'), 18 | (1, 'v'), 19 | (2, 'a'), 20 | (3, 'b'), 21 | (4, 'z1') 22 | ] 23 | 24 | other_pairs = 25 | [ 26 | (1, 'x'), 27 | (1, 'y'), 28 | (2, 'c'), 29 | (2, 'd'), 30 | (3, 'm'), 31 | (8, 'z2') 32 | ] 33 | 34 | other.count(): 6 35 | other.collect(): 36 | [ 37 | (1, 'x'), 38 | (1, 'y'), 39 | (2, 'c'), 40 | (2, 'd'), 41 | (3, 'm'), 42 | (8, 'z2') 43 | ] 44 | 45 | joined.count(): 7 46 | joined.collect(): 47 | [ 48 | (1, ('u', 'x')), 49 | (1, ('u', 'y')), 50 | (1, ('v', 'x')), 51 | (1, ('v', 'y')), 52 | (2, ('a', 'c')), 53 | (2, ('a', 'd')), 54 | (3, ('b', 'm')) 55 | ] -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_join.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_join.py 3 | 
#----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_join.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_map.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_map.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | ('alex', 'Sunnyvale', 25), 8 | ('alex', 'Sunnyvale', 33), 9 | ('mary', 'Ames', 22), 10 | ('mary', 'Cupertino', 66), 11 | ('jane', 'Ames', 20), 12 | ('bob', 'Ames', 26) 13 | ] 14 | 15 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 16 | rdd.count() = 6 17 | rdd.collect() = 18 | [ 19 | ('alex', 'Sunnyvale', 25), 20 | ('alex', 'Sunnyvale', 33), 21 | ('mary', 'Ames', 22), 22 | ('mary', 'Cupertino', 66), 23 | ('jane', 'Ames', 20), 24 | ('bob', 'Ames', 26) 25 | ] 26 | 27 | 28 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 29 | rdd2.count() = 6 30 | rdd2.collect() = 31 | [ 32 | ('alex', 25), 33 | ('alex', 33), 34 | ('mary', 22), 35 | ('mary', 66), 36 | ('jane', 20), 37 | ('bob', 26) 38 | ] 39 | 40 | rdd3 = PythonRDD[4] at RDD at PythonRDD.scala:48 41 | rdd3.count() = 6 42 | rdd3.collect() = 43 | [ 44 | ('Sunnyvale', ('alex', 'Sunnyvale', 25)), 45 | ('Sunnyvale', ('alex', 'Sunnyvale', 33)), 46 | ('Ames', ('mary', 'Ames', 22)), 47 | ('Cupertino', ('mary', 'Cupertino', 66)), 48 | ('Ames', ('jane', 'Ames', 20)), 49 | ('Ames', ('bob', 'Ames', 26)) 50 | ] 51 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_map.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_map.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_map.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_mappartitions.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_mappartitions.py 2 | 3 | spark= 4 | 5 | numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] 6 | 7 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 8 | rdd.count() = 13 9 | rdd.collect() = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] 10 | rdd.getNumPartitions() = 3 11 | 12 | ==begin-partition= 13 | 1 14 | 2 15 | 3 16 | 4 17 | ==end-partition= 18 | 19 | ==begin-partition= 20 | 9 21 | 10 22 | 11 23 | 12 24 | 13 25 | ==end-partition= 26 | 27 | ==begin-partition= 28 | 5 29 | 6 30 | 7 31 | 8 32 | ==end-partition= 33 | 34 | minmax_rdd = PythonRDD[3] at RDD at PythonRDD.scala:48 35 | minmax_rdd.count() = 6 36 | minmax_rdd.collect() = [1, 4, 5, 8, 9, 13] 37 | 38 | minmax_list = [1, 4, 5, 8, 9, 13] 39 | min(minmax_list) = 1 40 | max(minmax_list) = 13 -------------------------------------------------------------------------------- 
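Note: rdd_transformation_mappartitions.log above shows only the program's output. A minimal sketch of the mapPartitions() pattern it illustrates (compute a local minimum and maximum per partition, then reduce those few values on the driver) is given below; the helper name min_max and the other identifiers are assumptions, since the .py source is not reproduced in this listing.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("rdd_transformation_mappartitions").getOrCreate()
sc = spark.sparkContext

numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
rdd = sc.parallelize(numbers, 3)    # 3 partitions, as in the log above

def min_max(iterator):
    # materialize the partition once, then emit its local (min, max)
    elements = list(iterator)
    if not elements:
        return []                   # guard against empty partitions
    return [min(elements), max(elements)]

minmax_rdd = rdd.mapPartitions(min_max)
minmax_list = minmax_rdd.collect()  # e.g. [1, 4, 5, 8, 9, 13]

print("min =", min(minmax_list))    # 1
print("max =", max(minmax_list))    # 13

spark.stop()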
/code/chap05/rdd_transformation_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_mappartitions.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_mappartitions_handle_empty_partitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_mappartitions_handle_empty_partitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_mappartitions_handle_empty_partitions.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_reducebykey.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_reducebykey.py 2 | 3 | spark= 4 | 5 | list_of_tuples = 6 | [ 7 | ('alex', 'Sunnyvale', 25), 8 | ('alex', 'Sunnyvale', 33), 9 | ('alex', 'Sunnyvale', 45), 10 | ('alex', 'Sunnyvale', 63), 11 | ('mary', 'Ames', 22), 12 | ('mary', 'Cupertino', 66), 13 | ('mary', 'Ames', 20), 14 | ('bob', 'Ames', 26) 15 | ] 16 | 17 | rdd = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 18 | rdd.count() = 8 19 | rdd.collect() = 20 | [ 21 | ('alex', 'Sunnyvale', 25), 22 | ('alex', 'Sunnyvale', 33), 23 | ('alex', 'Sunnyvale', 45), 24 | ('alex', 'Sunnyvale', 63), 25 | ('mary', 'Ames', 22), 26 | ('mary', 'Cupertino', 66), 27 | ('mary', 'Ames', 20), 28 | ('bob', 'Ames', 26) 29 | ] 30 | 31 | rdd2 = PythonRDD[2] at RDD at PythonRDD.scala:48 32 | rdd2.count() = 8 33 | rdd2.collect() = 34 | [ 35 | ('alex', 25), 36 | ('alex', 33), 37 | ('alex', 45), 38 | ('alex', 63), 39 | ('mary', 22), 40 | ('mary', 66), 41 | ('mary', 20), 42 | ('bob', 26) 43 | ] 44 | 45 | rdd3 = PythonRDD[8] at RDD at PythonRDD.scala:48 46 | rdd3.count() = 3 47 | rdd3.collect() = 48 | [ 49 | ('bob', 26), 50 | ('alex', 166), 51 | ('mary', 108) 52 | ] 53 | 54 | rdd4 = PythonRDD[14] at RDD at PythonRDD.scala:48 55 | rdd4.count() = 3 56 | rdd4.collect() = 57 | [ 58 | ('bob', 26), 59 | ('alex', 63), 60 | ('mary', 66) 61 | ] 62 | 63 | 64 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_reducebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_reducebykey.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export 
SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_reducebykey.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_sortby.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_sortby.py 2 | 3 | spark= 4 | 5 | pairs = 6 | [ 7 | (10, 'z1'), 8 | (1, 'z2'), 9 | (2, 'z3'), 10 | (9, 'z4'), 11 | (3, 'z5'), 12 | (4, 'z6'), 13 | (5, 'z7'), 14 | (6, 'z8'), 15 | (7, 'z9') 16 | ] 17 | 18 | rdd.count(): 9 19 | rdd.collect(): 20 | [ 21 | (10, 'z1'), 22 | (1, 'z2'), 23 | (2, 'z3'), 24 | (9, 'z4'), 25 | (3, 'z5'), 26 | (4, 'z6'), 27 | (5, 'z7'), 28 | (6, 'z8'), 29 | (7, 'z9') 30 | ] 31 | 32 | sorted_by_key_ascending.count(): 9 33 | sorted_by_key_ascending.collect(): 34 | [ 35 | (1, 'z2'), 36 | (2, 'z3'), 37 | (3, 'z5'), 38 | (4, 'z6'), 39 | (5, 'z7'), 40 | (6, 'z8'), 41 | (7, 'z9'), 42 | (9, 'z4'), 43 | (10, 'z1') 44 | ] 45 | 46 | sorted_by_key_descending.count(): 9 47 | sorted_by_key_descending.collect(): 48 | [ 49 | (10, 'z1'), 50 | (9, 'z4'), 51 | (7, 'z9'), 52 | (6, 'z8'), 53 | (5, 'z7'), 54 | (4, 'z6'), 55 | (3, 'z5'), 56 | (2, 'z3'), 57 | (1, 'z2') 58 | ] 59 | 60 | sorted_by_value_ascending.count(): 9 61 | sorted_by_value_ascending.collect(): 62 | [ 63 | (10, 'z1'), 64 | (1, 'z2'), 65 | (2, 'z3'), 66 | (9, 'z4'), 67 | (3, 'z5'), 68 | (4, 'z6'), 69 | (5, 'z7'), 70 | (6, 'z8'), 71 | (7, 'z9') 72 | ] 73 | 74 | sorted_by_value_descending.count(): 9 75 | sorted_by_value_descending.collect(): 76 | [ 77 | (7, 'z9'), 78 | (6, 'z8'), 79 | (5, 'z7'), 80 | (4, 'z6'), 81 | (3, 'z5'), 82 | (9, 'z4'), 83 | (2, 'z3'), 84 | (1, 'z2'), 85 | (10, 'z1') 86 | ] -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_sortby.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run rdd_transformation_sortby.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_sortby.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_takeordered.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit rdd_transformation_takeordered.py 2 | 3 | spark = 4 | 5 | sc = 6 | 7 | numbers = [8, 10, 1, 2, 9, 3, 4, 5, 6, 7] 8 | 9 | top3 = [1, 2, 3] 10 | 11 | bottom3 = [10, 9, 8] 12 | 13 | pairs = 14 | [ 15 | (10, 'z1'), 16 | (1, 'z2'), 17 | (2, 'z3'), 18 | (9, 'z4'), 19 | (3, 'z5'), 20 | (4, 'z6'), 21 | (5, 'z7'), 22 | (6, 'z8'), 23 | (7, 'z9') 24 | ] 25 | 26 | top3_pairs = 27 | [ 28 | (1, 'z2'), 29 | (2, 'z3'), 30 | (3, 'z5') 31 | ] 32 | 33 | bottom3_pairs = 34 | [ 35 | (10, 'z1'), 36 | (9, 'z4'), 37 | (7, 'z9') 38 | ] -------------------------------------------------------------------------------- /code/chap05/rdd_transformation_takeordered.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run 
rdd_transformation_takeordered.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap05/rdd_transformation_takeordered.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap05/sample_5_records.txt: -------------------------------------------------------------------------------- 1 | A,3 2 | A,4 3 | A,5 4 | B,10 5 | B,20 6 | -------------------------------------------------------------------------------- /code/chap05/users.txt: -------------------------------------------------------------------------------- 1 | 1,Alex,30,124 2 | 2,Bert,32,234 3 | 3,Curt,28,312 4 | 4,Don,32,180 5 | 5,Mary,30,100 6 | 6,Jane,28,212 7 | 7,Joe,28,128 8 | 8,Al,40,600 9 | -------------------------------------------------------------------------------- /code/chap06/README.md: -------------------------------------------------------------------------------- 1 | # Chapter 6 2 | 3 | ## Programs 4 | 5 | The goal of the programs in this chapter is 6 | to show some of the important reductions 7 | in Spark. Some of the reductions by key 8 | are: 9 | 10 | * reduceByKey() 11 | * combineByKey() 12 | * groupByKey() 13 | * aggregateByKey() 14 | 15 | 16 | We want to find the average per key in PySpark. 17 | The solutions are presented in this chapter as: 18 | 19 | * combineByKey(): 20 | * `average_by_key_use_combinebykey.py` (PySpark program) 21 | * `average_by_key_use_combinebykey.sh` (shell script to call PySpark) 22 | 23 | * groupByKey(): 24 | * `average_by_key_use_groupbykey.py` (PySpark program) 25 | * `average_by_key_use_groupbykey.sh` (shell script to call PySpark) 26 | 27 | * reduceByKey(): 28 | * `average_by_key_use_reducebykey.py` (PySpark program) 29 | * `average_by_key_use_reducebykey.sh` (shell script to call PySpark) 30 | 31 | * aggregateByKey(): 32 | * `average_by_key_use_aggregatebykey.py` (PySpark program) 33 | * `average_by_key_use_aggregatebykey.sh` (shell script to call PySpark) 34 | -------------------------------------------------------------------------------- /code/chap06/average_by_key_use_aggregatebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for finding averages per 3 | # key by using the aggregateByKey() transformation 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export SPARK_PROG="/pyspark_book/code/chap06/average_by_key_use_aggregatebykey.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 12 | 13 | -------------------------------------------------------------------------------- /code/chap06/average_by_key_use_combinebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for finding averages per 3 | # key by using the combineByKey() transformation 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8
| export SPARK_PROG="/pyspark_book/code/chap06/average_by_key_use_combinebykey.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 12 | 13 | -------------------------------------------------------------------------------- /code/chap06/average_by_key_use_groupbykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for finding averages per 3 | # key by using the groupByKey() transformation 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export SPARK_PROG="/pyspark_book/code/chap06/average_by_key_use_groupbykey.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 12 | 13 | -------------------------------------------------------------------------------- /code/chap06/average_by_key_use_reducebykey.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for finding averages per 3 | # key by using the reduceByKey() transformation 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export SPARK_PROG="/pyspark_book/code/chap06/average_by_key_use_reducebykey.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 12 | 13 | -------------------------------------------------------------------------------- /code/chap07/WorldCupPlayers.csv.data.source: -------------------------------------------------------------------------------- 1 | https://www.kaggle.com/abecklas/fifa-world-cup/downloads/WorldCupPlayers.csv/5 2 | -------------------------------------------------------------------------------- /code/chap07/WorldCupPlayers.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap07/WorldCupPlayers.csv.zip -------------------------------------------------------------------------------- /code/chap07/customers.RECORD.FORMAT.txt: -------------------------------------------------------------------------------- 1 | Each record has the following format: 2 | 3 | <customer_id><,><year><,><transaction_id><,><transaction_amount> 4 | -------------------------------------------------------------------------------- /code/chap07/customers.txt: -------------------------------------------------------------------------------- 1 | c1,2019,T0011,20.67 2 | c1,2019,T0012,12.34 3 | c1,2019,T0013,44.30 4 | c1,2018,T0001,20.67 5 | c1,2018,T0002,12.34 6 | c1,2018,T0003,44.30 7 | c2,2019,T0017,744.30 8 | c2,2019,T0018,820.67 9 | c2,2018,T0022,182.34 10 | c2,2018,T0033,494.30 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_add_columns.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_add_columns.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export
SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_add_columns.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_aggregate_multiple_columns.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_aggregate_multiple_columns.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_aggregate_multiple_columns.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_aggregate_single_column.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_aggregate_single_column.py 2 | 3 | +----+------+-----+ 4 | |name| food|price| 5 | +----+------+-----+ 6 | |mary| lemon| 2.0| 7 | |adam| grape| 1.22| 8 | |adam|carrot| 2.44| 9 | |adam|orange| 1.99| 10 | |john|tomato| 1.99| 11 | |john|carrot| 0.45| 12 | |john|banana| 1.29| 13 | |bill| apple| 0.99| 14 | |bill| taco| 2.59| 15 | +----+------+-----+ 16 | 17 | +------------------+ 18 | | sum(price)| 19 | +------------------+ 20 | |14.959999999999999| 21 | +------------------+ 22 | 23 | +-----------------+ 24 | | avg(price)| 25 | +-----------------+ 26 | |1.662222222222222| 27 | +-----------------+ 28 | 29 | +----------+ 30 | |max(price)| 31 | +----------+ 32 | | 2.59| 33 | +----------+ 34 | 35 | +----------+ 36 | |min(price)| 37 | +----------+ 38 | | 0.45| 39 | +----------+ 40 | 41 | +----+------------------+ 42 | |name| avg(price)| 43 | +----+------------------+ 44 | |adam|1.8833333333333335| 45 | |mary| 2.0| 46 | |john|1.2433333333333334| 47 | |bill| 1.79| 48 | +----+------------------+ 49 | 50 | +----+----------+ 51 | |name|max(price)| 52 | +----+----------+ 53 | |adam| 2.44| 54 | |mary| 2.0| 55 | |john| 1.99| 56 | |bill| 2.59| 57 | +----+----------+ 58 | 59 | +----+----------+ 60 | |name|min(price)| 61 | +----+----------+ 62 | |adam| 1.22| 63 | |mary| 2.0| 64 | |john| 0.45| 65 | |bill| 0.99| 66 | +----+----------+ 67 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_aggregate_single_column.sh: -------------------------------------------------------------------------------- 1 | #---------------------------------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_aggregate_single_column.py 3 | #---------------------------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #---------------------------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_aggregate_single_column.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_call_udf.log: 
-------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_call_udf.py 2 | 3 | squareof3= 9 4 | 5 | squareof7= 49 6 | 7 | data= [('alex', 5), ('jane', 7), ('bob', 9)] 8 | 9 | +----+---+ 10 | |name|age| 11 | +----+---+ 12 | |alex| 5| 13 | |jane| 7| 14 | | bob| 9| 15 | +----+---+ 16 | 17 | +----+---+-----------+ 18 | |name|age|age_squared| 19 | +----+---+-----------+ 20 | |alex| 5| 25| 21 | |jane| 7| 49| 22 | | bob| 9| 81| 23 | +----+---+-----------+ 24 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_call_udf.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_call_udf.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_call_udf.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_cvs_no_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_cvs_no_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap07/emps_no_header.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_cvs_no_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_cvs_with_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_cvs_with_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap07/emps_with_header.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_cvs_with_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_from_collections.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_from_collections.py 2 | data= [('k1', 2), ('k1', 3), ('k1', 5), ('k2', 7), ('k2', 9), ('k3', 8)] 3 | 4 | +---+---+ 5 | | _1| _2| 6 | +---+---+ 7 | | k1| 2| 8 | | k1| 3| 9 | | k1| 5| 10 | | k2| 7| 11 | | k2| 9| 12 | | k3| 8| 13 | +---+---+ 14 | 15 | +------+--------+ 16 | |my_key|my_value| 17 | +------+--------+ 18 | | k1| 2| 19 | | k1| 3| 20 | | k1| 5| 21 | | k2| 7| 22 | | k2| 9| 23 | | k3| 8| 24 | +------+--------+ 25 | 26 | +------+--------+ 27 | |my_key|my_value| 
28 | +------+--------+ 29 | | k2| 7| 30 | | k2| 9| 31 | | k3| 8| 32 | +------+--------+ 33 | 34 | +------+----+ 35 | |my_key|size| 36 | +------+----+ 37 | | k2| 2| 38 | | k1| 3| 39 | | k3| 1| 40 | +------+----+ 41 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_from_collections.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_collections.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | #export INPUT_FILE="/pyspark_book/code/chap07/emps_no_header.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_from_collections.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 12 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_from_pandas.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_from_pandas.py 2 | 3 | panda_dataframe = 4 | integers floats int_arrays 5 | 0 2 1.2 [6] 6 | 1 5 -2.0 [1, 2] 7 | 2 7 1.5 [3, 4, 5] 8 | 3 8 2.7 [6, 7, 8, 9] 9 | 4 9 3.6 [10, 11, 12] 10 | 11 | spark_df = 12 | DataFrame 13 | [ 14 | integers: bigint, 15 | floats: double, 16 | int_arrays: array 17 | ] 18 | 19 | spark_df.show(): 20 | +--------+------+------------+ 21 | |integers|floats| int_arrays| 22 | +--------+------+------------+ 23 | | 2| 1.2| [6]| 24 | | 5| -2.0| [1, 2]| 25 | | 7| 1.5| [3, 4, 5]| 26 | | 8| 2.7|[6, 7, 8, 9]| 27 | | 9| 3.6|[10, 11, 12]| 28 | +--------+------+------------+ 29 | 30 | pandas_df = 31 | integers floats int_arrays 32 | 0 2 1.2 [6] 33 | 1 5 -2.0 [1, 2] 34 | 2 7 1.5 [3, 4, 5] 35 | 3 8 2.7 [6, 7, 8, 9] 36 | 4 9 3.6 [10, 11, 12] -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_from_pandas.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_pandas.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_from_pandas.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_from_rows.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_from_rows.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_from_rows.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- 
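Note: dataframe_creation_from_pandas.log above shows the result of converting a pandas DataFrame to a Spark DataFrame and back. A minimal sketch of that round trip is given below; the column values mirror the log, but the variable names are assumptions since the .py source is not included in this listing.

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("dataframe_creation_from_pandas").getOrCreate()

# build a pandas DataFrame with the same columns shown in the log
panda_dataframe = pd.DataFrame({
    'integers': [2, 5, 7, 8, 9],
    'floats': [1.2, -2.0, 1.5, 2.7, 3.6],
    'int_arrays': [[6], [1, 2], [3, 4, 5], [6, 7, 8, 9], [10, 11, 12]]
})

# pandas -> Spark: the schema (bigint, double, array<bigint>) is inferred
spark_df = spark.createDataFrame(panda_dataframe)
spark_df.show()

# Spark -> pandas: collects the rows back to the driver
pandas_df = spark_df.toPandas()
print(pandas_df)

spark.stop()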
/code/chap07/dataframe_creation_order_by.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_creation_order_by.py 2 | 3 | data= [('A', 8), ('B', 3), ('A', 4), ('B', 2), ('Z', 7)] 4 | 5 | +---+-----+ 6 | | id|value| 7 | +---+-----+ 8 | | A| 8| 9 | | B| 3| 10 | | A| 4| 11 | | B| 2| 12 | | Z| 7| 13 | +---+-----+ 14 | 15 | +---+-----+ 16 | | id|value| 17 | +---+-----+ 18 | | A| 8| 19 | | A| 4| 20 | | B| 3| 21 | | B| 2| 22 | | Z| 7| 23 | +---+-----+ 24 | 25 | +---+-----+ 26 | | id|value| 27 | +---+-----+ 28 | | Z| 7| 29 | | B| 3| 30 | | B| 2| 31 | | A| 8| 32 | | A| 4| 33 | +---+-----+ 34 | 35 | +---+-----+ 36 | | id|value| 37 | +---+-----+ 38 | | B| 2| 39 | | B| 3| 40 | | A| 4| 41 | | Z| 7| 42 | | A| 8| 43 | +---+-----+ 44 | 45 | +---+-----+ 46 | | id|value| 47 | +---+-----+ 48 | | A| 8| 49 | | Z| 7| 50 | | A| 4| 51 | | B| 3| 52 | | B| 2| 53 | +---+-----+ 54 | 55 | +---+-----+ 56 | | id|value| 57 | +---+-----+ 58 | | A| 4| 59 | | A| 8| 60 | | B| 2| 61 | | B| 3| 62 | | Z| 7| 63 | +---+-----+ 64 | 65 | +---+-----+ 66 | | id|value| 67 | +---+-----+ 68 | | A| 8| 69 | | A| 4| 70 | | B| 3| 71 | | B| 2| 72 | | Z| 7| 73 | +---+-----+ 74 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_order_by.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_order_by.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_order_by.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_creation_with_explicit_schema.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_creation_with_explicit_schema.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap07/emps_no_header.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_creation_with_explicit_schema.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/dataframe_crosstab.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_crosstab.py 2 | 3 | +---+-----+ 4 | |key|value| 5 | +---+-----+ 6 | | 1| 1| 7 | | 1| 2| 8 | | 2| 1| 9 | | 2| 1| 10 | | 2| 3| 11 | | 3| 2| 12 | | 3| 3| 13 | | 4| 4| 14 | +---+-----+ 15 | 16 | +---------+---+---+---+---+ 17 | |key_value| 1| 2| 3| 4| 18 | +---------+---+---+---+---+ 19 | | 2| 2| 0| 1| 0| 20 | | 4| 0| 0| 0| 1| 21 | | 1| 1| 1| 0| 0| 22 | | 3| 0| 1| 1| 0| 23 | +---------+---+---+---+---+ 24 | -------------------------------------------------------------------------------- /code/chap07/dataframe_crosstab.sh: 
-------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_crosstab.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_crosstab.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_drop_column.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_drop_column.py 2 | 3 | data = 4 | [ 5 | (100, 'a', 1.0), 6 | (200, 'b', 2.0), 7 | (300, 'c', 3.0), 8 | (400, 'd', 4.0) 9 | ] 10 | 11 | columns = ('id', 'code', 'scale') 12 | 13 | +---+----+-----+ 14 | | id|code|scale| 15 | +---+----+-----+ 16 | |100| a| 1.0| 17 | |200| b| 2.0| 18 | |300| c| 3.0| 19 | |400| d| 4.0| 20 | +---+----+-----+ 21 | 22 | +---+----+ 23 | | id|code| 24 | +---+----+ 25 | |100| a| 26 | |200| b| 27 | |300| c| 28 | |400| d| 29 | +---+----+ 30 | 31 | +---+-----+ 32 | | id|scale| 33 | +---+-----+ 34 | |100| 1.0| 35 | |200| 2.0| 36 | |300| 3.0| 37 | |400| 4.0| 38 | +---+-----+ -------------------------------------------------------------------------------- /code/chap07/dataframe_drop_column.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_drop_column.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_drop_column.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_drop_duplicates.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit dataframe_drop_duplicates.py 2 | 3 | data = 4 | [ 5 | (100, 'a', 1.0), 6 | (100, 'a', 1.0), 7 | (200, 'b', 2.0), 8 | (300, 'c', 3.0), 9 | (300, 'c', 3.0), 10 | (400, 'd', 4.0) 11 | ] 12 | 13 | columns = ('id', 'code', 'scale') 14 | 15 | +---+----+-----+ 16 | | id|code|scale| 17 | +---+----+-----+ 18 | |100| a| 1.0| 19 | |100| a| 1.0| 20 | |200| b| 2.0| 21 | |300| c| 3.0| 22 | |300| c| 3.0| 23 | |400| d| 4.0| 24 | +---+----+-----+ 25 | 26 | +---+----+-----+ 27 | | id|code|scale| 28 | +---+----+-----+ 29 | |200| b| 2.0| 30 | |300| c| 3.0| 31 | |400| d| 4.0| 32 | |100| a| 1.0| 33 | +---+----+-----+ 34 | -------------------------------------------------------------------------------- /code/chap07/dataframe_drop_duplicates.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_drop_duplicates.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_drop_duplicates.py" 8 | # 9 | # run the PySpark program: 
10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_multi_dim_agg_groupby.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_multi_dim_agg_groupby.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_multi_dim_agg_groupby.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_multi_dim_agg_rollup.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_multi_dim_agg_rollup.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_multi_dim_agg_rollup.py" 8 | # 9 | # run the PySpark program: 10 | $SPARK_HOME/bin/spark-submit $SPARK_PROG 11 | -------------------------------------------------------------------------------- /code/chap07/dataframe_tutorial_with_worldcup.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_tutorial_with_worldcup.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # 8 | export INPUT_PATH="/pyspark_book/code/chap07/WorldCupPlayers.csv" 9 | # source of input data: 10 | # https://www.kaggle.com/abecklas/fifa-world-cup/downloads/WorldCupPlayers.csv/5 11 | # 12 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_tutorial_with_worldcup.py" 13 | # 14 | # run the PySpark program: 15 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH 16 | -------------------------------------------------------------------------------- /code/chap07/dataframe_with_statistical_data.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run dataframe_with_statistical_data.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap07/life_expentancy.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap07/dataframe_with_statistical_data.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap07/emps_no_header.txt: -------------------------------------------------------------------------------- 1 | 1001,alex,67000,SALES 2 | 1002,bob,24000,SALES 3 | 1003,boby,24000,SALES 4 | 1004,jane,69000,SOFTWARE 5 | 1005,betty,55000,SOFTWARE 6 | 1006,jeff,59000,SOFTWARE 
7 | 1007,dara,72000,SOFTWARE 8 | -------------------------------------------------------------------------------- /code/chap07/emps_with_header.txt: -------------------------------------------------------------------------------- 1 | id,name,salary,dept 2 | 1001,alex,67000,SALES 3 | 1002,bob,24000,SALES 4 | 1003,boby,24000,SALES 5 | 1004,jane,69000,SOFTWARE 6 | 1005,betty,55000,SOFTWARE 7 | 1006,jeff,59000,SOFTWARE 8 | 1007,dara,72000,SOFTWARE 9 | -------------------------------------------------------------------------------- /code/chap07/partition_data_by_customer_and_year.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run the following program: 3 | # partition_data_by_customer_and_year.py 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export INPUT_PATH="/pyspark_book/code/chap07/customers.txt" 9 | export OUTPUT_PATH="/tmp/partition_demo" 10 | export SPARK_PROG="/pyspark_book/code/chap07/partition_data_by_customer_and_year.py" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH $OUTPUT_PATH 14 | -------------------------------------------------------------------------------- /code/chap07/strings-2.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap07/strings-2.parquet -------------------------------------------------------------------------------- /code/chap07/users.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap07/users.parquet -------------------------------------------------------------------------------- /code/chap07/users4.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap07/users4.parquet -------------------------------------------------------------------------------- /code/chap08/cats.no.header.csv: -------------------------------------------------------------------------------- 1 | cuttie,2,female,6 2 | mono,3,male,9 3 | fuzzy,1,female,4 4 | -------------------------------------------------------------------------------- /code/chap08/cats.with.header.csv: -------------------------------------------------------------------------------- 1 | name,age,gender,weight 2 | cuttie,2,female,6 3 | mono,3,male,9 4 | fuzzy,1,female,4 5 | -------------------------------------------------------------------------------- /code/chap08/datasource_csv_reader_no_header.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit datasource_csv_reader_no_header.py sample_no_header.csv 2 | 3 | spark= 4 | 5 | input path : sample_no_header.csv 6 | 7 | file_contents = 8 | Alex,Sunnyvale,30 9 | Mary,Cupertino,28 10 | Jane,Stanford,44 11 | Bob,Ames,33 12 | 13 | df = 14 | [ 15 | Row(_c0=u'Alex', _c1=u'Sunnyvale', _c2=30), 16 | Row(_c0=u'Mary', _c1=u'Cupertino', _c2=28), 17 | Row(_c0=u'Jane', _c1=u'Stanford', _c2=44), 18 | Row(_c0=u'Bob', _c1=u'Ames', _c2=33) 19 | ] 20 | 
21 | +----+---------+---+ 22 | | _c0| _c1|_c2| 23 | +----+---------+---+ 24 | |Alex|Sunnyvale| 30| 25 | |Mary|Cupertino| 28| 26 | |Jane| Stanford| 44| 27 | | Bob| Ames| 33| 28 | +----+---------+---+ 29 | 30 | root 31 | |-- _c0: string (nullable = true) 32 | |-- _c1: string (nullable = true) 33 | |-- _c2: integer (nullable = true) 34 | 35 | +----+---------+---+ 36 | |name| city|age| 37 | +----+---------+---+ 38 | |Alex|Sunnyvale| 30| 39 | |Mary|Cupertino| 28| 40 | |Jane| Stanford| 44| 41 | | Bob| Ames| 33| 42 | +----+---------+---+ 43 | 44 | root 45 | |-- name: string (nullable = true) 46 | |-- city: string (nullable = true) 47 | |-- age: integer (nullable = true) 48 | -------------------------------------------------------------------------------- /code/chap08/datasource_csv_reader_no_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_csv_reader_no_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap08/sample_no_header.csv" 8 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_csv_reader_no_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap08/datasource_csv_reader_with_header.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit datasource_csv_reader_with_header.py sample_with_header.csv 2 | 3 | spark= 4 | 5 | input path : sample_with_header.csv 6 | 7 | file_contents = 8 | name,city,age 9 | Alex,Sunnyvale,30 10 | Mary,Cupertino,28 11 | Jane,Stanford,44 12 | Bob,Ames,33 13 | 14 | df.count() = 4 15 | df.collect() = 16 | [ 17 | Row(name=u'Alex', city=u'Sunnyvale', age=30), 18 | Row(name=u'Mary', city=u'Cupertino', age=28), 19 | Row(name=u'Jane', city=u'Stanford', age=44), 20 | Row(name=u'Bob', city=u'Ames', age=33) 21 | ] 22 | 23 | +----+---------+---+ 24 | |name| city|age| 25 | +----+---------+---+ 26 | |Alex|Sunnyvale| 30| 27 | |Mary|Cupertino| 28| 28 | |Jane| Stanford| 44| 29 | | Bob| Ames| 33| 30 | +----+---------+---+ 31 | 32 | root 33 | |-- name: string (nullable = true) 34 | |-- city: string (nullable = true) 35 | |-- age: integer (nullable = true) 36 | -------------------------------------------------------------------------------- /code/chap08/datasource_csv_reader_with_header.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_csv_reader_with_header.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap08/sample_with_header.csv" 8 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_csv_reader_with_header.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap08/datasource_csv_writer.sh: 
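The two logs above show datasource_csv_reader_no_header.py (columns arrive as _c0, _c1, _c2 and are renamed afterwards) and datasource_csv_reader_with_header.py (column names taken from the header row), and the shell script below drives datasource_csv_writer.py, which writes a DataFrame out to /tmp/output.csv. A minimal PySpark sketch of these read and write paths, assuming the column names and paths shown in the logs and script (the program structure itself is an assumption, not the book's exact code):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("csv_read_write_sketch").getOrCreate()

# no header: Spark assigns _c0, _c1, _c2; rename the columns afterwards
df_no_header = spark.read.csv("sample_no_header.csv", header=False, inferSchema=True)
df_named = df_no_header.toDF("name", "city", "age")
df_named.show()
df_named.printSchema()

# with header: column names come straight from the first row
df_with_header = spark.read.csv("sample_with_header.csv", header=True, inferSchema=True)
df_with_header.show()

# write a DataFrame back out as CSV (one part file per partition under the target directory)
df_with_header.write.mode("overwrite").csv("/tmp/output.csv", header=True)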
-------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run datasource_csv_writer.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_csv_writer.py" 8 | export OUTPUT_CSV_FILE_PATH="/tmp/output.csv" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG ${OUTPUT_CSV_FILE_PATH} 12 | -------------------------------------------------------------------------------- /code/chap08/datasource_elasticsearch_reader.log: -------------------------------------------------------------------------------- 1 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 2 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_elasticsearch_reader.py" 3 | export ELASTIC_SEARCH_HOST="localhost" 4 | export JAR="/pyspark_book/jars/elasticsearch-hadoop-6.4.2.jar" 5 | # 6 | # run the PySpark program: 7 | $SPARK_HOME/bin/spark-submit --jars "${JAR}" $SPARK_PROG ${ELASTIC_SEARCH_HOST} 8 | 9 | es_hostname : localhost 10 | 11 | spark= 12 | 13 | rs_rdd : MapPartitionsRDD[2] at mapPartitions at SerDeUtil.scala:244 14 | 15 | es_rdd.count() : 4 16 | 17 | es_rdd.collect(): 18 | [ 19 | ('100', {'key1': 'some_value1', 'doc_id': 100}), 20 | ('200', {'key2': 'some_value2', 'doc_id': 200}), 21 | ('300', {'key3': 'some_value3', 'doc_id': 300}), 22 | ('400', {'key4': 'some_value4', 'doc_id': 400}) 23 | ] -------------------------------------------------------------------------------- /code/chap08/datasource_elasticsearch_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run datasource_elasticsearch_reader.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_elasticsearch_reader.py" 8 | export ELASTIC_SEARCH_HOST="localhost" 9 | export JAR="/pyspark_book/jars/elasticsearch-hadoop-6.4.2.jar" 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit --jars "${JAR}" $SPARK_PROG ${ELASTIC_SEARCH_HOST} 13 | -------------------------------------------------------------------------------- /code/chap08/datasource_elasticsearch_writer.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run datasource_elasticsearch_writer.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_elasticsearch_writer.py" 8 | export ELASTIC_SEARCH_HOST="localhost" 9 | export JAR="/pyspark_book/jars/elasticsearch-hadoop-6.4.2.jar" 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit --jars "${JAR}" $SPARK_PROG ${ELASTIC_SEARCH_HOST} 13 | -------------------------------------------------------------------------------- /code/chap08/datasource_gzip_reader.log: 
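The log that follows shows datasource_gzip_reader.py reading two gzipped text files in one pass. Spark's textFile() decompresses .gz input transparently and accepts a comma-separated list of paths, so a minimal sketch of the program might look like this (variable names mirror the log; everything else is an assumption):

import sys
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("gzip_reader_sketch").getOrCreate()

# e.g. "z11.file.txt.gz,z22.file.txt.gz" -- a comma-separated list of input files
gz_input_path = sys.argv[1]

# .gz files are decompressed on the fly; note that gzip is not a splittable format,
# so each .gz file is read by a single task
gzip_rdd = spark.sparkContext.textFile(gz_input_path)
print("gzip_rdd.count() = %d" % gzip_rdd.count())
print(gzip_rdd.collect())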
-------------------------------------------------------------------------------- 1 | $ cat z1.file.txt 2 | z1: record1 3 | z1: record2 4 | z1: record3 5 | 6 | $ cat z2.file.txt 7 | z2: record1 8 | z2: record2 9 | z2: record3 10 | z2: record4 11 | 12 | # gzip the files: z11.file.txt and z22.file.txt 13 | $ cp z1.file.txt z11.file.txt 14 | $ cp z2.file.txt z22.file.txt 15 | $ gzip z11.file.txt 16 | $ gzip z22.file.txt 17 | 18 | $ ls -l z*gz 19 | -rw-r--r-- 1 52 z11.file.txt.gz 20 | -rw-r--r-- 1 55 z22.file.txt.gz 21 | 22 | $ export INPUT_PATH="z11.file.txt.gz,z22.file.txt.gz" 23 | 24 | $ ./bin/spark-submit datasource_gzip_reader.py $INPUT_PATH 25 | 26 | spark= 27 | 28 | gz_input_path : z11.file.txt.gz,z22.file.txt.gz 29 | 30 | gzip_rdd = z11.file.txt.gz,z22.file.txt.gz MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 31 | gzip_rdd.count() = 7 32 | gzip_rdd.collect() = 33 | [ 34 | u'z1: record1', 35 | u'z1: record2', 36 | u'z1: record3', 37 | u'z2: record1', 38 | u'z2: record2', 39 | u'z2: record3', 40 | u'z2: record4' 41 | ] -------------------------------------------------------------------------------- /code/chap08/datasource_gzip_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_gzip_reader.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | FILE1=z11.file.txt.gz 8 | FILE2=z22.file.txt.gz 9 | export INPUT_FILE="/pyspark_book/code/chap08/${FILE1},/pyspark_book/code/chap08/${FILE2}" 10 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_gzip_reader.py" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 14 | -------------------------------------------------------------------------------- /code/chap08/datasource_jdbc_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_jdbc_reader.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_jdbc_reader.py" 8 | # 9 | # define the required MySQL database connection parameters 10 | export JDBC_URL="jdbc:mysql://localhost/metadb" 11 | export JDBC_DRIVER="com.mysql.jdbc.Driver" 12 | export JDBC_USER="root" 13 | export JDBC_PASSWORD="mp22_pass" 14 | export JDBC_SOURCE_TABLE_NAME="dept" 15 | # 16 | # define the required JAR file for MySQL database access 17 | export JAR="/pyspark_book/code/jars/mysql-connector-java-5.1.42.jar" 18 | # 19 | # run the PySpark program: 20 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${JDBC_URL} ${JDBC_DRIVER} ${JDBC_USER} ${JDBC_PASSWORD} ${JDBC_SOURCE_TABLE_NAME} 21 | -------------------------------------------------------------------------------- /code/chap08/datasource_jdbc_writer.sh: -------------------------------------------------------------------------------- 1 | #-------------------------------------------------------- 2 | # This is a shell script to run datasource_jdbc_writer.py 3 | #-------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 |
#-------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_jdbc_writer.py" 8 | # 9 | # define the required MySQL database connection parameters 10 | export JDBC_URL="jdbc:mysql://localhost/metadb" 11 | export JDBC_DRIVER="com.mysql.jdbc.Driver" 12 | export JDBC_USER="root" 13 | export JDBC_PASSWORD="mp22_pass" 14 | export JDBC_TARGET_TABLE_NAME="people" 15 | # 16 | # define the required JAR file for MySQL database access 17 | export JAR="/pyspark_book/code/jars/mysql-connector-java-5.1.42.jar" 18 | # 19 | # run the PySpark program: 20 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${JDBC_URL} ${JDBC_DRIVER} ${JDBC_USER} ${JDBC_PASSWORD} ${JDBC_TARGET_TABLE_NAME} 21 | -------------------------------------------------------------------------------- /code/chap08/datasource_json_reader_multi_line.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_json_reader_multi_line.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap08/sample_multi_line.json" 8 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_json_reader_multi_line.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap08/datasource_json_reader_single_line.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_json_reader_single_line.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap08/sample_single_line.json" 8 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_json_reader_single_line.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap08/datasource_mongodb_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_mongodb_reader.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_mongodb_reader.py" 8 | export MONGODB_COLLECTION_URI="mongodb://127.0.0.1/test.coll44" 9 | export JAR1="/pyspark_book/code/jars/mongo-java-driver-3.8.2.jar" 10 | export JAR2="/pyspark_book/code/jars/mongo-spark-connector_2.11-2.2.5.jar" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --jars "${JAR1},${JAR2}" $SPARK_PROG ${MONGODB_COLLECTION_URI} 14 | -------------------------------------------------------------------------------- /code/chap08/datasource_mongodb_writer.log: 
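The log that follows shows datasource_mongodb_writer.py building a small people DataFrame and saving it to the MongoDB collection given by the URI argument. A minimal sketch, assuming the mongo-spark-connector DataFrame API shipped with the JARs listed in the scripts (the rows come from the log; the program structure and option usage are assumptions):

import sys
from pyspark.sql import SparkSession

mongodb_collection_uri = sys.argv[1]   # e.g. mongodb://127.0.0.1/test.coll66

spark = SparkSession.builder \
    .appName("mongodb_writer_sketch") \
    .config("spark.mongodb.output.uri", mongodb_collection_uri) \
    .getOrCreate()

people = spark.createDataFrame(
    [("Alex", "Ames", 50), ("Gandalf", "Cupertino", 60), ("Thorin", "Sunnyvale", 95),
     ("Betty", "Ames", 78), ("Brian", "Stanford", 77)],
    ["name", "city", "age"])
people.show(truncate=False)

# write the DataFrame to MongoDB through the mongo-spark-connector data source
people.write.format("com.mongodb.spark.sql.DefaultSource") \
    .mode("append") \
    .option("uri", mongodb_collection_uri) \
    .save()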
-------------------------------------------------------------------------------- 1 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 2 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_mongodb_writer.py" 3 | export MONGODB_COLLECTION_URI="mongodb://127.0.0.1/test.coll66" 4 | export JAR1="/pyspark_book/code/jars/mongo-java-driver-3.8.2.jar" 5 | export JAR2="/pyspark_book/code/jars/mongo-spark-connector_2.11-2.2.5.jar" 6 | # 7 | # run the PySpark program: 8 | $SPARK_HOME/bin/spark-submit --jars "${JAR1},${JAR2}" $SPARK_PROG ${MONGODB_COLLECTION_URI} 9 | 10 | mongodb_collection_uri : mongodb://127.0.0.1/test.coll66 11 | 12 | spark= 13 | 14 | +-------+---------+---+ 15 | |name |city |age| 16 | +-------+---------+---+ 17 | |Alex |Ames |50 | 18 | |Gandalf|Cupertino|60 | 19 | |Thorin |Sunnyvale|95 | 20 | |Betty |Ames |78 | 21 | |Brian |Stanford |77 | 22 | +-------+---------+---+ 23 | 24 | people.count() = 5 25 | people.collect() = 26 | [ 27 | Row(name=u'Alex', city=u'Ames', age=50), 28 | Row(name=u'Gandalf', city=u'Cupertino', age=60), 29 | Row(name=u'Thorin', city=u'Sunnyvale', age=95), 30 | Row(name=u'Betty', city=u'Ames', age=78), 31 | Row(name=u'Brian', city=u'Stanford', age=77) 32 | ] 33 | 34 | root 35 | |-- name: string (nullable = true) 36 | |-- city: string (nullable = true) 37 | |-- age: long (nullable = true) -------------------------------------------------------------------------------- /code/chap08/datasource_mongodb_writer.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------------- 2 | # This is a shell script to run datasource_mongodb_reader.py 3 | #----------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_mongodb_writer.py" 8 | export MONGODB_COLLECTION_URI="mongodb://127.0.0.1/test.coll66" 9 | export JAR1="/pyspark_book/code/jars/mongo-java-driver-3.8.2.jar" 10 | export JAR2="/pyspark_book/code/jars/mongo-spark-connector_2.11-2.2.5.jar" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --jars "${JAR1},${JAR2}" $SPARK_PROG ${MONGODB_COLLECTION_URI} 14 | -------------------------------------------------------------------------------- /code/chap08/datasource_redis_reader.log: -------------------------------------------------------------------------------- 1 | JAR="/pyspark_book/code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar" 2 | ./bin/spark-submit --jars ${JAR} datasource_redis_reader.py localhost 6379 3 | 4 | REDIS_HOST = localhost 5 | 6 | REDIS_PORT = 6379 7 | 8 | spark= 9 | 10 | loaded_df = 11 | DataFrame[name: string, city: string, age: bigint] 12 | loaded_df.count(): 5 13 | loaded_df.collect(): 14 | [ 15 | Row(name=u'Brian', city=u'Stanford', age=77), 16 | Row(name=u'Alex', city=u'Ames', age=50), 17 | Row(name=u'Gandalf', city=u'Cupertino', age=60), 18 | Row(name=u'Thorin', city=u'Sunnyvale', age=95), 19 | Row(name=u'Betty', city=u'Ames', age=78) 20 | ] 21 | 22 | +-------+---------+---+ 23 | | name| city|age| 24 | +-------+---------+---+ 25 | | Brian| Stanford| 77| 26 | | Alex| Ames| 50| 27 | |Gandalf|Cupertino| 60| 28 | | Thorin|Sunnyvale| 95| 29 | | Betty| Ames| 78| 30 | +-------+---------+---+ 31 | 32 | root 33 | |-- name: string (nullable = true) 34 | |-- city: string (nullable = true) 35 | |-- age: long (nullable = true) 36 | 
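The datasource_redis_reader.log above shows a DataFrame of (name, city, age) rows being loaded back from Redis through the spark-redis connector JAR. A minimal sketch, assuming spark-redis's DataFrame source name org.apache.spark.sql.redis and a Redis table name of "people" (both assumptions; only the host/port arguments and the printed output come from the log):

import sys
from pyspark.sql import SparkSession

redis_host = sys.argv[1]   # e.g. localhost
redis_port = sys.argv[2]   # e.g. 6379

spark = SparkSession.builder \
    .appName("redis_reader_sketch") \
    .config("spark.redis.host", redis_host) \
    .config("spark.redis.port", redis_port) \
    .getOrCreate()

# load rows previously written under the (assumed) Redis table name "people"
loaded_df = spark.read \
    .format("org.apache.spark.sql.redis") \
    .option("table", "people") \
    .load()

print("loaded_df.count() = %d" % loaded_df.count())
loaded_df.show()
loaded_df.printSchema()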
-------------------------------------------------------------------------------- /code/chap08/datasource_redis_reader.sh: -------------------------------------------------------------------------------- 1 | #-------------------------------------------------------- 2 | # This is a shell script to run datasource_redis_reader.py 3 | #-------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #-------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_redis_reader.py" 8 | # 9 | # define the required redis database connection parameters 10 | export REDIS_HOST="localhost" 11 | export REDIS_PORT="6379" 12 | # you may add password 13 | #export REDIS_PASSWORD="" 14 | # 15 | # define the required JAR file for redis database access 16 | export JAR="/pyspark_book/code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar" 17 | # 18 | # run the PySpark program: 19 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${REDIS_HOST} ${REDIS_PORT} 20 | -------------------------------------------------------------------------------- /code/chap08/datasource_redis_writer.sh: -------------------------------------------------------------------------------- 1 | #-------------------------------------------------------- 2 | # This is a shell script to run datasource_redis_writer.py 3 | #-------------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #-------------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_redis_writer.py" 8 | # 9 | # define the required redis database connection parameters 10 | export REDIS_HOST="localhost" 11 | export REDIS_PORT="6379" 12 | # you may add password 13 | #export REDIS_PASSWORD="" 14 | # 15 | # define the required JAR file for redis database access 16 | export JAR="/pyspark_book/code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar" 17 | # 18 | # run the PySpark program: 19 | $SPARK_HOME/bin/spark-submit --jars ${JAR} ${SPARK_PROG} ${REDIS_HOST} ${REDIS_PORT} 20 | -------------------------------------------------------------------------------- /code/chap08/datasource_textfile_reader.log: -------------------------------------------------------------------------------- 1 | $ cat sample_numbers.txt 2 | 123,344,455,6666,2,300 3 | 7777,4444,55 4 | 22,34 5 | 900,901,902,9000,5600,5600,5700,45 6 | 45 7 | 70,71,72 8 | 9 | $ export INPUT_PATH="sample_numbers.txt" 10 | $ ./bin/spark-submit datasource_textfile_reader.py ${INPUT_PATH} 11 | 12 | spark= 13 | 14 | input_path : sample_numbers.txt 15 | 16 | file_contents = 17 | 123,344,455,6666,2,300 18 | 7777,4444,55 19 | 22,34 20 | 900,901,902,9000,5600,5600,5700,45 21 | 45 22 | 70,71,72 23 | 24 | records = sample_numbers.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 25 | records.count() = 6 26 | records.collect() = 27 | [ 28 | u'123,344,455,6666,2,300', 29 | u'7777,4444,55', 30 | u'22,34', 31 | u'900,901,902,9000,5600,5600,5700,45', 32 | u'45', 33 | u'70,71,72' 34 | ] 35 | numbers = PythonRDD[3] at RDD at PythonRDD.scala:48 36 | numbers.count() = 23 37 | numbers.collect() = 38 | [ 39 | 123, 40 | 344, 41 | 455, 42 | 6666, 43 | 2, 44 | 300, 45 | 7777, 46 | 4444, 47 | 55, 48 | 22, 49 | 34, 50 | 900, 51 | 901, 52 | 902, 53 | 9000, 54 | 5600, 55 | 5600, 56 | 5700, 57 | 45, 58 | 45, 59 | 70, 60 | 71, 61 | 72 62 | ] 
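The datasource_textfile_reader.log above reads sample_numbers.txt as plain text (6 records) and then flattens the comma-separated values into 23 individual numbers. A minimal sketch of that flow (variable names follow the log; the rest is an assumption):

import sys
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("textfile_reader_sketch").getOrCreate()
input_path = sys.argv[1]   # e.g. sample_numbers.txt

# one RDD element per line of the text file
records = spark.sparkContext.textFile(input_path)
print("records.count() = %d" % records.count())

# split each line on "," and flatten into a single RDD of integers
numbers = records.flatMap(lambda rec: [int(n) for n in rec.split(",")])
print("numbers.count() = %d" % numbers.count())
print(numbers.collect())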
-------------------------------------------------------------------------------- /code/chap08/datasource_textfile_reader.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_textfile_reader.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_FILE="/pyspark_book/code/chap08/sample_numbers.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_textfile_reader.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_FILE 12 | -------------------------------------------------------------------------------- /code/chap08/datasource_textfile_writer.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit datasource_textfile_writer.py /tmp/zoutput 2 | 3 | spark= 4 | 5 | output_path : /tmp/zoutput 6 | 7 | data = 8 | [ 9 | 'data element 1', 10 | 'data element 2', 11 | 'data element 3', 12 | 'data element 4' 13 | ] 14 | records = ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 15 | records.count() = 4 16 | records.collect() = 17 | [ 18 | 'data element 1', 19 | 'data element 2', 20 | 'data element 3', 21 | 'data element 4' 22 | ] 23 | loaded_records = /tmp/zoutput MapPartitionsRDD[6] at textFile at NativeMethodAccessorImpl.java:0 24 | loaded_records.count() = 4 25 | loaded_records.collect() = 26 | [ 27 | u'data element 3', 28 | u'data element 2', 29 | u'data element 1', 30 | u'data element 4' 31 | ] 32 | 33 | $ ls -l /tmp/zoutput/ 34 | total 32 35 | -rw-r--r-- 1 mparsian wheel 0 Nov 7 20:50 _SUCCESS 36 | -rw-r--r-- 1 mparsian wheel 0 Nov 7 20:50 part-00000 37 | -rw-r--r-- 1 mparsian wheel 15 Nov 7 20:50 part-00001 38 | -rw-r--r-- 1 mparsian wheel 0 Nov 7 20:50 part-00002 39 | -rw-r--r-- 1 mparsian wheel 15 Nov 7 20:50 part-00003 40 | -rw-r--r-- 1 mparsian wheel 0 Nov 7 20:50 part-00004 41 | -rw-r--r-- 1 mparsian wheel 15 Nov 7 20:50 part-00005 42 | -rw-r--r-- 1 mparsian wheel 0 Nov 7 20:50 part-00006 43 | -rw-r--r-- 1 mparsian wheel 15 Nov 7 20:50 part-00007 44 | 45 | $ cat /tmp/zoutput/part* 46 | data element 1 47 | data element 2 48 | data element 3 49 | data element 4 50 | -------------------------------------------------------------------------------- /code/chap08/datasource_textfile_writer.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run datasource_textfile_writer.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export OUTPUT_PATH="/tmp/zoutput" 8 | export SPARK_PROG="/pyspark_book/code/chap08/datasource_textfile_writer.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $OUTPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap08/images/cat1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/images/cat1.jpg 
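The datasource_textfile_writer.log above writes a four-element RDD with saveAsTextFile() and reads it back, which is why the output directory contains several part-xxxxx files (one per partition) and why the reloaded order differs from the original. A minimal sketch of that round trip (the element strings come from the log; everything else is an assumption):

import sys
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("textfile_writer_sketch").getOrCreate()
output_path = sys.argv[1]   # e.g. /tmp/zoutput

data = ["data element 1", "data element 2", "data element 3", "data element 4"]
records = spark.sparkContext.parallelize(data)

# each partition becomes one part-xxxxx file under output_path
records.saveAsTextFile(output_path)

# read the saved records back; ordering across part files is not guaranteed
loaded_records = spark.sparkContext.textFile(output_path)
print("loaded_records.count() = %d" % loaded_records.count())
print(loaded_records.collect())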
-------------------------------------------------------------------------------- /code/chap08/images/cat2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/images/cat2.jpg -------------------------------------------------------------------------------- /code/chap08/images/cat3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/images/cat3.jpg -------------------------------------------------------------------------------- /code/chap08/images/cat4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/images/cat4.jpg -------------------------------------------------------------------------------- /code/chap08/images/duck1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/images/duck1.jpg -------------------------------------------------------------------------------- /code/chap08/images/duck2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/images/duck2.jpg -------------------------------------------------------------------------------- /code/chap08/images/not-image.txt: -------------------------------------------------------------------------------- 1 | not an image 2 | -------------------------------------------------------------------------------- /code/chap08/mongodb_coll44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/mongodb_coll44.png -------------------------------------------------------------------------------- /code/chap08/mongodb_coll66.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/mongodb_coll66.png -------------------------------------------------------------------------------- /code/chap08/name_age_salary.csv: -------------------------------------------------------------------------------- 1 | alex,60,18000 2 | adel,40,45000 3 | adel,50,77000 4 | jane,40,52000 5 | jane,60,81000 6 | alex,50,62000 7 | mary,50,92000 8 | mary,60,63000 9 | mary,40,55000 10 | mary,40,55000 11 | -------------------------------------------------------------------------------- /code/chap08/people.txt: -------------------------------------------------------------------------------- 1 | Alex,30,Tennis 2 | Betty,40,Swimming 3 | Dave,20,Walking 4 | Jeff,77,Baseball 5 | -------------------------------------------------------------------------------- /code/chap08/sample_multi_line.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"name":"alex","id":100,"scores":[8,1,2,3],"dict": {"key": "value11"}}, 3 | {"name":"jane","id":200,"scores":[4,6],"dict": {"key": "value22"}}, 4 | { 5 | 
"name": "bob", 6 | "id": 300, 7 | "scores": [ 8 | 3, 9 | 4, 10 | 6, 11 | 9 12 | ], 13 | "dict": { 14 | "key": "value33", 15 | "key2": "value44" 16 | } 17 | }, 18 | { 19 | "name": "bob", 20 | "id": 400, 21 | "scores": [ 22 | 3, 23 | 5, 24 | 6, 25 | 9 26 | ], 27 | "dict": { 28 | "key": "value55", 29 | "key2": "value66" 30 | } 31 | } 32 | ] 33 | -------------------------------------------------------------------------------- /code/chap08/sample_no_header.csv: -------------------------------------------------------------------------------- 1 | Alex,Sunnyvale,30 2 | Mary,Cupertino,28 3 | Jane,Stanford,44 4 | Bob,Ames,33 5 | -------------------------------------------------------------------------------- /code/chap08/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 123,344,455,6666,2,300 2 | 7777,4444,55 3 | 22,34 4 | 900,901,902,9000,5600,5600,5700,45 5 | 45 6 | 70,71,72 7 | -------------------------------------------------------------------------------- /code/chap08/sample_single_line.json: -------------------------------------------------------------------------------- 1 | {"name":"alex","id":200,"scores":[1,2],"dict": {"key1": "value11", "key2": "value12"}} 2 | {"name":"bob","id":300,"scores":[1,2,4,6],"dict": {"key1": "value16"}} 3 | {"name":"jane","id":400,"scores":[2,4,6],"dict": {"key4": "value41"}} 4 | {"name":"mary","id":500,"scores":[5,9],"dict": {"key4": "value77", "key3": "value88"}} 5 | -------------------------------------------------------------------------------- /code/chap08/sample_with_header.csv: -------------------------------------------------------------------------------- 1 | name,city,age 2 | Alex,Sunnyvale,30 3 | Mary,Cupertino,28 4 | Jane,Stanford,44 5 | Bob,Ames,33 6 | -------------------------------------------------------------------------------- /code/chap08/twitter.avro: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap08/twitter.avro -------------------------------------------------------------------------------- /code/chap09/logistic_regression_builder.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to build and save an LR model 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export TRAINING_DATA_SPAM="/pyspark_book/code/chap09/training_emails_spam.txt" 8 | export TRAINING_DATA_NOSPAM="/pyspark_book/code/chap09/training_emails_nospam.txt" 9 | export BUILT_MOLDEL_OUTPUT_PATH="/pyspark_book/code/chap09/model" 10 | export SPARK_PROG="/pyspark_book/code/chap09/logistic_regression_builder.py" 11 | # 12 | # Make sure there are no files under output path 13 | rm -fr ${BUILT_MOLDEL_OUTPUT_PATH}/* 14 | # 15 | # run the PySpark program: 16 | $SPARK_HOME/bin/spark-submit ${SPARK_PROG} ${TRAINING_DATA_NOSPAM} ${TRAINING_DATA_SPAM} ${BUILT_MOLDEL_OUTPUT_PATH} 17 | -------------------------------------------------------------------------------- /code/chap09/logistic_regression_predictor.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to load the built LR 3 | # model and to 
predict new emails (new_emails.txt) 4 | # into spam or nospam 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 9 | export NEW_EMAILS="/pyspark_book/code/chap09/new_emails.txt" 10 | export BUILT_MOLDEL_OUTPUT_PATH="/pyspark_book/code/chap09/model" 11 | export SPARK_PROG="/pyspark_book/code/chap09/logistic_regression_predictor.py" 12 | # 13 | # run the PySpark program: 14 | $SPARK_HOME/bin/spark-submit ${SPARK_PROG} ${BUILT_MOLDEL_OUTPUT_PATH} ${NEW_EMAILS} 15 | -------------------------------------------------------------------------------- /code/chap09/model/data/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /code/chap09/model/data/.part-00000-1d219005-0cb4-4a77-98bf-2f69a69655a1-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap09/model/data/.part-00000-1d219005-0cb4-4a77-98bf-2f69a69655a1-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /code/chap09/model/data/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap09/model/data/_SUCCESS -------------------------------------------------------------------------------- /code/chap09/model/data/part-00000-1d219005-0cb4-4a77-98bf-2f69a69655a1-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap09/model/data/part-00000-1d219005-0cb4-4a77-98bf-2f69a69655a1-c000.snappy.parquet -------------------------------------------------------------------------------- /code/chap09/model/metadata/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /code/chap09/model/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap09/model/metadata/.part-00000.crc -------------------------------------------------------------------------------- /code/chap09/model/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/chap09/model/metadata/_SUCCESS -------------------------------------------------------------------------------- /code/chap09/model/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.mllib.classification.LogisticRegressionModel","version":"1.0","numFeatures":128,"numClasses":2} 2 | -------------------------------------------------------------------------------- /code/chap10/test.data: -------------------------------------------------------------------------------- 1 | 1,1,5.0 2 | 1,2,1.0 3 | 1,3,5.0 4 
| 1,4,1.0 5 | 2,1,5.0 6 | 2,2,1.0 7 | 2,3,5.0 8 | 2,4,1.0 9 | 3,1,1.0 10 | 3,2,5.0 11 | 3,3,1.0 12 | 3,4,5.0 13 | 4,1,1.0 14 | 4,2,5.0 15 | 4,3,1.0 16 | 4,4,5.0 17 | -------------------------------------------------------------------------------- /code/chap11/airports.json: -------------------------------------------------------------------------------- 1 | {"id":"ORD","City":"Chicago","State":"IL","Country":"USA"} 2 | {"id":"LGA","City":"New York","State":"NY","Country":"USA"} 3 | {"id":"BOS","City":"Boston","State":"MA","Country":"USA"} 4 | {"id":"IAH","City":"Houston","State":"TX","Country":"USA"} 5 | {"id":"EWR","City":"Newark","State":"NJ","Country":"USA"} 6 | {"id":"DEN","City":"Denver","State":"CO","Country":"USA"} 7 | {"id":"MIA","City":"Miami","State":"FL","Country":"USA"} 8 | {"id":"SFO","City":"San Francisco","State":"CA","Country":"USA"} 9 | {"id":"ATL","City":"Atlanta","State":"GA","Country":"USA"} 10 | {"id":"DFW","City":"Dallas","State":"TX","Country":"USA"} 11 | {"id":"CLT","City":"Charlotte","State":"NC","Country":"USA"} 12 | {"id":"LAX","City":"Los Angeles","State":"CA","Country":"USA"} 13 | {"id":"SEA","City":"Seattle","State":"WA","Country":"USA"} 14 | -------------------------------------------------------------------------------- /code/chap11/breadth_first_search_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. applying Breadth-first search (BFS) algorithm 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 9 | export SPARK_PROG="/pyspark_book/code/chap11/breadth_first_search_example.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap11/connected_component_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. 
finding connected components 5 | # 6 | # Reference: https://en.wikipedia.org/wiki/Connected_component_(graph_theory) 7 | #----------------------------------------------------- 8 | # @author Mahmoud Parsian 9 | #----------------------------------------------------- 10 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 11 | export SPARK_PROG="/pyspark_book/code/chap11/connected_component_example.py" 12 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 13 | # 14 | # run the PySpark program: 15 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 16 | -------------------------------------------------------------------------------- /code/chap11/graph_builder.log: -------------------------------------------------------------------------------- 1 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 2 | export SPARK_PROG="/pyspark_book/code/chap11/graph_builder.py" 3 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 4 | # 5 | # run the PySpark program: 6 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 7 | 8 | +---+-------+---+ 9 | | id| name|age| 10 | +---+-------+---+ 11 | | a| Alice| 34| 12 | | b| Bob| 36| 13 | | c|Charlie| 30| 14 | +---+-------+---+ 15 | 16 | +---+---+------------+ 17 | |src|dst|relationship| 18 | +---+---+------------+ 19 | | a| b| friend| 20 | | b| c| follow| 21 | | c| b| follow| 22 | +---+---+------------+ 23 | 24 | graph= GraphFrame( 25 | v:[id: string, name: string ... 1 more field], 26 | e:[src: string, dst: string ... 1 more field] 27 | ) 28 | 29 | +---+--------+ 30 | | id|inDegree| 31 | +---+--------+ 32 | | c| 1| 33 | | b| 2| 34 | +---+--------+ 35 | 36 | count_follow= 2 -------------------------------------------------------------------------------- /code/chap11/graph_builder.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building a graph using 3 | # GraphFrames package. 4 | #----------------------------------------------------- 5 | # @author Mahmoud Parsian 6 | #----------------------------------------------------- 7 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 8 | export SPARK_PROG="/pyspark_book/code/chap11/graph_builder.py" 9 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 10 | # 11 | # run the PySpark program: 12 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 13 | -------------------------------------------------------------------------------- /code/chap11/label_propagation_algorithm_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. 
applying Label Propagation Algorithm (LPA) 5 | # 6 | # Reference: https://en.wikipedia.org/wiki/Label_Propagation_Algorithm 7 | #----------------------------------------------------- 8 | # @author Mahmoud Parsian 9 | #----------------------------------------------------- 10 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 11 | export SPARK_PROG="/pyspark_book/code/chap11/label_propagation_algorithm_example.py" 12 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 13 | # 14 | # run the PySpark program: 15 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 16 | -------------------------------------------------------------------------------- /code/chap11/pagerank_data.txt: -------------------------------------------------------------------------------- 1 | 1,2 2 | 1,3 3 | 1,4 4 | 2,1 5 | 3,1 6 | 4,1 7 | 4,5 8 | 1,5 9 | -------------------------------------------------------------------------------- /code/chap11/pagerank_example.log: -------------------------------------------------------------------------------- 1 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 2 | export SPARK_PROG="/pyspark_book/code/chap11/pagerank_example.py" 3 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 4 | # 5 | # run the PySpark program: 6 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 7 | 8 | +---+-------+---+ 9 | | id| name|age| 10 | +---+-------+---+ 11 | | a| Alice| 34| 12 | | b| Bob| 36| 13 | | c|Charlie| 30| 14 | +---+-------+---+ 15 | 16 | +---+---+------------+ 17 | |src|dst|relationship| 18 | +---+---+------------+ 19 | | a| b| friend| 20 | | b| c| follow| 21 | | c| b| follow| 22 | +---+---+------------+ 23 | 24 | graph= GraphFrame( 25 | v:[id: string, name: string ... 1 more field], 26 | e:[src: string, dst: string ... 1 more field] 27 | ) 28 | 29 | +---+------------------+ 30 | | id| pagerank| 31 | +---+------------------+ 32 | | b|1.0905890109440908| 33 | | a| 0.01| 34 | | c|1.8994109890559092| 35 | +---+------------------+ 36 | -------------------------------------------------------------------------------- /code/chap11/pagerank_example.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building 3 | # 1. a graph using GraphFrames package. 4 | # 2. 
applying PageRank algorithm to the built graph 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 9 | export SPARK_PROG="/pyspark_book/code/chap11/pagerank_example.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap11/sample_graph_edges.txt: -------------------------------------------------------------------------------- 1 | edge_weight,from_id,to_id 2 | 0,5,15 3 | 1,18,8 4 | 2,6,1 5 | 3,0,10 6 | 4,2,4 7 | 5,19,7 8 | 6,9,7 9 | 7,11,9 10 | 8,14,9 11 | 9,16,11 12 | 10,17,8 13 | 1,3,4 14 | 2,12,15 15 | 3,13,2 16 | 4,21,0 17 | 5,22,4 18 | 16,22,8 19 | 17,24,4 20 | 18,28,7 21 | 19,28,13 22 | 20,28,16 23 | 1,29,11 24 | 2,30,16 25 | 3,31,15 26 | 24,32,2 27 | 25,32,30 28 | 6,35,11 29 | 7,35,24 30 | 28,36,16 31 | 29,39,7 32 | 30,39,28 33 | 1,40,7 34 | 2,40,11 35 | 3,41,5 36 | 4,41,16 37 | 5,41,32 38 | 6,42,32 39 | 7,43,36 40 | 8,44,16 41 | 9,46,7 42 | 6,49,3 43 | 1,5,31 44 | 2,30,42 45 | 4,17,22 46 | 4,18,22 47 | 1,50,51 48 | 2,51,52 49 | 3,50,52 50 | 1,71,72 51 | 1,71,73 52 | 1,72,73 53 | -------------------------------------------------------------------------------- /code/chap11/sample_graph_vertices.txt: -------------------------------------------------------------------------------- 1 | vertex_id 2 | 0 3 | 1 4 | 2 5 | 3 6 | 4 7 | 5 8 | 6 9 | 7 10 | 8 11 | 9 12 | 10 13 | 11 14 | 12 15 | 13 16 | 14 17 | 15 18 | 16 19 | 17 20 | 18 21 | 19 22 | 20 23 | 21 24 | 22 25 | 23 26 | 24 27 | 25 28 | 26 29 | 27 30 | 28 31 | 29 32 | 30 33 | 31 34 | 32 35 | 33 36 | 34 37 | 35 38 | 36 39 | 37 40 | 38 41 | 39 42 | 40 43 | 41 44 | 42 45 | 43 46 | 44 47 | 45 48 | 46 49 | 47 50 | 48 51 | 49 52 | 50 53 | 51 54 | 52 55 | 71 56 | 72 57 | 73 58 | -------------------------------------------------------------------------------- /code/chap11/shortest_path_finder.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for 3 | # 1. building a graph using GraphFrames package. 4 | # 2. finding shortest paths for given landmarks 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 9 | export SPARK_PROG="/pyspark_book/code/chap11/shortest_path_finder.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap11/triangles_counter.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building 3 | # 1. a graph using GraphFrames package. 4 | # 2. 
applying Triangles Counting algorithm to the built graph 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 9 | export SPARK_PROG="/pyspark_book/code/chap11/triangles_counter.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 11 | # 12 | # run the PySpark program: 13 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG 14 | -------------------------------------------------------------------------------- /code/chap11/unique_triangles_finder.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script for building 3 | # 1. a graph using GraphFrames package. 4 | # 2. find unique Triangles from the built graph 5 | #----------------------------------------------------- 6 | # @author Mahmoud Parsian 7 | #----------------------------------------------------- 8 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 9 | export SPARK_PROG="/pyspark_book/code/code/chap11/unique_triangles_finder.py" 10 | export GRAPH_FRAMES="graphframes:graphframes:0.5.0-spark2.1-s_2.11" 11 | # 12 | export VERTICES_PATH="/pyspark_book/code/chap11/sample_graph_vertices.txt" 13 | export EDGES_PATH="/pyspark_book/code/chap11/sample_graph_edges.txt" 14 | # 15 | # run the PySpark program: 16 | $SPARK_HOME/bin/spark-submit --packages $GRAPH_FRAMES $SPARK_PROG $VERTICES_PATH $EDGES_PATH 17 | -------------------------------------------------------------------------------- /code/chap12/average_monoid_use_aggregatebykey.log: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/average_monoid_use_aggregatebykey.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | 10 | input_path: sample_input.txt 11 | 12 | records.count(): 12 13 | 14 | records.collect(): 15 | [ 16 | u'a,2', 17 | u'a,3', 18 | u'a,4', 19 | u'a,5', 20 | u'a,7', 21 | u'b,4', 22 | u'b,5', 23 | u'b,6', 24 | u'c,3', 25 | u'c,4', 26 | u'c,5', 27 | u'c,6' 28 | ] 29 | 30 | pairs.count(): 12 31 | 32 | pairs.collect(): 33 | [ 34 | (u'a', 2), 35 | (u'a', 3), 36 | (u'a', 4), 37 | (u'a', 5), 38 | (u'a', 7), 39 | (u'b', 4), 40 | (u'b', 5), 41 | (u'b', 6), 42 | (u'c', 3), 43 | (u'c', 4), 44 | (u'c', 5), 45 | (u'c', 6) 46 | ] 47 | 48 | sum_count.count(): 3 49 | 50 | sum_count.collect(): 51 | [ 52 | (u'a', (21, 5)), 53 | (u'c', (18, 4)), 54 | (u'b', (15, 3)) 55 | ] 56 | 57 | averages.count(): 3 58 | 59 | averages.collect(): 60 | [ 61 | (u'a', 4.2), 62 | (u'c', 4.5), 63 | (u'b', 5.0) 64 | ] 65 | -------------------------------------------------------------------------------- /code/chap12/average_monoid_use_aggregatebykey.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/average_monoid_use_aggregatebykey.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | 
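The average_monoid_use_aggregatebykey log and script above compute a per-key average by first reducing each key to a (sum, count) pair, which is a monoid, and only dividing at the end. A minimal sketch of the aggregateByKey() step (the data and results match the log; names and structure are assumptions):

import sys
from pyspark.sql import SparkSession

def to_pair(rec):
    # "a,2" -> ("a", 2)
    tokens = rec.split(",")
    return (tokens[0], int(tokens[1]))

spark = SparkSession.builder.appName("average_by_aggregatebykey_sketch").getOrCreate()
input_path = sys.argv[1]   # e.g. sample_input.txt

records = spark.sparkContext.textFile(input_path)
pairs = records.map(to_pair)

# zero value (0, 0) = (sum, count); both merge functions are associative and commutative
sum_count = pairs.aggregateByKey(
    (0, 0),
    lambda C, v: (C[0] + v, C[1] + 1),                 # fold one value into a partition's (sum, count)
    lambda C1, C2: (C1[0] + C2[0], C1[1] + C2[1]))     # merge (sum, count) pairs across partitions

averages = sum_count.mapValues(lambda s_c: float(s_c[0]) / s_c[1])
print(averages.collect())   # e.g. [('a', 4.2), ('c', 4.5), ('b', 5.0)]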
-------------------------------------------------------------------------------- /code/chap12/average_monoid_use_combinebykey.log: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/average_monoid_use_combinebykey.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | 10 | input_path: sample_input.txt 11 | 12 | records.count(): 12 13 | 14 | records.collect(): 15 | [ 16 | u'a,2', 17 | u'a,3', 18 | u'a,4', 19 | u'a,5', 20 | u'a,7', 21 | u'b,4', 22 | u'b,5', 23 | u'b,6', 24 | u'c,3', 25 | u'c,4', 26 | u'c,5', 27 | u'c,6' 28 | ] 29 | 30 | pairs.count(): 12 31 | 32 | pairs.collect(): 33 | [ 34 | (u'a', 2), 35 | (u'a', 3), 36 | (u'a', 4), 37 | (u'a', 5), 38 | (u'a', 7), 39 | (u'b', 4), 40 | (u'b', 5), 41 | (u'b', 6), 42 | (u'c', 3), 43 | (u'c', 4), 44 | (u'c', 5), 45 | (u'c', 6) 46 | ] 47 | 48 | sum_count.count(): 3 49 | 50 | sum_count.collect(): 51 | [ 52 | (u'a', (21, 5)), 53 | (u'c', (18, 4)), 54 | (u'b', (15, 3)) 55 | ] 56 | 57 | averages.count(): 3 58 | 59 | averages.collect(): 60 | [ 61 | (u'a', 4.2), 62 | (u'c', 4.5), 63 | (u'b', 5.0) 64 | ] -------------------------------------------------------------------------------- /code/chap12/average_monoid_use_combinebykey.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/average_monoid_use_combinebykey.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap12/average_monoid_use_groupbykey.log: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/average_monoid_use_groupbykey.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | 10 | input_path: sample_input.txt 11 | 12 | records : sample_input.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 13 | 14 | records.count(): 12 15 | 16 | records.collect(): 17 | [ 18 | u'a,2', 19 | u'a,3', 20 | u'a,4', 21 | u'a,5', 22 | u'a,7', 23 | u'b,4', 24 | u'b,5', 25 | u'b,6', 26 | u'c,3', 27 | u'c,4', 28 | u'c,5', 29 | u'c,6' 30 | ] 31 | 32 | grouped_by_key : PythonRDD[9] at RDD at PythonRDD.scala:48 33 | 34 | grouped_by_key.count(): 3 35 | 36 | grouped_by_key.collect(): 37 | [ 38 | (u'a', ), 39 | (u'c', ), 40 | (u'b', ) 41 | ] 42 | 43 | grouped_by_key.mapValues(lambda values : list(values)).collect(): 44 | [ 45 | (u'a', [2, 3, 4, 5, 7]), 46 | (u'c', [3, 4, 5, 6]), 47 | (u'b', [4, 5, 6]) 48 | ] 49 | 50 | averages.count(): 3 51 | averages.collect(): 52 | [ 53 | (u'a', 4.2), 54 | (u'c', 4.5), 55 | (u'b', 5.0) 56 | ] -------------------------------------------------------------------------------- /code/chap12/average_monoid_use_groupbykey.sh: 
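For comparison, the groupByKey() version whose log appears above (and whose shell script follows, including a note on why reduceByKey() or combineByKey() is usually preferable) materializes every value of a key before averaging. A minimal sketch of that approach, under the same assumptions as the previous sketch:

import sys
from pyspark.sql import SparkSession

def to_pair(rec):
    tokens = rec.split(",")
    return (tokens[0], int(tokens[1]))

def average(values):
    vals = list(values)
    return float(sum(vals)) / len(vals)

spark = SparkSession.builder.appName("average_by_groupbykey_sketch").getOrCreate()
input_path = sys.argv[1]   # e.g. sample_input.txt

records = spark.sparkContext.textFile(input_path)
pairs = records.map(to_pair)

# groupByKey() ships ALL values of a key to one reducer; fine for small data, but it
# does no map-side aggregation, unlike reduceByKey()/combineByKey()/aggregateByKey()
grouped_by_key = pairs.groupByKey()
averages = grouped_by_key.mapValues(average)
print(averages.collect())   # e.g. [('a', 4.2), ('c', 4.5), ('b', 5.0)]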
-------------------------------------------------------------------------------- 1 | #========================================== 2 | # NOTE: 3 | # 4 | # In general, avoid using groupByKey(), and 5 | # instead use reduceByKey() or combineByKey(). 6 | # For details see: 7 | # https://databricks.gitbooks.io/databricks-spark-knowledge-base/content/best_practices/prefer_reducebykey_over_groupbykey.html 8 | # 9 | # The groupByKey() solution is provided for educational 10 | # purposes. If you need all of the values of a key for 11 | # some aggregation such as finding the "median" (which you 12 | # need all of the values per key), then the groupByKey() 13 | # may be used. 14 | #========================================== 15 | # 16 | # define PySpark program 17 | export PROG="/pyspark_book/code/chap12/average_monoid_use_groupbykey.py" 18 | # define your input path 19 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 20 | # define your Spark home directory 21 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 22 | # run the program 23 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 24 | -------------------------------------------------------------------------------- /code/chap12/average_monoid_use_reducebykey.log: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/average_monoid_use_reducebykey.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | input_path: /pyspark_book/code/chap12/sample_input.txt 10 | 11 | records.count(): 12 12 | 13 | records.collect(): 14 | [ 15 | u'a,2', 16 | u'a,3', 17 | u'a,4', 18 | u'a,5', 19 | u'a,7', 20 | u'b,4', 21 | u'b,5', 22 | u'b,6', 23 | u'c,3', 24 | u'c,4', 25 | u'c,5', 26 | u'c,6' 27 | ] 28 | 29 | sum_and_freq.count(): 12 30 | 31 | sum_and_freq.collect(): 32 | [ 33 | (u'a', (2, 1)), 34 | (u'a', (3, 1)), 35 | (u'a', (4, 1)), 36 | (u'a', (5, 1)), 37 | (u'a', (7, 1)), 38 | (u'b', (4, 1)), 39 | (u'b', (5, 1)), 40 | (u'b', (6, 1)), 41 | (u'c', (3, 1)), 42 | (u'c', (4, 1)), 43 | (u'c', (5, 1)), 44 | (u'c', (6, 1)) 45 | ] 46 | 47 | sum_count.count(): 3 48 | 49 | sum_count.collect(): 50 | [ 51 | (u'a', (21, 5)), 52 | (u'c', (18, 4)), 53 | (u'b', (15, 3)) 54 | ] 55 | 56 | averages.count(): 3 57 | 58 | averages.collect(): 59 | [ 60 | (u'a', 4.2), 61 | (u'c', 4.5), 62 | (u'b', 5.0) 63 | ] 64 | -------------------------------------------------------------------------------- /code/chap12/average_monoid_use_reducebykey.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/average_monoid_use_reducebykey.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_input.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap12/inmapper_combiner_local_aggregation.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/inmapper_combiner_local_aggregation.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_dna_seq.txt" 5 | # 
define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap12/inmapper_combiner_use_basic_mapreduce.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/inmapper_combiner_use_basic_mapreduce.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_dna_seq.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap12/inmapper_combiner_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | # define PySpark program 2 | export PROG="/pyspark_book/code/chap12/inmapper_combiner_use_mappartitions.py" 3 | # define your input path 4 | export INPUT="/pyspark_book/code/chap12/sample_dna_seq.txt" 5 | # define your Spark home directory 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | # run the program 8 | $SPARK_HOME/bin/spark-submit $PROG $INPUT 9 | -------------------------------------------------------------------------------- /code/chap12/minmax_force_empty_partitions.log: -------------------------------------------------------------------------------- 1 | #-------------------------------------------------- 2 | # NOTE: 3 | # when you view min_max_count.collect(), 4 | # the triplets (1, -1, 0) denotes the result 5 | # of an empty partition, which is filtered out 6 | #-------------------------------------------------- 7 | ./bin/spark-submit minmax_force_empty_partitions.py sample_numbers.txt 8 | 9 | spark= 10 | 11 | input_path= sample_numbers.txt 12 | 13 | rdd= sample_numbers.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 14 | rdd.count= 11 15 | rdd.collect()= [u'23,24,22,44,66,77,44,44,555,666', u'12,4,555,66,67,68,57,55,56,45,45,45,66,77', u'34,35,36,97300,78,79', u'120,44,444,445,345,345,555', u'11,33,34,35,36,37,47,7777,8888,6666,44,55', u'10,11,44,66,77,78,79,80,90,98,99,100,102,103,104,105', u'6,7,8,9,10', u'8,9,10,12,12', u'7777', u'222,333,444,555,666,111,112,5,113,114', u'5555,4444,24'] 16 | rdd.getNumPartitions()= 17 17 | 18 | min_max_count= PythonRDD[3] at RDD at PythonRDD.scala:48 19 | min_max_count.count= 17 20 | min_max_count.collect()= 21 | [ 22 | (22, 666, 10), 23 | (4, 555, 14), 24 | (1, -1, 0), 25 | (1, -1, 0), 26 | (34, 97300, 6), 27 | (44, 555, 7), 28 | (11, 8888, 12), 29 | (1, -1, 0), 30 | (1, -1, 0), 31 | (10, 105, 16), 32 | (1, -1, 0), 33 | (1, -1, 0), 34 | (6, 12, 10), 35 | (5, 7777, 11), 36 | (1, -1, 0), 37 | (24, 5555, 3), 38 | (1, -1, 0) 39 | ] 40 | 41 | final: (min, max, count)= ( 4 , 97300 , 89 ) 42 | -------------------------------------------------------------------------------- /code/chap12/minmax_force_empty_partitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run minmax_use_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export 
INPUT_PATH="/pyspark_book/code/chap12/sample_numbers.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap12/minmax_force_empty_partitions.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap12/minmax_use_mappartitions.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit minmax_use_mappartitions.py sample_numbers.txt 2 | 3 | spark= 4 | 5 | input_path= sample_numbers.txt 6 | 7 | rdd= sample_numbers.txt MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0 8 | rdd.count= 11 9 | rdd.collect()= 10 | [ 11 | u'23,24,22,44,66,77,44,44,555,666', 12 | u'12,4,555,66,67,68,57,55,56,45,45,45,66,77', 13 | u'34,35,36,97300,78,79', 14 | u'120,44,444,445,345,345,555', 15 | u'11,33,34,35,36,37,47,7777,8888,6666,44,55', 16 | u'10,11,44,66,77,78,79,80,90,98,99,100,102,103,104,105', 17 | u'6,7,8,9,10', 18 | u'8,9,10,12,12', 19 | u'7777', 20 | u'222,333,444,555,666,111,112,5,113,114', 21 | u'5555,4444,24' 22 | ] 23 | 24 | rdd.getNumPartitions()= 2 25 | 26 | min_max_count= PythonRDD[3] at RDD at PythonRDD.scala:48 27 | 28 | type(partition_iterator)= 29 | first_record= 23,24,22,44,66,77,44,44,555,666 30 | 31 | type(partition_iterator)= 32 | first_record= 10,11,44,66,77,78,79,80,90,98,99,100,102,103,104,105 33 | 34 | min_max_count.count= 2 35 | 36 | min_max_count.collect()= [(4, 97300, 49), (5, 7777, 40)] 37 | 38 | final: (min, max, count)= ( 4 , 97300 , 89 ) -------------------------------------------------------------------------------- /code/chap12/minmax_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run minmax_use_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export INPUT_PATH="/pyspark_book/code/chap12/sample_numbers.txt" 8 | export SPARK_PROG="/pyspark_book/code/chap12/minmax_use_mappartitions.py" 9 | # 10 | # run the PySpark program: 11 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $INPUT_PATH 12 | -------------------------------------------------------------------------------- /code/chap12/sample_dna_seq.txt: -------------------------------------------------------------------------------- 1 | ATCGGGATCCGGG 2 | ATTCCGGGATTCCCC 3 | ATGGCCCCCGGGATCGGG 4 | CGGTATCCGGGGAAAAA 5 | aaattCCGGAACCGGGGGTTT 6 | CCTTTTATCGGGCAAATTTTCCCGG 7 | attttcccccggaaaAAATTTCCGGG 8 | ACTGACTAGCTAGCTAACTG 9 | GCATCGTAGCTAGCTACGAT 10 | AATTCCCGCATCGATCGTACGTACGTAG 11 | ATCGATCGATCGTACGATCG 12 | -------------------------------------------------------------------------------- /code/chap12/sample_input.txt: -------------------------------------------------------------------------------- 1 | a,2 2 | a,3 3 | a,4 4 | a,5 5 | a,7 6 | b,4 7 | b,5 8 | b,6 9 | c,3 10 | c,4 11 | c,5 12 | c,6 13 | -------------------------------------------------------------------------------- /code/chap12/sample_numbers.txt: -------------------------------------------------------------------------------- 1 | 23,24,22,44,66,77,44,44,555,666 2 | 12,4,555,66,67,68,57,55,56,45,45,45,66,77 3 | 34,35,36,97300,78,79 4 | 120,44,444,445,345,345,555 5 | 11,33,34,35,36,37,47,7777,8888,6666,44,55 6 | 10,11,44,66,77,78,79,80,90,98,99,100,102,103,104,105 
7 | 6,7,8,9,10 8 | 8,9,10,12,12 9 | 7777 10 | 222,333,444,555,666,111,112,5,113,114 11 | 5555,4444,24 12 | -------------------------------------------------------------------------------- /code/chap12/top_N_use_mappartitions.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run top_N_use_mappartitions.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap12/top_N_use_mappartitions.py" 8 | # 9 | # run the PySpark program: 10 | # find Top-3 11 | export N=3 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $N 13 | -------------------------------------------------------------------------------- /code/chap12/top_N_use_takeordered.log: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit top_N_use_takeordered.py 3 2 | 3 | spark= 4 | 5 | N : 3 6 | 7 | list_of_key_value = 8 | [ 9 | ('a', 1), ('a', 7), ('a', 2), ('a', 3), 10 | ('b', 2), ('b', 4), 11 | ('c', 10), ('c', 50), ('c', 60), ('c', 70), 12 | ('d', 5), ('d', 15), ('d', 25), 13 | ('e', 1), ('e', 2), 14 | ('f', 9), ('f', 2), 15 | ('g', 22) 16 | ] 17 | 18 | rdd= ParallelCollectionRDD[0] at parallelize at PythonRDD.scala:175 19 | rdd.count= 18 20 | rdd.collect()= 21 | [ 22 | ('a', 1), ('a', 7), ('a', 2), ('a', 3), 23 | ('b', 2), ('b', 4), 24 | ('c', 10), ('c', 50), ('c', 60), ('c', 70), 25 | ('d', 5), ('d', 15), ('d', 25), 26 | ('e', 1), ('e', 2), 27 | ('f', 9), ('f', 2), 28 | ('g', 22) 29 | ] 30 | 31 | combined= PythonRDD[6] at RDD at PythonRDD.scala:48 32 | combined.count= 7 33 | combined.collect()= 34 | [ 35 | ('a', 13), 36 | ('c', 190), 37 | ('b', 6), 38 | ('e', 3), 39 | ('d', 45), 40 | ('g', 22), 41 | ('f', 11) 42 | ] 43 | 44 | topN = [('c', 190), ('d', 45), ('g', 22)] 45 | 46 | bottomN = [('e', 3), ('b', 6), ('f', 11)] -------------------------------------------------------------------------------- /code/chap12/top_N_use_takeordered.sh: -------------------------------------------------------------------------------- 1 | #----------------------------------------------------- 2 | # This is a shell script to run top_N_use_takeordered.py 3 | #----------------------------------------------------- 4 | # @author Mahmoud Parsian 5 | #----------------------------------------------------- 6 | export SPARK_HOME="/pyspark_book/spark-2.4.3" 7 | export SPARK_PROG="/pyspark_book/code/chap12/top_N_use_takeordered.py" 8 | # 9 | # run the PySpark program: 10 | # find Top-3 11 | export N=3 12 | $SPARK_HOME/bin/spark-submit $SPARK_PROG $N 13 |
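
top_N_use_takeordered.py is not reproduced in this listing. A minimal sketch of the takeOrdered() approach traced by the log above follows; the names list_of_key_value, combined, topN, and bottomN come from the log, and the rest of the scaffolding is assumed:

# minimal sketch (assumed reconstruction): Top-N and Bottom-N with takeOrdered()
import sys
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("top_N_use_takeordered").getOrCreate()
N = int(sys.argv[1])   # e.g. 3

list_of_key_value = [
    ('a', 1), ('a', 7), ('a', 2), ('a', 3),
    ('b', 2), ('b', 4),
    ('c', 10), ('c', 50), ('c', 60), ('c', 70),
    ('d', 5), ('d', 15), ('d', 25),
    ('e', 1), ('e', 2),
    ('f', 9), ('f', 2),
    ('g', 22)
]

rdd = spark.sparkContext.parallelize(list_of_key_value)

# combined: RDD[(key, sum_of_values)]
combined = rdd.reduceByKey(lambda a, b: a + b)

# Top-N by value: order descending by negating the value in the key function
topN = combined.takeOrdered(N, key=lambda kv: -kv[1])
# Bottom-N by value: natural ascending order on the value
bottomN = combined.takeOrdered(N, key=lambda kv: kv[1])

print("topN =", topN)
print("bottomN =", bottomN)
spark.stop()

Since takeOrdered() is an action that returns at most N elements to the driver, it avoids a full global sort of the combined RDD.
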
-------------------------------------------------------------------------------- /code/examples/wordcount/foxdata.txt: -------------------------------------------------------------------------------- 1 | red fox jumped high 2 | fox jumped over high fence 3 | red fox jumped 4 | -------------------------------------------------------------------------------- /code/examples/wordcount/wordcount.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | from pyspark.sql import SparkSession 4 | 5 | # 6 | print ("This is the name of the script: ", sys.argv[0]) 7 | print ("Number of arguments: ", len(sys.argv)) 8 | print ("The arguments are: " , str(sys.argv)) 9 | # 10 | if len(sys.argv) != 3: 11 | print("Usage: wordcount.py <input-path>, <output-path>", file=sys.stderr) 12 | sys.exit(-1) 13 | 14 | # DEFINE your input path 15 | input_path = sys.argv[1] 16 | print("input_path: ", input_path) 17 | 18 | # DEFINE your output path 19 | output_path = sys.argv[2] 20 | print("output_path: ", output_path) 21 | 22 | # CREATE an instance of a SparkSession object 23 | spark = SparkSession\ 24 | .builder\ 25 | .appName("PythonWordCount")\ 26 | .getOrCreate() 27 | 28 | # CREATE a new RDD[String] 29 | lines = spark.sparkContext.textFile(input_path) 30 | print("lines=", lines.collect()) 31 | 32 | # APPLY a SET of TRANSFORMATIONS... 33 | # counts: RDD[(String, Integer)] 34 | counts = lines.flatMap(lambda x: x.split(' ')) \ 35 | .map(lambda x: (x, 1)) \ 36 | .reduceByKey(lambda a,b : a+b) 37 | 38 | # output = [(word1, count1), (word2, count2), ...] 39 | output = counts.collect() 40 | for (word, count) in output: 41 | print("%s: %i" % (word, count)) 42 | 43 | # save output 44 | counts.saveAsTextFile(output_path) 45 | 46 | # DONE! 47 | spark.stop() 48 | -------------------------------------------------------------------------------- /code/examples/wordcount/wordcount.py.usage: -------------------------------------------------------------------------------- 1 | ./bin/spark-submit wordcount.py foxdata.txt /tmp/output 2 | -------------------------------------------------------------------------------- /code/jars/avro-mapred-1.7.7-hadoop1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/avro-mapred-1.7.7-hadoop1.jar -------------------------------------------------------------------------------- /code/jars/avro-mapred-1.7.7-hadoop2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/avro-mapred-1.7.7-hadoop2.jar -------------------------------------------------------------------------------- /code/jars/com-cotdp-hadoop-1.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/com-cotdp-hadoop-1.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /code/jars/elasticsearch-hadoop-6.4.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/elasticsearch-hadoop-6.4.2.jar -------------------------------------------------------------------------------- /code/jars/elasticsearch-spark_2.11-2.4.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/elasticsearch-spark_2.11-2.4.5.jar -------------------------------------------------------------------------------- /code/jars/graphframes-0.6.0-spark2.3-s_2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/graphframes-0.6.0-spark2.3-s_2.11.jar
-------------------------------------------------------------------------------- /code/jars/hbase-spark-connector-1.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/hbase-spark-connector-1.0.0.jar -------------------------------------------------------------------------------- /code/jars/htrace-core-3.1.0-incubating.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/htrace-core-3.1.0-incubating.jar -------------------------------------------------------------------------------- /code/jars/mongo-java-driver-3.8.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/mongo-java-driver-3.8.2.jar -------------------------------------------------------------------------------- /code/jars/mongo-spark-connector_2.11-2.2.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/mongo-spark-connector_2.11-2.2.5.jar -------------------------------------------------------------------------------- /code/jars/mongodb-driver-3.8.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/mongodb-driver-3.8.2.jar -------------------------------------------------------------------------------- /code/jars/mysql-connector-java-5.1.42.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/mysql-connector-java-5.1.42.jar -------------------------------------------------------------------------------- /code/jars/shc-core-1.1.3-2.3-s_2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/shc-core-1.1.3-2.3-s_2.11.jar -------------------------------------------------------------------------------- /code/jars/shc-examples-1.1.3-2.3-s_2.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/shc-examples-1.1.3-2.3-s_2.11.jar -------------------------------------------------------------------------------- /code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/spark-redis-2.3.1-SNAPSHOT-jar-with-dependencies.jar -------------------------------------------------------------------------------- /code/jars/spark-redis-2.3.1-SNAPSHOT.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/code/jars/spark-redis-2.3.1-SNAPSHOT.jar -------------------------------------------------------------------------------- /images/pyspark_algorithms.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/images/pyspark_algorithms.jpg -------------------------------------------------------------------------------- /images/pyspark_algorithms0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/images/pyspark_algorithms0.jpg -------------------------------------------------------------------------------- /images/pyspark_algorithms2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/images/pyspark_algorithms2.jpg -------------------------------------------------------------------------------- /images/pyspark_algorithms3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/images/pyspark_algorithms3.jpg -------------------------------------------------------------------------------- /sample_chapters/Appendix_Questions_and_Answers.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/sample_chapters/Appendix_Questions_and_Answers.epub -------------------------------------------------------------------------------- /sample_chapters/Appendix_Questions_and_Answers.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/sample_chapters/Appendix_Questions_and_Answers.pdf -------------------------------------------------------------------------------- /sample_chapters/README.md: -------------------------------------------------------------------------------- 1 | ## Sample Chapters of PySpark Algorithms Book 2 | -------------------------------------------------------------------------------- /sample_chapters/chap04_Getting_Started_with_PySpark.epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/sample_chapters/chap04_Getting_Started_with_PySpark.epub -------------------------------------------------------------------------------- /sample_chapters/chap04_Getting_Started_with_PySpark.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mahmoudparsian/pyspark-algorithms/26f4090ac8d41fd7fff00c66d8e74a1a8ef5f806/sample_chapters/chap04_Getting_Started_with_PySpark.pdf -------------------------------------------------------------------------------- /where_to_buy_book/README.md: -------------------------------------------------------------------------------- 1 | # PySpark Algorithms Book from Amazon.com 2 | 3 | * Author: Mahmoud Parsian 4 | * Published Date: 
August 2019 5 | 6 | ## Purchase [PySpark Algorithms Book → PDF Version (.pdf)](https://www.amazon.com/PySpark-Algorithms-Mahmoud-Parsian-ebook/dp/B07WQHTVCJ/) 7 | 8 | ## Purchase [PySpark Algorithms Book → Kindle Version (.kpf)](https://www.amazon.com/dp/B07X4B2218/ref=sr_1_2) 9 | 10 | --------------------------------------------------------------------------------