├── .gitignore ├── README.md ├── books ├── pyspark.pdf ├── pyspark_tutorial.pdf └── sagemaker_pyspark.md ├── cheat-sheets ├── PySpark-SQL-cheat-sheet.pdf ├── PySpark_Cheat_Sheet_Python.pdf └── PySpark_SQL_Cheat_Sheet_Python.pdf ├── content ├── 0_introduction_to_pyspark │ ├── 0_getting_to_know_pyspark │ │ ├── 0_spark.md │ │ ├── 1_spark_in_python.md │ │ ├── 2_dataframes.md │ │ ├── 3_creating_a_spark_session.md │ │ ├── 4_viewing_tables.md │ │ ├── 5_query.md │ │ ├── 6_pandafying.md │ │ ├── 7_data_in_spark.md │ │ └── 8_read_csv.md │ ├── 1_manipulating_data │ │ ├── 0_creating_columns.md │ │ ├── 1_sql_filter.md │ │ ├── 2_selecting_data.md │ │ ├── 3_aggregating_data.md │ │ ├── 4_grouping.md │ │ └── 5_joining.md │ ├── 2_getting_started_ml_pipelines │ │ ├── 0_ml_pipelines.md │ │ ├── 10_vector_assemble.md │ │ ├── 11_pipeline.md │ │ ├── 12_train_vs_test.md │ │ ├── 13_split.md │ │ ├── 1_join_dataframe.md │ │ ├── 2_data_type.md │ │ ├── 3_cast_columns.md │ │ ├── 4_create_columns.md │ │ ├── 5_making_a_boolean.md │ │ ├── 6_strings_and_factors.md │ │ ├── 7_carrier.md │ │ ├── 8_destination.md │ │ └── 9_assemble_a_vector.md │ └── 3_model_tuning_and_selection │ │ ├── 0_logistic_regression.md │ │ ├── 1_modeller.md │ │ ├── 2_cross_validation.md │ │ ├── 3_evaluator.md │ │ ├── 4_grid.md │ │ ├── 5_cross_val.md │ │ ├── 6_fit_model.md │ │ └── 7_model_evaluation.md ├── 1_big_data_fundamentals_with_pyspark │ ├── 0_intro_to_big_data │ │ ├── 0_what_is_big_data.md │ │ ├── 1_pyspark.md │ │ ├── 2_spark_context.md │ │ ├── 3_interactive_pyspark.md │ │ ├── 4_loading_data.md │ │ └── 5_functional_p.md │ ├── 1_pyspark_rdd │ │ ├── 0_abstracting_data_with_RDD.md │ │ ├── 10_remove_stop.md │ │ ├── 11_word_frequencies.md │ │ ├── 1_RDD_dataset.md │ │ ├── 2_read_file.md │ │ ├── 3_partitioning.md │ │ ├── 4_filter_and_count.md │ │ ├── 5_map_collect.md │ │ ├── 6_pair_rdd.md │ │ ├── 7_reduce_by_key.md │ │ ├── 8_counting_by_key.md │ │ └── 9_base_rdd.md │ └── 2_pyspark_sql_dataframes │ │ ├── 0_rdd_to_dataframe.md │ │ ├── 1_read_csv.md │ │ ├── 2_inspecting_data.md │ │ ├── 3_dataframe_subsetting.md │ │ ├── 4_filtering_dataframe.md │ │ ├── 5_running_sql.md │ │ ├── 6_filtering_table.md │ │ ├── 7_df_viz.md │ │ ├── 8_queries_from_dataframe.md │ │ └── 9_data_vis.md ├── 2_ml_with_pyspark_mllib │ ├── 0_overview.md │ ├── 1_pyspark_algorithms.md │ ├── 2_loading_into_rdd.md │ ├── 3_model_evaluation_mse.md │ ├── 4_loading_spam_ham.md │ ├── 5_feature_hashing.md │ ├── 6_logistic_regression.md │ ├── 7_loading.md │ ├── 8_k_means_training.md │ └── 9_cluster_center.md ├── 3_cleaning_data │ ├── 0_defining_a_schema.md │ ├── 10_id_tricks.md │ ├── 11_caching.md │ ├── 12_unpersist.md │ ├── 13_full_split.md │ ├── 14_explain.md │ ├── 15_broadcast.md │ ├── 16_pipeline.md │ ├── 17_pipeline.md │ ├── 18_further_parsing.md │ ├── 19_validation.md │ ├── 1_lazy_processing.md │ ├── 20_invalid_rows.md │ ├── 21_dog_parsing.md │ ├── 22_validation.md │ ├── 23_validation │ ├── 2_parquet.md │ ├── 3_sql_and_parquet.md │ ├── 4_filtering_column.md │ ├── 5_modifying_df_cols.md │ ├── 6_when.md │ ├── 7_udf.md │ ├── 8_id_field.md │ └── 9_partitions.md ├── 4_feature_engineering │ ├── 1.md │ ├── 2.md │ ├── 3.md │ └── 4.md ├── 5_ml │ ├── 1.md │ ├── 2.md │ ├── 3.md │ └── 4.md └── 6_recommendation_system │ ├── 1.md │ ├── 2.md │ ├── 3.md │ └── 4.md ├── datasets ├── airports.csv ├── flights.csv └── planes.csv ├── images ├── certificate.png └── pyspark.png ├── jars ├── snowflake-jdbc-3.13.6.jar └── spark-snowflake_2.12-2.9.0-spark_3.1.jar ├── other_materials ├── AWS-PySpark-tutorial.pdf ├── ES2097017519.pdf ├── Jin_-_Improving_Python__Spark_Performance_-_Spark_Summit_West.pdf └── Lecture3s.pdf ├── poc.ipynb └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/.gitignore -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/README.md -------------------------------------------------------------------------------- /books/pyspark.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/books/pyspark.pdf -------------------------------------------------------------------------------- /books/pyspark_tutorial.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/books/pyspark_tutorial.pdf -------------------------------------------------------------------------------- /books/sagemaker_pyspark.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/books/sagemaker_pyspark.md -------------------------------------------------------------------------------- /cheat-sheets/PySpark-SQL-cheat-sheet.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/cheat-sheets/PySpark-SQL-cheat-sheet.pdf -------------------------------------------------------------------------------- /cheat-sheets/PySpark_Cheat_Sheet_Python.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/cheat-sheets/PySpark_Cheat_Sheet_Python.pdf -------------------------------------------------------------------------------- /cheat-sheets/PySpark_SQL_Cheat_Sheet_Python.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/cheat-sheets/PySpark_SQL_Cheat_Sheet_Python.pdf -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/0_getting_to_know_pyspark/0_spark.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/0_getting_to_know_pyspark/0_spark.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/0_getting_to_know_pyspark/1_spark_in_python.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/0_getting_to_know_pyspark/1_spark_in_python.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/0_getting_to_know_pyspark/2_dataframes.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/0_getting_to_know_pyspark/2_dataframes.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/0_getting_to_know_pyspark/3_creating_a_spark_session.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/0_getting_to_know_pyspark/3_creating_a_spark_session.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/0_getting_to_know_pyspark/4_viewing_tables.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/0_getting_to_know_pyspark/4_viewing_tables.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/0_getting_to_know_pyspark/5_query.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/0_getting_to_know_pyspark/5_query.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/0_getting_to_know_pyspark/6_pandafying.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/0_getting_to_know_pyspark/6_pandafying.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/0_getting_to_know_pyspark/7_data_in_spark.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/0_getting_to_know_pyspark/7_data_in_spark.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/0_getting_to_know_pyspark/8_read_csv.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/0_getting_to_know_pyspark/8_read_csv.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/1_manipulating_data/0_creating_columns.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/1_manipulating_data/0_creating_columns.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/1_manipulating_data/1_sql_filter.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/1_manipulating_data/1_sql_filter.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/1_manipulating_data/2_selecting_data.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/1_manipulating_data/2_selecting_data.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/1_manipulating_data/3_aggregating_data.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/1_manipulating_data/3_aggregating_data.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/1_manipulating_data/4_grouping.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/1_manipulating_data/4_grouping.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/1_manipulating_data/5_joining.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/1_manipulating_data/5_joining.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/0_ml_pipelines.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/0_ml_pipelines.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/10_vector_assemble.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/10_vector_assemble.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/11_pipeline.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/11_pipeline.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/12_train_vs_test.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/12_train_vs_test.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/13_split.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/13_split.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/1_join_dataframe.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/1_join_dataframe.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/2_data_type.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/2_data_type.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/3_cast_columns.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/3_cast_columns.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/4_create_columns.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/4_create_columns.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/5_making_a_boolean.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/5_making_a_boolean.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/6_strings_and_factors.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/6_strings_and_factors.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/7_carrier.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/7_carrier.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/8_destination.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/8_destination.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/2_getting_started_ml_pipelines/9_assemble_a_vector.md: -------------------------------------------------------------------------------- 1 | # Assemble a vector 2 | -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/3_model_tuning_and_selection/0_logistic_regression.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/3_model_tuning_and_selection/0_logistic_regression.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/3_model_tuning_and_selection/1_modeller.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/3_model_tuning_and_selection/1_modeller.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/3_model_tuning_and_selection/2_cross_validation.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/3_model_tuning_and_selection/2_cross_validation.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/3_model_tuning_and_selection/3_evaluator.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/3_model_tuning_and_selection/3_evaluator.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/3_model_tuning_and_selection/4_grid.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/3_model_tuning_and_selection/4_grid.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/3_model_tuning_and_selection/5_cross_val.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/3_model_tuning_and_selection/5_cross_val.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/3_model_tuning_and_selection/6_fit_model.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/3_model_tuning_and_selection/6_fit_model.md -------------------------------------------------------------------------------- /content/0_introduction_to_pyspark/3_model_tuning_and_selection/7_model_evaluation.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/0_introduction_to_pyspark/3_model_tuning_and_selection/7_model_evaluation.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/0_intro_to_big_data/0_what_is_big_data.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/0_intro_to_big_data/0_what_is_big_data.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/0_intro_to_big_data/1_pyspark.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/0_intro_to_big_data/1_pyspark.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/0_intro_to_big_data/2_spark_context.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/0_intro_to_big_data/2_spark_context.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/0_intro_to_big_data/3_interactive_pyspark.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/0_intro_to_big_data/3_interactive_pyspark.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/0_intro_to_big_data/4_loading_data.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/0_intro_to_big_data/4_loading_data.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/0_intro_to_big_data/5_functional_p.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/0_intro_to_big_data/5_functional_p.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/0_abstracting_data_with_RDD.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/0_abstracting_data_with_RDD.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/10_remove_stop.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/10_remove_stop.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/11_word_frequencies.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/11_word_frequencies.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/1_RDD_dataset.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/1_RDD_dataset.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/2_read_file.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/2_read_file.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/3_partitioning.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/3_partitioning.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/4_filter_and_count.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/4_filter_and_count.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/5_map_collect.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/5_map_collect.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/6_pair_rdd.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/6_pair_rdd.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/7_reduce_by_key.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/7_reduce_by_key.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/8_counting_by_key.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/8_counting_by_key.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/9_base_rdd.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/1_pyspark_rdd/9_base_rdd.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/0_rdd_to_dataframe.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/0_rdd_to_dataframe.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/1_read_csv.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/1_read_csv.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/2_inspecting_data.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/2_inspecting_data.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/3_dataframe_subsetting.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/3_dataframe_subsetting.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/4_filtering_dataframe.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/4_filtering_dataframe.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/5_running_sql.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/5_running_sql.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/6_filtering_table.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/6_filtering_table.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/7_df_viz.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/7_df_viz.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/8_queries_from_dataframe.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/8_queries_from_dataframe.md -------------------------------------------------------------------------------- /content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/9_data_vis.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/1_big_data_fundamentals_with_pyspark/2_pyspark_sql_dataframes/9_data_vis.md -------------------------------------------------------------------------------- /content/2_ml_with_pyspark_mllib/0_overview.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/2_ml_with_pyspark_mllib/0_overview.md -------------------------------------------------------------------------------- /content/2_ml_with_pyspark_mllib/1_pyspark_algorithms.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/2_ml_with_pyspark_mllib/1_pyspark_algorithms.md -------------------------------------------------------------------------------- /content/2_ml_with_pyspark_mllib/2_loading_into_rdd.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/2_ml_with_pyspark_mllib/2_loading_into_rdd.md -------------------------------------------------------------------------------- /content/2_ml_with_pyspark_mllib/3_model_evaluation_mse.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/2_ml_with_pyspark_mllib/3_model_evaluation_mse.md -------------------------------------------------------------------------------- /content/2_ml_with_pyspark_mllib/4_loading_spam_ham.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/2_ml_with_pyspark_mllib/4_loading_spam_ham.md -------------------------------------------------------------------------------- /content/2_ml_with_pyspark_mllib/5_feature_hashing.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/2_ml_with_pyspark_mllib/5_feature_hashing.md -------------------------------------------------------------------------------- /content/2_ml_with_pyspark_mllib/6_logistic_regression.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/2_ml_with_pyspark_mllib/6_logistic_regression.md -------------------------------------------------------------------------------- /content/2_ml_with_pyspark_mllib/7_loading.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/2_ml_with_pyspark_mllib/7_loading.md -------------------------------------------------------------------------------- /content/2_ml_with_pyspark_mllib/8_k_means_training.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/2_ml_with_pyspark_mllib/8_k_means_training.md -------------------------------------------------------------------------------- /content/2_ml_with_pyspark_mllib/9_cluster_center.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/2_ml_with_pyspark_mllib/9_cluster_center.md -------------------------------------------------------------------------------- /content/3_cleaning_data/0_defining_a_schema.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/0_defining_a_schema.md -------------------------------------------------------------------------------- /content/3_cleaning_data/10_id_tricks.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/10_id_tricks.md -------------------------------------------------------------------------------- /content/3_cleaning_data/11_caching.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/11_caching.md -------------------------------------------------------------------------------- /content/3_cleaning_data/12_unpersist.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/12_unpersist.md -------------------------------------------------------------------------------- /content/3_cleaning_data/13_full_split.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/13_full_split.md -------------------------------------------------------------------------------- /content/3_cleaning_data/14_explain.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/14_explain.md -------------------------------------------------------------------------------- /content/3_cleaning_data/15_broadcast.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/15_broadcast.md -------------------------------------------------------------------------------- /content/3_cleaning_data/16_pipeline.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/16_pipeline.md -------------------------------------------------------------------------------- /content/3_cleaning_data/17_pipeline.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/17_pipeline.md -------------------------------------------------------------------------------- /content/3_cleaning_data/18_further_parsing.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/18_further_parsing.md -------------------------------------------------------------------------------- /content/3_cleaning_data/19_validation.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/19_validation.md -------------------------------------------------------------------------------- /content/3_cleaning_data/1_lazy_processing.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/1_lazy_processing.md -------------------------------------------------------------------------------- /content/3_cleaning_data/20_invalid_rows.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/20_invalid_rows.md -------------------------------------------------------------------------------- /content/3_cleaning_data/21_dog_parsing.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/21_dog_parsing.md -------------------------------------------------------------------------------- /content/3_cleaning_data/22_validation.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/22_validation.md -------------------------------------------------------------------------------- /content/3_cleaning_data/23_validation: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/23_validation -------------------------------------------------------------------------------- /content/3_cleaning_data/2_parquet.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/2_parquet.md -------------------------------------------------------------------------------- /content/3_cleaning_data/3_sql_and_parquet.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/3_sql_and_parquet.md -------------------------------------------------------------------------------- /content/3_cleaning_data/4_filtering_column.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/4_filtering_column.md -------------------------------------------------------------------------------- /content/3_cleaning_data/5_modifying_df_cols.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/5_modifying_df_cols.md -------------------------------------------------------------------------------- /content/3_cleaning_data/6_when.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/6_when.md -------------------------------------------------------------------------------- /content/3_cleaning_data/7_udf.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/7_udf.md -------------------------------------------------------------------------------- /content/3_cleaning_data/8_id_field.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/8_id_field.md -------------------------------------------------------------------------------- /content/3_cleaning_data/9_partitions.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/3_cleaning_data/9_partitions.md -------------------------------------------------------------------------------- /content/4_feature_engineering/1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/4_feature_engineering/1.md -------------------------------------------------------------------------------- /content/4_feature_engineering/2.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/4_feature_engineering/2.md -------------------------------------------------------------------------------- /content/4_feature_engineering/3.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/4_feature_engineering/3.md -------------------------------------------------------------------------------- /content/4_feature_engineering/4.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/4_feature_engineering/4.md -------------------------------------------------------------------------------- /content/5_ml/1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/5_ml/1.md -------------------------------------------------------------------------------- /content/5_ml/2.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/5_ml/2.md -------------------------------------------------------------------------------- /content/5_ml/3.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/5_ml/3.md -------------------------------------------------------------------------------- /content/5_ml/4.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/5_ml/4.md -------------------------------------------------------------------------------- /content/6_recommendation_system/1.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/6_recommendation_system/1.md -------------------------------------------------------------------------------- /content/6_recommendation_system/2.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/6_recommendation_system/2.md -------------------------------------------------------------------------------- /content/6_recommendation_system/3.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/6_recommendation_system/3.md -------------------------------------------------------------------------------- /content/6_recommendation_system/4.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/content/6_recommendation_system/4.md -------------------------------------------------------------------------------- /datasets/airports.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/datasets/airports.csv -------------------------------------------------------------------------------- /datasets/flights.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/datasets/flights.csv -------------------------------------------------------------------------------- /datasets/planes.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/datasets/planes.csv -------------------------------------------------------------------------------- /images/certificate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/images/certificate.png -------------------------------------------------------------------------------- /images/pyspark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/images/pyspark.png -------------------------------------------------------------------------------- /jars/snowflake-jdbc-3.13.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/jars/snowflake-jdbc-3.13.6.jar -------------------------------------------------------------------------------- /jars/spark-snowflake_2.12-2.9.0-spark_3.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/jars/spark-snowflake_2.12-2.9.0-spark_3.1.jar -------------------------------------------------------------------------------- /other_materials/AWS-PySpark-tutorial.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/other_materials/AWS-PySpark-tutorial.pdf -------------------------------------------------------------------------------- /other_materials/ES2097017519.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/other_materials/ES2097017519.pdf -------------------------------------------------------------------------------- /other_materials/Jin_-_Improving_Python__Spark_Performance_-_Spark_Summit_West.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/other_materials/Jin_-_Improving_Python__Spark_Performance_-_Spark_Summit_West.pdf -------------------------------------------------------------------------------- /other_materials/Lecture3s.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/other_materials/Lecture3s.pdf -------------------------------------------------------------------------------- /poc.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/poc.ipynb -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ayushsubedi/big-data-with-pyspark/HEAD/requirements.txt --------------------------------------------------------------------------------