├── .idea
│   ├── compiler.xml
│   ├── copyright
│   │   └── profiles_settings.xml
│   ├── libraries
│   │   ├── Maven__com_clearspring_analytics_stream_2_7_0.xml
│   │   ├── Maven__com_esotericsoftware_kryo_shaded_3_0_3.xml
│   │   ├── Maven__com_esotericsoftware_minlog_1_3_0.xml
│   │   ├── Maven__com_fasterxml_jackson_core_jackson_annotations_2_6_5.xml
│   │   ├── Maven__com_fasterxml_jackson_core_jackson_core_2_6_5.xml
│   │   ├── Maven__com_fasterxml_jackson_core_jackson_databind_2_6_5.xml
│   │   ├── Maven__com_fasterxml_jackson_module_jackson_module_paranamer_2_6_5.xml
│   │   ├── Maven__com_fasterxml_jackson_module_jackson_module_scala_2_11_2_6_5.xml
│   │   ├── Maven__com_google_code_findbugs_jsr305_1_3_9.xml
│   │   ├── Maven__com_google_code_gson_gson_2_2_4.xml
│   │   ├── Maven__com_google_guava_guava_16_0_1.xml
│   │   ├── Maven__com_google_protobuf_protobuf_java_2_5_0.xml
│   │   ├── Maven__com_jamesmurty_utils_java_xmlbuilder_1_0.xml
│   │   ├── Maven__com_ning_compress_lzf_1_0_3.xml
│   │   ├── Maven__com_thoughtworks_paranamer_paranamer_2_3.xml
│   │   ├── Maven__com_twitter_chill_2_11_0_8_0.xml
│   │   ├── Maven__com_twitter_chill_java_0_8_0.xml
│   │   ├── Maven__com_univocity_univocity_parsers_2_2_1.xml
│   │   ├── Maven__commons_beanutils_commons_beanutils_1_7_0.xml
│   │   ├── Maven__commons_beanutils_commons_beanutils_core_1_8_0.xml
│   │   ├── Maven__commons_cli_commons_cli_1_2.xml
│   │   ├── Maven__commons_codec_commons_codec_1_8.xml
│   │   ├── Maven__commons_collections_commons_collections_3_2_2.xml
│   │   ├── Maven__commons_configuration_commons_configuration_1_6.xml
│   │   ├── Maven__commons_digester_commons_digester_1_8.xml
│   │   ├── Maven__commons_httpclient_commons_httpclient_3_1.xml
│   │   ├── Maven__commons_io_commons_io_2_4.xml
│   │   ├── Maven__commons_lang_commons_lang_2_6.xml
│   │   ├── Maven__commons_net_commons_net_2_2.xml
│   │   ├── Maven__io_dropwizard_metrics_metrics_core_3_1_2.xml
│   │   ├── Maven__io_dropwizard_metrics_metrics_graphite_3_1_2.xml
│   │   ├── Maven__io_dropwizard_metrics_metrics_json_3_1_2.xml
│   │   ├── Maven__io_dropwizard_metrics_metrics_jvm_3_1_2.xml
│   │   ├── Maven__io_netty_netty_3_9_9_Final.xml
│   │   ├── Maven__io_netty_netty_all_4_0_43_Final.xml
│   │   ├── Maven__javax_activation_activation_1_1_1.xml
│   │   ├── Maven__javax_annotation_javax_annotation_api_1_2.xml
│   │   ├── Maven__javax_mail_mail_1_4_7.xml
│   │   ├── Maven__javax_servlet_javax_servlet_api_3_1_0.xml
│   │   ├── Maven__javax_validation_validation_api_1_1_0_Final.xml
│   │   ├── Maven__javax_ws_rs_javax_ws_rs_api_2_0_1.xml
│   │   ├── Maven__javax_xml_bind_jaxb_api_2_2_2.xml
│   │   ├── Maven__javax_xml_stream_stax_api_1_0_2.xml
│   │   ├── Maven__log4j_log4j_1_2_17.xml
│   │   ├── Maven__mx4j_mx4j_3_0_2.xml
│   │   ├── Maven__net_iharder_base64_2_3_8.xml
│   │   ├── Maven__net_java_dev_jets3t_jets3t_0_9_3.xml
│   │   ├── Maven__net_jpountz_lz4_lz4_1_3_0.xml
│   │   ├── Maven__net_razorvine_pyrolite_4_13.xml
│   │   ├── Maven__net_sf_py4j_py4j_0_10_4.xml
│   │   ├── Maven__org_antlr_antlr4_runtime_4_5_3.xml
│   │   ├── Maven__org_apache_avro_avro_1_7_7.xml
│   │   ├── Maven__org_apache_avro_avro_ipc_1_7_7.xml
│   │   ├── Maven__org_apache_avro_avro_ipc_tests_1_7_7.xml
│   │   ├── Maven__org_apache_avro_avro_mapred_hadoop2_1_7_7.xml
│   │   ├── Maven__org_apache_commons_commons_compress_1_4_1.xml
│   │   ├── Maven__org_apache_commons_commons_crypto_1_0_0.xml
│   │   ├── Maven__org_apache_commons_commons_lang3_3_5.xml
│   │   ├── Maven__org_apache_commons_commons_math3_3_4_1.xml
│   │   ├── Maven__org_apache_curator_curator_client_2_6_0.xml
│   │   ├── Maven__org_apache_curator_curator_framework_2_6_0.xml
│   │   ├── Maven__org_apache_curator_curator_recipes_2_6_0.xml
│   │   ├── Maven__org_apache_directory_api_api_asn1_api_1_0_0_M20.xml
│   │   ├── Maven__org_apache_directory_api_api_util_1_0_0_M20.xml
│   │   ├── Maven__org_apache_directory_server_apacheds_i18n_2_0_0_M15.xml
│   │   ├── Maven__org_apache_directory_server_apacheds_kerberos_codec_2_0_0_M15.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_annotations_2_6_5.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_auth_2_6_5.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_client_2_6_5.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_common_2_6_5.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_hdfs_2_6_5.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_app_2_6_5.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_common_2_6_5.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_core_2_6_5.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_jobclient_2_6_5.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_shuffle_2_6_5.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_yarn_api_2_6_5.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_yarn_client_2_6_5.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_yarn_common_2_6_5.xml
│   │   ├── Maven__org_apache_hadoop_hadoop_yarn_server_common_2_6_5.xml
│   │   ├── Maven__org_apache_httpcomponents_httpclient_4_3_6.xml
│   │   ├── Maven__org_apache_httpcomponents_httpcore_4_3_3.xml
│   │   ├── Maven__org_apache_ivy_ivy_2_4_0.xml
│   │   ├── Maven__org_apache_parquet_parquet_column_1_8_2.xml
│   │   ├── Maven__org_apache_parquet_parquet_common_1_8_2.xml
│   │   ├── Maven__org_apache_parquet_parquet_encoding_1_8_2.xml
│   │   ├── Maven__org_apache_parquet_parquet_format_2_3_1.xml
│   │   ├── Maven__org_apache_parquet_parquet_hadoop_1_8_2.xml
│   │   ├── Maven__org_apache_parquet_parquet_jackson_1_8_2.xml
│   │   ├── Maven__org_apache_spark_spark_catalyst_2_11_2_2_0.xml
│   │   ├── Maven__org_apache_spark_spark_core_2_11_2_2_0.xml
│   │   ├── Maven__org_apache_spark_spark_launcher_2_11_2_2_0.xml
│   │   ├── Maven__org_apache_spark_spark_network_common_2_11_2_2_0.xml
│   │   ├── Maven__org_apache_spark_spark_network_shuffle_2_11_2_2_0.xml
│   │   ├── Maven__org_apache_spark_spark_sketch_2_11_2_2_0.xml
│   │   ├── Maven__org_apache_spark_spark_sql_2_11_2_2_0.xml
│   │   ├── Maven__org_apache_spark_spark_streaming_2_11_2_2_0.xml
│   │   ├── Maven__org_apache_spark_spark_tags_2_11_2_2_0.xml
│   │   ├── Maven__org_apache_spark_spark_unsafe_2_11_2_2_0.xml
│   │   ├── Maven__org_apache_xbean_xbean_asm5_shaded_4_4.xml
│   │   ├── Maven__org_apache_zookeeper_zookeeper_3_4_6.xml
│   │   ├── Maven__org_bouncycastle_bcprov_jdk15on_1_51.xml
│   │   ├── Maven__org_codehaus_jackson_jackson_core_asl_1_9_13.xml
│   │   ├── Maven__org_codehaus_jackson_jackson_jaxrs_1_9_13.xml
│   │   ├── Maven__org_codehaus_jackson_jackson_mapper_asl_1_9_13.xml
│   │   ├── Maven__org_codehaus_jackson_jackson_xc_1_9_13.xml
│   │   ├── Maven__org_codehaus_janino_commons_compiler_3_0_0.xml
│   │   ├── Maven__org_codehaus_janino_janino_3_0_0.xml
│   │   ├── Maven__org_fusesource_leveldbjni_leveldbjni_all_1_8.xml
│   │   ├── Maven__org_glassfish_hk2_external_aopalliance_repackaged_2_4_0_b34.xml
│   │   ├── Maven__org_glassfish_hk2_external_javax_inject_2_4_0_b34.xml
│   │   ├── Maven__org_glassfish_hk2_hk2_api_2_4_0_b34.xml
│   │   ├── Maven__org_glassfish_hk2_hk2_locator_2_4_0_b34.xml
│   │   ├── Maven__org_glassfish_hk2_hk2_utils_2_4_0_b34.xml
│   │   ├── Maven__org_glassfish_hk2_osgi_resource_locator_1_0_1.xml
│   │   ├── Maven__org_glassfish_jersey_bundles_repackaged_jersey_guava_2_22_2.xml
│   │   ├── Maven__org_glassfish_jersey_containers_jersey_container_servlet_2_22_2.xml
│   │   ├── Maven__org_glassfish_jersey_containers_jersey_container_servlet_core_2_22_2.xml
│   │   ├── Maven__org_glassfish_jersey_core_jersey_client_2_22_2.xml
│   │   ├── Maven__org_glassfish_jersey_core_jersey_common_2_22_2.xml
│   │   ├── Maven__org_glassfish_jersey_core_jersey_server_2_22_2.xml
│   │   ├── Maven__org_glassfish_jersey_media_jersey_media_jaxb_2_22_2.xml
│   │   ├── Maven__org_htrace_htrace_core_3_0_4.xml
│   │   ├── Maven__org_javassist_javassist_3_18_1_GA.xml
│   │   ├── Maven__org_json4s_json4s_ast_2_11_3_2_11.xml
│   │   ├── Maven__org_json4s_json4s_core_2_11_3_2_11.xml
│   │   ├── Maven__org_json4s_json4s_jackson_2_11_3_2_11.xml
│   │   ├── Maven__org_mortbay_jetty_jetty_util_6_1_26.xml
│   │   ├── Maven__org_objenesis_objenesis_2_1.xml
│   │   ├── Maven__org_roaringbitmap_RoaringBitmap_0_5_11.xml
│   │   ├── Maven__org_scala_lang_modules_scala_parser_combinators_2_11_1_0_1.xml
│   │   ├── Maven__org_scala_lang_modules_scala_xml_2_11_1_0_1.xml
│   │   ├── Maven__org_scala_lang_scala_compiler_2_11_0.xml
│   │   ├── Maven__org_scala_lang_scala_library_2_11_8.xml
│   │   ├── Maven__org_scala_lang_scala_reflect_2_11_7.xml
│   │   ├── Maven__org_scala_lang_scalap_2_11_0.xml
│   │   ├── Maven__org_slf4j_jcl_over_slf4j_1_7_16.xml
│   │   ├── Maven__org_slf4j_jul_to_slf4j_1_7_16.xml
│   │   ├── Maven__org_slf4j_slf4j_api_1_7_16.xml
│   │   ├── Maven__org_slf4j_slf4j_log4j12_1_7_16.xml
│   │   ├── Maven__org_spark_project_spark_unused_1_0_0.xml
│   │   ├── Maven__org_tukaani_xz_1_0.xml
│   │   ├── Maven__org_xerial_snappy_snappy_java_1_1_2_6.xml
│   │   ├── Maven__oro_oro_2_0_8.xml
│   │   ├── Maven__xerces_xercesImpl_2_9_1.xml
│   │   ├── Maven__xml_apis_xml_apis_1_3_04.xml
│   │   └── Maven__xmlenc_xmlenc_0_52.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── scala_compiler.xml
│   ├── uiDesigner.xml
│   └── vcs.xml
├── LICENSE
├── README.md
├── learning-spark-with-java.iml
├── pom.xml
└── src
    └── main
        ├── java
        │   ├── dataframe
        │   │   ├── DatasetConversion.java
        │   │   ├── FromRowsAndSchema.java
        │   │   └── README.md
        │   ├── dataset
        │   │   ├── Basic.java
        │   │   ├── ComplexType.java
        │   │   ├── JavaBean.java
        │   │   └── README.md
        │   ├── pairs
        │   │   ├── Basic.java
        │   │   └── README.md
        │   ├── rdd
        │   │   ├── Basic.java
        │   │   └── README.md
        │   └── streaming
        │       ├── FileBased.java
        │       ├── Filtering.java
        │       ├── MapWithState.java
        │       ├── MulitpleTransformations.java
        │       ├── Pairs.java
        │       ├── README.md
        │       ├── SimpleRecoveryFromCheckpoint.java
        │       ├── StateAccumulation.java
        │       ├── Windowing.java
        │       └── util
        │           ├── CSVFileStreamGenerator.java
        │           └── StreamingItem.java
        └── resources
            └── log4j.properties
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2014-2016 Spiro Michaylov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Learning Spark with Java

This project contains snippets of Java code for illustrating various
Apache Spark concepts. It is intended to help you _get started_ with learning
Apache Spark (as a _Java_ programmer) by providing a super easy on-ramp that
_doesn't_ involve cluster configuration, building from sources or installing
Spark or Hadoop. Many of these activities will be necessary later in your
learning experience, after you've used these examples to achieve basic
familiarity.

The project is intended to accompany a number of posts on the blog
[A River of Bytes](http://www.river-of-bytes.com).

The basic approach used in this project is to create multiple small,
free-standing example programs that each illustrate an aspect of Spark usage,
and to use code comments to explain as many details as seem useful to
beginning Spark programmers.
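Every example is runnable on its own, directly from an IDE or the command
line. For orientation, the general shape they all share looks like the sketch
below (a hypothetical file for illustration, not one of the examples in the
repository):

```java
import org.apache.spark.sql.SparkSession;

public class Minimal {
    public static void main(String[] args) {
        // "local[4]" runs Spark inside this JVM with 4 worker threads,
        // so no cluster configuration or Hadoop installation is needed
        SparkSession spark = SparkSession
            .builder()
            .appName("Minimal")
            .master("local[4]")
            .getOrCreate();

        // one tiny computation, just to show the session works
        System.out.println(spark.range(1000).count());

        spark.stop();
    }
}
```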
## Dependencies

The project is based on Apache Spark 2.2.0 and Java 8.

*Warning: In Spark 2.2, support for Java 7 is finally gone.
This is documented in the [Spark 2.2.0 release notes](http://spark.apache.org/releases/spark-release-2-2-0.html),
but alas not in the corresponding
[JIRA ticket -- SPARK-19493](https://issues.apache.org/jira/browse/SPARK-19493).*

## Related projects

This project is derived from the
[LearningSpark project](https://github.com/spirom/LearningSpark), which had the same goals but for
Scala programmers. In that project you can also find the early Java 7 examples that gave
rise to this project: a lot of Spark programming is a lot less painful in Java 8 than in Java 7.

The [spark-streaming-with-kafka](https://github.com/spirom/spark-streaming-with-kafka) project is
based on Spark's Scala APIs and illustrates the use of Spark with Apache Kafka, using a similar
approach: small, free-standing example programs.

The [spark-data-sources](https://github.com/spirom/spark-data-sources) project is focused on
the new experimental APIs introduced in Spark 2.3.0 for developing adapters for
external data sources of various kinds. That API is essentially a Java API (developed in Java)
to avoid forcing developers to adopt Scala for their data source adapters. Consequently, the
example data sources in that project are written in Java, but both Java and Scala usage
examples are provided.

## Contents

| Package | What's Illustrated |
|---------|--------------------|
| [rdd](src/main/java/rdd) | The JavaRDD: the core Spark data structure -- see the local README.md in that directory for details. |
| [pairs](src/main/java/pairs) | A special RDD for the common case of pairs of values -- see the local README.md in that directory for details. |
| [dataset](src/main/java/dataset) | A range of Dataset examples (a queryable collection that is statically typed) -- see the local README.md in that directory for details. |
| [dataframe](src/main/java/dataframe) | A range of DataFrame/Dataset examples (a queryable collection that is dynamically typed) -- see the local README.md in that directory for details. |
| [streaming](src/main/java/streaming) | A range of streaming examples -- see the local README.md in that directory for details. |

--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.github.spirom</groupId>
    <artifactId>learning-spark-with-java</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>2.2.0</version>
        </dependency>
    </dependencies>
</project>

--------------------------------------------------------------------------------
/src/main/java/dataframe/DatasetConversion.java:
--------------------------------------------------------------------------------
package dataframe;

import org.apache.spark.sql.*;

import java.io.Serializable;
import java.util.Arrays;
import java.util.List;

import static org.apache.spark.sql.functions.col;

//
// Explore interoperability between DataFrame and Dataset. Note that Dataset
// is covered in much greater detail in the 'dataset' directory.
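//
// One point worth stating up front: the Java API has no separate DataFrame
// class at all. What the Scala API calls a DataFrame is just an alias for
// Dataset<Row>, so the "DataFrame" code below works with Dataset<Row> directly.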
14 | // 15 | public class DatasetConversion { 16 | 17 | // 18 | // This must be a JavaBean in order for Spark to infer a schema for it 19 | // 20 | public static class Cust implements Serializable { 21 | private int id; 22 | private String name; 23 | private double sales; 24 | private double discount; 25 | private String state; 26 | 27 | public Cust(int id, String name, double sales, double discount, String state) { 28 | this.id = id; 29 | this.name = name; 30 | this.sales = sales; 31 | this.discount = discount; 32 | this.state = state; 33 | } 34 | 35 | public int getId() { 36 | return id; 37 | } 38 | 39 | public void setId(int id) { 40 | this.id = id; 41 | } 42 | 43 | public String getName() { 44 | return name; 45 | } 46 | 47 | public void setName(String name) { 48 | this.name = name; 49 | } 50 | 51 | public double getSales() { 52 | return sales; 53 | } 54 | 55 | public void setSales(double sales) { 56 | this.sales = sales; 57 | } 58 | 59 | public double getDiscount() { 60 | return discount; 61 | } 62 | 63 | public void setDiscount(double discount) { 64 | this.discount = discount; 65 | } 66 | 67 | public String getState() { 68 | return state; 69 | } 70 | 71 | public void setState(String state) { 72 | this.state = state; 73 | } 74 | } 75 | 76 | // 77 | // A smaller JavaBean for a subset of the fields 78 | // 79 | public static class StateSales implements Serializable { 80 | private double sales; 81 | private String state; 82 | 83 | public StateSales(int id, String name, double sales, double discount, String state) { 84 | this.sales = sales; 85 | this.state = state; 86 | } 87 | 88 | public double getSales() { 89 | return sales; 90 | } 91 | 92 | public void setSales(double sales) { 93 | this.sales = sales; 94 | } 95 | 96 | public String getState() { 97 | return state; 98 | } 99 | 100 | public void setState(String state) { 101 | this.state = state; 102 | } 103 | } 104 | 105 | public static void main(String[] args) { 106 | SparkSession spark = SparkSession 107 | .builder() 108 | .appName("DataFrame-DatasetConversion") 109 | .master("local[4]") 110 | .getOrCreate(); 111 | 112 | // 113 | // The Java API requires you to explicitly instantiate an encoder for 114 | // any JavaBean you want to use for schema inference 115 | // 116 | Encoder custEncoder = Encoders.bean(Cust.class); 117 | // 118 | // Create a container of the JavaBean instances 119 | // 120 | List data = Arrays.asList( 121 | new Cust(1, "Widget Co", 120000.00, 0.00, "AZ"), 122 | new Cust(2, "Acme Widgets", 410500.00, 500.00, "CA"), 123 | new Cust(3, "Widgetry", 410500.00, 200.00, "CA"), 124 | new Cust(4, "Widgets R Us", 410500.00, 0.0, "CA"), 125 | new Cust(5, "Ye Olde Widgete", 500.00, 0.0, "MA") 126 | ); 127 | // 128 | // Use the encoder and the container of JavaBean instances to create a 129 | // Dataset 130 | // 131 | Dataset ds = spark.createDataset(data, custEncoder); 132 | 133 | System.out.println("*** here is the schema inferred from the Cust bean"); 134 | ds.printSchema(); 135 | 136 | System.out.println("*** here is the data"); 137 | ds.show(); 138 | 139 | // 140 | // Querying a Dataset of any type results in a 141 | // DataFrame (i.e. 
Dataset<Row>) 142 | // 143 | 144 | Dataset<Row> smallerDF = 145 | ds.select("sales", "state").filter(col("state").equalTo("CA")); 146 | 147 | System.out.println("*** here is the dataframe schema"); 148 | smallerDF.printSchema(); 149 | 150 | System.out.println("*** here is the data"); 151 | smallerDF.show(); 152 | 153 | // 154 | // But a Dataset<Row> can be converted back to a Dataset of some other 155 | // type by using another bean encoder 156 | // 157 | 158 | Encoder<StateSales> stateSalesEncoder = Encoders.bean(StateSales.class); 159 | 160 | Dataset<StateSales> stateSalesDS = smallerDF.as(stateSalesEncoder); 161 | 162 | System.out.println("*** here is the schema inferred from the StateSales bean"); 163 | stateSalesDS.printSchema(); 164 | 165 | System.out.println("*** here is the data"); 166 | stateSalesDS.show(); 167 | 168 | spark.stop(); 169 | } 170 | } 171 | -------------------------------------------------------------------------------- /src/main/java/dataframe/FromRowsAndSchema.java: -------------------------------------------------------------------------------- 1 | package dataframe; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Row; 5 | import org.apache.spark.sql.RowFactory; 6 | import org.apache.spark.sql.SparkSession; 7 | import org.apache.spark.sql.types.DataTypes; 8 | import org.apache.spark.sql.types.StructField; 9 | import org.apache.spark.sql.types.StructType; 10 | 11 | import java.util.Arrays; 12 | import java.util.List; 13 | 14 | import static org.apache.spark.sql.functions.col; 15 | 16 | // 17 | // Note that conceptually a DataFrame is a Dataset<Row>, but the Java API 18 | // doesn't actually have a definition of DataFrame. 19 | // 20 | // Create a Spark Dataset from a list of Row instances and a schema 21 | // constructed explicitly. Query it. 22 | // 23 | // This example is fundamental for Dataset<Row> as the schema is created 24 | // explicitly instead of being inferred via an Encoder like in the Dataset 25 | // examples.
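//
// (The two approaches side by side, as a hedged sketch -- customerRows and
// customerSchema are defined below, while Cust and beanList are illustrative
// names borrowed from the bean-based examples, not defined in this file:
//
//   Dataset<Row> explicit = spark.createDataFrame(customerRows, customerSchema); // schema built by hand
//   Dataset<Cust> inferred = spark.createDataset(beanList, Encoders.bean(Cust.class)); // schema inferred
// )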
26 | // 27 | public class FromRowsAndSchema { 28 | public static void main(String[] args) { 29 | SparkSession spark = SparkSession 30 | .builder() 31 | .appName("DataFrame-FromRowsAndSchema") 32 | .master("local[4]") 33 | .getOrCreate(); 34 | 35 | List customerRows = Arrays.asList( 36 | RowFactory.create(1, "Widget Co", 120000.00, 0.00, "AZ"), 37 | RowFactory.create(2, "Acme Widgets", 410500.00, 500.00, "CA"), 38 | RowFactory.create(3, "Widgetry", 410500.00, 200.00, "CA"), 39 | RowFactory.create(4, "Widgets R Us", 410500.00, 0.0, "CA"), 40 | RowFactory.create(5, "Ye Olde Widgete", 500.00, 0.0, "MA") 41 | ); 42 | 43 | List fields = Arrays.asList( 44 | DataTypes.createStructField("id", DataTypes.IntegerType, true), 45 | DataTypes.createStructField("name", DataTypes.StringType, true), 46 | DataTypes.createStructField("sales", DataTypes.DoubleType, true), 47 | DataTypes.createStructField("discount", DataTypes.DoubleType, true), 48 | DataTypes.createStructField("state", DataTypes.StringType, true) 49 | ); 50 | StructType customerSchema = DataTypes.createStructType(fields); 51 | 52 | Dataset customerDF = 53 | spark.createDataFrame(customerRows, customerSchema); 54 | 55 | System.out.println("*** the schema created"); 56 | customerDF.printSchema(); 57 | 58 | System.out.println("*** the data"); 59 | customerDF.show(); 60 | 61 | System.out.println("*** just the rows from CA"); 62 | customerDF.filter(col("state").equalTo("CA")).show(); 63 | 64 | spark.stop(); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/dataframe/README.md: -------------------------------------------------------------------------------- 1 | # DataFrame, or Dataset 2 | 3 | ## Getting started 4 | 5 | | File | What's Illustrated | 6 | |-----------------------|-----------------------| 7 | | FromRowsAndSchema.java | Create `Dataset` from `Row`s and an explicit schema. | 8 | | DatasetConversion.java | Convert between `Dataset` and Dataset of some other type (in both directions.) | 9 | -------------------------------------------------------------------------------- /src/main/java/dataset/Basic.java: -------------------------------------------------------------------------------- 1 | package dataset; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Encoder; 5 | import org.apache.spark.sql.Encoders; 6 | import org.apache.spark.sql.SparkSession; 7 | import scala.Tuple3; 8 | 9 | import java.util.Arrays; 10 | import java.util.List; 11 | 12 | import static org.apache.spark.sql.functions.col; 13 | 14 | // 15 | // Create a Spark Dataset from an array of tuples. The inferred schema doesn't 16 | // have convenient column names but it can still be queried conveniently. 
17 | // 18 | public class Basic { 19 | public static void main(String[] args) { 20 | SparkSession spark = SparkSession 21 | .builder() 22 | .appName("Dataset-Basic") 23 | .master("local[4]") 24 | .getOrCreate(); 25 | 26 | List<Integer> data = Arrays.asList(10, 11, 12, 13, 14, 15); 27 | Dataset<Integer> ds = spark.createDataset(data, Encoders.INT()); 28 | 29 | System.out.println("*** only one column, and it always has the same name"); 30 | ds.printSchema(); 31 | 32 | ds.show(); 33 | 34 | System.out.println("*** values > 12"); 35 | 36 | // the harder way to filter 37 | Dataset<Integer> ds2 = ds.filter((Integer value) -> value > 12); 38 | 39 | ds2.show(); 40 | 41 | List<Tuple3<Integer, String, String>> tuples = 42 | Arrays.asList( 43 | new Tuple3<>(1, "one", "un"), 44 | new Tuple3<>(2, "two", "deux"), 45 | new Tuple3<>(3, "three", "trois")); 46 | 47 | Encoder<Tuple3<Integer, String, String>> encoder = 48 | Encoders.tuple(Encoders.INT(), Encoders.STRING(), Encoders.STRING()); 49 | 50 | Dataset<Tuple3<Integer, String, String>> tupleDS = 51 | spark.createDataset(tuples, encoder); 52 | 53 | System.out.println("*** Tuple Dataset types"); 54 | tupleDS.printSchema(); 55 | 56 | // the tuple columns have unfriendly names, but you can use them to query 57 | System.out.println("*** filter by one column and fetch another"); 58 | tupleDS.where(col("_1").gt(2)).select(col("_2"), col("_3")).show(); 59 | 60 | spark.stop(); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/dataset/ComplexType.java: -------------------------------------------------------------------------------- 1 | package dataset; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Encoder; 5 | import org.apache.spark.sql.Encoders; 6 | import org.apache.spark.sql.SparkSession; 7 | 8 | import java.io.Serializable; 9 | import java.util.Arrays; 10 | import java.util.HashMap; 11 | import java.util.List; 12 | import java.util.Map; 13 | 14 | import static org.apache.spark.sql.functions.col; 15 | import static org.apache.spark.sql.functions.size; 16 | 17 | 18 | // 19 | // Examples of querying against more complex schema inferred from Java beans. 20 | // Includes JavaBean nesting, arrays and maps.
21 | // 22 | public class ComplexType { 23 | 24 | // 25 | // A JavaBean for all the examples 26 | // 27 | public static class Point implements Serializable { 28 | private double x; 29 | private double y; 30 | 31 | public Point(double x, double y) { 32 | this.x = x; 33 | this.y = y; 34 | } 35 | 36 | public double getX() { 37 | return x; 38 | } 39 | 40 | public void setX(double x) { 41 | this.x = x; 42 | } 43 | 44 | public double getY() { 45 | return y; 46 | } 47 | 48 | public void setY(double y) { 49 | this.y = y; 50 | } 51 | } 52 | 53 | // 54 | // A JavaBean for Example 1 55 | // 56 | public static class Segment implements Serializable { 57 | private Point from; 58 | private Point to; 59 | 60 | public Segment(Point from, Point to) { 61 | this.to = to; 62 | this.from = from; 63 | } 64 | 65 | public Point getFrom() { 66 | return from; 67 | } 68 | 69 | public void setFrom(Point from) { 70 | this.from = from; 71 | } 72 | 73 | public Point getTo() { 74 | return to; 75 | } 76 | 77 | public void setTo(Point to) { 78 | this.to = to; 79 | } 80 | } 81 | 82 | // 83 | // A JavaBean for Example 2 84 | // 85 | public static class Line implements Serializable { 86 | private String name; 87 | private Point[] points; 88 | 89 | public Line(String name, Point[] points) { 90 | this.name = name; 91 | this.points = points; 92 | } 93 | 94 | public String getName() { return name; } 95 | 96 | public void setName(String name) { this.name = name; } 97 | 98 | public Point[] getPoints() { return points; } 99 | 100 | public void setPoints(Point[] points) { this.points = points; } 101 | } 102 | 103 | // 104 | // A JavaBean for Example 3 105 | // 106 | public static class NamedPoints implements Serializable { 107 | private String name; 108 | private Map points; 109 | 110 | public NamedPoints(String name, Map points) { 111 | this.name = name; 112 | this.points = points; 113 | } 114 | 115 | public String getName() { return name; } 116 | 117 | public void setName(String name) { this.name = name; } 118 | 119 | public Map getPoints() { return points; } 120 | 121 | public void setPoints(Map points) { this.points = points; } 122 | } 123 | 124 | public static void main(String[] args) { 125 | SparkSession spark = SparkSession 126 | .builder() 127 | .appName("Dataset-ComplexType") 128 | .master("local[4]") 129 | .getOrCreate(); 130 | 131 | // 132 | // Example 1: nested Java beans 133 | // 134 | 135 | System.out.println("*** Example 1: nested Java beans"); 136 | 137 | Encoder segmentEncoder = Encoders.bean(Segment.class); 138 | 139 | List data = Arrays.asList( 140 | new Segment(new Point(1.0, 2.0), new Point(3.0, 4.0)), 141 | new Segment(new Point(8.0, 2.0), new Point(3.0, 14.0)), 142 | new Segment(new Point(11.0, 2.0), new Point(3.0, 24.0))); 143 | 144 | Dataset ds = spark.createDataset(data, segmentEncoder); 145 | 146 | System.out.println("*** here is the schema inferred from the bean"); 147 | ds.printSchema(); 148 | 149 | System.out.println("*** here is the data"); 150 | ds.show(); 151 | 152 | // Use the convenient bean-inferred column names to query 153 | System.out.println("*** filter by one column and fetch others"); 154 | ds.where(col("from").getField("x").gt(7.0)).select(col("to")).show(); 155 | 156 | // 157 | // Example 2: arrays 158 | // 159 | 160 | System.out.println("*** Example 2: arrays"); 161 | 162 | Encoder lineEncoder = Encoders.bean(Line.class); 163 | List lines = Arrays.asList( 164 | new Line("a", new Point[]{new Point(0.0, 0.0), new Point(2.0, 4.0)}), 165 | new Line("b", new Point[]{new Point(-1.0, 0.0)}), 166 | 
new Line("c", new Point[] 167 | {new Point(0.0, 0.0), new Point(2.0, 6.0), new Point(10.0, 100.0)}) 168 | ); 169 | 170 | Dataset linesDS = spark.createDataset(lines, lineEncoder); 171 | 172 | System.out.println("*** here is the schema inferred from the bean"); 173 | linesDS.printSchema(); 174 | 175 | System.out.println("*** here is the data"); 176 | linesDS.show(); 177 | 178 | // notice here you can filter by the second element of the array, which 179 | // doesn't even exist in one of the rows 180 | System.out.println("*** filter by an array element"); 181 | linesDS 182 | .where(col("points").getItem(2).getField("y").gt(7.0)) 183 | .select(col("name"), size(col("points")).as("count")).show(); 184 | 185 | // 186 | // Example 3: maps 187 | // 188 | 189 | if (false) { 190 | 191 | // 192 | // In Spark 2.0 this throws 193 | // java.lang.UnsupportedOperationException: map type is not supported currently 194 | // See https://issues.apache.org/jira/browse/SPARK-16706 -- and 195 | // notice it has been marked Fixed for Spark 2.1.0. 196 | // 197 | 198 | System.out.println("*** Example 3: maps"); 199 | 200 | Encoder namedPointsEncoder = Encoders.bean(NamedPoints.class); 201 | HashMap points1 = new HashMap<>(); 202 | points1.put("p1", new Point(0.0, 0.0)); 203 | HashMap points2 = new HashMap<>(); 204 | points2.put("p1", new Point(0.0, 0.0)); 205 | points2.put("p2", new Point(2.0, 6.0)); 206 | points2.put("p3", new Point(10.0, 100.0)); 207 | List namedPoints = Arrays.asList( 208 | new NamedPoints("a", points1), 209 | new NamedPoints("b", points2) 210 | ); 211 | 212 | Dataset namedPointsDS = 213 | spark.createDataset(namedPoints, namedPointsEncoder); 214 | 215 | System.out.println("*** here is the schema inferred from the bean"); 216 | namedPointsDS.printSchema(); 217 | 218 | System.out.println("*** here is the data"); 219 | namedPointsDS.show(); 220 | 221 | System.out.println("*** filter and select using map lookup"); 222 | namedPointsDS 223 | .where(size(col("points")).gt(1)) 224 | .select(col("name"), 225 | size(col("points")).as("count"), 226 | col("points").getItem("p1")).show(); 227 | 228 | } 229 | 230 | spark.stop(); 231 | } 232 | } 233 | -------------------------------------------------------------------------------- /src/main/java/dataset/JavaBean.java: -------------------------------------------------------------------------------- 1 | package dataset; 2 | 3 | import org.apache.spark.sql.Dataset; 4 | import org.apache.spark.sql.Encoder; 5 | import org.apache.spark.sql.Encoders; 6 | import org.apache.spark.sql.SparkSession; 7 | 8 | import java.io.Serializable; 9 | import java.util.Arrays; 10 | import java.util.List; 11 | 12 | import static org.apache.spark.sql.functions.col; 13 | 14 | // 15 | // Create a Spark Dataset from an array of JavaBean instances. 16 | // The inferred schema has convenient column names and it can 17 | // be queried conveniently. 
18 | // 19 | public class JavaBean { 20 | 21 | // 22 | // This must be a JavaBean in order for Spark to infer a schema for it 23 | // 24 | public static class Number implements Serializable { 25 | private int i; 26 | private String english; 27 | private String french; 28 | 29 | public Number(int i, String english, String french) { 30 | this.i = i; 31 | this.english = english; 32 | this.french = french; 33 | } 34 | 35 | public int getI() { 36 | return i; 37 | } 38 | 39 | public void setI(int i) { 40 | this.i = i; 41 | } 42 | 43 | public String getEnglish() { 44 | return english; 45 | } 46 | 47 | public void setEnglish(String english) { 48 | this.english = english; 49 | } 50 | 51 | public String getFrench() { 52 | return french; 53 | } 54 | 55 | public void setFrench(String french) { 56 | this.french = french; 57 | } 58 | } 59 | 60 | public static void main(String[] args) { 61 | SparkSession spark = SparkSession 62 | .builder() 63 | .appName("Dataset-JavaBean") 64 | .master("local[4]") 65 | .getOrCreate(); 66 | 67 | // 68 | // The Java API requires you to explicitly instantiate an encoder for 69 | // any JavaBean you want to use for schema inference 70 | // 71 | Encoder numberEncoder = Encoders.bean(Number.class); 72 | // 73 | // Create a container of the JavaBean instances 74 | // 75 | List data = Arrays.asList( 76 | new Number(1, "one", "un"), 77 | new Number(2, "two", "deux"), 78 | new Number(3, "three", "trois")); 79 | // 80 | // Use the encoder and the container of JavaBean instances to create a 81 | // Dataset 82 | // 83 | Dataset ds = spark.createDataset(data, numberEncoder); 84 | 85 | System.out.println("*** here is the schema inferred from the bean"); 86 | ds.printSchema(); 87 | 88 | System.out.println("*** here is the data"); 89 | ds.show(); 90 | 91 | // Use the convenient bean-inferred column names to query 92 | System.out.println("*** filter by one column and fetch others"); 93 | ds.where(col("i").gt(2)).select(col("english"), col("french")).show(); 94 | 95 | spark.stop(); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/main/java/dataset/README.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | 3 | ## Getting started 4 | 5 | | File | What's Illustrated | 6 | |-----------------------|-----------------------| 7 | | Basic.java | How to create a `DataSet`, examine it and perform basic operations. **Start here.** | 8 | | JavaBean.java | A `DataSet` is more convenient to use if you define a JavaBean for the element type. 
| 9 | 10 | ## Advanced 11 | 12 | | File | What's Illustrated | 13 | |-----------------------|-----------------------| 14 | | ComplexType.java | Creating a Dataset with various forms of complex schema (nesting, arrays, maps), based on JavaBean classes | 15 | -------------------------------------------------------------------------------- /src/main/java/pairs/Basic.java: -------------------------------------------------------------------------------- 1 | package pairs; 2 | 3 | import org.apache.spark.api.java.JavaPairRDD; 4 | import org.apache.spark.api.java.JavaRDD; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | import org.apache.spark.sql.SparkSession; 7 | 8 | import java.util.Arrays; 9 | import java.util.List; 10 | import java.util.Map; 11 | 12 | import scala.Tuple2; 13 | 14 | // 15 | // Many applications end up performing operations on key/value pairs where 16 | // many operations are performed on a per-key basis, so Spark introduces a 17 | // special type of RDD for pairs, the JavaPairRDD. This behaves like an RDD, 18 | // but benefits from additional operations in PairRDDFunctions. 19 | // 20 | // Here we explore their basic usage. Elsewhere we see that they get more 21 | // interesting when we can assume that the JavaPairRDD is partitioned so that 22 | // the entries for each key live in just one partition. 23 | // 24 | 25 | public class Basic { 26 | public static void main(String[] args) { 27 | SparkSession spark = SparkSession 28 | .builder() 29 | .appName("Pairs-Basic") 30 | .master("local[4]") 31 | .getOrCreate(); 32 | 33 | JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); 34 | 35 | List<Tuple2<String, Integer>> pairs = 36 | Arrays.asList( 37 | new Tuple2<>("1",9), new Tuple2<>("1",2), new Tuple2<>("1",1), 38 | new Tuple2<>("2",3), new Tuple2<>("2",4), new Tuple2<>("3",1), 39 | new Tuple2<>("3",5), new Tuple2<>("6",2), new Tuple2<>("6",1), 40 | new Tuple2<>("6",4), new Tuple2<>("8",1)); 41 | 42 | // a randomly partitioned pair RDD 43 | JavaPairRDD<String, Integer> pairsRDD = sc.parallelizePairs(pairs, 4); 44 | 45 | System.out.println("*** the original pairs"); 46 | pairsRDD.foreach(i -> System.out.println(i)); 47 | 48 | // 49 | // Pairs can be collected as a Map, but this only works well if the 50 | // keys are unique. Here they aren't, so an arbitrary value is chosen for each: 51 | // 52 | Map<String, Integer> pairsAsMap = pairsRDD.collectAsMap(); 53 | System.out.println("*** the pretty useless map"); 54 | System.out.println(pairsAsMap); 55 | 56 | // let's say we just want the pair with minimum value for each key 57 | // we can use one of the handy methods in PairRDDFunctions.
To reduce we need 58 | // only supply a single function to combine all the values for each key -- the result 59 | // has to have the same type as the values 60 | JavaPairRDD<String, Integer> reducedRDD = pairsRDD.reduceByKey(Math::min); 61 | 62 | System.out.println("*** the reduced pairs"); 63 | reducedRDD.foreach(i -> System.out.println(i)); 64 | 65 | // the reduced pairs have unique keys so collecting to a map works a lot better 66 | Map<String, Integer> reducedAsMap = reducedRDD.collectAsMap(); 67 | System.out.println("*** the reduced pairs as a map"); 68 | System.out.println(reducedAsMap); 69 | 70 | // folding is a little more general: we get to specify the identity value: 71 | // say 0 for adding and 1 for multiplying 72 | JavaPairRDD<String, Integer> foldedRDD = 73 | pairsRDD.foldByKey(1, (x, y) -> x * y); 74 | 75 | System.out.println("*** the folded pairs"); 76 | foldedRDD.foreach(i -> System.out.println(i)); 77 | 78 | // Combining is more general: you can produce values of a different type, which is very powerful. 79 | // You need to provide three functions: the first converts an individual value to the new type, the second 80 | // incorporates an additional value into the result, and the third combines intermediate results, which is 81 | // used by execution to avoid excessive communication between partitions. The first function is applied once 82 | // per partition and the second is used for each additional value in the partition. 83 | // Below is a pretty classical example of its use: compute a per-key average by first computing the sum and count 84 | // for each key and then dividing. 85 | JavaPairRDD<String, Tuple2<Integer, Integer>> combinedRDD = 86 | pairsRDD.combineByKey( 87 | value -> new Tuple2<>(value, 1), 88 | (sumAndCount, value) -> new Tuple2<>(sumAndCount._1() + value, sumAndCount._2() + 1), 89 | (sumAndCount1, sumAndCount2) -> 90 | new Tuple2<>(sumAndCount1._1() + sumAndCount2._1(), sumAndCount1._2() + sumAndCount2._2()) 91 | ); 92 | 93 | JavaPairRDD<String, Double> averageRDD = 94 | combinedRDD.mapValues(sumAndCount -> (double) sumAndCount._1() / sumAndCount._2()); 95 | 96 | System.out.println("*** the average pairs"); 97 | averageRDD.foreach(i -> System.out.println(i)); 98 | 99 | // The dividing could be done just by calling map, but in Java this requires a lot of conversion between the 100 | // two kinds of RDD and ends up *VERY* cumbersome. 101 | JavaRDD<Tuple2<String, Tuple2<Integer, Integer>>> tupleCombinedRDD = 102 | JavaRDD.fromRDD(combinedRDD.rdd(), combinedRDD.classTag()); 103 | JavaRDD<Tuple2<String, Double>> tupleDividedRDD = tupleCombinedRDD.map(keyAndsumAndCount -> 104 | new Tuple2<>(keyAndsumAndCount._1(), (double) keyAndsumAndCount._2()._1() / keyAndsumAndCount._2()._2())); 105 | JavaPairRDD<String, Double> averageRDDtheHardWay = JavaPairRDD.fromJavaRDD(tupleDividedRDD); 106 | 107 | // remember these won't necessarily come out in the same order so they may not obviously be 108 | // the same as above 109 | System.out.println("*** the average pairs the hard way"); 110 | averageRDDtheHardWay.foreach(i -> System.out.println(i)); 111 | 112 | spark.stop(); 113 | } 114 | } -------------------------------------------------------------------------------- /src/main/java/pairs/README.md: -------------------------------------------------------------------------------- 1 | # _PairRDD_ Examples 2 | 3 | | File | What's Illustrated | 4 | |-----------------------|-----------------------| 5 | | Basic.java | Creation of and basic operations on a JavaPairRDD.
**Start here.** | -------------------------------------------------------------------------------- /src/main/java/rdd/Basic.java: -------------------------------------------------------------------------------- 1 | package rdd; 2 | 3 | import org.apache.spark.api.java.JavaRDD; 4 | import org.apache.spark.api.java.JavaSparkContext; 5 | import org.apache.spark.sql.SparkSession; 6 | 7 | import java.util.Arrays; 8 | import java.util.List; 9 | 10 | // 11 | // This very basic example creates an RDD from a list of integers. It explores 12 | // how to transform the RDD element by element, convert it back to a list, and 13 | // examine its partitioning. Also it begins to explore the fact that RDDs are in 14 | // practice order preserving but when you operate on them directly their 15 | // distributed nature prevents them from behaving like they are ordered. 16 | // 17 | public class Basic { 18 | public static void main(String[] args) { 19 | // 20 | // The "modern" way to initialize Spark is to create a SparkSession 21 | // although they really come from the world of Spark SQL, and Dataset 22 | // and DataFrame. 23 | // 24 | SparkSession spark = SparkSession 25 | .builder() 26 | .appName("RDD-Basic") 27 | .master("local[4]") 28 | .getOrCreate(); 29 | 30 | // 31 | // Operating on a raw RDD actually requires access to the more low 32 | // level SparkContext -- get the special Java version for convenience 33 | // 34 | JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); 35 | 36 | // put some data in an RDD 37 | List<Integer> numbers = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); 38 | // 39 | // Since this SparkContext is actually a JavaSparkContext, the methods 40 | // return a JavaRDD, which is more convenient as well. 41 | // 42 | JavaRDD<Integer> numbersRDD = sc.parallelize(numbers, 4); 43 | System.out.println("*** Print each element of the original RDD"); 44 | System.out.println("*** (they won't necessarily be in any order)"); 45 | // Since printing is delegated to the RDD it happens in parallel. 46 | // For versions of Java without lambda, Spark provides some utility 47 | // interfaces like VoidFunction. 48 | numbersRDD.foreach(i -> System.out.println(i)); 49 | 50 | // NOTE: it may be tempting to replace the above 51 | // lambda expression with a method reference -- System.out::println -- 52 | // but alas this results in 53 | // java.io.NotSerializableException: java.io.PrintStream 54 | 55 | // Transform the RDD element by element -- this time use a lambda 56 | // instead of a Function. Notice how the RDD changes from 57 | // JavaRDD<Integer> to JavaRDD<Double>.
58 | JavaRDD transformedRDD = 59 | numbersRDD.map(n -> new Double(n) / 10); 60 | 61 | // let's see the elements 62 | System.out.println("*** Print each element of the transformed RDD"); 63 | System.out.println("*** (they may not even be in the same order)"); 64 | transformedRDD.foreach(i -> System.out.println(i)); 65 | 66 | // get the data back out as a list -- collect() gathers up all the 67 | // partitions of an RDD and constructs a regular List 68 | List transformedAsList = transformedRDD.collect(); 69 | // interesting how the list comes out sorted but the RDD didn't 70 | System.out.println("*** Now print each element of the transformed list"); 71 | System.out.println("*** (the list is in the same order as the original list)"); 72 | for (Double d : transformedAsList) { 73 | System.out.println(d); 74 | } 75 | 76 | // explore RDD partitioning properties -- glom() keeps the RDD as 77 | // an RDD but the elements are now lists of the original values -- 78 | // the resulting RDD has an element for each partition of 79 | // the original RDD 80 | JavaRDD> partitionsRDD = transformedRDD.glom(); 81 | System.out.println("*** We _should_ have 4 partitions"); 82 | System.out.println("*** (They can't be of equal size)"); 83 | System.out.println("*** # partitions = " + partitionsRDD.count()); 84 | // specifying the type of l is not required here but sometimes it's useful for clarity 85 | partitionsRDD.foreach((List l) -> { 86 | // A string for each partition so the output isn't garbled 87 | // -- remember the RDD is still distributed so this function 88 | // is called in parallel 89 | StringBuffer sb = new StringBuffer(); 90 | for (Double d : l) { 91 | sb.append(d); 92 | sb.append(" "); 93 | } 94 | System.out.println(sb); 95 | }); 96 | 97 | spark.stop(); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/main/java/rdd/README.md: -------------------------------------------------------------------------------- 1 | # RDD 2 | 3 | ## Getting started 4 | 5 | | File | What's Illustrated | 6 | |-----------------------|-----------------------| 7 | | Basic.java | How to create a JavaRDD, examine it and perform basic transformations. **Start here.** | 8 | -------------------------------------------------------------------------------- /src/main/java/streaming/FileBased.java: -------------------------------------------------------------------------------- 1 | package streaming; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | import org.apache.spark.sql.SparkSession; 7 | import org.apache.spark.streaming.api.java.JavaDStream; 8 | import org.apache.spark.streaming.Duration; 9 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 10 | import streaming.util.CSVFileStreamGenerator; 11 | 12 | 13 | /** 14 | * File based streaming requires files to be atomically created in 15 | * the source directory -- in practice this entails creating them somewhere 16 | * else and renaming them in place. 17 | * 18 | * The streaming context is set up to process a batch of new data once per second. Each batch is a single RDD 19 | * containing one entry for each text line of the newly discovered files in the specified directory since the 20 | * last batch was processed. Since it's possible for more than one file to appear in the directory in a batch 21 | * interval, one such RDD may contain the data from more than one file. 
22 | * 23 | */ 24 | 25 | public class FileBased { 26 | public static void main(String[] args) { 27 | // 28 | // The "modern" way to initialize Spark is to create a SparkSession 29 | // although they really come from the world of Spark SQL, and Dataset 30 | // and DataFrame. 31 | // 32 | SparkSession spark = SparkSession 33 | .builder() 34 | .appName("streaming-FileBased") 35 | .master("local[4]") 36 | .getOrCreate(); 37 | 38 | // 39 | // Operating on a raw RDD actually requires access to the more low 40 | // level SparkContext -- get the special Java version for convenience 41 | // 42 | JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); 43 | 44 | 45 | // streams will produce data every second (note: it would be nice if this was Java 8's Duration class, 46 | // but it isn't -- it comes from org.apache.spark.streaming) 47 | JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(1000)); 48 | 49 | // use the utility class to produce a sequence of 10 files, each containing 100 records 50 | CSVFileStreamGenerator fm = new CSVFileStreamGenerator(10, 100, 500); 51 | // create the stream, which will contain the rows of the individual files as strings 52 | // -- notice we can create the stream even though this directory won't have any data until we call 53 | // fm.makeFiles() below 54 | JavaDStream<String> streamOfRecords = ssc.textFileStream(fm.getDestination().getAbsolutePath()); 55 | 56 | // register a function to process data from the stream -- in this case it's a very simple function that 57 | // counts the number of elements in each RDD and prints it 58 | streamOfRecords.foreachRDD(r -> { 59 | System.out.println(r.count()); 60 | }); 61 | 62 | // start streaming 63 | System.out.println("*** about to start streaming"); 64 | ssc.start(); 65 | 66 | 67 | Thread t = new Thread() { 68 | public void run() { 69 | try { 70 | // A curious fact about files based streaming is that any files written 71 | // before the first RDD is produced are ignored. So wait longer than 72 | // that before producing files. 73 | Thread.sleep(2000); 74 | 75 | System.out.println("*** producing data"); 76 | // start producing files 77 | fm.makeFiles(); 78 | 79 | // give it time to get processed 80 | Thread.sleep(10000); 81 | } catch (InterruptedException ie) { 82 | } catch (IOException ioe) { 83 | throw new RuntimeException("problem in background thread", ioe); 84 | } 85 | ssc.stop(); 86 | System.out.println("*** stopping streaming"); 87 | } 88 | }; 89 | t.start(); 90 | 91 | try { 92 | ssc.awaitTermination(); 93 | } catch (InterruptedException ie) { 94 | 95 | } 96 | System.out.println("*** Streaming terminated"); 97 | } 98 | 99 | } 100 | -------------------------------------------------------------------------------- /src/main/java/streaming/Filtering.java: -------------------------------------------------------------------------------- 1 | package streaming; 2 | 3 | import org.apache.spark.api.java.JavaSparkContext; 4 | import org.apache.spark.sql.SparkSession; 5 | import org.apache.spark.streaming.Duration; 6 | import org.apache.spark.streaming.api.java.JavaDStream; 7 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 8 | import streaming.util.CSVFileStreamGenerator; 9 | import streaming.util.StreamingItem; 10 | 11 | import java.io.IOException; 12 | 13 | 14 | /** 15 | * Much of the processing we require on streams is agnostic about batch boundaries.
It's convenient to have 16 | * methods on JavaDStream that allow us to transform the streamed data item by item (using map()), or filter it 17 | * item by item (using filter()) without being concerned about batch boundaries as embodied by individual RDDs. 18 | * This example again uses map() to parse the records in the text files and then filter() to filter out individual 19 | * entries, so that by the time we receive batch RDDs only the desired items remain. 20 | * 21 | * Of course, similar filtering and transformation methods are available on the JavaRDD class, and it's better to 22 | * use those in the case where your algorithm NEEDS to be aware of batch boundaries. The methods on JavaDStream 23 | * illustrated here are useful exactly because they abstract batch boundaries away, and also because they 24 | * create a stream that can be used for additional processing. 25 | * 26 | * On the other hand, if you want to transform data in a way that is aware of batch boundaries but still creates a 27 | * stream, you can use transform() and similar methods on JavaDStream that are illustrated elsewhere. 28 | */ 29 | 30 | public class Filtering { 31 | public static void main(String[] args) { 32 | // 33 | // The "modern" way to initialize Spark is to create a SparkSession 34 | // although they really come from the world of Spark SQL, and Dataset 35 | // and DataFrame. 36 | // 37 | SparkSession spark = SparkSession 38 | .builder() 39 | .appName("streaming-Filtering") 40 | .master("local[4]") 41 | .getOrCreate(); 42 | 43 | // 44 | // Operating on a raw RDD actually requires access to the more low 45 | // level SparkContext -- get the special Java version for convenience 46 | // 47 | JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); 48 | 49 | 50 | // streams will produce data every second (note: it would be nice if this was Java 8's Duration class, 51 | // but it isn't -- it comes from org.apache.spark.streaming) 52 | JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(1000)); 53 | 54 | // use the utility class to produce a sequence of 10 files, each containing 100 records 55 | CSVFileStreamGenerator fm = new CSVFileStreamGenerator(10, 100, 500); 56 | // create the stream, which will contain the rows of the individual files as strings 57 | // -- notice we can create the stream even though this directory won't have any data until we call 58 | // fm.makeFiles() below 59 | JavaDStream<String> streamOfRecords = ssc.textFileStream(fm.getDestination().getAbsolutePath()); 60 | 61 | // use a simple transformation to create a derived stream -- the original stream of records is parsed 62 | // to produce a stream of StreamingItem objects 63 | JavaDStream<StreamingItem> streamOfItems = streamOfRecords.map(s -> new StreamingItem(s)); 64 | 65 | // create a derived stream that only contains StreamingItem objects whose category value is MEDIUM 66 | JavaDStream<StreamingItem> streamOfMediumEntries = 67 | streamOfItems.filter(item -> item.getCategory() == StreamingItem.Category.MEDIUM); 68 | 69 | // now register a function to print the size of each batch -- notice there are fewer items in each one 70 | // as only the MEDIUM entries have been retained.
71 | streamOfMediumEntries.foreachRDD(rdd -> System.out.println("Item count = " + rdd.count())); 72 | 73 | // start streaming 74 | System.out.println("*** about to start streaming"); 75 | ssc.start(); 76 | 77 | 78 | Thread t = new Thread() { 79 | public void run() { 80 | try { 81 | // A curious fact about files based streaming is that any files written 82 | // before the first RDD is produced are ignored. So wait longer than 83 | // that before producing files. 84 | Thread.sleep(2000); 85 | 86 | System.out.println("*** producing data"); 87 | // start producing files 88 | fm.makeFiles(); 89 | 90 | // give it time to get processed 91 | Thread.sleep(10000); 92 | } catch (InterruptedException ie) { 93 | } catch (IOException ioe) { 94 | throw new RuntimeException("problem in background thread", ioe); 95 | } 96 | ssc.stop(); 97 | System.out.println("*** stopping streaming"); 98 | } 99 | }; 100 | t.start(); 101 | 102 | try { 103 | ssc.awaitTermination(); 104 | } catch (InterruptedException ie) { 105 | 106 | } 107 | System.out.println("*** Streaming terminated"); 108 | } 109 | 110 | } 111 | -------------------------------------------------------------------------------- /src/main/java/streaming/MapWithState.java: -------------------------------------------------------------------------------- 1 | package streaming; 2 | 3 | import org.apache.spark.HashPartitioner; 4 | import org.apache.spark.api.java.JavaSparkContext; 5 | import org.apache.spark.api.java.Optional; 6 | import org.apache.spark.api.java.function.Function; 7 | import org.apache.spark.api.java.function.Function2; 8 | import org.apache.spark.api.java.function.Function3; 9 | import org.apache.spark.sql.SparkSession; 10 | import org.apache.spark.streaming.Duration; 11 | import org.apache.spark.streaming.State; 12 | import org.apache.spark.streaming.StateSpec; 13 | import org.apache.spark.streaming.api.java.JavaDStream; 14 | import org.apache.spark.streaming.api.java.JavaMapWithStateDStream; 15 | import org.apache.spark.streaming.api.java.JavaPairDStream; 16 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 17 | import scala.Tuple2; 18 | import streaming.util.CSVFileStreamGenerator; 19 | import streaming.util.StreamingItem; 20 | 21 | import java.io.File; 22 | import java.io.IOException; 23 | 24 | 25 | public class MapWithState { 26 | public static void main(String[] args) { 27 | // 28 | // The "modern" way to initialize Spark is to create a SparkSession 29 | // although they really come from the world of Spark SQL, and Dataset 30 | // and DataFrame. 
31 | // 32 | SparkSession spark = SparkSession 33 | .builder() 34 | .appName("streaming-Filtering") 35 | .master("local[4]") 36 | .getOrCreate(); 37 | 38 | // 39 | // Operating on a raw RDD actually requires access to the more low 40 | // level SparkContext -- get the special Java version for convenience 41 | // 42 | JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); 43 | 44 | 45 | // streams will produce data every second (note: it would be nice if this was Java 8's Duration class, 46 | // but it isn't -- it comes from org.apache.spark.streaming) 47 | JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(1000)); 48 | 49 | String checkpointPath = File.separator + "tmp" + File.separator + "LSWJ" + File.separator + "checkpoints"; 50 | File checkpointDir = new File(checkpointPath); 51 | checkpointDir.mkdir(); 52 | checkpointDir.deleteOnExit(); 53 | ssc.checkpoint(checkpointPath); 54 | 55 | // use the utility class to produce a sequence of 10 files, each containing 100 records 56 | CSVFileStreamGenerator fm = new CSVFileStreamGenerator(10, 100, 500); 57 | // create the stream, which will contain the rows of the individual files as strings 58 | // -- notice we can create the stream even though this directory won't have any data until we call 59 | // fm.makeFiles() below 60 | JavaDStream streamOfRecords = ssc.textFileStream(fm.getDestination().getAbsolutePath()); 61 | 62 | // use a simple transformation to create a derived stream -- the original stream of Records is parsed 63 | // to produce a stream of KeyAndValue objects 64 | JavaDStream streamOfItems = streamOfRecords.map(s -> new StreamingItem(s)); 65 | 66 | JavaPairDStream streamOfPairs = 67 | streamOfItems.mapToPair(si -> 68 | new Tuple2<>(si.getCategory(), si)); 69 | 70 | Function3, State, Tuple2> 71 | mappingFunction = (category, item, state) -> { 72 | int count = 1 + (state.exists() ? state.get() : 0); 73 | Tuple2 thisOne = new Tuple2<>(category, count); 74 | state.update(count); 75 | return thisOne; 76 | }; 77 | 78 | JavaMapWithStateDStream> streamOfCategoryCounts = 79 | streamOfPairs.mapWithState(StateSpec.function(mappingFunction)); 80 | 81 | streamOfCategoryCounts.foreachRDD(rdd -> { 82 | System.out.println("Batch size: " + rdd.count()); 83 | rdd.foreach(e -> System.out.println(e)); 84 | }); 85 | 86 | // start streaming 87 | System.out.println("*** about to start streaming"); 88 | ssc.start(); 89 | 90 | 91 | Thread t = new Thread() { 92 | public void run() { 93 | try { 94 | // A curious fact about files based streaming is that any files written 95 | // before the first RDD is produced are ignored. So wait longer than 96 | // that before producing files. 
97 | Thread.sleep(2000); 98 | 99 | System.out.println("*** producing data"); 100 | // start producing files 101 | fm.makeFiles(); 102 | 103 | // give it time to get processed 104 | Thread.sleep(10000); 105 | 106 | fm.makeFiles(); 107 | 108 | // give it time to get processed 109 | Thread.sleep(10000); 110 | } catch (InterruptedException ie) { 111 | } catch (IOException ioe) { 112 | throw new RuntimeException("problem in background thread", ioe); 113 | } 114 | ssc.stop(); 115 | System.out.println("*** stopping streaming"); 116 | } 117 | }; 118 | t.start(); 119 | 120 | try { 121 | ssc.awaitTermination(); 122 | } catch (InterruptedException ie) { 123 | 124 | } 125 | System.out.println("*** Streaming terminated"); 126 | } 127 | 128 | } 129 | -------------------------------------------------------------------------------- /src/main/java/streaming/MulitpleTransformations.java: -------------------------------------------------------------------------------- 1 | package streaming; 2 | 3 | import org.apache.spark.api.java.JavaSparkContext; 4 | import org.apache.spark.sql.SparkSession; 5 | import org.apache.spark.streaming.Duration; 6 | import org.apache.spark.streaming.api.java.JavaDStream; 7 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 8 | import streaming.util.CSVFileStreamGenerator; 9 | import streaming.util.StreamingItem; 10 | 11 | import java.io.IOException; 12 | 13 | 14 | /** 15 | * This example adds to the simple example in FileBased.java by introducing stream transformations 16 | * and by registering multiple batch processing functions on the various streams. 17 | * You can think of the relationships between them all as follows: 18 | * -- root stream of records from the text files 19 | * -- registered function [1]: print the count of records in each batch RDD 20 | * -- derived stream by calling count(): each RDD contains the count of elements in the batch 21 | * -- registered function [2]: print the count (should give same results as [1]) 22 | * -- derived stream by calling map() to parse the records in the CSV files 23 | * -- registered function [3]: print the value of any object with key "Key_40" 24 | * -- registered function [4]: print the fraction of objects whose value is negative 25 | * Because the various outputs tend to get tangled up, the output of each registered function identifies itself by a 26 | * bracketed number: [1], [2], [3] or [4]. 27 | */ 28 | 29 | public class MulitpleTransformations { 30 | public static void main(String[] args) { 31 | // 32 | // The "modern" way to initialize Spark is to create a SparkSession 33 | // although they really come from the world of Spark SQL, and Dataset 34 | // and DataFrame. 35 | // 36 | SparkSession spark = SparkSession 37 | .builder() 38 | .appName("streaming-MultipleTransformations") 39 | .master("local[4]") 40 | .getOrCreate(); 41 | 42 | // 43 | // Operating on a raw RDD actually requires access to the more low 44 | // level SparkContext -- get the special Java version for convenience 45 | // 46 | JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); 47 | 48 | 49 | // streams will produce data every second (note: it would be nice if this was Java 8's Duration class, 50 | // but it isn't -- it comes from org.apache.spark.streaming) 51 | JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(1000)); 52 | 53 | // use the utility class to produce a sequence of 10 files, each containing 100 records 54 | CSVFileStreamGenerator fm = new CSVFileStreamGenerator(10, 100, 500); 55 | // create the stream, which will contain the rows of the individual files as strings 56 | // -- notice we can create the stream even though this directory won't have any data until we call 57 | // fm.makeFiles() below 58 | JavaDStream<String> streamOfRecords = ssc.textFileStream(fm.getDestination().getAbsolutePath()); 59 | 60 | // Register one function to be applied to each RDD -- counting the elements and printing the count; 61 | // in this case we'll also print the timestamp every time an RDD is received -- notice we provide a function 62 | // that takes two parameters this time: the RDD and the time stamp 63 | streamOfRecords.foreachRDD((rdd, timeStamp) -> { 64 | // NOTE: the [1] below will identify every line printed by this function 65 | System.out.println("[1] Timestamp: " + timeStamp + " Count: " + rdd.count()); 66 | }); 67 | 68 | // By calling count() on the stream we transform it into another stream, this time a JavaDStream<Long>, 69 | // where each RDD contains a single element whose value is the count of elements in each RDD produced by 70 | // the original stream -- so if we register a function to print those values, it will produce the same data 71 | // as the function above that counts elements of RDDs in the original stream 72 | streamOfRecords.count().foreachRDD((rdd, timeStamp) -> 73 | rdd.foreach(countValue -> 74 | // NOTE: the [2] below will identify every line printed by this function 75 | System.out.println("[2] Timestamp: " + timeStamp + " Count: " + countValue) 76 | ) 77 | ); 78 | 79 | // use a simple transformation to create a derived stream -- the original stream of records is parsed 80 | // to produce a stream of StreamingItem objects 81 | JavaDStream<StreamingItem> streamOfItems = streamOfRecords.map(s -> new StreamingItem(s)); 82 | 83 | // use the stream objects to print the values whose key is Key_40 84 | streamOfItems.foreachRDD(rdd -> { 85 | // NOTE: the [3] below will identify every line printed by this function 86 | rdd.foreach(item -> { 87 | // NOTE: since a batch may contain more than one file, 88 | // and each file will contain a Key_40, this may print more than once per batch 89 | if (item.getKey().equals("Key_40")) System.out.println("[3] Key_40 = " + item.getValue()); 90 | }); 91 | }); 92 | 93 | // Also use the stream of StreamingItem objects to calculate the fraction of negative values in each batch 94 | // (since the values were pseudo-random, it will often be around 0.5 or so) 95 | streamOfItems.foreachRDD(rdd -> { 96 | // NOTE: the [4] below will identify every line printed by this function 97 | 98 | if (rdd.count() > 0) { 99 | double negativeCount = rdd.filter(item -> item.getValue() < 0).count(); 100 | double fraction = negativeCount / rdd.count();
101 | System.out.println("[4] negative fraction = " + fraction); 102 | } 103 | }); 104 | 105 | // start streaming 106 | System.out.println("*** about to start streaming"); 107 | ssc.start(); 108 | 109 | 110 | Thread t = new Thread() { 111 | public void run() { 112 | try { 113 | // A curious fact about files based streaming is that any files written 114 | // before the first RDD is produced are ignored. So wait longer than 115 | // that before producing files. 116 | Thread.sleep(2000); 117 | 118 | System.out.println("*** producing data"); 119 | // start producing files 120 | fm.makeFiles(); 121 | 122 | // give it time to get processed 123 | Thread.sleep(10000); 124 | } catch (InterruptedException ie) { 125 | } catch (IOException ioe) { 126 | throw new RuntimeException("problem in background thread", ioe); 127 | } 128 | ssc.stop(); 129 | System.out.println("*** stopping streaming"); 130 | } 131 | }; 132 | t.start(); 133 | 134 | try { 135 | ssc.awaitTermination(); 136 | } catch (InterruptedException ie) { 137 | 138 | } 139 | System.out.println("*** Streaming terminated"); 140 | } 141 | 142 | } 143 | -------------------------------------------------------------------------------- /src/main/java/streaming/Pairs.java: -------------------------------------------------------------------------------- 1 | package streaming; 2 | 3 | import org.apache.spark.HashPartitioner; 4 | import org.apache.spark.Partitioner; 5 | import org.apache.spark.api.java.JavaSparkContext; 6 | import org.apache.spark.api.java.function.Function; 7 | import org.apache.spark.api.java.function.Function2; 8 | import org.apache.spark.sql.SparkSession; 9 | import org.apache.spark.streaming.Duration; 10 | import org.apache.spark.streaming.api.java.JavaDStream; 11 | import org.apache.spark.streaming.api.java.JavaPairDStream; 12 | import org.apache.spark.streaming.api.java.JavaStreamingContext; 13 | import scala.Tuple2; 14 | import streaming.util.CSVFileStreamGenerator; 15 | import streaming.util.StreamingItem; 16 | 17 | import java.io.IOException; 18 | 19 | 20 | 21 | 22 | public class Pairs { 23 | public static void main(String[] args) { 24 | // 25 | // The "modern" way to initialize Spark is to create a SparkSession 26 | // although they really come from the world of Spark SQL, and Dataset 27 | // and DataFrame. 
28 | // 29 | SparkSession spark = SparkSession 30 | .builder() 31 | .appName("streaming-Filtering") 32 | .master("local[4]") 33 | .getOrCreate(); 34 | 35 | // 36 | // Operating on a raw RDD actually requires access to the more low 37 | // level SparkContext -- get the special Java version for convenience 38 | // 39 | JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); 40 | 41 | 42 | // streams will produce data every second (note: it would be nice if this was Java 8's Duration class, 43 | // but it isn't -- it comes from org.apache.spark.streaming) 44 | JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(1000)); 45 | 46 | // use the utility class to produce a sequence of 10 files, each containing 100 records 47 | CSVFileStreamGenerator fm = new CSVFileStreamGenerator(10, 100, 500); 48 | // create the stream, which will contain the rows of the individual files as strings 49 | // -- notice we can create the stream even though this directory won't have any data until we call 50 | // fm.makeFiles() below 51 | JavaDStream streamOfRecords = ssc.textFileStream(fm.getDestination().getAbsolutePath()); 52 | 53 | // use a simple transformation to create a derived stream -- the original stream of Records is parsed 54 | // to produce a stream of KeyAndValue objects 55 | JavaDStream streamOfItems = streamOfRecords.map(s -> new StreamingItem(s)); 56 | 57 | JavaPairDStream streamOfPairs = 58 | streamOfItems.mapToPair(si -> 59 | new Tuple2<>(si.getCategory(), si)); 60 | 61 | Function createCombinerFunction = item -> 1; 62 | Function2 mergeValueFunction = (count, item) -> count + 1; 63 | Function2 mergeCombinersFunction = (count1, count2) -> count1 + count2; 64 | 65 | JavaPairDStream streamOfCategoryCounts = 66 | streamOfPairs.combineByKey(createCombinerFunction, mergeValueFunction, mergeCombinersFunction, 67 | new HashPartitioner(4)); 68 | 69 | streamOfCategoryCounts.foreachRDD(rdd -> { 70 | System.out.println("Batch size: " + rdd.count()); 71 | rdd.foreach(e -> System.out.println(e)); 72 | }); 73 | 74 | // start streaming 75 | System.out.println("*** about to start streaming"); 76 | ssc.start(); 77 | 78 | 79 | Thread t = new Thread() { 80 | public void run() { 81 | try { 82 | // A curious fact about files based streaming is that any files written 83 | // before the first RDD is produced are ignored. So wait longer than 84 | // that before producing files. 85 | Thread.sleep(2000); 86 | 87 | System.out.println("*** producing data"); 88 | // start producing files 89 | fm.makeFiles(); 90 | 91 | // give it time to get processed 92 | Thread.sleep(10000); 93 | 94 | fm.makeFiles(); 95 | 96 | // give it time to get processed 97 | Thread.sleep(10000); 98 | } catch (InterruptedException ie) { 99 | } catch (IOException ioe) { 100 | throw new RuntimeException("problem in background thread", ioe); 101 | } 102 | ssc.stop(); 103 | System.out.println("*** stopping streaming"); 104 | } 105 | }; 106 | t.start(); 107 | 108 | try { 109 | ssc.awaitTermination(); 110 | } catch (InterruptedException ie) { 111 | 112 | } 113 | System.out.println("*** Streaming terminated"); 114 | } 115 | 116 | } 117 | -------------------------------------------------------------------------------- /src/main/java/streaming/README.md: -------------------------------------------------------------------------------- 1 | # Streaming 2 | 3 | ## Some philosophy 4 | 5 |

Spark streaming techniques fall into two broad areas that don't have much to do with each other
until you get to the advanced topics:

* How to get the data streamed into Spark from some external system like a database or a messaging system --
  and sometimes how to get it streamed back out again.
* How to transform the data within Spark while making full use of the streaming features.

After the single "getting started" example below, we'll look at these two areas separately. Eventually there may
need to be a section of "advanced" examples that ties them together again.

Of course, to transform streaming data we need to set up a streaming data source. Many of the sources you'll encounter
in practice take considerable setup, so I've chosen to use Spark's file streaming mechanism and provide a utility
class for generating a stream of files containing random data. My hope is that for most users of these examples it
will need no setup at all, and it has the useful side effect of bringing streaming "down to earth" by using
such a "low tech" mechanism.

## Utilities
| File | Purpose |
|------|---------|
| CSVFileStreamGenerator.java | A utility for creating a sequence of files of integers in the file system so that Spark can treat them like a stream. This follows a standard pattern to ensure correctness: each file is first created in another folder and then atomically renamed into the destination folder so that the file's point of creation is unambiguous, and is correctly recognized by the streaming mechanism. Each generated file has the same number of key/value pairs, where the keys have the same names from file to file, and the values are random numbers, and thus vary from file to file. This class is used by several of the streaming examples. |
| StreamingItem.java | An item of data to be streamed. This is used to generate the records in the CSV files and also to parse them. Several of the example stream processing pipelines will parse the text data into these objects for further processing. |
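The atomic-rename pattern described above can be sketched in a few lines of plain Java. This is an illustration of the technique only, not the actual code of CSVFileStreamGenerator.java, and both paths are made up for the example:

```java
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;

public class AtomicFileDrop {
    public static void main(String[] args) throws IOException {
        File staged = new File("/tmp/streamprep/batch-1.csv");   // hypothetical staging folder
        File visible = new File("/tmp/streamfiles/batch-1.csv"); // hypothetical watched folder
        staged.getParentFile().mkdirs();
        visible.getParentFile().mkdirs();

        // write the file completely, outside the watched folder ...
        try (FileWriter w = new FileWriter(staged)) {
            w.write("Key_1,42\n");
        }

        // ... then move it into place in one atomic step, so the stream
        // never observes a half-written file
        Files.move(staged.toPath(), visible.toPath(), StandardCopyOption.ATOMIC_MOVE);
    }
}
```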

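Here is that rename pattern reduced to its essentials -- a sketch only, with made-up class, method and
directory names rather than code from the repository:

```java
import java.io.File;
import java.io.IOException;
import java.io.PrintWriter;

public class AtomicFileDrop {
    // Write the file somewhere Spark is NOT watching, then rename it into the
    // watched directory: on a normal local file system the rename is atomic,
    // so the stream never observes a half-written file.
    static void dropFile(File prepDir, File watchedDir, String content) throws IOException {
        File tmp = File.createTempFile("Spark_", ".txt", prepDir);
        try (PrintWriter out = new PrintWriter(tmp)) {
            out.println(content);
        }
        File target = new File(watchedDir, tmp.getName());
        if (!tmp.renameTo(target)) {
            throw new IOException("rename failed: " + tmp + " -> " + target);
        }
    }
}
```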
## Getting started

<table>
<tr><th>File</th><th>What's Illustrated</th></tr>
<tr>
<td>FileBased.java</td>
<td>How to create a stream of data from files appearing in a directory. Start here.</td>
</tr>
</table>
## Processing the Data

<table>
<tr><th>File</th><th>What's Illustrated</th></tr>
<tr>
<td>MultipleTransformations.java</td>
<td>
How to establish multiple streams on the same source of data and register multiple processing
functions on a single stream.
</td>
</tr>
<tr>
<td>Filtering.java</td>
<td>
Much of the processing we require on streams is agnostic about batch boundaries. It's convenient to have
methods on JavaDStream that allow us to transform the streamed data item by item (using map()), or filter it
item by item (using filter()) without being concerned about batch boundaries as embodied by individual RDDs.
This example again uses map() to parse the records in the text files and then filter() to discard individual
entries, so that by the time we receive batch RDDs only the desired items remain. (A condensed sketch of this
style follows the table.)
</td>
</tr>
<tr>
<td>Windowing.java</td>
<td>
This example creates two derived streams with different window and slide durations.
All three streams print their batch size every time they produce a batch, so you can compare the
number of records across streams and batches.
</td>
</tr>
<tr>
<td>StateAccumulation.java</td>
<td>
This example uses an accumulator to keep a running total of the number of records processed. Every batch
that is processed is added to it, and the running total is printed.
</td>
</tr>
</table>

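To make the item-by-item style concrete, here is a condensed sketch combining the ideas from Filtering.java
and Windowing.java. It assumes the usual setup from the examples (a JavaStreamingContext `ssc` with a
one-second batch duration and a CSVFileStreamGenerator `fm`); the filtering predicate is just an illustration:

```java
// assumes: JavaStreamingContext ssc (1s batches) and CSVFileStreamGenerator fm, as in the examples
JavaDStream<String> lines = ssc.textFileStream(fm.getDestination().getAbsolutePath());

// transform and filter item by item -- no need to think about which batch an item falls in
JavaDStream<StreamingItem> items = lines.map(StreamingItem::new);
JavaDStream<StreamingItem> smallItems =
    items.filter(item -> item.getCategory() == StreamingItem.Category.SMALL);

// a derived stream whose batches cover the last 3 seconds, produced every 2 seconds
// (both durations must be integer multiples of the parent's 1s batch duration)
JavaDStream<StreamingItem> windowed =
    smallItems.window(new Duration(3000), new Duration(2000));

windowed.foreachRDD(rdd ->
    System.out.println("small items in the last 3s: " + rdd.count()));
```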
## Streaming Sources

TBD

## Advanced Topics

<table>
<tr><th>File</th><th>What's Illustrated</th></tr>
<tr>
<td>SimpleRecoveryFromCheckpoint.java</td>
<td>
This example demonstrates how to persist configured JavaDStreams across a failure and restart. It simulates
failure by destroying the first streaming context (for which a checkpoint directory is configured) and
creating a second one, not from scratch, but by reading the checkpoint directory. (The production form of
this idiom is sketched below the table.)
</td>
</tr>
<tr>
<td>MapWithState.java</td>
<td>(In progress)</td>
</tr>
<tr>
<td>Pairs.java</td>
<td>(In progress)</td>
</tr>
</table>
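For contrast with the simulation in SimpleRecoveryFromCheckpoint.java, this is roughly what the recovery
idiom looks like when used as intended: the factory passed to JavaStreamingContext.getOrCreate() really does
build the context, so the identical call works both on first launch and after a crash. A sketch under stated
assumptions -- the class name, checkpoint path and watched directory are all illustrative:

```java
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class RecoveryIdiom {
    public static void main(String[] args) throws Exception {
        String checkpointPath = "/tmp/LSWJ/checkpoints";  // illustrative location

        // First launch: no checkpoint exists, so the factory runs and builds everything.
        // After a crash: the checkpoint is found, the factory is skipped, and the
        // context comes back with its streams already configured.
        JavaStreamingContext ssc = JavaStreamingContext.getOrCreate(checkpointPath, () -> {
            SparkConf conf = new SparkConf().setMaster("local[4]").setAppName("streaming-Recovery");
            JavaStreamingContext fresh = new JavaStreamingContext(conf, new Duration(1000));
            fresh.checkpoint(checkpointPath);  // make the new context checkpoint itself
            JavaDStream<String> lines = fresh.textFileStream("/tmp/streamFiles/dest");
            lines.foreachRDD(rdd -> System.out.println("Records: " + rdd.count()));
            return fresh;
        });

        ssc.start();
        ssc.awaitTermination();
    }
}
```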
--------------------------------------------------------------------------------
/src/main/java/streaming/SimpleRecoveryFromCheckpoint.java:
--------------------------------------------------------------------------------
package streaming;


import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import streaming.util.CSVFileStreamGenerator;

import java.io.File;
import java.io.IOException;

/**
 * This example demonstrates how to persist configured JavaDStreams across a failure and restart. It simulates
 * failure by destroying the first streaming context (for which a checkpoint directory is configured) and
 * creating a second one, not from scratch, but by reading the checkpoint directory. The
 * JavaStreamingContext.getOrCreate() method is used somewhat artificially here, since the second argument would
 * normally be a function that creates the streaming context, so that the same initialization call could be used when
 * checkpoint data doesn't exist and when it does. That design pattern is adequately demonstrated in the
 * documentation: the goal here is more to explain how it works, so the initialization function provided always
 * throws an exception, demonstrating that it isn't called because a checkpoint is always available.
 *
 * Other than initializing the streaming context from the checkpoint file, the other key point to notice in this code
 * is that after recovery, the new streaming context already has the previous context's stream configured: there's no
 * need to call ssc2.textFileStream() to configure one, or even to call foreachRDD() again on anything.
 */
public class SimpleRecoveryFromCheckpoint {

    public static void main(String[] args) {
        //
        // The "modern" way to initialize Spark is to create a SparkSession
        // although they really come from the world of Spark SQL, and Dataset
        // and DataFrame.
        //
        SparkSession spark = SparkSession
            .builder()
            .appName("streaming-FileBased")
            .master("local[4]")
            .getOrCreate();

        //
        // Operating on a raw RDD actually requires access to the lower-level
        // SparkContext -- get the special Java version for convenience
        //
        JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());


        // streams will produce data every second (note: it would be nice if this was Java 8's Duration class,
        // but it isn't -- it comes from org.apache.spark.streaming)
        JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(1000));

        String checkpointPath = File.separator + "tmp" + File.separator + "LSWJ" + File.separator + "checkpoints";
        File checkpointDir = new File(checkpointPath);
        checkpointDir.mkdirs();
        checkpointDir.deleteOnExit();
        ssc.checkpoint(checkpointPath);

        // use the utility class to produce a sequence of 10 files, each containing 100 records
        CSVFileStreamGenerator fm = new CSVFileStreamGenerator(10, 100, 500);

        // normally we would use a call to JavaStreamingContext.getOrCreate() to initialize a streaming context
        // where we want to use recovery, but here we are simulating recovery, so let's just initialize it from scratch
        JavaDStream<String> streamOfRecords = ssc.textFileStream(fm.getDestination().getAbsolutePath());

        streamOfRecords.foreachRDD(rdd -> {
            long records = rdd.count();
            System.out.println("[1] Records in this RDD: " + records);
        });

        // start streaming
        System.out.println("*** about to start streaming");
        ssc.start();

        // send some data through
        Thread t = new Thread() {
            public void run() {
                try {
                    Thread.sleep(2000);

                    System.out.println("*** producing data");
                    // start producing files
                    fm.makeFiles();

                    // give it time to get processed
                    Thread.sleep(10000);
                } catch (InterruptedException ie) {
                } catch (IOException ioe) {
                    throw new RuntimeException("problem in background thread", ioe);
                }

                // stop the streaming context but keep the checkpoint data so it can be recovered from
                ssc.stop(false, true);
                System.out.println("*** stopping streaming");
            }
        };
        t.start();

        try {
            ssc.awaitTermination();
        } catch (InterruptedException ie) {

        }
        System.out.println("*** First streaming context terminated");

        // simulate failure and recovery within a single process execution by using a completely
        // new streaming context below

        // this will recover the stream we configured above -- no need to configure it again
        JavaStreamingContext ssc2 = JavaStreamingContext.getOrCreate(checkpointPath, () -> {
            // this would normally contain code to initialize the streaming from scratch because no checkpoint
            // was found, but we know that one WILL be found, and so we prove it by making this always throw!
            System.out.println("*** shouldn't be getting here: trying to re-create streaming context");
            throw new IllegalStateException("");
        });

        // start streaming
        System.out.println("*** about to start streaming again");
        ssc2.start();

        // send some more data through the recovered stream
        Thread t2 = new Thread() {
            public void run() {
                try {
                    Thread.sleep(2000);

                    System.out.println("*** producing data");
                    // start producing files
                    fm.makeFiles();

                    // give it time to get processed
                    Thread.sleep(10000);
                } catch (InterruptedException ie) {
                } catch (IOException ioe) {
                    throw new RuntimeException("problem in background thread", ioe);
                }
                ssc2.stop(false, true);
                System.out.println("*** stopping streaming again");
            }
        };
        t2.start();

        try {
            ssc2.awaitTermination();
        } catch (InterruptedException ie) {

        }
        System.out.println("*** Second streaming context terminated");

    }

}

--------------------------------------------------------------------------------
/src/main/java/streaming/StateAccumulation.java:
--------------------------------------------------------------------------------
package streaming;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.util.LongAccumulator;
import streaming.util.CSVFileStreamGenerator;

import java.io.File;
import java.io.IOException;

/**
 * This example uses an accumulator to keep a running total of the number of records processed. Every batch
 * that is processed is added to it, and the running total is printed.
 */
public class StateAccumulation {

    static class RecordCounter {

        private static volatile LongAccumulator instance = null;

        public static void clobber() {
            instance = null;
        }

        // standard double-checked locking: create the accumulator once, lazily
        public static LongAccumulator getInstance(JavaSparkContext jsc) {
            if (instance == null) {
                synchronized (RecordCounter.class) {
                    if (instance == null) {
                        System.out.println("*** Initializing RecordCounter");
                        instance = jsc.sc().longAccumulator("RecordCounter");
                    }
                }
            }
            return instance;
        }
    }

    public static void main(String[] args) {
        //
        // The "modern" way to initialize Spark is to create a SparkSession
        // although they really come from the world of Spark SQL, and Dataset
        // and DataFrame.
        //
        SparkSession spark = SparkSession
            .builder()
            .appName("streaming-FileBased")
            .master("local[4]")
            .getOrCreate();

        //
        // Operating on a raw RDD actually requires access to the lower-level
        // SparkContext -- get the special Java version for convenience
        //
        JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());


        // streams will produce data every second (note: it would be nice if this was Java 8's Duration class,
        // but it isn't -- it comes from org.apache.spark.streaming)
        JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(1000));

        // use the utility class to produce a sequence of 10 files, each containing 100 records
        CSVFileStreamGenerator fm = new CSVFileStreamGenerator(10, 100, 500);

        JavaDStream<String> streamOfRecords = ssc.textFileStream(fm.getDestination().getAbsolutePath());

        streamOfRecords.foreachRDD(rdd -> {
            // The getInstance() pattern ensures that it is only initialized for the first batch. It will also
            // be initialized on recovery if you use checkpointing, but its state won't be recovered for you.
            final LongAccumulator recordCounter = RecordCounter.getInstance(new JavaSparkContext(rdd.context()));
            long records = rdd.count();
            recordCounter.add(records);
            System.out.println("This RDD: " + records + " running total: " + recordCounter.value());
        });

        // start streaming
        System.out.println("*** about to start streaming");
        ssc.start();


        Thread t = new Thread() {
            public void run() {
                try {
                    Thread.sleep(2000);

                    System.out.println("*** producing data");
                    // start producing files
                    fm.makeFiles();

                    // give it time to get processed
                    Thread.sleep(10000);
                } catch (InterruptedException ie) {
                } catch (IOException ioe) {
                    throw new RuntimeException("problem in background thread", ioe);
                }

                ssc.stop(false, true);
                System.out.println("*** stopping streaming");
            }
        };
        t.start();

        try {
            ssc.awaitTermination();
        } catch (InterruptedException ie) {

        }
        System.out.println("*** Streaming context terminated");

    }

}

--------------------------------------------------------------------------------
/src/main/java/streaming/Windowing.java:
--------------------------------------------------------------------------------
package streaming;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import streaming.util.CSVFileStreamGenerator;
import streaming.util.StreamingItem;

import java.io.IOException;

/**
 * The windowing methods on a stream allow you to create a derived stream whose batches contain data from some
 * number of the most recent batches in the parent stream, producing a batch per some number of parent
 * stream batches (the default is one).
 *
 * Both the sliding window size and the batch frequency are specified as durations, which must be integer
 * multiples of the parent stream's batch duration.
 * Of course, the parent stream could itself have been
 * derived from another stream, so its batch duration will not necessarily be the duration specified for the
 * JavaStreamingContext.
 *
 * This example creates two derived streams with different window and slide durations. All three streams print
 * their batch size every time they produce a batch, so you can compare the number of records across streams
 * and batches.
 */

public class Windowing {
    public static void main(String[] args) {
        //
        // The "modern" way to initialize Spark is to create a SparkSession
        // although they really come from the world of Spark SQL, and Dataset
        // and DataFrame.
        //
        SparkSession spark = SparkSession
            .builder()
            .appName("streaming-Windowing")
            .master("local[4]")
            .getOrCreate();

        //
        // Operating on a raw RDD actually requires access to the lower-level
        // SparkContext -- get the special Java version for convenience
        //
        JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());


        // streams will produce data every second (note: it would be nice if this was Java 8's Duration class,
        // but it isn't -- it comes from org.apache.spark.streaming)
        JavaStreamingContext ssc = new JavaStreamingContext(sc, new Duration(1000));

        // use the utility class to produce a sequence of 10 files, each containing 100 records
        CSVFileStreamGenerator fm = new CSVFileStreamGenerator(10, 100, 500);
        // create the stream, which will contain the rows of the individual files as strings
        // -- notice we can create the stream even though this directory won't have any data until we call
        // fm.makeFiles() below
        JavaDStream<String> streamOfRecords = ssc.textFileStream(fm.getDestination().getAbsolutePath());

        // Create a derived stream that will produce a batch every second (just like its parent) but each
        // batch will contain data from the parent stream's most recent three batches
        JavaDStream<String> threeSecondsEverySecond = streamOfRecords.window(new Duration(3000));

        // Another derived stream, this time creating a batch every two seconds, but containing the data from
        // the parent stream's most recent five batches
        JavaDStream<String> fiveSecondsEveryTwoSeconds =
            streamOfRecords.window(new Duration(5000), new Duration(2000));

        //
        // Register functions to print the batch sizes in all three of the streams. Notice that:
        // 1) Each stream identifies itself as either [original], [window 3s] or [window 5s slide 2s]
        // 2) The "window duration" is how far back in the parent's stream every batch reaches
        // 3) The "slide duration" is how often a "windowed" batch is produced by the derived stream (the
        //    default is the same as the parent's "batch duration")
        // 4) You will see output from the [window 3s] stream every second, but only every two seconds from
        //    [window 5s slide 2s] -- and you can check the item counts against the right number of most recent item
        //    counts from the [original] stream.
        // 5) Time stamps are included to help you keep track of the batches
        //

        streamOfRecords.foreachRDD((rdd, timeStamp) ->
            System.out.println("[original] TS: " + timeStamp + " Item count = " + rdd.count()));

        threeSecondsEverySecond.foreachRDD((rdd, timeStamp) ->
            System.out.println("[window 3s] TS: " + timeStamp + " Item count = " + rdd.count()));

        fiveSecondsEveryTwoSeconds.foreachRDD((rdd, timeStamp) ->
            System.out.println("[window 5s slide 2s] TS: " + timeStamp + " Item count = " + rdd.count()));

        // start streaming
        System.out.println("*** about to start streaming");
        ssc.start();


        Thread t = new Thread() {
            public void run() {
                try {
                    // A curious fact about file-based streaming is that any files written
                    // before the first RDD is produced are ignored. So wait longer than
                    // that before producing files.
                    Thread.sleep(2000);

                    System.out.println("*** producing data");
                    // start producing files
                    fm.makeFiles();

                    // give it time to get processed
                    Thread.sleep(10000);
                } catch (InterruptedException ie) {
                } catch (IOException ioe) {
                    throw new RuntimeException("problem in background thread", ioe);
                }
                ssc.stop();
                System.out.println("*** stopping streaming");
            }
        };
        t.start();

        try {
            ssc.awaitTermination();
        } catch (InterruptedException ie) {

        }
        System.out.println("*** Streaming terminated");
    }

}

--------------------------------------------------------------------------------
/src/main/java/streaming/util/CSVFileStreamGenerator.java:
--------------------------------------------------------------------------------
package streaming.util;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Random;

/**
 * A utility for creating a sequence of files of integers in the file system
 * so that Spark can treat them like a stream. This follows a standard pattern
 * to ensure correctness: each file is first created in another folder and then
 * atomically renamed into the destination folder so that the file's point of
 * creation is unambiguous, and is correctly recognized by the streaming
 * mechanism.
 *
 * Each generated file has the same number of key/value pairs, where the
 * keys have the same names from file to file, and the values are random
 * numbers, and thus vary from file to file.
 *
 * This class is used by several of the streaming examples.
 */
public class CSVFileStreamGenerator {

    private File _root;
    private File _prep;
    private File _dest;
    private int _nFiles;
    private int _nRecords;
    private int _betweenFilesMsec;
    private Random _random = new Random();

    public CSVFileStreamGenerator(int nFiles, int nRecords, int betweenFilesMsec) {
        _nFiles = nFiles;
        _nRecords = nRecords;
        _betweenFilesMsec = betweenFilesMsec;

        _root = new File(File.separator + "tmp" + File.separator + "streamFiles");
        makeExist(_root);

        _prep = new File(_root.getAbsolutePath() + File.separator + "prep");
        makeExist(_prep);

        _dest = new File(_root.getAbsolutePath() + File.separator + "dest");
        makeExist(_dest);
    }

    public File getDestination() { return _dest; }

    // fill a file with records whose keys are numbered and whose values are random
    private void writeOutput(File f) throws FileNotFoundException {
        PrintWriter p = new java.io.PrintWriter(f);
        try {
            for (int i = 1; i <= _nRecords; i++) {
                StreamingItem item = new StreamingItem(_random, String.format("Key_%d", i));
                p.println(item);
            }
        } finally {
            p.close();
        }
    }

    private static void makeExist(File dir) {
        dir.mkdirs();
    }

    // make the sequence of files by creating them in one place and renaming
    // them into the directory where Spark is looking for them
    // (file-based streaming requires "atomic" creation of the files)
    public void makeFiles() throws IOException, InterruptedException {
        for (int n = 1; n <= _nFiles; n++) {
            File f = File.createTempFile("Spark_", ".txt", _prep);
            writeOutput(f);
            File nf = new File(_dest + File.separator + f.getName());
            f.renameTo(nf);
            nf.deleteOnExit();
            Thread.sleep(_betweenFilesMsec);
        }
    }

}

--------------------------------------------------------------------------------
/src/main/java/streaming/util/StreamingItem.java:
--------------------------------------------------------------------------------
package streaming.util;

import java.io.Serializable;
import java.util.Random;

/**
 * This utility class complements the CSVFileStreamGenerator by providing a way to parse and represent
 * the key/value pairs generated in the CSV files.
 */
public class StreamingItem implements Serializable {

    public enum Category { SMALL, MEDIUM, LARGE, HUGE };

    /**
     * Construct an item with the given key, filling in the other fields at random.
     * @param random the source of randomness for the value and category
     * @param key the record's key
     */
    public StreamingItem(Random random, String key) {
        _key = key;
        _value = random.nextInt();
        // category distribution: 2/11 SMALL, 4/11 MEDIUM, 3/11 LARGE, 2/11 HUGE
        int whichCat = random.nextInt(11);
        switch (whichCat) {
            case 0:
            case 1:
                _category = Category.SMALL;
                break;
            case 2:
            case 3:
            case 4:
            case 5:
                _category = Category.MEDIUM;
                break;
            case 6:
            case 7:
            case 8:
                _category = Category.LARGE;
                break;
            default:
                _category = Category.HUGE;
        }
    }


    public StreamingItem(String key, Category category, int value) {
        _key = key;
        _category = category;
        _value = value;
    }

    /**
     * Parse an item out of the format found in the generated CSV files.
     * @param csvLine a comma-separated key, category and value
     * @throws IllegalArgumentException if the line doesn't have exactly three comma-separated fields
     */
    public StreamingItem(String csvLine) throws IllegalArgumentException {
        String[] parts = csvLine.split(",");
        if (parts.length != 3) {
            throw new IllegalArgumentException("String does not have exactly three comma-separated fields: " + csvLine);
        }
        _key = parts[0];
        _category = Category.valueOf(parts[1]);
        _value = Integer.valueOf(parts[2]);
    }

    public String getKey() { return _key; }

    public Category getCategory() { return _category; }

    public int getValue() { return _value; }

    public String toString() { return _key + "," + _category + "," + _value; }

    private String _key;

    private Category _category;

    private int _value;
}

--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
log4j.rootCategory=WARN, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=WARN
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=WARN

--------------------------------------------------------------------------------