├── project
│   ├── plugins.sbt
│   └── build.properties
├── .idea
│   ├── copyright
│   │   └── profiles_settings.xml
│   ├── vcs.xml
│   ├── libraries
│   │   ├── SBT__oro_oro_2_0_8_jar.xml
│   │   ├── SBT__org_tukaani_xz_1_0_jar.xml
│   │   ├── SBT__jline_jline_0_9_94_jar.xml
│   │   ├── SBT__xmlenc_xmlenc_0_52_jar.xml
│   │   ├── SBT__log4j_log4j_1_2_17_jar.xml
│   │   ├── SBT__net_sf_py4j_py4j_0_10_6_jar.xml
│   │   ├── SBT__org_apache_ivy_ivy_2_4_0_jar.xml
│   │   ├── SBT__org_lz4_lz4_java_1_4_0_jar.xml
│   │   ├── SBT__com_101tec_zkclient_0_8_jar.xml
│   │   ├── SBT__net_iharder_base64_2_3_8_jar.xml
│   │   ├── SBT__xerces_xercesImpl_2_9_1_jar.xml
│   │   ├── SBT__xml_apis_xml_apis_1_3_04_jar.xml
│   │   ├── SBT__com_google_inject_guice_3_0_jar.xml
│   │   ├── SBT__commons_io_commons_io_2_4_jar.xml
│   │   ├── SBT__io_netty_netty_3_9_9_Final_jar.xml
│   │   ├── SBT__joda_time_joda_time_2_9_9_jar.xml
│   │   ├── SBT__org_apache_avro_avro_1_7_4_jar.xml
│   │   ├── SBT__com_carrotsearch_hppc_0_7_2_jar.xml
│   │   ├── SBT__net_razorvine_pyrolite_4_13_jar.xml
│   │   ├── SBT__org_objenesis_objenesis_2_1_jar.xml
│   │   ├── SBT__org_scala_lang_scalap_2_11_0_jar.xml
│   │   ├── SBT__org_slf4j_slf4j_api_1_7_25_jar.xml
│   │   ├── SBT__aopalliance_aopalliance_1_0_jar.xml
│   │   ├── SBT__com_google_guava_guava_12_0_1_jar.xml
│   │   ├── SBT__com_ning_compress_lzf_1_0_3_jar.xml
│   │   ├── SBT__com_twitter_chill_2_11_0_8_4_jar.xml
│   │   ├── SBT__com_twitter_chill_java_0_8_4_jar.xml
│   │   ├── SBT__commons_cli_commons_cli_1_2_jar.xml
│   │   ├── SBT__commons_net_commons_net_2_2_jar.xml
│   │   ├── SBT__javax_inject_javax_inject_1_jar.xml
│   │   ├── SBT__javax_xml_bind_jaxb_api_2_2_2_jar.xml
│   │   ├── SBT__org_htrace_htrace_core_3_0_4_jar.xml
│   │   ├── SBT__com_google_code_gson_gson_2_2_4_jar.xml
│   │   ├── SBT__commons_lang_commons_lang_2_6_jar.xml
│   │   ├── SBT__io_airlift_aircompressor_0_8_jar.xml
│   │   ├── SBT__javax_xml_stream_stax_api_1_0_2_jar.xml
│   │   ├── SBT__org_antlr_antlr4_runtime_4_7_jar.xml
│   │   ├── SBT__org_apache_avro_avro_ipc_1_7_7_jar.xml
│   │   ├── SBT__org_slf4j_jul_to_slf4j_1_7_16_jar.xml
│   │   ├── SBT__io_netty_netty_all_4_1_17_Final_jar.xml
│   │   ├── SBT__net_java_dev_jets3t_jets3t_0_9_4_jar.xml
│   │   ├── SBT__org_codehaus_janino_janino_3_0_8_jar.xml
│   │   ├── SBT__org_slf4j_slf4j_log4j12_1_7_21_jar.xml
│   │   ├── SBT__com_esotericsoftware_minlog_1_3_0_jar.xml
│   │   ├── SBT__com_github_luben_zstd_jni_1_3_2_2_jar.xml
│   │   ├── SBT__commons_codec_commons_codec_1_11_jar.xml
│   │   ├── SBT__javax_activation_activation_1_1_1_jar.xml
│   │   ├── SBT__org_javassist_javassist_3_18_1_GA_jar.xml
│   │   ├── SBT__org_slf4j_jcl_over_slf4j_1_7_16_jar.xml
│   │   ├── SBT__javax_ws_rs_javax_ws_rs_api_2_0_1_jar.xml
│   │   ├── SBT__net_sf_jopt_simple_jopt_simple_4_9_jar.xml
│   │   ├── SBT__org_apache_avro_avro_ipc_1_7_7_tests_jar.xml
│   │   ├── SBT__org_apache_hadoop_hadoop_auth_2_6_5_jar.xml
│   │   ├── SBT__org_apache_hadoop_hadoop_hdfs_2_6_5_jar.xml
│   │   ├── SBT__org_apache_orc_orc_core_1_4_1_nohive_jar.xml
│   │   ├── SBT__org_codehaus_jettison_jettison_1_1_jar.xml
│   │   ├── SBT__org_glassfish_hk2_hk2_api_2_4_0_b34_jar.xml
│   │   ├── SBT__org_json4s_json4s_ast_2_11_3_2_11_jar.xml
│   │   ├── SBT__org_mortbay_jetty_jetty_util_6_1_26_jar.xml
│   │   ├── SBT__org_spark_project_spark_unused_1_0_0_jar.xml
│   │   ├── SBT__com_clearspring_analytics_stream_2_7_0_jar.xml
│   │   ├── SBT__com_google_code_findbugs_jsr305_3_0_2_jar.xml
│   │   ├── SBT__com_vlkan_flatbuffers_1_2_0_3f79e055_jar.xml
│   │   ├── SBT__org_apache_arrow_arrow_format_0_8_0_jar.xml
│   │   ├── SBT__org_apache_arrow_arrow_memory_0_8_0_jar.xml
│   │   ├── SBT__org_apache_arrow_arrow_vector_0_8_0_jar.xml
│   │   ├── SBT__org_apache_commons_commons_lang3_3_5_jar.xml
│   │   ├── SBT__org_apache_kafka_kafka_2_11_0_10_0_1_jar.xml
│   │   ├── SBT__org_apache_zookeeper_zookeeper_3_4_6_jar.xml
│   │   ├── SBT__org_glassfish_hk2_hk2_utils_2_4_0_b34_jar.xml
│   │   ├── SBT__org_json4s_json4s_core_2_11_3_2_11_jar.xml
│   │   ├── SBT__org_scala_lang_scala_library_2_11_12_jar.xml
│   │   ├── SBT__org_scala_lang_scala_reflect_2_11_12_jar.xml
│   │   ├── SBT__com_yammer_metrics_metrics_core_2_2_0_jar.xml
│   │   ├── SBT__org_apache_hadoop_hadoop_client_2_6_5_jar.xml
│   │   ├── SBT__org_apache_hadoop_hadoop_common_2_6_5_jar.xml
│   │   ├── SBT__org_apache_spark_spark_sql_2_11_2_3_0_jar.xml
│   │   ├── SBT__org_bouncycastle_bcprov_jdk15on_1_52_jar.xml
│   │   ├── SBT__org_codehaus_jackson_jackson_xc_1_9_13_jar.xml
│   │   ├── SBT__org_scala_lang_scala_compiler_2_11_0_jar.xml
│   │   ├── SBT__org_xerial_snappy_snappy_java_1_1_2_6_jar.xml
│   │   ├── SBT__com_esotericsoftware_kryo_shaded_3_0_3_jar.xml
│   │   ├── SBT__com_univocity_univocity_parsers_2_5_9_jar.xml
│   │   ├── SBT__commons_digester_commons_digester_1_8_jar.xml
│   │   ├── SBT__io_dropwizard_metrics_metrics_jvm_3_1_5_jar.xml
│   │   ├── SBT__javax_servlet_javax_servlet_api_3_1_0_jar.xml
│   │   ├── SBT__org_apache_commons_commons_crypto_1_0_0_jar.xml
│   │   ├── SBT__org_apache_commons_commons_math3_3_4_1_jar.xml
│   │   ├── SBT__org_apache_httpcomponents_httpclient_4_5_jar.xml
│   │   ├── SBT__org_apache_httpcomponents_httpcore_4_4_1_jar.xml
│   │   ├── SBT__org_apache_kafka_kafka_clients_0_10_0_1_jar.xml
│   │   ├── SBT__org_apache_parquet_parquet_column_1_8_2_jar.xml
│   │   ├── SBT__org_apache_parquet_parquet_common_1_8_2_jar.xml
│   │   ├── SBT__org_apache_parquet_parquet_format_2_3_1_jar.xml
│   │   ├── SBT__org_apache_parquet_parquet_hadoop_1_8_2_jar.xml
│   │   ├── SBT__org_apache_spark_spark_core_2_11_2_3_0_jar.xml
│   │   ├── SBT__org_apache_spark_spark_tags_2_11_2_3_0_jar.xml
│   │   ├── SBT__org_glassfish_hk2_hk2_locator_2_4_0_b34_jar.xml
│   │   ├── SBT__org_json4s_json4s_jackson_2_11_3_2_11_jar.xml
│   │   ├── SBT__org_roaringbitmap_RoaringBitmap_0_5_11_jar.xml
│   │   ├── SBT__com_google_protobuf_protobuf_java_2_5_0_jar.xml
│   │   ├── SBT__com_jamesmurty_utils_java_xmlbuilder_1_1_jar.xml
│   │   ├── SBT__com_thoughtworks_paranamer_paranamer_2_8_jar.xml
│   │   ├── SBT__io_dropwizard_metrics_metrics_core_3_1_5_jar.xml
│   │   ├── SBT__io_dropwizard_metrics_metrics_json_3_1_5_jar.xml
│   │   ├── SBT__org_apache_avro_avro_mapred_1_7_7_hadoop2_jar.xml
│   │   ├── SBT__org_apache_curator_curator_client_2_6_0_jar.xml
│   │   ├── SBT__org_apache_hadoop_hadoop_yarn_api_2_6_5_jar.xml
│   │   ├── SBT__org_apache_orc_orc_mapreduce_1_4_1_nohive_jar.xml
│   │   ├── SBT__org_apache_parquet_parquet_jackson_1_8_2_jar.xml
│   │   ├── SBT__org_apache_xbean_xbean_asm5_shaded_4_4_jar.xml
│   │   ├── SBT__org_codehaus_jackson_jackson_jaxrs_1_9_13_jar.xml
│   │   ├── SBT__commons_beanutils_commons_beanutils_1_7_0_jar.xml
│   │   ├── SBT__commons_httpclient_commons_httpclient_3_1_jar.xml
│   │   ├── SBT__org_apache_commons_commons_compress_1_4_1_jar.xml
│   │   ├── SBT__org_apache_curator_curator_recipes_2_6_0_jar.xml
│   │   ├── SBT__org_apache_directory_api_api_util_1_0_0_M20_jar.xml
│   │   ├── SBT__org_apache_parquet_parquet_encoding_1_8_2_jar.xml
│   │   ├── SBT__org_apache_spark_spark_kvstore_2_11_2_3_0_jar.xml
│   │   ├── SBT__org_apache_spark_spark_sketch_2_11_2_3_0_jar.xml
│   │   ├── SBT__org_apache_spark_spark_unsafe_2_11_2_3_0_jar.xml
│   │   ├── SBT__javax_annotation_javax_annotation_api_1_2_jar.xml
│   │   ├── SBT__javax_validation_validation_api_1_1_0_Final_jar.xml
│   │   ├── SBT__org_apache_curator_curator_framework_2_6_0_jar.xml
│   │   ├── SBT__org_apache_hadoop_hadoop_annotations_2_6_5_jar.xml
│   │   ├── SBT__org_apache_hadoop_hadoop_yarn_client_2_6_5_jar.xml
│   │   ├── SBT__org_apache_hadoop_hadoop_yarn_common_2_6_5_jar.xml
│   │   ├── SBT__org_apache_spark_spark_catalyst_2_11_2_3_0_jar.xml
│   │   ├── SBT__org_apache_spark_spark_launcher_2_11_2_3_0_jar.xml
│   │   ├── SBT__org_codehaus_janino_commons_compiler_3_0_8_jar.xml
│   │   ├── SBT__org_scala_lang_modules_scala_xml_2_11_1_0_1_jar.xml
│   │   ├── SBT__org_sonatype_sisu_inject_cglib_2_2_1_v20090111_jar.xml
│   │   ├── SBT__com_fasterxml_jackson_core_jackson_core_2_7_9_jar.xml
│   │   ├── SBT__io_dropwizard_metrics_metrics_graphite_3_1_5_jar.xml
│   │   ├── SBT__org_apache_spark_spark_streaming_2_11_2_3_0_jar.xml
│   │   ├── SBT__org_codehaus_jackson_jackson_core_asl_1_9_13_jar.xml
│   │   ├── SBT__org_fusesource_leveldbjni_leveldbjni_all_1_8_jar.xml
│   │   ├── SBT__org_glassfish_jersey_core_jersey_client_2_22_2_jar.xml
│   │   ├── SBT__org_glassfish_jersey_core_jersey_common_2_22_2_jar.xml
│   │   ├── SBT__org_glassfish_jersey_core_jersey_server_2_22_2_jar.xml
│   │   ├── SBT__commons_collections_commons_collections_3_2_2_jar.xml
│   │   ├── SBT__org_apache_directory_api_api_asn1_api_1_0_0_M20_jar.xml
│   │   ├── SBT__org_codehaus_jackson_jackson_mapper_asl_1_9_13_jar.xml
│   │   ├── SBT__org_glassfish_hk2_osgi_resource_locator_1_0_1_jar.xml
│   │   ├── SBT__commons_beanutils_commons_beanutils_core_1_8_0_jar.xml
│   │   ├── SBT__org_glassfish_hk2_external_javax_inject_2_4_0_b34_jar.xml
│   │   ├── SBT__commons_configuration_commons_configuration_1_6_jar.xml
│   │   ├── SBT__org_apache_directory_server_apacheds_i18n_2_0_0_M15_jar.xml
│   │   ├── SBT__org_apache_spark_spark_network_common_2_11_2_3_0_jar.xml
│   │   ├── SBT__org_apache_spark_spark_sql_kafka_0_10_2_11_2_3_0_jar.xml
│   │   ├── SBT__org_glassfish_jersey_media_jersey_media_jaxb_2_22_2_jar.xml
│   │   ├── SBT__com_fasterxml_jackson_core_jackson_databind_2_6_7_1_jar.xml
│   │   ├── SBT__com_typesafe_scala_logging_scala_logging_2_11_3_8_0_jar.xml
│   │   ├── SBT__org_apache_hadoop_hadoop_yarn_server_common_2_6_5_jar.xml
│   │   ├── SBT__org_apache_spark_spark_network_shuffle_2_11_2_3_0_jar.xml
│   │   ├── SBT__com_fasterxml_jackson_core_jackson_annotations_2_6_7_jar.xml
│   │   ├── SBT__org_apache_hadoop_hadoop_mapreduce_client_app_2_6_5_jar.xml
│   │   ├── SBT__org_apache_hadoop_hadoop_mapreduce_client_core_2_6_5_jar.xml
│   │   ├── SBT__org_apache_hadoop_hadoop_mapreduce_client_common_2_6_5_jar.xml
│   │   ├── SBT__org_apache_hadoop_hadoop_mapreduce_client_shuffle_2_6_5_jar.xml
│   │   ├── SBT__org_apache_spark_spark_streaming_kafka_0_10_2_11_2_3_0_jar.xml
│   │   ├── SBT__org_glassfish_jersey_bundles_repackaged_jersey_guava_2_22_2_jar.xml
│   │   ├── SBT__org_glassfish_hk2_external_aopalliance_repackaged_2_4_0_b34_jar.xml
│   │   ├── SBT__com_fasterxml_jackson_module_jackson_module_paranamer_2_7_9_jar.xml
│   │   ├── SBT__org_apache_hadoop_hadoop_mapreduce_client_jobclient_2_6_5_jar.xml
│   │   ├── SBT__org_scala_lang_modules_scala_parser_combinators_2_11_1_0_4_jar.xml
│   │   ├── SBT__org_apache_directory_server_apacheds_kerberos_codec_2_0_0_M15_jar.xml
│   │   ├── SBT__com_fasterxml_jackson_module_jackson_module_scala_2_11_2_6_7_1_jar.xml
│   │   ├── SBT__org_glassfish_jersey_containers_jersey_container_servlet_2_22_2_jar.xml
│   │   └── SBT__org_glassfish_jersey_containers_jersey_container_servlet_core_2_22_2_jar.xml
│   ├── hydra.xml
│   ├── scala_compiler.xml
│   ├── modules.xml
│   ├── misc.xml
│   ├── sbt.xml
│   ├── compiler.xml
│   ├── uiDesigner.xml
│   └── modules
│       └── spark-streaming-with-kafka-build.iml
├── .travis.yml
├── .gitignore
├── src
│   └── main
│       ├── resources
│       │   └── log4j.properties
│       └── scala
│           ├── util
│           │   ├── DirectServerDemo.scala
│           │   ├── SparkKafkaSink.scala
│           │   ├── TemporaryDirectories.scala
│           │   ├── EmbeddedZookeeper.scala
│           │   ├── PartitionMapAnalyzer.scala
│           │   ├── SimpleKafkaClient.scala
│           │   └── EmbeddedKafkaServer.scala
│           ├── ExceptionPropagation.scala
│           ├── structured
│           │   ├── SimpleAggregation.scala
│           │   ├── Simple.scala
│           │   ├── README.md
│           │   ├── SubscribeAndPublish.scala
│           │   └── Foreach.scala
│           ├── ControlledPartitioning.scala
│           ├── SendWithDifferentPartitioning.scala
│           ├── SimpleStreaming.scala
│           ├── MultipleTopics.scala
│           ├── MultipleStreams.scala
│           ├── SimpleStreamingFromRDD.scala
│           ├── MultipleConsumerGroups.scala
│           ├── Timestamp.scala
│           ├── AddPartitionsWhileStreaming.scala
│           └── applications
│               └── stock_price_feed
│                   └── StockMarketData.scala
├── LICENSE
└── README.md

--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
logLevel := Level.Warn

--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
sbt.version = 0.13.8

--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: scala

scala:
  - 2.11.12

script:
  - sbt clean compile

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/
.history
.cache
.lib/
.idea/workspace.xml

### Scala ###
*.class
*.log

--------------------------------------------------------------------------------
/.idea/ (copyright/profiles_settings.xml, vcs.xml, the SBT__*.xml library descriptors, hydra.xml, scala_compiler.xml, modules.xml, misc.xml, sbt.xml, compiler.xml):
--------------------------------------------------------------------------------
(IDE-generated XML; the markup was stripped in this export, so no content is reproduced for these files.)
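The project's build.sbt is not part of this extract, but the versions pinned by the .idea/libraries descriptors above (Spark 2.3.0 for Scala 2.11, Kafka 0.10.0.1, scala-logging 3.8.0) together with the sbt 0.13.8 and Scala 2.11.12 settings suggest a build definition along the following lines. This is a reconstructed sketch, not the repository's actual build file; the project name and the exact set of directly declared (rather than transitive) dependencies are assumptions.

// Hypothetical build.sbt -- reconstructed from the dependency versions above, not the project's own file.
name := "spark-streaming-with-kafka"   // assumed project name

scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  "org.apache.spark"           %% "spark-core"                 % "2.3.0",
  "org.apache.spark"           %% "spark-streaming"            % "2.3.0",
  "org.apache.spark"           %% "spark-sql"                  % "2.3.0",
  "org.apache.spark"           %% "spark-streaming-kafka-0-10" % "2.3.0",
  "org.apache.spark"           %% "spark-sql-kafka-0-10"       % "2.3.0",
  "org.apache.kafka"           %% "kafka"                      % "0.10.0.1",
  "com.typesafe.scala-logging" %% "scala-logging"              % "3.8.0"
)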
--------------------------------------------------------------------------------
/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
log4j.rootCategory=WARN, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=WARN
log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=WARN

log4j.logger.kafka.server.KafkaServer = DEBUG
log4j.logger.kafka.admin.TopicCommand = DEBUG

log4j.logger.util.EmbeddedKafkaServer = INFO
log4j.logger.util.EmbeddedZookeeper = INFO
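The last four categories above map logger names onto classes: the kafka.* loggers belong to the embedded Kafka broker, and the util.* loggers to the helper classes under src/main/scala/util. As an illustration of how a class picks up such a category name, here is a hypothetical package-util helper using plain slf4j (slf4j-api and slf4j-log4j12 both appear in the library list above); TopicWatcher is an invented name, and this is not code from the project.

package util

import org.slf4j.{Logger, LoggerFactory}

// The logger is named after the fully qualified class, i.e. "util.TopicWatcher",
// so a line such as log4j.logger.util.TopicWatcher=INFO in log4j.properties would
// control its output, just as the util.EmbeddedKafkaServer and util.EmbeddedZookeeper
// entries above do for those classes.
class TopicWatcher {
  private val log: Logger = LoggerFactory.getLogger(classOf[TopicWatcher])

  def report(topic: String, partitions: Int): Unit =
    log.info(s"topic [$topic] has $partitions partitions")
}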
/.idea/compiler.xml:
--------------------------------------------------------------------------------

[/.idea/compiler.xml: IntelliJ compiler configuration; XML content not recoverable.]

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2016 Spiro Michaylov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
23 |
--------------------------------------------------------------------------------
/src/main/scala/util/DirectServerDemo.scala:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | /**
4 |  * Run this first to verify that the embedded Kafka setup is working for you.
5 |  * It starts an embedded Kafka server, creates a topic, publishes some messages,
6 |  * reads them back and shuts down the embedded server.
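 * (Usage note, assuming the project's standard sbt layout: this demo can be launched from the
 * command line with {{{ sbt "runMain util.DirectServerDemo" }}}.)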
7 | */ 8 | 9 | object DirectServerDemo { 10 | def main (args: Array[String]) { 11 | 12 | val topic = "foo" 13 | 14 | println("*** about to start embedded Kafka server") 15 | 16 | val kafkaServer = new EmbeddedKafkaServer() 17 | kafkaServer.start() 18 | 19 | println("*** server started") 20 | 21 | kafkaServer.createTopic(topic, 4) 22 | 23 | println("*** topic [" + topic + "] created") 24 | 25 | Thread.sleep(5000) 26 | 27 | val kafkaClient = new SimpleKafkaClient(kafkaServer) 28 | 29 | println("*** about to produce messages") 30 | 31 | kafkaClient.send(topic, Seq( 32 | ("Key_1", "Value_1"), 33 | ("Key_2", "Value_2"), 34 | ("Key_3", "Value_3"), 35 | ("Key_4", "Value_4"), 36 | ("Key_5", "Value_5") 37 | )) 38 | 39 | println("*** produced messages") 40 | 41 | Thread.sleep(5000) 42 | 43 | println("*** about to consume messages") 44 | 45 | kafkaClient.consumeAndPrint( 46 | topic, 47 | 5) 48 | 49 | println("*** stopping embedded Kafka server") 50 | 51 | kafkaServer.stop() 52 | 53 | println("*** done") 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/util/SparkKafkaSink.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import java.util.Properties 4 | 5 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 6 | 7 | /** 8 | * For publishing to Kafka from every partition of an RDD -- see 9 | * http://allegro.tech/2015/08/spark-kafka-integration.html 10 | * 11 | * @param createProducer 12 | */ 13 | class SparkKafkaSink(createProducer: () => KafkaProducer[String, String]) extends Serializable { 14 | 15 | lazy val producer = createProducer() 16 | 17 | /** 18 | * Records assigned to partitions using the configured partitioner. 19 | * 20 | * @param topic 21 | * @param key 22 | * @param value 23 | */ 24 | def send(topic: String, key: String, value: String): Unit = { 25 | producer.send(new ProducerRecord(topic, key, value)) 26 | } 27 | 28 | /** 29 | * Records assigned to partitions explicitly, ignoring the configured partitioner. 30 | * 31 | * @param topic 32 | * @param partition 33 | * @param key 34 | * @param value 35 | */ 36 | def send(topic: String, partition: Int, key: String, value: String): Unit = { 37 | producer.send(new ProducerRecord(topic, partition, key, value)) 38 | } 39 | } 40 | 41 | object SparkKafkaSink { 42 | def apply(config: Properties): SparkKafkaSink = { 43 | val f = () => { 44 | val producer = new KafkaProducer[String, String](config) 45 | 46 | sys.addShutdownHook { 47 | producer.close() 48 | } 49 | 50 | producer 51 | } 52 | new SparkKafkaSink(f) 53 | } 54 | } -------------------------------------------------------------------------------- /src/main/scala/util/TemporaryDirectories.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import java.io.{IOException, File} 4 | 5 | import org.apache.commons.io.FileUtils 6 | 7 | /** 8 | * Set up temporary directories, to be deleted automatically at shutdown. If the directories 9 | * exist at creation time they will be cleaned up (deleted) first. 
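 *
 * On a Unix-like filesystem the directories this class creates (taken from the path fields below) are:
 * {{{
 *   /tmp/SSWK/checkpoints
 *   /tmp/SSWK/zookeeper-snapshot
 *   /tmp/SSWK/zookeeper-logs
 *   /tmp/SSWK/kafka-logs
 * }}}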
10 | */ 11 | private[util] class TemporaryDirectories { 12 | val tempRootPath = java.io.File.separator + "tmp" + java.io.File.separator + "SSWK" 13 | 14 | val checkpointPath = tempRootPath + File.separator + "checkpoints" 15 | 16 | private val rootDir = new File(tempRootPath) 17 | 18 | // delete in advance in case last cleanup didn't 19 | deleteRecursively(rootDir) 20 | rootDir.mkdir 21 | 22 | val zkSnapshotPath = tempRootPath + File.separator + "zookeeper-snapshot" 23 | val zkSnapshotDir = new File(zkSnapshotPath) 24 | zkSnapshotDir.mkdir() 25 | 26 | val zkLogDirPath = tempRootPath + File.separator + "zookeeper-logs" 27 | val zkLogDir = new File(zkLogDirPath) 28 | zkLogDir.mkdir() 29 | 30 | val kafkaLogDirPath = tempRootPath + File.separator + "kafka-logs" 31 | val kafkaLogDir = new File(kafkaLogDirPath) 32 | kafkaLogDir.mkdir() 33 | 34 | 35 | Runtime.getRuntime().addShutdownHook(new Thread(new Runnable() { 36 | def run { 37 | try { 38 | deleteRecursively(rootDir) 39 | } 40 | catch { 41 | case e: Exception => { 42 | } 43 | } 44 | } 45 | })) 46 | 47 | 48 | private def deleteRecursively(file: File): Unit = { 49 | if (file.isDirectory) 50 | file.listFiles.foreach(deleteRecursively) 51 | if (file.exists && !file.delete) 52 | throw new Exception(s"Unable to delete ${file.getAbsolutePath}") 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/util/EmbeddedZookeeper.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import java.io.{IOException, File} 4 | import java.net.InetSocketAddress 5 | 6 | import com.typesafe.scalalogging.Logger 7 | import org.apache.zookeeper.server.{NIOServerCnxnFactory, ZooKeeperServer, ServerCnxnFactory} 8 | 9 | /** 10 | * Start/stop a single Zookeeper instance for use by EmbeddedKafkaServer. Do not create one of these directly. 11 | * @param port 12 | */ 13 | private[util] class EmbeddedZookeeper(port: Int, tempDirs: TemporaryDirectories) { 14 | private val LOGGER = Logger[EmbeddedZookeeper] 15 | private var serverConnectionFactory: Option[ServerCnxnFactory] = None 16 | 17 | /** 18 | * Start a single instance. 19 | */ 20 | def start() { 21 | LOGGER.info(s"starting Zookeeper on $port") 22 | 23 | try { 24 | val zkMaxConnections = 32 25 | val zkTickTime = 2000 26 | val zkServer = new ZooKeeperServer(tempDirs.zkSnapshotDir, tempDirs.zkLogDir, zkTickTime) 27 | serverConnectionFactory = Some(new NIOServerCnxnFactory()) 28 | serverConnectionFactory.get.configure(new InetSocketAddress("localhost", port), zkMaxConnections) 29 | serverConnectionFactory.get.startup(zkServer) 30 | } 31 | catch { 32 | case e: InterruptedException => { 33 | Thread.currentThread.interrupt() 34 | } 35 | case e: IOException => { 36 | throw new RuntimeException("Unable to start ZooKeeper", e) 37 | } 38 | } 39 | } 40 | 41 | /** 42 | * Stop the instance if running. 
43 | */ 44 | def stop() { 45 | LOGGER.info(s"shutting down Zookeeper on $port") 46 | serverConnectionFactory match { 47 | case Some(f) => { 48 | f.shutdown 49 | serverConnectionFactory = None 50 | } 51 | case None => 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala/util/PartitionMapAnalyzer.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerRecord 4 | import org.apache.spark.rdd.RDD 5 | 6 | // 7 | // This is a general tool for analyzing an RDD of ConsumerRecord, such as is normally produced by a Kafka stream. 8 | // The goal is to see how subscribed Kafka topics and their Kafka partitions map to partitions in the RDD that is 9 | // emitted by the Spark stream. The code is a little convoluted because of its contradictory goals: 10 | // 1) avoid collecting the RDD tot he driver node (thus having to serialize ConsumerRecord 11 | // 2) print the partition information sequentially to keep the output from being jumbled 12 | // 13 | // It may be fun to rewrite it so that a data structure containing this infrastructure is produced in parallel 14 | // and then collected and printed sequentially. 15 | // 16 | object PartitionMapAnalyzer { 17 | 18 | def analyze[K,V](r: RDD[ConsumerRecord[K,V]], 19 | dumpRecords: Boolean = false) : Unit = 20 | { 21 | if (r.count() > 0) { 22 | println("*** " + r.getNumPartitions + " partitions") 23 | 24 | val partitions = r.glom().zipWithIndex() 25 | 26 | // this loop will be sequential; each iteration analyzes one partition 27 | (0l to partitions.count() - 1).foreach(n => analyzeOnePartition(partitions, n, dumpRecords)) 28 | 29 | } else { 30 | println("*** RDD is empty") 31 | } 32 | } 33 | 34 | private def analyzeOnePartition[K,V](partitions: RDD[(Array[ConsumerRecord[K, V]], Long)], 35 | which: Long, 36 | dumpRecords: Boolean) : Unit = 37 | { 38 | partitions.foreach({ 39 | case (data: Array[ConsumerRecord[K, V]], index: Long) => { 40 | if (index == which) { 41 | println(s"*** partition $index has ${data.length} records") 42 | data.groupBy(cr => (cr.topic(), cr.partition())).foreach({ 43 | case (k: (String, Int), v: Array[ConsumerRecord[K, V]]) => 44 | println(s"*** rdd partition = $index, topic = ${k._1}, topic partition = ${k._2}, record count = ${v.length}.") 45 | }) 46 | if (dumpRecords) data.foreach(cr => println(s"RDD partition $index record $cr")) 47 | } 48 | } 49 | }) 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/scala/ExceptionPropagation.scala: -------------------------------------------------------------------------------- 1 | import java.util.{Arrays, Properties} 2 | 3 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 4 | import org.apache.spark.streaming.{Seconds, StreamingContext} 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import util.{EmbeddedKafkaServer, SimpleKafkaClient} 7 | 8 | /** 9 | * This example demonstrates that exceptions encountered in stream processing are 10 | * rethrown from the call to awaitTermination(). 11 | * See https://issues.apache.org/jira/browse/SPARK-17397 . 12 | * Notice this example doesn't even publish any data: the exception is thrown when an empty RDD is received. 
13 | */ 14 | object ExceptionPropagation { 15 | 16 | case class SomeException(s: String) extends Exception(s) 17 | 18 | def main (args: Array[String]) { 19 | 20 | val topic = "foo" 21 | 22 | val kafkaServer = new EmbeddedKafkaServer() 23 | kafkaServer.start() 24 | kafkaServer.createTopic(topic, 4) 25 | 26 | val client = new SimpleKafkaClient(kafkaServer) 27 | 28 | 29 | val conf = new SparkConf().setAppName("ExceptionPropagation").setMaster("local[4]") 30 | val sc = new SparkContext(conf) 31 | 32 | // streams will produce data every second 33 | val ssc = new StreamingContext(sc, Seconds(1)) 34 | 35 | val props: Properties = SimpleKafkaClient.getBasicStringStringConsumer(kafkaServer) 36 | 37 | val kafkaStream = 38 | KafkaUtils.createDirectStream( 39 | ssc, 40 | LocationStrategies.PreferConsistent, 41 | ConsumerStrategies.Subscribe[String, String]( 42 | Arrays.asList(topic), 43 | props.asInstanceOf[java.util.Map[String, Object]] 44 | ) 45 | 46 | ) 47 | 48 | // now, whenever this Kafka stream produces data the resulting RDD will be printed 49 | kafkaStream.foreachRDD(r => { 50 | println("*** got an RDD, size = " + r.count()) 51 | // throw the custom exception here and see it get caught in the code below 52 | throw SomeException("error while processing RDD"); 53 | }) 54 | 55 | ssc.start() 56 | 57 | try { 58 | ssc.awaitTermination() 59 | println("*** streaming terminated") 60 | } catch { 61 | case e: Exception => { 62 | println("*** streaming exception caught in monitor thread") 63 | ssc.stop() // stop it now since we're not blocked 64 | } 65 | } 66 | 67 | // stop Spark 68 | sc.stop() 69 | 70 | // stop Kafka 71 | kafkaServer.stop() 72 | 73 | println("*** done") 74 | } 75 | } -------------------------------------------------------------------------------- /src/main/scala/structured/SimpleAggregation.scala: -------------------------------------------------------------------------------- 1 | package structured 2 | 3 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 4 | import org.apache.spark.sql.SparkSession 5 | import util.{EmbeddedKafkaServer, SimpleKafkaClient} 6 | 7 | /** 8 | * A streaming DataFrame is created from a single Kafka topic, an aggregating query is set up to count 9 | * occurrences of each key, and the results are streamed to a console. Each batch results in the entire 10 | * aggregation result to date being output. 
11 | */ 12 | object SimpleAggregation { 13 | 14 | def main (args: Array[String]) { 15 | 16 | val topic = "foo" 17 | 18 | println("*** starting Kafka server") 19 | val kafkaServer = new EmbeddedKafkaServer() 20 | kafkaServer.start() 21 | kafkaServer.createTopic(topic, 4) 22 | 23 | Thread.sleep(5000) 24 | 25 | // publish some messages 26 | println("*** Publishing messages") 27 | val max = 1000 28 | val client = new SimpleKafkaClient(kafkaServer) 29 | val numbers = 1 to max 30 | val producer = new KafkaProducer[String, String](client.basicStringStringProducer) 31 | numbers.foreach { n => 32 | producer.send(new ProducerRecord(topic, "key_" + (n % 4), "string_" + n)) 33 | } 34 | Thread.sleep(5000) 35 | 36 | println("*** Starting to stream") 37 | 38 | val spark = SparkSession 39 | .builder 40 | .appName("Structured_Simple") 41 | .config("spark.master", "local[4]") 42 | .getOrCreate() 43 | 44 | import spark.implicits._ 45 | 46 | val ds1 = spark 47 | .readStream 48 | .format("kafka") 49 | .option("kafka.bootstrap.servers", kafkaServer.getKafkaConnect) 50 | .option("subscribe", topic) 51 | .option("startingOffsets", "earliest") // equivalent of auto.offset.reset which is not allowed here 52 | .load() 53 | 54 | val counts = ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") 55 | .groupBy("key") 56 | .count() 57 | 58 | val query = counts.writeStream 59 | .outputMode("complete") 60 | .format("console") 61 | .start() 62 | 63 | println("*** done setting up streaming") 64 | 65 | Thread.sleep(5000) 66 | 67 | println("*** publishing more messages") 68 | numbers.foreach { n => 69 | producer.send(new ProducerRecord(topic, "key_" + (n % 4), "string_" + n)) 70 | } 71 | 72 | Thread.sleep(5000) 73 | 74 | println("*** Stopping stream") 75 | query.stop() 76 | 77 | query.awaitTermination() 78 | spark.stop() 79 | 80 | println("*** Streaming terminated") 81 | 82 | // stop Kafka 83 | println("*** Stopping Kafka") 84 | kafkaServer.stop() 85 | 86 | println("*** done") 87 | } 88 | } -------------------------------------------------------------------------------- /src/main/scala/structured/Simple.scala: -------------------------------------------------------------------------------- 1 | package structured 2 | 3 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 4 | import org.apache.spark.sql.SparkSession 5 | import util.{EmbeddedKafkaServer, SimpleKafkaClient} 6 | 7 | /** 8 | * A very simple example of structured streaming from a Kafka source, where the messages 9 | * are produced directly via calls to a KafkaProducer. A streaming DataFrame is created from a 10 | * single Kafka topic, and feeds all the data received to a streaming computation that outputs it to a console. 11 | * 12 | * Note that writing all the incremental data in each batch to output only makes sense because there is no 13 | * aggregation performed. In subsequent examples with aggregation this will not be possible. 
14 | */ 15 | object Simple { 16 | 17 | def main (args: Array[String]) { 18 | 19 | val topic = "foo" 20 | 21 | println("*** starting Kafka server") 22 | val kafkaServer = new EmbeddedKafkaServer() 23 | kafkaServer.start() 24 | kafkaServer.createTopic(topic, 4) 25 | 26 | Thread.sleep(5000) 27 | 28 | // publish some messages 29 | println("*** Publishing messages") 30 | val max = 5 31 | val client = new SimpleKafkaClient(kafkaServer) 32 | val numbers = 1 to max 33 | val producer = new KafkaProducer[String, String](client.basicStringStringProducer) 34 | numbers.foreach { n => 35 | producer.send(new ProducerRecord(topic, "[1]key_" + n, "[1]string_" + n)) 36 | } 37 | Thread.sleep(5000) 38 | 39 | println("*** Starting to stream") 40 | 41 | val spark = SparkSession 42 | .builder 43 | .appName("Structured_Simple") 44 | .config("spark.master", "local[4]") 45 | .getOrCreate() 46 | 47 | val ds1 = spark 48 | .readStream 49 | .format("kafka") 50 | .option("kafka.bootstrap.servers", kafkaServer.getKafkaConnect) 51 | .option("subscribe", topic) 52 | .option("startingOffsets", "earliest") // equivalent of auto.offset.reset which is not allowed here 53 | .load() 54 | 55 | val counts = ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") 56 | 57 | val query = counts.writeStream 58 | .format("console") // write all counts to console when updated 59 | .start() 60 | 61 | println("*** done setting up streaming") 62 | 63 | Thread.sleep(5000) 64 | 65 | println("*** publishing more messages") 66 | numbers.foreach { n => 67 | producer.send(new ProducerRecord(topic, "[2]key_" + n, "[2]string_" + n)) 68 | } 69 | 70 | Thread.sleep(5000) 71 | 72 | println("*** Stopping stream") 73 | query.stop() 74 | 75 | query.awaitTermination() 76 | spark.stop() 77 | 78 | println("*** Streaming terminated") 79 | 80 | // stop Kafka 81 | println("*** Stopping Kafka") 82 | kafkaServer.stop() 83 | 84 | println("*** done") 85 | } 86 | } -------------------------------------------------------------------------------- /src/main/scala/structured/README.md: -------------------------------------------------------------------------------- 1 | # Structured Streaming 2 | 3 | Structured Streaming (an Alpha feature in Spark 2.1, but a mainstream feature in Spark 2.2) has its own 4 | [programming guide](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) 5 | in the official documentation. There's also a [Kafka integration guide](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html). 6 | 7 | ## Basic Examples 8 | 9 | 10 | 11 | 12 | 13 | 20 | 21 | 22 | 23 | 28 | 29 | 30 | 31 | 37 | 38 | 39 | 40 | 41 | 57 | 58 | 59 | 60 |
<table>

<tr><th>File</th><th>What's Illustrated</th></tr>

<tr>
<td>Simple.scala</td>
<td>
<p>A very simple example of structured streaming from a Kafka source, where the messages
are produced directly via calls to a KafkaProducer. A streaming DataFrame is created from a
single Kafka topic, and feeds all the data received to a streaming computation that outputs it to a console.</p>
<p>Note that writing all the incremental data in each batch to output only makes sense because there is no
aggregation performed. In subsequent examples with aggregation this will not be possible.</p>
</td>
</tr>

<tr>
<td>SimpleAggregation.scala</td>
<td>A streaming DataFrame is created from a single Kafka topic, an aggregating query is set up to count
occurrences of each key, and the results are streamed to a console. Each batch results in the entire
aggregation result to date being output.</td>
</tr>

<tr>
<td>SubscribeAndPublish.scala</td>
<td>Two Kafka topics are set up and a KafkaProducer is used to publish to the first topic.
Then structured streaming is used to subscribe to that topic and publish a running aggregation to the
second topic. Finally structured streaming is used to subscribe to the second topic and print the data received.</td>
</tr>

<tr>
<td>Foreach.scala</td>
<td>
<p>The 'foreach' operation allows arbitrary computations on the output data in a way that is both
partition-aware (computed on the executors and aware of which partition is being processed) and batch-aware
(via a separate invocation for each partition/batch combination).</p>
<p>It is always used by passing the operation an object that implements the 'ForeachWriter' interface. In this
example, the object doesn't do any "useful" work: instead it is set up to illustrate its slightly arcane state
management by printing its arguments and state in each of the three overridden methods.</p>
<p>Each instance of ForeachWriter is used for processing a sequence of partition/batch combinations, but at any point
in time it is set up (via a single open() call) to process one partition/batch combination. Then it gets multiple
process() calls, providing the actual data for that partition and batch, and then a single close() call to
signal that the partition/batch combination has been completely processed. (A minimal sketch of such a writer
is shown below the table.)</p>
</td>
</tr>

</table>
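Since the body of Foreach.scala is not reproduced in this listing, the sketch below is only an
illustration (hypothetical class name, written against the Spark 2.x `ForeachWriter` API) of the kind
of do-nothing writer described above: it merely prints each lifecycle call it receives.

```scala
import org.apache.spark.sql.{ForeachWriter, Row}

// Illustrative only: a writer that does no useful work, just traces its lifecycle.
class LoggingForeachWriter extends ForeachWriter[Row] {

  // Called once per partition/batch combination; returning true accepts the work.
  override def open(partitionId: Long, version: Long): Boolean = {
    println(s"open: partition=$partitionId, version=$version")
    true
  }

  // Called once for every record in the accepted partition/batch combination.
  override def process(record: Row): Unit = println(s"process: $record")

  // Called once when that partition/batch combination is finished (errorOrNull is null on success).
  override def close(errorOrNull: Throwable): Unit = println(s"close: $errorOrNull")
}

// Typical wiring: streamingDF.writeStream.foreach(new LoggingForeachWriter).start()
```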
61 | 62 | -------------------------------------------------------------------------------- /src/main/scala/util/SimpleKafkaClient.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import java.util 4 | import java.util.Properties 5 | 6 | import scala.collection.JavaConversions._ 7 | import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecords, KafkaConsumer} 8 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord} 9 | import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer} 10 | 11 | 12 | /** 13 | * Simple utilities for connecting directly to Kafka. 14 | */ 15 | class SimpleKafkaClient(server: EmbeddedKafkaServer) { 16 | 17 | def send(topic: String, pairs: Seq[(String, String)]) : Unit = { 18 | val producer = new KafkaProducer[String, String](basicStringStringProducer) 19 | pairs.foreach(pair => { 20 | producer send(new ProducerRecord(topic, pair._1, pair._2)) 21 | }) 22 | producer.close() 23 | } 24 | 25 | /** 26 | * Read and print the specified number of records from the specified topic. 27 | * Poll for as long as necessary. 28 | * @param topic 29 | * @param max 30 | */ 31 | def consumeAndPrint(topic: String, max: Int): Unit = { 32 | // configure a consumer 33 | 34 | 35 | val consumer = new KafkaConsumer[String, String](basicStringStringConsumer); 36 | 37 | // need to subscribe to the topic 38 | 39 | consumer.subscribe(util.Arrays.asList(topic)) 40 | 41 | // and read the records back -- just keep polling until we have read 42 | // all of them (poll each 100 msec) as the Kafka server may not make 43 | // them available immediately 44 | 45 | var count = 0; 46 | 47 | while (count < max) { 48 | println("*** Polling ") 49 | 50 | val records: ConsumerRecords[String, String] = 51 | consumer.poll(100) 52 | println(s"*** received ${records.count} messages") 53 | count = count + records.count 54 | 55 | // must specify the topic as we could have subscribed to more than one 56 | records.records(topic).foreach(rec => { 57 | println("*** [ " + rec.partition() + " ] " + rec.key() + ":" + rec.value()) 58 | }) 59 | } 60 | 61 | println("*** got the expected number of messages") 62 | 63 | consumer.close() 64 | } 65 | 66 | def basicStringStringProducer : Properties = { 67 | val config: Properties = new Properties 68 | config.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getCanonicalName) 69 | config.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[StringSerializer].getCanonicalName) 70 | config.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, server.getKafkaConnect) 71 | //config.put(ProducerConfig.PARTITIONER_CLASS_CONFIG, "org.apache.kafka.clients.producer.internals.DefaultPartitioner") 72 | config 73 | } 74 | 75 | def basicStringStringConsumer : Properties = { 76 | SimpleKafkaClient.getBasicStringStringConsumer(server) 77 | } 78 | } 79 | 80 | object SimpleKafkaClient { 81 | 82 | def getBasicStringStringConsumer(server: EmbeddedKafkaServer, group:String = "MyGroup") : Properties = { 83 | val consumerConfig: Properties = new Properties 84 | consumerConfig.put(ConsumerConfig.GROUP_ID_CONFIG, group) 85 | consumerConfig.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, classOf[StringDeserializer].getCanonicalName) 86 | consumerConfig.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, classOf[StringDeserializer].getCanonicalName) 87 | consumerConfig.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, server.getKafkaConnect) 88 | 
consumerConfig.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest") 89 | 90 | //consumerConfig.put(ConsumerConfig.PARTITION_ASSIGNMENT_STRATEGY, "roundrobin") 91 | 92 | consumerConfig 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /src/main/scala/structured/SubscribeAndPublish.scala: -------------------------------------------------------------------------------- 1 | package structured 2 | 3 | import java.io.File 4 | 5 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 6 | import org.apache.spark.sql.SparkSession 7 | import org.apache.spark.sql.streaming.{ProcessingTime, Trigger} 8 | import util.{TemporaryDirectories, EmbeddedKafkaServer, SimpleKafkaClient} 9 | 10 | /** 11 | * Two Kafka topics are set up and a KafkaProducer is used to publish to the first topic. 12 | * Then structured streaming is used to subscribe to that topic and publish a running aggregation to the 13 | * second topic. Finally structured streaming is used to subscribe to the second topic and print the data received. 14 | */ 15 | object SubscribeAndPublish { 16 | 17 | def main (args: Array[String]) { 18 | 19 | val topic1 = "foo" 20 | val topic2 = "bar" 21 | 22 | println("*** starting Kafka server") 23 | val kafkaServer = new EmbeddedKafkaServer() 24 | kafkaServer.start() 25 | kafkaServer.createTopic(topic1, 4) 26 | kafkaServer.createTopic(topic2, 4) 27 | 28 | Thread.sleep(5000) 29 | 30 | // publish some messages 31 | println("*** Publishing messages") 32 | val max = 1000 33 | val client = new SimpleKafkaClient(kafkaServer) 34 | val numbers = 1 to max 35 | val producer = new KafkaProducer[String, String](client.basicStringStringProducer) 36 | numbers.foreach { n => 37 | producer.send(new ProducerRecord(topic1, "key_" + n, "string_" + n)) 38 | } 39 | Thread.sleep(5000) 40 | 41 | val checkpointPath = kafkaServer.tempDirs.checkpointPath 42 | 43 | println("*** Starting to stream") 44 | 45 | val spark = SparkSession 46 | .builder 47 | .appName("Structured_SubscribeAndPublish") 48 | .config("spark.master", "local[4]") 49 | .getOrCreate() 50 | 51 | import spark.implicits._ 52 | 53 | val ds1 = spark 54 | .readStream 55 | .format("kafka") 56 | .option("kafka.bootstrap.servers", kafkaServer.getKafkaConnect) 57 | .option("subscribe", topic1) 58 | .option("startingOffsets", "earliest") 59 | .load() 60 | 61 | val counts = ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") 62 | .groupBy() 63 | .count() 64 | 65 | val publishQuery = 66 | counts 67 | .selectExpr("'RunningCount' AS key", "CAST(count AS STRING) AS value") 68 | .writeStream 69 | .outputMode("complete") 70 | .format("kafka") 71 | .option("checkpointLocation", checkpointPath) 72 | .option("kafka.bootstrap.servers", kafkaServer.getKafkaConnect) 73 | .option("topic", topic2) 74 | .start() 75 | 76 | val ds2 = spark 77 | .readStream 78 | .format("kafka") 79 | .option("kafka.bootstrap.servers", kafkaServer.getKafkaConnect) 80 | .option("subscribe", topic2) 81 | .option("startingOffsets", "earliest") 82 | .load() 83 | 84 | val counts2 = ds2.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") 85 | .as[(String, String)] 86 | 87 | val query = counts2 88 | .writeStream 89 | .trigger(Trigger.ProcessingTime("4 seconds")) 90 | .format("console") 91 | .start() 92 | 93 | println("*** done setting up streaming") 94 | 95 | Thread.sleep(2000) 96 | 97 | println("*** publishing more messages") 98 | numbers.foreach { n => 99 | producer.send(new ProducerRecord(topic1, "key_" + n, "string_" + 
n)) 100 | } 101 | 102 | Thread.sleep(8000) 103 | 104 | println("*** Stopping stream") 105 | query.stop() 106 | 107 | query.awaitTermination() 108 | spark.stop() 109 | 110 | println("*** Streaming terminated") 111 | 112 | // stop Kafka 113 | println("*** Stopping Kafka") 114 | kafkaServer.stop() 115 | 116 | println("*** done") 117 | } 118 | } -------------------------------------------------------------------------------- /src/main/scala/util/EmbeddedKafkaServer.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import java.io.IOException 4 | import scala.collection.JavaConversions._ 5 | 6 | import com.typesafe.scalalogging.Logger 7 | 8 | import kafka.admin.TopicCommand 9 | import kafka.server.{KafkaServerStartable, KafkaConfig} 10 | import kafka.utils.ZkUtils 11 | 12 | import org.apache.kafka.common.security.JaasUtils 13 | 14 | /** 15 | * A single embedded Kafka server and its associated Zookeeper 16 | */ 17 | @throws[IOException] 18 | class EmbeddedKafkaServer() { 19 | private val LOGGER = Logger[EmbeddedKafkaServer] 20 | val tempDirs = new TemporaryDirectories 21 | val zkPort = 39001 22 | val kbPort = 39002 23 | val zkSessionTimeout = 20000 24 | val zkConnectionTimeout = 20000 25 | 26 | private var zookeeperHandle: Option[EmbeddedZookeeper] = None 27 | private var kafkaBrokerHandle: Option[KafkaServerStartable] = None 28 | 29 | /** 30 | * Start first the Zookeeper and then the Kafka broker. 31 | */ 32 | def start() { 33 | LOGGER.info(s"starting on [$zkPort $kbPort]") 34 | zookeeperHandle = Some(new EmbeddedZookeeper(zkPort, tempDirs)) 35 | zookeeperHandle.get.start 36 | 37 | val kafkaProps = Map( 38 | "port" -> Integer.toString(kbPort), 39 | "broker.id" -> "1", 40 | "host.name" -> "localhost", 41 | "log.dir" -> tempDirs.kafkaLogDirPath, 42 | "zookeeper.connect" -> ("localhost:" + zkPort)) 43 | 44 | kafkaBrokerHandle = Some(new KafkaServerStartable(new KafkaConfig(kafkaProps))) 45 | kafkaBrokerHandle.get.startup() 46 | } 47 | 48 | /** 49 | * If running, shut down first the Kafka broker and then the Zookeeper 50 | */ 51 | def stop() { 52 | LOGGER.info(s"shutting down broker on $kbPort") 53 | kafkaBrokerHandle match { 54 | case Some(b) => { 55 | b.shutdown() 56 | b.awaitShutdown() 57 | kafkaBrokerHandle = None 58 | } 59 | case None => 60 | } 61 | Thread.sleep(5000) 62 | LOGGER.info(s"shutting down zookeeper on $zkPort") 63 | zookeeperHandle match { 64 | case Some(zk) => { 65 | zk.stop() 66 | zookeeperHandle = None 67 | } 68 | case None => 69 | } 70 | } 71 | 72 | /** 73 | * Create a topic, optionally setting the number of partitions to a non default value and configuring timestamps. 
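 * For example, the streaming examples in this repository typically call
 * {{{ kafkaServer.createTopic("foo", 4) }}}; a call such as
 * {{{ kafkaServer.createTopic("foo", 4, logAppendTime = true) }}} (illustrative) would additionally
 * set message.timestamp.type=LogAppendTime on the topic.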
74 | * @param topic 75 | * @param partitions 76 | * @param logAppendTime 77 | */ 78 | def createTopic(topic: String, partitions: Int = 1, logAppendTime: Boolean = false) : Unit = { 79 | LOGGER.debug(s"Creating [$topic]") 80 | 81 | val arguments = Array[String]( 82 | "--create", 83 | "--topic", 84 | topic 85 | ) ++ ( 86 | if (logAppendTime) { 87 | Array[String]("--config", "message.timestamp.type=LogAppendTime") 88 | } else { 89 | Array[String]() 90 | }) ++ Array[String]( 91 | "--partitions", 92 | "" + partitions, 93 | "--replication-factor", 94 | "1" 95 | ) 96 | 97 | val opts = new TopicCommand.TopicCommandOptions(arguments) 98 | 99 | val zkUtils = ZkUtils.apply(getZkConnect, 100 | zkSessionTimeout, zkConnectionTimeout, 101 | JaasUtils.isZkSecurityEnabled) 102 | 103 | TopicCommand.createTopic(zkUtils, opts) 104 | 105 | LOGGER.debug(s"Finished creating topic [$topic]") 106 | } 107 | 108 | def addPartitions(topic: String, partitions: Int) : Unit = { 109 | LOGGER.debug(s"Adding [$partitions] partitions to [$topic]") 110 | 111 | val arguments = Array[String]( 112 | "--alter", 113 | "--topic", 114 | topic, 115 | "--partitions", 116 | "" + partitions 117 | ) 118 | 119 | val opts = new TopicCommand.TopicCommandOptions(arguments) 120 | 121 | val zkUtils = ZkUtils.apply(getZkConnect, 122 | zkSessionTimeout, zkConnectionTimeout, 123 | JaasUtils.isZkSecurityEnabled) 124 | 125 | TopicCommand.alterTopic(zkUtils, opts) 126 | 127 | LOGGER.debug(s"Finished adding [$partitions] partitions to [$topic]") 128 | } 129 | 130 | def getKafkaConnect: String = "localhost:" + kbPort 131 | 132 | def getZkConnect: String = "localhost:" + zkPort 133 | 134 | 135 | } 136 | -------------------------------------------------------------------------------- /src/main/scala/ControlledPartitioning.scala: -------------------------------------------------------------------------------- 1 | import java.util.{Arrays, Properties} 2 | 3 | import kafka.serializer.StringDecoder 4 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | import org.apache.spark.{SparkConf, SparkContext} 7 | import util.{EmbeddedKafkaServer, SimpleKafkaClient, SparkKafkaSink} 8 | 9 | /** 10 | * Here the topic has six partitions but instead of writing to it using the configured 11 | * partitioner, we assign all records to the same partition explicitly. Although the 12 | * generated RDDs still have the same number of partitions as the topic, only one 13 | * partition has all the data in it. THis is a rather extreme way to use topic partitions, 14 | * but it opens up the whole range of algorithms for selecting the partition when sending. 15 | */ 16 | object ControlledPartitioning { 17 | 18 | /** 19 | * Publish some data to a topic. Encapsulated here to ensure serializability. 
20 | * @param max 21 | * @param sc 22 | * @param topic 23 | * @param config 24 | */ 25 | def send(max: Int, sc: SparkContext, topic: String, config: Properties): Unit = { 26 | 27 | // put some data in an RDD and publish to Kafka 28 | val numbers = 1 to max 29 | val numbersRDD = sc.parallelize(numbers, 5) 30 | 31 | val kafkaSink = sc.broadcast(SparkKafkaSink(config)) 32 | 33 | println("*** producing data") 34 | 35 | // use the overload that explicitly assigns a partition (0) 36 | numbersRDD.foreach { n => 37 | kafkaSink.value.send(topic, 0, "key_" + n, "string_" + n) 38 | } 39 | } 40 | 41 | def main (args: Array[String]) { 42 | 43 | val topic = "foo" 44 | 45 | val kafkaServer = new EmbeddedKafkaServer() 46 | kafkaServer.start() 47 | kafkaServer.createTopic(topic, 6) 48 | 49 | 50 | 51 | val conf = new SparkConf().setAppName("ControlledPartitioning").setMaster("local[7]") 52 | val sc = new SparkContext(conf) 53 | 54 | // streams will produce data every second 55 | val ssc = new StreamingContext(sc, Seconds(1)) 56 | 57 | val max = 1000 58 | 59 | val props: Properties = SimpleKafkaClient.getBasicStringStringConsumer(kafkaServer) 60 | 61 | val kafkaStream = 62 | KafkaUtils.createDirectStream( 63 | ssc, 64 | LocationStrategies.PreferConsistent, 65 | ConsumerStrategies.Subscribe[String, String]( 66 | Arrays.asList(topic), 67 | props.asInstanceOf[java.util.Map[String, Object]] 68 | ) 69 | 70 | ) 71 | 72 | // now, whenever this Kafka stream produces data the resulting RDD will be printed 73 | kafkaStream.foreachRDD(r => { 74 | println("*** got an RDD, size = " + r.count()) 75 | r.foreach(s => println(s)) 76 | if (r.count() > 0) { 77 | // let's see how many partitions the resulting RDD has -- notice that it has nothing 78 | // to do with the number of partitions in the RDD used to publish the data (4), nor 79 | // the number of partitions of the topic (which also happens to be four.) 
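// (Note for this example: the publishing RDD above is created with 5 partitions and the topic with 6.)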
80 | println("*** " + r.getNumPartitions + " partitions") 81 | r.glom().foreach(a => println("*** partition size = " + a.size)) 82 | } 83 | }) 84 | 85 | ssc.start() 86 | 87 | println("*** started termination monitor") 88 | 89 | // streams seem to need some time to get going 90 | Thread.sleep(5000) 91 | 92 | val producerThread = new Thread("Streaming Termination Controller") { 93 | override def run() { 94 | val client = new SimpleKafkaClient(kafkaServer) 95 | 96 | send(max, sc, topic, client.basicStringStringProducer) 97 | Thread.sleep(5000) 98 | println("*** requesting streaming termination") 99 | ssc.stop(stopSparkContext = false, stopGracefully = true) 100 | } 101 | } 102 | producerThread.start() 103 | 104 | try { 105 | ssc.awaitTermination() 106 | println("*** streaming terminated") 107 | } catch { 108 | case e: Exception => { 109 | println("*** streaming exception caught in monitor thread") 110 | } 111 | } 112 | 113 | // stop Spark 114 | sc.stop() 115 | 116 | // stop Kafka 117 | kafkaServer.stop() 118 | 119 | println("*** done") 120 | } 121 | } -------------------------------------------------------------------------------- /src/main/scala/SendWithDifferentPartitioning.scala: -------------------------------------------------------------------------------- 1 | import java.util.{Arrays, Properties} 2 | 3 | import kafka.serializer.StringDecoder 4 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | import org.apache.spark.{SparkConf, SparkContext} 7 | import util.{EmbeddedKafkaServer, SimpleKafkaClient, SparkKafkaSink} 8 | 9 | /** 10 | * This example is very similar to SimpleStreaming, except that the data is sent 11 | * from an RDD with 5 partitions to a Kafka topic with 6 partitions. WThe KafkaStream consuming 12 | * the topic produces RDDs with size partitions. This is because the data is repartitioned when sent, 13 | * as we continue use the KafkaProducer constructor overload that doesn't allow us to specify 14 | * the destination partition. 15 | */ 16 | object SendWithDifferentPartitioning { 17 | 18 | /** 19 | * Publish some data to a topic. Encapsulated here to ensure serializability. 
20 | * @param max 21 | * @param sc 22 | * @param topic 23 | * @param config 24 | */ 25 | def send(max: Int, sc: SparkContext, topic: String, config: Properties): Unit = { 26 | 27 | // put some data in an RDD and publish to Kafka 28 | val numbers = 1 to max 29 | val numbersRDD = sc.parallelize(numbers, 5) 30 | 31 | val kafkaSink = sc.broadcast(SparkKafkaSink(config)) 32 | 33 | println("*** producing data") 34 | 35 | numbersRDD.foreach { n => 36 | // NOTE: 37 | // 1) the keys and values are strings, which is important when receiving them 38 | // 2) We don't specify which Kafka partition to send to, so a hash of the key 39 | // is used to determine this 40 | kafkaSink.value.send(topic, "key_" + n, "string_" + n) 41 | } 42 | } 43 | 44 | def main (args: Array[String]) { 45 | 46 | val topic = "foo" 47 | 48 | val kafkaServer = new EmbeddedKafkaServer() 49 | kafkaServer.start() 50 | kafkaServer.createTopic(topic, 6) 51 | 52 | 53 | 54 | val conf = new SparkConf().setAppName("SendWithDifferentPartitioning").setMaster("local[7]") 55 | val sc = new SparkContext(conf) 56 | 57 | // streams will produce data every second 58 | val ssc = new StreamingContext(sc, Seconds(1)) 59 | 60 | val max = 1000 61 | 62 | val props: Properties = SimpleKafkaClient.getBasicStringStringConsumer(kafkaServer) 63 | 64 | val kafkaStream = 65 | KafkaUtils.createDirectStream( 66 | ssc, 67 | LocationStrategies.PreferConsistent, 68 | ConsumerStrategies.Subscribe[String, String]( 69 | Arrays.asList(topic), 70 | props.asInstanceOf[java.util.Map[String, Object]] 71 | ) 72 | 73 | ) 74 | 75 | // now, whenever this Kafka stream produces data the resulting RDD will be printed 76 | kafkaStream.foreachRDD(r => { 77 | println("*** got an RDD, size = " + r.count()) 78 | r.foreach(s => println(s)) 79 | if (r.count() > 0) { 80 | // let's see how many partitions the resulting RDD has -- notice that it has nothing 81 | // to do with the number of partitions in the RDD used to publish the data (4), nor 82 | // the number of partitions of the topic (which also happens to be four.) 
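// (Note for this example: the sending RDD is created with 5 partitions and the topic with 6.)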
83 | println("*** " + r.getNumPartitions + " partitions") 84 | r.glom().foreach(a => println("*** partition size = " + a.size)) 85 | } 86 | }) 87 | 88 | ssc.start() 89 | 90 | println("*** started termination monitor") 91 | 92 | // streams seem to need some time to get going 93 | Thread.sleep(5000) 94 | 95 | val producerThread = new Thread("Streaming Termination Controller") { 96 | override def run() { 97 | val client = new SimpleKafkaClient(kafkaServer) 98 | 99 | send(max, sc, topic, client.basicStringStringProducer) 100 | Thread.sleep(5000) 101 | println("*** requesting streaming termination") 102 | ssc.stop(stopSparkContext = false, stopGracefully = true) 103 | } 104 | } 105 | producerThread.start() 106 | 107 | try { 108 | ssc.awaitTermination() 109 | println("*** streaming terminated") 110 | } catch { 111 | case e: Exception => { 112 | println("*** streaming exception caught in monitor thread") 113 | } 114 | } 115 | 116 | // stop Spark 117 | sc.stop() 118 | 119 | // stop Kafka 120 | kafkaServer.stop() 121 | 122 | println("*** done") 123 | } 124 | } -------------------------------------------------------------------------------- /src/main/scala/SimpleStreaming.scala: -------------------------------------------------------------------------------- 1 | import java.util.Properties 2 | import java.util.Arrays 3 | 4 | import org.apache.spark.streaming.{Seconds, StreamingContext} 5 | import org.apache.kafka.common.serialization.StringDeserializer 6 | import org.apache.spark.{SparkConf, SparkContext} 7 | import util.{EmbeddedKafkaServer, SimpleKafkaClient, SparkKafkaSink} 8 | import java.util 9 | 10 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 11 | import org.apache.spark.broadcast.Broadcast 12 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 13 | 14 | /** 15 | * The most basic streaming example: starts a Kafka server, creates a topic, creates a stream 16 | * to process that topic, and publishes some data using the SparkKafkaSink. 17 | * 18 | * Notice there's quite a lot of waiting. It takes some time for streaming to get going, 19 | * and data published too early tends to be missed by the stream. (No doubt, this is partly 20 | * because this example uses the simplest method to create the stream, and thus doesn't 21 | * get an opportunity to set auto.offset.reset to "earliest". 22 | * 23 | * Also, data that is published takes some time to propagate to the stream. 24 | * This seems inevitable, and is almost guaranteed to be slower 25 | * in a self-contained example like this. 26 | */ 27 | object SimpleStreaming { 28 | 29 | def main (args: Array[String]) { 30 | 31 | val topic = "foo" 32 | 33 | val kafkaServer = new EmbeddedKafkaServer() 34 | kafkaServer.start() 35 | kafkaServer.createTopic(topic, 4) 36 | 37 | 38 | 39 | val conf = new SparkConf().setAppName("SimpleStreaming").setMaster("local[4]") 40 | val sc = new SparkContext(conf) 41 | 42 | // streams will produce data every second 43 | val ssc = new StreamingContext(sc, Seconds(1)) 44 | 45 | // this many messages 46 | val max = 1000 47 | 48 | // Create the stream. 
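// The properties built below by SimpleKafkaClient.getBasicStringStringConsumer supply the embedded
// broker's address, String key/value deserializers and a consumer group id (see SimpleKafkaClient
// elsewhere in this listing).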
49 | val props: Properties = SimpleKafkaClient.getBasicStringStringConsumer(kafkaServer) 50 | 51 | val kafkaStream = 52 | KafkaUtils.createDirectStream( 53 | ssc, 54 | LocationStrategies.PreferConsistent, 55 | ConsumerStrategies.Subscribe[String, String]( 56 | Arrays.asList(topic), 57 | props.asInstanceOf[java.util.Map[String, Object]] 58 | ) 59 | 60 | ) 61 | 62 | // now, whenever this Kafka stream produces data the resulting RDD will be printed 63 | kafkaStream.foreachRDD(r => { 64 | println("*** got an RDD, size = " + r.count()) 65 | r.foreach(s => println(s)) 66 | if (r.count() > 0) { 67 | // let's see how many partitions the resulting RDD has -- notice that it has nothing 68 | // to do with the number of partitions in the RDD used to publish the data (4), nor 69 | // the number of partitions of the topic (which also happens to be four.) 70 | println("*** " + r.getNumPartitions + " partitions") 71 | r.glom().foreach(a => println("*** partition size = " + a.size)) 72 | } 73 | }) 74 | 75 | ssc.start() 76 | 77 | println("*** started termination monitor") 78 | 79 | // streams seem to need some time to get going 80 | Thread.sleep(5000) 81 | 82 | val producerThread = new Thread("Streaming Termination Controller") { 83 | override def run() { 84 | val client = new SimpleKafkaClient(kafkaServer) 85 | 86 | val numbers = 1 to max 87 | 88 | val producer = new KafkaProducer[String, String](client.basicStringStringProducer) 89 | 90 | numbers.foreach { n => 91 | // NOTE: 92 | // 1) the keys and values are strings, which is important when receiving them 93 | // 2) We don't specify which Kafka partition to send to, so a hash of the key 94 | // is used to determine this 95 | producer.send(new ProducerRecord(topic, "key_" + n, "string_" + n)) 96 | } 97 | Thread.sleep(5000) 98 | println("*** requesting streaming termination") 99 | ssc.stop(stopSparkContext = false, stopGracefully = true) 100 | } 101 | } 102 | producerThread.start() 103 | 104 | try { 105 | ssc.awaitTermination() 106 | println("*** streaming terminated") 107 | } catch { 108 | case e: Exception => { 109 | println("*** streaming exception caught in monitor thread") 110 | } 111 | } 112 | 113 | // stop Spark 114 | sc.stop() 115 | 116 | // stop Kafka 117 | kafkaServer.stop() 118 | 119 | println("*** done") 120 | } 121 | } -------------------------------------------------------------------------------- /src/main/scala/MultipleTopics.scala: -------------------------------------------------------------------------------- 1 | import java.util.{Arrays, Properties} 2 | 3 | import org.apache.kafka.clients.consumer.ConsumerRecord 4 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 5 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 6 | import org.apache.spark.streaming.{Seconds, StreamingContext} 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | import util.{EmbeddedKafkaServer, PartitionMapAnalyzer, SimpleKafkaClient} 9 | 10 | /** 11 | * A single stream subscribing to the two topics receives data from both of them. 12 | * The partitioning behavior here is quite interesting, as the topics have three and six partitions respectively, 13 | * each RDD has nine partitions, and each RDD partition receives data from exactly one partition of one topic. 14 | * 15 | * Partitioning is analyzed using the PartitionMapAnalyzer. 
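 * For each batch it prints lines of the form
 * {{{ *** rdd partition = 3, topic = bar, topic partition = 1, record count = 17. }}}
 * (values illustrative), one per (RDD partition, topic partition) pair, which makes the mapping easy to see.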
16 | */ 17 | object MultipleTopics { 18 | 19 | def main (args: Array[String]) { 20 | 21 | val topic1 = "foo" 22 | val topic2 = "bar" 23 | 24 | // topics are partitioned differently 25 | val kafkaServer = new EmbeddedKafkaServer() 26 | kafkaServer.start() 27 | kafkaServer.createTopic(topic1, 3) 28 | kafkaServer.createTopic(topic2, 6) 29 | 30 | val conf = new SparkConf().setAppName("MultipleTopics").setMaster("local[10]") 31 | val sc = new SparkContext(conf) 32 | 33 | // streams will produce data every second 34 | val ssc = new StreamingContext(sc, Seconds(1)) 35 | 36 | // this many messages 37 | val max = 100 38 | 39 | // Create the stream. 40 | val props: Properties = SimpleKafkaClient.getBasicStringStringConsumer(kafkaServer) 41 | 42 | val kafkaStream = 43 | KafkaUtils.createDirectStream( 44 | ssc, 45 | LocationStrategies.PreferConsistent, 46 | ConsumerStrategies.Subscribe[String, String]( 47 | Arrays.asList(topic1, topic2), 48 | props.asInstanceOf[java.util.Map[String, Object]] 49 | ) 50 | 51 | ) 52 | 53 | // now, whenever this Kafka stream produces data the resulting RDD will be printed 54 | kafkaStream.foreachRDD(r => { 55 | println("*** got an RDD, size = " + r.count()) 56 | 57 | PartitionMapAnalyzer.analyze(r) 58 | 59 | }) 60 | 61 | ssc.start() 62 | 63 | println("*** started streaming context") 64 | 65 | // streams seem to need some time to get going 66 | Thread.sleep(5000) 67 | 68 | val producerThreadTopic1 = new Thread("Producer thread 1") { 69 | override def run() { 70 | val client = new SimpleKafkaClient(kafkaServer) 71 | 72 | val numbers = 1 to max 73 | 74 | val producer = new KafkaProducer[String, String](client.basicStringStringProducer) 75 | 76 | numbers.foreach { n => 77 | // NOTE: 78 | // 1) the keys and values are strings, which is important when receiving them 79 | // 2) We don't specify which Kafka partition to send to, so a hash of the key 80 | // is used to determine this 81 | producer.send(new ProducerRecord(topic1, "key_1_" + n, "string_1_" + n)) 82 | } 83 | 84 | } 85 | } 86 | 87 | val producerThreadTopic2 = new Thread("Producer thread 2; controlling termination") { 88 | override def run() { 89 | val client = new SimpleKafkaClient(kafkaServer) 90 | 91 | val numbers = 1 to max 92 | 93 | val producer = new KafkaProducer[String, String](client.basicStringStringProducer) 94 | 95 | numbers.foreach { n => 96 | // NOTE: 97 | // 1) the keys and values are strings, which is important when receiving them 98 | // 2) We don't specify which Kafka partition to send to, so a hash of the key 99 | // is used to determine this 100 | producer.send(new ProducerRecord(topic2, "key_2_" + n, "string_2_" + n)) 101 | } 102 | Thread.sleep(10000) 103 | println("*** requesting streaming termination") 104 | ssc.stop(stopSparkContext = false, stopGracefully = true) 105 | } 106 | } 107 | 108 | producerThreadTopic1.start() 109 | producerThreadTopic2.start() 110 | 111 | try { 112 | ssc.awaitTermination() 113 | println("*** streaming terminated") 114 | } catch { 115 | case e: Exception => { 116 | println("*** streaming exception caught in monitor thread") 117 | } 118 | } 119 | 120 | // stop Spark 121 | sc.stop() 122 | 123 | // stop Kafka 124 | kafkaServer.stop() 125 | 126 | println("*** done") 127 | } 128 | } -------------------------------------------------------------------------------- /src/main/scala/MultipleStreams.scala: -------------------------------------------------------------------------------- 1 | import java.util.{Arrays, Properties} 2 | 3 | import 
org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 4 | import org.apache.spark.streaming.{Seconds, StreamingContext} 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import util.{EmbeddedKafkaServer, SimpleKafkaClient, SparkKafkaSink} 7 | 8 | /** 9 | * This example creates two streams based on a single consumer group, so they divide up the data. 10 | * There's an interesting partitioning interaction here as the streams each get data from two fo the four 11 | * topic partitions, and each produce RDDs with two partitions each. 12 | */ 13 | 14 | object MultipleStreams { 15 | 16 | /** 17 | * Publish some data to a topic. Encapsulated here to ensure serializability. 18 | * @param max 19 | * @param sc 20 | * @param topic 21 | * @param config 22 | */ 23 | def send(max: Int, sc: SparkContext, topic: String, config: Properties): Unit = { 24 | 25 | // put some data in an RDD and publish to Kafka 26 | val numbers = 1 to max 27 | val numbersRDD = sc.parallelize(numbers, 4) 28 | 29 | val kafkaSink = sc.broadcast(SparkKafkaSink(config)) 30 | 31 | println("*** producing data") 32 | 33 | numbersRDD.foreach { n => 34 | kafkaSink.value.send(topic, "key_" + n, "string_" + n) 35 | } 36 | } 37 | 38 | def main (args: Array[String]) { 39 | 40 | val topic = "foo" 41 | 42 | val kafkaServer = new EmbeddedKafkaServer() 43 | kafkaServer.start() 44 | kafkaServer.createTopic(topic, 4) 45 | 46 | val conf = new SparkConf().setAppName("MultipleStreams").setMaster("local[4]") 47 | val sc = new SparkContext(conf) 48 | 49 | // streams will produce data every second 50 | val ssc = new StreamingContext(sc, Seconds(1)) 51 | 52 | val max = 1000 53 | 54 | 55 | // 56 | // the first stream subscribes to the default consumer group in our SParkKafkaClient class 57 | // 58 | 59 | val props: Properties = SimpleKafkaClient.getBasicStringStringConsumer(kafkaServer) 60 | 61 | val kafkaStream1 = 62 | KafkaUtils.createDirectStream( 63 | ssc, 64 | LocationStrategies.PreferConsistent, 65 | ConsumerStrategies.Subscribe[String, String]( 66 | Arrays.asList(topic), 67 | props.asInstanceOf[java.util.Map[String, Object]] 68 | ) 69 | 70 | ) 71 | 72 | kafkaStream1.foreachRDD(r => { 73 | println("*** [stream 1] got an RDD, size = " + r.count()) 74 | r.foreach(s => println("*** [stream 1] " + s)) 75 | if (r.count() > 0) { 76 | println("*** [stream 1] " + r.getNumPartitions + " partitions") 77 | r.glom().foreach(a => println("*** [stream 1] partition size = " + a.size)) 78 | } 79 | }) 80 | 81 | // 82 | // a second stream, uses the same props and hence the same consumer group 83 | // 84 | 85 | val kafkaStream2 = 86 | KafkaUtils.createDirectStream( 87 | ssc, 88 | LocationStrategies.PreferConsistent, 89 | ConsumerStrategies.Subscribe[String, String]( 90 | Arrays.asList(topic), 91 | props.asInstanceOf[java.util.Map[String, Object]] 92 | ) 93 | 94 | ) 95 | 96 | kafkaStream2.foreachRDD(r => { 97 | println("*** [stream 2] got an RDD, size = " + r.count()) 98 | r.foreach(s => println("*** [stream 2] " + s)) 99 | if (r.count() > 0) { 100 | println("*** [stream 2] " + r.getNumPartitions + " partitions") 101 | r.glom().foreach(a => println("*** [stream 2] partition size = " + a.size)) 102 | } 103 | }) 104 | 105 | ssc.start() 106 | 107 | println("*** started termination monitor") 108 | 109 | // streams seem to need some time to get going 110 | Thread.sleep(5000) 111 | 112 | val producerThread = new Thread("Streaming Termination Controller") { 113 | override def run() { 114 | val client = new 
SimpleKafkaClient(kafkaServer) 115 | 116 | send(max, sc, topic, client.basicStringStringProducer) 117 | Thread.sleep(5000) 118 | println("*** requesting streaming termination") 119 | ssc.stop(stopSparkContext = false, stopGracefully = true) 120 | } 121 | } 122 | producerThread.start() 123 | 124 | try { 125 | ssc.awaitTermination() 126 | println("*** streaming terminated") 127 | } catch { 128 | case e: Exception => { 129 | println("*** streaming exception caught in monitor thread") 130 | } 131 | } 132 | 133 | // stop Spark 134 | sc.stop() 135 | 136 | // stop Kafka 137 | kafkaServer.stop() 138 | 139 | println("*** done") 140 | } 141 | } -------------------------------------------------------------------------------- /src/main/scala/SimpleStreamingFromRDD.scala: -------------------------------------------------------------------------------- 1 | import java.util.{Arrays, Properties} 2 | 3 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 4 | import org.apache.spark.streaming.{Seconds, StreamingContext} 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import util.{EmbeddedKafkaServer, SimpleKafkaClient, SparkKafkaSink} 7 | 8 | /** 9 | * The most basic streaming example: starts a Kafka server, creates a topic, creates a stream 10 | * to process that topic, and publishes some data using the SparkKafkaSink. 11 | * 12 | * Notice there's quite a lot of waiting. It takes some time for streaming to get going, 13 | * and data published too early tends to be missed by the stream. (No doubt, this is partly 14 | * because this example uses the simplest method to create the stream, and thus doesn't 15 | * get an opportunity to set auto.offset.reset to "earliest". 16 | * 17 | * Also, data that is published takes some time to propagate to the stream. 18 | * This seems inevitable, and is almost guaranteed to be slower 19 | * in a self-contained example like this. 20 | */ 21 | object SimpleStreamingFromRDD { 22 | 23 | /** 24 | * Publish some data to a topic. Encapsulated here to ensure serializability. 25 | * @param max 26 | * @param sc 27 | * @param topic 28 | * @param config 29 | */ 30 | def send(max: Int, sc: SparkContext, topic: String, config: Properties): Unit = { 31 | 32 | // put some data in an RDD and publish to Kafka 33 | val numbers = 1 to max 34 | val numbersRDD = sc.parallelize(numbers, 4) 35 | 36 | val kafkaSink = sc.broadcast(SparkKafkaSink(config)) 37 | 38 | println("*** producing data") 39 | 40 | numbersRDD.foreach { n => 41 | // NOTE: 42 | // 1) the keys and values are strings, which is important when receiving them 43 | // 2) We don't specify which Kafka partition to send to, so a hash of the key 44 | // is used to determine this 45 | kafkaSink.value.send(topic, "key_" + n, "string_" + n) 46 | } 47 | } 48 | 49 | def main (args: Array[String]) { 50 | 51 | val topic = "foo" 52 | 53 | val kafkaServer = new EmbeddedKafkaServer() 54 | kafkaServer.start() 55 | kafkaServer.createTopic(topic, 4) 56 | 57 | 58 | 59 | val conf = new SparkConf().setAppName("SimpleStreamingFromRDD").setMaster("local[4]") 60 | val sc = new SparkContext(conf) 61 | 62 | // streams will produce data every second 63 | val ssc = new StreamingContext(sc, Seconds(1)) 64 | 65 | // this many messages 66 | val max = 1000 67 | 68 | // Create the stream. 
69 | val props: Properties = SimpleKafkaClient.getBasicStringStringConsumer(kafkaServer) 70 | 71 | val kafkaStream = 72 | KafkaUtils.createDirectStream( 73 | ssc, 74 | LocationStrategies.PreferConsistent, 75 | ConsumerStrategies.Subscribe[String, String]( 76 | Arrays.asList(topic), 77 | props.asInstanceOf[java.util.Map[String, Object]] 78 | ) 79 | 80 | ) 81 | 82 | // now, whenever this Kafka stream produces data the resulting RDD will be printed 83 | kafkaStream.foreachRDD(r => { 84 | println("*** got an RDD, size = " + r.count()) 85 | r.foreach(s => println(s)) 86 | if (r.count() > 0) { 87 | // let's see how many partitions the resulting RDD has -- notice that it has nothing 88 | // to do with the number of partitions in the RDD used to publish the data (4), nor 89 | // the number of partitions of the topic (which also happens to be four.) 90 | println("*** " + r.getNumPartitions + " partitions") 91 | r.glom().foreach(a => println("*** partition size = " + a.size)) 92 | } 93 | }) 94 | 95 | ssc.start() 96 | 97 | println("*** started termination monitor") 98 | 99 | // streams seem to need some time to get going 100 | Thread.sleep(5000) 101 | 102 | val producerThread = new Thread("Streaming Termination Controller") { 103 | override def run() { 104 | val client = new SimpleKafkaClient(kafkaServer) 105 | 106 | send(max, sc, topic, client.basicStringStringProducer) 107 | 108 | Thread.sleep(5000) 109 | println("*** requesting streaming termination") 110 | ssc.stop(stopSparkContext = false, stopGracefully = true) 111 | } 112 | } 113 | producerThread.start() 114 | 115 | try { 116 | ssc.awaitTermination() 117 | println("*** streaming terminated") 118 | } catch { 119 | case e: Exception => { 120 | println("*** streaming exception caught in monitor thread") 121 | } 122 | } 123 | 124 | // stop Spark 125 | sc.stop() 126 | 127 | // stop Kafka 128 | kafkaServer.stop() 129 | 130 | println("*** done") 131 | } 132 | } -------------------------------------------------------------------------------- /src/main/scala/MultipleConsumerGroups.scala: -------------------------------------------------------------------------------- 1 | import java.util.{Arrays, Properties} 2 | 3 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 4 | import org.apache.spark.streaming.{Seconds, StreamingContext} 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import util.{EmbeddedKafkaServer, SimpleKafkaClient, SparkKafkaSink} 7 | 8 | /** 9 | * This example creates two streams based on two different consumer groups, so both streams 10 | * get a copy of the same data. It's simply a matter of specifying the two names of the 11 | * two different consumer groups in the two calls to createStream() -- no special 12 | * configuration is needed. 13 | */ 14 | 15 | object MultipleConsumerGroups { 16 | 17 | /** 18 | * Publish some data to a topic. Encapsulated here to ensure serializability. 
19 | * @param max 20 | * @param sc 21 | * @param topic 22 | * @param config 23 | */ 24 | def send(max: Int, sc: SparkContext, topic: String, config: Properties): Unit = { 25 | 26 | // put some data in an RDD and publish to Kafka 27 | val numbers = 1 to max 28 | val numbersRDD = sc.parallelize(numbers, 4) 29 | 30 | val kafkaSink = sc.broadcast(SparkKafkaSink(config)) 31 | 32 | println("*** producing data") 33 | 34 | numbersRDD.foreach { n => 35 | kafkaSink.value.send(topic, "key_" + n, "string_" + n) 36 | } 37 | } 38 | 39 | def main (args: Array[String]) { 40 | 41 | val topic = "foo" 42 | 43 | val kafkaServer = new EmbeddedKafkaServer() 44 | kafkaServer.start() 45 | kafkaServer.createTopic(topic, 4) 46 | 47 | val conf = new SparkConf().setAppName("MultipleConsumerGroups").setMaster("local[4]") 48 | val sc = new SparkContext(conf) 49 | 50 | // streams will produce data every second 51 | val ssc = new StreamingContext(sc, Seconds(1)) 52 | 53 | val max = 1000 54 | 55 | 56 | // 57 | // the first stream subscribes to consumer group Group1 58 | // 59 | 60 | val props1: Properties = SimpleKafkaClient.getBasicStringStringConsumer(kafkaServer, "Group1") 61 | 62 | val kafkaStream1 = 63 | KafkaUtils.createDirectStream( 64 | ssc, 65 | LocationStrategies.PreferConsistent, 66 | ConsumerStrategies.Subscribe[String, String]( 67 | Arrays.asList(topic), 68 | props1.asInstanceOf[java.util.Map[String, Object]] 69 | ) 70 | 71 | ) 72 | 73 | kafkaStream1.foreachRDD(r => { 74 | println("*** [stream 1] got an RDD, size = " + r.count()) 75 | r.foreach(s => println("*** [stream 1] " + s)) 76 | if (r.count() > 0) { 77 | println("*** [stream 1] " + r.getNumPartitions + " partitions") 78 | r.glom().foreach(a => println("*** [stream 1] partition size = " + a.size)) 79 | } 80 | }) 81 | 82 | // 83 | // a second stream, subscribing to the second consumer group (Group2), will 84 | // see all of the same data 85 | // 86 | 87 | val props2: Properties = SimpleKafkaClient.getBasicStringStringConsumer(kafkaServer, "Group2") 88 | 89 | val kafkaStream2 = 90 | KafkaUtils.createDirectStream( 91 | ssc, 92 | LocationStrategies.PreferConsistent, 93 | ConsumerStrategies.Subscribe[String, String]( 94 | Arrays.asList(topic), 95 | props2.asInstanceOf[java.util.Map[String, Object]] 96 | ) 97 | 98 | ) 99 | 100 | kafkaStream2.foreachRDD(r => { 101 | println("*** [stream 2] got an RDD, size = " + r.count()) 102 | r.foreach(s => println("*** [stream 2] " + s)) 103 | if (r.count() > 0) { 104 | println("*** [stream 2] " + r.getNumPartitions + " partitions") 105 | r.glom().foreach(a => println("*** [stream 2] partition size = " + a.size)) 106 | } 107 | }) 108 | 109 | ssc.start() 110 | 111 | println("*** started termination monitor") 112 | 113 | // streams seem to need some time to get going 114 | Thread.sleep(5000) 115 | 116 | val producerThread = new Thread("Streaming Termination Controller") { 117 | override def run() { 118 | val client = new SimpleKafkaClient(kafkaServer) 119 | 120 | send(max, sc, topic, client.basicStringStringProducer) 121 | Thread.sleep(5000) 122 | println("*** requesting streaming termination") 123 | ssc.stop(stopSparkContext = false, stopGracefully = true) 124 | } 125 | } 126 | producerThread.start() 127 | 128 | try { 129 | ssc.awaitTermination() 130 | println("*** streaming terminated") 131 | } catch { 132 | case e: Exception => { 133 | println("*** streaming exception caught in monitor thread") 134 | } 135 | } 136 | 137 | // stop Spark 138 | sc.stop() 139 | 140 | // stop Kafka 141 | kafkaServer.stop() 142 | 143 | 
println("*** done") 144 | } 145 | } -------------------------------------------------------------------------------- /src/main/scala/Timestamp.scala: -------------------------------------------------------------------------------- 1 | import java.util.{Arrays, Calendar, Properties, TimeZone} 2 | 3 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | import org.apache.spark.streaming.{Seconds, StreamingContext} 6 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 7 | import util.{EmbeddedKafkaServer, SimpleKafkaClient} 8 | 9 | /** 10 | * Record timestamps were introduced into Kafka 0.10 as described in 11 | * https://cwiki.apache.org/confluence/display/KAFKA/KIP-32+-+Add+timestamps+to+Kafka+message 12 | * and 13 | * https://cwiki.apache.org/confluence/display/KAFKA/KIP-33+-+Add+a+time+based+log+index . 14 | * 15 | * This example sets up two different topics that handle timestamps differently -- topic A has the timestamp 16 | * set by the broker when it receives the record, while topic B passes through the timestamp provided in the record 17 | * (either programmatically when the record was created, as shown here, or otherwise automatically by the producer.) 18 | * 19 | * Since the record carries information about where its timestamp originates, its easy to subscribe to the two topics 20 | * to create a single stream, and then examine the timestamp of every received record and its type. 21 | */ 22 | object Timestamp { 23 | def main (args: Array[String]) { 24 | 25 | val topicLogAppendTime = "A" 26 | val topicCreateTime = "B" 27 | 28 | val kafkaServer = new EmbeddedKafkaServer() 29 | kafkaServer.start() 30 | kafkaServer.createTopic(topicLogAppendTime, 4, logAppendTime = true) 31 | kafkaServer.createTopic(topicCreateTime, 4) 32 | 33 | val conf = new SparkConf().setAppName("Timestamp").setMaster("local[4]") 34 | val sc = new SparkContext(conf) 35 | 36 | // streams will produce data every second 37 | val ssc = new StreamingContext(sc, Seconds(1)) 38 | 39 | // this many messages 40 | val max = 1000 41 | 42 | // Create the stream. 
43 | val props: Properties = SimpleKafkaClient.getBasicStringStringConsumer(kafkaServer) 44 | 45 | val kafkaStream = 46 | KafkaUtils.createDirectStream( 47 | ssc, 48 | LocationStrategies.PreferConsistent, 49 | ConsumerStrategies.Subscribe[String, String]( 50 | Arrays.asList(topicLogAppendTime, topicCreateTime), 51 | props.asInstanceOf[java.util.Map[String, Object]] 52 | ) 53 | 54 | ) 55 | 56 | val timeFormat = new java.text.SimpleDateFormat("HH:mm:ss.SSS") 57 | 58 | // now, whenever this Kafka stream produces data the resulting RDD will be printed 59 | kafkaStream.foreachRDD(r => { 60 | println("*** got an RDD, size = " + r.count()) 61 | r.foreach(cr => { 62 | 63 | val time = timeFormat.format(cr.timestamp()) 64 | println("Topic [" + cr.topic() + "] Key [" + cr.key + "] Type [" + cr.timestampType().toString + 65 | "] Timestamp [" + time + "]") 66 | }) 67 | }) 68 | 69 | ssc.start() 70 | 71 | println("*** started termination monitor") 72 | 73 | // streams seem to need some time to get going 74 | Thread.sleep(5000) 75 | 76 | val producerThread = new Thread("Streaming Termination Controller") { 77 | override def run() { 78 | val client = new SimpleKafkaClient(kafkaServer) 79 | 80 | val producer = new KafkaProducer[String, String](client.basicStringStringProducer) 81 | 82 | // the two records are created at almost the same time, so should have similar creation time stamps 83 | // if we didn't provide one, the producer would so so, but then we wouldn't know what it was ... 84 | 85 | val timestamp = Calendar.getInstance().getTime().getTime 86 | 87 | println("Record creation time: " + timeFormat.format(timestamp)) 88 | 89 | val record1 = new ProducerRecord(topicLogAppendTime, 1, timestamp, "key1", "value1") 90 | val record2 = new ProducerRecord(topicCreateTime, 1, timestamp, "key2", "value2") 91 | 92 | Thread.sleep(2000) 93 | 94 | // the two records are sent to the Kafka broker two seconds after they are created, and three seconds apart 95 | 96 | producer.send(record1) 97 | Thread.sleep(3000) 98 | producer.send(record2) 99 | 100 | Thread.sleep(5000) 101 | println("*** requesting streaming termination") 102 | ssc.stop(stopSparkContext = false, stopGracefully = true) 103 | } 104 | } 105 | producerThread.start() 106 | 107 | try { 108 | ssc.awaitTermination() 109 | println("*** streaming terminated") 110 | } catch { 111 | case e: Exception => { 112 | println("*** streaming exception caught in monitor thread") 113 | } 114 | } 115 | 116 | // stop Spark 117 | sc.stop() 118 | 119 | // stop Kafka 120 | kafkaServer.stop() 121 | 122 | println("*** done") 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/main/scala/structured/Foreach.scala: -------------------------------------------------------------------------------- 1 | package structured 2 | 3 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} 4 | import org.apache.spark.sql.{ForeachWriter, Row, SparkSession} 5 | import util.{EmbeddedKafkaServer, SimpleKafkaClient} 6 | 7 | /** 8 | * The 'foreach' operation allows arbitrary computations on the output data in way that is both 9 | * partition-aware (computed on the executors and aware of which partition is being processed) and batch-aware 10 | * (via a separate invocation for each partition/batch combination.) 11 | * 12 | * It is always used by passing the operation an object that implements the 'ForeachWriter' interface. 
In this 13 | * example, the object doesn't do any "useful" work: instead it is set up to illustrate its slightly arcane state 14 | * management by printing its arguments and state in each of the three overridden methods. 15 | * 16 | * Each instance of ForeachWriter is used for processing a sequence of partition/batch combinations, but at any point 17 | * in time is is setup (via a single open() call) to process one partition/batch combination. Then it gets multiple 18 | * process() calls, providing the the actual data for that partition and batch, and then a single close() call to 19 | * signal that the partition/batch combination has been completely processed. 20 | */ 21 | object Foreach { 22 | 23 | def main (args: Array[String]) { 24 | 25 | val topic = "foo" 26 | 27 | println("*** starting Kafka server") 28 | val kafkaServer = new EmbeddedKafkaServer() 29 | kafkaServer.start() 30 | kafkaServer.createTopic(topic, 4) 31 | 32 | Thread.sleep(5000) 33 | 34 | // publish some messages 35 | println("*** Publishing messages") 36 | val messageCount = 16 37 | val client = new SimpleKafkaClient(kafkaServer) 38 | val numbers = 1 to messageCount 39 | val producer = new KafkaProducer[String, String](client.basicStringStringProducer) 40 | numbers.foreach { n => 41 | producer.send(new ProducerRecord(topic, "[1]key_" + n, "[1]string_" + n)) 42 | } 43 | Thread.sleep(5000) 44 | 45 | println("*** Starting to stream") 46 | 47 | val spark = SparkSession 48 | .builder 49 | .appName("Structured_Foreach") 50 | .config("spark.master", "local[4]") 51 | .getOrCreate() 52 | 53 | val ds1 = spark 54 | .readStream 55 | .format("kafka") 56 | .option("kafka.bootstrap.servers", kafkaServer.getKafkaConnect) 57 | .option("subscribe", topic) 58 | .option("startingOffsets", "earliest") // equivalent of auto.offset.reset which is not allowed here 59 | .load() 60 | 61 | val counts = ds1.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") 62 | 63 | // process the stream using a custom ForeachWriter that simply prints the data and the state of the ForeachWriter 64 | // in order to illustrate how it works 65 | val query = counts.writeStream 66 | .foreach(new ForeachWriter[Row] { 67 | 68 | // Notice the initialization here is very simple, as it gets called on the driver, but never called 69 | // again on the executor. Any initialization that needs to be called repeatedly on the executor 70 | // needs to go in the open() method. 71 | 72 | // By using an Option, initializing with None and replacing with None in the close() method, we verify that 73 | // process() is only ever called between a matched pair of open() and close() calls. 74 | var myPartition: Option[Long] = None 75 | var myVersion: Option[Long] = None 76 | 77 | /** 78 | * Apart from printing the partition and version, we only accept batches from even numbered partitions. 79 | */ 80 | override def open(partitionId: Long, version: Long): Boolean = { 81 | myPartition = Some(partitionId) 82 | myVersion = Some(version) 83 | println(s"*** ForEachWriter: open partition=[$partitionId] version=[$version]") 84 | val processThisOne = partitionId % 2 == 0 85 | // We only accept this partition/batch combination if we return true -- in this case we'll only do so for 86 | // even numbered partitions. This decision could have been based on the version ID as well. 87 | processThisOne 88 | } 89 | 90 | /** 91 | * Since we've saved the partition and batch IDs, we can see which combination each record comes from. 
92 | * Notice we only get records from even numbered partitions, since we rejected the odd numbered 93 | * ones in the open() method by returning false. 94 | */ 95 | override def process(record: Row) : Unit = { 96 | println(s"*** ForEachWriter: process partition=[$myPartition] version=[$myVersion] record=$record") 97 | } 98 | 99 | /** 100 | * Again we've saved the partition and batch IDs, so we can see which combination is being closed. 101 | * We'll leave error handling for a more advanced example. 102 | */ 103 | override def close(errorOrNull: Throwable): Unit = { 104 | println(s"*** ForEachWriter: close partition=[$myPartition] version=[$myVersion]") 105 | myPartition = None 106 | myVersion = None 107 | } 108 | }).start() 109 | 110 | println("*** done setting up streaming") 111 | 112 | Thread.sleep(5000) 113 | 114 | println("*** publishing more messages") 115 | numbers.foreach { n => 116 | producer.send(new ProducerRecord(topic, "[2]key_" + n, "[2]string_" + n)) 117 | } 118 | 119 | Thread.sleep(5000) 120 | 121 | println("*** Stopping stream") 122 | query.stop() 123 | 124 | query.awaitTermination() 125 | spark.stop() 126 | println("*** Streaming terminated") 127 | 128 | // stop Kafka 129 | println("*** Stopping Kafka") 130 | kafkaServer.stop() 131 | 132 | println("*** done") 133 | } 134 | } -------------------------------------------------------------------------------- /src/main/scala/AddPartitionsWhileStreaming.scala: -------------------------------------------------------------------------------- 1 | import java.util.{Arrays, Properties} 2 | 3 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 4 | import org.apache.spark.streaming.{Seconds, StreamingContext} 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | import util.{EmbeddedKafkaServer, SimpleKafkaClient, SparkKafkaSink} 7 | 8 | /** 9 | * Partitions can be added to a Kafka topic dynamically. This example shows that an existing stream 10 | * will not see the data published to the new partitions, and only when the existing streaming context is terminated 11 | * and a new stream is started from a new context will that data be delivered. 12 | * 13 | * The topic is created with three partitions, and so each RDD the stream produces has three partitions as well, 14 | * even after two more partitions are added to the topic. When a new stream is subsequently created, the RDDs produced 15 | * have five partitions, but only two of them contain data, as all the data has been drained from the initial three 16 | * partitions of the topic, by the first stream. 17 | */ 18 | object AddPartitionsWhileStreaming { 19 | 20 | /** 21 | * Publish some data to a topic. Encapsulated here to ensure serializability. 
22 | * @param max 23 | * @param sc 24 | * @param topic 25 | * @param config 26 | */ 27 | def send(max: Int, sc: SparkContext, topic: String, config: Properties): Unit = { 28 | 29 | // put some data in an RDD and publish to Kafka 30 | val numbers = 1 to max 31 | val numbersRDD = sc.parallelize(numbers, 5) 32 | 33 | val kafkaSink = sc.broadcast(SparkKafkaSink(config)) 34 | 35 | println("*** producing data") 36 | 37 | numbersRDD.foreach { n => 38 | // NOTE: 39 | // 1) the keys and values are strings, which is important when receiving them 40 | // 2) We don't specify which Kafka partition to send to, so a hash of the key 41 | // is used to determine this 42 | kafkaSink.value.send(topic, "key_" + n, "string_" + n) 43 | } 44 | } 45 | 46 | def main (args: Array[String]) { 47 | 48 | val topic = "foo" 49 | 50 | val kafkaServer = new EmbeddedKafkaServer() 51 | kafkaServer.start() 52 | kafkaServer.createTopic(topic, 3) 53 | 54 | 55 | 56 | val conf = new SparkConf().setAppName("AddPartitionsWhileStreaming").setMaster("local[7]") 57 | val sc = new SparkContext(conf) 58 | 59 | // streams will produce data every second 60 | val ssc = new StreamingContext(sc, Seconds(1)) 61 | 62 | val max = 500 63 | 64 | val props: Properties = SimpleKafkaClient.getBasicStringStringConsumer(kafkaServer) 65 | 66 | val kafkaStream = 67 | KafkaUtils.createDirectStream( 68 | ssc, 69 | LocationStrategies.PreferConsistent, 70 | ConsumerStrategies.Subscribe[String, String]( 71 | Arrays.asList(topic), 72 | props.asInstanceOf[java.util.Map[String, Object]] 73 | ) 74 | 75 | ) 76 | 77 | // now, whenever this Kafka stream produces data the resulting RDD will be printed 78 | kafkaStream.foreachRDD(r => { 79 | println("[1] *** got an RDD, size = " + r.count()) 80 | r.foreach(s => println(s)) 81 | if (r.count() > 0) { 82 | // let's see how many partitions the resulting RDD has -- notice that it has nothing 83 | // to do with the number of partitions in the RDD used to publish the data (4), nor 84 | // the number of partitions of the topic (which also happens to be four.) 
85 | println("[1] *** " + r.getNumPartitions + " partitions") 86 | r.glom().foreach(a => println("[1] *** partition size = " + a.size)) 87 | } 88 | }) 89 | 90 | ssc.start() 91 | 92 | println("*** started streaming context") 93 | 94 | // streams seem to need some time to get going 95 | Thread.sleep(5000) 96 | 97 | 98 | val client = new SimpleKafkaClient(kafkaServer) 99 | 100 | send(max, sc, topic, client.basicStringStringProducer) 101 | Thread.sleep(5000) 102 | 103 | println("*** adding partitions to topic") 104 | 105 | kafkaServer.addPartitions(topic, 5) 106 | 107 | Thread.sleep(5000) 108 | 109 | send(max, sc, topic, client.basicStringStringProducer) 110 | 111 | Thread.sleep(5000) 112 | 113 | println("*** stop first streaming context") 114 | ssc.stop(stopSparkContext = false) 115 | try { 116 | ssc.awaitTermination() 117 | println("*** streaming terminated for the first time") 118 | } catch { 119 | case e: Exception => { 120 | println("*** streaming exception caught in monitor thread (first context)") 121 | } 122 | } 123 | 124 | println("*** create second streaming context") 125 | val ssc2 = new StreamingContext(sc, Seconds(1)) 126 | 127 | println("*** create a second stream from the second streaming context") 128 | val kafkaStream2 = 129 | KafkaUtils.createDirectStream( 130 | ssc2, 131 | LocationStrategies.PreferConsistent, 132 | ConsumerStrategies.Subscribe[String, String]( 133 | Arrays.asList(topic), 134 | props.asInstanceOf[java.util.Map[String, Object]] 135 | ) 136 | 137 | ) 138 | 139 | kafkaStream2.foreachRDD(r => { 140 | println("[2] *** got an RDD, size = " + r.count()) 141 | r.foreach(s => println(s)) 142 | if (r.count() > 0) { 143 | // let's see how many partitions the resulting RDD has -- notice that it has nothing 144 | // to do with the number of partitions in the RDD used to publish the data (4), nor 145 | // the number of partitions of the topic (which also happens to be four.) 
146 | println("[2] *** " + r.getNumPartitions + " partitions") 147 | r.glom().foreach(a => println("[2] *** partition size = " + a.size)) 148 | } 149 | }) 150 | 151 | println("*** start second streaming context") 152 | ssc2.start() 153 | 154 | Thread.sleep(5000) 155 | 156 | println("*** requesting streaming termination") 157 | ssc2.stop(stopSparkContext = false, stopGracefully = true) 158 | 159 | 160 | try { 161 | ssc2.awaitTermination() 162 | println("*** streaming terminated") 163 | } catch { 164 | case e: Exception => { 165 | println("*** streaming exception caught in monitor thread") 166 | } 167 | } 168 | 169 | // stop Spark 170 | sc.stop() 171 | 172 | // stop Kafka 173 | kafkaServer.stop() 174 | 175 | println("*** done") 176 | } 177 | } -------------------------------------------------------------------------------- /src/main/scala/applications/stock_price_feed/StockMarketData.scala: -------------------------------------------------------------------------------- 1 | package applications.stock_price_feed 2 | 3 | import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream} 4 | import java.util.{Arrays, Properties} 5 | 6 | import org.apache.kafka.clients.consumer.ConsumerConfig 7 | import org.apache.kafka.clients.producer.{KafkaProducer, ProducerConfig, ProducerRecord} 8 | import org.apache.kafka.common.serialization.{Deserializer, Serializer} 9 | import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies} 10 | import org.apache.spark.streaming.{Seconds, StreamingContext} 11 | import org.apache.spark.{SparkConf, SparkContext} 12 | import util.{EmbeddedKafkaServer, PartitionMapAnalyzer, SimpleKafkaClient} 13 | 14 | import scala.collection.{Iterator, mutable} 15 | 16 | class TradeData(val symbol: String, val price: Double, val volume: Long) extends Serializable { 17 | 18 | } 19 | 20 | class ChunkedTradeData(val symbol: String) extends Serializable { 21 | var trades = 0 22 | var totalAmount = 0.0 23 | var totalVolume: Long = 0 24 | 25 | def addTrade(trade: TradeData) : Unit = { 26 | trades = trades + 1 27 | totalVolume = totalVolume + trade.volume 28 | totalAmount = totalAmount + trade.volume * trade.price 29 | } 30 | 31 | def averagePrice = totalAmount / totalVolume 32 | } 33 | 34 | class TradeDataSerializer extends Serializer[TradeData] { 35 | 36 | override def close(): Unit = {} 37 | 38 | override def configure(config: java.util.Map[String, _], isKey: Boolean) : Unit = {} 39 | 40 | override def serialize(topic: String, data: TradeData) : Array[Byte] = { 41 | val stream: ByteArrayOutputStream = new ByteArrayOutputStream() 42 | val oos = new ObjectOutputStream(stream) 43 | oos.writeObject(data) 44 | oos.close 45 | stream.toByteArray 46 | } 47 | } 48 | 49 | class TradeDataDeserializer extends Deserializer[TradeData] { 50 | 51 | override def close(): Unit = {} 52 | 53 | override def configure(config: java.util.Map[String, _], isKey: Boolean) : Unit = {} 54 | 55 | override def deserialize(topic: String, data: Array[Byte]) : TradeData = { 56 | val ois = new ObjectInputStream(new ByteArrayInputStream(data)) 57 | val value = ois.readObject 58 | ois.close 59 | value.asInstanceOf[TradeData] 60 | } 61 | } 62 | 63 | 64 | 65 | object StockMarketData { 66 | 67 | def getProducer(server: EmbeddedKafkaServer) : Properties = { 68 | val config: Properties = new Properties 69 | config.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, classOf[TradeDataSerializer].getCanonicalName) 70 | 
config.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, classOf[TradeDataSerializer].getCanonicalName) 71 | config.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, server.getKafkaConnect) 72 | config 73 | } 74 | 75 | def getConsumer(server: EmbeddedKafkaServer, group:String = "MyGroup") : Properties = { 76 | val consumerConfig: Properties = new Properties 77 | consumerConfig.put(ConsumerConfig.GROUP_ID_CONFIG, group) 78 | consumerConfig.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, classOf[TradeDataDeserializer].getCanonicalName) 79 | consumerConfig.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, classOf[TradeDataDeserializer].getCanonicalName) 80 | consumerConfig.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, server.getKafkaConnect) 81 | consumerConfig.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest") 82 | consumerConfig 83 | } 84 | 85 | def main (args: Array[String]) { 86 | 87 | val topic1 = "SYM1" 88 | val topic2 = "SYM2" 89 | 90 | // topics are partitioned differently 91 | val kafkaServer = new EmbeddedKafkaServer() 92 | kafkaServer.start() 93 | kafkaServer.createTopic(topic1, 1) 94 | kafkaServer.createTopic(topic2, 1) 95 | 96 | val conf = new SparkConf().setAppName("StockMarketData").setMaster("local[10]") 97 | val sc = new SparkContext(conf) 98 | 99 | // streams will produce data every second 100 | val ssc = new StreamingContext(sc, Seconds(1)) 101 | 102 | // this many messages 103 | val max = 100 104 | 105 | // Create the stream. 106 | val props: Properties = getConsumer(kafkaServer) 107 | 108 | val rawDataFeed = 109 | KafkaUtils.createDirectStream( 110 | ssc, 111 | LocationStrategies.PreferConsistent, 112 | ConsumerStrategies.Subscribe[String, TradeData]( 113 | Arrays.asList(topic1, topic2), 114 | props.asInstanceOf[java.util.Map[String, Object]] 115 | ) 116 | 117 | ) 118 | 119 | // now, whenever this Kafka stream produces data the resulting RDD will be printed 120 | rawDataFeed.foreachRDD(r => { 121 | println("*** got an RDD, size = " + r.count()) 122 | 123 | PartitionMapAnalyzer.analyze(r) 124 | 125 | }) 126 | 127 | def chunkingFunc(i: Iterator[TradeData]) : Iterator[Map[String, ChunkedTradeData]] = { 128 | val m = new mutable.HashMap[String, ChunkedTradeData]() 129 | i.foreach { 130 | case trade: TradeData => 131 | if (m.contains(trade.symbol)) { 132 | m(trade.symbol).addTrade(trade) 133 | } else { 134 | val chunked = new ChunkedTradeData(trade.symbol) 135 | chunked.addTrade(trade) 136 | m(trade.symbol) = chunked 137 | } 138 | } 139 | Iterator.single(m.toMap) 140 | } 141 | 142 | val decodedFeed = rawDataFeed.map(cr => cr.value()) 143 | 144 | val chunkedDataFeed = decodedFeed.mapPartitions(chunkingFunc, preservePartitioning = true) 145 | 146 | chunkedDataFeed.foreachRDD(rdd => { 147 | rdd.foreach(m => 148 | m.foreach { 149 | case (symbol, chunk) => 150 | println(s"Symbol ${chunk.symbol} Price ${chunk.averagePrice} Volume ${chunk.totalVolume} Trades ${chunk.trades}") 151 | }) 152 | }) 153 | 154 | ssc.start() 155 | 156 | println("*** started streaming context") 157 | 158 | // streams seem to need some time to get going 159 | Thread.sleep(5000) 160 | 161 | val producerThreadTopic1 = new Thread("Producer thread 1") { 162 | override def run() { 163 | val client = new SimpleKafkaClient(kafkaServer) 164 | 165 | val numbers = 1 to max 166 | 167 | val producer = new KafkaProducer[String, TradeData](getProducer(kafkaServer)) 168 | 169 | numbers.foreach { n => 170 | // NOTE: 171 | // 1) the keys and values are strings, which is important when receiving them 172 | // 2) We don't 
specify which Kafka partition to send to, so a hash of the key 173 | // is used to determine this 174 | producer.send(new ProducerRecord(topic1, new TradeData("SYM1", 12.0, 100))) 175 | } 176 | 177 | } 178 | } 179 | 180 | val producerThreadTopic2 = new Thread("Producer thread 2; controlling termination") { 181 | override def run() { 182 | val client = new SimpleKafkaClient(kafkaServer) 183 | 184 | val numbers = 1 to max 185 | 186 | val producer = new KafkaProducer[String, TradeData](getProducer(kafkaServer)) 187 | 188 | numbers.foreach { n => 189 | // NOTE: 190 | // 1) the keys and values are strings, which is important when receiving them 191 | // 2) We don't specify which Kafka partition to send to, so a hash of the key 192 | // is used to determine this 193 | producer.send(new ProducerRecord(topic2, new TradeData("SYM2", 123.0, 200))) 194 | } 195 | Thread.sleep(10000) 196 | println("*** requesting streaming termination") 197 | ssc.stop(stopSparkContext = false, stopGracefully = true) 198 | } 199 | } 200 | 201 | producerThreadTopic1.start() 202 | producerThreadTopic2.start() 203 | 204 | try { 205 | ssc.awaitTermination() 206 | println("*** streaming terminated") 207 | } catch { 208 | case e: Exception => { 209 | println("*** streaming exception caught in monitor thread") 210 | } 211 | } 212 | 213 | // stop Spark 214 | sc.stop() 215 | 216 | // stop Kafka 217 | kafkaServer.stop() 218 | 219 | println("*** done") 220 | } 221 | } 222 | -------------------------------------------------------------------------------- /.idea/uiDesigner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Self-contained examples of Spark streaming integrated with Kafka 2 | 3 | [![Build Status](https://travis-ci.org/spirom/spark-streaming-with-kafka.svg?branch=master)](https://travis-ci.org/spirom/spark-streaming-with-kafka) 4 | 5 | The goal of this project is to make it easy to experiment with Spark Streaming based on Kafka, 6 | by creating examples that run against an embedded Kafka server and an embedded Spark instance. 7 | Of course, in making everything easy to work with we also make it perform poorly. It would be a 8 | really bad idea to try to learn anything about performance from this project: it's all 9 | about functionality, although we sometimes get insight into performance issues by understanding 10 | the way the 11 | code interacts with RDD partitioning in Spark and topic partitioning in Kafka. 12 | 13 | ## Related projects 14 | 15 | This project is derived from the 16 | [LearningSpark project](https://github.com/spirom/LearningSpark) which explores the full range of Spark APIs from the 17 | viewpoint of Scala developers. 
There is a corresponding, but much less comprehensive Java version at
[learning-spark-with-java](https://github.com/spirom/learning-spark-with-java).

The [spark-data-sources](https://github.com/spirom/spark-data-sources) project is focused on
the new experimental APIs introduced in Spark 2.3.0 for developing adapters for
external data sources of various kinds. This API is essentially a Java API (developed in Java) to avoid forcing
developers to adopt Scala for their data source adapters. Consequently, the example data sources
in that project are written in Java, but both Java and Scala usage examples are provided.

## Dependencies

The project was created with IntelliJ IDEA 14 Community Edition. It is known to work with
JDK 1.8, Scala 2.11.12, and Spark 2.3.0 with its Kafka 0.10 shim library on Ubuntu Linux.

It uses the Direct DStream package spark-streaming-kafka-0-10 for
Spark Streaming integration with Kafka 0.10.0.1. The details behind this are explained in the
[Spark 2.3.0 documentation](https://spark.apache.org/docs/2.3.0/streaming-kafka-integration.html).

Note that, with the release of Spark 2.3.0, the formerly stable Receiver DStream APIs are now deprecated, and the
formerly experimental Direct DStream APIs are now stable.

## Using the deprecated (Receiver DStream) Kafka 0.8.0 APIs

I've kept the examples for the older, stable Kafka integration around on the
[kafka0.8](https://github.com/spirom/spark-streaming-with-kafka/tree/kafka0.8) branch.

# Structured Streaming

There's a [separate set of examples](src/main/scala/structured) for
Kafka integration with the new Structured Streaming features (mainstream as of Spark 2.2).

## Utilities
| File | Purpose |
| --- | --- |
| util/DirectServerDemo.scala | Run this first as a Spark-free sanity check for the embedded server and clients. |
| util/EmbeddedKafkaServer.scala | Starting and stopping an embedded Kafka server, and creating and modifying topics. |
| util/EmbeddedZookeeper.scala | Starting and stopping an embedded Zookeeper. |
| util/PartitionMapAnalyzer.scala | Support for understanding how subscribed Kafka topics and their Kafka partitions map to partitions in the RDD that is emitted by the Spark stream. |
| util/SimpleKafkaClient.scala | Directly connect to Kafka without using Spark. |
| util/SparkKafkaSink.scala | Support for publishing to a Kafka topic in parallel from Spark. |
| util/TemporaryDirectories.scala | Support for creating and cleaning up the temporary directories needed for the Kafka broker, Zookeeper and Spark streaming. |
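Taken together, the examples below use these helpers in roughly the following way. This is a condensed sketch, not a file in the repo: the wrapper object name is hypothetical, and the topic name and partition count are arbitrary placeholders.

```scala
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}
import util.{EmbeddedKafkaServer, SimpleKafkaClient}

object UtilityLifecycleSketch {
  def main(args: Array[String]): Unit = {
    // start an embedded broker (plus its Zookeeper) and create a four-partition topic
    val kafkaServer = new EmbeddedKafkaServer()
    kafkaServer.start()
    kafkaServer.createTopic("foo", 4)

    // publish a record directly (no Spark) using the canned String/String producer config
    val client = new SimpleKafkaClient(kafkaServer)
    val producer = new KafkaProducer[String, String](client.basicStringStringProducer)
    producer.send(new ProducerRecord("foo", "key_1", "string_1"))
    producer.close()

    // consumer properties for the same broker: the streaming examples pass these to Spark
    val consumerProps = SimpleKafkaClient.getBasicStringStringConsumer(kafkaServer)
    println("*** broker is at " + kafkaServer.getKafkaConnect)

    // tear the broker down again
    kafkaServer.stop()
  }
}
```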
## Basic Examples
| File | What's Illustrated |
| --- | --- |
| SimpleStreaming.scala | The simplest way to set up streaming from a Kafka topic. While this program also publishes to the topic, the publishing does not involve Spark. |
| ExceptionPropagation.scala | Show how a call to awaitTermination() throws propagated exceptions. |
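The subscription pattern that all of the DStream examples share looks roughly like the sketch below, condensed from the example sources. The object name, topic name, and batch interval are placeholders, and the publishing, waiting, and shutdown steps of the real examples are elided.

```scala
import java.util.{Arrays, Properties}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import util.{EmbeddedKafkaServer, SimpleKafkaClient}

object BasicStreamSketch {
  def main(args: Array[String]): Unit = {
    val topic = "foo"
    val kafkaServer = new EmbeddedKafkaServer()
    kafkaServer.start()
    kafkaServer.createTopic(topic, 4)

    val conf = new SparkConf().setAppName("BasicStreamSketch").setMaster("local[4]")
    val ssc = new StreamingContext(new SparkContext(conf), Seconds(1)) // a batch every second

    // consumer configuration pointing at the embedded broker
    val props: Properties = SimpleKafkaClient.getBasicStringStringConsumer(kafkaServer)

    // a direct stream: the executors read the topic partitions themselves, no receivers involved
    val kafkaStream =
      KafkaUtils.createDirectStream(
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](
          Arrays.asList(topic),
          props.asInstanceOf[java.util.Map[String, Object]]))

    // whenever the stream produces an RDD, report how many records arrived
    kafkaStream.foreachRDD(r => println("*** got an RDD, size = " + r.count()))

    ssc.start()
    // ... publish some data, wait, then stop the streaming context, SparkContext and broker
  }
}
```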
## Partitioning Examples

Partitioning is an important factor in determining the scalability of Kafka-based streaming applications.
In this set of examples you can see the relationship between a number of facets of partitioning:
* The number of partitions in the RDD that is being published to a topic -- if indeed this involves an RDD, as the data is often published from a non-Spark application
* The number of partitions of the topic itself (usually specified at topic creation)
* The number of partitions in the RDDs created by the Kafka stream
* Whether and how messages move between partitions when they are transferred

When running these examples, look for:
* The topic partition number that is printed with each ConsumerRecord
* After all the records are printed, the number of partitions in the resulting RDD and the size of each partition

For example:

    *** 4 partitions
    *** partition size = 253
    *** partition size = 252
    *** partition size = 258
    *** partition size = 237

Another way these examples differ from the basic examples above is that Spark is used to publish to the topic.
Perhaps surprisingly, this is not completely straightforward, and relies on [util/SparkKafkaSink.scala](src/main/scala/util/SparkKafkaSink.scala), sketched below.
An alternative approach can be found [here](https://docs.cloud.databricks.com/docs/latest/databricks_guide/07%20Spark%20Streaming/09%20Write%20Output%20To%20Kafka.html).
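The publishing pattern is essentially the `send()` helper that the partitioning examples share, shown here in condensed form (only the wrapper object name is invented). Broadcasting the sink lets every executor publish through it from inside `foreach`.

```scala
import java.util.Properties

import org.apache.spark.SparkContext
import util.SparkKafkaSink

object PublishViaSpark {
  // `config` is a producer Properties object, e.g. the basicStringStringProducer of a SimpleKafkaClient
  def send(max: Int, sc: SparkContext, topic: String, config: Properties): Unit = {
    // put some data in an RDD ...
    val numbersRDD = sc.parallelize(1 to max, 4)

    // ... broadcast the sink so the executors can publish to Kafka in parallel ...
    val kafkaSink = sc.broadcast(SparkKafkaSink(config))

    numbersRDD.foreach { n =>
      // no Kafka partition is specified, so a hash of the (String) key picks one
      kafkaSink.value.send(topic, "key_" + n, "string_" + n)
    }
  }
}
```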
| File | What's Illustrated |
| --- | --- |
| SimpleStreamingFromRDD.scala | Data is published by Spark from an RDD, but is repartitioned even though the publishing RDD and the topic have the same number of partitions. |
| SendWithDifferentPartitioning.scala | Send to a topic with a different number of partitions. |
| ControlledPartitioning.scala | When publishing to the topic, explicitly assign each record to a partition. |
| AddPartitionsWhileStreaming.scala | Partitions can be added to a Kafka topic dynamically. This example shows that an existing stream will not see the data published to the new partitions; only when the existing streaming context is terminated and a new stream is started from a new context will that data be delivered. The details are traced below. |

In AddPartitionsWhileStreaming.scala, the topic is created with three partitions, and so each RDD the stream produces has three partitions as well, even after two more partitions are added to the topic. This is what's received after the first 500 records are published to the topic while it has only three partitions:

    [1] *** got an RDD, size = 500
    [1] *** 3 partitions
    [1] *** partition size = 155
    [1] *** partition size = 173
    [1] *** partition size = 172

When two partitions are added and another 500 messages are published, this is what's received (note both the number of partitions and the number of messages):

    [1] *** got an RDD, size = 288
    [1] *** 3 partitions
    [1] *** partition size = 98
    [1] *** partition size = 89
    [1] *** partition size = 101

When a new stream is subsequently created, the RDDs produced have five partitions, but only two of them contain data, as all the data has been drained from the initial three partitions of the topic by the first stream. Now all 500 messages (288 + 212) from the second set have been delivered.

    [2] *** got an RDD, size = 212
    [2] *** 5 partitions
    [2] *** partition size = 0
    [2] *** partition size = 0
    [2] *** partition size = 0
    [2] *** partition size = 112
    [2] *** partition size = 100
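The per-batch partition reports shown above come from a few lines that each example repeats inline after creating its stream; for reference they are pulled together here into a small helper (the helper and its enclosing object are hypothetical, not part of the repo). `glom()` collapses each RDD partition into a single array, so the array's length is that partition's size.

```scala
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.streaming.dstream.DStream

object PartitionReport {
  // Print the size and per-partition record counts of every batch a Kafka DStream delivers.
  def report(label: String, stream: DStream[ConsumerRecord[String, String]]): Unit =
    stream.foreachRDD { r =>
      println(s"[$label] *** got an RDD, size = " + r.count())
      if (r.count() > 0) {
        println(s"[$label] *** " + r.getNumPartitions + " partitions")
        // each element of glom()'s RDD is the full contents of one partition
        r.glom().foreach(a => println(s"[$label] *** partition size = " + a.size))
      }
    }
}
```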
## Other Examples
| File | What's Illustrated |
| --- | --- |
| MultipleConsumerGroups.scala | Two streams subscribing to the same topic via two consumer groups see all the same data. |
| MultipleStreams.scala | Two streams subscribing to the same topic via a single consumer group divide up the data. There's an interesting partitioning interaction here, as the streams each get data from two of the four topic partitions, and each produces RDDs with two partitions. |
| MultipleTopics.scala | A single stream subscribing to two topics receives data from both of them; the partitioning behavior is described below. |
| Timestamp.scala | How Kafka 0.10 record timestamps appear in the stream; described below. |

In MultipleTopics.scala, the partitioning behavior is quite interesting:
* The topics have three and six partitions respectively.
* Each RDD has nine partitions.
* Each RDD partition receives data from exactly one partition of one topic.

Hence the output of the PartitionMapAnalyzer:

    *** got an RDD, size = 200
    *** 9 partitions
    *** partition 1 has 27 records
    *** rdd partition = 1, topic = foo, topic partition = 0, record count = 27.
    *** partition 2 has 15 records
    *** rdd partition = 2, topic = bar, topic partition = 1, record count = 15.
    *** partition 3 has 17 records
    *** rdd partition = 3, topic = bar, topic partition = 0, record count = 17.
    *** partition 4 has 39 records
    *** rdd partition = 4, topic = foo, topic partition = 1, record count = 39.
    *** partition 5 has 34 records
    *** rdd partition = 5, topic = foo, topic partition = 2, record count = 34.
    *** partition 6 has 11 records
    *** rdd partition = 6, topic = bar, topic partition = 3, record count = 11.
    *** partition 7 has 18 records
    *** rdd partition = 7, topic = bar, topic partition = 4, record count = 18.
    *** partition 8 has 20 records
    *** rdd partition = 8, topic = bar, topic partition = 2, record count = 20.

Timestamp.scala exercises record timestamps, which were introduced into Kafka 0.10 as described in
[KIP-32](https://cwiki.apache.org/confluence/display/KAFKA/KIP-32+-+Add+timestamps+to+Kafka+message)
and
[KIP-33](https://cwiki.apache.org/confluence/display/KAFKA/KIP-33+-+Add+a+time+based+log+index).

This example sets up two different topics that handle timestamps differently -- topic A has the timestamp set by the broker when it receives the record, while topic B passes through the timestamp provided in the record (either programmatically when the record was created, as shown here, or otherwise automatically by the producer).

Since the record carries information about where its timestamp originates, it's easy to subscribe to the two topics to create a single stream, and then examine the timestamp of every received record and its type.

NOTE: The use of timestamps to filter topics in the broker, as introduced in Kafka 0.10.1, is blocked on [SPARK-18057](https://issues.apache.org/jira/browse/SPARK-18057).
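For reference, here is a condensed sketch of the subscription side of Timestamp.scala. The wrapper object and method name are invented for the sketch, and it assumes a StreamingContext and an EmbeddedKafkaServer have already been set up as in the example.

```scala
import java.util.{Arrays, Properties}

import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import util.{EmbeddedKafkaServer, SimpleKafkaClient}

object TimestampSketch {
  def subscribeToBoth(ssc: StreamingContext, kafkaServer: EmbeddedKafkaServer): Unit = {
    // topic "A" stamps records with broker (log-append) time; topic "B" keeps the producer-supplied time
    kafkaServer.createTopic("A", 4, logAppendTime = true)
    kafkaServer.createTopic("B", 4)

    val props: Properties = SimpleKafkaClient.getBasicStringStringConsumer(kafkaServer)

    // a single stream subscribed to both topics
    val kafkaStream =
      KafkaUtils.createDirectStream(
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](
          Arrays.asList("A", "B"),
          props.asInstanceOf[java.util.Map[String, Object]]))

    val timeFormat = new java.text.SimpleDateFormat("HH:mm:ss.SSS")

    kafkaStream.foreachRDD(r =>
      r.foreach(cr =>
        // timestampType() reports whether the broker (LogAppendTime) or the producer (CreateTime) set it
        println("Topic [" + cr.topic() + "] Key [" + cr.key + "] Type [" + cr.timestampType() +
          "] Timestamp [" + timeFormat.format(cr.timestamp()) + "]")))
  }
}
```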
-------------------------------------------------------------------------------- /.idea/modules/spark-streaming-with-kafka-build.iml: --------------------------------------------------------------------------------