├── .gitignore ├── src └── main │ ├── resources │ ├── META-INF │ │ └── MANIFEST.MF │ └── ansj_library.properties │ └── scala │ └── com │ └── lenovo │ └── ml │ ├── Word2Vector.scala │ ├── XGBoostInference.scala │ ├── DataPreprocess.scala │ └── XGBoostTrain.scala ├── .idea ├── copyright │ └── profiles_settings.xml ├── encodings.xml ├── vcs.xml ├── modules.xml ├── artifacts │ └── xgbspark_text_classification_jar.xml ├── libraries │ ├── Maven__oro_oro_2_0_8.xml │ ├── Maven__junit_junit_4_12.xml │ ├── Maven__org_tukaani_xz_1_0.xml │ ├── Maven__antlr_antlr_2_7_7.xml │ ├── Maven__log4j_log4j_1_2_17.xml │ ├── Maven__org_antlr_ST4_4_0_4.xml │ ├── Maven__xmlenc_xmlenc_0_52.xml │ ├── Maven__stax_stax_api_1_0_1.xml │ ├── Maven__javax_transaction_jta_1_1.xml │ ├── Maven__net_sf_py4j_py4j_0_10_4.xml │ ├── Maven__org_apache_ivy_ivy_2_4_0.xml │ ├── Maven__javax_jdo_jdo_api_3_0_1.xml │ ├── Maven__net_jpountz_lz4_lz4_1_3_0.xml │ ├── Maven__org_ansj_ansj_seg_5_1_2.xml │ ├── Maven__org_apache_avro_avro_1_7_7.xml │ ├── Maven__org_iq80_snappy_snappy_0_2.xml │ ├── Maven__com_google_inject_guice_3_0.xml │ ├── Maven__net_sf_opencsv_opencsv_2_3.xml │ ├── Maven__org_jodd_jodd_core_3_5_2.xml │ ├── Maven__org_nlpcn_nlp_lang_1_7_2.xml │ ├── Maven__commons_io_commons_io_2_4.xml │ ├── Maven__joda_time_joda_time_2_9_3.xml │ ├── Maven__net_razorvine_pyrolite_4_13.xml │ ├── Maven__org_objenesis_objenesis_2_1.xml │ ├── Maven__aopalliance_aopalliance_1_0.xml │ ├── Maven__com_google_guava_guava_14_0_1.xml │ ├── Maven__commons_cli_commons_cli_1_2.xml │ ├── Maven__commons_net_commons_net_2_2.xml │ ├── Maven__io_netty_netty_3_8_0_Final.xml │ ├── Maven__javax_inject_javax_inject_1.xml │ ├── Maven__org_scala_lang_scalap_2_11_0.xml │ ├── Maven__org_slf4j_slf4j_api_1_7_16.xml │ ├── Maven__javolution_javolution_5_5_1.xml │ ├── Maven__org_antlr_antlr_runtime_3_4.xml │ ├── Maven__org_jpmml_pmml_model_1_2_15.xml │ ├── Maven__com_ning_compress_lzf_1_0_3.xml │ ├── Maven__com_twitter_chill_2_11_0_8_0.xml │ ├── Maven__com_twitter_chill_java_0_8_0.xml │ ├── Maven__commons_dbcp_commons_dbcp_1_4.xml │ ├── Maven__commons_lang_commons_lang_2_6.xml │ ├── Maven__ml_dmlc_xgboost4j_spark_0_7.xml │ ├── Maven__net_java_dev_jets3t_jets3t_0_7_1.xml │ ├── Maven__org_apache_avro_avro_ipc_1_7_7.xml │ ├── Maven__org_codehaus_janino_janino_3_0_0.xml │ ├── Maven__org_jpmml_pmml_schema_1_2_15.xml │ ├── Maven__org_scalanlp_breeze_2_11_0_12.xml │ ├── Maven__com_esotericsoftware_minlog_1_3_0.xml │ ├── Maven__org_hamcrest_hamcrest_core_1_3.xml │ ├── Maven__com_github_fommil_netlib_core_1_1_2.xml │ ├── Maven__org_apache_derby_derby_10_10_2_0.xml │ ├── Maven__org_apache_thrift_libfb303_0_9_3.xml │ ├── Maven__org_slf4j_jul_to_slf4j_1_7_16.xml │ ├── Maven__org_spire_math_spire_2_11_0_7_4.xml │ ├── Maven__com_github_rwl_jtransforms_2_4_0.xml │ ├── Maven__commons_pool_commons_pool_1_5_4.xml │ ├── Maven__org_antlr_antlr4_runtime_4_5_3.xml │ ├── Maven__org_antlr_stringtemplate_3_2_1.xml │ ├── Maven__org_apache_thrift_libthrift_0_9_3.xml │ ├── Maven__org_slf4j_slf4j_log4j12_1_7_16.xml │ ├── Maven__com_jolbox_bonecp_0_8_0_RELEASE.xml │ ├── Maven__commons_codec_commons_codec_1_10.xml │ ├── Maven__org_spark_project_spark_unused_1_0_0.xml │ ├── Maven__com_chuusai_shapeless_2_11_2_0_0.xml │ ├── Maven__com_google_code_findbugs_jsr305_1_3_9.xml │ ├── Maven__io_netty_netty_all_4_0_42_Final.xml │ ├── Maven__org_apache_avro_avro_ipc_tests_1_7_7.xml │ ├── Maven__org_slf4j_jcl_over_slf4j_1_7_16.xml │ ├── Maven__com_clearspring_analytics_stream_2_7_0.xml │ ├── Maven__org_apache_commons_commons_math_2_1.xml │ ├── Maven__org_apache_hadoop_hadoop_auth_2_2_0.xml │ ├── Maven__org_apache_hadoop_hadoop_hdfs_2_2_0.xml │ ├── Maven__org_apache_zookeeper_zookeeper_3_4_5.xml │ ├── Maven__org_glassfish_hk2_hk2_api_2_4_0_b34.xml │ ├── Maven__org_javassist_javassist_3_18_1_GA.xml │ ├── Maven__org_mortbay_jetty_jetty_util_6_1_26.xml │ ├── Maven__javax_ws_rs_javax_ws_rs_api_2_0_1.xml │ ├── Maven__org_apache_commons_commons_lang3_3_5.xml │ ├── Maven__org_json4s_json4s_ast_2_11_3_2_11.xml │ ├── Maven__com_googlecode_javaewah_JavaEWAH_0_3_2.xml │ ├── Maven__org_scala_lang_scala_library_2_11_8.xml │ ├── Maven__org_scala_lang_scala_reflect_2_11_7.xml │ ├── Maven__com_esotericsoftware_kryo_shaded_3_0_3.xml │ ├── Maven__com_thoughtworks_paranamer_paranamer_2_3.xml │ ├── Maven__log4j_apache_log4j_extras_1_2_17.xml │ ├── Maven__org_apache_avro_avro_mapred_hadoop2_1_7_7.xml │ ├── Maven__org_json4s_json4s_core_2_11_3_2_11.xml │ ├── Maven__commons_digester_commons_digester_1_8.xml │ ├── Maven__io_dropwizard_metrics_metrics_jvm_3_1_2.xml │ ├── Maven__org_apache_hadoop_hadoop_client_2_2_0.xml │ ├── Maven__org_apache_hadoop_hadoop_common_2_2_0.xml │ ├── Maven__org_apache_httpcomponents_httpcore_4_4_4.xml │ ├── Maven__org_apache_spark_spark_sql_2_11_2_1_1.xml │ ├── Maven__org_glassfish_hk2_hk2_utils_2_4_0_b34.xml │ ├── Maven__org_scala_lang_scala_compiler_2_11_0.xml │ ├── Maven__org_xerial_snappy_snappy_java_1_1_2_6.xml │ ├── Maven__commons_logging_commons_logging_1_1_3.xml │ ├── Maven__org_apache_commons_commons_math3_3_4_1.xml │ ├── Maven__org_roaringbitmap_RoaringBitmap_0_5_11.xml │ ├── Maven__org_scalanlp_breeze_macros_2_11_0_12.xml │ ├── Maven__com_google_protobuf_protobuf_java_2_5_0.xml │ ├── Maven__com_univocity_univocity_parsers_2_2_1.xml │ ├── Maven__io_dropwizard_metrics_metrics_core_3_1_2.xml │ ├── Maven__io_dropwizard_metrics_metrics_json_3_1_2.xml │ ├── Maven__javax_servlet_javax_servlet_api_3_1_0.xml │ ├── Maven__org_apache_commons_commons_crypto_1_0_0.xml │ ├── Maven__org_apache_curator_curator_client_2_4_0.xml │ ├── Maven__org_apache_parquet_parquet_column_1_8_1.xml │ ├── Maven__org_apache_parquet_parquet_common_1_8_1.xml │ ├── Maven__org_apache_parquet_parquet_hadoop_1_8_1.xml │ ├── Maven__org_apache_spark_spark_core_2_11_2_1_1.xml │ ├── Maven__org_apache_spark_spark_hive_2_11_2_1_1.xml │ ├── Maven__org_apache_spark_spark_tags_2_11_2_1_1.xml │ ├── Maven__org_apache_xbean_xbean_asm5_shaded_4_4.xml │ ├── Maven__org_apache_hadoop_hadoop_yarn_api_2_2_0.xml │ ├── Maven__org_apache_httpcomponents_httpclient_4_5_2.xml │ ├── Maven__org_apache_spark_spark_mllib_2_11_2_1_1.xml │ ├── Maven__org_glassfish_hk2_hk2_locator_2_4_0_b34.xml │ ├── Maven__org_spire_math_spire_macros_2_11_0_7_4.xml │ ├── Maven__org_apache_curator_curator_recipes_2_4_0.xml │ ├── Maven__org_apache_parquet_parquet_jackson_1_8_1.xml │ ├── Maven__org_datanucleus_datanucleus_core_3_2_10.xml │ ├── Maven__org_datanucleus_datanucleus_rdbms_3_2_9.xml │ ├── Maven__org_json4s_json4s_jackson_2_11_3_2_11.xml │ ├── Maven__commons_httpclient_commons_httpclient_3_1.xml │ ├── Maven__org_apache_commons_commons_compress_1_4_1.xml │ ├── Maven__org_apache_parquet_parquet_encoding_1_8_1.xml │ ├── Maven__org_apache_spark_spark_graphx_2_11_2_1_1.xml │ ├── Maven__org_apache_spark_spark_sketch_2_11_2_1_1.xml │ ├── Maven__org_apache_spark_spark_unsafe_2_11_2_1_1.xml │ ├── Maven__com_twitter_parquet_hadoop_bundle_1_6_0.xml │ ├── Maven__commons_beanutils_commons_beanutils_1_7_0.xml │ ├── Maven__org_codehaus_janino_commons_compiler_3_0_0.xml │ ├── Maven__org_fusesource_leveldbjni_leveldbjni_all_1_8.xml │ ├── Maven__org_scala_lang_modules_scala_xml_2_11_1_0_1.xml │ ├── Maven__com_fasterxml_jackson_core_jackson_core_2_6_5.xml │ ├── Maven__javax_annotation_javax_annotation_api_1_2.xml │ ├── Maven__org_apache_curator_curator_framework_2_4_0.xml │ ├── Maven__org_datanucleus_datanucleus_api_jdo_3_2_6.xml │ ├── Maven__io_dropwizard_metrics_metrics_graphite_3_1_2.xml │ ├── Maven__net_hydromatic_eigenbase_properties_1_1_5.xml │ ├── Maven__net_sourceforge_f2j_arpack_combined_all_0_1.xml │ ├── Maven__org_apache_hadoop_hadoop_annotations_2_2_0.xml │ ├── Maven__org_apache_hadoop_hadoop_yarn_client_2_2_0.xml │ ├── Maven__org_apache_hadoop_hadoop_yarn_common_2_2_0.xml │ ├── Maven__org_apache_spark_spark_catalyst_2_11_2_1_1.xml │ ├── Maven__org_apache_spark_spark_launcher_2_11_2_1_1.xml │ ├── Maven__org_codehaus_jackson_jackson_core_asl_1_9_13.xml │ ├── Maven__org_glassfish_jersey_core_jersey_client_2_22_2.xml │ ├── Maven__org_glassfish_jersey_core_jersey_common_2_22_2.xml │ ├── Maven__org_glassfish_jersey_core_jersey_server_2_22_2.xml │ ├── Maven__org_spark_project_hive_hive_exec_1_2_1_spark2.xml │ ├── Maven__javax_validation_validation_api_1_1_0_Final.xml │ ├── Maven__org_apache_spark_spark_streaming_2_11_2_1_1.xml │ ├── Maven__commons_collections_commons_collections_3_2_1.xml │ ├── Maven__org_codehaus_jackson_jackson_mapper_asl_1_9_13.xml │ ├── Maven__org_glassfish_hk2_osgi_resource_locator_1_0_1.xml │ ├── Maven__com_fasterxml_jackson_core_jackson_databind_2_6_5.xml │ ├── Maven__commons_configuration_commons_configuration_1_6.xml │ ├── Maven__org_apache_spark_spark_mllib_local_2_11_2_1_1.xml │ ├── Maven__org_glassfish_hk2_external_javax_inject_2_4_0_b34.xml │ ├── Maven__commons_beanutils_commons_beanutils_core_1_8_0.xml │ ├── Maven__org_apache_calcite_calcite_core_1_2_0_incubating.xml │ ├── Maven__org_glassfish_jersey_media_jersey_media_jaxb_2_22_2.xml │ ├── Maven__org_spark_project_hive_hive_metastore_1_2_1_spark2.xml │ ├── Maven__com_fasterxml_jackson_core_jackson_annotations_2_6_5.xml │ ├── Maven__org_apache_spark_spark_network_common_2_11_2_1_1.xml │ ├── Maven__org_apache_hadoop_hadoop_yarn_server_common_2_2_0.xml │ ├── Maven__org_apache_spark_spark_network_shuffle_2_11_2_1_1.xml │ ├── Maven__org_apache_calcite_calcite_linq4j_1_2_0_incubating.xml │ ├── Maven__org_apache_parquet_parquet_format_2_3_0_incubating.xml │ ├── Maven__org_apache_calcite_calcite_avatica_1_2_0_incubating.xml │ ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_app_2_2_0.xml │ ├── Maven__org_glassfish_jersey_bundles_repackaged_jersey_guava_2_22_2.xml │ ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_core_2_2_0.xml │ ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_common_2_2_0.xml │ ├── Maven__com_fasterxml_jackson_module_jackson_module_paranamer_2_6_5.xml │ ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_shuffle_2_2_0.xml │ ├── Maven__org_glassfish_hk2_external_aopalliance_repackaged_2_4_0_b34.xml │ ├── Maven__com_fasterxml_jackson_module_jackson_module_scala_2_11_2_6_5.xml │ ├── Maven__org_scala_lang_modules_scala_parser_combinators_2_11_1_0_1.xml │ ├── Maven__org_apache_hadoop_hadoop_mapreduce_client_jobclient_2_2_0.xml │ ├── Maven__org_glassfish_jersey_containers_jersey_container_servlet_2_22_2.xml │ └── Maven__org_glassfish_jersey_containers_jersey_container_servlet_core_2_22_2.xml ├── misc.xml ├── compiler.xml └── uiDesigner.xml ├── README.md ├── pom.xml ├── LICENSE └── xgbspark-text-classification.iml /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/MANIFEST.MF: -------------------------------------------------------------------------------- 1 | Manifest-Version: 1.0 2 | Main-Class: com.lenovo.ml.XGBoostTrain 3 | 4 | -------------------------------------------------------------------------------- /.idea/copyright/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/artifacts/xgbspark_text_classification_jar.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | $PROJECT_DIR$/out/artifacts/xgbspark_text_classification_jar 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__oro_oro_2_0_8.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__junit_junit_4_12.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_tukaani_xz_1_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__antlr_antlr_2_7_7.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__log4j_log4j_1_2_17.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_antlr_ST4_4_0_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__xmlenc_xmlenc_0_52.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__stax_stax_api_1_0_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__javax_transaction_jta_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__net_sf_py4j_py4j_0_10_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_ivy_ivy_2_4_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Features 2 | 3 | * Data Source: `Hive` 4 | * Word Segmentation: `Ansj` 5 | * Feature Engineering: `NGram + TF-IDF` or `Pre-Trained Word2Vec` 6 | * Classification Algorithm: `XGBoost` 7 | * Model Training: `Spark Pipeline` 8 | * Model Selection and Tuning: `Cross Validation + Grid Search` 9 | 10 | ## Environments 11 | 12 | * [Spark](http://spark.apache.org) 2.1.1 13 | * [Hive](https://hive.apache.org) 1.2.1 14 | * [XGBoost4J-Spark](https://github.com/dmlc/xgboost/tree/master/jvm-packages) 0.7 15 | * [Ansj](https://github.com/NLPchina/ansj_seg) 5.1.2 16 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__javax_jdo_jdo_api_3_0_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__net_jpountz_lz4_lz4_1_3_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_ansj_ansj_seg_5_1_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_avro_avro_1_7_7.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_iq80_snappy_snappy_0_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_inject_guice_3_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__net_sf_opencsv_opencsv_2_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_jodd_jodd_core_3_5_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_nlpcn_nlp_lang_1_7_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_io_commons_io_2_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__joda_time_joda_time_2_9_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__net_razorvine_pyrolite_4_13.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_objenesis_objenesis_2_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /src/main/resources/ansj_library.properties: -------------------------------------------------------------------------------- 1 | #path of userLibrary this is default library 2 | #dic=library/default.dic 3 | 4 | #path of crfModel 5 | #crf_dic1=library/crf.model 6 | 7 | #stop_dic1=library/stop.dic 8 | 9 | #redress dic file path 10 | #ambiguityLibrary=library/ambiguity.dic 11 | #synonymsLibrary=library/synonyms.dic 12 | 13 | #set real name 14 | isRealName=true 15 | 16 | #isNameRecognition default true 17 | isNameRecognition=true 18 | 19 | #isNumRecognition default true 20 | isNumRecognition=true 21 | 22 | #digital quantifier merge default true 23 | isQuantifierRecognition=true -------------------------------------------------------------------------------- /.idea/libraries/Maven__aopalliance_aopalliance_1_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_guava_guava_14_0_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_cli_commons_cli_1_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_net_commons_net_2_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__io_netty_netty_3_8_0_Final.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__javax_inject_javax_inject_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_scala_lang_scalap_2_11_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_slf4j_slf4j_api_1_7_16.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__javolution_javolution_5_5_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_antlr_antlr_runtime_3_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_jpmml_pmml_model_1_2_15.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_ning_compress_lzf_1_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_twitter_chill_2_11_0_8_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_twitter_chill_java_0_8_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_dbcp_commons_dbcp_1_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_lang_commons_lang_2_6.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__ml_dmlc_xgboost4j_spark_0_7.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__net_java_dev_jets3t_jets3t_0_7_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_avro_avro_ipc_1_7_7.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_codehaus_janino_janino_3_0_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_jpmml_pmml_schema_1_2_15.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_scalanlp_breeze_2_11_0_12.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_esotericsoftware_minlog_1_3_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_hamcrest_hamcrest_core_1_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_github_fommil_netlib_core_1_1_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_derby_derby_10_10_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_thrift_libfb303_0_9_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_slf4j_jul_to_slf4j_1_7_16.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_spire_math_spire_2_11_0_7_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_github_rwl_jtransforms_2_4_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_pool_commons_pool_1_5_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_antlr_antlr4_runtime_4_5_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_antlr_stringtemplate_3_2_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_thrift_libthrift_0_9_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_slf4j_slf4j_log4j12_1_7_16.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_jolbox_bonecp_0_8_0_RELEASE.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_codec_commons_codec_1_10.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_spark_project_spark_unused_1_0_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_chuusai_shapeless_2_11_2_0_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_code_findbugs_jsr305_1_3_9.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__io_netty_netty_all_4_0_42_Final.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_avro_avro_ipc_tests_1_7_7.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_slf4j_jcl_over_slf4j_1_7_16.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_clearspring_analytics_stream_2_7_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_commons_commons_math_2_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_hadoop_hadoop_auth_2_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_hadoop_hadoop_hdfs_2_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_zookeeper_zookeeper_3_4_5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_glassfish_hk2_hk2_api_2_4_0_b34.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_javassist_javassist_3_18_1_GA.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_mortbay_jetty_jetty_util_6_1_26.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__javax_ws_rs_javax_ws_rs_api_2_0_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_commons_commons_lang3_3_5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_json4s_json4s_ast_2_11_3_2_11.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_googlecode_javaewah_JavaEWAH_0_3_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_scala_lang_scala_library_2_11_8.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_scala_lang_scala_reflect_2_11_7.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_esotericsoftware_kryo_shaded_3_0_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_thoughtworks_paranamer_paranamer_2_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__log4j_apache_log4j_extras_1_2_17.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_avro_avro_mapred_hadoop2_1_7_7.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_json4s_json4s_core_2_11_3_2_11.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_digester_commons_digester_1_8.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__io_dropwizard_metrics_metrics_jvm_3_1_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_hadoop_hadoop_client_2_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_hadoop_hadoop_common_2_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_httpcomponents_httpcore_4_4_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_spark_spark_sql_2_11_2_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_glassfish_hk2_hk2_utils_2_4_0_b34.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_scala_lang_scala_compiler_2_11_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_xerial_snappy_snappy_java_1_1_2_6.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_logging_commons_logging_1_1_3.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_commons_commons_math3_3_4_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_roaringbitmap_RoaringBitmap_0_5_11.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_scalanlp_breeze_macros_2_11_0_12.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_google_protobuf_protobuf_java_2_5_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_univocity_univocity_parsers_2_2_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__io_dropwizard_metrics_metrics_core_3_1_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__io_dropwizard_metrics_metrics_json_3_1_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__javax_servlet_javax_servlet_api_3_1_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_commons_commons_crypto_1_0_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_curator_curator_client_2_4_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_parquet_parquet_column_1_8_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_parquet_parquet_common_1_8_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_parquet_parquet_hadoop_1_8_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_spark_spark_core_2_11_2_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_spark_spark_hive_2_11_2_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_spark_spark_tags_2_11_2_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_xbean_xbean_asm5_shaded_4_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_hadoop_hadoop_yarn_api_2_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_httpcomponents_httpclient_4_5_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_spark_spark_mllib_2_11_2_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_glassfish_hk2_hk2_locator_2_4_0_b34.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_spire_math_spire_macros_2_11_0_7_4.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_curator_curator_recipes_2_4_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_parquet_parquet_jackson_1_8_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_datanucleus_datanucleus_core_3_2_10.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_datanucleus_datanucleus_rdbms_3_2_9.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_json4s_json4s_jackson_2_11_3_2_11.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_httpclient_commons_httpclient_3_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_commons_commons_compress_1_4_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_parquet_parquet_encoding_1_8_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_spark_spark_graphx_2_11_2_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_spark_spark_sketch_2_11_2_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_spark_spark_unsafe_2_11_2_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_twitter_parquet_hadoop_bundle_1_6_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_beanutils_commons_beanutils_1_7_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_codehaus_janino_commons_compiler_3_0_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_fusesource_leveldbjni_leveldbjni_all_1_8.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_scala_lang_modules_scala_xml_2_11_1_0_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_core_2_6_5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__javax_annotation_javax_annotation_api_1_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_curator_curator_framework_2_4_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_datanucleus_datanucleus_api_jdo_3_2_6.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__io_dropwizard_metrics_metrics_graphite_3_1_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__net_hydromatic_eigenbase_properties_1_1_5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__net_sourceforge_f2j_arpack_combined_all_0_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_hadoop_hadoop_annotations_2_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_hadoop_hadoop_yarn_client_2_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_hadoop_hadoop_yarn_common_2_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_spark_spark_catalyst_2_11_2_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_spark_spark_launcher_2_11_2_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_codehaus_jackson_jackson_core_asl_1_9_13.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_glassfish_jersey_core_jersey_client_2_22_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_glassfish_jersey_core_jersey_common_2_22_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_glassfish_jersey_core_jersey_server_2_22_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_spark_project_hive_hive_exec_1_2_1_spark2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__javax_validation_validation_api_1_1_0_Final.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_spark_spark_streaming_2_11_2_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_collections_commons_collections_3_2_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_codehaus_jackson_jackson_mapper_asl_1_9_13.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_glassfish_hk2_osgi_resource_locator_1_0_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_databind_2_6_5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_configuration_commons_configuration_1_6.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_spark_spark_mllib_local_2_11_2_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_glassfish_hk2_external_javax_inject_2_4_0_b34.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_beanutils_commons_beanutils_core_1_8_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_calcite_calcite_core_1_2_0_incubating.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_glassfish_jersey_media_jersey_media_jaxb_2_22_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_spark_project_hive_hive_metastore_1_2_1_spark2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_fasterxml_jackson_core_jackson_annotations_2_6_5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_spark_spark_network_common_2_11_2_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_hadoop_hadoop_yarn_server_common_2_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_spark_spark_network_shuffle_2_11_2_1_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_calcite_calcite_linq4j_1_2_0_incubating.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_parquet_parquet_format_2_3_0_incubating.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_calcite_calcite_avatica_1_2_0_incubating.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_hadoop_hadoop_mapreduce_client_app_2_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_glassfish_jersey_bundles_repackaged_jersey_guava_2_22_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_hadoop_hadoop_mapreduce_client_core_2_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_hadoop_hadoop_mapreduce_client_common_2_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_fasterxml_jackson_module_jackson_module_paranamer_2_6_5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_hadoop_hadoop_mapreduce_client_shuffle_2_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_glassfish_hk2_external_aopalliance_repackaged_2_4_0_b34.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_fasterxml_jackson_module_jackson_module_scala_2_11_2_6_5.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_scala_lang_modules_scala_parser_combinators_2_11_1_0_1.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_apache_hadoop_hadoop_mapreduce_client_jobclient_2_2_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_glassfish_jersey_containers_jersey_container_servlet_2_22_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__org_glassfish_jersey_containers_jersey_container_servlet_core_2_22_2.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/main/scala/com/lenovo/ml/Word2Vector.scala: -------------------------------------------------------------------------------- 1 | package com.lenovo.ml 2 | 3 | /** 4 | * Created by YangChenguang on 2017/10/17. 5 | */ 6 | import org.apache.spark.sql.SparkSession 7 | import DataPreprocess.segWords 8 | import org.apache.spark.ml.feature._ 9 | import org.apache.spark.ml.Pipeline 10 | 11 | object Word2Vector { 12 | def main(args:Array[String]): Unit = { 13 | // 1、创建Spark程序入口 14 | val sparkSession = SparkSession.builder().appName("Word2Vector").enableHiveSupport().getOrCreate() 15 | 16 | // 2、读取训练数据,对文本预处理后分词 17 | val tableName = args(0) 18 | val matrix = sparkSession.sql("SELECT text FROM " + tableName + " where text is not null") 19 | val words = segWords(sparkSession, args(1), args(2), args(3), args(4), matrix).repartition(6).cache() 20 | 21 | // 3、数据准备 22 | val tokenizer = new RegexTokenizer().setInputCol("words").setOutputCol("wordsArray") 23 | val remover = new StopWordsRemover().setInputCol("wordsArray").setOutputCol("filteredWords") 24 | 25 | // 4、训练Word2Vec模型 26 | val word2Vec = new Word2Vec().setInputCol("filteredWords").setOutputCol("features").setStepSize(0.025).setNumPartitions(1) 27 | .setMaxIter(1).setMaxSentenceLength(1000).setWindowSize(5).setVectorSize(args(5).toInt).setMinCount(10).setSeed(12345L) 28 | val pipeline = new Pipeline().setStages(Array(tokenizer, remover, word2Vec)) 29 | val Word2VecModel = pipeline.fit(words) 30 | 31 | // 5、保存模型 32 | Word2VecModel.write.save(args(6)) 33 | 34 | sparkSession.stop() 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/main/scala/com/lenovo/ml/XGBoostInference.scala: -------------------------------------------------------------------------------- 1 | package com.lenovo.ml 2 | 3 | /** 4 | * Created by YangChenguang on 2017/9/15. 5 | */ 6 | import org.apache.spark.sql.{Row, SparkSession} 7 | import org.apache.spark.sql.types.StructType 8 | import DataPreprocess.segWords 9 | import org.apache.spark.ml.PipelineModel 10 | 11 | object XGBoostInference { 12 | def main(args:Array[String]): Unit = { 13 | // 1、创建Spark程序入口 14 | val sparkSession = SparkSession.builder().appName("XGBoostInference").enableHiveSupport().getOrCreate() 15 | 16 | // 2、读取训练数据,对文本预处理后分词 17 | val tableName = args(0) 18 | val matrix = sparkSession.sql("SELECT * FROM " + tableName) 19 | val words = segWords(sparkSession, args(1), args(2), args(3), args(4), matrix.select("text")) 20 | 21 | // 3、将原数据与分词结果关联起来 22 | val rows = matrix.rdd.zip(words.rdd).map{ 23 | case (rowLeft, rowRight) => Row.fromSeq(rowLeft.toSeq ++ rowRight.toSeq) 24 | } 25 | val schema = StructType(matrix.schema.fields ++ words.schema.fields) 26 | val matrixMerge = sparkSession.createDataFrame(rows, schema) 27 | 28 | // 4、构建特征向量 29 | val featuredModelTrained = sparkSession.sparkContext.broadcast(PipelineModel.read.load(args(5))) 30 | val dataPrepared = featuredModelTrained.value.transform(matrixMerge).repartition(18).cache() 31 | 32 | // 5、加载分类模型,产出故障预测结果 33 | val xgbModelTrained = sparkSession.sparkContext.broadcast(PipelineModel.read.load(args(6))) 34 | val prediction = xgbModelTrained.value.transform(dataPrepared) 35 | 36 | // 6、将预测结果写到HDFS 37 | prediction.select("text", "predictedLabel", "probabilities").rdd.coalesce(1).saveAsTextFile(args(7)) 38 | 39 | sparkSession.stop() 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | com.lenovo.ml 8 | xgbspark-text-classification 9 | 1.0-SNAPSHOT 10 | 11 | 12 | UTF-8 13 | 2.1.1 14 | 2.11 15 | 2.6.4 16 | 17 | 18 | 19 | 20 | org.apache.spark 21 | spark-core_${scala.version} 22 | ${spark.version} 23 | 24 | 25 | org.apache.spark 26 | spark-sql_${scala.version} 27 | ${spark.version} 28 | 29 | 30 | org.apache.spark 31 | spark-hive_${scala.version} 32 | ${spark.version} 33 | 34 | 35 | org.apache.spark 36 | spark-mllib_${scala.version} 37 | ${spark.version} 38 | 39 | 40 | org.nlpcn 41 | nlp-lang 42 | 1.7.2 43 | 44 | 45 | org.ansj 46 | ansj_seg 47 | 5.1.2 48 | 49 | 50 | ml.dmlc 51 | xgboost4j-spark 52 | 0.7 53 | 54 | 55 | junit 56 | junit 57 | 4.12 58 | 59 | 60 | 61 | 62 | src/main/scala 63 | src/test/scala 64 | 65 | 66 | 67 | org.apache.maven.plugins 68 | maven-compiler-plugin 69 | 3.3 70 | 71 | 1.8 72 | 1.8 73 | UTF-8 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /src/main/scala/com/lenovo/ml/DataPreprocess.scala: -------------------------------------------------------------------------------- 1 | package com.lenovo.ml 2 | 3 | import org.apache.spark.sql.{SparkSession, DataFrame, Dataset} 4 | import scala.collection.mutable 5 | import scala.util.matching.Regex 6 | import org.ansj.library.DicLibrary 7 | import org.ansj.recognition.impl.StopRecognition 8 | import org.ansj.splitWord.analysis.DicAnalysis 9 | 10 | /** 11 | * Created by YangChenguang on 2017/12/27. 12 | */ 13 | object DataPreprocess { 14 | def textCleaner(sparkSession: SparkSession, rawText: DataFrame): Dataset[String] = { 15 | // 过滤文本中的时间、网址和邮箱 16 | val regex1 = new Regex("""[-—0-9a-z]+[:]+[0-9a-z]+[:]?""") 17 | val regex2 = new Regex("""[0-9]+年|[0-9]+月|[0-9]+[日]|[0-9]+[天]|[0-9]+[号]|[0-9]+[次]""") 18 | val regex3 = new Regex("""http[s]?://[a-z0-9./?=_-]+""") 19 | val regex4 = new Regex("""[0-9_a-z]+([-+.][0-9_a-z]+)*@[0-9_a-z]+([-.][0-9_a-z]+)*\.[0-9_a-z]+([-.][0-9_a-z]+)*""") 20 | 21 | import sparkSession.implicits._ 22 | rawText.map(x => x.toString).map(x => x.substring(1,x.length - 1).toLowerCase).map(x => regex1.replaceAllIn(x,"")) 23 | .map(x => regex2.replaceAllIn(x,"")).map(x => regex3.replaceAllIn(x,"")).map(x => regex4.replaceAllIn(x,"")) 24 | } 25 | 26 | def segWords(sparkSession: SparkSession, stopWordsPath: String, dictionaryPath: String, synonymWordsPath: String, 27 | singleWordsPath: String, rawText: DataFrame): DataFrame = { 28 | val filter = new StopRecognition() 29 | // 设定停用词性 30 | filter.insertStopNatures("w","ns","nr","t","r","u","e","y","o") 31 | // 加载停用词表 32 | val stopWords = sparkSession.sparkContext.textFile(stopWordsPath).cache() 33 | stopWords.collect().foreach{line => filter.insertStopWords(line)} 34 | // 加载自定义词表 35 | val dictionary = sparkSession.sparkContext.textFile(dictionaryPath).cache() 36 | dictionary.collect().foreach{line => DicLibrary.insert(DicLibrary.DEFAULT, line)} 37 | stopWords.collect().foreach{line => DicLibrary.insert(DicLibrary.DEFAULT, line)} 38 | // 构建同义词表 39 | val synonymWords = sparkSession.sparkContext.textFile(synonymWordsPath).cache() 40 | var synonymMap: Map[String, String] = Map() 41 | synonymWords.collect().foreach{line => 42 | val data = line.split(" ",2) 43 | synonymMap = synonymMap + (data(0) -> data(1)) 44 | } 45 | // 构建单字白名单 46 | val singleWords = sparkSession.sparkContext.textFile(singleWordsPath).cache() 47 | val singleWhiteList: mutable.Set[String] = mutable.Set() 48 | singleWords.collect().foreach{line => singleWhiteList.add(line)} 49 | 50 | // 通过广播将词表发送给各节点 51 | val stop = sparkSession.sparkContext.broadcast(filter) 52 | val dic = sparkSession.sparkContext.broadcast(DicLibrary.get(DicLibrary.DEFAULT)) 53 | val synonym = sparkSession.sparkContext.broadcast(synonymMap) 54 | val single = sparkSession.sparkContext.broadcast(singleWhiteList) 55 | 56 | // 读取文本数据,过滤后分词 57 | import sparkSession.implicits._ 58 | textCleaner(sparkSession, rawText).map { x => 59 | val parse = DicAnalysis.parse(x, dic.value).recognition(stop.value) 60 | // 抽取分词结果,不附带词性 61 | val words = for(i<-Range(0,parse.size())) yield parse.get(i).getName 62 | val filterWords = words.map(_.trim).filter(x => x.length > 1 || single.value.contains(x)) 63 | filterWords.map(x => if(synonym.value.contains(x)) synonym.value(x) else x).mkString(" ") 64 | }.toDF("words") 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/com/lenovo/ml/XGBoostTrain.scala: -------------------------------------------------------------------------------- 1 | package com.lenovo.ml 2 | 3 | /** 4 | * Created by YangChenguang on 2017/9/14. 5 | */ 6 | import org.apache.spark.SparkException 7 | import org.apache.spark.sql.types.StructType 8 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 9 | import DataPreprocess.segWords 10 | import scala.collection.mutable 11 | import org.apache.spark.ml.feature._ 12 | import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostEstimator} 13 | import org.apache.spark.ml.{Pipeline, PipelineModel} 14 | import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator 15 | import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit, TrainValidationSplitModel} 16 | 17 | object XGBoostTrain { 18 | def featureEngineeringTFIDF(sparkSession: SparkSession, dataMatrix: DataFrame, savePath: String): DataFrame ={ 19 | // 获取nGram 20 | val tokenizer = new RegexTokenizer().setInputCol("words").setOutputCol("wordsArray") 21 | val remover = new StopWordsRemover().setInputCol("wordsArray").setOutputCol("filteredWords") 22 | val nGram2 = new NGram().setN(2).setInputCol("filteredWords").setOutputCol("gram-2") 23 | val nGram3 = new NGram().setN(3).setInputCol("filteredWords").setOutputCol("gram-3") 24 | 25 | // 计算TF-IDF 26 | val countVectorizer_1gram = new CountVectorizer().setInputCol("filteredWords") 27 | val countVectorizer_2gram = new CountVectorizer().setInputCol("gram-2") 28 | val countVectorizer_3gram = new CountVectorizer().setInputCol("gram-3") 29 | val idf_1gram = new IDF().setInputCol(countVectorizer_1gram.getOutputCol).setOutputCol("tfidf-1gram").setMinDocFreq(10) 30 | val idf_2gram = new IDF().setInputCol(countVectorizer_2gram.getOutputCol).setOutputCol("tfidf-2gram").setMinDocFreq(10) 31 | val idf_3gram = new IDF().setInputCol(countVectorizer_3gram.getOutputCol).setOutputCol("tfidf-3gram").setMinDocFreq(10) 32 | val assembler = new VectorAssembler().setInputCols(Array("tfidf-1gram", "tfidf-2gram", "tfidf-3gram")).setOutputCol("features") 33 | 34 | // 构造特征向量 35 | val pipeline = new Pipeline().setStages(Array(tokenizer, remover, nGram2, nGram3, countVectorizer_1gram, 36 | countVectorizer_2gram, countVectorizer_3gram, idf_1gram, idf_2gram, idf_3gram, assembler)) 37 | pipeline.fit(dataMatrix).write.save(savePath) 38 | val pipelineModelTrained = sparkSession.sparkContext.broadcast(PipelineModel.read.load(savePath)) 39 | pipelineModelTrained.value.transform(dataMatrix) 40 | } 41 | 42 | def featureEngineeringWord2Vec(sparkSession: SparkSession, dataMatrix: DataFrame, savePath: String): DataFrame ={ 43 | // 加载预训练的Word2Vec模型,构造特征向量 44 | val pipelineModelTrained = sparkSession.sparkContext.broadcast(PipelineModel.read.load(savePath)) 45 | pipelineModelTrained.value.transform(dataMatrix) 46 | } 47 | 48 | def crossValidation(xgboostParam: Map[String, Any], labelIndexer: StringIndexerModel, 49 | evaluator: MulticlassClassificationEvaluator, trainingData: DataFrame): TrainValidationSplitModel = { 50 | // XGBoost Pipeline Model 51 | val xgbEstimator = new XGBoostEstimator(xgboostParam).setLabelCol("labelIndex").setFeaturesCol("features").setPredictionCol("prediction") 52 | val labelConverter = new IndexToString().setInputCol("prediction").setOutputCol("predictedLabel").setLabels(labelIndexer.labels) 53 | val pipeline = new Pipeline().setStages(Array(xgbEstimator, labelConverter)) 54 | 55 | // Grid Search + Cross Validation 56 | val paramGrid = new ParamGridBuilder() 57 | .addGrid(xgbEstimator.eta, Array(0.08, 0.1)) 58 | .addGrid(xgbEstimator.round, Array(50, 100)) 59 | .addGrid(xgbEstimator.maxDepth, Array(300, 500)) 60 | .build() 61 | val crossValidator = new TrainValidationSplit() 62 | .setEstimator(pipeline) 63 | .setEvaluator(evaluator) 64 | .setEstimatorParamMaps(paramGrid) 65 | .setTrainRatio(0.9) 66 | crossValidator.fit(trainingData) 67 | } 68 | 69 | def main(args:Array[String]): Unit ={ 70 | // 1、创建Spark程序入口 71 | val sparkSession = SparkSession.builder().appName("XGBoostTrain").enableHiveSupport().getOrCreate() 72 | 73 | // 2、读取训练数据,对文本预处理后分词 74 | val tableName = args(0) 75 | val matrix = sparkSession.sql("SELECT * FROM " + tableName + " where text is not null") 76 | val words = segWords(sparkSession, args(1), args(2), args(3), args(4), matrix.select("text")) 77 | 78 | // 3、将原数据与分词结果关联起来 79 | val rows = matrix.rdd.zip(words.rdd).map{ 80 | case (rowLeft, rowRight) => Row.fromSeq(rowLeft.toSeq ++ rowRight.toSeq) 81 | } 82 | val schema = StructType(matrix.schema.fields ++ words.schema.fields) 83 | val matrixMerge = sparkSession.createDataFrame(rows, schema) 84 | 85 | // 4、构建特征向量 86 | var featuredData = sparkSession.emptyDataFrame 87 | if (args(5).toLowerCase == "tfidf") 88 | featuredData = featureEngineeringTFIDF(sparkSession, matrixMerge, args(6)) 89 | else if (args(5).toLowerCase == "word2vec") 90 | featuredData = featureEngineeringWord2Vec(sparkSession, matrixMerge, args(6)) 91 | else 92 | throw new SparkException("Feature engineering algorithm must be TFIDF or Word2Vec") 93 | 94 | // 5、将label转化为数值 95 | val labelIndexer = new StringIndexer().setInputCol("label").setOutputCol("labelIndex").fit(featuredData) 96 | val dataPrepared = labelIndexer.transform(featuredData).select("text", "features", "label", "labelIndex") 97 | 98 | // 6、按比例划分训练数据和测试数据 99 | val testSize = args(7).toDouble 100 | val splits = dataPrepared.randomSplit(Array(1 - testSize, testSize), seed = 12345L) 101 | val (trainingData, testData) = (splits(0).repartition(18).cache(), splits(1).repartition(18).cache()) 102 | 103 | // 7、定义模型评估方法 104 | val evaluator = new MulticlassClassificationEvaluator() 105 | .setLabelCol("labelIndex") 106 | .setPredictionCol("prediction") 107 | .setMetricName("accuracy") // Spark2.0以前为"precision" 108 | 109 | // 8、设定模型参数,训练XGBoost文本分类模型 110 | val paramMap = new mutable.HashMap[String, Any]() 111 | paramMap += "nworkers" -> 18 112 | paramMap += "use_external_memory" -> false 113 | // paramMap += "eta" -> 0.1f 114 | // paramMap += "num_round" -> 50 115 | // paramMap += "max_depth" -> 300 116 | paramMap += "min_child_weight" -> 3 117 | paramMap += "alpha" -> 0.01 118 | paramMap += "gamma" -> 0 119 | paramMap += "subsample" -> 0.8 120 | paramMap += "colsample_bytree" -> 0.8 121 | paramMap += "scale_pos_weight" -> 1 122 | paramMap += "num_class" -> args(8).toInt 123 | paramMap += "objective" -> "multi:softprob" 124 | paramMap += "numEarlyStoppingRounds" -> 0 125 | paramMap += "trainTestRatio" -> 0.9 126 | paramMap += "booster" -> "dart" 127 | paramMap += "rate_drop" -> 0.1 128 | paramMap += "skip_drop" -> 0.5 129 | paramMap += "seed" -> 12345L 130 | val cvModel = crossValidation(paramMap.toMap, labelIndexer, evaluator, trainingData) 131 | 132 | // 9、分类模型的保存与加载 133 | val bestPipelineModel = cvModel.bestModel.asInstanceOf[PipelineModel] 134 | bestPipelineModel.write.save(args(9)) 135 | val xgbModelTrained = sparkSession.sparkContext.broadcast(PipelineModel.read.load(args(9))) 136 | 137 | // 10、使用训练好的模型对测试集样本进行分类 138 | val prediction = xgbModelTrained.value.transform(testData) 139 | 140 | // 11、评估模型效果 141 | prediction.select("text", "label", "predictedLabel", "probabilities").rdd.coalesce(1).saveAsTextFile(args(10)) 142 | val accuracy = evaluator.evaluate(prediction) 143 | sparkSession.sparkContext.parallelize(List("Accuracy = " + accuracy)).coalesce(1).saveAsTextFile(args(11)) 144 | 145 | // 12、保存模型参数 146 | val stages = xgbModelTrained.value.stages 147 | val modelTrainingStage = stages(0).asInstanceOf[XGBoostClassificationModel] 148 | sparkSession.sparkContext.makeRDD(modelTrainingStage.extractParamMap().toSeq).coalesce(1).saveAsTextFile(args(12)) 149 | 150 | sparkSession.stop() 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /.idea/uiDesigner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /xgbspark-text-classification.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | --------------------------------------------------------------------------------