├── .gitignore ├── .idea ├── .name ├── compiler.xml ├── copyright │ └── profiles_settings.xml ├── encodings.xml ├── misc.xml ├── modules.xml ├── modules │ ├── NaturalLanguageProces.iml │ ├── calssification-build.iml │ ├── calssification.iml │ ├── classification-build.iml │ └── naturallanguageprocessing-build.iml ├── sbt.xml ├── scala_compiler.xml ├── uiDesigner.xml ├── vcs.xml └── workspace.xml ├── README.md ├── build.sbt ├── project ├── build.properties ├── plugins.sbt └── target │ ├── resolution-cache │ ├── default │ │ ├── calssification-build │ │ │ └── scala_2.10 │ │ │ │ └── sbt_0.13 │ │ │ │ └── 0.1-SNAPSHOT │ │ │ │ ├── resolved.xml.properties │ │ │ │ └── resolved.xml.xml │ │ ├── classification-build │ │ │ └── scala_2.10 │ │ │ │ └── sbt_0.13 │ │ │ │ └── 0.1-SNAPSHOT │ │ │ │ ├── resolved.xml.properties │ │ │ │ └── resolved.xml.xml │ │ └── naturallanguageprocessing-build │ │ │ └── scala_2.10 │ │ │ └── sbt_0.13 │ │ │ └── 0.1-SNAPSHOT │ │ │ ├── resolved.xml.properties │ │ │ └── resolved.xml.xml │ └── reports │ │ ├── default-calssification-build-compile-internal.xml │ │ ├── default-calssification-build-compile.xml │ │ ├── default-calssification-build-docs.xml │ │ ├── default-calssification-build-optional.xml │ │ ├── default-calssification-build-plugin.xml │ │ ├── default-calssification-build-pom.xml │ │ ├── default-calssification-build-provided.xml │ │ ├── default-calssification-build-runtime-internal.xml │ │ ├── default-calssification-build-runtime.xml │ │ ├── default-calssification-build-scala-tool.xml │ │ ├── default-calssification-build-sources.xml │ │ ├── default-calssification-build-test-internal.xml │ │ ├── default-calssification-build-test.xml │ │ ├── default-classification-build-compile-internal.xml │ │ ├── default-classification-build-compile.xml │ │ ├── default-classification-build-docs.xml │ │ ├── default-classification-build-optional.xml │ │ ├── default-classification-build-plugin.xml │ │ ├── default-classification-build-pom.xml │ │ ├── default-classification-build-provided.xml │ │ ├── default-classification-build-runtime-internal.xml │ │ ├── default-classification-build-runtime.xml │ │ ├── default-classification-build-scala-tool.xml │ │ ├── default-classification-build-sources.xml │ │ ├── default-classification-build-test-internal.xml │ │ ├── default-classification-build-test.xml │ │ ├── default-naturallanguageprocessing-build-compile-internal.xml │ │ ├── default-naturallanguageprocessing-build-compile.xml │ │ ├── default-naturallanguageprocessing-build-docs.xml │ │ ├── default-naturallanguageprocessing-build-optional.xml │ │ ├── default-naturallanguageprocessing-build-plugin.xml │ │ ├── default-naturallanguageprocessing-build-pom.xml │ │ ├── default-naturallanguageprocessing-build-provided.xml │ │ ├── default-naturallanguageprocessing-build-runtime-internal.xml │ │ ├── default-naturallanguageprocessing-build-runtime.xml │ │ ├── default-naturallanguageprocessing-build-scala-tool.xml │ │ ├── default-naturallanguageprocessing-build-sources.xml │ │ ├── default-naturallanguageprocessing-build-test-internal.xml │ │ ├── default-naturallanguageprocessing-build-test.xml │ │ ├── ivy-report.css │ │ └── ivy-report.xsl │ └── streams │ ├── $global │ ├── $global │ │ └── $global │ │ │ └── streams │ │ │ └── out │ ├── dependencyPositions │ │ └── $global │ │ │ └── streams │ │ │ └── update_cache_2.10 │ │ │ ├── input_dsp │ │ │ └── output_dsp │ ├── ivyConfiguration │ │ └── $global │ │ │ └── streams │ │ │ └── out │ ├── ivySbt │ │ └── $global │ │ │ └── streams │ │ │ └── out │ ├── projectDescriptors │ 
│ └── $global │ │ │ └── streams │ │ │ └── out │ └── update │ │ └── $global │ │ └── streams │ │ ├── out │ │ └── update_cache_2.10 │ │ ├── inputs │ │ └── output │ ├── compile │ ├── $global │ │ └── $global │ │ │ └── discoveredMainClasses │ │ │ └── data │ ├── compile │ │ └── $global │ │ │ └── streams │ │ │ └── out │ ├── compileIncremental │ │ └── $global │ │ │ └── streams │ │ │ ├── export │ │ │ └── out │ ├── copyResources │ │ └── $global │ │ │ └── streams │ │ │ ├── copy-resources │ │ │ └── out │ ├── dependencyClasspath │ │ └── $global │ │ │ └── streams │ │ │ └── export │ ├── exportedProducts │ │ └── $global │ │ │ └── streams │ │ │ └── export │ ├── externalDependencyClasspath │ │ └── $global │ │ │ └── streams │ │ │ └── export │ ├── internalDependencyClasspath │ │ └── $global │ │ │ └── streams │ │ │ └── export │ ├── managedClasspath │ │ └── $global │ │ │ └── streams │ │ │ └── export │ ├── unmanagedClasspath │ │ └── $global │ │ │ └── streams │ │ │ └── export │ └── unmanagedJars │ │ └── $global │ │ └── streams │ │ └── export │ └── runtime │ ├── dependencyClasspath │ └── $global │ │ └── streams │ │ └── export │ ├── exportedProducts │ └── $global │ │ └── streams │ │ └── export │ ├── externalDependencyClasspath │ └── $global │ │ └── streams │ │ └── export │ ├── fullClasspath │ └── $global │ │ └── streams │ │ └── export │ ├── internalDependencyClasspath │ └── $global │ │ └── streams │ │ └── export │ ├── managedClasspath │ └── $global │ │ └── streams │ │ └── export │ ├── unmanagedClasspath │ └── $global │ │ └── streams │ │ └── export │ └── unmanagedJars │ └── $global │ └── streams │ └── export ├── src ├── main │ └── scala │ │ ├── deeplearning │ │ ├── cae │ │ │ └── CAE.scala │ │ ├── cnn │ │ │ ├── CNN.scala │ │ │ └── CNNModel.scala │ │ └── tests │ │ │ └── Test_example_CNN.scala │ │ ├── intactprogram │ │ ├── telecomdataprocessing │ │ │ ├── TelecomDataProcess.scala │ │ │ └── util │ │ │ │ ├── HBaseUtil.scala │ │ │ │ └── LoggerUtil.scala │ │ ├── telecomdataprocessingAll │ │ │ ├── TDP.scala │ │ │ ├── TelecomDataProcess.scala │ │ │ ├── TelecomDataProcessing.scala │ │ │ ├── TelecomDataProcessingByHour.scala │ │ │ ├── readFromHdfs.scala │ │ │ └── util │ │ │ │ ├── HBaseUtil.scala │ │ │ │ ├── HDFSUtil.scala │ │ │ │ ├── LoggerUtil.scala │ │ │ │ └── TimeUtil.scala │ │ └── vipstockstatistic │ │ │ ├── CorpusBuild.scala │ │ │ ├── PredictWithDic.scala │ │ │ ├── VipStockStatistic.scala │ │ │ └── util │ │ │ ├── AnsjAnalyzer.scala │ │ │ ├── HBaseUtil.scala │ │ │ ├── LoggerUtil.scala │ │ │ ├── RedisUtil.scala │ │ │ └── config.xml │ │ ├── meachinelearning │ │ ├── Recommendation │ │ │ └── SparkMLlibColbFilter.scala │ │ ├── classification │ │ │ ├── BinaryClassification.scala │ │ │ ├── BinaryClassificationParaOptimization.scala │ │ │ ├── BinaryClassificationRDDWithPCA.scala │ │ │ ├── BinaryClassificationWithALS.scala │ │ │ ├── BinaryClassificationWithPCA.scala │ │ │ ├── GaussianKernelSVM.scala │ │ │ ├── PCAtest.scala │ │ │ └── TrainingProcessWithPCA.scala │ │ ├── correlationanalysis │ │ │ └── correlationAnalysis.scala │ │ ├── data │ │ │ └── SupportVectorMachineWithGaussianKernel.txt │ │ ├── hotdegreecalculate │ │ │ ├── CommunityFrequencyStatistics.scala │ │ │ ├── HotDegreeCalculate.scala │ │ │ ├── HotDegreeCalculation.scala │ │ │ ├── HotDegreeCalculationRDD.scala │ │ │ └── fileIO.scala │ │ ├── textrank │ │ │ ├── AbstractExtract.scala │ │ │ ├── ConstructTextGraph.scala │ │ │ ├── KeywordExtractor.scala │ │ │ ├── PropertyExtractor.scala │ │ │ └── TextRank.scala │ │ ├── topicmodel │ │ │ ├── LDAModel.scala │ │ │ ├── LDATest.scala │ 
│ │ └── LatentDirichletAllocationExample.scala │ │ └── word2vec │ │ │ ├── ClassifyModel.scala │ │ │ ├── ClassifyPredict.scala │ │ │ ├── DataPrepare.scala │ │ │ ├── DeleteDirectory.scala │ │ │ ├── Word2Vec.scala │ │ │ ├── model │ │ │ ├── data │ │ │ │ ├── .part-r-00000-e1c254b3-21ba-4759-b7eb-b69f39950551.gz.parquet.crc │ │ │ │ ├── _SUCCESS │ │ │ │ ├── _common_metadata │ │ │ │ ├── _metadata │ │ │ │ └── part-r-00000-e1c254b3-21ba-4759-b7eb-b69f39950551.gz.parquet │ │ │ └── metadata │ │ │ │ ├── .part-00000.crc │ │ │ │ ├── _SUCCESS │ │ │ │ └── part-00000 │ │ │ ├── readme.md │ │ │ ├── textVectors.scala │ │ │ └── twc │ │ │ ├── W2VJsonConf.json │ │ │ ├── processing.scala │ │ │ └── training.scala │ │ ├── test │ │ └── regularExpression.scala │ │ ├── util │ │ ├── DataTransform.scala │ │ ├── DirectoryUtil.scala │ │ ├── FileUtil.scala │ │ ├── HBaseUtil.scala │ │ ├── HDFSUtil.scala │ │ ├── JsonUtil.scala │ │ ├── LoggerUtil.scala │ │ ├── MySQLUtil.scala │ │ ├── RedisUtil.scala │ │ ├── TextProcessing.scala │ │ ├── TimeUtil.scala │ │ ├── UrlCategoryTrim.scala │ │ ├── XMLUtil.scala │ │ └── regularExpression.scala │ │ └── wordSegmentation │ │ ├── AnsjAnalyzer.scala │ │ └── wordSegmentAnalyser.scala └── test │ ├── resources │ ├── 2016-07-11-15.txt │ ├── 2016-07-12-13.txt │ ├── 2016-07-12-15.txt │ ├── 2016-07-12-16.txt │ └── text │ │ ├── 1.txt │ │ ├── 2.txt │ │ └── abstract │ └── scala │ ├── CNNTest.scala │ ├── ClassificationTest.scala │ ├── HDFSUtilTest.scala │ ├── HotWordsTest.scala │ ├── JSONUtilTest.scala │ ├── MySQLUtilTest.scala │ ├── Test.scala │ ├── TextRankTest.scala │ ├── classification.scala │ ├── keywordExtractorTest.scala │ ├── telecomDataProcessingTest.scala │ ├── testRankTest.scala │ ├── timeutilTest.scala │ └── word2vecTest.scala └── target ├── .history ├── resolution-cache ├── default │ ├── classification$sbt_2.10 │ │ └── 1.0 │ │ │ ├── resolved.xml.properties │ │ │ └── resolved.xml.xml │ ├── classification$sources_2.10 │ │ └── 1.0 │ │ │ ├── resolved.xml.properties │ │ │ └── resolved.xml.xml │ ├── classification_2.10 │ │ └── 1.0 │ │ │ ├── resolved.xml.properties │ │ │ └── resolved.xml.xml │ ├── naturallanguageprocessing$sbt_2.10 │ │ └── 1.0 │ │ │ ├── resolved.xml.properties │ │ │ └── resolved.xml.xml │ └── naturallanguageprocessing$sources_2.10 │ │ └── 1.0 │ │ ├── resolved.xml.properties │ │ └── resolved.xml.xml ├── meachinelearning-classification │ ├── meachinelearning-classification$sbt_2.10 │ │ └── 1.0 │ │ │ ├── resolved.xml.properties │ │ │ └── resolved.xml.xml │ ├── meachinelearning-classification$sources_2.10 │ │ └── 1.0 │ │ │ ├── resolved.xml.properties │ │ │ └── resolved.xml.xml │ └── meachinelearning-classification_2.10 │ │ └── 1.0 │ │ ├── resolved.xml.properties │ │ └── resolved.xml.xml └── reports │ ├── default-classification$sbt_2.10-default.xml │ ├── default-classification$sources_2.10-compile-internal.xml │ ├── default-classification$sources_2.10-compile.xml │ ├── default-classification$sources_2.10-docs.xml │ ├── default-classification$sources_2.10-optional.xml │ ├── default-classification$sources_2.10-plugin.xml │ ├── default-classification$sources_2.10-pom.xml │ ├── default-classification$sources_2.10-provided.xml │ ├── default-classification$sources_2.10-runtime-internal.xml │ ├── default-classification$sources_2.10-runtime.xml │ ├── default-classification$sources_2.10-scala-tool.xml │ ├── default-classification$sources_2.10-sources.xml │ ├── default-classification$sources_2.10-test-internal.xml │ ├── default-classification$sources_2.10-test.xml │ ├── 
default-classification_2.10-compile-internal.xml │ ├── default-classification_2.10-compile.xml │ ├── default-classification_2.10-docs.xml │ ├── default-classification_2.10-optional.xml │ ├── default-classification_2.10-plugin.xml │ ├── default-classification_2.10-pom.xml │ ├── default-classification_2.10-provided.xml │ ├── default-classification_2.10-runtime-internal.xml │ ├── default-classification_2.10-runtime.xml │ ├── default-classification_2.10-scala-tool.xml │ ├── default-classification_2.10-sources.xml │ ├── default-classification_2.10-test-internal.xml │ ├── default-classification_2.10-test.xml │ ├── default-naturallanguageprocessing$sbt_2.10-default.xml │ ├── ivy-report.css │ ├── ivy-report.xsl │ ├── meachinelearning-classification-meachinelearning-classification$sbt_2.10-default.xml │ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-compile-internal.xml │ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-compile.xml │ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-docs.xml │ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-optional.xml │ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-plugin.xml │ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-pom.xml │ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-provided.xml │ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-runtime-internal.xml │ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-runtime.xml │ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-scala-tool.xml │ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-sources.xml │ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-test-internal.xml │ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-test.xml │ ├── meachinelearning-classification-meachinelearning-classification_2.10-compile-internal.xml │ ├── meachinelearning-classification-meachinelearning-classification_2.10-compile.xml │ ├── meachinelearning-classification-meachinelearning-classification_2.10-docs.xml │ ├── meachinelearning-classification-meachinelearning-classification_2.10-optional.xml │ ├── meachinelearning-classification-meachinelearning-classification_2.10-plugin.xml │ ├── meachinelearning-classification-meachinelearning-classification_2.10-pom.xml │ ├── meachinelearning-classification-meachinelearning-classification_2.10-provided.xml │ ├── meachinelearning-classification-meachinelearning-classification_2.10-runtime-internal.xml │ ├── meachinelearning-classification-meachinelearning-classification_2.10-runtime.xml │ ├── meachinelearning-classification-meachinelearning-classification_2.10-scala-tool.xml │ ├── meachinelearning-classification-meachinelearning-classification_2.10-sources.xml │ ├── meachinelearning-classification-meachinelearning-classification_2.10-test-internal.xml │ └── meachinelearning-classification-meachinelearning-classification_2.10-test.xml ├── scala-2.10 └── test-classes │ └── text │ ├── 1.txt │ ├── 2.txt │ └── abstract └── streams ├── $global ├── $global │ └── dumpStructure │ │ └── $global │ │ └── streams │ │ └── out ├── clean │ └── $global │ │ └── streams │ │ └── out ├── dependencyPositions │ └── $global │ │ └── streams │ │ └── update_cache_2.10 │ │ ├── input_dsp │ │ └── 
output_dsp ├── ivyConfiguration │ └── $global │ │ └── streams │ │ └── out ├── ivySbt │ └── $global │ │ └── streams │ │ └── out ├── projectDescriptors │ └── $global │ │ └── streams │ │ └── out ├── update │ └── $global │ │ └── streams │ │ ├── out │ │ └── update_cache_2.10 │ │ ├── inputs │ │ └── output ├── updateClassifiers │ └── $global │ │ └── streams │ │ └── out └── updateSbtClassifiers │ └── $global │ └── streams │ └── out ├── compile ├── externalDependencyClasspath │ └── $global │ │ └── streams │ │ └── export ├── managedClasspath │ └── $global │ │ └── streams │ │ └── export ├── unmanagedClasspath │ └── $global │ │ └── streams │ │ └── export └── unmanagedJars │ └── $global │ └── streams │ └── export ├── runtime ├── externalDependencyClasspath │ └── $global │ │ └── streams │ │ └── export ├── managedClasspath │ └── $global │ │ └── streams │ │ └── export ├── unmanagedClasspath │ └── $global │ │ └── streams │ │ └── export └── unmanagedJars │ └── $global │ └── streams │ └── export └── test ├── externalDependencyClasspath └── $global │ └── streams │ └── export ├── managedClasspath └── $global │ └── streams │ └── export ├── unmanagedClasspath └── $global │ └── streams │ └── export └── unmanagedJars └── $global └── streams └── export /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | -------------------------------------------------------------------------------- /.idea/.name: -------------------------------------------------------------------------------- 1 | calssification -------------------------------------------------------------------------------- /.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /.idea/copyright/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.idea/modules/NaturalLanguageProces.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /.idea/modules/calssification-build.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/modules/calssification.iml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/sbt.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 26 | 27 | -------------------------------------------------------------------------------- /.idea/scala_compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Natural Language Processing 2 | ## Introduction 3 | 4 | This is a Natural Language Processing package; it includes machine learning utilities and basic NLP utilities. 5 | 6 | ## Machine Learning 7 | 8 | Natural language processing using machine learning algorithms. 9 | 10 | ### TextClassification 11 | 12 | Text classification using naive Bayes, SVMWithSGD, and a Gaussian-kernel SVM. 13 | 14 | #### Bayesian 15 | 16 | #### SVMWithSGD 17 | 18 | #### GaussianKernelSVM 19 | 20 | ### CorrelationAnalysis 21 | 22 | ### HotDegreeCalculate 23 | 24 | Computes the hot degree of keywords using the Bayesian average and Newton's law of cooling. 25 | 26 | ### TextRank 27 | 28 | Based on PageRank. 29 | 30 | ### TopicModel 31 | 32 | Latent Dirichlet Allocation (LDA). 33 | 34 | ## Util 35 | 36 | Preprocessing tools. 37 | 38 | shipment of gold damaged in a fire, shipment of gold damaged in a fire, 39 | delivery of silver arrived in a silver truck 40 | shipment of gold arrived in a truck -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | name := "NaturalLanguageProcessing" 2 | 3 | version := "1.0" 4 | 5 | scalaVersion := "2.10.4" 6 | 7 | // Kunyan word-segmentation (tokenizer) API 8 | resolvers += "Kunyan Repo" at "http://61.147.114.67:8081/nexus/content/groups/public/" 9 | 10 | libraryDependencies += "com.kunyan" % "nlpsuit-package" % "0.2.8.3" 11 | 12 | libraryDependencies += "org.scalactic" %% "scalactic" % "2.2.5" % "test" 13 | 14 | libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.5" % "test" 15 | 16 | libraryDependencies += "org.scala-lang" % "scala-compiler" % "2.10.4" 17 | 18 | libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "2.7.1" excludeAll ExclusionRule(organization = "javax.servlet") 19 | 20 | libraryDependencies += "org.apache.hadoop" % "hadoop-hdfs" % "2.7.1" % "provided" 21 | 22 | libraryDependencies += "org.apache.spark" % "spark-core_2.10" % "1.5.2" 23 | 24 | libraryDependencies += "org.apache.spark" % "spark-mllib_2.10" % "1.5.2" 25 | 26 | libraryDependencies += "mysql" % "mysql-connector-java" % "3.1.14" 27 | 28 | libraryDependencies += "org.graphstream" % "gs-core" % "1.1.2" 29 | 30 | libraryDependencies += "org.apache.spark" % "spark-graphx_2.10" % "1.5.2" 31 | 32 | libraryDependencies += "com.ibm.icu" % "icu4j" % "56.1" 33 | 34 | libraryDependencies += "org.apache.hbase" % "hbase" % "0.98.2-hadoop2" 35 | 36 | libraryDependencies += "org.apache.hbase" % "hbase-client" % "1.1.2" 37 | 38 | libraryDependencies += "org.apache.hbase" % "hbase-common" % "1.1.2" 39 | 40 | libraryDependencies += "org.apache.hbase" % "hbase-server" % "1.1.2" 41 | 42 |
//libraryDependencies += "org.scalanlp" % "breeze_2.10" % "0.11.2" 43 | 44 | libraryDependencies += "org.scalanlp" % "breeze-math_2.10" % "0.4" intransitive() 45 | 46 | //libraryDependencies += "org.scalanlp" % "breeze-learn_2.9.2" % "0.2" intransitive() 47 | 48 | libraryDependencies += "org.scalanlp" % "breeze-process_2.10" % "0.3" intransitive() 49 | 50 | libraryDependencies += "org.scalanlp" % "breeze-viz_2.10" % "0.12" exclude("org.scalanlp", "breeze_2.10") 51 | 52 | libraryDependencies += "org.scalanlp" % "nak_2.10" % "1.3" 53 | 54 | libraryDependencies += "redis.clients" % "jedis" % "2.8.0" 55 | 56 | libraryDependencies += "org.ansj" % "ansj_seg" % "5.0.2" 57 | 58 | libraryDependencies += "org.json" % "json" % "20160212" 59 | 60 | libraryDependencies += "org.nlpcn" % "nlp-lang" % "1.7" 61 | 62 | assemblyMergeStrategy in assembly := { 63 | case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last 64 | case PathList("javax", "activation", xs @ _*) => MergeStrategy.last 65 | case PathList("javax", "el", xs @ _*) => MergeStrategy.last 66 | case PathList("org", "apache", xs @ _*) => MergeStrategy.last 67 | case PathList("com", "google", xs @ _*) => MergeStrategy.last 68 | case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last 69 | case PathList("com", "codahale", xs @ _*) => MergeStrategy.last 70 | case PathList("com", "yammer", xs @ _*) => MergeStrategy.last 71 | case "about.html" => MergeStrategy.rename 72 | case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last 73 | case "META-INF/mailcap" => MergeStrategy.last 74 | case "META-INF/mimetypes.default" => MergeStrategy.last 75 | case "plugin.properties" => MergeStrategy.last 76 | case "log4j.properties" => MergeStrategy.last 77 | case x => 78 | val oldStrategy = (assemblyMergeStrategy in assembly).value 79 | oldStrategy(x) 80 | } 81 | 82 | 83 | test in assembly := {} 84 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version = 0.13.8 -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | logLevel := Level.Warn 2 | 3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.1") -------------------------------------------------------------------------------- /project/target/resolution-cache/default/calssification-build/scala_2.10/sbt_0.13/0.1-SNAPSHOT/resolved.xml.properties: -------------------------------------------------------------------------------- 1 | #default#calssification-build;0.1-SNAPSHOT resolved revisions 2 | #Wed Mar 30 14:23:46 CST 2016 3 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-library\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 ? 
2.10.4 null 4 | +e\:scalaVersion\:\#@\#\:+2.10\:\#@\#\:+revision\:\#@\#\:+0.14.1\:\#@\#\:+module\:\#@\#\:+sbt-assembly\:\#@\#\:+e\:sbtVersion\:\#@\#\:+0.13\:\#@\#\:+organisation\:\#@\#\:+com.eed3si9n\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.14.1 release 0.14.1 null 5 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-compiler\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 release 2.10.4 null 6 | +revision\:\#@\#\:+0.13.8\:\#@\#\:+module\:\#@\#\:+sbt\:\#@\#\:+organisation\:\#@\#\:+org.scala-sbt\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.13.8 release 0.13.8 null 7 | -------------------------------------------------------------------------------- /project/target/resolution-cache/default/calssification-build/scala_2.10/sbt_0.13/0.1-SNAPSHOT/resolved.xml.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 10 | 11 | calssification-build 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /project/target/resolution-cache/default/classification-build/scala_2.10/sbt_0.13/0.1-SNAPSHOT/resolved.xml.properties: -------------------------------------------------------------------------------- 1 | #default#classification-build;0.1-SNAPSHOT resolved revisions 2 | #Tue Apr 12 10:12:42 CST 2016 3 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-library\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 ? 2.10.4 null 4 | +e\:scalaVersion\:\#@\#\:+2.10\:\#@\#\:+revision\:\#@\#\:+0.14.1\:\#@\#\:+module\:\#@\#\:+sbt-assembly\:\#@\#\:+e\:sbtVersion\:\#@\#\:+0.13\:\#@\#\:+organisation\:\#@\#\:+com.eed3si9n\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.14.1 release 0.14.1 null 5 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-compiler\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 release 2.10.4 null 6 | +revision\:\#@\#\:+0.13.8\:\#@\#\:+module\:\#@\#\:+sbt\:\#@\#\:+organisation\:\#@\#\:+org.scala-sbt\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.13.8 release 0.13.8 null 7 | -------------------------------------------------------------------------------- /project/target/resolution-cache/default/classification-build/scala_2.10/sbt_0.13/0.1-SNAPSHOT/resolved.xml.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 10 | 11 | classification-build 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /project/target/resolution-cache/default/naturallanguageprocessing-build/scala_2.10/sbt_0.13/0.1-SNAPSHOT/resolved.xml.properties: -------------------------------------------------------------------------------- 1 | #default#naturallanguageprocessing-build;0.1-SNAPSHOT resolved revisions 2 | #Wed Oct 12 10:38:54 CST 2016 3 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-library\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 ? 
2.10.4 null 4 | +e\:scalaVersion\:\#@\#\:+2.10\:\#@\#\:+revision\:\#@\#\:+0.14.1\:\#@\#\:+module\:\#@\#\:+sbt-assembly\:\#@\#\:+e\:sbtVersion\:\#@\#\:+0.13\:\#@\#\:+organisation\:\#@\#\:+com.eed3si9n\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.14.1 release 0.14.1 null 5 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-compiler\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 release 2.10.4 null 6 | +revision\:\#@\#\:+0.13.8\:\#@\#\:+module\:\#@\#\:+sbt\:\#@\#\:+organisation\:\#@\#\:+org.scala-sbt\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.13.8 release 0.13.8 null 7 | -------------------------------------------------------------------------------- /project/target/resolution-cache/default/naturallanguageprocessing-build/scala_2.10/sbt_0.13/0.1-SNAPSHOT/resolved.xml.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 10 | 11 | naturallanguageprocessing-build 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /project/target/resolution-cache/reports/default-calssification-build-docs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/target/resolution-cache/reports/default-calssification-build-optional.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/target/resolution-cache/reports/default-calssification-build-plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/target/resolution-cache/reports/default-calssification-build-pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/target/resolution-cache/reports/default-calssification-build-sources.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/target/resolution-cache/reports/default-classification-build-docs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/target/resolution-cache/reports/default-classification-build-optional.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/target/resolution-cache/reports/default-classification-build-plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- 
/project/target/resolution-cache/reports/default-classification-build-pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/target/resolution-cache/reports/default-classification-build-sources.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/target/resolution-cache/reports/default-naturallanguageprocessing-build-docs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/target/resolution-cache/reports/default-naturallanguageprocessing-build-optional.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/target/resolution-cache/reports/default-naturallanguageprocessing-build-plugin.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/target/resolution-cache/reports/default-naturallanguageprocessing-build-pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/target/resolution-cache/reports/default-naturallanguageprocessing-build-sources.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /project/target/streams/$global/$global/$global/streams/out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/$global/$global/$global/streams/out -------------------------------------------------------------------------------- /project/target/streams/$global/dependencyPositions/$global/streams/update_cache_2.10/input_dsp: -------------------------------------------------------------------------------- 1 | org.scala-lang scala-library2.10.4provided com.eed3si9n sbt-assembly0.14.1 e:sbtVersion0.13e:scalaVersion2.10 -------------------------------------------------------------------------------- /project/target/streams/$global/dependencyPositions/$global/streams/update_cache_2.10/output_dsp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/$global/dependencyPositions/$global/streams/update_cache_2.10/output_dsp -------------------------------------------------------------------------------- /project/target/streams/$global/ivyConfiguration/$global/streams/out: -------------------------------------------------------------------------------- 1 | [debug] Other repositories: 2 | [debug] Default repositories: 3 | [debug] Using inline dependencies specified in Scala. 
4 | -------------------------------------------------------------------------------- /project/target/streams/$global/ivySbt/$global/streams/out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/$global/ivySbt/$global/streams/out -------------------------------------------------------------------------------- /project/target/streams/$global/projectDescriptors/$global/streams/out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/$global/projectDescriptors/$global/streams/out -------------------------------------------------------------------------------- /project/target/streams/$global/update/$global/streams/update_cache_2.10/inputs: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/$global/update/$global/streams/update_cache_2.10/inputs -------------------------------------------------------------------------------- /project/target/streams/$global/update/$global/streams/update_cache_2.10/output: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/$global/update/$global/streams/update_cache_2.10/output -------------------------------------------------------------------------------- /project/target/streams/compile/$global/$global/discoveredMainClasses/data: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /project/target/streams/compile/compile/$global/streams/out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/compile/compile/$global/streams/out -------------------------------------------------------------------------------- /project/target/streams/compile/compileIncremental/$global/streams/export: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/compile/compileIncremental/$global/streams/export -------------------------------------------------------------------------------- /project/target/streams/compile/compileIncremental/$global/streams/out: -------------------------------------------------------------------------------- 1 | [debug] 2 | [debug] Initial source changes: 3 | [debug] removed:Set() 4 | [debug] added: Set() 5 | [debug] modified: Set() 6 | [debug] Removed products: Set() 7 | [debug] External API changes: API Changes: Set() 8 | [debug] Modified binary dependencies: Set() 9 | [debug] Initial directly invalidated sources: Set() 10 | [debug] 11 | [debug] Sources indirectly invalidated by: 12 | [debug] product: Set() 13 | [debug] binary dep: Set() 14 | [debug] external source: Set() 15 | [debug] All initially invalidated sources: Set() 16 | 
-------------------------------------------------------------------------------- /project/target/streams/compile/copyResources/$global/streams/copy-resources: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/compile/copyResources/$global/streams/copy-resources -------------------------------------------------------------------------------- /project/target/streams/compile/copyResources/$global/streams/out: -------------------------------------------------------------------------------- 1 | [debug] Copy resource mappings: 2 | [debug] 3 | -------------------------------------------------------------------------------- /project/target/streams/compile/exportedProducts/$global/streams/export: -------------------------------------------------------------------------------- 1 | /Users/li/workshop/NaturalLanguageProcessing/project/target/scala-2.10/sbt-0.13/classes 2 | -------------------------------------------------------------------------------- /project/target/streams/compile/internalDependencyClasspath/$global/streams/export: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /project/target/streams/compile/unmanagedClasspath/$global/streams/export: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /project/target/streams/compile/unmanagedJars/$global/streams/export: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /project/target/streams/runtime/dependencyClasspath/$global/streams/export: -------------------------------------------------------------------------------- 1 | 
/Users/li/workshop/NaturalLanguageProcessing/project/target/scala-2.10/sbt-0.13/classes:/Users/li/.ivy2/cache/scala_2.10/sbt_0.13/com.eed3si9n/sbt-assembly/jars/sbt-assembly-0.14.1.jar:/Users/li/.ivy2/cache/org.scalactic/scalactic_2.10/bundles/scalactic_2.10-2.2.1.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-library.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-reflect.jar:/Users/li/.ivy2/cache/org.pantsbuild/jarjar/jars/jarjar-1.6.0.jar:/Users/li/.ivy2/cache/org.apache.ant/ant/jars/ant-1.9.6.jar:/Users/li/.ivy2/cache/org.apache.ant/ant-launcher/jars/ant-launcher-1.9.6.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm/jars/asm-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-commons/jars/asm-commons-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-tree/jars/asm-tree-5.0.4.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-plugin-api/jars/maven-plugin-api-3.3.3.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-model/jars/maven-model-3.3.3.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-utils/jars/plexus-utils-3.0.20.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-artifact/jars/maven-artifact-3.3.3.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.plexus/eclipse-plugins/org.eclipse.sisu.plexus-0.3.0.jar:/Users/li/.ivy2/cache/javax.enterprise/cdi-api/jars/cdi-api-1.0.jar:/Users/li/.ivy2/cache/javax.annotation/jsr250-api/jars/jsr250-api-1.0.jar:/Users/li/.ivy2/cache/javax.inject/javax.inject/jars/javax.inject-1.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.inject/eclipse-plugins/org.eclipse.sisu.inject-0.3.0.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-component-annotations/jars/plexus-component-annotations-1.5.5.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-classworlds/bundles/plexus-classworlds-2.5.2.jar 2 | -------------------------------------------------------------------------------- /project/target/streams/runtime/exportedProducts/$global/streams/export: -------------------------------------------------------------------------------- 1 | /Users/li/workshop/NaturalLanguageProcessing/project/target/scala-2.10/sbt-0.13/classes 2 | -------------------------------------------------------------------------------- /project/target/streams/runtime/externalDependencyClasspath/$global/streams/export: -------------------------------------------------------------------------------- 1 | 
/Users/li/.ivy2/cache/scala_2.10/sbt_0.13/com.eed3si9n/sbt-assembly/jars/sbt-assembly-0.14.1.jar:/Users/li/.ivy2/cache/org.scalactic/scalactic_2.10/bundles/scalactic_2.10-2.2.1.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-library.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-reflect.jar:/Users/li/.ivy2/cache/org.pantsbuild/jarjar/jars/jarjar-1.6.0.jar:/Users/li/.ivy2/cache/org.apache.ant/ant/jars/ant-1.9.6.jar:/Users/li/.ivy2/cache/org.apache.ant/ant-launcher/jars/ant-launcher-1.9.6.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm/jars/asm-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-commons/jars/asm-commons-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-tree/jars/asm-tree-5.0.4.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-plugin-api/jars/maven-plugin-api-3.3.3.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-model/jars/maven-model-3.3.3.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-utils/jars/plexus-utils-3.0.20.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-artifact/jars/maven-artifact-3.3.3.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.plexus/eclipse-plugins/org.eclipse.sisu.plexus-0.3.0.jar:/Users/li/.ivy2/cache/javax.enterprise/cdi-api/jars/cdi-api-1.0.jar:/Users/li/.ivy2/cache/javax.annotation/jsr250-api/jars/jsr250-api-1.0.jar:/Users/li/.ivy2/cache/javax.inject/javax.inject/jars/javax.inject-1.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.inject/eclipse-plugins/org.eclipse.sisu.inject-0.3.0.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-component-annotations/jars/plexus-component-annotations-1.5.5.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-classworlds/bundles/plexus-classworlds-2.5.2.jar 2 | -------------------------------------------------------------------------------- /project/target/streams/runtime/fullClasspath/$global/streams/export: -------------------------------------------------------------------------------- 1 | 
/Users/li/workshop/NaturalLanguageProcessing/project/target/scala-2.10/sbt-0.13/classes:/Users/li/.ivy2/cache/scala_2.10/sbt_0.13/com.eed3si9n/sbt-assembly/jars/sbt-assembly-0.14.1.jar:/Users/li/.ivy2/cache/org.scalactic/scalactic_2.10/bundles/scalactic_2.10-2.2.1.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-library.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-reflect.jar:/Users/li/.ivy2/cache/org.pantsbuild/jarjar/jars/jarjar-1.6.0.jar:/Users/li/.ivy2/cache/org.apache.ant/ant/jars/ant-1.9.6.jar:/Users/li/.ivy2/cache/org.apache.ant/ant-launcher/jars/ant-launcher-1.9.6.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm/jars/asm-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-commons/jars/asm-commons-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-tree/jars/asm-tree-5.0.4.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-plugin-api/jars/maven-plugin-api-3.3.3.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-model/jars/maven-model-3.3.3.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-utils/jars/plexus-utils-3.0.20.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-artifact/jars/maven-artifact-3.3.3.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.plexus/eclipse-plugins/org.eclipse.sisu.plexus-0.3.0.jar:/Users/li/.ivy2/cache/javax.enterprise/cdi-api/jars/cdi-api-1.0.jar:/Users/li/.ivy2/cache/javax.annotation/jsr250-api/jars/jsr250-api-1.0.jar:/Users/li/.ivy2/cache/javax.inject/javax.inject/jars/javax.inject-1.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.inject/eclipse-plugins/org.eclipse.sisu.inject-0.3.0.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-component-annotations/jars/plexus-component-annotations-1.5.5.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-classworlds/bundles/plexus-classworlds-2.5.2.jar 2 | -------------------------------------------------------------------------------- /project/target/streams/runtime/internalDependencyClasspath/$global/streams/export: -------------------------------------------------------------------------------- 1 | /Users/li/workshop/NaturalLanguageProcessing/project/target/scala-2.10/sbt-0.13/classes 2 | -------------------------------------------------------------------------------- /project/target/streams/runtime/managedClasspath/$global/streams/export: -------------------------------------------------------------------------------- 1 | 
/Users/li/.ivy2/cache/scala_2.10/sbt_0.13/com.eed3si9n/sbt-assembly/jars/sbt-assembly-0.14.1.jar:/Users/li/.ivy2/cache/org.scalactic/scalactic_2.10/bundles/scalactic_2.10-2.2.1.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-library.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-reflect.jar:/Users/li/.ivy2/cache/org.pantsbuild/jarjar/jars/jarjar-1.6.0.jar:/Users/li/.ivy2/cache/org.apache.ant/ant/jars/ant-1.9.6.jar:/Users/li/.ivy2/cache/org.apache.ant/ant-launcher/jars/ant-launcher-1.9.6.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm/jars/asm-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-commons/jars/asm-commons-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-tree/jars/asm-tree-5.0.4.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-plugin-api/jars/maven-plugin-api-3.3.3.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-model/jars/maven-model-3.3.3.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-utils/jars/plexus-utils-3.0.20.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-artifact/jars/maven-artifact-3.3.3.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.plexus/eclipse-plugins/org.eclipse.sisu.plexus-0.3.0.jar:/Users/li/.ivy2/cache/javax.enterprise/cdi-api/jars/cdi-api-1.0.jar:/Users/li/.ivy2/cache/javax.annotation/jsr250-api/jars/jsr250-api-1.0.jar:/Users/li/.ivy2/cache/javax.inject/javax.inject/jars/javax.inject-1.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.inject/eclipse-plugins/org.eclipse.sisu.inject-0.3.0.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-component-annotations/jars/plexus-component-annotations-1.5.5.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-classworlds/bundles/plexus-classworlds-2.5.2.jar 2 | -------------------------------------------------------------------------------- /project/target/streams/runtime/unmanagedClasspath/$global/streams/export: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /project/target/streams/runtime/unmanagedJars/$global/streams/export: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/main/scala/deeplearning/cae/CAE.scala: -------------------------------------------------------------------------------- 1 | package deeplearning.cae 2 | 3 | /** 4 | * Created by li on 16/8/15. 
5 | */ 6 | object CAE { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /src/main/scala/deeplearning/cnn/CNNModel.scala: -------------------------------------------------------------------------------- 1 | package deeplearning.cnn 2 | 3 | import breeze.linalg.{CSCMatrix => BSM, DenseMatrix => BDM, DenseVector => BDV, Matrix => BM, SparseVector => BSV, Vector => BV} 4 | import org.apache.spark.rdd.RDD 5 | 6 | /** 7 | * label: target (label) matrix 8 | * features: feature matrix 9 | * predict_label: prediction matrix 10 | * error: error matrix 11 | */ 12 | case class PredictCNNLabel(label: BDM[Double], features: BDM[Double], predict_label: BDM[Double], error: BDM[Double]) extends Serializable 13 | 14 | class CNNModel( 15 | val cnn_layers: Array[CNNLayers], 16 | val cnn_ffW: BDM[Double], 17 | val cnn_ffb: BDM[Double]) extends Serializable { 18 | 19 | /** 20 | * Returns the prediction results. 21 | * Output format: (label, feature, predict_label, error) 22 | */ 23 | def predict(dataMatrix: RDD[(BDM[Double], BDM[Double])]): RDD[PredictCNNLabel] = { 24 | val sc = dataMatrix.sparkContext 25 | val bc_cnn_layers = sc.broadcast(cnn_layers) 26 | val bc_cnn_ffW = sc.broadcast(cnn_ffW) 27 | val bc_cnn_ffb = sc.broadcast(cnn_ffb) 28 | // CNNff performs the feed-forward (forward propagation) pass 29 | val train_cnnff = CNN.CNNff(dataMatrix, bc_cnn_layers, bc_cnn_ffb, bc_cnn_ffW) 30 | val rdd_predict = train_cnnff.map { f => 31 | val label = f._1 32 | val nna1 = f._2(0)(0) 33 | val nnan = f._4 34 | val error = f._4 - f._1 35 | PredictCNNLabel(label, nna1, nnan, error) 36 | } 37 | rdd_predict 38 | } 39 | 40 | /** 41 | * Computes the output error 42 | * (mean error). 43 | */ 44 | def Loss(predict: RDD[PredictCNNLabel]): Double = { 45 | val predict1 = predict.map(f => f.error) 46 | // error and loss 47 | // output error computation 48 | val loss1 = predict1 49 | val (loss2, counte) = loss1.treeAggregate((0.0, 0L))( 50 | seqOp = (c, v) => { 51 | // c: (e, count), v: (m) 52 | val e1 = c._1 53 | val e2 = (v :* v).sum 54 | val esum = e1 + e2 55 | (esum, c._2 + 1) 56 | }, 57 | combOp = (c1, c2) => { 58 | // c: (e, count) 59 | val e1 = c1._1 60 | val e2 = c2._1 61 | val esum = e1 + e2 62 | (esum, c1._2 + c2._2) 63 | }) 64 | val Loss = (loss2 / counte.toDouble) * 0.5 65 | Loss 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /src/main/scala/deeplearning/tests/Test_example_CNN.scala: -------------------------------------------------------------------------------- 1 | package tests 2 | 3 | import breeze.linalg.{CSCMatrix => BSM, DenseMatrix => BDM, DenseVector => BDV, Matrix => BM, SparseVector => BSV, Vector => BV, axpy => brzAxpy, max => Bmax, min => Bmin, sum => Bsum, svd => brzSvd} 4 | import deeplearning.cnn.CNN 5 | import org.apache.log4j.{Level, Logger} 6 | import org.apache.spark.{SparkConf, SparkContext} 7 | 8 | object Test_example_CNN { 9 | 10 | def main(args: Array[String]) { 11 | //1 Create the Spark context 12 | val conf = new SparkConf().setAppName("CNNtest").setMaster("local") 13 | val sc = new SparkContext(conf) 14 | 15 | //2 Test data 16 | Logger.getRootLogger.setLevel(Level.WARN) 17 | val data_path = "/Users/li/workshop/DataSet/deeplearning/train_d3.txt" 18 | val examples = sc.textFile(data_path).cache() 19 | val train_d1 = examples.map { line => 20 | val f1 = line.split("\t") 21 | val f = f1.map(f => f.toDouble) 22 | val y = f.slice(0, 4) 23 | val x = f.slice(4, f.length) 24 | (new BDM(1, y.length, y), new BDM(1, x.length, x)) 25 | } 26 | 27 | val train_d = train_d1.map(f => (f._1, f._2)) 28 | 29 | 30 | //3 Set the training parameters and build the model 31 | // opts: iteration step size, number of iterations, cross-validation ratio 32 | val opts = Array(50.0, 1.0, 0.0) 33 |
train_d.cache 34 | val numExamples = train_d.count() 35 | println(s"numExamples = $numExamples.") 36 | 37 | val CNNmodel = new CNN() 38 | .setMapsize(new BDM(1, 2, Array(28.0, 28.0))) 39 | .setTypes(Array("i", "c", "s", "c", "s")) 40 | .setLayer(5) 41 | .setOnum(10) 42 | .setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)) 43 | .setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)) 44 | .setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)) 45 | .setAlpha(1.0) 46 | .CNNtrain(train_d, opts) 47 | 48 | //4 Test the model 49 | val CNNforecast = CNNmodel.predict(train_d) 50 | val CNNerror = CNNmodel.Loss(CNNforecast) 51 | println(s"CNNerror = $CNNerror.") 52 | val printf1 = CNNforecast.map(f => (f.label.data, f.predict_label.data)).take(200) 53 | println("Predicted values") 54 | for (i <- 0 until printf1.length) { 55 | val outi = printf1(i)._2.mkString("\t") 56 | println(outi) 57 | } 58 | 59 | } 60 | } -------------------------------------------------------------------------------- /src/main/scala/intactprogram/telecomdataprocessing/util/HBaseUtil.scala: -------------------------------------------------------------------------------- 1 | package telecomdataprocessing.util 2 | 3 | import com.ibm.icu.text.CharsetDetector 4 | import org.apache.hadoop.conf.Configuration 5 | import org.apache.hadoop.hbase.HBaseConfiguration 6 | import org.apache.hadoop.hbase.client.Result 7 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 8 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 9 | import org.apache.spark.SparkContext 10 | import org.apache.spark.rdd.RDD 11 | 12 | import scala.xml.{Elem, XML} 13 | 14 | /** 15 | * Created by li on 16/7/7. 16 | */ 17 | object HBaseUtil { 18 | 19 | /** 20 | * Detects the character encoding of a byte array. 21 | * 22 | * @param html raw page bytes 23 | * @return name of the detected character set 24 | */ 25 | def judgeChaser(html: Array[Byte]): String = { 26 | 27 | val icu4j = new CharsetDetector() 28 | icu4j.setText(html) 29 | val encoding = icu4j.detect() 30 | 31 | encoding.getName 32 | } 33 | 34 | /** 35 | * Loads the XML-format configuration file. 36 | * 37 | * @param dir path of the configuration file 38 | * @return the parsed XML element 39 | * @author Li Yu 40 | * @note rowNum: 2 41 | */ 42 | def readConfigFile(dir: String): Elem = { 43 | 44 | val configFile = XML.loadFile(dir) 45 | 46 | configFile 47 | } 48 | 49 | /** 50 | * Reads the HBase settings from the config file and initializes the HBase configuration. 51 | * 52 | * @param configFile HBase configuration file (parsed XML) 53 | * @return the initialized Configuration 54 | * @author Li Yu 55 | * @note rowNum: 7 56 | */ 57 | def setHBaseConfigure(configFile: Elem): Configuration = { 58 | 59 | val rootDir = (configFile \ "hbase" \ "rootDir").text 60 | val ip = (configFile \ "hbase" \ "ip").text 61 | 62 | // Initialize the configuration 63 | val configuration = HBaseConfiguration.create() 64 | configuration.set("hbase.rootdir", rootDir) 65 | configuration.set("hbase.zookeeper.quorum", ip) 66 | 67 | configuration 68 | } 69 | 70 | /** 71 | * Reads the contents of an HBase table. 72 | * 73 | * @param sc SparkContext 74 | * @param confDir directory containing the configuration file 75 | * @author Li Yu 76 | * @note rowNum: 7 77 | */ 78 | def getHBaseConf(sc: SparkContext, confDir: String, tableName: String) : RDD[(ImmutableBytesWritable, Result)] = { 79 | 80 | val configFile = HBaseUtil.readConfigFile(confDir) 81 | val configuration = HBaseUtil.setHBaseConfigure(configFile) 82 | 83 | configuration.set(TableInputFormat.INPUT_TABLE, tableName) 84 | 85 | // Create an RDD via the Hadoop API 86 | val hBaseRDD = sc.newAPIHadoopRDD(configuration, 87 | classOf[TableInputFormat], 88 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], 89 | classOf[org.apache.hadoop.hbase.client.Result]) 90 | 91 | hBaseRDD 92 | } 93 | 94 | } 95 |
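For reference, a minimal usage sketch of the HBaseUtil helpers above (not part of the repository): the config-file path, the table name, the column family/qualifier ("basic"/"content"), and the object name are illustrative assumptions, not values taken from this project.

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
import telecomdataprocessing.util.HBaseUtil

object HBaseUtilUsageSketch {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("HBaseUtilUsageSketch").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // Hypothetical config.xml containing <hbase><rootDir>...</rootDir><ip>...</ip></hbase>.
    val confDir = "/path/to/config.xml"
    // Hypothetical table name.
    val tableName = "news_table"

    // Read the table into an RDD of (ImmutableBytesWritable, Result) pairs.
    val hBaseRDD = HBaseUtil.getHBaseConf(sc, confDir, tableName)

    // Decode one column per row, using judgeChaser to detect the character set first.
    val rows = hBaseRDD.map { case (_, result) =>
      val rowKey = Bytes.toString(result.getRow)
      val raw = result.getValue(Bytes.toBytes("basic"), Bytes.toBytes("content")) // assumed family/qualifier
      val content = if (raw == null) "" else new String(raw, HBaseUtil.judgeChaser(raw))
      (rowKey, content)
    }

    rows.take(10).foreach(println)
    sc.stop()
  }
}

A consumer of this sketch would need the Spark, HBase, and Hadoop client dependencies already declared in build.sbt on its classpath.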
-------------------------------------------------------------------------------- /src/main/scala/intactprogram/telecomdataprocessing/util/LoggerUtil.scala: -------------------------------------------------------------------------------- 1 | package telecomdataprocessing.util 2 | 3 | import org.apache.log4j.{BasicConfigurator, Logger} 4 | 5 | /** 6 | * 写Log操作 7 | */ 8 | object LoggerUtil { 9 | 10 | var logger = Logger.getLogger("TelecomData_Processing") 11 | BasicConfigurator.configure() 12 | // PropertyConfigurator.configure("/home/mlearning/tdt/conf/log4j.properties") 13 | 14 | def exception(e: Exception) = { 15 | 16 | logger.error(e.printStackTrace()) 17 | 18 | } 19 | 20 | def error(msg: String): Unit = { 21 | 22 | logger.error(msg) 23 | } 24 | 25 | def warn(msg: String): Unit = { 26 | 27 | logger.warn(msg) 28 | } 29 | 30 | def info(msg: String): Unit = { 31 | 32 | logger.info(msg) 33 | } 34 | 35 | def debug(msg: String): Unit = { 36 | 37 | logger.debug(msg) 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/intactprogram/telecomdataprocessingAll/TelecomDataProcess.scala: -------------------------------------------------------------------------------- 1 | //package com.kunyan.dxdataprocess 2 | // 3 | //import java.text.SimpleDateFormat 4 | // 5 | //import org.apache.spark.{SparkConf, SparkContext} 6 | //import util.HBaseUtil 7 | // 8 | //import scala.collection.mutable.ArrayBuffer 9 | // 10 | ///** 11 | // * Created by QQ on 7/25/16. 12 | // */ 13 | //object TelecomDataProcess { 14 | // 15 | // def getDayTimeStamp(startDay: String): Long = { 16 | // 17 | // val sdf = new SimpleDateFormat("yyyy-MM-dd") 18 | // val dayStamp = sdf.parse(startDay).getTime 19 | // 20 | // dayStamp 21 | // } 22 | // 23 | // /** 24 | // * 给定时间范围,根据时间窗口长度,获取若干组时间窗口 25 | // * 26 | // * @param startTime 起始时间戳 27 | // * @param endTime 结束时间戳 28 | // * @param timeRange 事件窗口 29 | // * @return Array[(Long, Long)] 30 | // * @note rowNum:11 31 | // */ 32 | // def makeHourTimeWindows(startTime: Long, endTime: Long, timeRange: Int): Array[(Long, Long)] = { 33 | // 34 | // var count = startTime 35 | // val dayWindows = ArrayBuffer[(Long, Long)]() 36 | // 37 | // do { 38 | // 39 | // // (start, start + timeRange - 1) 40 | // dayWindows.append((count, count + 60L * 60 * 1000 * timeRange - 1)) 41 | // count += 60L * 60 * 1000 42 | // 43 | // } while (count < endTime) 44 | // 45 | // dayWindows.toArray 46 | // } 47 | // 48 | // def judgeTimeWindow(time: Long, timeWindow: Array[(Long, Long)]): (Long, Long) = { 49 | // 50 | // timeWindow.foreach(line => { 51 | // if (time >= line._1 && time <= line._2){ 52 | // return line 53 | // } 54 | // }) 55 | // 56 | // (-1L, -1L) 57 | // } 58 | // 59 | // def urlFormat(url: String): String = { 60 | // 61 | // val temp = url.split("://") 62 | // 63 | // temp.length match { 64 | // case 1 => temp(0).replaceAll("wwww", "") 65 | // case 2 => temp(1).replaceAll("wwww", "") 66 | // } 67 | // } 68 | // 69 | // def main(args: Array[String]) { 70 | // 71 | // val conf = new SparkConf() 72 | // .setAppName(s"Warren_TelecomData_Processing_${args(0)}") 73 | // .set("dfs.replication", "1") 74 | // // .setMaster("local") 75 | // // .set("spark.driver.host","192.168.2.90") 76 | // val sc = new SparkContext(conf) 77 | // 78 | // val jsonConfig = new JsonConfig 79 | // jsonConfig.initConfig(args(1)) 80 | // 81 | // val hbaseConfig = HBaseUtil.getHbaseConf(jsonConfig.getValue("hbase", "rootDir"), 82 | // jsonConfig.getValue("hbase", 
"ips")) 83 | // 84 | // val startDayTimeStamp = getDayTimeStamp(args(0)) 85 | // val endDayTimeStamp = startDayTimeStamp + 24L * 60 * 60 * 1000 86 | // 87 | // // 获取时间窗口 88 | // val timeRanges = sc.broadcast(makeHourTimeWindows(startDayTimeStamp, endDayTimeStamp, 1)) 89 | // 90 | // // 获取电信数据 91 | // val teleData = sc.textFile(jsonConfig.getValue("tp", "telecomDataPath") + s"/${args(0)}}", 92 | // jsonConfig.getValue("tp", "partition").toInt) 93 | // 94 | // // 获取所有需要匹配的,并广播 95 | // val urlsBr = sc.broadcast(HBaseUtil.getRDD(sc, hbaseConfig).map(x => urlFormat(x.split("\n\t")(0))).collect()) // 这一步需要对从其他地方获取到新闻url做一些处理,例如去掉www和http 96 | // 97 | // // 分组计算 98 | // teleData.map(row => { 99 | // val tmp = row.split("\t") 100 | // val url = urlFormat(tmp(3) + tmp(4)) 101 | // val time = tmp(0) 102 | // 103 | // (url, time) 104 | // }).filter(x => urlsBr.value.contains(x._1)).map(row => { 105 | // 106 | // val timeWindow = judgeTimeWindow(row._2.toLong, timeRanges.value) 107 | // 108 | // ((timeWindow._1, timeWindow._2, row._1), 1L) 109 | // }).reduceByKey(_ + _).saveAsTextFile(jsonConfig.getValue("tp", "outputPath") + s"/${args(0)}") 110 | // } 111 | //} 112 | -------------------------------------------------------------------------------- /src/main/scala/intactprogram/telecomdataprocessingAll/readFromHdfs.scala: -------------------------------------------------------------------------------- 1 | //package telecomdataprocessingAll 2 | // 3 | //import org.apache.spark.{SparkConf, SparkContext} 4 | //import util.LoggerUtil 5 | // 6 | ///** 7 | // * Created by li on 16/7/27. 8 | // */ 9 | //object readFromHdfs { 10 | // 11 | // def main(args: Array[String]) { 12 | // 13 | // val conf = new SparkConf().setAppName("Warren_ReadFrom_Hdfs_filter") 14 | // 15 | // val sc = new SparkContext(conf) 16 | // 17 | // val hdfsDir = args(0) 18 | //// val hdfsDir = "hdfs://222.73.57.12:9000/telecom/shdx/origin/data/" 19 | // 20 | // val setTime = args(1) 21 | //// val setTime = "2016-07-23" 22 | // 23 | // 24 | // val time = System.currentTimeMillis() 25 | // 26 | // LoggerUtil.warn("time2Start:" +"%s".format(time)+ " 》》》》》》》》》》》》") 27 | // // 数据获取开始和截止时间 28 | // val stopTimeStamp = TDP.getDayTimeStamp(setTime) 29 | // val startTimeStamp = stopTimeStamp - 24 * 60 * 60 * 1000 30 | // val timeRanges = sc.broadcast(TDP.makeHourTimeWindows(startTimeStamp, stopTimeStamp -1, 1)) 31 | // 32 | // // 23个新闻网站的host域名 33 | // val urlUnion = Array("yicai.com", "21cn.com", "d.weibo.com","xueqiu.com","10jqka.com.cn","gw.com.cn", 34 | // "eastmoney.com","p5w.net","stockstar.com","hexun.com","caijing.com.cn","jrj.com.cn","cfi.net.cn","cs.com.cn", 35 | // "cnstock.com", "stcn.com","news.cn","finance.ifeng.com","finance.sina.com.cn","business.sohu.com","money.163.com", 36 | // "wallstreetcn.com","finance.qq.com","moer.jiemian.com","www.szse.cn","weixin.sogou.com","sse.com.cn","zqyjbg.com") 37 | // 38 | // val dataFromHDFS2 = sc.textFile(hdfsDir + setTime + "/*") 39 | // .filter(! _.contains("home/telecom")) 40 | // .filter(! 
_.contains("youchaojiang")) 41 | // .map(_.split("\t")) 42 | // .filter(_.length == 8) 43 | // .filter(x => urlUnion.contains(TDP.urlFormat(x(3)))) 44 | // .map(x => (TDP.urlFormat(x(3) + x(4)), x(0))) 45 | // 46 | // val result = dataFromHDFS2.map(row => { 47 | // 48 | // val timeWindow = TDP.judgeTimeWindow(row._2.toLong, timeRanges.value) 49 | // 50 | // ((timeWindow._1, timeWindow._2, row._1), 1L) 51 | // }).reduceByKey(_ + _).count() 52 | // 53 | // println(result) 54 | // 55 | // 56 | // LoggerUtil.warn("time2End:" +"%s".format(time)+ " 》》》》》》》》》》》》") 57 | // 58 | // } 59 | // 60 | //} 61 | -------------------------------------------------------------------------------- /src/main/scala/intactprogram/telecomdataprocessingAll/util/HDFSUtil.scala: -------------------------------------------------------------------------------- 1 | package telecomdataprocessingAll.util 2 | 3 | import java.text.SimpleDateFormat 4 | 5 | /** 6 | * Created by li on 16/7/25. 7 | */ 8 | object HDFSUtil { 9 | 10 | 11 | def main(args: Array[String]) { 12 | val dataFormat = new SimpleDateFormat("yyyy-MM-dd") 13 | val startTime = dataFormat.parse("2012-12-12") 14 | val startTimeStamp = startTime.getTime 15 | val stopTimeStamp = startTime.getTime - 24 * 60 * 60 * 1000 -1 16 | 17 | 18 | println(startTimeStamp, stopTimeStamp) 19 | } 20 | 21 | 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/intactprogram/telecomdataprocessingAll/util/LoggerUtil.scala: -------------------------------------------------------------------------------- 1 | package telecomdataprocessingAll.util 2 | 3 | import org.apache.log4j.{BasicConfigurator, Logger} 4 | 5 | /** 6 | * 写Log操作 7 | */ 8 | object LoggerUtil { 9 | 10 | var logger = Logger.getLogger("TelecomData_Processing") 11 | BasicConfigurator.configure() 12 | // PropertyConfigurator.configure("/home/mlearning/tdt/conf/log4j.properties") 13 | 14 | def exception(e: Exception) = { 15 | 16 | logger.error(e.printStackTrace()) 17 | 18 | } 19 | 20 | def error(msg: String): Unit = { 21 | 22 | logger.error(msg) 23 | } 24 | 25 | def warn(msg: String): Unit = { 26 | 27 | logger.warn(msg) 28 | } 29 | 30 | def info(msg: String): Unit = { 31 | 32 | logger.info(msg) 33 | } 34 | 35 | def debug(msg: String): Unit = { 36 | 37 | logger.debug(msg) 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/intactprogram/telecomdataprocessingAll/util/TimeUtil.scala: -------------------------------------------------------------------------------- 1 | package telecomdataprocessingAll.util 2 | 3 | import java.math.BigInteger 4 | import java.text.SimpleDateFormat 5 | import java.util.{Calendar, Date} 6 | 7 | import org.apache.hadoop.hbase.client.Scan 8 | import org.apache.hadoop.hbase.protobuf.ProtobufUtil 9 | import org.apache.hadoop.hbase.protobuf.generated.ClientProtos 10 | import org.apache.hadoop.hbase.util.Base64 11 | 12 | /** 13 | * Created by C.J.YOU on 2016/1/13. 
14 | * 格式化时间的工具类 15 | */ 16 | object TimeUtil { 17 | 18 | def getTime(timeStamp: String): String = { 19 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd_HH:mm:ss") 20 | val bigInt: BigInteger = new BigInteger(timeStamp) 21 | val date: String = sdf.format(bigInt) 22 | date 23 | } 24 | 25 | def getDay: String = { 26 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd") 27 | val date: String = sdf.format(new Date) 28 | date 29 | } 30 | 31 | def getCurrentHour: Int = { 32 | val calendar = Calendar.getInstance 33 | calendar.setTime(new Date) 34 | calendar.get(Calendar.HOUR_OF_DAY) 35 | } 36 | 37 | def getPreHourStr: String = { 38 | val date = new Date(new Date().getTime - 60 * 60 * 1000) 39 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd-HH") 40 | sdf.format(date) 41 | } 42 | 43 | /** 44 | * 获取今天的日期 45 | * 46 | * @return 47 | */ 48 | def getNowDate(): String = { 49 | val now: Date = new Date() 50 | val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd") 51 | val res = dateFormat.format( now ) 52 | res 53 | } 54 | 55 | 56 | /** 57 | * 获取本周的开始时间 58 | */ 59 | def Null(){ 60 | 61 | } 62 | 63 | /** 64 | * 获取本月的开始时间 65 | * http://blog.csdn.net/springlustre/article/details/47273353 66 | */ 67 | 68 | 69 | /** 70 | * 设置时间范围 71 | * 72 | * @return 时间范围 73 | * @author yangshuai 74 | */ 75 | def setTimeRange(): String = { 76 | 77 | val scan = new Scan() 78 | val date = new Date(new Date().getTime - 30 * 24 * 60 * 60 * 1000) 79 | val format = new SimpleDateFormat("yyyy-MM-dd HH") 80 | val time = format.format(date) 81 | val time1 = format.format(new Date().getTime) 82 | val startTime = time + "-00-00" 83 | val stopTime = time1 + "-00-00" 84 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss") 85 | val startRow: Long = sdf.parse(startTime).getTime 86 | val stopRow: Long = sdf.parse(stopTime).getTime 87 | 88 | scan.setTimeRange(startRow, stopRow) 89 | val proto: ClientProtos.Scan = ProtobufUtil.toScan(scan) 90 | 91 | Base64.encodeBytes(proto.toByteArray) 92 | } 93 | 94 | /** 95 | * 设置制定的时间范围(一天) 96 | * @param time 指定的日期 97 | * @return 指定日期至前一天时间范围 98 | */ 99 | def setAssignedTimeRange(time: String): String = { 100 | 101 | val format = new SimpleDateFormat("yyyy-MM-dd") 102 | 103 | val date = format.parse(time) 104 | 105 | val endTime = new Date(date.getTime - 24 * 60 * 60 * 1000) 106 | 107 | val stopTime = format.format(endTime) 108 | 109 | val startDate = time + "-00-00-00" 110 | val stopDate = stopTime + "-00-00-00" 111 | 112 | val sdf = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss") 113 | val startRaw = sdf.parse(startDate).getTime 114 | val stopRaw = sdf.parse(stopDate).getTime 115 | 116 | val scan = new Scan() 117 | scan.setTimeRange(startRaw, stopRaw) 118 | 119 | val proto = ProtobufUtil.toScan(scan) 120 | 121 | Base64.encodeBytes(proto.toByteArray) 122 | } 123 | 124 | 125 | } 126 | -------------------------------------------------------------------------------- /src/main/scala/intactprogram/vipstockstatistic/CorpusBuild.scala: -------------------------------------------------------------------------------- 1 | package dataprocess.vipstockstatistic 2 | 3 | import com.kunyandata.nlpsuit.util.{TextPreprocessing, KunyanConf} 4 | import org.apache.spark.rdd.RDD 5 | 6 | import scala.xml.XML 7 | 8 | /** 9 | * Created by li on 2016/8/23. 
10 | * 调用坤雁分词系统 11 | */ 12 | object CorpusBuild { 13 | 14 | /** 15 | * 配置文件初始化 16 | * 17 | * @param xmlConfPath 配置文件输入路径 18 | * @return 初始化后的配置文件 19 | * @author Li Yu 20 | * @note rowNum = 6 21 | */ 22 | def paramInit(xmlConfPath: String): KunyanConf = { 23 | 24 | val kunyanConf = new KunyanConf 25 | val confFile = XML.loadFile(xmlConfPath) 26 | 27 | val kunyanHost = { confFile \ "kunyan" \ "kunyanHost" }.text 28 | val kunyanPort = { confFile \ "kunyan" \ "kunyanPort" }.text.toInt 29 | kunyanConf.set(kunyanHost, kunyanPort) 30 | 31 | kunyanConf 32 | } 33 | 34 | /** 35 | * 分词程序 36 | * 37 | * @param xmlPath 主程序输入参数 38 | * @author Li Yu 39 | * @note rownum = 6 40 | */ 41 | def run(xmlPath: String, news: RDD[Array[String]]): RDD[(String, String)] = { 42 | 43 | System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 44 | 45 | // 配置文件初始化 46 | val kunyanConf = paramInit(xmlPath) 47 | 48 | // 调用分词系统,输出内容为URL 分词结果 49 | val stopWords = Array(" ") 50 | val corpus = news.map(row => (row(2), TextPreprocessing.process(row(3), stopWords, kunyanConf).mkString(","))) 51 | 52 | corpus 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/intactprogram/vipstockstatistic/util/AnsjAnalyzer.scala: -------------------------------------------------------------------------------- 1 | package dataprocess.vipstockstatistic.util 2 | 3 | import org.ansj.library.UserDefineLibrary 4 | import org.ansj.splitWord.analysis.{NlpAnalysis, ToAnalysis} 5 | import org.apache.spark.SparkContext 6 | 7 | 8 | /** 9 | * Created by zhangxin on 2016/3/8 10 | * 基于ansj的分词工具 11 | */ 12 | object AnsjAnalyzer { 13 | 14 | /** 15 | * ansj分词器初始化, 添加用户词典 16 | * 17 | * @param sc spark程序入口 18 | * @param userDic 用户词典数组 19 | * @return 无 20 | * @author zhangxin 21 | */ 22 | def init(sc: SparkContext, userDic: Array[String]): Unit = { 23 | 24 | if(userDic != null ){ 25 | userDic.foreach(addUserDic(_, sc)) 26 | } 27 | 28 | } 29 | 30 | /** 31 | * 添加用户词典到分词器 32 | * 33 | * @param dicPath 词典路径 34 | * @param sc spark程序入口 35 | * @return 无 36 | * @author zhangxin 37 | */ 38 | def addUserDic(dicPath: String, sc: SparkContext): Unit = { 39 | 40 | //读取词典 41 | val dic = sc.textFile(dicPath).collect() 42 | 43 | //添加到ansj中 44 | dic.foreach(UserDefineLibrary.insertWord(_, "userDefine", 100)) 45 | 46 | } 47 | 48 | /** 49 | * 标准分词 ,无词性标注 50 | * 51 | * @param sentence 待分词语句 52 | * @return 分词结果 53 | * @author zhangxin 54 | */ 55 | def cutNoTag(sentence: String): Array[String] = { 56 | 57 | //切词 58 | val sent = ToAnalysis.parse(sentence) 59 | 60 | //提取分词结果,过滤词性 61 | val words = for(i <- Range(0, sent.size())) yield sent.get(i).getName 62 | 63 | words.toArray 64 | } 65 | 66 | /** 67 | * 自然语言分词,带词性标注 68 | * 69 | * @param sentence 待分词句子 70 | * @return 分词结果 71 | * @author zhangxin 72 | */ 73 | def cutWithTag(sentence: String):Array[String]={ 74 | 75 | // 切词 76 | val sent = NlpAnalysis.parse(sentence) 77 | 78 | // 提取分词结果 79 | val words= for(i <- Range(0, sent.size())) yield sent.get(i).getName 80 | 81 | words.toArray 82 | } 83 | 84 | } 85 | -------------------------------------------------------------------------------- /src/main/scala/intactprogram/vipstockstatistic/util/HBaseUtil.scala: -------------------------------------------------------------------------------- 1 | package dataprocess.vipstockstatistic.util 2 | 3 | import java.text.SimpleDateFormat 4 | import com.ibm.icu.text.CharsetDetector 5 | import org.apache.hadoop.conf.Configuration 6 | import 
org.apache.hadoop.hbase.HBaseConfiguration 7 | import org.apache.hadoop.hbase.client.{Result, Scan} 8 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable 9 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat 10 | import org.apache.hadoop.hbase.protobuf.ProtobufUtil 11 | import org.apache.hadoop.hbase.protobuf.generated.ClientProtos 12 | import org.apache.hadoop.hbase.util.Base64 13 | import org.apache.spark.SparkContext 14 | import org.apache.spark.rdd.RDD 15 | import scala.xml.{Elem, XML} 16 | 17 | /** 18 | * Created by li on 16/7/7. 19 | */ 20 | object HBaseUtil { 21 | 22 | /** 23 | * 设置时间范围 24 | * 25 | * @return 时间范围 26 | * @author yangshuai 27 | */ 28 | def setTimeRange(startDay: String): String = { 29 | 30 | val scan = new Scan() 31 | 32 | val sdf = new SimpleDateFormat("yyyy-MM-dd") 33 | val startRow = sdf.parse(startDay).getTime 34 | val stopRow = startRow + 24 * 60 * 60 * 1000 - 1 35 | 36 | scan.setTimeRange(startRow, stopRow) 37 | val proto: ClientProtos.Scan = ProtobufUtil.toScan(scan) 38 | 39 | Base64.encodeBytes(proto.toByteArray) 40 | } 41 | 42 | /** 43 | * 识别字符编码 44 | * 45 | * @param html 地址编码 46 | * @return 字符编码 47 | */ 48 | def judgeChaser(html: Array[Byte]): String = { 49 | 50 | val icu4j = new CharsetDetector() 51 | icu4j.setText(html) 52 | val encoding = icu4j.detect() 53 | 54 | encoding.getName 55 | } 56 | 57 | /** 58 | * 获取xml格式的配置文件 59 | * 60 | * @param dir 配置文件所在的文件目录 61 | * @return 62 | * @return Li Yu 63 | * @note rowNum: 2 64 | */ 65 | def readConfigFile(dir: String): Elem = { 66 | 67 | val configFile = XML.loadFile(dir) 68 | 69 | configFile 70 | } 71 | 72 | /** 73 | * 获取hbase配置内容,并且初始化hbase配置 74 | * 75 | * @param configFile hbase配置文件 76 | * @return 77 | * @return Li Yu 78 | * @note rowNum: 7 79 | */ 80 | def setHBaseConfigure(configFile: Elem): Configuration = { 81 | 82 | val rootDir = (configFile \ "hbase" \ "rootDir").text 83 | val ip = (configFile \ "hbase" \ "ip").text 84 | 85 | // 初始化配置 86 | val configuration = HBaseConfiguration.create() 87 | configuration.set("hbase.rootdir", rootDir) 88 | configuration.set("hbase.zookeeper.quorum", ip) 89 | 90 | configuration 91 | } 92 | 93 | /** 94 | * 获取hbase中的内容 95 | * 96 | * @param sc SparkContext 97 | * @param confDir 配置文件所在的文件夹 98 | * @author Li Yu 99 | * @note rowNum: 7 100 | */ 101 | def getHBaseConf(sc: SparkContext, confDir: String, tableName: String) : RDD[(ImmutableBytesWritable, Result)] = { 102 | 103 | val configFile = readConfigFile(confDir) 104 | val configuration = setHBaseConfigure(configFile) 105 | 106 | configuration.set(TableInputFormat.INPUT_TABLE, tableName) 107 | // configuration.set(TableInputFormat.SCAN, timeRange) 108 | 109 | // 使用Hadoop api来创建一个RDD 110 | val hBaseRDD = sc.newAPIHadoopRDD(configuration, 111 | classOf[TableInputFormat], 112 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable], 113 | classOf[org.apache.hadoop.hbase.client.Result]) 114 | 115 | hBaseRDD 116 | } 117 | 118 | } 119 | -------------------------------------------------------------------------------- /src/main/scala/intactprogram/vipstockstatistic/util/LoggerUtil.scala: -------------------------------------------------------------------------------- 1 | package dataprocess.vipstockstatistic.util 2 | 3 | import org.apache.log4j.{BasicConfigurator, Logger} 4 | 5 | /** 6 | * 写Log操作 7 | */ 8 | object LoggerUtil { 9 | 10 | var logger = Logger.getLogger("Warren_VipStockStatistic_Processing") 11 | BasicConfigurator.configure() 12 | // 
PropertyConfigurator.configure("/home/alg/telecomdataprocess/conf/log4j.properties") 13 | 14 | def exception(e: Exception) = { 15 | 16 | logger.error(e.printStackTrace()) 17 | 18 | } 19 | 20 | def error(msg: String): Unit = { 21 | 22 | logger.error(msg) 23 | } 24 | 25 | def warn(msg: String): Unit = { 26 | 27 | logger.warn(msg) 28 | } 29 | 30 | def info(msg: String): Unit = { 31 | 32 | logger.info(msg) 33 | } 34 | 35 | def debug(msg: String): Unit = { 36 | 37 | logger.debug(msg) 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/intactprogram/vipstockstatistic/util/RedisUtil.scala: -------------------------------------------------------------------------------- 1 | package dataprocess.vipstockstatistic.util 2 | 3 | import redis.clients.jedis.Jedis 4 | 5 | import scala.xml.XML 6 | 7 | /** 8 | * Created by li on 16/8/23. 9 | */ 10 | object RedisUtil { 11 | 12 | var jedis: Jedis = null 13 | 14 | /** 15 | * 初始化 redis 16 | * 17 | * @param confDir 配置文件对应的 xml 对象 18 | * @note rowNum: 10 19 | */ 20 | def initRedis(confDir: String): Jedis = { 21 | 22 | val configFile = XML.loadFile(confDir) 23 | 24 | val redisIp = (configFile \ "redis" \ "ip").text 25 | val redisPort = (configFile \ "redis" \ "port").text.toInt 26 | val redisDB = (configFile \ "redis" \ "db").text.toInt 27 | val redisAuth = (configFile \ "redis" \ "auth").text 28 | 29 | jedis = new Jedis(redisIp, redisPort) 30 | jedis.auth(redisAuth) 31 | jedis.select(redisDB) 32 | 33 | jedis 34 | } 35 | 36 | /** 37 | * 将结果保存到redis 38 | * 39 | * @param resultData 需要保存的数据 40 | * @author Li Yu 41 | * @note rowNum: 12 42 | */ 43 | def write2Redis(resultData: Array[(String, String)], time: String, dataType: String, confDir: String): Unit = { 44 | 45 | val jedis = initRedis(confDir) 46 | 47 | resultData.foreach{ x => { 48 | 49 | jedis.zadd(s"vipstockstatistic_$dataType" + s"_$time", x._2.toDouble, x._1) 50 | }} 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /src/main/scala/intactprogram/vipstockstatistic/util/config.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | hdfs://61.147.114.85:9000/hbase 6 | slave1,slave2,slave3 7 | 8 | 9 | 10 | hdfs://61.147.114.85:9000 11 | 12 | 13 | 14 | 61.147.114.88 15 | 16003 16 | 17 | 18 | 19 | 61.147.114.72 20 | 6666 21 | db9 22 | backtest 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/Recommendation/SparkMLlibColbFilter.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.Recommendation 2 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating} 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | 6 | /** 7 | * Created by li on 2017/3/23. 
8 | * 协同过滤ALS算法推荐过程如下: 9 | * 加载数据到 ratings RDD,每行记录包括:user, product, rate 10 | * 从 ratings 得到用户商品的数据集:(user, product) 11 | * 使用ALS对 ratings 进行训练 12 | * 通过 model 对用户商品进行预测评分:((user, product), rate) 13 | * 从 ratings 得到用户商品的实际评分:((user, product), rate) 14 | * 合并预测评分和实际评分的两个数据集,并求均方差 15 | */ 16 | 17 | object SparkMLlibColbFilter { 18 | 19 | def main(args: Array[String]) { 20 | val conf = new SparkConf().setAppName("Java Collaborative Filtering Example").setMaster("local") 21 | val sc = new SparkContext(conf) 22 | 23 | // Load and parse the data 24 | val path = "file:///data/hadoop/spark-2.0.0-bin-hadoop2.7/data/mllib/als/test.data" 25 | val data = sc.textFile(path) 26 | val ratings = data.map(_.split(",") match { case Array(user, item, rate) => 27 | Rating(user.toInt, item.toInt, rate.toDouble) 28 | }) 29 | 30 | // Build the recommendation model using ALS 31 | val rank = 10 32 | val numIterations = 10 33 | val model = ALS.train(ratings, rank, numIterations, 0.01) 34 | 35 | // Evauate the model on rating data 36 | val usersProducts = ratings.map { case Rating(user, product, rate) => 37 | (user, product) 38 | } 39 | 40 | val predictions = 41 | model.predict(usersProducts).map { case Rating(user, product, rate) => 42 | ((user, product), rate) 43 | } 44 | 45 | val ratesAndPreds = ratings.map { case Rating(user, product, rate) => 46 | ((user, product), rate) 47 | }.join(predictions) 48 | 49 | val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) => 50 | val err = r1 - r2 51 | err * err 52 | }.mean() 53 | 54 | System.out.println("Mean Squared Error = " + MSE) 55 | 56 | // Save and load model 57 | model.save(sc, "target/tmp/myCollaborativeFilter") 58 | val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter") 59 | 60 | 61 | //为每个用户进行推荐,推荐的结果可以以用户id为key,结果为value存入redis或者hbase中 62 | val users = data.map(_.split(",")(0)).distinct().collect() 63 | 64 | for (elem <- users) { 65 | 66 | val res = model.recommendProducts(elem.toInt, numIterations) 67 | res.foreach(itm => (itm.user, itm.product, itm.rating)) 68 | } 69 | } 70 | } -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/classification/BinaryClassificationWithALS.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.classification 2 | 3 | import org.apache.spark.ml.feature.{HashingTF, IDF, StopWordsRemover, Tokenizer} 4 | import org.apache.spark.mllib.linalg.{Vector, Vectors} 5 | import org.apache.spark.mllib.regression.LabeledPoint 6 | import org.apache.spark.sql.{Row, SQLContext} 7 | import org.apache.spark.{SparkConf, SparkContext} 8 | 9 | import scala.io.Source 10 | 11 | 12 | 13 | /** 14 | * Created by li on 16/4/8. 
15 | */ 16 | object BinaryClassificationWithALS { 17 | 18 | val conf = new SparkConf().setMaster("local").setAppName("StopWordRemove") 19 | val sc = new SparkContext(conf) 20 | val sqlContext = new SQLContext(sc) 21 | // val hivecontext = new HiveContext(sc) 22 | import sqlContext.implicits._ 23 | 24 | 25 | // DataFrame type 数据集导入 26 | // val src = Source.fromFile("/users/li/Intellij/Native-Byes/nativebyes/wordseg_881156.txt").getLines().toArray 27 | 28 | // 总数据集获取未平衡 29 | // case class RawDataRecord( category: String ,labels: Double ,text: String) 30 | // 31 | // val src = Source.fromFile("/Users/li/Downloads/traningset/HGHQ.txt").getLines().toArray.map{ 32 | // line => 33 | // val data = line.split("\t") 34 | // RawDataRecord(data(1),data(0).toDouble,data(2)) 35 | // } 36 | 37 | 38 | // // 平衡数据集获取 39 | case class RawDataRecord(labels: Double ,text: String) 40 | val src = sc.textFile("/Users/li/Downloads/trainingSets/保险").map{ 41 | line => 42 | val data = line.split("\t") 43 | RawDataRecord(data(0).toDouble, data(1)) 44 | } 45 | 46 | 47 | val srcDF = sqlContext.createDataFrame(src) 48 | 49 | 50 | // RDD type 51 | // val srcRDD = sc.textFile("/users/li/Intellij/Native-Byes/nativebyes/wordseg_881156.txt").map { 52 | // x => 53 | // val data = x.split("\t") 54 | // RawDataRecord(data(0),data(1),labels = if(data(1) == "881108" ) 1.0 else 0.0, data(2)) 55 | // }.toDF()//to DataFrame 56 | 57 | var tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words") 58 | var wordsData = tokenizer.transform(srcDF) 59 | 60 | // 去停用词 61 | // 读取停用词表 62 | // val filter = Source.fromFile("/users/li/Intellij/Native-Byes/nativebyes/1.txt" ).getLines().toArray 63 | val filter = Source.fromFile("/users/li/Intellij/Native-Byes/nativebyes/stop_words_CN" ).getLines().toArray 64 | 65 | val remover = new StopWordsRemover() 66 | .setInputCol("words") 67 | .setOutputCol("filtered") 68 | .setStopWords(filter) 69 | 70 | val removeword = remover.transform(wordsData) 71 | 72 | 73 | //70%作为训练数据,30%作为测试数据 74 | val splits = removeword.randomSplit(Array(0.7, 0.3),seed = 11L) 75 | //splits.foreach(println) 76 | var trainingDF = splits(0) 77 | var testDF = splits(1) 78 | 79 | 80 | 81 | //使用hashingTF计算每个词在文档中的词频 82 | val hashingTF = new HashingTF().setNumFeatures(2000).setInputCol("filtered").setOutputCol("rawFeatures") 83 | val featurizedData = hashingTF.transform(trainingDF) 84 | // println("output2:") 85 | // featurizedData.select($"category", $"words", $"rawFeatures").foreach(println) 86 | // featurizedData.show() 87 | 88 | 89 | //计算每个词的TF-IDF 90 | var idf = new IDF().setInputCol("rawFeatures").setOutputCol("features") 91 | val idfModel = idf.fit(featurizedData) 92 | var rescaledData = idfModel.transform(featurizedData) 93 | // println("output3:") 94 | // rescaledData.select($"category", $"features").foreach(println) 95 | // rescaledData.select($"labels",$"features").show() 96 | 97 | 98 | // 转换成Bayes的输入格式 99 | var trainDataRdd = rescaledData.select($"labels",$"features").map { 100 | case Row(label: Double, features: Vector) => 101 | LabeledPoint(label , Vectors.dense(features.toArray)) 102 | }.cache() 103 | 104 | //trainDataRdd.foreach(println) 105 | 106 | 107 | /** ALS降维 */ 108 | // val pca = new PCA(trainDataRdd.first().features.size/2).fit(trainDataRdd.map(_.features)) 109 | // val als = new ALSModel() 110 | // val pcl = new ALS().setNonnegative(true).setMaxIter(100).fit(trainDataRdd.map(_.features)) 111 | 112 | 113 | 114 | 115 | 116 | 117 | } 118 | 
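BinaryClassificationWithALS above stops after building the TF-IDF LabeledPoints and never trains or evaluates a classifier. A hedged continuation sketch is shown below; it reuses trainDataRdd, testDF, hashingTF, idfModel and the imports already defined in that file, adds only the NaiveBayes import, and leaves the commented ALS/PCA step out. The choice of NaiveBayes mirrors the commented-out line in the file and is an assumption, not the author's final model.

```scala
// continuation sketch: assumes the names defined in BinaryClassificationWithALS above
import org.apache.spark.mllib.classification.NaiveBayes

// train a multinomial NaiveBayes model on the TF-IDF LabeledPoints
val model = NaiveBayes.train(trainDataRdd, lambda = 1.0, modelType = "multinomial")

// apply the same TF and IDF transforms to the held-out 30% split
val testFeaturized = idfModel.transform(hashingTF.transform(testDF))

val testDataRdd = testFeaturized.select($"labels", $"features").map {
  case Row(label: Double, features: Vector) =>
    LabeledPoint(label, Vectors.dense(features.toArray))
}

// compare predictions against the true labels
val predictionAndLabel = testDataRdd.map(p => (model.predict(p.features), p.label))
val accuracy = predictionAndLabel.filter(x => x._1 == x._2).count().toDouble / testDataRdd.count()
println(s"testAccuracy = $accuracy")
```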
-------------------------------------------------------------------------------- /src/main/scala/meachinelearning/classification/PCAtest.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.classification 2 | 3 | import org.apache.spark.mllib.feature.PCA 4 | import org.apache.spark.mllib.linalg.Vectors 5 | import org.apache.spark.mllib.regression.{LinearRegressionWithSGD, LabeledPoint} 6 | import org.apache.spark.{SparkConf, SparkContext} 7 | 8 | 9 | /** 10 | * Created by li on 16/4/7. 11 | */ 12 | object PCAtest extends App{ 13 | 14 | val conf = new SparkConf().setAppName("test").setMaster("local") 15 | val sc = new SparkContext(conf) 16 | 17 | val data = sc.textFile("/Users/li/Downloads/lpsa.data").map { line => 18 | val parts = line.split(',') 19 | LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) 20 | }.cache() 21 | 22 | 23 | 24 | 25 | val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) 26 | val training = splits(0).cache() 27 | val test = splits(1) 28 | 29 | // training.foreach(println) 30 | // println(training.first()) 31 | // println(training.first().features.size/2) 32 | 33 | 34 | val pca = new PCA(training.first().features.size/2).fit(data.map(_.features)) 35 | 36 | val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) 37 | val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) 38 | 39 | val numIterations = 100 40 | val model = LinearRegressionWithSGD.train(training, numIterations) 41 | val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) 42 | 43 | val valuesAndPreds = test.map { point => 44 | val score = model.predict(point.features) 45 | (score, point.label) 46 | } 47 | 48 | val valuesAndPreds_pca = test_pca.map { point => 49 | val score = model_pca.predict(point.features) 50 | (score, point.label) 51 | } 52 | 53 | val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.mean() 54 | val MSE_pca = valuesAndPreds_pca.map{case(v, p) => math.pow((v - p), 2)}.mean() 55 | 56 | println("Mean Squared Error = " + MSE) 57 | println("PCA Mean Squared Error = " + MSE_pca) 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/correlationanalysis/correlationAnalysis.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.correlationanalysis 2 | 3 | /** 4 | * Created by li on 16/7/5. 5 | */ 6 | object correlationAnalysis { 7 | 8 | } 9 | -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/hotdegreecalculate/CommunityFrequencyStatistics.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.hotdegreecalculate 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import scala.collection.mutable 6 | 7 | /** 8 | * Created by li on 16/7/5. 
9 | * 计算社区热度的功能模块, 揉合了社区热词的热度计算, 10 | */ 11 | object CommunityFrequencyStatistics { 12 | 13 | 14 | /** 15 | * 筛选出出现了社区内词的所有文章 16 | * 17 | * @param communityWords 社区中的词 18 | * @param textWords 新闻 19 | * @return Boolean 新闻中存在社区中的词返回true 20 | * @author Li Yu 21 | * @note rowNum: 11 22 | */ 23 | def filterFunc(communityWords: Array[String], 24 | textWords: Array[String]): Boolean = { 25 | 26 | communityWords.foreach { 27 | word => { 28 | 29 | if (textWords.contains(word)) { 30 | 31 | return true 32 | } 33 | } 34 | } 35 | 36 | false 37 | } 38 | 39 | /** 40 | * 统计当前文档库中, 包含社区中提取的关键词的文档数,重复的根据文本ID(url)合并, 41 | * 特别针对社区(事件)词, 一个社区中包含若干个词, 并且词变化后对应的社区却没有变化. 42 | * 43 | * @param fileList 当前文档 44 | * @param communityWordList textRank提取的每个社区的关键词 45 | * @return [社区ID, 包含社区中关键词的文档总数]包含社区中关键词的文档总数 46 | * @author Li Yu 47 | * @note rowNum: 13 48 | */ 49 | def communityFrequencyStatisticsRDD(fileList: RDD[Array[String]], 50 | communityWordList: Array[(String, Array[String])]): Array[(String, Double)] = { 51 | 52 | val communityList = new mutable.HashMap[String, Double] 53 | 54 | communityWordList.foreach { 55 | community => { 56 | 57 | val communityID = community._1 58 | val communityWords = community._2 59 | val temp = fileList.filter(content => filterFunc(communityWords, content)).count().toDouble 60 | 61 | communityList.+=((communityID, temp)) 62 | } 63 | } 64 | 65 | communityList.toArray 66 | } 67 | 68 | 69 | /** 70 | * 统计当前文档库中, 包含社区中提取的关键词的文档数,重复的根据文本ID(url)合并, 71 | * 特别针对社区(事件)词, 一个社区中包含若干个词, 并且词变化后对应的社区却没有变化. 72 | * 73 | * @param fileList 当前文档 74 | * @param communityWordList textRank提取的每个社区的关键词 75 | * @return [社区ID, 包含社区中关键词的文档总数]包含社区中关键词的文档总数 76 | * @author Li Yu 77 | * @note rowNum: 22 78 | */ 79 | def communityFrequencyStatistics(fileList: Array[(String, Array[String])], 80 | communityWordList: Array[(String, Array[String])]): Array[(String, Double)] = { 81 | 82 | val communityList = new mutable.HashMap[String, Double] 83 | 84 | communityWordList.foreach { 85 | line => { 86 | 87 | val item = new mutable.ArrayBuffer[String] 88 | val communityId = line._1 89 | val communityWords = line._2 90 | 91 | fileList.foreach { 92 | file => { 93 | 94 | val fileId = file._1 95 | val fileWordsList = file._2.distinct 96 | 97 | communityWords.foreach { word => { 98 | 99 | if (fileWordsList.contains(word)) item.append(fileId) 100 | } 101 | 102 | communityList.put(communityId, item.distinct.length) 103 | } 104 | } 105 | } 106 | } 107 | } 108 | 109 | communityList.toArray 110 | } 111 | 112 | } 113 | 114 | 115 | -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/hotdegreecalculate/fileIO.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.hotdegreecalculate 2 | 3 | import java.io.{File, PrintWriter} 4 | 5 | import _root_.util.TimeUtil 6 | 7 | import scala.collection.mutable 8 | import scala.io.Source 9 | 10 | /** 11 | * Created by li on 16/7/11. 12 | * 计算社区热度的过程中,分别将计算的结果保存到本地的文件系统中, 以及从本地文件中读取前一小时的数据 13 | */ 14 | object fileIO { 15 | 16 | /** 将结果保存到本地,将每小时数据保存为一个txt文件,一天的数据保存在一个文件夹里. 
17 | * 18 | * @param dir 文件保存的目录 19 | * @param result 20 | */ 21 | def saveAsTextFile(dir: String, result: Array[(String, Double)]): Unit ={ 22 | 23 | val day = TimeUtil.getDay 24 | val hour = TimeUtil.getCurrentHour 25 | 26 | val writer = new PrintWriter(new File(dir +"%s".format(day) + "-" + "%s".format(hour) + ".txt")) 27 | 28 | for (line <- result) { 29 | 30 | writer.write(line._1 + "\t" + line._2 + "\n") 31 | 32 | } 33 | 34 | writer.close() 35 | } 36 | 37 | 38 | /** 39 | * 读取当前时间前一个小时的数据,读取本地文件中的结果. 40 | * 41 | * @param dir 数据保存的目录 42 | * @return 43 | */ 44 | def readFromFile(dir: String): Array[(String, Double)] ={ 45 | 46 | val date = TimeUtil.getPreHourStr 47 | val res = new mutable.ArrayBuffer[(String, Double)] 48 | 49 | if (Source.fromFile(dir + "%s".format(date) + ".txt" ) != null) { 50 | val temp = Source.fromFile(dir + "%s".format(date) + ".txt" ) 51 | 52 | temp.getLines().foreach{ 53 | line =>{ 54 | val temp = line.split("\t") 55 | res.+=((temp(0), temp(1).toDouble)) 56 | } 57 | } 58 | } else { 59 | 60 | res.+=(("init", 0.0)) 61 | } 62 | 63 | res.toArray 64 | } 65 | 66 | } 67 | -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/textrank/AbstractExtract.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.textrank 2 | 3 | import org.graphstream.graph.implementations.SingleGraph 4 | 5 | import scala.collection.mutable.ListBuffer 6 | 7 | /** 8 | * Created by li on 16/6/23. 9 | */ 10 | class AbstractExtract (val graphName: String, val segWord: ListBuffer[ListBuffer[(String)]] ){ 11 | 12 | var graph = new SingleGraph(graphName) 13 | 14 | // 获取文本网络的句子节点 15 | segWord.foreach { 16 | sentenceList => { 17 | val sentence = sentenceList.toString 18 | if (graph.getNode(sentence) == null) graph.addNode(sentence) 19 | } 20 | } 21 | 22 | // 边的获取,通过计算句子的相似度 23 | 24 | 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/textrank/ConstructTextGraph.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.textrank 2 | 3 | import org.graphstream.graph.implementations.SingleGraph 4 | 5 | import scala.collection.mutable 6 | import scala.collection.mutable.ListBuffer 7 | 8 | /** 9 | * Created by li on 16/6/23. 
10 | */ 11 | 12 | /** 13 | * 构建候选关键词图 14 | * @param graphName 图标识s 15 | * @param winSize 窗口大小 16 | * @param segWord 分词的结果 17 | * @return 候选关键词图 18 | * @author LiYu 19 | */ 20 | class ConstructTextGraph(val graphName: String, val winSize: Int, val segWord: List[String]) { 21 | 22 | /** 23 | * 构建候选关键词图 24 | * @return 候选关键词图 25 | */ 26 | def constructGraph: SingleGraph = { 27 | 28 | val graph = new SingleGraph(graphName) 29 | 30 | // 获取文本网络的节点 31 | segWord.foreach( 32 | word => if (graph.getNode(word) == null) graph.addNode(word) 33 | ) 34 | 35 | // 导入分完词的数据,并通过设置的窗口截取 36 | var wordSeg = new ListBuffer[(ListBuffer[(String)])] 37 | 38 | val num = segWord.size - winSize 39 | 40 | for (i <- 0 to num) { 41 | 42 | val item = new ListBuffer[(String)] 43 | 44 | for (j <- 0 until winSize) { 45 | 46 | item += segWord(i + j) 47 | } 48 | 49 | wordSeg += item 50 | 51 | } 52 | 53 | // 获取每个顶点以及所包含的窗口内的邻居节点 54 | val wordSet = segWord.toSet 55 | 56 | val edgeSet = wordSet.map { 57 | word => { 58 | val edgeList = new mutable.HashSet[(String)] 59 | wordSeg.foreach { 60 | list => { 61 | if (list.contains(word)){ 62 | list.foreach(x => edgeList.+=(x)) 63 | } 64 | } 65 | } 66 | 67 | (word, edgeList -= word) 68 | 69 | } 70 | } 71 | 72 | // 构建关键词图的边 73 | edgeSet.toArray.foreach { 74 | edge => { 75 | edge._2.toList.foreach { 76 | edges => 77 | 78 | if (graph.getEdge(s"${edge._1}-${edges}") == null && 79 | graph.getEdge(s"${edges}-${edge._1}") == null) { 80 | graph.addEdge(s"${edge._1}-${edges}", edge._1, edges) 81 | None 82 | } 83 | } 84 | } 85 | } 86 | 87 | graph 88 | 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/textrank/PropertyExtractor.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.textrank 2 | 3 | import org.graphstream.graph.implementations.SingleGraph 4 | import org.graphstream.graph.{Edge, Node} 5 | 6 | import scala.collection.mutable 7 | 8 | /** 9 | * Created by li on 16/6/24. 
10 | */ 11 | 12 | /** 13 | * 关键词提取, 输出个文章提取的关键词, 无向图名称为文章的url 14 | * 15 | * @param graph 节点图 16 | * @param keywordNum 关键词个数 17 | * @return 文本的关键词 18 | * @author LiYu 19 | */ 20 | class PropertyExtractor(val graph: SingleGraph, val keywordNum: Int) { 21 | 22 | /** 23 | * 24 | * @param iterator textRank迭代次数 25 | * @param df 阻尼系数(Damping Factor) 26 | * @return 关键词和得分 27 | */ 28 | // 使用textRank提取关键词 29 | def extractKeywords(iterator: Int, df: Double) = { 30 | 31 | val nodes = graph.getNodeSet.toArray.map(_.asInstanceOf[Node]) 32 | val scoreMap = new mutable.HashMap[String, Double] 33 | 34 | // 节点权重初始化 35 | nodes.foreach(node => scoreMap.put(node.getId, 1.0f)) 36 | 37 | // 迭代 迭代传播各节点的权重,直至收敛。 38 | (1 to iterator).foreach { 39 | i => 40 | nodes.foreach { 41 | node => 42 | val edges = node.getEdgeSet.toArray.map(_.asInstanceOf[Edge]) 43 | var score = 1.0f - df 44 | edges.foreach { 45 | edge => 46 | val node0 = edge.getNode0.asInstanceOf[Node] 47 | val node1 = edge.getNode1.asInstanceOf[Node] 48 | val tempNode = if (node0.getId.equals(node.getId)) node1 else node0 49 | score += df * (1.0f * scoreMap(tempNode.getId) / tempNode.getDegree) 50 | } 51 | scoreMap.put(node.getId, score) 52 | } 53 | } 54 | 55 | // 对节点权重进行倒序排序,从而得到最重要的num个单词,作为候选关键词。 56 | scoreMap.toList.sortWith(_._2 > _._2).slice(0, keywordNum) 57 | 58 | } 59 | } -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/textrank/TextRank.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.textrank 2 | 3 | /** 4 | * Created by li on 16/6/24. 5 | */ 6 | object TextRank { 7 | /** 8 | * 9 | * @param graphName 图标识 10 | * @param window 词窗口大小 11 | * @param doc 待抽取文本 12 | * @param keywordNum 提取关键词个数 13 | * @param iterator textRank迭代次数 14 | * @param df 阻尼系数 15 | * @return 关键词, 得分 16 | */ 17 | def run(graphName: String, window: Int, doc: List[String], 18 | keywordNum: Int, iterator: Int, df: Double): List[(String, Double)] = { 19 | 20 | // 生成关键词图 21 | val constructTextGraph = new ConstructTextGraph(graphName, window, doc) 22 | val textGraph = constructTextGraph.constructGraph 23 | 24 | // 输出提取的关键词 25 | val keywordExtractor = new PropertyExtractor(textGraph, keywordNum) 26 | val result = keywordExtractor.extractKeywords(iterator, df) 27 | 28 | result 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/topicmodel/LDAModel.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.topicmodel 2 | 3 | import org.apache.spark.rdd.RDD 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | //import org.apache.spark.mllib.clustering.LDA 6 | //import org.apache.spark.rdd.RDD 7 | 8 | /** 9 | * Created by li on 2016/4/28. 10 | */ 11 | 12 | object LDAModel extends App{ 13 | 14 | val conf = new SparkConf().setAppName("TopicModel").setMaster("local") 15 | val sc = new SparkContext(conf) 16 | 17 | // Load documents from text files, 1 document per file 18 | val corpus: RDD[String] = sc.wholeTextFiles("/Users/li/kunyan/docs/*.md").map(_._2) 19 | 20 | // Split each document into a sequence of terms (words) 21 | val tokenized: RDD[Array[String]] = 22 | corpus.map(_.toLowerCase.split("\\s")).map(_.filter(_.length > 3).filter(_.forall(java.lang.Character.isLetter))) 23 | 24 | tokenized.collect().foreach(println) 25 | 26 | // Choose the vocabulary. 
27 | // termCounts: Sorted list of (term, termCount) pairs 28 | val termCounts: Array[(String, Long)] = 29 | tokenized.flatMap(_.map(_ -> 1L)).reduceByKey(_ + _).collect().sortBy(-_._2) 30 | 31 | termCounts.foreach(println) 32 | 33 | // vocabArray: Chosen vocab (removing common terms) 34 | val numStopwords = 20 35 | val vocabArray: Array[String] = 36 | termCounts.takeRight(termCounts.length - numStopwords).map(_._1) 37 | 38 | // vocab: Map term -> term index 39 | val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap 40 | // vocab.foreach(println) 41 | 42 | // // Convert documents into term count vectors 43 | // val documents: RDD[(Long, Vector)] = 44 | // tokenized.zipWithIndex.map { 45 | // case (tokens, id) => 46 | // val counts = new mutable.HashMap[Int, Double]() 47 | // tokens.foreach { term => 48 | // if (vocab.contains(term)) { 49 | // val idx = vocab(term) 50 | // counts(idx) = counts.getOrElse(idx, 0.0) + 1.0 51 | // } 52 | // } 53 | // (id, Vectors.sparse(vocab.size, counts.toSeq)) 54 | // } 55 | // 56 | // documents.foreach(println) 57 | // 58 | // // Set LDA parameters 59 | // val numTopics = 3 60 | // val lda = new LDA().setK(numTopics).setMaxIterations(8) 61 | // 62 | // val ldaModel = lda.run(documents) 63 | //// val avgLogLikelihood = ldaModel.logLikelihood / documents.count() 64 | // 65 | // // Print topics, showing top-weighted 10 terms for each topic. 66 | // val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 10) 67 | // topicIndices.foreach { case (terms, termWeights) => 68 | // println("TOPIC:") 69 | // terms.zip(termWeights).foreach { case (term, weight) => 70 | // println(s"${vocabArray(term.toInt)}\t$weight") 71 | // } 72 | // println() 73 | // } 74 | // 75 | } 76 | -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/topicmodel/LatentDirichletAllocationExample.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.topicmodel 2 | 3 | import org.apache.spark.mllib.clustering.LDA 4 | import org.apache.spark.{SparkConf, SparkContext} 5 | import org.apache.spark.mllib.linalg.Vectors 6 | 7 | object LatentDirichletAllocationExample { 8 | 9 | def main(args: Array[String]) { 10 | 11 | val conf = new SparkConf().setAppName("LatentDirichletAllocationExample").setMaster("local") 12 | val sc = new SparkContext(conf) 13 | 14 | // $example on$ 15 | // Load and parse the data 16 | val data = sc.textFile("/Users/li/kunyan/spark/data/mllib/sample_lda_data.txt") 17 | data.foreach(println) 18 | 19 | val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))) 20 | parsedData.foreach(println) 21 | 22 | // Index documents with unique IDs 23 | val corpus = parsedData.zipWithIndex.map(_.swap).cache() 24 | 25 | // Cluster the documents into three topics using LDA 26 | val ldaModel = new LDA().setK(3).run(corpus) 27 | // 28 | // // Output topics. Each is a distribution over words (matching word count vectors) 29 | // println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):") 30 | // val topics = ldaModel.topicsMatrix 31 | // for (topic <- Range(0, 3)) { 32 | // print("Topic " + topic + ":") 33 | // for (word <- Range(0, ldaModel.vocabSize)) { print(" " + topics(word, topic)); } 34 | // println() 35 | // } 36 | // 37 | // // Save and load model. 
38 | // ldaModel.save(sc, "/Users/li/kunyan/spark/LatentDirichletAllocationExample/LDAModel") 39 | // val sameModel = DistributedLDAModel.load(sc, 40 | // "/Users/li/kunyan/spark/LatentDirichletAllocationExample/LDAModel") 41 | // // $example off$ 42 | // 43 | // sc.stop() 44 | } 45 | } -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/ClassifyModel.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.word2vec 2 | 3 | import java.io.File 4 | 5 | import util.{DirectoryUtil, JSONUtil} 6 | import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD} 7 | import org.apache.spark.mllib.feature.Word2VecModel 8 | import org.apache.spark.mllib.regression.LabeledPoint 9 | import org.apache.spark.rdd.RDD 10 | import org.apache.spark.{SparkConf, SparkContext} 11 | 12 | /** 13 | * Created by li on 2016/10/13. 14 | * 15 | */ 16 | object ClassifyModel { 17 | 18 | 19 | def classify(trainDataRdd: RDD[LabeledPoint]): SVMModel = { 20 | 21 | /** NativeBayes训练模型 */ 22 | // val model = NaiveBayes.train(trainDataRdd, lambda = 1.0, modelType = "multinomial") 23 | 24 | /** SVM训练模型 */ 25 | val numIterations = 1000 26 | val model = SVMWithSGD.train(trainDataRdd , numIterations) 27 | 28 | /** RandomForest训练模型 */ 29 | // val numClasses = 2 30 | // val categoricalFeaturesInfo = Map[Int, Int]() 31 | // val numTrees = 3 32 | // val featureSubsetStrategy = "auto" 33 | // val impurity = "gini" 34 | // val maxDepth = 4 35 | // val maxBins = 32 36 | // val model = RandomForest.trainClassifier(trainDataRdd, numClasses, categoricalFeaturesInfo,numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins) 37 | 38 | /** GradientBoostedTrees训练模型 */ 39 | // // Train a GradientBoostedTrees model. 40 | // // The defaultParams for Classification use LogLoss by default. 41 | // val boostingStrategy = BoostingStrategy.defaultParams("Classification") 42 | // boostingStrategy.numIterations = 3 // Note: Use more iterations in practice. 43 | // boostingStrategy.treeStrategy.numClasses = 2 44 | // boostingStrategy.treeStrategy.maxDepth = 5 45 | // // Empty categoricalFeaturesInfo indicates all features are continuous. 
46 | // boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]() 47 | // 48 | // val model = GradientBoostedTrees.train(trainDataRdd, boostingStrategy) 49 | 50 | model 51 | 52 | } 53 | 54 | def main(args: Array[String]) { 55 | 56 | val conf = new SparkConf().setAppName("textVectors").setMaster("local") 57 | val sc = new SparkContext(conf) 58 | 59 | val jsonPath = "/Users/li/workshop/NaturalLanguageProcessing/src/main/scala/meachinelearning/word2vec/twc/W2VJsonConf.json" 60 | 61 | JSONUtil.initConfig(jsonPath) 62 | 63 | val word2vecModelPath = JSONUtil.getValue("w2v", "w2vmodelPath") 64 | val modelSize = JSONUtil.getValue("w2v", "w2vmodelSize").toInt 65 | val isModel = JSONUtil.getValue("w2v", "isModel").toBoolean 66 | 67 | // val word2vecModelPath = "hdfs://master:9000/home/word2vec/classifyModel-10-100-20/2016-08-16-word2VectorModel" 68 | val w2vModel = Word2VecModel.load(sc, word2vecModelPath) 69 | 70 | // 构建训练集的labeledpoint格式 71 | // val trainSetPath = "/Users/li/workshop/DataSet/trainingsetUnbalance/BXX.txt" 72 | // val trainSetPath = "/Users/li/workshop/DataSet/trainingSets/计算机" 73 | val trainSetPath = "/Users/li/workshop/DataSet/trainingSets/机械" 74 | 75 | val trainSet = DataPrepare.readData(trainSetPath) 76 | val trainSetRdd = sc.parallelize(trainSet).cache() 77 | //val trainSetRdd = sc.textFile(trainSetPath) 78 | 79 | // val trainSetVec = trainSetRdd.map( row => { 80 | // val x = row.split("\t") 81 | // (x(0), x(1).split(","))}) // 在文章进行分词的情况下,用逗号隔开 82 | // //(x(0), AnsjAnalyzer.cutNoTag(x(1)}) // 如果没有分词,就调用ansj进行分词 83 | // .map(row => (row._1.toDouble, DataPrepare.docVec(w2vModel, row._2))) 84 | 85 | val trainDataRdd = TextVectors.textVectorsWithWeight(trainSetRdd, w2vModel, modelSize, isModel).cache() 86 | 87 | val classifyModel = classify(trainDataRdd) 88 | 89 | val classifyModelPath = JSONUtil.getValue("classify", "classifymodelpath") 90 | DirectoryUtil.deleteDir(new File(classifyModelPath)) 91 | classifyModel.save(sc, classifyModelPath) 92 | println("分类模型保存完毕。") 93 | 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/ClassifyPredict.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.word2vec 2 | 3 | import org.apache.spark.mllib.classification.SVMModel 4 | import org.apache.spark.mllib.evaluation.MulticlassMetrics 5 | import org.apache.spark.mllib.feature.Word2VecModel 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | import util.JSONUtil 10 | import wordSegmentation.AnsjAnalyzer 11 | 12 | /** 13 | * Created by li on 2016/10/17. 
14 | */ 15 | object ClassifyPredict { 16 | 17 | 18 | /** 19 | * 准确度统计分析 20 | * 21 | * @param predictionAndLabel 22 | */ 23 | def acc(predictionAndLabel: RDD[(Double, Double)], 24 | predictDataRdd: RDD[LabeledPoint]): Unit = { 25 | 26 | //统计分类准确率 27 | val testAccuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / predictDataRdd.count() 28 | println("testAccuracy:" + testAccuracy) 29 | 30 | val metrics = new MulticlassMetrics(predictionAndLabel) 31 | println("Confusion matrix:" + metrics.confusionMatrix) 32 | 33 | // Precision by label 34 | val label = metrics.labels 35 | label.foreach { l => 36 | println(s"Precision($l) = " + metrics.precision(l)) 37 | } 38 | 39 | // Recall by label 40 | label.foreach { l => 41 | println(s"Recall($l) = " + metrics.recall(l)) 42 | } 43 | 44 | // False positive rate by label 45 | label.foreach { l => 46 | println(s"FPR($l) = " + metrics.falsePositiveRate(l)) 47 | } 48 | 49 | // F-measure by label 50 | label.foreach { l => 51 | println(s"F1-Score($l) = " + metrics.fMeasure(l)) 52 | } 53 | 54 | // val roc = metrics.roc 55 | 56 | // // AUROC 57 | // val auROC = metrics.areaUnderROC 58 | // println("Area under ROC = " + auROC) 59 | 60 | } 61 | 62 | 63 | def main(args: Array[String]) { 64 | 65 | val conf = new SparkConf().setAppName("textVectors").setMaster("local") 66 | val sc = new SparkContext(conf) 67 | 68 | val jsonPath = "/Users/li/workshop/NaturalLanguageProcessing/src/main/scala/meachinelearning/word2vec/twc/W2VJsonConf.json" 69 | 70 | JSONUtil.initConfig(jsonPath) 71 | 72 | val word2vecModelPath = JSONUtil.getValue("w2v", "w2vmodelPath") 73 | val modelSize = JSONUtil.getValue("w2v", "w2vmodelSize").toInt 74 | val isModel = JSONUtil.getValue("w2v", "isModel").toBoolean 75 | // load word2vec model 76 | val w2vModel = Word2VecModel.load(sc, word2vecModelPath) 77 | 78 | // load classify model 79 | val classifyModelPath = JSONUtil.getValue("classify", "classifymodelpath") 80 | val classifyModel = SVMModel.load(sc, classifyModelPath) 81 | 82 | // 构建测试集labeledpoint格式 83 | val predictSetPath = "/Users/li/workshop/DataSet/trainingSets/test" 84 | val predictSet = DataPrepare.readData(predictSetPath) 85 | val predictSetRdd = sc.parallelize(predictSet) 86 | 87 | // 对于单篇没有分词的文章 88 | val predictSetVec = predictSetRdd.map(row => { 89 | 1.0 + "\t" + AnsjAnalyzer.cutNoTag(row) 90 | }) 91 | val predictDataRdd = TextVectors.textVectorsWithWeight(predictSetVec, w2vModel, modelSize, isModel).cache() 92 | 93 | // val predictDataRdd = TextVectors.textVectorsWithWeight(predictSetRdd, w2vModel, modelSize, isModel).cache() 94 | 95 | /** 对测试数据集使用训练模型进行分类预测 */ 96 | // classifyModel.clearThreshold() 97 | // Compute raw scores on the test set. 
98 | val predictionAndLabel = predictDataRdd.map{ point => { 99 | val predictionFeature = classifyModel.predict(point.features) 100 | (predictionFeature, point.label) 101 | }} 102 | 103 | // 准确度统计分析 104 | acc(predictionAndLabel, predictDataRdd) 105 | //predictionAndLabel.foreach(println) 106 | sc.stop() 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/DataPrepare.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.word2vec 2 | 3 | import dataprocess.vipstockstatistic.util.AnsjAnalyzer 4 | import org.apache.spark.mllib.feature.Word2VecModel 5 | import org.apache.spark.mllib.linalg.Vectors 6 | import org.apache.spark.mllib.regression.LabeledPoint 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.{SparkConf, SparkContext} 9 | 10 | import scala.io.Source 11 | 12 | /** 13 | * Created by li on 2016/10/14. 14 | */ 15 | object DataPrepare { 16 | 17 | /** 18 | * 读文件 19 | * 20 | * @param filePath 文本保存的位置 21 | * @return 22 | */ 23 | def readData(filePath: String): Array[String] = { 24 | 25 | val doc = Source.fromFile(filePath).getLines().toArray 26 | 27 | doc 28 | } 29 | 30 | 31 | /** 32 | * 分词 33 | * 34 | * @param doc 35 | * @return 36 | */ 37 | def docCut(doc: Array[String]): Array[String] = { 38 | 39 | val docSeg = doc.map(x => AnsjAnalyzer.cutNoTag(x)).flatMap(x =>x) 40 | 41 | docSeg 42 | } 43 | 44 | 45 | /** 46 | * 构建文本向量 47 | * 48 | * @param word2vecModel 49 | * @param docSeg 50 | * @return 51 | */ 52 | def docVec(word2vecModel: Word2VecModel, docSeg: Array[String], modelSize: Int): Array[Double] = { 53 | 54 | val docVectors = TextVectors.textVectorsWithModel(docSeg, word2vecModel, modelSize).toArray 55 | 56 | docVectors 57 | } 58 | 59 | /** 60 | * 打标签,文本集合构建labeledPoint,集合中文章属于同一类 61 | * 62 | * @param label 63 | * @param docVec 64 | * @return 65 | */ 66 | def tagAttacheBatchSingle(label: Double, docVec: RDD[Array[Double]]): RDD[LabeledPoint] = { 67 | 68 | docVec.map{ 69 | row => 70 | LabeledPoint(label , Vectors.dense(row)) 71 | } 72 | } 73 | 74 | /** 75 | * 打标签,文本集合构建labeledPoint 76 | * 77 | * @param docVec 78 | * @return 79 | */ 80 | def tagAttacheBatchWhole(docVec: RDD[(Double, Array[Double])]): RDD[LabeledPoint] = { 81 | 82 | docVec.map{ 83 | row => 84 | LabeledPoint(row._1 , Vectors.dense(row._2)) 85 | } 86 | } 87 | 88 | 89 | /** 90 | * 打标签,单篇文本构建labeledPoint 91 | * 92 | * @param label 93 | * @param docVec 94 | * @return 95 | */ 96 | def tagAttacheSingle(label: Double, docVec: Array[Double]): LabeledPoint = { 97 | 98 | LabeledPoint(label=1.0 , Vectors.dense(docVec)) 99 | } 100 | 101 | 102 | /** 103 | * 测试代码 104 | */ 105 | def dataPrepareTest(): Unit ={ 106 | val conf = new SparkConf().setMaster("local").setAppName("DataPrepare") 107 | val sc = new SparkContext(conf) 108 | 109 | val filePath = "/Users/li/workshop/DataSet/111.txt" 110 | // val filePath = "/Users/li/workshop/DataSet/SogouC.reduced/Reduced/C000008/10.txt" 111 | 112 | val word2vecModelPath = "/Users/li/workshop/DataSet/word2vec/result/2016-07-18-15-word2VectorModel" 113 | val model = Word2VecModel.load(sc, word2vecModelPath) 114 | 115 | val data = readData(filePath) 116 | 117 | val splitData = docCut(data) 118 | 119 | val doVec = docVec(model, splitData, 100) 120 | 121 | val labeledP = tagAttacheSingle(1.0, doVec) 122 | println(labeledP) 123 | 124 | 125 | } 126 | 127 | 128 | def main(args: Array[String]) { 129 | 130 | dataPrepareTest() 131 | 132 | } 133 | 
134 | } 135 | -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/DeleteDirectory.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.word2vec 2 | 3 | import java.io.File 4 | 5 | /** 6 | * Created by li on 16/7/15. 7 | */ 8 | 9 | object DeleteDirectory { 10 | 11 | /** 12 | * 删除空目录 13 | * @param dir 将要删除的目录路径 14 | */ 15 | private def doDeleteEmptyDir(dir: String): Unit = { 16 | 17 | val success: Boolean = new File(dir).delete() 18 | 19 | if (success) { 20 | 21 | System.out.println("Successfully deleted empty directory: " + dir) 22 | 23 | } else { 24 | 25 | System.out.println("Failed to delete empty directory: " + dir) 26 | } 27 | } 28 | 29 | /** 30 | * 递归删除目录下的所有文件及子目录下所有文件 31 | * @param dir 将要删除的文件目录 32 | * @return boolean Returns "true" if all deletions were successful. 33 | * If a deletion fails, the method stops attempting to 34 | * delete and returns "false". 35 | */ 36 | private def deleteDir(dir: File): Boolean = { 37 | 38 | if (dir.isDirectory) { 39 | 40 | val children = dir.list() 41 | 42 | //递归删除目录中的子目录下 43 | for (i <- 0 until children.length){ 44 | 45 | val success = deleteDir(new File(dir, children(i))) 46 | 47 | if (! success){ 48 | return false 49 | } 50 | 51 | } 52 | } 53 | // 目录此时为空,可以删除 54 | dir.delete() 55 | } 56 | 57 | 58 | /** 59 | *测试 60 | */ 61 | def main(args: Array[String]): Unit = { 62 | 63 | val dir = "/Users/li/kunyan/DataSet/1111" 64 | 65 | doDeleteEmptyDir(dir) 66 | 67 | val success = deleteDir(new File(dir)) 68 | 69 | if (success) System.out.println("Successfully deleted populated directory: " + dir) 70 | 71 | else System.out.println("Failed to delete populated directory: " + dir) 72 | } 73 | 74 | } 75 | 76 | -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/model/data/.part-r-00000-e1c254b3-21ba-4759-b7eb-b69f39950551.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/src/main/scala/meachinelearning/word2vec/model/data/.part-r-00000-e1c254b3-21ba-4759-b7eb-b69f39950551.gz.parquet.crc -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/model/data/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/src/main/scala/meachinelearning/word2vec/model/data/_SUCCESS -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/model/data/_common_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/src/main/scala/meachinelearning/word2vec/model/data/_common_metadata -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/model/data/_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/src/main/scala/meachinelearning/word2vec/model/data/_metadata 
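The parquet and metadata files above are the on-disk form of a saved SVMModel (the metadata below records numFeatures = 100, numClasses = 2). A minimal sketch of loading that model together with a word2vec model and classifying a single new document via the DataPrepare helpers follows; both paths and the sample text are placeholders, not values from this repository.

```scala
package meachinelearning.word2vec

import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}

object SavedModelPredictSketch {

  def main(args: Array[String]): Unit = {

    val sc = new SparkContext(new SparkConf().setAppName("SavedModelPredictSketch").setMaster("local"))

    val w2vModelPath = args(0) // placeholder: path of a 100-dimensional word2vec model
    val svmModelPath = args(1) // placeholder: the saved SVMModel directory shown above

    val w2vModel = Word2VecModel.load(sc, w2vModelPath)
    val svmModel = SVMModel.load(sc, svmModelPath)

    // segment a new document with ansj, build its 100-dim document vector, then classify it
    val words = DataPrepare.docCut(Array("待分类的新文本")) // placeholder sample text
    val docVector = DataPrepare.docVec(w2vModel, words, 100)

    println(svmModel.predict(Vectors.dense(docVector)))
    sc.stop()
  }
}
```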
-------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/model/data/part-r-00000-e1c254b3-21ba-4759-b7eb-b69f39950551.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/src/main/scala/meachinelearning/word2vec/model/data/part-r-00000-e1c254b3-21ba-4759-b7eb-b69f39950551.gz.parquet -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/model/metadata/.part-00000.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/src/main/scala/meachinelearning/word2vec/model/metadata/.part-00000.crc -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/model/metadata/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/src/main/scala/meachinelearning/word2vec/model/metadata/_SUCCESS -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/model/metadata/part-00000: -------------------------------------------------------------------------------- 1 | {"class":"org.apache.spark.mllib.classification.SVMModel","version":"1.0","numFeatures":100,"numClasses":2} 2 | -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/readme.md: -------------------------------------------------------------------------------- 1 | # Classification based on word2vec 2 | 3 | ## TextRank model 4 | 5 | Extract key words from news articles with TextRank, instead of computing word vectors over every word of the article. 6 | 7 | ## Word2Vec model 8 | 9 | Build the LabeledPoint (MLlib's training input) from word2vec document vectors, instead of computing tf-idf, doing dimensionality reduction and so on. 10 | 11 | 12 | ## Classification model 13 | 14 | Use an SVM to classify.
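A minimal sketch of that flow, assuming a word2vec model has already been trained and saved with MLlib. The paths, the two toy documents and the `docVector` helper below are placeholders for illustration only; the actual implementation lives in `DataPrepare.scala`, `TextVectors` and `twc/training.scala`.

```scala
import org.apache.spark.mllib.classification.SVMWithSGD
import org.apache.spark.mllib.feature.Word2VecModel
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.{SparkConf, SparkContext}

import scala.util.Try

object Word2VecSvmSketch {

  def main(args: Array[String]): Unit = {

    val sc = new SparkContext(new SparkConf().setAppName("w2v-svm-sketch").setMaster("local"))

    // Placeholder path: a word2vec model previously trained and saved with MLlib.
    val w2v = Word2VecModel.load(sc, "/path/to/word2VectorModel")
    val modelSize = 100

    // Placeholder documents: (label, segmented words). In this project segmentation
    // is done with AnsjAnalyzer and labels come from the per-industry training sets.
    val docs = Seq(
      (1.0, Array("光伏", "发电", "项目")),
      (0.0, Array("婚纱", "直播", "演唱会")))

    // Average the vectors of the words found in the word2vec vocabulary;
    // fall back to a zero vector if none of them are known.
    def docVector(words: Array[String]): Array[Double] = {
      val vecs = words.flatMap(w => Try(w2v.transform(w).toArray).toOption)
      if (vecs.isEmpty) Array.fill(modelSize)(0.0)
      else vecs.reduce((a, b) => a.zip(b).map { case (x, y) => x + y }).map(_ / vecs.length)
    }

    // Attach the label and train a linear SVM (50 iterations, as in twc/training.scala).
    val data = sc.parallelize(docs.map { case (label, words) =>
      LabeledPoint(label, Vectors.dense(docVector(words)))
    })
    val model = SVMWithSGD.train(data, 50)

    // Resubstitution check only -- a real run splits into train/test sets first.
    val accuracy = data.map(p => (model.predict(p.features), p.label))
      .filter { case (pred, lab) => pred == lab }.count().toDouble / data.count()
    println(s"accuracy = $accuracy")

    sc.stop()
  }
}
```

The trained classifier can be persisted with `model.save(sc, dir)` and reloaded with `SVMModel.load(sc, dir)`; the `model/` directory above holds such a saved `SVMModel` (100 features, 2 classes).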
15 | 16 | 17 | ## Test results 18 | testAccuracy = 19 | Precision(0.0) = 20 | Precision(1.0) = 21 | Recall(0.0) = 22 | Recall(1.0) = 23 | FPR(0.0) = 24 | FPR(1.0) = 25 | F1-Score(0.0) = 26 | F1-Score(1.0) = -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/twc/W2VJsonConf.json: -------------------------------------------------------------------------------- 1 | { 2 | "kunyan": { 3 | "ip": "61.147.114.88", 4 | "port": "16003", 5 | "stopwords": "16003" 6 | }, 7 | "w2v": { 8 | "w2vmodelPath": "hdfs://61.147.114.85:9000/home/word2vec/model-10-100-20/2016-08-16-word2VectorModel", 9 | "w2vmodelSize": "100", 10 | "isModel":"true" 11 | }, 12 | "classify": { 13 | "classifymodelpath":"/Users/li/workshop/NaturalLanguageProcessing/src/main/scala/meachinelearning/word2vec/model2" 14 | } 15 | } -------------------------------------------------------------------------------- /src/main/scala/meachinelearning/word2vec/twc/training.scala: -------------------------------------------------------------------------------- 1 | package meachinelearning.word2vec.twc 2 | 3 | import org.apache.spark.mllib.classification.SVMWithSGD 4 | import org.apache.spark.mllib.evaluation.MulticlassMetrics 5 | import org.apache.spark.mllib.regression.LabeledPoint 6 | import org.apache.spark.{SparkConf, SparkContext} 7 | 8 | /** 9 | * Created by zhangxin on 16-11-9. 10 | * 11 | * 分类模型训练 12 | */ 13 | object training { 14 | 15 | 16 | def training(): Unit ={ 17 | 18 | val conf = new SparkConf().setAppName("W2V").setMaster("local") 19 | val sc = new SparkContext(conf) 20 | // val jsonPath = "/home/zhangxin/work/workplace_scala/Sentiment/src/main/scala/classificationW2V/W2VJsonConf.json" 21 | val jsonPath = "/Users/li/workshop/NaturalLanguageProcessing/src/main/scala/meachinelearning/word2vec/twc/W2VJsonConf.json" 22 | 23 | // 非平衡集 24 | // val docsPath = "/home/zhangxin/work/workplace_scala/Data/trainingsetUnbalance/JSJ.txt" 25 | // val docs = sc.textFile(docsPath).collect() 26 | 27 | // 平衡集 28 | // val docsPath = "/home/zhangxin/work/workplace_scala/Data/trainingSets/房地产" 29 | // val docsPath = "/home/zhangxin/work/workplace_scala/Data/trainingSets/有色金属" 30 | // val docsPath = "/home/zhangxin/work/workplace_scala/Data/trainingSets/保险" 31 | // val docsPath = "/home/zhangxin/work/workplace_scala/Data/trainingSets/医药" 32 | // val docsPath = "/home/zhangxin/work/workplace_scala/Data/trainingSets/计算机" 33 | val docsPath = "/Users/li/workshop/DataSet/trainingSets/工程建筑" 34 | 35 | val docs = sc.textFile(docsPath).collect() 36 | 37 | // inputs 38 | val data = processing.process_weight(docs, sc, jsonPath) 39 | println("[完成DOC2Vec模型]>>>>>>>>>>>>>>>>>") 40 | 41 | val dataRdd = sc.parallelize(data) 42 | val splits = dataRdd.randomSplit(Array(0.8, 0.2), seed = 11L) 43 | val train = splits(0) 44 | val test = splits(1) 45 | 46 | val model = SVMWithSGD.train(train, 50) 47 | // val model = LogisticRegressionWithSGD.train(train, 5000) 48 | println("[完成模型训练]>>>>>>>>>>>>>>>>>>>") 49 | 50 | 51 | val predictAndLabels = test.map{ 52 | case LabeledPoint(label, features) => 53 | val prediction = model.predict(features) 54 | (prediction, label) 55 | } 56 | 57 | val metrics = new MulticlassMetrics(predictAndLabels) 58 | println(s"[综合_Precison] ${metrics.precision}") 59 | println(s"[Labels] ${metrics.labels.toList}") 60 | metrics.labels.foreach(label => { 61 | println(s"[${label}_Precison] ${metrics.precision(label)}") 62 | }) 63 | 64 | } 65 | 66 | def main(args: Array[String]): Unit = { 67 | 
training() 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /src/main/scala/test/regularExpression.scala: -------------------------------------------------------------------------------- 1 | package test 2 | 3 | /** 4 | * Created by li on 16/7/22. 5 | */ 6 | object regularExpression { 7 | 8 | def main(args: Array[String]) { 9 | 10 | val numPatten = """([0-9]+) ([a-z]+\s+)""".r 11 | 12 | // val numPatten = """(\s+[0-9]+\s+) ([0-9]+) ()""".r 13 | 14 | val res = numPatten.findAllIn("99 bottles, 89 bottles").toArray 15 | 16 | res.foreach(println) 17 | 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/main/scala/util/DirectoryUtil.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import java.io.File 4 | 5 | /** 6 | * Created by li on 16/7/18. 7 | * 文件夹处理工具,删除空文件夹, 删除非空文件夹及其中的文件 8 | */ 9 | object DirectoryUtil { 10 | 11 | /** 12 | * 删除空目录 13 | * 14 | * @param dir 将要删除的目录路径 15 | */ 16 | def doDeleteEmptyDir(dir: String): Unit = { 17 | 18 | val success: Boolean = new File(dir).delete() 19 | 20 | if (success) { 21 | 22 | System.out.println("Successfully deleted empty directory: " + dir) 23 | 24 | } else { 25 | 26 | System.out.println("Failed to delete empty directory: " + dir) 27 | } 28 | } 29 | 30 | /** 31 | * 递归删除目录下的所有文件及子目录下所有文件 32 | * 33 | * @param dir 将要删除的文件目录 34 | * @return boolean Returns "true" if all deletions were successful. 35 | * If a deletion fails, the method stops attempting to 36 | * delete and returns "false". 37 | */ 38 | def deleteDir(dir: File): Boolean = { 39 | 40 | if (dir.isDirectory) { 41 | 42 | val children = dir.list() 43 | 44 | //递归删除目录中的子目录下 45 | for (i <- 0 until children.length){ 46 | 47 | val success = deleteDir(new File(dir, children(i))) 48 | 49 | if (! success){ 50 | return false 51 | } 52 | 53 | } 54 | } 55 | // 目录此时为空,可以删除 56 | dir.delete() 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/main/scala/util/FileUtil.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import java.io.{File, BufferedReader, FileReader, PrintWriter} 4 | 5 | import scala.collection.mutable 6 | import scala.collection.mutable.ListBuffer 7 | import scala.io.Source 8 | 9 | /** 10 | * Created by li on 2016/2/22. 11 | */ 12 | object FileUtil { 13 | 14 | /** 15 | * override the old one 16 | */ 17 | def createFile(path: String, lines: Seq[String]): Unit = { 18 | 19 | val writer = new PrintWriter(path, "UTF-8") 20 | 21 | for (line <- lines) { 22 | writer.println(line) 23 | } 24 | writer.close() 25 | } 26 | 27 | def readFile(path: String): ListBuffer[String] = { 28 | 29 | var lines = new ListBuffer[String]() 30 | 31 | val br = new BufferedReader(new FileReader(path)) 32 | try { 33 | var line = br.readLine() 34 | 35 | while (line != null) { 36 | lines += line 37 | line = br.readLine() 38 | } 39 | lines 40 | } finally { 41 | br.close() 42 | } 43 | } 44 | 45 | /** 将结果保存到本地,将每小时数据保存为一个txt文件,一天的数据保存在一个文件夹里. 
46 | * 47 | * @param dir 文件保存的目录 48 | * @param result 49 | * @author Li Yu 50 | */ 51 | def saveAsTextFile(dir: String, result: Array[(String, Double)]): Unit ={ 52 | 53 | val day = TimeUtil.getDay 54 | val hour = TimeUtil.getCurrentHour 55 | 56 | val writer = new PrintWriter(new File(dir +"%s".format(day) + "-" + "%s".format(hour) + ".txt")) 57 | 58 | for (line <- result) { 59 | 60 | writer.write(line._1 + "\t" + line._2 + "\n") 61 | 62 | } 63 | 64 | writer.close() 65 | } 66 | 67 | /** 68 | * 读取当前时间前一个小时的数据,读取本地文件中的结果. 69 | * 70 | * @param dir 数据保存的目录 71 | * @return 72 | */ 73 | def readFromFile(dir: String): Array[(String, Double)] ={ 74 | 75 | val date = TimeUtil.getPreHourStr 76 | 77 | val temp = Source.fromFile(dir + "%s".format(date) + ".txt" ) 78 | 79 | val res = new mutable.ArrayBuffer[(String, Double)] 80 | temp.getLines().foreach( 81 | line =>{ 82 | val temp = line.split("\t") 83 | res.+=((temp(0), temp(1).toDouble)) 84 | } 85 | ) 86 | res.toArray 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /src/main/scala/util/JsonUtil.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import org.json.JSONObject 4 | 5 | import scala.util.parsing.json.JSON 6 | import scala.io.Source 7 | 8 | 9 | /** 10 | * Created by li on 16/8/29. 11 | * 读取json格式的额配置文件信息. 12 | */ 13 | object JSONUtil { 14 | 15 | private var config = new JSONObject() 16 | 17 | /** 18 | * 初始化类 19 | * 20 | * @param confDir 配置文件路径 21 | */ 22 | def initConfig(confDir: String): Unit = { 23 | 24 | val jsObj = Source.fromFile(confDir).getLines().mkString("") 25 | config = new JSONObject(jsObj) 26 | } 27 | 28 | 29 | private def readConfigFile(confDir: String): Map[String, Any] = { 30 | 31 | val jsonFile = Source.fromFile(confDir).mkString 32 | 33 | val json = JSON.parseFull(jsonFile) 34 | 35 | json match { 36 | 37 | case Some(map: Map[String, Any]) => map 38 | // case None => println("Parsing failed") 39 | // case other => println("Unknown data structure: " + other) 40 | } 41 | 42 | } 43 | 44 | /** 45 | * 获取配置文件中的相应的值 46 | * @param key1 定位key 47 | * @param key2 定位key 48 | * @return 返回字符串 49 | */ 50 | def getValue(key1: String, key2: String): String = { 51 | 52 | config.getJSONObject(key1).getString(key2) 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/scala/util/LoggerUtil.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import org.apache.log4j.{BasicConfigurator, Logger} 4 | 5 | /** 6 | * 写Log操作 7 | */ 8 | object LoggerUtil { 9 | 10 | var logger = Logger.getLogger("word2vector") 11 | BasicConfigurator.configure() 12 | // PropertyConfigurator.configure("/home/mlearning/tdt/conf/log4j.properties") 13 | 14 | def exception(e: Exception) = { 15 | 16 | logger.error(e.printStackTrace()) 17 | 18 | } 19 | 20 | def error(msg: String): Unit = { 21 | 22 | logger.error(msg) 23 | } 24 | 25 | def warn(msg: String): Unit = { 26 | 27 | logger.warn(msg) 28 | } 29 | 30 | def info(msg: String): Unit = { 31 | 32 | logger.info(msg) 33 | } 34 | 35 | def debug(msg: String): Unit = { 36 | 37 | logger.debug(msg) 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/scala/util/MySQLUtil.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import java.sql.{Connection, DriverManager, 
PreparedStatement, ResultSet} 4 | 5 | import scala.collection.mutable.ArrayBuffer 6 | import scala.xml.Elem 7 | 8 | /** 9 | * Created by li on 16/7/12. 10 | */ 11 | object MySQLUtil { 12 | 13 | /** 14 | * 读取配置文件中的内容,并建立连接 15 | * 16 | * @param configFile 配置文件 17 | * @return 18 | */ 19 | def getConnect(configFile: Elem): Connection = { 20 | 21 | //写在配置文件中 22 | val url = (configFile \ "mysql" \ "url" ).text 23 | val userName = (configFile \ "mysql" \ "username").text 24 | val password = (configFile \ "mysql" \ "password").text 25 | 26 | //设置驱动 27 | Class.forName("com.mysql.jdbc.Driver") 28 | 29 | //初始化 30 | val conn = DriverManager.getConnection(url, userName, password) 31 | 32 | conn 33 | } 34 | 35 | /** 36 | * 向mysql中写数据 37 | * 38 | * @param configFile 配置文件 39 | * @param data 需要写进数据库里面的数据 40 | * @param sql sql查询语句, 格式(sql = "INSERT INTO quotes (quote, author) VALUES (?, ?)") 41 | */ 42 | def write2Mysql(configFile: Elem, data: Iterator[String], sql: String): Unit ={ 43 | 44 | var conn: Connection = null 45 | var prep: PreparedStatement = null 46 | 47 | try{ 48 | 49 | // 读取配置文件并建立连接 50 | conn = getConnect(configFile) 51 | 52 | /** 对需要写入的内容(data)的每一行进行操作 */ 53 | data.foreach{ line => { 54 | 55 | val temp = line.split(",") 56 | 57 | /** sql插入语句: */ 58 | prep = conn.prepareStatement(sql) 59 | prep.setString(1, temp(0)) 60 | prep.setString(2, temp(1)) 61 | 62 | prep.executeUpdate() 63 | }} 64 | } catch { 65 | 66 | case e: Exception => println("Mysql Exception") 67 | } finally { 68 | 69 | if(conn != null) { 70 | 71 | conn.close() 72 | } 73 | 74 | if(prep != null) { 75 | 76 | prep.close() 77 | } 78 | } 79 | } 80 | 81 | /** 82 | * 从mysql中读取数据 83 | * 84 | * @param configFile 配置文件 85 | * @param sql mysql查询语句 86 | */ 87 | def readFromMysql(configFile: Elem, sql: String): Array[(String, String)] = { 88 | 89 | var conn: Connection = null 90 | 91 | try { 92 | 93 | // 读取配置文件并建立连接 94 | conn = getConnect(configFile) 95 | 96 | val statement = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_UPDATABLE) 97 | // 通过sql查询语句查询的结果 98 | // val sql = "select symbol, sename from bt_stcode where (EXCHANGE = '001002' or EXCHANGE = '001003') and SETYPE = '101' and CUR = 'CNY' and ISVALID = 1 and LISTSTATUS <> '2'" 99 | val result = statement.executeQuery(sql) 100 | 101 | val stocks = ArrayBuffer[(String, String)]() 102 | while(result.next()) { 103 | 104 | /** todo 对查询的结果进行操作 */ 105 | val stockID = result.getString("symbol") // symbol: row name 106 | val stock = stockID + "," + result.getString("sename") // sename: row name 107 | stocks +=((stockID, stock)) 108 | } 109 | 110 | stocks.toArray 111 | } catch { 112 | 113 | case e: Exception => Array(("error", "error")) 114 | } finally { 115 | 116 | conn.close() 117 | } 118 | } 119 | 120 | } 121 | -------------------------------------------------------------------------------- /src/main/scala/util/RedisUtil.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import redis.clients.jedis.Jedis 4 | 5 | import scala.collection.mutable 6 | import scala.xml.Elem 7 | 8 | /** 9 | * Created by li on 16/7/8. 
10 | */ 11 | object RedisUtil { 12 | 13 | var jedis: Jedis = null 14 | /** 15 | * 初始化 redis 16 | * 17 | * @param configFile 配置文件对应的 xml 对象 18 | * @note rowNum: 10 19 | */ 20 | def initRedis(configFile: Elem): Jedis = { 21 | 22 | val redisIp = (configFile \ "redis" \ "ip").text 23 | val redisPort = (configFile \ "redis" \ "port").text.toInt 24 | val redisDB = (configFile \ "redis" \ "db").text.toInt 25 | val redisAuth = (configFile \ "redis" \ "auth").text 26 | 27 | jedis = new Jedis(redisIp, redisPort) 28 | jedis.auth(redisAuth) 29 | jedis.select(redisDB) 30 | 31 | jedis 32 | } 33 | 34 | /** 35 | * 36 | */ 37 | def readFromRedis: Unit ={ 38 | 39 | } 40 | 41 | /** 42 | * 将结果保存到redis 43 | * 44 | * @param resultData 需要保存的数据, hset格式 45 | * @author LiYu 46 | * @note rowNum: 12 47 | */ 48 | def write2RedisWithHset(resultData: Array[(String, String)], time: String, dataType: String): Unit = { 49 | 50 | val resultDataMap = mutable.HashMap[String, String]() 51 | 52 | resultData.foreach{line => { 53 | resultDataMap.put(line._1, line._2) 54 | }} 55 | 56 | val pipeline = jedis.pipelined() 57 | 58 | resultDataMap.toSeq.foreach{ x => { 59 | 60 | pipeline.hset(s"vipstockstatistic_$dataType" + s"_$time", x._1, x._2) 61 | // pipeline.expire("hotwordsrank_test:", 60 * 60 * 12) 62 | }} 63 | 64 | pipeline.sync() 65 | } 66 | 67 | /** 68 | * 将结果保存到redis 69 | * 70 | * @param resultData 需要保存的数据,zet格式 71 | * @author Li Yu 72 | * @note rowNum: 12 73 | */ 74 | def write2RedisWithZset(resultData: Array[(String, String)], time: String, dataType: String, jedis: Jedis): Unit = { 75 | 76 | resultData.foreach{x => { 77 | 78 | jedis.zadd(s"vipstockstatistic_$dataType" + s"_$time", x._2.toDouble, x._1) 79 | }} 80 | 81 | jedis.close() 82 | } 83 | 84 | 85 | } 86 | -------------------------------------------------------------------------------- /src/main/scala/util/TextProcessing.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | 4 | /** 5 | * Created by li on 16/4/11. 
6 | */ 7 | object TextProcessing { 8 | 9 | } 10 | -------------------------------------------------------------------------------- /src/main/scala/util/TimeUtil.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import java.math.BigInteger 4 | import java.text.SimpleDateFormat 5 | import java.util.{Calendar, Date} 6 | 7 | import org.apache.hadoop.hbase.client.Scan 8 | import org.apache.hadoop.hbase.protobuf.ProtobufUtil 9 | import org.apache.hadoop.hbase.protobuf.generated.ClientProtos 10 | import org.apache.hadoop.hbase.util.Base64 11 | 12 | /** 13 | * 格式化时间的工具类 14 | */ 15 | object TimeUtil { 16 | 17 | 18 | /** 19 | * 获取时间戳对应的时间 20 | * @param timeStamp 时间戳 21 | * @return 22 | */ 23 | def getTime(timeStamp: String): String = { 24 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd_HH:mm:ss") 25 | val bigInt: BigInteger = new BigInteger(timeStamp) 26 | val date: String = sdf.format(bigInt) 27 | date 28 | } 29 | 30 | /** 31 | * 获取当前时间,并转换成制定的格式 32 | * @return 33 | */ 34 | def getDay: String = { 35 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd") 36 | val date: String = sdf.format(new Date) 37 | date 38 | } 39 | 40 | /** 41 | * 获取当前小时 42 | * @return 43 | */ 44 | def getCurrentHour: Int = { 45 | val calendar = Calendar.getInstance 46 | calendar.setTime(new Date) 47 | calendar.get(Calendar.HOUR_OF_DAY) 48 | } 49 | 50 | /** 51 | * 获取当前小时的前一个小时 52 | * @return 53 | */ 54 | def getPreHourStr: String = { 55 | val date = new Date(new Date().getTime - 60 * 60 * 1000) 56 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd-HH") 57 | sdf.format(date) 58 | } 59 | 60 | /** 61 | * 获取今天的日期 62 | * 63 | * @return 64 | */ 65 | def getNowDate(): String = { 66 | val now: Date = new Date() 67 | val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd") 68 | val res = dateFormat.format(now) 69 | res 70 | } 71 | 72 | 73 | /** 74 | * 获取本周的开始时间 75 | */ 76 | def Null(){ 77 | 78 | } 79 | 80 | /** 81 | * 获取本月的开始时间 82 | * http://blog.csdn.net/springlustre/article/details/47273353 83 | */ 84 | 85 | 86 | /** 87 | * 设置时间范围 88 | * 89 | * @return 时间范围 90 | * @author 91 | */ 92 | def setTimeRange(): String = { 93 | 94 | val scan = new Scan() 95 | val date = new Date(new Date().getTime - 30 * 24 * 60 * 60 * 1000) 96 | val format = new SimpleDateFormat("yyyy-MM-dd HH") 97 | val time = format.format(date) 98 | val time1 = format.format(new Date().getTime) 99 | val startTime = time + "-00-00" 100 | val stopTime = time1 + "-00-00" 101 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss") 102 | val startRow: Long = sdf.parse(startTime).getTime 103 | val stopRow: Long = sdf.parse(stopTime).getTime 104 | 105 | scan.setTimeRange(startRow, stopRow) 106 | val proto: ClientProtos.Scan = ProtobufUtil.toScan(scan) 107 | 108 | Base64.encodeBytes(proto.toByteArray) 109 | } 110 | 111 | /** 112 | * 设置指定的时间范围(一天) 113 | * @param time 指定的日期 114 | * @return 指定日期至前一天时间范围 115 | */ 116 | def setAssignedTimeRange(time: String): String = { 117 | 118 | val format = new SimpleDateFormat("yyyy-MM-dd") 119 | 120 | val date = format.parse(time) 121 | 122 | val endTime = new Date(date.getTime - 24 * 60 * 60 * 1000) 123 | 124 | val stopTime = format.format(endTime) 125 | 126 | val startDate = time + "-00-00-00" 127 | val stopDate = stopTime + "-00-00-00" 128 | 129 | val sdf = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss") 130 | val startRaw = sdf.parse(startDate).getTime 131 | val stopRaw = sdf.parse(stopDate).getTime 132 | 133 | val 
scan = new Scan() 134 | scan.setTimeRange(startRaw, stopRaw) 135 | 136 | val proto = ProtobufUtil.toScan(scan) 137 | 138 | Base64.encodeBytes(proto.toByteArray) 139 | } 140 | 141 | 142 | } 143 | -------------------------------------------------------------------------------- /src/main/scala/util/UrlCategoryTrim.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import java.io.{BufferedWriter, File, FileWriter} 4 | 5 | import org.apache.spark.{SparkConf, SparkContext} 6 | 7 | import scala.collection.mutable.ListBuffer 8 | import scala.io.Source 9 | 10 | /** 11 | * Created by li on 16/4/6. 12 | * 所输入的数据中有的一个url会对应多个catagory,将具有相同URL的catagory单独分隔开,变成一一对应的值 13 | */ 14 | object UrlCategoryTrim { 15 | 16 | // 判断如果catagory中有多个的将其分开并与url对应 17 | def splitCategory(tuple:(String,String)): ListBuffer[(String)] = { 18 | 19 | val listBuffer = new ListBuffer[(String)] 20 | val cata = tuple._1.split(",") 21 | 22 | if(cata.length < 1){ 23 | 24 | listBuffer.+=(tuple._2 + "\t" + tuple._1) 25 | } else { 26 | 27 | for(item <- cata){ 28 | 29 | listBuffer.+=(tuple._2+ "\t" +item) 30 | } 31 | } 32 | listBuffer 33 | } 34 | 35 | def main(args: Array[String]) { 36 | val conf = new SparkConf().setAppName("urlCatagoryTrim").setMaster("local") 37 | val sc = new SparkContext(conf) 38 | 39 | 40 | val data = Source.fromFile("/Users/li/Downloads/trainingLabel(0).new").getLines().toArray.map{ 41 | line => 42 | val tmp = line.split("\t") 43 | (tmp(1), tmp(0)) 44 | 45 | } 46 | 47 | // data.flatMap(splitCatagory).foreach(println) 48 | // 保存到文件中 49 | val dataFile = new File("/users/li/Downloads/trainglabel3.txt") 50 | val fileWriter = new FileWriter(dataFile) 51 | val bufferWriter = new BufferedWriter(fileWriter) 52 | 53 | data.flatMap(x => splitCategory(x)).foreach ( 54 | line => 55 | bufferWriter.write(line + "\n") 56 | ) 57 | 58 | bufferWriter.flush() 59 | bufferWriter.close() 60 | 61 | } 62 | 63 | 64 | 65 | } 66 | 67 | 68 | -------------------------------------------------------------------------------- /src/main/scala/util/XMLUtil.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import scala.xml.{XML, Elem} 4 | 5 | /** 6 | * Created by li on 16/8/29. 7 | */ 8 | object XMLUtil { 9 | 10 | /** 11 | * 获取xml格式的配置文件 12 | * 13 | * @param dir 配置文件所在的文件目录 14 | * @return 15 | */ 16 | def readConfigFile(dir: String): Elem = { 17 | 18 | val configFile = XML.loadFile(dir) 19 | 20 | configFile 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/util/regularExpression.scala: -------------------------------------------------------------------------------- 1 | package util 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | 5 | import scala.collection.mutable.ListBuffer 6 | import scala.util.matching.Regex 7 | 8 | /** 9 | * Created by li on 16/6/20. 10 | * 正则表达式,读取文本中所有双引号里面的内容. 
11 | */ 12 | object regularExpression extends App{ 13 | val conf = new SparkConf().setMaster("local").setAppName("regularexpression") 14 | val sc = new SparkContext(conf) 15 | 16 | val data = sc.textFile("file:/Users/li/kunyan/111.txt") 17 | 18 | def quotationMatch(sentence:String): Array[String] = { 19 | 20 | // val regex = new Regex("\"([^\"]*)\"") //匹配双引号 21 | // val regex = new Regex("(?<=\").{1,}(?=\")") //匹配双引号 22 | val regex = new Regex("([-+]?\\d+(\\.\\d+)?%)|[-+]?\\d+(\\.\\d+)?")//匹配正(负)整数\浮点数\含有百分号的数 23 | 24 | // val regex = "\"([^\"]*)\"".r 25 | val num = regex.findAllIn(sentence) 26 | val res = new ListBuffer[String] 27 | while(num.hasNext){ 28 | val item = num.next() 29 | res += item.replaceAll("\"", "") 30 | } 31 | res.toArray 32 | } 33 | 34 | // val res = quotationMatch(data) 35 | data.foreach { 36 | 37 | x =>{ 38 | val res = quotationMatch(x) 39 | res.foreach(println) 40 | } 41 | } 42 | 43 | 44 | 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/wordSegmentation/AnsjAnalyzer.scala: -------------------------------------------------------------------------------- 1 | package wordSegmentation 2 | 3 | import org.ansj.domain.Term 4 | import org.ansj.library.UserDefineLibrary 5 | import org.ansj.splitWord.analysis.{NlpAnalysis, ToAnalysis} 6 | import org.apache.spark.SparkContext 7 | import org.nlpcn.commons.lang.tire.domain.Value 8 | import org.nlpcn.commons.lang.tire.library.Library 9 | 10 | /** 11 | * Created by zhangxin on 2016/3/8 12 | * 基于ansj的分词工具 13 | */ 14 | object AnsjAnalyzer { 15 | 16 | /** 17 | * ansj分词器初始化, 添加用户词典 18 | * 19 | * @param sc spark程序入口 20 | * @param userDic 用户词典数组 21 | * @return 无 22 | * @author zhangxin 23 | */ 24 | def init(sc: SparkContext, userDic: Array[String]): Unit = { 25 | 26 | val forest = Library.makeForest("library/default.dic") 27 | // val forest = new Forest() 28 | 29 | if(userDic != null ){ 30 | userDic.foreach(addUserDic(_, sc)) 31 | } 32 | 33 | } 34 | 35 | /** 36 | * 添加用户词典到分词器 37 | * 38 | * @param dicPath 词典路径 39 | * @param sc spark程序入口 40 | * @return 无 41 | * @author zhangxin 42 | */ 43 | def addUserDic(dicPath: String, sc: SparkContext): Unit = { 44 | 45 | //读取词典 46 | val dic = sc.textFile(dicPath).collect() 47 | 48 | //添加到ansj中 49 | dic.foreach(UserDefineLibrary.insertWord(_, "userDefine", 100)) 50 | 51 | 52 | } 53 | 54 | /** 55 | * 标准分词 ,无词性标注 56 | * 57 | * @param sentence 待分词语句 58 | * @return 分词结果 59 | * @author zhangxin 60 | */ 61 | def cutNoTag(sentence: String): Array[String] = { 62 | 63 | // 添加词典,这边有问题,还需继续研究 64 | val value = new Value("济南 \tn") 65 | 66 | Library.insertWord(UserDefineLibrary.ambiguityForest, value) 67 | 68 | //切词 69 | val sent = ToAnalysis.parse(sentence) 70 | 71 | //提取分词结果,过滤词性 72 | val words = for(i <- Range(0, sent.size())) yield sent.get(i).getName 73 | 74 | words.toArray 75 | } 76 | 77 | /** 78 | * 自然语言分词,带词性标注 79 | * 80 | * @param sentence 待分词句子 81 | * @return 分词结果 82 | * @author zhangxin 83 | */ 84 | def cutWithTag(sentence: String): Array[Term] = { 85 | 86 | // 切词 87 | val sent = NlpAnalysis.parse(sentence) 88 | 89 | // 提取分词结果 90 | val words = for(i <- Range(0, sent.size())) yield sent.get(i).next() 91 | 92 | words.toArray 93 | } 94 | 95 | 96 | /** 97 | * 标准分词 ,无词性标注 98 | * 99 | * @param sentence 待分词语句 100 | * @return 分词结果 101 | */ 102 | def cutTag(sentence: String, option: Int): Array[String] = { 103 | 104 | val value = new Value("济南\tn") 105 | 106 | Library.insertWord(UserDefineLibrary.ambiguityForest, value) 107 | 108 | //切词 109 | val 
sent = ToAnalysis.parse(sentence) 110 | 111 | option match { 112 | case 0 => { 113 | 114 | //提取分词结果,过滤词性 115 | val words = for(i <- Range(0, sent.size())) yield sent.get(i).getName 116 | 117 | words.toArray 118 | } 119 | 120 | case 1 => { 121 | 122 | // 提取分词结果 123 | val words = for(i <- Range(0, sent.size())) yield sent.get(i).getName 124 | 125 | words.toArray 126 | } 127 | } 128 | } 129 | 130 | } 131 | -------------------------------------------------------------------------------- /src/main/scala/wordSegmentation/wordSegmentAnalyser.scala: -------------------------------------------------------------------------------- 1 | package wordSegmentation 2 | 3 | 4 | /** 5 | * Created by li on 16/8/29. 6 | * 调用ansj分词系统 7 | */ 8 | object wordSegmentAnalyser { 9 | 10 | val content = "我是中国人,我经济南下车到广州。中国经济南下势头迅猛!" 11 | 12 | def sentenceSegment(content: String): Array[Array[String]] = { 13 | 14 | // 文章切分为句子 15 | val sentenceArr = content.split(",|。|\t|\n|,|:") 16 | // 句子分词 17 | val segResult = sentenceArr.map(AnsjAnalyzer.cutNoTag) 18 | 19 | segResult.foreach(x => { 20 | 21 | x.foreach(x => print(x + "| ")) 22 | println() 23 | }) 24 | 25 | segResult 26 | } 27 | 28 | 29 | // def isElem(sentence: Array[String], candidate: Array[String]): Boolean = { 30 | // 31 | // candidate.map{ line => { 32 | // 33 | // if(sentence.contains(line)) { 34 | // 35 | // return true 36 | // 37 | // } else { 38 | // 39 | // return false 40 | // } 41 | // }} 42 | // 43 | // } 44 | // 45 | // def identify(sentenceSeg: Array[Array[String]], 46 | // candidateDic: (String, Array[String])): Array[(Array[String], Array[String])] = { 47 | // 48 | // sentenceSeg.map{line => { 49 | // if (isElem(line, candidateDic._2)){ 50 | // 51 | // (line, candidateDic._1) 52 | // } else { 53 | // (line, "0") 54 | // } 55 | // }} 56 | // 57 | // } 58 | 59 | 60 | def main(args: Array[String]) { 61 | 62 | //每个句子分词 63 | 64 | sentenceSegment(content) 65 | 66 | //匹配窗口设定 67 | 68 | 69 | //名词提出 70 | 71 | 72 | 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /src/test/resources/2016-07-11-15.txt: -------------------------------------------------------------------------------- 1 | 好男儿 2.313289243522607 2 | 太正宵 0.7779809171400112 3 | 婚纱 0.7515285506073754 4 | 俞思远 0.6920085439132682 5 | 董文华 0.6858591525761419 6 | 直播 0.5917747547425979 7 | 六合彩 0.5647028809538401 8 | 李宇春 0.5534632615609104 9 | 男同志 0.43542120073545265 10 | 演唱会 0.415335092651389 11 | 无耻 0.4137490483483452 12 | 敢死队 0.2982491500059149 -------------------------------------------------------------------------------- /src/test/resources/2016-07-12-13.txt: -------------------------------------------------------------------------------- 1 | t1 1.0564168992636667 2 | t3 0.9591311372616367 3 | t2 0.6064584948059578 4 | 敢死队 0.03318315879765851 5 | 好男儿 0.028594324469757446 6 | -------------------------------------------------------------------------------- /src/test/resources/2016-07-12-15.txt: -------------------------------------------------------------------------------- 1 | 好男儿 0.313289243522607 2 | 太正宵 1.7779809171400112 3 | 婚纱 0.7515285506073754 4 | 俞思远 2.6920085439132682 5 | 董文华 0.6858591525761419 6 | 直播 0.5917747547425979 7 | 六合彩 0.5647028809538401 8 | 李宇春 0.5534632615609104 9 | 男同志 0.43542120073545265 10 | 演唱会 0.415335092651389 11 | 无耻 0.4137490483483452 12 | 敢死队 0.2982491500059149 -------------------------------------------------------------------------------- /src/test/resources/2016-07-12-16.txt: 
-------------------------------------------------------------------------------- 1 | t3 1.0564168992636667 2 | t1 0.9591311372616367 3 | t2 0.6064584948059578 4 | 敢死队 0.03318315879765851 5 | 好男儿 0.028594324469757446 6 | -------------------------------------------------------------------------------- /src/test/resources/text/1.txt: -------------------------------------------------------------------------------- 1 | 光伏,中国人民银行,列,入,绿色,债券,支援,专案,目录,2015年12月22日,19:00:00,中国人民银行,发布,2015,第39,号,公告,公告,称为,加快,建设生态文明,引导,金融机构,服务,绿色发展,推动,经济结构转型,升级,经济发展方式转变,支援,金融机构,发行,绿色,金融债券,募集资金,支援,绿色,产业发展,笔者,目录,第5,项,清洁能源,发电,中,风力发电,光伏发电,智慧,电网,能源,因特网,分布式能源,太阳能热利用,水力发电,新能源,利用,列,入,太阳能光伏发电站,太阳能,高,温热,发电站,不含,分布式,太阳能光伏发电,系统,需,限定,条件,多晶硅,电池,组件,光电,转化,效率,≥,15.5%,组件,专案,投产,运行,日,一年,衰减率,≤,2.5%,年,衰减率,≤,0.7%,单晶硅,电池,组件,光电,转化,效率,≥,16%,组件,专案,投产,运行,日,一年,衰减率,≤,3%,年,衰减率,≤,0.7%,高,倍,聚光光伏,组件,光电,转化,效率,≥,28%,项目,投产,运行,日,一年,衰减率,≤,2%,年,衰减率,≤,0.5%,项目全生命周期,衰减率,≤,10%,硅基,薄膜电池,组件,光电,转化,效率,≥,8%,铜铟镓硒,CIGS,薄膜电池,组件,光电,转化,效率,≥,11%,碲化镉,CdTe,薄膜电池,组件,光电,转化,效率,≥,11%,薄膜电池,组件,光电,转化,效率,≥,10%,多晶硅,单晶硅,薄膜电池,项目全生命周期,衰减率,≤,20%,智能电网,能源,因特网,指,提高,供,需,负荷,平衡,回应,能力,改善,电网,综合,能效,降低,输变电,损耗,增强,可再生能源,接,入,能力,电网建设,运营,技术,升级,改造,专案,1.,智能电网,指,采用,智慧,型,电气设备,即时,双向,集成,通信技术,先进技术,电网建设,运营,专案,电网,智慧,化,升级,改造,项目,2.,能源,因特网,指,综合,电力电子,资讯,智慧,管理技术,连接,分布式能源,含,分布式,可再生能源,分布式,储能,装置,类型,负荷,能量,双向,流动,交换,共享,电网,微电网,能源,燃气,网络,设施,建设,运营,专案,分布式能源,指,区域,能源站,包括,天然气,区域,能源站,分布式光伏发电,系统,分布式能源,设施,建设,运营,分布式能源,接,入,峰谷,调节,系统,分布式,电力,交易平台,能源管理系统,建设,运营,附,中国人民银行公告,2015,第39,号,绿色,债券,支援,专案,目录 -------------------------------------------------------------------------------- /src/test/resources/text/2.txt: -------------------------------------------------------------------------------- 1 | 记者,国家电网公司,获悉,9月23日,河北丰宁,二期,山东文登,重庆,蟠龙,抽水蓄能电站,工程,以下简称,丰宁,二期,文登,蟠龙,抽,蓄,座,抽,蓄,电站,正式,开工,总投资,244.4亿,元,总装机容量,480万,千瓦,计划,2022年,竣工,投产,项目,预计,增加,发电,装备制造业,产值,111亿,元,推动,相关,装备制造业,发展,开工,动员大会,国家电网公司,董事长,党组书记,刘振亚,丰宁,二期,文登,蟠龙,抽,蓄,国家电网公司,推进,特高压电网,建设,服务,清洁能源,发展,重大工程,继,2015年6月,安徽金寨,山东沂蒙,河南,天池,座,抽水蓄能电站,第二批,开工,电站,标志,我国,抽水蓄能电站,加快,发展,新,阶段,介绍,河北丰宁,二期,抽水蓄能电站,项目,位于,河北省承德市,丰宁县,装机容量,180万,千瓦,安装,台,30万,千瓦,可逆,式,水轮发电机组,500,千伏,电压,接,入,华北电网,工程投资,87.5亿,元,丰宁抽水蓄能电站,一期,二期,装机容量,360万,千瓦,世界上,装机容量,抽水蓄能电站,山东,文登抽水蓄能电站,位于,山东省,威海市文登区,装机容量,180万,千瓦,安装,台,30万,千瓦,可逆,式,水轮发电机组,500,千伏,电压,接,入,山东电网,工程投资,85.7亿,元,重庆,蟠龙,抽水蓄能电站,位于,重庆市綦江区,装机容量,120万,千瓦,安装,台,30万,千瓦,可逆,式,水轮发电机组,500,千伏,电压,接,入,重庆电网,工程投资,71.2亿,元,国网,座,受,端,电网,地区,抽水蓄能电站,建成,更好地,接纳,区,外,来电,优化,电源,结构,提高,北,西南,地区,清洁能源,消纳,能力,提高,特高压电网,系统安全,可靠性,综合,煤电,机组,消纳,清洁能源,效果,建设,丰宁,二期,文登,蟠龙,抽,蓄,年,节约,原煤,消耗,291万,吨,减排,烟尘,0.3万,吨,二氧化硫,1.4万,吨,氮氧化物,1.3万,吨,二氧化碳,485万,吨,节能减排,大气污染防治,国家电网公司,经营,区域,内在,运,抽水蓄能电站,装机容量,1674.5万,千瓦,建,规模,1880万,千瓦,预计,2017年,我国,抽水蓄能,装机,3300万,千瓦,超过,美国,世界上,抽水蓄能电站,第一,大国 -------------------------------------------------------------------------------- /src/test/resources/text/abstract: -------------------------------------------------------------------------------- 1 | 算法可大致分为基本算法、数据结构的算法、数论算法、计算几何的算法、图的算法、动态规划以及数值分析、加密算法、排序算法、检索算法、随机化算法、并行算法、厄米变形模型、随机森林算法。 2 | 算法可以宽泛的分为三类, 3 | 一,有限的确定性算法,这类算法在有限的一段时间内终止。他们可能要花很长时间来执行指定的任务,但仍将在一定的时间内终止。这类算法得出的结果常取决于输入值。 4 | 二,有限的非确定算法,这类算法在有限的时间内终止。然而,对于一个(或一些)给定的数值,算法的结果并不是唯一的或确定的。 5 | 三,无限的算法,是那些由于没有定义终止定义条件,或定义的条件无法由输入的数据满足而不终止运行的算法。通常,无限算法的产生是由于未能确定的定义终止条件。 -------------------------------------------------------------------------------- /src/test/scala/CNNTest.scala: -------------------------------------------------------------------------------- 1 | import breeze.linalg.{DenseMatrix, DenseVector} 2 | 3 | //import breeze.linalg.{CSCMatrix => BSM, DenseMatrix => BDM, DenseVector => BDV, Matrix => BM, SparseVector => BSV, Vector => BV, 
accumulate => Accumulate, axpy => brzAxpy, rot90 => Rot90, sum => Bsum, svd => brzSvd, DenseVector} 4 | //import breeze.numerics.{exp => Bexp, tanh => Btanh} 5 | //import org.apache.spark.mllib.linalg.DenseMatrix 6 | 7 | 8 | /** 9 | * Created by li on 16/8/15. 10 | */ 11 | object CNNTest { 12 | 13 | 14 | def main(args: Array[String]) { 15 | // 16 | // def sigm(matrix: BDM[Double]): BDM[Double] = { 17 | // val s1 = 1.0 / (Bexp(matrix * (-1.0)) + 1.0) 18 | // s1 19 | // } 20 | // 21 | // val result = BDM.ones[Double](2, 3) + 1.8 22 | 23 | 24 | 25 | val a = DenseVector(1.0, 2.0, 3.0, 4.0, 5.0) 26 | 27 | val b = DenseVector(1.0, 2.0, 3.0, 4.0, 5.0) 28 | 29 | val c = DenseMatrix.ones[Double](5, 2) 30 | 31 | val d = DenseMatrix.ones[Double](5, 5) 32 | 33 | println((a.toDenseMatrix :* d)) 34 | 35 | 36 | // val c = (a :* b) :* d 37 | // 38 | // println(c) 39 | 40 | 41 | 42 | } 43 | 44 | } 45 | -------------------------------------------------------------------------------- /src/test/scala/HDFSUtilTest.scala: -------------------------------------------------------------------------------- 1 | import util.HDFSUtil 2 | 3 | import scala.xml.XML 4 | 5 | /** 6 | * Created by li on 16/7/25. 7 | */ 8 | object HDFSUtilTest { 9 | 10 | def main(args: Array[String]) { 11 | 12 | val configFile = XML.loadFile("/Users/li/Kunyan/NaturalLanguageProcessing/src/main/scala/util/config.xml") 13 | 14 | val filesystem = HDFSUtil.setHdfsConfigure(configFile) 15 | 16 | } 17 | 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/test/scala/JSONUtilTest.scala: -------------------------------------------------------------------------------- 1 | import util.JSONUtil 2 | 3 | /** 4 | * Created by li on 16/8/29. 5 | */ 6 | object JSONUtilTest { 7 | 8 | 9 | def main(args: Array[String]) { 10 | 11 | val confDir = "/Users/li/Kunyan/NaturalLanguageProcessing/src/main/resources/jsonConfig.json" 12 | 13 | JSONUtil.initConfig(confDir) 14 | 15 | val res = JSONUtil.getValue("hbase", "rootDir") 16 | 17 | println(res) 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/test/scala/MySQLUtilTest.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.{SparkContext, SparkConf} 2 | import util.{XMLUtil, MySQLUtil} 3 | 4 | /** 5 | * Created by li on 16/8/29. 
6 | */ 7 | object MySQLUtilTest { 8 | 9 | def main(args: Array[String]) { 10 | val conf = new SparkConf().setAppName("MySQLUtilTest").setMaster("local") 11 | val sc = new SparkContext(conf) 12 | 13 | val confDir = "/Users/li/Kunyan/workShop/VipStockStatistic/src/main/scala/util/config.xml" 14 | 15 | val stockSql = "select symbol, sename from bt_stcode where (EXCHANGE = '001002' or EXCHANGE = '001003') " + 16 | "and SETYPE = '101' and CUR = 'CNY' and ISVALID = 1 and LISTSTATUS <> '2'" 17 | 18 | val configFile = XMLUtil.readConfigFile(confDir) 19 | 20 | val stockDic = MySQLUtil.readFromMysql(configFile, stockSql) 21 | .map(row => (row._1, row._2.split(","))).toMap 22 | 23 | stockDic.foreach(x => print(x._1, x._2(0))) 24 | 25 | 26 | 27 | 28 | 29 | 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/test/scala/Test.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.ml.feature.Word2Vec 2 | import org.apache.spark.sql.SQLContext 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | 5 | 6 | /** 7 | * Created by li on 16/4/15. 8 | * 9 | * 10 | * 11 | * import org.apache.spark.ml.feature.Word2Vec 12 | 13 | */ 14 | object Test { 15 | 16 | def main(args: Array[String]) { 17 | // val setPath = "/Users/li/kunyan/DataSet/trainingsetUnbalance/YSJS.txt" 18 | // val industry = "化工化纤" 19 | // BinaryClassificationRDD.dataOperation(setPath, industry) 20 | val conf = new SparkConf().setAppName("test").setMaster("local") 21 | val sc = new SparkContext(conf) 22 | val sqlContext = new SQLContext(sc) 23 | 24 | 25 | // Input data: Each row is a bag of words from a sentence or document. 26 | val documentDF = sqlContext.createDataFrame(Seq( 27 | "Hi I heard about Spark".split(" "), 28 | "I wish Java could use case classes".split(" "), 29 | "Logistic regression models are neat".split(" ") 30 | ).map(Tuple1.apply)).toDF("text") 31 | 32 | // Learn a mapping from words to Vectors. 33 | val word2Vec = new Word2Vec() 34 | .setInputCol("text") 35 | .setOutputCol("result") 36 | .setVectorSize(3) 37 | .setMinCount(0) 38 | val model = word2Vec.fit(documentDF) 39 | val result = model.transform(documentDF) 40 | result.select("result").foreach(println) 41 | result.show() 42 | 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/test/scala/TextRankTest.scala: -------------------------------------------------------------------------------- 1 | import meachinelearning.textrank.{PropertyExtractor, ConstructTextGraph} 2 | import org.graphstream.graph.Node 3 | 4 | import scala.collection.mutable.ListBuffer 5 | import scala.io.Source 6 | 7 | /** 8 | * Created by li on 16/6/23. 
9 | */ 10 | object TextRankTest { 11 | 12 | def main(args: Array[String]) { 13 | 14 | val doc = new ListBuffer[(String)] 15 | 16 | val text = Source.fromURL(getClass.getResource(s"/text/${2}.txt")).getLines().mkString("\n") 17 | text.split(",").foreach(x => doc.+=(x)) 18 | 19 | 20 | // 构建候选关键词图, 设置窗口大小5 21 | val textGraph = new ConstructTextGraph("url", 10, doc.toList).constructGraph 22 | 23 | // 输出构建的无向图的边和顶点 24 | // textGraph.getEdgeSet.toArray.foreach(println) 25 | // textGraph.getNodeSet.toArray.foreach(println) 26 | // assert(textGraph.getEdgeSet.size() > 0) 27 | println((1 to 30).map(i => "=").mkString) 28 | 29 | // 输出提取的关键词 30 | val keywordExtractor = new PropertyExtractor(textGraph, 5) 31 | keywordExtractor.extractKeywords(100, 0.85f).foreach( 32 | node => 33 | println(" 关键词: "+node._1," 得分: "+node._2) 34 | ) 35 | println((1 to 30).map(i => "=").mkString) 36 | 37 | // 获取每个关键词节点的度 38 | textGraph.getNodeSet.toArray.map(_.asInstanceOf[Node]).foreach { 39 | node => 40 | println (node.getId, node.getDegree) 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/test/scala/classification.scala: -------------------------------------------------------------------------------- 1 | import java.io.{BufferedWriter, File, FileWriter} 2 | 3 | import org.apache.spark.{SparkContext, SparkConf} 4 | import org.apache.spark.rdd.RDD 5 | 6 | 7 | /** 8 | * Created by li on 16/3/31. 9 | */ 10 | object classification { 11 | 12 | val conf = new SparkConf().setAppName("meachinelearning/classification").setMaster("local") 13 | val sc = new SparkContext(conf) 14 | 15 | 16 | def getFile(url: String): RDD[(String, String)] ={ 17 | val content = sc.textFile(url).map{ 18 | line => 19 | val data = line.split("\t") 20 | if (data.length > 1) data(0) -> data(1) 21 | }.filter( _ != ()).map(_.asInstanceOf[(String, String)]) 22 | content 23 | } 24 | 25 | 26 | def getTrainingset(catagory: RDD[(String, String)], content: RDD[(String, String)], label: String, dataFile: String): Unit ={ 27 | // val trainingSet = new ArrayBuffer[String ] 28 | val DataFile = new File(dataFile) 29 | val bufferWriter = new BufferedWriter(new FileWriter(DataFile)) 30 | content.map { 31 | line => 32 | catagory.map{ 33 | data => 34 | bufferWriter.write((if(label == line._1) "1" else "0") + "\t" + line._1 + "\t"+ line._2 + "\n") 35 | // val trainingdata = (if(catagory == line._1) "1" else "0") + "\t" + line._1 + "\t"+ line._2 36 | // trainingSet += trainingdata 37 | } 38 | } 39 | bufferWriter.flush() 40 | bufferWriter.close() 41 | 42 | } 43 | 44 | 45 | 46 | // val DataFile = new File(dataFile) 47 | // val bufferWriter = new BufferedWriter(new FileWriter(DataFile)) 48 | // for(item <- list) { 49 | // val cata = map.get(item._1).get 50 | // bufferWriter.write((if(cata == catagory) "1" else "0") + "\t" + cata + "\t"+ item._2 + "\n") 51 | // } 52 | // bufferWriter.flush() 53 | // bufferWriter.close() 54 | // } 55 | 56 | def main(args: Array[String]) { 57 | 58 | // val urlContent = new collection.mutable.HashMap[String , String ] 59 | // val urlCatagory = new ListBuffer[(String, String)] 60 | val catagory1 = "有色金属" 61 | val datafile1 = "/users/li/Downloads/2222.txt" 62 | 63 | val url1 = "/users/li/Downloads/segTraining" 64 | val url2 = "/users/li/Downloads/traininglabel" 65 | 66 | val urlContent = getFile(url1) 67 | val urlCatagory = getFile(url2) 68 | 69 | val res = getTrainingset(urlCatagory, urlContent, catagory1, datafile1) 70 | 71 | } 72 | 73 | 74 | 75 | 76 | 77 | } 78 | 
-------------------------------------------------------------------------------- /src/test/scala/keywordExtractorTest.scala: -------------------------------------------------------------------------------- 1 | 2 | 3 | /** 4 | * Created by li on 16/6/27. 5 | */ 6 | object keywordExtractorTest { 7 | 8 | 9 | def main(args: Array[String]) { 10 | 11 | 12 | val url = "http://anotherbug.blog.chinajavaworld.com/entry/4545/0/" 13 | 14 | println(url.contains("blog")) 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /src/test/scala/telecomDataProcessingTest.scala: -------------------------------------------------------------------------------- 1 | //import org.apache.spark.{SparkConf, SparkContext} 2 | // 3 | //import scala.collection.mutable 4 | // 5 | ///** 6 | // * Created by li on 16/7/20. 7 | // */ 8 | //object TelecomDataProcessingTest { 9 | // 10 | // 11 | // def main(args: Array[String]) { 12 | // 13 | // val conf = new SparkConf().setAppName("test").setMaster("local") 14 | // val sc = new SparkContext(conf) 15 | // 16 | // val setTime = "2016-07-15" 17 | // 18 | // //设置时间段,一小时为一个间隔 19 | // val timeRangeHour = TelecomDataProcessing.setAssignedHourRange(setTime) 20 | // 21 | // // Hdfs上的数据,一天的数据 22 | // val dir = "hdfs://222.73.57.12:9000/telecom/shdx/origin/data/" 23 | // val dataFromHDFS = TelecomDataProcessing.dataReadFromHDFS(sc, dir, setTime).filter(! _._1.contains("home/telecom")) 24 | // 25 | // println("dataFromHDFS结束") 26 | // // dataFromHDFS.foreach(println) 27 | // 28 | // // hbase上的数据 29 | // val confDir = "/Users/li/kunyan/NaturalLanguageProcessing/src/main/scala/util/config.xml" // hbase配置文件目录 30 | // val tableName = "wk_detail" // 表名 31 | // 32 | // val result = new mutable.ArrayBuffer[(String, Array[(String, Long)])] 33 | // 34 | // for (item <- 0 until 1) { 35 | // 36 | // val temp = dataFromHDFS.filter { line => { 37 | // 38 | // (timeRangeHour(item)._1 <= line._1.toLong) && (line._1.toLong <= timeRangeHour(item)._2) 39 | // 40 | // }}.map(_._2) 41 | // 42 | // println("temp读取结束") 43 | // 44 | // temp.foreach(println) 45 | // 46 | // val hBaseConf = TelecomDataProcessing.getHBaseConf(sc, confDir, timeRangeHour(item), tableName) 47 | // 48 | // val newsFromHBase = TelecomDataProcessing.newsReadFromHBase(hBaseConf) 49 | // 50 | // newsFromHBase.foreach(println) 51 | // 52 | // val res = TelecomDataProcessing.urlMatching(temp, newsFromHBase) 53 | // 54 | // result.+=((item.toString, res)) 55 | // 56 | // } 57 | // 58 | // result.toArray.foreach( x => { 59 | // println(x._1) 60 | // x._2.foreach(x => println((x._1, x._2))) 61 | // }) 62 | // 63 | // 64 | // sc.stop() 65 | // 66 | // } 67 | // 68 | //} 69 | -------------------------------------------------------------------------------- /src/test/scala/testRankTest.scala: -------------------------------------------------------------------------------- 1 | 2 | 3 | import meachinelearning.textrank.TextRank 4 | 5 | import scala.collection.mutable.ListBuffer 6 | import scala.io.Source 7 | 8 | /** 9 | * Created by li on 16/6/24. 
10 | */ 11 | object testRankTest { 12 | 13 | def main(args: Array[String]) { 14 | 15 | val doc = new ListBuffer[(String)] 16 | 17 | val text = Source.fromURL(getClass.getResource(s"/text/${2}.txt")).getLines().mkString("\n") 18 | text.split(",").foreach(x => doc.+=(x)) 19 | 20 | val keyWordList = TextRank.run("url", 5, doc.toList, 3, 100, 0.85f) 21 | 22 | keyWordList.foreach { 23 | word => { 24 | println(word._1, word._2) 25 | } 26 | } 27 | } 28 | 29 | } 30 | -------------------------------------------------------------------------------- /src/test/scala/timeutilTest.scala: -------------------------------------------------------------------------------- 1 | import util.TimeUtil 2 | 3 | /** 4 | * Created by li on 16/7/19. 5 | */ 6 | object TimeUtilTest { 7 | 8 | def main(args: Array[String]) { 9 | 10 | TimeUtil.setAssignedTimeRange("2016-2-1") 11 | 12 | } 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/test/scala/word2vecTest.scala: -------------------------------------------------------------------------------- 1 | import meachinelearning.word2vec.Word2Vec 2 | import org.apache.spark.{SparkConf, SparkContext} 3 | 4 | /** 5 | * Created by li on 16/7/15. 6 | */ 7 | object word2vecTest { 8 | 9 | 10 | def main(args: Array[String]) { 11 | 12 | 13 | val conf = new SparkConf().setAppName("word2vec").setMaster("local") 14 | val sc = new SparkContext(conf) 15 | 16 | val data = sc.parallelize(List("sadfad\tsdfasdfasdf\tasdfasdfasdfasdfasdf\t中欧,8,美国,成都,;,", "dddddd\tfdasdfvvv\tdfafasfdsadfs\t日本,中欧,.,中国,加州,/,顺分")) 17 | 18 | val punctuation = sc.textFile("/Users/li/kunyan/DataSet/punctuations.txt").collect() 19 | 20 | val s = Word2Vec.formatTransform(data, punctuation) 21 | 22 | s.foreach(println) 23 | 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /target/.history: -------------------------------------------------------------------------------- 1 | all 2 | help sbt 3 | help clean 4 | help clear 5 | exit 6 | -------------------------------------------------------------------------------- /target/resolution-cache/default/classification$sbt_2.10/1.0/resolved.xml.properties: -------------------------------------------------------------------------------- 1 | #default#classification$sbt_2.10;1.0 resolved revisions 2 | #Tue Jul 05 15:26:43 CST 2016 3 | +revision\:\#@\#\:+5.0.4\:\#@\#\:+module\:\#@\#\:+asm-tree\:\#@\#\:+organisation\:\#@\#\:+org.ow2.asm\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=5.0.4 release 5.0.4 null 4 | +revision\:\#@\#\:+1.9.6\:\#@\#\:+module\:\#@\#\:+ant\:\#@\#\:+organisation\:\#@\#\:+org.apache.ant\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.9.6 release 1.9.6 null 5 | +revision\:\#@\#\:+3.0.20\:\#@\#\:+module\:\#@\#\:+plexus-utils\:\#@\#\:+organisation\:\#@\#\:+org.codehaus.plexus\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.0.20 release 3.0.20 null 6 | +revision\:\#@\#\:+1.9.6\:\#@\#\:+module\:\#@\#\:+ant-launcher\:\#@\#\:+organisation\:\#@\#\:+org.apache.ant\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.9.6 release 1.9.6 null 7 | +revision\:\#@\#\:+5.0.4\:\#@\#\:+module\:\#@\#\:+asm\:\#@\#\:+organisation\:\#@\#\:+org.ow2.asm\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=5.0.4 release 5.0.4 null 8 | +revision\:\#@\#\:+2.2.1\:\#@\#\:+module\:\#@\#\:+scalactic_2.10\:\#@\#\:+organisation\:\#@\#\:+org.scalactic\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.2.1 release 2.2.1 null 9 | 
+revision\:\#@\#\:+0.13.8\:\#@\#\:+module\:\#@\#\:+sbt\:\#@\#\:+organisation\:\#@\#\:+org.scala-sbt\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.13.8 release 0.13.8 null 10 | +revision\:\#@\#\:+1.0\:\#@\#\:+module\:\#@\#\:+jsr250-api\:\#@\#\:+organisation\:\#@\#\:+javax.annotation\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.0 release 1.0 null 11 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-reflect\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:+info.apiURL\:\#@\#\:+http\://www.scala-lang.org/api/2.10.4/\:\#@\#\:=2.10.4 ? 2.10.4 null 12 | +revision\:\#@\#\:+0.3.0\:\#@\#\:+module\:\#@\#\:+org.eclipse.sisu.plexus\:\#@\#\:+organisation\:\#@\#\:+org.eclipse.sisu\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.3.0 release 0.3.0 null 13 | +revision\:\#@\#\:+1.5.5\:\#@\#\:+module\:\#@\#\:+plexus-component-annotations\:\#@\#\:+organisation\:\#@\#\:+org.codehaus.plexus\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.5 release 1.5.5 null 14 | +revision\:\#@\#\:+0.3.0\:\#@\#\:+module\:\#@\#\:+org.eclipse.sisu.inject\:\#@\#\:+organisation\:\#@\#\:+org.eclipse.sisu\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.3.0 release 0.3.0 null 15 | +revision\:\#@\#\:+3.3.3\:\#@\#\:+module\:\#@\#\:+maven-plugin-api\:\#@\#\:+organisation\:\#@\#\:+org.apache.maven\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.3.3 release 3.3.3 null 16 | +revision\:\#@\#\:+1.0\:\#@\#\:+module\:\#@\#\:+cdi-api\:\#@\#\:+organisation\:\#@\#\:+javax.enterprise\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.0 release 1.0 null 17 | +revision\:\#@\#\:+2.5.2\:\#@\#\:+module\:\#@\#\:+plexus-classworlds\:\#@\#\:+organisation\:\#@\#\:+org.codehaus.plexus\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.5.2 release 2.5.2 null 18 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-library\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:+info.apiURL\:\#@\#\:+http\://www.scala-lang.org/api/2.10.4/\:\#@\#\:=2.10.4 ? 2.10.4 null 19 | +revision\:\#@\#\:+3.3.3\:\#@\#\:+module\:\#@\#\:+maven-model\:\#@\#\:+organisation\:\#@\#\:+org.apache.maven\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.3.3 release 3.3.3 null 20 | +revision\:\#@\#\:+5.0.4\:\#@\#\:+module\:\#@\#\:+asm-commons\:\#@\#\:+organisation\:\#@\#\:+org.ow2.asm\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=5.0.4 release 5.0.4 null 21 | +revision\:\#@\#\:+3.3.3\:\#@\#\:+module\:\#@\#\:+maven-artifact\:\#@\#\:+organisation\:\#@\#\:+org.apache.maven\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.3.3 release 3.3.3 null 22 | +revision\:\#@\#\:+1.6.0\:\#@\#\:+module\:\#@\#\:+jarjar\:\#@\#\:+organisation\:\#@\#\:+org.pantsbuild\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.6.0 release 1.6.0 null 23 | +revision\:\#@\#\:+1\:\#@\#\:+module\:\#@\#\:+javax.inject\:\#@\#\:+organisation\:\#@\#\:+javax.inject\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1 release 1 null 24 | +sbtVersion\:\#@\#\:+0.13\:\#@\#\:+revision\:\#@\#\:+0.14.1\:\#@\#\:+module\:\#@\#\:+sbt-assembly\:\#@\#\:+organisation\:\#@\#\:+com.eed3si9n\:\#@\#\:+scalaVersion\:\#@\#\:+2.10\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.14.1 ? 
0.14.1 null 25 | -------------------------------------------------------------------------------- /target/resolution-cache/default/classification$sbt_2.10/1.0/resolved.xml.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /target/resolution-cache/default/classification_2.10/1.0/resolved.xml.properties: -------------------------------------------------------------------------------- 1 | #default#classification_2.10;1.0 resolved revisions 2 | #Fri Jun 24 11:03:35 CST 2016 3 | +revision\:\#@\#\:+1.5.2\:\#@\#\:+module\:\#@\#\:+spark-graphx_2.10\:\#@\#\:+organisation\:\#@\#\:+org.apache.spark\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.2 release 1.5.2 null 4 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-library\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 ? 2.10.5 null 5 | +revision\:\#@\#\:+2.2.5\:\#@\#\:+module\:\#@\#\:+scalatest_2.10\:\#@\#\:+organisation\:\#@\#\:+org.scalatest\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.2.5 release 2.2.5 null 6 | +revision\:\#@\#\:+3.1.14\:\#@\#\:+module\:\#@\#\:+mysql-connector-java\:\#@\#\:+organisation\:\#@\#\:+mysql\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.1.14 release 3.1.14 null 7 | +revision\:\#@\#\:+2.2.5\:\#@\#\:+module\:\#@\#\:+scalactic_2.10\:\#@\#\:+organisation\:\#@\#\:+org.scalactic\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.2.5 release 2.2.5 null 8 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-compiler\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 release 2.10.4 null 9 | +revision\:\#@\#\:+1.5.2\:\#@\#\:+module\:\#@\#\:+spark-mllib_2.10\:\#@\#\:+organisation\:\#@\#\:+org.apache.spark\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.2 release 1.5.2 null 10 | +revision\:\#@\#\:+1.1.2\:\#@\#\:+module\:\#@\#\:+gs-core\:\#@\#\:+organisation\:\#@\#\:+org.graphstream\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.1.2 release 1.1.2 null 11 | +revision\:\#@\#\:+2.7.1\:\#@\#\:+module\:\#@\#\:+hadoop-common\:\#@\#\:+organisation\:\#@\#\:+org.apache.hadoop\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.7.1 release 2.7.1 null 12 | +revision\:\#@\#\:+1.5.2\:\#@\#\:+module\:\#@\#\:+spark-core_2.10\:\#@\#\:+organisation\:\#@\#\:+org.apache.spark\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.2 release 1.5.2 null 13 | -------------------------------------------------------------------------------- /target/resolution-cache/default/classification_2.10/1.0/resolved.xml.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 9 | 10 | classification 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /target/resolution-cache/default/naturallanguageprocessing$sbt_2.10/1.0/resolved.xml.properties: -------------------------------------------------------------------------------- 1 | #default#naturallanguageprocessing$sbt_2.10;1.0 resolved revisions 2 | #Thu Mar 23 16:16:57 CST 2017 3 | 
+revision\:\#@\#\:+5.0.4\:\#@\#\:+module\:\#@\#\:+asm-tree\:\#@\#\:+organisation\:\#@\#\:+org.ow2.asm\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=5.0.4 release 5.0.4 null 4 | +revision\:\#@\#\:+1.9.6\:\#@\#\:+module\:\#@\#\:+ant\:\#@\#\:+organisation\:\#@\#\:+org.apache.ant\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.9.6 release 1.9.6 null 5 | +revision\:\#@\#\:+3.0.20\:\#@\#\:+module\:\#@\#\:+plexus-utils\:\#@\#\:+organisation\:\#@\#\:+org.codehaus.plexus\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.0.20 release 3.0.20 null 6 | +revision\:\#@\#\:+1.9.6\:\#@\#\:+module\:\#@\#\:+ant-launcher\:\#@\#\:+organisation\:\#@\#\:+org.apache.ant\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.9.6 release 1.9.6 null 7 | +revision\:\#@\#\:+5.0.4\:\#@\#\:+module\:\#@\#\:+asm\:\#@\#\:+organisation\:\#@\#\:+org.ow2.asm\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=5.0.4 release 5.0.4 null 8 | +revision\:\#@\#\:+2.2.1\:\#@\#\:+module\:\#@\#\:+scalactic_2.10\:\#@\#\:+organisation\:\#@\#\:+org.scalactic\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.2.1 release 2.2.1 null 9 | +revision\:\#@\#\:+0.13.8\:\#@\#\:+module\:\#@\#\:+sbt\:\#@\#\:+organisation\:\#@\#\:+org.scala-sbt\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.13.8 release 0.13.8 null 10 | +revision\:\#@\#\:+1.0\:\#@\#\:+module\:\#@\#\:+jsr250-api\:\#@\#\:+organisation\:\#@\#\:+javax.annotation\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.0 release 1.0 null 11 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-reflect\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:+info.apiURL\:\#@\#\:+http\://www.scala-lang.org/api/2.10.4/\:\#@\#\:=2.10.4 ? 2.10.4 null 12 | +revision\:\#@\#\:+0.3.0\:\#@\#\:+module\:\#@\#\:+org.eclipse.sisu.plexus\:\#@\#\:+organisation\:\#@\#\:+org.eclipse.sisu\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.3.0 release 0.3.0 null 13 | +revision\:\#@\#\:+1.5.5\:\#@\#\:+module\:\#@\#\:+plexus-component-annotations\:\#@\#\:+organisation\:\#@\#\:+org.codehaus.plexus\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.5 release 1.5.5 null 14 | +revision\:\#@\#\:+0.3.0\:\#@\#\:+module\:\#@\#\:+org.eclipse.sisu.inject\:\#@\#\:+organisation\:\#@\#\:+org.eclipse.sisu\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.3.0 release 0.3.0 null 15 | +revision\:\#@\#\:+3.3.3\:\#@\#\:+module\:\#@\#\:+maven-plugin-api\:\#@\#\:+organisation\:\#@\#\:+org.apache.maven\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.3.3 release 3.3.3 null 16 | +revision\:\#@\#\:+1.0\:\#@\#\:+module\:\#@\#\:+cdi-api\:\#@\#\:+organisation\:\#@\#\:+javax.enterprise\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.0 release 1.0 null 17 | +revision\:\#@\#\:+2.5.2\:\#@\#\:+module\:\#@\#\:+plexus-classworlds\:\#@\#\:+organisation\:\#@\#\:+org.codehaus.plexus\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.5.2 release 2.5.2 null 18 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-library\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:+info.apiURL\:\#@\#\:+http\://www.scala-lang.org/api/2.10.4/\:\#@\#\:=2.10.4 ? 
19 | +revision\:\#@\#\:+3.3.3\:\#@\#\:+module\:\#@\#\:+maven-model\:\#@\#\:+organisation\:\#@\#\:+org.apache.maven\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.3.3 release 3.3.3 null
20 | +revision\:\#@\#\:+5.0.4\:\#@\#\:+module\:\#@\#\:+asm-commons\:\#@\#\:+organisation\:\#@\#\:+org.ow2.asm\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=5.0.4 release 5.0.4 null
21 | +revision\:\#@\#\:+3.3.3\:\#@\#\:+module\:\#@\#\:+maven-artifact\:\#@\#\:+organisation\:\#@\#\:+org.apache.maven\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.3.3 release 3.3.3 null
22 | +revision\:\#@\#\:+1.6.0\:\#@\#\:+module\:\#@\#\:+jarjar\:\#@\#\:+organisation\:\#@\#\:+org.pantsbuild\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.6.0 release 1.6.0 null
23 | +revision\:\#@\#\:+1\:\#@\#\:+module\:\#@\#\:+javax.inject\:\#@\#\:+organisation\:\#@\#\:+javax.inject\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1 release 1 null
24 | +sbtVersion\:\#@\#\:+0.13\:\#@\#\:+revision\:\#@\#\:+0.14.1\:\#@\#\:+module\:\#@\#\:+sbt-assembly\:\#@\#\:+organisation\:\#@\#\:+com.eed3si9n\:\#@\#\:+scalaVersion\:\#@\#\:+2.10\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.14.1 ? 0.14.1 null
25 |
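Line 24 records the meta-build's plugin resolution: sbt-assembly 0.14.1 from com.eed3si9n, cross-built for sbt 0.13 and Scala 2.10, resolved under sbt 0.13.8. Below is a minimal sketch of the project/plugins.sbt declaration that such a resolution would normally come from; whether the project's actual plugins.sbt contains anything beyond this is not visible in this dump.

// project/plugins.sbt (sketch): the assembly plugin recorded in the resolution above.
addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.1")

// project/build.properties would pin the launcher to the resolved sbt release:
// sbt.version=0.13.8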
--------------------------------------------------------------------------------
/target/resolution-cache/default/naturallanguageprocessing$sbt_2.10/1.0/resolved.xml.xml:
--------------------------------------------------------------------------------
(Ivy resolution descriptor for the naturallanguageprocessing meta-build; the XML markup was stripped when this dump was rendered, so no content survives here.)
--------------------------------------------------------------------------------
/target/resolution-cache/meachinelearning-classification/meachinelearning-classification$sbt_2.10/1.0/resolved.xml.xml:
--------------------------------------------------------------------------------
(Ivy resolution descriptor for the meachinelearning-classification meta-build; the XML markup was stripped when this dump was rendered, so no content survives here.)
--------------------------------------------------------------------------------
/target/resolution-cache/meachinelearning-classification/meachinelearning-classification_2.10/1.0/resolved.xml.properties:
--------------------------------------------------------------------------------
1 | #meachinelearning-classification#meachinelearning-classification_2.10;1.0 resolved revisions
2 | #Thu Jul 07 14:51:12 CST 2016
3 | +revision\:\#@\#\:+1.5.2\:\#@\#\:+module\:\#@\#\:+spark-graphx_2.10\:\#@\#\:+organisation\:\#@\#\:+org.apache.spark\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.2 release 1.5.2 null
4 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-library\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 ? 2.10.5 null
5 | +revision\:\#@\#\:+2.2.5\:\#@\#\:+module\:\#@\#\:+scalatest_2.10\:\#@\#\:+organisation\:\#@\#\:+org.scalatest\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.2.5 release 2.2.5 null
6 | +revision\:\#@\#\:+3.1.14\:\#@\#\:+module\:\#@\#\:+mysql-connector-java\:\#@\#\:+organisation\:\#@\#\:+mysql\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.1.14 release 3.1.14 null
7 | +revision\:\#@\#\:+2.2.5\:\#@\#\:+module\:\#@\#\:+scalactic_2.10\:\#@\#\:+organisation\:\#@\#\:+org.scalactic\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.2.5 release 2.2.5 null
8 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-compiler\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 release 2.10.4 null
9 | +revision\:\#@\#\:+1.5.2\:\#@\#\:+module\:\#@\#\:+spark-mllib_2.10\:\#@\#\:+organisation\:\#@\#\:+org.apache.spark\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.2 release 1.5.2 null
10 | +revision\:\#@\#\:+1.1.2\:\#@\#\:+module\:\#@\#\:+gs-core\:\#@\#\:+organisation\:\#@\#\:+org.graphstream\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.1.2 release 1.1.2 null
11 | +revision\:\#@\#\:+2.7.1\:\#@\#\:+module\:\#@\#\:+hadoop-common\:\#@\#\:+organisation\:\#@\#\:+org.apache.hadoop\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.7.1 release 2.7.1 null
12 | +revision\:\#@\#\:+1.5.2\:\#@\#\:+module\:\#@\#\:+spark-core_2.10\:\#@\#\:+organisation\:\#@\#\:+org.apache.spark\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.2 release 1.5.2 null
13 |
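Both classification modules resolve the same library set: Spark 1.5.2 (core, mllib, graphx) on Scala 2.10.4, ScalaTest and Scalactic 2.2.5 in the test scope, mysql-connector-java 3.1.14, gs-core 1.1.2 and hadoop-common 2.7.1. A minimal build.sbt sketch of dependency declarations that would yield these resolutions follows; the project's real build.sbt declares more than this (see the input_dsp listing further down), so treat it as an excerpt written under assumptions rather than the actual build file.

// build.sbt (sketch): coordinates taken from the resolved revisions above.
scalaVersion := "2.10.4"

libraryDependencies ++= Seq(
  "org.apache.spark"  %% "spark-core"           % "1.5.2",
  "org.apache.spark"  %% "spark-mllib"          % "1.5.2",
  "org.apache.spark"  %% "spark-graphx"         % "1.5.2",
  "org.apache.hadoop" %  "hadoop-common"        % "2.7.1",
  "mysql"             %  "mysql-connector-java" % "3.1.14",
  "org.graphstream"   %  "gs-core"              % "1.1.2",
  "org.scalactic"     %% "scalactic"            % "2.2.5" % "test",
  "org.scalatest"     %% "scalatest"            % "2.2.5" % "test"
)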
--------------------------------------------------------------------------------
/target/resolution-cache/meachinelearning-classification/meachinelearning-classification_2.10/1.0/resolved.xml.xml:
--------------------------------------------------------------------------------
(Ivy resolution descriptor; the XML markup was stripped when this dump was rendered, only the module description "MeachineLearning/classification" survives.)
--------------------------------------------------------------------------------
/target/resolution-cache/reports/ (Ivy resolve reports; the XML bodies were stripped when this dump was rendered, so only the file names are recoverable):
--------------------------------------------------------------------------------
default-classification$sources_2.10-docs.xml
default-classification$sources_2.10-optional.xml
default-classification$sources_2.10-plugin.xml
default-classification$sources_2.10-pom.xml
default-classification$sources_2.10-provided.xml
default-classification$sources_2.10-sources.xml
default-classification_2.10-docs.xml
default-classification_2.10-optional.xml
default-classification_2.10-plugin.xml
default-classification_2.10-pom.xml
default-classification_2.10-provided.xml
default-classification_2.10-sources.xml
meachinelearning-classification-meachinelearning-classification$sources_2.10-docs.xml
meachinelearning-classification-meachinelearning-classification$sources_2.10-optional.xml
meachinelearning-classification-meachinelearning-classification$sources_2.10-plugin.xml
meachinelearning-classification-meachinelearning-classification$sources_2.10-pom.xml
meachinelearning-classification-meachinelearning-classification$sources_2.10-provided.xml
meachinelearning-classification-meachinelearning-classification$sources_2.10-sources.xml
meachinelearning-classification-meachinelearning-classification_2.10-docs.xml
meachinelearning-classification-meachinelearning-classification_2.10-optional.xml
meachinelearning-classification-meachinelearning-classification_2.10-plugin.xml
meachinelearning-classification-meachinelearning-classification_2.10-pom.xml
meachinelearning-classification-meachinelearning-classification_2.10-provided.xml
meachinelearning-classification-meachinelearning-classification_2.10-sources.xml
--------------------------------------------------------------------------------
/target/scala-2.10/test-classes/text/1.txt:
--------------------------------------------------------------------------------
1 | 光伏,中国人民银行,列,入,绿色,债券,支援,专案,目录,2015年12月22日,19:00:00,中国人民银行,发布,2015,第39,号,公告,公告,称为,加快,建设生态文明,引导,金融机构,服务,绿色发展,推动,经济结构转型,升级,经济发展方式转变,支援,金融机构,发行,绿色,金融债券,募集资金,支援,绿色,产业发展,笔者,目录,第5,项,清洁能源,发电,中,风力发电,光伏发电,智慧,电网,能源,因特网,分布式能源,太阳能热利用,水力发电,新能源,利用,列,入,太阳能光伏发电站,太阳能,高,温热,发电站,不含,分布式,太阳能光伏发电,系统,需,限定,条件,多晶硅,电池,组件,光电,转化,效率,≥,15.5%,组件,专案,投产,运行,日,一年,衰减率,≤,2.5%,年,衰减率,≤,0.7%,单晶硅,电池,组件,光电,转化,效率,≥,16%,组件,专案,投产,运行,日,一年,衰减率,≤,3%,年,衰减率,≤,0.7%,高,倍,聚光光伏,组件,光电,转化,效率,≥,28%,项目,投产,运行,日,一年,衰减率,≤,2%,年,衰减率,≤,0.5%,项目全生命周期,衰减率,≤,10%,硅基,薄膜电池,组件,光电,转化,效率,≥,8%,铜铟镓硒,CIGS,薄膜电池,组件,光电,转化,效率,≥,11%,碲化镉,CdTe,薄膜电池,组件,光电,转化,效率,≥,11%,薄膜电池,组件,光电,转化,效率,≥,10%,多晶硅,单晶硅,薄膜电池,项目全生命周期,衰减率,≤,20%,智能电网,能源,因特网,指,提高,供,需,负荷,平衡,回应,能力,改善,电网,综合,能效,降低,输变电,损耗,增强,可再生能源,接,入,能力,电网建设,运营,技术,升级,改造,专案,1.,智能电网,指,采用,智慧,型,电气设备,即时,双向,集成,通信技术,先进技术,电网建设,运营,专案,电网,智慧,化,升级,改造,项目,2.,能源,因特网,指,综合,电力电子,资讯,智慧,管理技术,连接,分布式能源,含,分布式,可再生能源,分布式,储能,装置,类型,负荷,能量,双向,流动,交换,共享,电网,微电网,能源,燃气,网络,设施,建设,运营,专案,分布式能源,指,区域,能源站,包括,天然气,区域,能源站,分布式光伏发电,系统,分布式能源,设施,建设,运营,分布式能源,接,入,峰谷,调节,系统,分布式,电力,交易平台,能源管理系统,建设,运营,附,中国人民银行公告,2015,第39,号,绿色,债券,支援,专案,目录
--------------------------------------------------------------------------------
/target/scala-2.10/test-classes/text/2.txt:
--------------------------------------------------------------------------------
1 | 记者,国家电网公司,获悉,9月23日,河北丰宁,二期,山东文登,重庆,蟠龙,抽水蓄能电站,工程,以下简称,丰宁,二期,文登,蟠龙,抽,蓄,座,抽,蓄,电站,正式,开工,总投资,244.4亿,元,总装机容量,480万,千瓦,计划,2022年,竣工,投产,项目,预计,增加,发电,装备制造业,产值,111亿,元,推动,相关,装备制造业,发展,开工,动员大会,国家电网公司,董事长,党组书记,刘振亚,丰宁,二期,文登,蟠龙,抽,蓄,国家电网公司,推进,特高压电网,建设,服务,清洁能源,发展,重大工程,继,2015年6月,安徽金寨,山东沂蒙,河南,天池,座,抽水蓄能电站,第二批,开工,电站,标志,我国,抽水蓄能电站,加快,发展,新,阶段,介绍,河北丰宁,二期,抽水蓄能电站,项目,位于,河北省承德市,丰宁县,装机容量,180万,千瓦,安装,台,30万,千瓦,可逆,式,水轮发电机组,500,千伏,电压,接,入,华北电网,工程投资,87.5亿,元,丰宁抽水蓄能电站,一期,二期,装机容量,360万,千瓦,世界上,装机容量,抽水蓄能电站,山东,文登抽水蓄能电站,位于,山东省,威海市文登区,装机容量,180万,千瓦,安装,台,30万,千瓦,可逆,式,水轮发电机组,500,千伏,电压,接,入,山东电网,工程投资,85.7亿,元,重庆,蟠龙,抽水蓄能电站,位于,重庆市綦江区,装机容量,120万,千瓦,安装,台,30万,千瓦,可逆,式,水轮发电机组,500,千伏,电压,接,入,重庆电网,工程投资,71.2亿,元,国网,座,受,端,电网,地区,抽水蓄能电站,建成,更好地,接纳,区,外,来电,优化,电源,结构,提高,北,西南,地区,清洁能源,消纳,能力,提高,特高压电网,系统安全,可靠性,综合,煤电,机组,消纳,清洁能源,效果,建设,丰宁,二期,文登,蟠龙,抽,蓄,年,节约,原煤,消耗,291万,吨,减排,烟尘,0.3万,吨,二氧化硫,1.4万,吨,氮氧化物,1.3万,吨,二氧化碳,485万,吨,节能减排,大气污染防治,国家电网公司,经营,区域,内在,运,抽水蓄能电站,装机容量,1674.5万,千瓦,建,规模,1880万,千瓦,预计,2017年,我国,抽水蓄能,装机,3300万,千瓦,超过,美国,世界上,抽水蓄能电站,第一,大国
--------------------------------------------------------------------------------
/target/scala-2.10/test-classes/text/abstract:
--------------------------------------------------------------------------------
1 | 算法可大致分为基本算法、数据结构的算法、数论算法、计算几何的算法、图的算法、动态规划以及数值分析、加密算法、排序算法、检索算法、随机化算法、并行算法、厄米变形模型、随机森林算法。
2 | 算法可以宽泛的分为三类,
3 | 一,有限的确定性算法,这类算法在有限的一段时间内终止。他们可能要花很长时间来执行指定的任务,但仍将在一定的时间内终止。这类算法得出的结果常取决于输入值。
4 | 二,有限的非确定算法,这类算法在有限的时间内终止。然而,对于一个(或一些)给定的数值,算法的结果并不是唯一的或确定的。
5 | 三,无限的算法,是那些由于没有定义终止定义条件,或定义的条件无法由输入的数据满足而不终止运行的算法。通常,无限算法的产生是由于未能确定的定义终止条件。
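The files under target/scala-2.10/test-classes/text are pre-segmented test corpora: 1.txt and 2.txt each hold one document as a single line of comma-separated tokens produced by Chinese word segmentation, and "abstract" holds a few lines of plain prose. Below is a hedged Scala sketch of how such a token file could be loaded and hashed into TF-IDF vectors with the Spark MLlib release resolved above (1.5.2); the object name, master setting and file path are illustrative and do not come from the project's sources.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.feature.{HashingTF, IDF}

// Illustrative only: reads a comma-separated token file such as
// target/scala-2.10/test-classes/text/1.txt and turns it into sparse TF-IDF
// vectors, the usual MLlib 1.5.x preprocessing step before training a classifier.
object TokenFileFeatures {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("token-file-features").setMaster("local[*]"))

    // One document per line; the tokens are already segmented and joined with ','.
    val docs = sc.textFile("target/scala-2.10/test-classes/text/1.txt")
      .map(_.split(",").toSeq)

    val tf = new HashingTF(numFeatures = 1 << 18).transform(docs)
    tf.cache()
    val tfidf = new IDF().fit(tf).transform(tf)

    tfidf.take(1).foreach(println)
    sc.stop()
  }
}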
--------------------------------------------------------------------------------
/target/streams/$global/$global/dumpStructure/$global/streams/out:
--------------------------------------------------------------------------------
1 | [info] Writing structure to /private/var/folders/7j/trxrd6ms0rg3v8tlck57__4h0000gn/T/sbt-structure0.xml...
2 | [info] Done.
3 |
--------------------------------------------------------------------------------
/target/streams/$global/clean/$global/streams/out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/target/streams/$global/clean/$global/streams/out
--------------------------------------------------------------------------------
/target/streams/$global/dependencyPositions/$global/streams/update_cache_2.10/input_dsp:
--------------------------------------------------------------------------------
1 | org.scala-lang scala-library 2.10.4
2 | com.kunyan nlpsuit-package 0.2.8.3
    org.scalactic scalactic 2.2.5 test
    org.scalatest scalatest 2.2.5 test
    org.scala-lang scala-compiler 2.10.4
    org.apache.hadoop hadoop-common 2.7.1 javax.servlet * *
    org.apache.hadoop hadoop-hdfs 2.7.1 provided
    org.apache.spark spark-core_2.10 1.5.2
    org.apache.spark spark-mllib_2.10 1.5.2
    mysql mysql-connector-java 3.1.14
    org.graphstream gs-core 1.1.2
    org.apache.spark spark-graphx_2.10 1.5.2
    com.ibm.icu icu4j 56.1
    org.apache.hbase hbase 0.98.2-hadoop2
    org.apache.hbase hbase-client 1.1.2
    org.apache.hbase hbase-common 1.1.2
    org.apache.hbase hbase-server 1.1.2
    org.scalanlp breeze-math_2.10 0.4
    org.scalanlp breeze-process_2.10 0.3
    org.scalanlp breeze-viz_2.10 0.12
    org.scalanlp breeze_2.10 *
    org.scalanlp nak_2.10 1.3
    redis.clients jedis 2.8.0
    org.ansj ansj_seg 5.0.2
    org.json json 20160212
    org.nlpcn nlp-lang 1.7
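The input_dsp cache records the dependency declarations sbt was given, including details the resolved.xml.properties files do not show: a javax.servlet exclusion on hadoop-common, hadoop-hdfs in the provided scope, the test scope on ScalaTest and Scalactic, and the NLP stack (com.kunyan nlpsuit-package, HBase, breeze, nak, jedis, ansj_seg, nlp-lang). The fragment below is a hedged sketch of how the exclusion and scope entries would typically be written in build.sbt; only a few of the listed artifacts are shown, and the exact form in the project's build file may differ.

// build.sbt (sketch): the exclusion and scope flags visible in input_dsp above.
libraryDependencies ++= Seq(
  // hadoop-common 2.7.1 with javax.servlet artifacts excluded (the "javax.servlet * *" entry)
  "org.apache.hadoop" % "hadoop-common" % "2.7.1" excludeAll ExclusionRule(organization = "javax.servlet"),
  // hadoop-hdfs 2.7.1 only on the provided classpath
  "org.apache.hadoop" % "hadoop-hdfs" % "2.7.1" % "provided",
  // part of the Chinese NLP stack recorded in the cache
  "org.ansj"      % "ansj_seg" % "5.0.2",
  "org.nlpcn"     % "nlp-lang" % "1.7",
  "redis.clients" % "jedis"    % "2.8.0"
)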
--------------------------------------------------------------------------------
/target/streams/$global/dependencyPositions/$global/streams/update_cache_2.10/output_dsp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/target/streams/$global/dependencyPositions/$global/streams/update_cache_2.10/output_dsp
--------------------------------------------------------------------------------
/target/streams/$global/ivyConfiguration/$global/streams/out:
--------------------------------------------------------------------------------
1 | [debug] Other repositories:
2 | [debug] Default repositories:
3 | [debug] Using inline dependencies specified in Scala.
4 |
--------------------------------------------------------------------------------
/target/streams/$global/ivySbt/$global/streams/out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/target/streams/$global/ivySbt/$global/streams/out
--------------------------------------------------------------------------------
/target/streams/$global/projectDescriptors/$global/streams/out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/target/streams/$global/projectDescriptors/$global/streams/out
--------------------------------------------------------------------------------
/target/streams/$global/update/$global/streams/update_cache_2.10/inputs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/target/streams/$global/update/$global/streams/update_cache_2.10/inputs
--------------------------------------------------------------------------------
/target/streams/$global/update/$global/streams/update_cache_2.10/output:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/target/streams/$global/update/$global/streams/update_cache_2.10/output
--------------------------------------------------------------------------------
/target/streams/compile/unmanagedClasspath/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/target/streams/compile/unmanagedJars/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/target/streams/runtime/unmanagedClasspath/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/target/streams/runtime/unmanagedJars/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/target/streams/test/unmanagedClasspath/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/target/streams/test/unmanagedJars/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------