├── .gitignore
├── .idea
├── .name
├── compiler.xml
├── copyright
│ └── profiles_settings.xml
├── encodings.xml
├── misc.xml
├── modules.xml
├── modules
│ ├── NaturalLanguageProces.iml
│ ├── calssification-build.iml
│ ├── calssification.iml
│ ├── classification-build.iml
│ └── naturallanguageprocessing-build.iml
├── sbt.xml
├── scala_compiler.xml
├── uiDesigner.xml
├── vcs.xml
└── workspace.xml
├── README.md
├── build.sbt
├── project
├── build.properties
├── plugins.sbt
└── target
│ ├── resolution-cache
│ ├── default
│ │ ├── calssification-build
│ │ │ └── scala_2.10
│ │ │ │ └── sbt_0.13
│ │ │ │ └── 0.1-SNAPSHOT
│ │ │ │ ├── resolved.xml.properties
│ │ │ │ └── resolved.xml.xml
│ │ ├── classification-build
│ │ │ └── scala_2.10
│ │ │ │ └── sbt_0.13
│ │ │ │ └── 0.1-SNAPSHOT
│ │ │ │ ├── resolved.xml.properties
│ │ │ │ └── resolved.xml.xml
│ │ └── naturallanguageprocessing-build
│ │ │ └── scala_2.10
│ │ │ └── sbt_0.13
│ │ │ └── 0.1-SNAPSHOT
│ │ │ ├── resolved.xml.properties
│ │ │ └── resolved.xml.xml
│ └── reports
│ │ ├── default-calssification-build-compile-internal.xml
│ │ ├── default-calssification-build-compile.xml
│ │ ├── default-calssification-build-docs.xml
│ │ ├── default-calssification-build-optional.xml
│ │ ├── default-calssification-build-plugin.xml
│ │ ├── default-calssification-build-pom.xml
│ │ ├── default-calssification-build-provided.xml
│ │ ├── default-calssification-build-runtime-internal.xml
│ │ ├── default-calssification-build-runtime.xml
│ │ ├── default-calssification-build-scala-tool.xml
│ │ ├── default-calssification-build-sources.xml
│ │ ├── default-calssification-build-test-internal.xml
│ │ ├── default-calssification-build-test.xml
│ │ ├── default-classification-build-compile-internal.xml
│ │ ├── default-classification-build-compile.xml
│ │ ├── default-classification-build-docs.xml
│ │ ├── default-classification-build-optional.xml
│ │ ├── default-classification-build-plugin.xml
│ │ ├── default-classification-build-pom.xml
│ │ ├── default-classification-build-provided.xml
│ │ ├── default-classification-build-runtime-internal.xml
│ │ ├── default-classification-build-runtime.xml
│ │ ├── default-classification-build-scala-tool.xml
│ │ ├── default-classification-build-sources.xml
│ │ ├── default-classification-build-test-internal.xml
│ │ ├── default-classification-build-test.xml
│ │ ├── default-naturallanguageprocessing-build-compile-internal.xml
│ │ ├── default-naturallanguageprocessing-build-compile.xml
│ │ ├── default-naturallanguageprocessing-build-docs.xml
│ │ ├── default-naturallanguageprocessing-build-optional.xml
│ │ ├── default-naturallanguageprocessing-build-plugin.xml
│ │ ├── default-naturallanguageprocessing-build-pom.xml
│ │ ├── default-naturallanguageprocessing-build-provided.xml
│ │ ├── default-naturallanguageprocessing-build-runtime-internal.xml
│ │ ├── default-naturallanguageprocessing-build-runtime.xml
│ │ ├── default-naturallanguageprocessing-build-scala-tool.xml
│ │ ├── default-naturallanguageprocessing-build-sources.xml
│ │ ├── default-naturallanguageprocessing-build-test-internal.xml
│ │ ├── default-naturallanguageprocessing-build-test.xml
│ │ ├── ivy-report.css
│ │ └── ivy-report.xsl
│ └── streams
│ ├── $global
│ ├── $global
│ │ └── $global
│ │ │ └── streams
│ │ │ └── out
│ ├── dependencyPositions
│ │ └── $global
│ │ │ └── streams
│ │ │ └── update_cache_2.10
│ │ │ ├── input_dsp
│ │ │ └── output_dsp
│ ├── ivyConfiguration
│ │ └── $global
│ │ │ └── streams
│ │ │ └── out
│ ├── ivySbt
│ │ └── $global
│ │ │ └── streams
│ │ │ └── out
│ ├── projectDescriptors
│ │ └── $global
│ │ │ └── streams
│ │ │ └── out
│ └── update
│ │ └── $global
│ │ └── streams
│ │ ├── out
│ │ └── update_cache_2.10
│ │ ├── inputs
│ │ └── output
│ ├── compile
│ ├── $global
│ │ └── $global
│ │ │ └── discoveredMainClasses
│ │ │ └── data
│ ├── compile
│ │ └── $global
│ │ │ └── streams
│ │ │ └── out
│ ├── compileIncremental
│ │ └── $global
│ │ │ └── streams
│ │ │ ├── export
│ │ │ └── out
│ ├── copyResources
│ │ └── $global
│ │ │ └── streams
│ │ │ ├── copy-resources
│ │ │ └── out
│ ├── dependencyClasspath
│ │ └── $global
│ │ │ └── streams
│ │ │ └── export
│ ├── exportedProducts
│ │ └── $global
│ │ │ └── streams
│ │ │ └── export
│ ├── externalDependencyClasspath
│ │ └── $global
│ │ │ └── streams
│ │ │ └── export
│ ├── internalDependencyClasspath
│ │ └── $global
│ │ │ └── streams
│ │ │ └── export
│ ├── managedClasspath
│ │ └── $global
│ │ │ └── streams
│ │ │ └── export
│ ├── unmanagedClasspath
│ │ └── $global
│ │ │ └── streams
│ │ │ └── export
│ └── unmanagedJars
│ │ └── $global
│ │ └── streams
│ │ └── export
│ └── runtime
│ ├── dependencyClasspath
│ └── $global
│ │ └── streams
│ │ └── export
│ ├── exportedProducts
│ └── $global
│ │ └── streams
│ │ └── export
│ ├── externalDependencyClasspath
│ └── $global
│ │ └── streams
│ │ └── export
│ ├── fullClasspath
│ └── $global
│ │ └── streams
│ │ └── export
│ ├── internalDependencyClasspath
│ └── $global
│ │ └── streams
│ │ └── export
│ ├── managedClasspath
│ └── $global
│ │ └── streams
│ │ └── export
│ ├── unmanagedClasspath
│ └── $global
│ │ └── streams
│ │ └── export
│ └── unmanagedJars
│ └── $global
│ └── streams
│ └── export
├── src
├── main
│ └── scala
│ │ ├── deeplearning
│ │ ├── cae
│ │ │ └── CAE.scala
│ │ ├── cnn
│ │ │ ├── CNN.scala
│ │ │ └── CNNModel.scala
│ │ └── tests
│ │ │ └── Test_example_CNN.scala
│ │ ├── intactprogram
│ │ ├── telecomdataprocessing
│ │ │ ├── TelecomDataProcess.scala
│ │ │ └── util
│ │ │ │ ├── HBaseUtil.scala
│ │ │ │ └── LoggerUtil.scala
│ │ ├── telecomdataprocessingAll
│ │ │ ├── TDP.scala
│ │ │ ├── TelecomDataProcess.scala
│ │ │ ├── TelecomDataProcessing.scala
│ │ │ ├── TelecomDataProcessingByHour.scala
│ │ │ ├── readFromHdfs.scala
│ │ │ └── util
│ │ │ │ ├── HBaseUtil.scala
│ │ │ │ ├── HDFSUtil.scala
│ │ │ │ ├── LoggerUtil.scala
│ │ │ │ └── TimeUtil.scala
│ │ └── vipstockstatistic
│ │ │ ├── CorpusBuild.scala
│ │ │ ├── PredictWithDic.scala
│ │ │ ├── VipStockStatistic.scala
│ │ │ └── util
│ │ │ ├── AnsjAnalyzer.scala
│ │ │ ├── HBaseUtil.scala
│ │ │ ├── LoggerUtil.scala
│ │ │ ├── RedisUtil.scala
│ │ │ └── config.xml
│ │ ├── meachinelearning
│ │ ├── Recommendation
│ │ │ └── SparkMLlibColbFilter.scala
│ │ ├── classification
│ │ │ ├── BinaryClassification.scala
│ │ │ ├── BinaryClassificationParaOptimization.scala
│ │ │ ├── BinaryClassificationRDDWithPCA.scala
│ │ │ ├── BinaryClassificationWithALS.scala
│ │ │ ├── BinaryClassificationWithPCA.scala
│ │ │ ├── GaussianKernelSVM.scala
│ │ │ ├── PCAtest.scala
│ │ │ └── TrainingProcessWithPCA.scala
│ │ ├── correlationanalysis
│ │ │ └── correlationAnalysis.scala
│ │ ├── data
│ │ │ └── SupportVectorMachineWithGaussianKernel.txt
│ │ ├── hotdegreecalculate
│ │ │ ├── CommunityFrequencyStatistics.scala
│ │ │ ├── HotDegreeCalculate.scala
│ │ │ ├── HotDegreeCalculation.scala
│ │ │ ├── HotDegreeCalculationRDD.scala
│ │ │ └── fileIO.scala
│ │ ├── textrank
│ │ │ ├── AbstractExtract.scala
│ │ │ ├── ConstructTextGraph.scala
│ │ │ ├── KeywordExtractor.scala
│ │ │ ├── PropertyExtractor.scala
│ │ │ └── TextRank.scala
│ │ ├── topicmodel
│ │ │ ├── LDAModel.scala
│ │ │ ├── LDATest.scala
│ │ │ └── LatentDirichletAllocationExample.scala
│ │ └── word2vec
│ │ │ ├── ClassifyModel.scala
│ │ │ ├── ClassifyPredict.scala
│ │ │ ├── DataPrepare.scala
│ │ │ ├── DeleteDirectory.scala
│ │ │ ├── Word2Vec.scala
│ │ │ ├── model
│ │ │ ├── data
│ │ │ │ ├── .part-r-00000-e1c254b3-21ba-4759-b7eb-b69f39950551.gz.parquet.crc
│ │ │ │ ├── _SUCCESS
│ │ │ │ ├── _common_metadata
│ │ │ │ ├── _metadata
│ │ │ │ └── part-r-00000-e1c254b3-21ba-4759-b7eb-b69f39950551.gz.parquet
│ │ │ └── metadata
│ │ │ │ ├── .part-00000.crc
│ │ │ │ ├── _SUCCESS
│ │ │ │ └── part-00000
│ │ │ ├── readme.md
│ │ │ ├── textVectors.scala
│ │ │ └── twc
│ │ │ ├── W2VJsonConf.json
│ │ │ ├── processing.scala
│ │ │ └── training.scala
│ │ ├── test
│ │ └── regularExpression.scala
│ │ ├── util
│ │ ├── DataTransform.scala
│ │ ├── DirectoryUtil.scala
│ │ ├── FileUtil.scala
│ │ ├── HBaseUtil.scala
│ │ ├── HDFSUtil.scala
│ │ ├── JsonUtil.scala
│ │ ├── LoggerUtil.scala
│ │ ├── MySQLUtil.scala
│ │ ├── RedisUtil.scala
│ │ ├── TextProcessing.scala
│ │ ├── TimeUtil.scala
│ │ ├── UrlCategoryTrim.scala
│ │ ├── XMLUtil.scala
│ │ └── regularExpression.scala
│ │ └── wordSegmentation
│ │ ├── AnsjAnalyzer.scala
│ │ └── wordSegmentAnalyser.scala
└── test
│ ├── resources
│ ├── 2016-07-11-15.txt
│ ├── 2016-07-12-13.txt
│ ├── 2016-07-12-15.txt
│ ├── 2016-07-12-16.txt
│ └── text
│ │ ├── 1.txt
│ │ ├── 2.txt
│ │ └── abstract
│ └── scala
│ ├── CNNTest.scala
│ ├── ClassificationTest.scala
│ ├── HDFSUtilTest.scala
│ ├── HotWordsTest.scala
│ ├── JSONUtilTest.scala
│ ├── MySQLUtilTest.scala
│ ├── Test.scala
│ ├── TextRankTest.scala
│ ├── classification.scala
│ ├── keywordExtractorTest.scala
│ ├── telecomDataProcessingTest.scala
│ ├── testRankTest.scala
│ ├── timeutilTest.scala
│ └── word2vecTest.scala
└── target
├── .history
├── resolution-cache
├── default
│ ├── classification$sbt_2.10
│ │ └── 1.0
│ │ │ ├── resolved.xml.properties
│ │ │ └── resolved.xml.xml
│ ├── classification$sources_2.10
│ │ └── 1.0
│ │ │ ├── resolved.xml.properties
│ │ │ └── resolved.xml.xml
│ ├── classification_2.10
│ │ └── 1.0
│ │ │ ├── resolved.xml.properties
│ │ │ └── resolved.xml.xml
│ ├── naturallanguageprocessing$sbt_2.10
│ │ └── 1.0
│ │ │ ├── resolved.xml.properties
│ │ │ └── resolved.xml.xml
│ └── naturallanguageprocessing$sources_2.10
│ │ └── 1.0
│ │ ├── resolved.xml.properties
│ │ └── resolved.xml.xml
├── meachinelearning-classification
│ ├── meachinelearning-classification$sbt_2.10
│ │ └── 1.0
│ │ │ ├── resolved.xml.properties
│ │ │ └── resolved.xml.xml
│ ├── meachinelearning-classification$sources_2.10
│ │ └── 1.0
│ │ │ ├── resolved.xml.properties
│ │ │ └── resolved.xml.xml
│ └── meachinelearning-classification_2.10
│ │ └── 1.0
│ │ ├── resolved.xml.properties
│ │ └── resolved.xml.xml
└── reports
│ ├── default-classification$sbt_2.10-default.xml
│ ├── default-classification$sources_2.10-compile-internal.xml
│ ├── default-classification$sources_2.10-compile.xml
│ ├── default-classification$sources_2.10-docs.xml
│ ├── default-classification$sources_2.10-optional.xml
│ ├── default-classification$sources_2.10-plugin.xml
│ ├── default-classification$sources_2.10-pom.xml
│ ├── default-classification$sources_2.10-provided.xml
│ ├── default-classification$sources_2.10-runtime-internal.xml
│ ├── default-classification$sources_2.10-runtime.xml
│ ├── default-classification$sources_2.10-scala-tool.xml
│ ├── default-classification$sources_2.10-sources.xml
│ ├── default-classification$sources_2.10-test-internal.xml
│ ├── default-classification$sources_2.10-test.xml
│ ├── default-classification_2.10-compile-internal.xml
│ ├── default-classification_2.10-compile.xml
│ ├── default-classification_2.10-docs.xml
│ ├── default-classification_2.10-optional.xml
│ ├── default-classification_2.10-plugin.xml
│ ├── default-classification_2.10-pom.xml
│ ├── default-classification_2.10-provided.xml
│ ├── default-classification_2.10-runtime-internal.xml
│ ├── default-classification_2.10-runtime.xml
│ ├── default-classification_2.10-scala-tool.xml
│ ├── default-classification_2.10-sources.xml
│ ├── default-classification_2.10-test-internal.xml
│ ├── default-classification_2.10-test.xml
│ ├── default-naturallanguageprocessing$sbt_2.10-default.xml
│ ├── ivy-report.css
│ ├── ivy-report.xsl
│ ├── meachinelearning-classification-meachinelearning-classification$sbt_2.10-default.xml
│ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-compile-internal.xml
│ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-compile.xml
│ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-docs.xml
│ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-optional.xml
│ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-plugin.xml
│ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-pom.xml
│ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-provided.xml
│ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-runtime-internal.xml
│ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-runtime.xml
│ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-scala-tool.xml
│ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-sources.xml
│ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-test-internal.xml
│ ├── meachinelearning-classification-meachinelearning-classification$sources_2.10-test.xml
│ ├── meachinelearning-classification-meachinelearning-classification_2.10-compile-internal.xml
│ ├── meachinelearning-classification-meachinelearning-classification_2.10-compile.xml
│ ├── meachinelearning-classification-meachinelearning-classification_2.10-docs.xml
│ ├── meachinelearning-classification-meachinelearning-classification_2.10-optional.xml
│ ├── meachinelearning-classification-meachinelearning-classification_2.10-plugin.xml
│ ├── meachinelearning-classification-meachinelearning-classification_2.10-pom.xml
│ ├── meachinelearning-classification-meachinelearning-classification_2.10-provided.xml
│ ├── meachinelearning-classification-meachinelearning-classification_2.10-runtime-internal.xml
│ ├── meachinelearning-classification-meachinelearning-classification_2.10-runtime.xml
│ ├── meachinelearning-classification-meachinelearning-classification_2.10-scala-tool.xml
│ ├── meachinelearning-classification-meachinelearning-classification_2.10-sources.xml
│ ├── meachinelearning-classification-meachinelearning-classification_2.10-test-internal.xml
│ └── meachinelearning-classification-meachinelearning-classification_2.10-test.xml
├── scala-2.10
└── test-classes
│ └── text
│ ├── 1.txt
│ ├── 2.txt
│ └── abstract
└── streams
├── $global
├── $global
│ └── dumpStructure
│ │ └── $global
│ │ └── streams
│ │ └── out
├── clean
│ └── $global
│ │ └── streams
│ │ └── out
├── dependencyPositions
│ └── $global
│ │ └── streams
│ │ └── update_cache_2.10
│ │ ├── input_dsp
│ │ └── output_dsp
├── ivyConfiguration
│ └── $global
│ │ └── streams
│ │ └── out
├── ivySbt
│ └── $global
│ │ └── streams
│ │ └── out
├── projectDescriptors
│ └── $global
│ │ └── streams
│ │ └── out
├── update
│ └── $global
│ │ └── streams
│ │ ├── out
│ │ └── update_cache_2.10
│ │ ├── inputs
│ │ └── output
├── updateClassifiers
│ └── $global
│ │ └── streams
│ │ └── out
└── updateSbtClassifiers
│ └── $global
│ └── streams
│ └── out
├── compile
├── externalDependencyClasspath
│ └── $global
│ │ └── streams
│ │ └── export
├── managedClasspath
│ └── $global
│ │ └── streams
│ │ └── export
├── unmanagedClasspath
│ └── $global
│ │ └── streams
│ │ └── export
└── unmanagedJars
│ └── $global
│ └── streams
│ └── export
├── runtime
├── externalDependencyClasspath
│ └── $global
│ │ └── streams
│ │ └── export
├── managedClasspath
│ └── $global
│ │ └── streams
│ │ └── export
├── unmanagedClasspath
│ └── $global
│ │ └── streams
│ │ └── export
└── unmanagedJars
│ └── $global
│ └── streams
│ └── export
└── test
├── externalDependencyClasspath
└── $global
│ └── streams
│ └── export
├── managedClasspath
└── $global
│ └── streams
│ └── export
├── unmanagedClasspath
└── $global
│ └── streams
│ └── export
└── unmanagedJars
└── $global
└── streams
└── export
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 |
--------------------------------------------------------------------------------
/.idea/.name:
--------------------------------------------------------------------------------
1 | calssification
--------------------------------------------------------------------------------
/.idea/compiler.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/.idea/copyright/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/.idea/modules/NaturalLanguageProces.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/.idea/modules/calssification-build.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/modules/calssification.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/sbt.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/.idea/scala_compiler.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Natural Language Processing
2 | ## Introduction
3 |
4 | This is a Natural Language Processing package; it includes Machine Learning utilities and basic NLP utilities.
5 |
6 | ## Machine Learning
7 |
8 | Natural Language Processing by using Machine Learning algorithms.
9 |
10 | ### TextClassification
11 |
12 | Text classification using Bayesian, SVMWithSGD, and GaussianKernelSVM classifiers.
13 |
14 | #### Bayesian
15 |
16 | #### SVMWithSGD
17 |
18 | #### GaussianKernelSVM
19 |
20 | ### CorrelationAnalysis
21 |
22 | ### HotDegreeCalculate
23 |
24 | Calculates the hot degree of keywords using the Bayesian average and Newton's law of cooling.
25 |
26 | ### TextRank
27 |
28 | Based on the PageRank algorithm.
29 |
30 | ### TopicModel
31 |
32 | Latent Dirichlet Allocation (LDA).
33 |
34 | ## Util
35 |
36 | Preprocessing tools.
37 |
38 | shipment of gold damaged in a fire, shipment of gold damaged in a fire,
39 | delivery of silver arrived in a silver truck
40 | shipment of gold arrived in a truck
--------------------------------------------------------------------------------
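The HotDegreeCalculate section of the README above only names its two ingredients. As a rough, illustrative sketch (the names, constants, and exact formulas here are assumptions, not the code in src/main/scala/meachinelearning/hotdegreecalculate), combining a Bayesian average with Newton's law of cooling could look like this:

```scala
// Illustrative only: a minimal, self-contained sketch of the two ideas named in the README.
object HotDegreeSketch {

  /** Bayesian average: pull a keyword's raw score towards the global mean
    * when its observation count is small.
    *   score = (c * m + sum) / (c + n)
    * where n is the number of observations, sum the total raw score,
    * m the global mean score, and c a confidence weight (prior strength).
    */
  def bayesianAverage(sum: Double, n: Long, globalMean: Double, c: Double = 10.0): Double =
    (c * globalMean + sum) / (c + n)

  /** Newton's law of cooling: decay a hot degree exponentially with elapsed time.
    *   h(t) = env + (h0 - env) * exp(-k * dt)
    */
  def cool(h0: Double, dtHours: Double, k: Double = 0.1, env: Double = 0.0): Double =
    env + (h0 - env) * math.exp(-k * dtHours)

  def main(args: Array[String]): Unit = {
    val raw = bayesianAverage(sum = 42.0, n = 5L, globalMean = 3.0)
    println(f"bayesian average = $raw%.3f, after 12h of cooling = ${cool(raw, 12.0)}%.3f")
  }
}
```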
/build.sbt:
--------------------------------------------------------------------------------
1 | name := "NaturalLanguageProcessing"
2 |
3 | version := "1.0"
4 |
5 | scalaVersion := "2.10.4"
6 |
7 | // Kunyan word segmentation interface
8 | resolvers += "Kunyan Repo" at "http://61.147.114.67:8081/nexus/content/groups/public/"
9 |
10 | libraryDependencies += "com.kunyan" % "nlpsuit-package" % "0.2.8.3"
11 |
12 | libraryDependencies += "org.scalactic" %% "scalactic" % "2.2.5" % "test"
13 |
14 | libraryDependencies += "org.scalatest" %% "scalatest" % "2.2.5" % "test"
15 |
16 | libraryDependencies += "org.scala-lang" % "scala-compiler" % "2.10.4"
17 |
18 | libraryDependencies += "org.apache.hadoop" % "hadoop-common" % "2.7.1" excludeAll ExclusionRule(organization = "javax.servlet")
19 |
20 | libraryDependencies += "org.apache.hadoop" % "hadoop-hdfs" % "2.7.1" % "provided"
21 |
22 | libraryDependencies += "org.apache.spark" % "spark-core_2.10" % "1.5.2"
23 |
24 | libraryDependencies += "org.apache.spark" % "spark-mllib_2.10" % "1.5.2"
25 |
26 | libraryDependencies += "mysql" % "mysql-connector-java" % "3.1.14"
27 |
28 | libraryDependencies += "org.graphstream" % "gs-core" % "1.1.2"
29 |
30 | libraryDependencies += "org.apache.spark" % "spark-graphx_2.10" % "1.5.2"
31 |
32 | libraryDependencies += "com.ibm.icu" % "icu4j" % "56.1"
33 |
34 | libraryDependencies += "org.apache.hbase" % "hbase" % "0.98.2-hadoop2"
35 |
36 | libraryDependencies += "org.apache.hbase" % "hbase-client" % "1.1.2"
37 |
38 | libraryDependencies += "org.apache.hbase" % "hbase-common" % "1.1.2"
39 |
40 | libraryDependencies += "org.apache.hbase" % "hbase-server" % "1.1.2"
41 |
42 | //libraryDependencies += "org.scalanlp" % "breeze_2.10" % "0.11.2"
43 |
44 | libraryDependencies += "org.scalanlp" % "breeze-math_2.10" % "0.4" intransitive()
45 |
46 | //libraryDependencies += "org.scalanlp" % "breeze-learn_2.9.2" % "0.2" intransitive()
47 |
48 | libraryDependencies += "org.scalanlp" % "breeze-process_2.10" % "0.3" intransitive()
49 |
50 | libraryDependencies += "org.scalanlp" % "breeze-viz_2.10" % "0.12" exclude("org.scalanlp", "breeze_2.10")
51 |
52 | libraryDependencies += "org.scalanlp" % "nak_2.10" % "1.3"
53 |
54 | libraryDependencies += "redis.clients" % "jedis" % "2.8.0"
55 |
56 | libraryDependencies += "org.ansj" % "ansj_seg" % "5.0.2"
57 |
58 | libraryDependencies += "org.json" % "json" % "20160212"
59 |
60 | libraryDependencies += "org.nlpcn" % "nlp-lang" % "1.7"
61 |
62 | assemblyMergeStrategy in assembly := {
63 | case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last
64 | case PathList("javax", "activation", xs @ _*) => MergeStrategy.last
65 | case PathList("javax", "el", xs @ _*) => MergeStrategy.last
66 | case PathList("org", "apache", xs @ _*) => MergeStrategy.last
67 | case PathList("com", "google", xs @ _*) => MergeStrategy.last
68 | case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last
69 | case PathList("com", "codahale", xs @ _*) => MergeStrategy.last
70 | case PathList("com", "yammer", xs @ _*) => MergeStrategy.last
71 | case "about.html" => MergeStrategy.rename
72 | case "META-INF/ECLIPSEF.RSA" => MergeStrategy.last
73 | case "META-INF/mailcap" => MergeStrategy.last
74 | case "META-INF/mimetypes.default" => MergeStrategy.last
75 | case "plugin.properties" => MergeStrategy.last
76 | case "log4j.properties" => MergeStrategy.last
77 | case x =>
78 | val oldStrategy = (assemblyMergeStrategy in assembly).value
79 | oldStrategy(x)
80 | }
81 |
82 |
83 | test in assembly := {}
84 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version = 0.13.8
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | logLevel := Level.Warn
2 |
3 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.1")
--------------------------------------------------------------------------------
/project/target/resolution-cache/default/calssification-build/scala_2.10/sbt_0.13/0.1-SNAPSHOT/resolved.xml.properties:
--------------------------------------------------------------------------------
1 | #default#calssification-build;0.1-SNAPSHOT resolved revisions
2 | #Wed Mar 30 14:23:46 CST 2016
3 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-library\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 ? 2.10.4 null
4 | +e\:scalaVersion\:\#@\#\:+2.10\:\#@\#\:+revision\:\#@\#\:+0.14.1\:\#@\#\:+module\:\#@\#\:+sbt-assembly\:\#@\#\:+e\:sbtVersion\:\#@\#\:+0.13\:\#@\#\:+organisation\:\#@\#\:+com.eed3si9n\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.14.1 release 0.14.1 null
5 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-compiler\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 release 2.10.4 null
6 | +revision\:\#@\#\:+0.13.8\:\#@\#\:+module\:\#@\#\:+sbt\:\#@\#\:+organisation\:\#@\#\:+org.scala-sbt\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.13.8 release 0.13.8 null
7 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/default/calssification-build/scala_2.10/sbt_0.13/0.1-SNAPSHOT/resolved.xml.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
10 |
11 | calssification-build
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/default/classification-build/scala_2.10/sbt_0.13/0.1-SNAPSHOT/resolved.xml.properties:
--------------------------------------------------------------------------------
1 | #default#classification-build;0.1-SNAPSHOT resolved revisions
2 | #Tue Apr 12 10:12:42 CST 2016
3 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-library\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 ? 2.10.4 null
4 | +e\:scalaVersion\:\#@\#\:+2.10\:\#@\#\:+revision\:\#@\#\:+0.14.1\:\#@\#\:+module\:\#@\#\:+sbt-assembly\:\#@\#\:+e\:sbtVersion\:\#@\#\:+0.13\:\#@\#\:+organisation\:\#@\#\:+com.eed3si9n\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.14.1 release 0.14.1 null
5 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-compiler\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 release 2.10.4 null
6 | +revision\:\#@\#\:+0.13.8\:\#@\#\:+module\:\#@\#\:+sbt\:\#@\#\:+organisation\:\#@\#\:+org.scala-sbt\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.13.8 release 0.13.8 null
7 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/default/classification-build/scala_2.10/sbt_0.13/0.1-SNAPSHOT/resolved.xml.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
10 |
11 | classification-build
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/default/naturallanguageprocessing-build/scala_2.10/sbt_0.13/0.1-SNAPSHOT/resolved.xml.properties:
--------------------------------------------------------------------------------
1 | #default#naturallanguageprocessing-build;0.1-SNAPSHOT resolved revisions
2 | #Wed Oct 12 10:38:54 CST 2016
3 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-library\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 ? 2.10.4 null
4 | +e\:scalaVersion\:\#@\#\:+2.10\:\#@\#\:+revision\:\#@\#\:+0.14.1\:\#@\#\:+module\:\#@\#\:+sbt-assembly\:\#@\#\:+e\:sbtVersion\:\#@\#\:+0.13\:\#@\#\:+organisation\:\#@\#\:+com.eed3si9n\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.14.1 release 0.14.1 null
5 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-compiler\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 release 2.10.4 null
6 | +revision\:\#@\#\:+0.13.8\:\#@\#\:+module\:\#@\#\:+sbt\:\#@\#\:+organisation\:\#@\#\:+org.scala-sbt\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.13.8 release 0.13.8 null
7 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/default/naturallanguageprocessing-build/scala_2.10/sbt_0.13/0.1-SNAPSHOT/resolved.xml.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
10 |
11 | naturallanguageprocessing-build
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-calssification-build-docs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-calssification-build-optional.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-calssification-build-plugin.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-calssification-build-pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-calssification-build-sources.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-classification-build-docs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-classification-build-optional.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-classification-build-plugin.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-classification-build-pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-classification-build-sources.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-naturallanguageprocessing-build-docs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-naturallanguageprocessing-build-optional.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-naturallanguageprocessing-build-plugin.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-naturallanguageprocessing-build-pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/resolution-cache/reports/default-naturallanguageprocessing-build-sources.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/project/target/streams/$global/$global/$global/streams/out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/$global/$global/$global/streams/out
--------------------------------------------------------------------------------
/project/target/streams/$global/dependencyPositions/$global/streams/update_cache_2.10/input_dsp:
--------------------------------------------------------------------------------
1 | org.scala-lang
scala-library 2.10.4 provided com.eed3si9n sbt-assembly 0.14.1 e:sbtVersion 0.13 e:scalaVersion 2.10
--------------------------------------------------------------------------------
/project/target/streams/$global/dependencyPositions/$global/streams/update_cache_2.10/output_dsp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/$global/dependencyPositions/$global/streams/update_cache_2.10/output_dsp
--------------------------------------------------------------------------------
/project/target/streams/$global/ivyConfiguration/$global/streams/out:
--------------------------------------------------------------------------------
1 | [debug] Other repositories:
2 | [debug] Default repositories:
3 | [debug] Using inline dependencies specified in Scala.
4 |
--------------------------------------------------------------------------------
/project/target/streams/$global/ivySbt/$global/streams/out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/$global/ivySbt/$global/streams/out
--------------------------------------------------------------------------------
/project/target/streams/$global/projectDescriptors/$global/streams/out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/$global/projectDescriptors/$global/streams/out
--------------------------------------------------------------------------------
/project/target/streams/$global/update/$global/streams/update_cache_2.10/inputs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/$global/update/$global/streams/update_cache_2.10/inputs
--------------------------------------------------------------------------------
/project/target/streams/$global/update/$global/streams/update_cache_2.10/output:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/$global/update/$global/streams/update_cache_2.10/output
--------------------------------------------------------------------------------
/project/target/streams/compile/$global/$global/discoveredMainClasses/data:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/project/target/streams/compile/compile/$global/streams/out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/compile/compile/$global/streams/out
--------------------------------------------------------------------------------
/project/target/streams/compile/compileIncremental/$global/streams/export:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/compile/compileIncremental/$global/streams/export
--------------------------------------------------------------------------------
/project/target/streams/compile/compileIncremental/$global/streams/out:
--------------------------------------------------------------------------------
1 | [debug]
2 | [debug] Initial source changes:
3 | [debug] removed:Set()
4 | [debug] added: Set()
5 | [debug] modified: Set()
6 | [debug] Removed products: Set()
7 | [debug] External API changes: API Changes: Set()
8 | [debug] Modified binary dependencies: Set()
9 | [debug] Initial directly invalidated sources: Set()
10 | [debug]
11 | [debug] Sources indirectly invalidated by:
12 | [debug] product: Set()
13 | [debug] binary dep: Set()
14 | [debug] external source: Set()
15 | [debug] All initially invalidated sources: Set()
16 |
--------------------------------------------------------------------------------
/project/target/streams/compile/copyResources/$global/streams/copy-resources:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/project/target/streams/compile/copyResources/$global/streams/copy-resources
--------------------------------------------------------------------------------
/project/target/streams/compile/copyResources/$global/streams/out:
--------------------------------------------------------------------------------
1 | [debug] Copy resource mappings:
2 | [debug]
3 |
--------------------------------------------------------------------------------
/project/target/streams/compile/exportedProducts/$global/streams/export:
--------------------------------------------------------------------------------
1 | /Users/li/workshop/NaturalLanguageProcessing/project/target/scala-2.10/sbt-0.13/classes
2 |
--------------------------------------------------------------------------------
/project/target/streams/compile/internalDependencyClasspath/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/project/target/streams/compile/unmanagedClasspath/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/project/target/streams/compile/unmanagedJars/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/project/target/streams/runtime/dependencyClasspath/$global/streams/export:
--------------------------------------------------------------------------------
1 | /Users/li/workshop/NaturalLanguageProcessing/project/target/scala-2.10/sbt-0.13/classes:/Users/li/.ivy2/cache/scala_2.10/sbt_0.13/com.eed3si9n/sbt-assembly/jars/sbt-assembly-0.14.1.jar:/Users/li/.ivy2/cache/org.scalactic/scalactic_2.10/bundles/scalactic_2.10-2.2.1.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-library.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-reflect.jar:/Users/li/.ivy2/cache/org.pantsbuild/jarjar/jars/jarjar-1.6.0.jar:/Users/li/.ivy2/cache/org.apache.ant/ant/jars/ant-1.9.6.jar:/Users/li/.ivy2/cache/org.apache.ant/ant-launcher/jars/ant-launcher-1.9.6.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm/jars/asm-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-commons/jars/asm-commons-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-tree/jars/asm-tree-5.0.4.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-plugin-api/jars/maven-plugin-api-3.3.3.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-model/jars/maven-model-3.3.3.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-utils/jars/plexus-utils-3.0.20.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-artifact/jars/maven-artifact-3.3.3.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.plexus/eclipse-plugins/org.eclipse.sisu.plexus-0.3.0.jar:/Users/li/.ivy2/cache/javax.enterprise/cdi-api/jars/cdi-api-1.0.jar:/Users/li/.ivy2/cache/javax.annotation/jsr250-api/jars/jsr250-api-1.0.jar:/Users/li/.ivy2/cache/javax.inject/javax.inject/jars/javax.inject-1.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.inject/eclipse-plugins/org.eclipse.sisu.inject-0.3.0.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-component-annotations/jars/plexus-component-annotations-1.5.5.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-classworlds/bundles/plexus-classworlds-2.5.2.jar
2 |
--------------------------------------------------------------------------------
/project/target/streams/runtime/exportedProducts/$global/streams/export:
--------------------------------------------------------------------------------
1 | /Users/li/workshop/NaturalLanguageProcessing/project/target/scala-2.10/sbt-0.13/classes
2 |
--------------------------------------------------------------------------------
/project/target/streams/runtime/externalDependencyClasspath/$global/streams/export:
--------------------------------------------------------------------------------
1 | /Users/li/.ivy2/cache/scala_2.10/sbt_0.13/com.eed3si9n/sbt-assembly/jars/sbt-assembly-0.14.1.jar:/Users/li/.ivy2/cache/org.scalactic/scalactic_2.10/bundles/scalactic_2.10-2.2.1.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-library.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-reflect.jar:/Users/li/.ivy2/cache/org.pantsbuild/jarjar/jars/jarjar-1.6.0.jar:/Users/li/.ivy2/cache/org.apache.ant/ant/jars/ant-1.9.6.jar:/Users/li/.ivy2/cache/org.apache.ant/ant-launcher/jars/ant-launcher-1.9.6.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm/jars/asm-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-commons/jars/asm-commons-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-tree/jars/asm-tree-5.0.4.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-plugin-api/jars/maven-plugin-api-3.3.3.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-model/jars/maven-model-3.3.3.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-utils/jars/plexus-utils-3.0.20.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-artifact/jars/maven-artifact-3.3.3.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.plexus/eclipse-plugins/org.eclipse.sisu.plexus-0.3.0.jar:/Users/li/.ivy2/cache/javax.enterprise/cdi-api/jars/cdi-api-1.0.jar:/Users/li/.ivy2/cache/javax.annotation/jsr250-api/jars/jsr250-api-1.0.jar:/Users/li/.ivy2/cache/javax.inject/javax.inject/jars/javax.inject-1.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.inject/eclipse-plugins/org.eclipse.sisu.inject-0.3.0.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-component-annotations/jars/plexus-component-annotations-1.5.5.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-classworlds/bundles/plexus-classworlds-2.5.2.jar
2 |
--------------------------------------------------------------------------------
/project/target/streams/runtime/fullClasspath/$global/streams/export:
--------------------------------------------------------------------------------
1 | /Users/li/workshop/NaturalLanguageProcessing/project/target/scala-2.10/sbt-0.13/classes:/Users/li/.ivy2/cache/scala_2.10/sbt_0.13/com.eed3si9n/sbt-assembly/jars/sbt-assembly-0.14.1.jar:/Users/li/.ivy2/cache/org.scalactic/scalactic_2.10/bundles/scalactic_2.10-2.2.1.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-library.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-reflect.jar:/Users/li/.ivy2/cache/org.pantsbuild/jarjar/jars/jarjar-1.6.0.jar:/Users/li/.ivy2/cache/org.apache.ant/ant/jars/ant-1.9.6.jar:/Users/li/.ivy2/cache/org.apache.ant/ant-launcher/jars/ant-launcher-1.9.6.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm/jars/asm-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-commons/jars/asm-commons-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-tree/jars/asm-tree-5.0.4.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-plugin-api/jars/maven-plugin-api-3.3.3.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-model/jars/maven-model-3.3.3.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-utils/jars/plexus-utils-3.0.20.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-artifact/jars/maven-artifact-3.3.3.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.plexus/eclipse-plugins/org.eclipse.sisu.plexus-0.3.0.jar:/Users/li/.ivy2/cache/javax.enterprise/cdi-api/jars/cdi-api-1.0.jar:/Users/li/.ivy2/cache/javax.annotation/jsr250-api/jars/jsr250-api-1.0.jar:/Users/li/.ivy2/cache/javax.inject/javax.inject/jars/javax.inject-1.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.inject/eclipse-plugins/org.eclipse.sisu.inject-0.3.0.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-component-annotations/jars/plexus-component-annotations-1.5.5.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-classworlds/bundles/plexus-classworlds-2.5.2.jar
2 |
--------------------------------------------------------------------------------
/project/target/streams/runtime/internalDependencyClasspath/$global/streams/export:
--------------------------------------------------------------------------------
1 | /Users/li/workshop/NaturalLanguageProcessing/project/target/scala-2.10/sbt-0.13/classes
2 |
--------------------------------------------------------------------------------
/project/target/streams/runtime/managedClasspath/$global/streams/export:
--------------------------------------------------------------------------------
1 | /Users/li/.ivy2/cache/scala_2.10/sbt_0.13/com.eed3si9n/sbt-assembly/jars/sbt-assembly-0.14.1.jar:/Users/li/.ivy2/cache/org.scalactic/scalactic_2.10/bundles/scalactic_2.10-2.2.1.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-library.jar:/Users/li/.sbt/boot/scala-2.10.4/lib/scala-reflect.jar:/Users/li/.ivy2/cache/org.pantsbuild/jarjar/jars/jarjar-1.6.0.jar:/Users/li/.ivy2/cache/org.apache.ant/ant/jars/ant-1.9.6.jar:/Users/li/.ivy2/cache/org.apache.ant/ant-launcher/jars/ant-launcher-1.9.6.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm/jars/asm-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-commons/jars/asm-commons-5.0.4.jar:/Users/li/.ivy2/cache/org.ow2.asm/asm-tree/jars/asm-tree-5.0.4.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-plugin-api/jars/maven-plugin-api-3.3.3.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-model/jars/maven-model-3.3.3.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-utils/jars/plexus-utils-3.0.20.jar:/Users/li/.ivy2/cache/org.apache.maven/maven-artifact/jars/maven-artifact-3.3.3.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.plexus/eclipse-plugins/org.eclipse.sisu.plexus-0.3.0.jar:/Users/li/.ivy2/cache/javax.enterprise/cdi-api/jars/cdi-api-1.0.jar:/Users/li/.ivy2/cache/javax.annotation/jsr250-api/jars/jsr250-api-1.0.jar:/Users/li/.ivy2/cache/javax.inject/javax.inject/jars/javax.inject-1.jar:/Users/li/.ivy2/cache/org.eclipse.sisu/org.eclipse.sisu.inject/eclipse-plugins/org.eclipse.sisu.inject-0.3.0.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-component-annotations/jars/plexus-component-annotations-1.5.5.jar:/Users/li/.ivy2/cache/org.codehaus.plexus/plexus-classworlds/bundles/plexus-classworlds-2.5.2.jar
2 |
--------------------------------------------------------------------------------
/project/target/streams/runtime/unmanagedClasspath/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/project/target/streams/runtime/unmanagedJars/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/src/main/scala/deeplearning/cae/CAE.scala:
--------------------------------------------------------------------------------
1 | package deeplearning.cae
2 |
3 | /**
4 | * Created by li on 16/8/15.
5 | */
6 | object CAE {
7 |
8 | }
9 |
--------------------------------------------------------------------------------
/src/main/scala/deeplearning/cnn/CNNModel.scala:
--------------------------------------------------------------------------------
1 | package deeplearning.cnn
2 |
3 | import breeze.linalg.{CSCMatrix => BSM, DenseMatrix => BDM, DenseVector => BDV, Matrix => BM, SparseVector => BSV, Vector => BV}
4 | import org.apache.spark.rdd.RDD
5 |
6 | /**
7 |  * label: target matrix
8 |  * features: feature matrix
9 |  * predict_label: prediction matrix
10 |  * error: error matrix
11 | */
12 | case class PredictCNNLabel(label: BDM[Double], features: BDM[Double], predict_label: BDM[Double], error: BDM[Double]) extends Serializable
13 |
14 | class CNNModel(
15 | val cnn_layers: Array[CNNLayers],
16 | val cnn_ffW: BDM[Double],
17 | val cnn_ffb: BDM[Double]) extends Serializable {
18 |
19 | /**
20 |    * Returns the prediction results.
21 |    * Return format: (label, feature, predict_label, error)
22 | */
23 | def predict(dataMatrix: RDD[(BDM[Double], BDM[Double])]): RDD[PredictCNNLabel] = {
24 | val sc = dataMatrix.sparkContext
25 | val bc_cnn_layers = sc.broadcast(cnn_layers)
26 | val bc_cnn_ffW = sc.broadcast(cnn_ffW)
27 | val bc_cnn_ffb = sc.broadcast(cnn_ffb)
28 |     // CNNff performs the feed-forward pass
29 | val train_cnnff = CNN.CNNff(dataMatrix, bc_cnn_layers, bc_cnn_ffb, bc_cnn_ffW)
30 | val rdd_predict = train_cnnff.map { f =>
31 | val label = f._1
32 | val nna1 = f._2(0)(0)
33 | val nnan = f._4
34 | val error = f._4 - f._1
35 | PredictCNNLabel(label, nna1, nnan, error)
36 | }
37 | rdd_predict
38 | }
39 |
40 | /**
41 |    * Computes the output error
42 |    * (the mean error over all records).
43 | */
44 | def Loss(predict: RDD[PredictCNNLabel]): Double = {
45 | val predict1 = predict.map(f => f.error)
46 | // error and loss
47 |     // output error calculation
48 | val loss1 = predict1
49 | val (loss2, counte) = loss1.treeAggregate((0.0, 0L))(
50 | seqOp = (c, v) => {
51 | // c: (e, count), v: (m)
52 | val e1 = c._1
53 | val e2 = (v :* v).sum
54 | val esum = e1 + e2
55 | (esum, c._2 + 1)
56 | },
57 | combOp = (c1, c2) => {
58 | // c: (e, count)
59 | val e1 = c1._1
60 | val e2 = c2._1
61 | val esum = e1 + e2
62 | (esum, c1._2 + c2._2)
63 | })
64 | val Loss = (loss2 / counte.toDouble) * 0.5
65 | Loss
66 | }
67 |
68 | }
69 |
--------------------------------------------------------------------------------
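For reference, CNNModel.Loss above reduces to half of the per-record mean of the summed squared error. A plain-Scala sketch of the same reduction without Spark (illustrative only, not a file in this repository):

```scala
// Assumption: each record's error is flattened to an Array[Double];
// this mirrors the treeAggregate in CNNModel.Loss on a local collection.
object LossSketch {
  def loss(errors: Seq[Array[Double]]): Double = {
    val sumSq = errors.map(e => e.map(x => x * x).sum).sum // sum of squared errors over all records
    0.5 * sumSq / errors.length                            // halve the per-record mean
  }

  def main(args: Array[String]): Unit = {
    val errs = Seq(Array(0.1, -0.2), Array(0.0, 0.3))
    println(loss(errs)) // 0.5 * ((0.01 + 0.04) + 0.09) / 2 = 0.035
  }
}
```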
/src/main/scala/deeplearning/tests/Test_example_CNN.scala:
--------------------------------------------------------------------------------
1 | package tests
2 |
3 | import breeze.linalg.{CSCMatrix => BSM, DenseMatrix => BDM, DenseVector => BDV, Matrix => BM, SparseVector => BSV, Vector => BV, axpy => brzAxpy, max => Bmax, min => Bmin, sum => Bsum, svd => brzSvd}
4 | import deeplearning.cnn.CNN
5 | import org.apache.log4j.{Level, Logger}
6 | import org.apache.spark.{SparkConf, SparkContext}
7 |
8 | object Test_example_CNN {
9 |
10 | def main(args: Array[String]) {
11 |     //1 Build the Spark context
12 | val conf = new SparkConf().setAppName("CNNtest").setMaster("local")
13 | val sc = new SparkContext(conf)
14 |
15 |     //2 Load the test data
16 | Logger.getRootLogger.setLevel(Level.WARN)
17 | val data_path = "/Users/li/workshop/DataSet/deeplearning/train_d3.txt"
18 | val examples = sc.textFile(data_path).cache()
19 | val train_d1 = examples.map { line =>
20 | val f1 = line.split("\t")
21 | val f = f1.map(f => f.toDouble)
22 | val y = f.slice(0, 4)
23 | val x = f.slice(4, f.length)
24 | (new BDM(1, y.length, y), new BDM(1, x.length, x))
25 | }
26 |
27 | val train_d = train_d1.map(f => (f._1, f._2))
28 |
29 |
30 |     //3 Set training parameters and build the model
31 |     // opts: learning-rate step, number of iterations, cross-validation ratio
32 | val opts = Array(50.0, 1.0, 0.0)
33 | train_d.cache
34 | val numExamples = train_d.count()
35 | println(s"numExamples = $numExamples.")
36 |
37 | val CNNmodel = new CNN()
38 | .setMapsize(new BDM(1, 2, Array(28.0, 28.0)))
39 | .setTypes(Array("i", "c", "s", "c", "s"))
40 | .setLayer(5)
41 | .setOnum(10)
42 | .setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0))
43 | .setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0))
44 | .setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0))
45 | .setAlpha(1.0)
46 | .CNNtrain(train_d, opts)
47 |
48 |     //4 Test the model
49 | val CNNforecast = CNNmodel.predict(train_d)
50 | val CNNerror = CNNmodel.Loss(CNNforecast)
51 | println(s"NNerror = $CNNerror.")
52 | val printf1 = CNNforecast.map(f => (f.label.data, f.predict_label.data)).take(200)
53 |     println("Predicted values")
54 | for (i <- 0 until printf1.length) {
55 | val outi = printf1(i)._2.mkString("\t")
56 | println(outi)
57 | }
58 |
59 | }
60 | }
--------------------------------------------------------------------------------
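Test_example_CNN reads tab-separated lines in which the first four values form the label vector and the remaining values are the flattened input map (28 x 28 as configured by setMapsize). A small sketch of building a line in that format (an assumption about the data file, which is not included in the repository):

```scala
object TrainLineSketch {
  // Build one line in the format Test_example_CNN expects:
  // 4 label values followed by 28*28 = 784 feature values, tab-separated.
  def makeLine(label: Array[Double], features: Array[Double]): String = {
    require(label.length == 4 && features.length == 28 * 28)
    (label ++ features).mkString("\t")
  }

  def main(args: Array[String]): Unit = {
    val line = makeLine(Array(1.0, 0.0, 0.0, 0.0), Array.fill(28 * 28)(0.0))
    println(line.split("\t").length) // 788
  }
}
```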
/src/main/scala/intactprogram/telecomdataprocessing/util/HBaseUtil.scala:
--------------------------------------------------------------------------------
1 | package telecomdataprocessing.util
2 |
3 | import com.ibm.icu.text.CharsetDetector
4 | import org.apache.hadoop.conf.Configuration
5 | import org.apache.hadoop.hbase.HBaseConfiguration
6 | import org.apache.hadoop.hbase.client.Result
7 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable
8 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat
9 | import org.apache.spark.SparkContext
10 | import org.apache.spark.rdd.RDD
11 |
12 | import scala.xml.{Elem, XML}
13 |
14 | /**
15 | * Created by li on 16/7/7.
16 | */
17 | object HBaseUtil {
18 |
19 | /**
20 |    * Detects the character encoding.
21 |    *
22 |    * @param html the raw page content as bytes
23 |    * @return the detected character-set name
24 | */
25 | def judgeChaser(html: Array[Byte]): String = {
26 |
27 | val icu4j = new CharsetDetector()
28 | icu4j.setText(html)
29 | val encoding = icu4j.detect()
30 |
31 | encoding.getName
32 | }
33 |
34 | /**
35 |    * Loads the XML-format configuration file.
36 |    *
37 |    * @param dir path of the configuration file
38 |    * @return the parsed XML element
39 |    * @author Li Yu
40 |    * @note rowNum: 2
41 | */
42 | def readConfigFile(dir: String): Elem = {
43 |
44 | val configFile = XML.loadFile(dir)
45 |
46 | configFile
47 | }
48 |
49 | /**
50 |    * Reads the HBase settings and initializes the HBase configuration.
51 |    *
52 |    * @param configFile the HBase configuration file (XML)
53 |    * @return the initialized Configuration
54 |    * @author Li Yu
55 |    * @note rowNum: 7
56 | */
57 | def setHBaseConfigure(configFile: Elem): Configuration = {
58 |
59 | val rootDir = (configFile \ "hbase" \ "rootDir").text
60 | val ip = (configFile \ "hbase" \ "ip").text
61 |
62 |     // initialize the configuration
63 | val configuration = HBaseConfiguration.create()
64 | configuration.set("hbase.rootdir", rootDir)
65 | configuration.set("hbase.zookeeper.quorum", ip)
66 |
67 | configuration
68 | }
69 |
70 | /**
71 |    * Reads the contents of an HBase table as an RDD.
72 |    *
73 |    * @param sc SparkContext
74 |    * @param confDir directory of the configuration file
75 | * @author Li Yu
76 | * @note rowNum: 7
77 | */
78 | def getHBaseConf(sc: SparkContext, confDir: String, tableName: String) : RDD[(ImmutableBytesWritable, Result)] = {
79 |
80 | val configFile = HBaseUtil.readConfigFile(confDir)
81 | val configuration = HBaseUtil.setHBaseConfigure(configFile)
82 |
83 | configuration.set(TableInputFormat.INPUT_TABLE, tableName)
84 |
85 |     // create an RDD via the Hadoop API
86 | val hBaseRDD = sc.newAPIHadoopRDD(configuration,
87 | classOf[TableInputFormat],
88 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
89 | classOf[org.apache.hadoop.hbase.client.Result])
90 |
91 | hBaseRDD
92 | }
93 |
94 | }
95 |
--------------------------------------------------------------------------------
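A minimal usage sketch for HBaseUtil (illustrative only; the config path, table name, and column family/qualifier below are placeholders, not values from this repository):

```scala
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
import telecomdataprocessing.util.HBaseUtil

object HBaseUtilUsageSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("HBaseUtilUsage").setMaster("local"))

    // Read a table through the config-file helper defined in HBaseUtil.
    val rows = HBaseUtil.getHBaseConf(sc, "/path/to/config.xml", "some_table")

    // Decode each cell with the charset detected by judgeChaser.
    val texts = rows.map { case (_, result) =>
      val bytes = result.getValue(Bytes.toBytes("cf"), Bytes.toBytes("content"))
      new String(bytes, HBaseUtil.judgeChaser(bytes))
    }

    texts.take(5).foreach(println)
    sc.stop()
  }
}
```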
/src/main/scala/intactprogram/telecomdataprocessing/util/LoggerUtil.scala:
--------------------------------------------------------------------------------
1 | package telecomdataprocessing.util
2 |
3 | import org.apache.log4j.{BasicConfigurator, Logger}
4 |
5 | /**
6 |  * Logging helpers.
7 | */
8 | object LoggerUtil {
9 |
10 | val logger = Logger.getLogger("TelecomData_Processing")
11 | BasicConfigurator.configure()
12 | // PropertyConfigurator.configure("/home/mlearning/tdt/conf/log4j.properties")
13 |
14 | def exception(e: Exception): Unit = {
15 |
16 | logger.error(e.getMessage, e)
17 |
18 | }
19 |
20 | def error(msg: String): Unit = {
21 |
22 | logger.error(msg)
23 | }
24 |
25 | def warn(msg: String): Unit = {
26 |
27 | logger.warn(msg)
28 | }
29 |
30 | def info(msg: String): Unit = {
31 |
32 | logger.info(msg)
33 | }
34 |
35 | def debug(msg: String): Unit = {
36 |
37 | logger.debug(msg)
38 | }
39 |
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/scala/intactprogram/telecomdataprocessingAll/TelecomDataProcess.scala:
--------------------------------------------------------------------------------
1 | //package com.kunyan.dxdataprocess
2 | //
3 | //import java.text.SimpleDateFormat
4 | //
5 | //import org.apache.spark.{SparkConf, SparkContext}
6 | //import util.HBaseUtil
7 | //
8 | //import scala.collection.mutable.ArrayBuffer
9 | //
10 | ///**
11 | // * Created by QQ on 7/25/16.
12 | // */
13 | //object TelecomDataProcess {
14 | //
15 | // def getDayTimeStamp(startDay: String): Long = {
16 | //
17 | // val sdf = new SimpleDateFormat("yyyy-MM-dd")
18 | // val dayStamp = sdf.parse(startDay).getTime
19 | //
20 | // dayStamp
21 | // }
22 | //
23 | // /**
24 | //  * Given a time span, build consecutive time windows of the given length
25 | //  *
26 | //  * @param startTime start timestamp
27 | //  * @param endTime end timestamp
28 | //  * @param timeRange window length in hours
29 | // * @return Array[(Long, Long)]
30 | // * @note rowNum:11
31 | // */
32 | // def makeHourTimeWindows(startTime: Long, endTime: Long, timeRange: Int): Array[(Long, Long)] = {
33 | //
34 | // var count = startTime
35 | // val dayWindows = ArrayBuffer[(Long, Long)]()
36 | //
37 | // do {
38 | //
39 | // // (start, start + timeRange - 1)
40 | // dayWindows.append((count, count + 60L * 60 * 1000 * timeRange - 1))
41 | // count += 60L * 60 * 1000
42 | //
43 | // } while (count < endTime)
44 | //
45 | // dayWindows.toArray
46 | // }
47 | //
48 | // def judgeTimeWindow(time: Long, timeWindow: Array[(Long, Long)]): (Long, Long) = {
49 | //
50 | // timeWindow.foreach(line => {
51 | // if (time >= line._1 && time <= line._2){
52 | // return line
53 | // }
54 | // })
55 | //
56 | // (-1L, -1L)
57 | // }
58 | //
59 | // def urlFormat(url: String): String = {
60 | //
61 | // val temp = url.split("://")
62 | //
63 | // temp.length match {
64 | // case 1 => temp(0).replaceAll("wwww", "")
65 | // case 2 => temp(1).replaceAll("wwww", "")
66 | // }
67 | // }
68 | //
69 | // def main(args: Array[String]) {
70 | //
71 | // val conf = new SparkConf()
72 | // .setAppName(s"Warren_TelecomData_Processing_${args(0)}")
73 | // .set("dfs.replication", "1")
74 | // // .setMaster("local")
75 | // // .set("spark.driver.host","192.168.2.90")
76 | // val sc = new SparkContext(conf)
77 | //
78 | // val jsonConfig = new JsonConfig
79 | // jsonConfig.initConfig(args(1))
80 | //
81 | // val hbaseConfig = HBaseUtil.getHbaseConf(jsonConfig.getValue("hbase", "rootDir"),
82 | // jsonConfig.getValue("hbase", "ips"))
83 | //
84 | // val startDayTimeStamp = getDayTimeStamp(args(0))
85 | // val endDayTimeStamp = startDayTimeStamp + 24L * 60 * 60 * 1000
86 | //
87 | // // Build the time windows
88 | // val timeRanges = sc.broadcast(makeHourTimeWindows(startDayTimeStamp, endDayTimeStamp, 1))
89 | //
90 | // // Load the telecom data
91 | // val teleData = sc.textFile(jsonConfig.getValue("tp", "telecomDataPath") + s"/${args(0)}",
92 | // jsonConfig.getValue("tp", "partition").toInt)
93 | //
94 | // // Collect all urls that need to be matched, and broadcast them
95 | // val urlsBr = sc.broadcast(HBaseUtil.getRDD(sc, hbaseConfig).map(x => urlFormat(x.split("\n\t")(0))).collect()) // the news urls fetched elsewhere need some normalisation here, e.g. stripping www and http
96 | //
97 | // // Group and count
98 | // teleData.map(row => {
99 | // val tmp = row.split("\t")
100 | // val url = urlFormat(tmp(3) + tmp(4))
101 | // val time = tmp(0)
102 | //
103 | // (url, time)
104 | // }).filter(x => urlsBr.value.contains(x._1)).map(row => {
105 | //
106 | // val timeWindow = judgeTimeWindow(row._2.toLong, timeRanges.value)
107 | //
108 | // ((timeWindow._1, timeWindow._2, row._1), 1L)
109 | // }).reduceByKey(_ + _).saveAsTextFile(jsonConfig.getValue("tp", "outputPath") + s"/${args(0)}")
110 | // }
111 | //}
112 |
--------------------------------------------------------------------------------
/src/main/scala/intactprogram/telecomdataprocessingAll/readFromHdfs.scala:
--------------------------------------------------------------------------------
1 | //package telecomdataprocessingAll
2 | //
3 | //import org.apache.spark.{SparkConf, SparkContext}
4 | //import util.LoggerUtil
5 | //
6 | ///**
7 | // * Created by li on 16/7/27.
8 | // */
9 | //object readFromHdfs {
10 | //
11 | // def main(args: Array[String]) {
12 | //
13 | // val conf = new SparkConf().setAppName("Warren_ReadFrom_Hdfs_filter")
14 | //
15 | // val sc = new SparkContext(conf)
16 | //
17 | // val hdfsDir = args(0)
18 | //// val hdfsDir = "hdfs://222.73.57.12:9000/telecom/shdx/origin/data/"
19 | //
20 | // val setTime = args(1)
21 | //// val setTime = "2016-07-23"
22 | //
23 | //
24 | // val time = System.currentTimeMillis()
25 | //
26 | // LoggerUtil.warn("time2Start:" +"%s".format(time)+ " 》》》》》》》》》》》》")
27 | // // Start and end timestamps of the data window
28 | // val stopTimeStamp = TDP.getDayTimeStamp(setTime)
29 | // val startTimeStamp = stopTimeStamp - 24 * 60 * 60 * 1000
30 | // val timeRanges = sc.broadcast(TDP.makeHourTimeWindows(startTimeStamp, stopTimeStamp -1, 1))
31 | //
32 | // // Host domains of the news sites to keep
33 | // val urlUnion = Array("yicai.com", "21cn.com", "d.weibo.com","xueqiu.com","10jqka.com.cn","gw.com.cn",
34 | // "eastmoney.com","p5w.net","stockstar.com","hexun.com","caijing.com.cn","jrj.com.cn","cfi.net.cn","cs.com.cn",
35 | // "cnstock.com", "stcn.com","news.cn","finance.ifeng.com","finance.sina.com.cn","business.sohu.com","money.163.com",
36 | // "wallstreetcn.com","finance.qq.com","moer.jiemian.com","www.szse.cn","weixin.sogou.com","sse.com.cn","zqyjbg.com")
37 | //
38 | // val dataFromHDFS2 = sc.textFile(hdfsDir + setTime + "/*")
39 | // .filter(! _.contains("home/telecom"))
40 | // .filter(! _.contains("youchaojiang"))
41 | // .map(_.split("\t"))
42 | // .filter(_.length == 8)
43 | // .filter(x => urlUnion.contains(TDP.urlFormat(x(3))))
44 | // .map(x => (TDP.urlFormat(x(3) + x(4)), x(0)))
45 | //
46 | // val result = dataFromHDFS2.map(row => {
47 | //
48 | // val timeWindow = TDP.judgeTimeWindow(row._2.toLong, timeRanges.value)
49 | //
50 | // ((timeWindow._1, timeWindow._2, row._1), 1L)
51 | // }).reduceByKey(_ + _).count()
52 | //
53 | // println(result)
54 | //
55 | //
56 | // LoggerUtil.warn("time2End:" +"%s".format(time)+ " 》》》》》》》》》》》》")
57 | //
58 | // }
59 | //
60 | //}
61 |
--------------------------------------------------------------------------------
/src/main/scala/intactprogram/telecomdataprocessingAll/util/HDFSUtil.scala:
--------------------------------------------------------------------------------
1 | package telecomdataprocessingAll.util
2 |
3 | import java.text.SimpleDateFormat
4 |
5 | /**
6 | * Created by li on 16/7/25.
7 | */
8 | object HDFSUtil {
9 |
10 |
11 | def main(args: Array[String]) {
12 | val dataFormat = new SimpleDateFormat("yyyy-MM-dd")
13 | val startTime = dataFormat.parse("2012-12-12")
14 | val startTimeStamp = startTime.getTime
15 | val stopTimeStamp = startTime.getTime - 24 * 60 * 60 * 1000 -1
16 |
17 |
18 | println(startTimeStamp, stopTimeStamp)
19 | }
20 |
21 |
22 |
23 | }
24 |
--------------------------------------------------------------------------------
/src/main/scala/intactprogram/telecomdataprocessingAll/util/LoggerUtil.scala:
--------------------------------------------------------------------------------
1 | package telecomdataprocessingAll.util
2 |
3 | import org.apache.log4j.{BasicConfigurator, Logger}
4 |
5 | /**
6 | * Logging helpers
7 | */
8 | object LoggerUtil {
9 |
10 | val logger = Logger.getLogger("TelecomData_Processing")
11 | BasicConfigurator.configure()
12 | // PropertyConfigurator.configure("/home/mlearning/tdt/conf/log4j.properties")
13 |
14 | def exception(e: Exception): Unit = {
15 |
16 | logger.error(e.getMessage, e)
17 |
18 | }
19 |
20 | def error(msg: String): Unit = {
21 |
22 | logger.error(msg)
23 | }
24 |
25 | def warn(msg: String): Unit = {
26 |
27 | logger.warn(msg)
28 | }
29 |
30 | def info(msg: String): Unit = {
31 |
32 | logger.info(msg)
33 | }
34 |
35 | def debug(msg: String): Unit = {
36 |
37 | logger.debug(msg)
38 | }
39 |
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/scala/intactprogram/telecomdataprocessingAll/util/TimeUtil.scala:
--------------------------------------------------------------------------------
1 | package telecomdataprocessingAll.util
2 |
3 | import java.math.BigInteger
4 | import java.text.SimpleDateFormat
5 | import java.util.{Calendar, Date}
6 |
7 | import org.apache.hadoop.hbase.client.Scan
8 | import org.apache.hadoop.hbase.protobuf.ProtobufUtil
9 | import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
10 | import org.apache.hadoop.hbase.util.Base64
11 |
12 | /**
13 | * Created by C.J.YOU on 2016/1/13.
14 | * Utility class for formatting time
15 | */
16 | object TimeUtil {
17 |
18 | def getTime(timeStamp: String): String = {
19 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd_HH:mm:ss")
20 | val bigInt: BigInteger = new BigInteger(timeStamp)
21 | val date: String = sdf.format(bigInt)
22 | date
23 | }
24 |
25 | def getDay: String = {
26 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
27 | val date: String = sdf.format(new Date)
28 | date
29 | }
30 |
31 | def getCurrentHour: Int = {
32 | val calendar = Calendar.getInstance
33 | calendar.setTime(new Date)
34 | calendar.get(Calendar.HOUR_OF_DAY)
35 | }
36 |
37 | def getPreHourStr: String = {
38 | val date = new Date(new Date().getTime - 60 * 60 * 1000)
39 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd-HH")
40 | sdf.format(date)
41 | }
42 |
43 | /**
44 | * Get today's date
45 | *
46 | * @return
47 | */
48 | def getNowDate(): String = {
49 | val now: Date = new Date()
50 | val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
51 | val res = dateFormat.format( now )
52 | res
53 | }
54 |
55 |
56 | /**
57 | * Get the start time of the current week (placeholder, not implemented)
58 | */
59 | def Null(){
60 |
61 | }
62 |
63 | /**
64 | * Get the start time of the current month (not implemented)
65 | * http://blog.csdn.net/springlustre/article/details/47273353
66 | */
67 |
68 |
69 | /**
70 | * Set the time range for an hbase scan (last 30 days up to now)
71 | *
72 | * @return Base64-encoded scan carrying the time range
73 | * @author yangshuai
74 | */
75 | def setTimeRange(): String = {
76 |
77 | val scan = new Scan()
78 | val date = new Date(new Date().getTime - 30L * 24 * 60 * 60 * 1000)
79 | val format = new SimpleDateFormat("yyyy-MM-dd HH")
80 | val time = format.format(date)
81 | val time1 = format.format(new Date().getTime)
82 | val startTime = time + "-00-00"
83 | val stopTime = time1 + "-00-00"
84 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss")
85 | val startRow: Long = sdf.parse(startTime).getTime
86 | val stopRow: Long = sdf.parse(stopTime).getTime
87 |
88 | scan.setTimeRange(startRow, stopRow)
89 | val proto: ClientProtos.Scan = ProtobufUtil.toScan(scan)
90 |
91 | Base64.encodeBytes(proto.toByteArray)
92 | }
93 |
94 | /**
95 | * Set a one-day time range ending at the given date
96 | * @param time the given date
97 | * @return Base64-encoded scan covering the previous day up to the given date
98 | */
99 | def setAssignedTimeRange(time: String): String = {
100 |
101 | val format = new SimpleDateFormat("yyyy-MM-dd")
102 |
103 | val date = format.parse(time)
104 |
105 | val endTime = new Date(date.getTime - 24 * 60 * 60 * 1000)
106 |
107 | val stopTime = format.format(endTime)
108 |
109 | val startDate = time + "-00-00-00"
110 | val stopDate = stopTime + "-00-00-00"
111 |
112 | val sdf = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss")
113 | val startRaw = sdf.parse(startDate).getTime
114 | val stopRaw = sdf.parse(stopDate).getTime
115 |
116 | val scan = new Scan()
117 | scan.setTimeRange(stopRaw, startRaw) // minStamp first: stopRaw (previous day) is earlier than startRaw (given date)
118 |
119 | val proto = ProtobufUtil.toScan(scan)
120 |
121 | Base64.encodeBytes(proto.toByteArray)
122 | }
123 |
124 |
125 | }
126 |
--------------------------------------------------------------------------------
/src/main/scala/intactprogram/vipstockstatistic/CorpusBuild.scala:
--------------------------------------------------------------------------------
1 | package dataprocess.vipstockstatistic
2 |
3 | import com.kunyandata.nlpsuit.util.{TextPreprocessing, KunyanConf}
4 | import org.apache.spark.rdd.RDD
5 |
6 | import scala.xml.XML
7 |
8 | /**
9 | * Created by li on 2016/8/23.
10 | * Invoke the Kunyan word segmentation service
11 | */
12 | object CorpusBuild {
13 |
14 | /**
15 | * Initialize the configuration
16 | *
17 | * @param xmlConfPath path of the xml configuration file
18 | * @return initialized Kunyan configuration
19 | * @author Li Yu
20 | * @note rowNum = 6
21 | */
22 | def paramInit(xmlConfPath: String): KunyanConf = {
23 |
24 | val kunyanConf = new KunyanConf
25 | val confFile = XML.loadFile(xmlConfPath)
26 |
27 | val kunyanHost = { confFile \ "kunyan" \ "kunyanHost" }.text
28 | val kunyanPort = { confFile \ "kunyan" \ "kunyanPort" }.text.toInt
29 | kunyanConf.set(kunyanHost, kunyanPort)
30 |
31 | kunyanConf
32 | }
33 |
34 | /**
35 | * Word segmentation
36 | *
37 | * @param xmlPath path of the xml configuration file
38 | * @author Li Yu
39 | * @note rownum = 6
40 | */
41 | def run(xmlPath: String, news: RDD[Array[String]]): RDD[(String, String)] = {
42 |
43 | System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
44 |
45 | // Initialize the configuration
46 | val kunyanConf = paramInit(xmlPath)
47 |
48 | // Call the segmentation service; output is (url, segmented text)
49 | val stopWords = Array(" ")
50 | val corpus = news.map(row => (row(2), TextPreprocessing.process(row(3), stopWords, kunyanConf).mkString(",")))
51 |
52 | corpus
53 | }
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/src/main/scala/intactprogram/vipstockstatistic/util/AnsjAnalyzer.scala:
--------------------------------------------------------------------------------
1 | package dataprocess.vipstockstatistic.util
2 |
3 | import org.ansj.library.UserDefineLibrary
4 | import org.ansj.splitWord.analysis.{NlpAnalysis, ToAnalysis}
5 | import org.apache.spark.SparkContext
6 |
7 |
8 | /**
9 | * Created by zhangxin on 2016/3/8
10 | * Word segmentation utilities based on ansj
11 | */
12 | object AnsjAnalyzer {
13 |
14 | /**
15 | * Initialize the ansj segmenter and add the user dictionaries
16 | *
17 | * @param sc spark context
18 | * @param userDic array of user dictionary paths
19 | * @return nothing
20 | * @author zhangxin
21 | */
22 | def init(sc: SparkContext, userDic: Array[String]): Unit = {
23 |
24 | if(userDic != null ){
25 | userDic.foreach(addUserDic(_, sc))
26 | }
27 |
28 | }
29 |
30 | /**
31 | * Add a user dictionary to the segmenter
32 | *
33 | * @param dicPath dictionary path
34 | * @param sc spark context
35 | * @return nothing
36 | * @author zhangxin
37 | */
38 | def addUserDic(dicPath: String, sc: SparkContext): Unit = {
39 |
40 | // Load the dictionary
41 | val dic = sc.textFile(dicPath).collect()
42 |
43 | // Register the words with ansj
44 | dic.foreach(UserDefineLibrary.insertWord(_, "userDefine", 100))
45 |
46 | }
47 |
48 | /**
49 | * Standard segmentation, without POS tags
50 | *
51 | * @param sentence sentence to segment
52 | * @return segmentation result
53 | * @author zhangxin
54 | */
55 | def cutNoTag(sentence: String): Array[String] = {
56 |
57 | // Segment the sentence
58 | val sent = ToAnalysis.parse(sentence)
59 |
60 | // Extract the tokens, dropping POS information
61 | val words = for(i <- Range(0, sent.size())) yield sent.get(i).getName
62 |
63 | words.toArray
64 | }
65 |
66 | /**
67 | * NLP segmentation, with POS tagging
68 | *
69 | * @param sentence sentence to segment
70 | * @return segmentation result
71 | * @author zhangxin
72 | */
73 | def cutWithTag(sentence: String):Array[String]={
74 |
75 | // Segment the sentence
76 | val sent = NlpAnalysis.parse(sentence)
77 |
78 | // Extract the tokens
79 | val words= for(i <- Range(0, sent.size())) yield sent.get(i).getName
80 |
81 | words.toArray
82 | }
83 |
84 | }
85 |
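A quick sketch of how the two cut methods are called; the sample sentence is arbitrary, and note that cutWithTag also returns only the surface forms, since getName strips the POS tag:

import dataprocess.vipstockstatistic.util.AnsjAnalyzer

object AnsjAnalyzerExample {

  def main(args: Array[String]): Unit = {

    // Standard segmentation.
    val tokens = AnsjAnalyzer.cutNoTag("今天上证指数上涨了")
    println(tokens.mkString("/"))

    // NLP segmentation; same output shape as cutNoTag because only getName is kept.
    val nlpTokens = AnsjAnalyzer.cutWithTag("今天上证指数上涨了")
    println(nlpTokens.mkString("/"))
  }
}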
--------------------------------------------------------------------------------
/src/main/scala/intactprogram/vipstockstatistic/util/HBaseUtil.scala:
--------------------------------------------------------------------------------
1 | package dataprocess.vipstockstatistic.util
2 |
3 | import java.text.SimpleDateFormat
4 | import com.ibm.icu.text.CharsetDetector
5 | import org.apache.hadoop.conf.Configuration
6 | import org.apache.hadoop.hbase.HBaseConfiguration
7 | import org.apache.hadoop.hbase.client.{Result, Scan}
8 | import org.apache.hadoop.hbase.io.ImmutableBytesWritable
9 | import org.apache.hadoop.hbase.mapreduce.TableInputFormat
10 | import org.apache.hadoop.hbase.protobuf.ProtobufUtil
11 | import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
12 | import org.apache.hadoop.hbase.util.Base64
13 | import org.apache.spark.SparkContext
14 | import org.apache.spark.rdd.RDD
15 | import scala.xml.{Elem, XML}
16 |
17 | /**
18 | * Created by li on 16/7/7.
19 | */
20 | object HBaseUtil {
21 |
22 | /**
23 | * Set a one-day time range starting at the given day
24 | *
25 | * @return Base64-encoded scan carrying the time range
26 | * @author yangshuai
27 | */
28 | def setTimeRange(startDay: String): String = {
29 |
30 | val scan = new Scan()
31 |
32 | val sdf = new SimpleDateFormat("yyyy-MM-dd")
33 | val startRow = sdf.parse(startDay).getTime
34 | val stopRow = startRow + 24 * 60 * 60 * 1000 - 1
35 |
36 | scan.setTimeRange(startRow, stopRow)
37 | val proto: ClientProtos.Scan = ProtobufUtil.toScan(scan)
38 |
39 | Base64.encodeBytes(proto.toByteArray)
40 | }
41 |
42 | /**
43 | * Detect the character encoding of raw bytes
44 | *
45 | * @param html raw page bytes
46 | * @return detected charset name
47 | */
48 | def judgeChaser(html: Array[Byte]): String = {
49 |
50 | val icu4j = new CharsetDetector()
51 | icu4j.setText(html)
52 | val encoding = icu4j.detect()
53 |
54 | encoding.getName
55 | }
56 |
57 | /**
58 | * Load the xml configuration file
59 | *
60 | * @param dir directory of the configuration file
61 | * @return parsed xml element
62 | * @author Li Yu
63 | * @note rowNum: 2
64 | */
65 | def readConfigFile(dir: String): Elem = {
66 |
67 | val configFile = XML.loadFile(dir)
68 |
69 | configFile
70 | }
71 |
72 | /**
73 | * Read the hbase settings and initialize the hbase configuration
74 | *
75 | * @param configFile hbase configuration xml
76 | * @return initialized Configuration
77 | * @author Li Yu
78 | * @note rowNum: 7
79 | */
80 | def setHBaseConfigure(configFile: Elem): Configuration = {
81 |
82 | val rootDir = (configFile \ "hbase" \ "rootDir").text
83 | val ip = (configFile \ "hbase" \ "ip").text
84 |
85 | // Initialize the configuration
86 | val configuration = HBaseConfiguration.create()
87 | configuration.set("hbase.rootdir", rootDir)
88 | configuration.set("hbase.zookeeper.quorum", ip)
89 |
90 | configuration
91 | }
92 |
93 | /**
94 | * Fetch rows from hbase as an RDD
95 | *
96 | * @param sc SparkContext
97 | * @param confDir directory of the configuration file
98 | * @author Li Yu
99 | * @note rowNum: 7
100 | */
101 | def getHBaseConf(sc: SparkContext, confDir: String, tableName: String) : RDD[(ImmutableBytesWritable, Result)] = {
102 |
103 | val configFile = readConfigFile(confDir)
104 | val configuration = setHBaseConfigure(configFile)
105 |
106 | configuration.set(TableInputFormat.INPUT_TABLE, tableName)
107 | // configuration.set(TableInputFormat.SCAN, timeRange)
108 |
109 | // Create an RDD via the Hadoop API
110 | val hBaseRDD = sc.newAPIHadoopRDD(configuration,
111 | classOf[TableInputFormat],
112 | classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
113 | classOf[org.apache.hadoop.hbase.client.Result])
114 |
115 | hBaseRDD
116 | }
117 |
118 | }
119 |
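setTimeRange produces the Base64-encoded protobuf scan string that TableInputFormat expects, which is what the commented-out SCAN line in getHBaseConf would use; a small sketch, with a hypothetical config path, table name, and date:

import org.apache.hadoop.hbase.mapreduce.TableInputFormat

// Build a configuration restricted to a single day's cells.
val configuration = HBaseUtil.setHBaseConfigure(HBaseUtil.readConfigFile("/path/to/config.xml"))
configuration.set(TableInputFormat.INPUT_TABLE, "news_table")
configuration.set(TableInputFormat.SCAN, HBaseUtil.setTimeRange("2016-08-23"))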
--------------------------------------------------------------------------------
/src/main/scala/intactprogram/vipstockstatistic/util/LoggerUtil.scala:
--------------------------------------------------------------------------------
1 | package dataprocess.vipstockstatistic.util
2 |
3 | import org.apache.log4j.{BasicConfigurator, Logger}
4 |
5 | /**
6 | * Logging helpers
7 | */
8 | object LoggerUtil {
9 |
10 | val logger = Logger.getLogger("Warren_VipStockStatistic_Processing")
11 | BasicConfigurator.configure()
12 | // PropertyConfigurator.configure("/home/alg/telecomdataprocess/conf/log4j.properties")
13 |
14 | def exception(e: Exception): Unit = {
15 |
16 | logger.error(e.getMessage, e)
17 |
18 | }
19 |
20 | def error(msg: String): Unit = {
21 |
22 | logger.error(msg)
23 | }
24 |
25 | def warn(msg: String): Unit = {
26 |
27 | logger.warn(msg)
28 | }
29 |
30 | def info(msg: String): Unit = {
31 |
32 | logger.info(msg)
33 | }
34 |
35 | def debug(msg: String): Unit = {
36 |
37 | logger.debug(msg)
38 | }
39 |
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/scala/intactprogram/vipstockstatistic/util/RedisUtil.scala:
--------------------------------------------------------------------------------
1 | package dataprocess.vipstockstatistic.util
2 |
3 | import redis.clients.jedis.Jedis
4 |
5 | import scala.xml.XML
6 |
7 | /**
8 | * Created by li on 16/8/23.
9 | */
10 | object RedisUtil {
11 |
12 | var jedis: Jedis = null
13 |
14 | /**
15 | * Initialize redis
16 | *
17 | * @param confDir path of the xml configuration file
18 | * @note rowNum: 10
19 | */
20 | def initRedis(confDir: String): Jedis = {
21 |
22 | val configFile = XML.loadFile(confDir)
23 |
24 | val redisIp = (configFile \ "redis" \ "ip").text
25 | val redisPort = (configFile \ "redis" \ "port").text.toInt
26 | val redisDB = (configFile \ "redis" \ "db").text.toInt
27 | val redisAuth = (configFile \ "redis" \ "auth").text
28 |
29 | jedis = new Jedis(redisIp, redisPort)
30 | jedis.auth(redisAuth)
31 | jedis.select(redisDB)
32 |
33 | jedis
34 | }
35 |
36 | /**
37 | * Save the results to redis
38 | *
39 | * @param resultData data to save
40 | * @author Li Yu
41 | * @note rowNum: 12
42 | */
43 | def write2Redis(resultData: Array[(String, String)], time: String, dataType: String, confDir: String): Unit = {
44 |
45 | val jedis = initRedis(confDir)
46 |
47 | resultData.foreach{ x => {
48 |
49 | jedis.zadd(s"vipstockstatistic_$dataType" + s"_$time", x._2.toDouble, x._1)
50 | }}
51 | }
52 |
53 | }
54 |
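A small sketch of write2Redis in use; the stock codes, scores, date, and config path are made up. Each entry lands in the sorted set keyed by vipstockstatistic_<dataType>_<time>, with the second tuple element as its score. Note that write2Redis opens a new connection on every call and never closes it.

// Hypothetical data and config path; the xml must contain the redis ip/port/db/auth nodes read by initRedis.
val resultData = Array(("600000", "12.0"), ("600519", "7.5"))
RedisUtil.write2Redis(resultData, "2016-08-23", "hot", "/path/to/config.xml")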
--------------------------------------------------------------------------------
/src/main/scala/intactprogram/vipstockstatistic/util/config.xml:
--------------------------------------------------------------------------------
1 | <config>
2 |
3 | <hbase>
4 |
5 | <rootDir>hdfs://61.147.114.85:9000/hbase</rootDir>
6 | <ip>slave1,slave2,slave3</ip>
7 |
8 | </hbase>
9 |
10 | <hdfs>hdfs://61.147.114.85:9000</hdfs>
11 |
12 | <kunyan>
13 |
14 | <kunyanHost>61.147.114.88</kunyanHost>
15 | <kunyanPort>16003</kunyanPort>
16 |
17 | </kunyan>
18 | <redis>
19 | <ip>61.147.114.72</ip>
20 | <port>6666</port>
21 | <db>db9</db>
22 | <auth>backtest</auth>
23 | </redis>
24 |
25 | </config>
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/Recommendation/SparkMLlibColbFilter.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.Recommendation
2 | import org.apache.spark.mllib.recommendation.{ALS, MatrixFactorizationModel, Rating}
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 |
6 | /**
7 | * Created by li on 2017/3/23.
8 | * ALS collaborative-filtering recommendation proceeds as follows:
9 | * load the data into a ratings RDD, one record per line: user, product, rate
10 | * derive the (user, product) pairs from ratings
11 | * train an ALS model on ratings
12 | * predict a score for every (user, product) pair with the model: ((user, product), rate)
13 | * take the actual scores from ratings: ((user, product), rate)
14 | * join the predicted and actual scores and compute the mean squared error
15 | */
16 |
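The final step above is the mean squared error over the set R of observed (user, product) pairs, which the code below computes by joining the predicted and actual ratings:

MSE = \frac{1}{|R|} \sum_{(u,p) \in R} \left( r_{up} - \hat{r}_{up} \right)^2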
17 | object SparkMLlibColbFilter {
18 |
19 | def main(args: Array[String]) {
20 | val conf = new SparkConf().setAppName("Java Collaborative Filtering Example").setMaster("local")
21 | val sc = new SparkContext(conf)
22 |
23 | // Load and parse the data
24 | val path = "file:///data/hadoop/spark-2.0.0-bin-hadoop2.7/data/mllib/als/test.data"
25 | val data = sc.textFile(path)
26 | val ratings = data.map(_.split(",") match { case Array(user, item, rate) =>
27 | Rating(user.toInt, item.toInt, rate.toDouble)
28 | })
29 |
30 | // Build the recommendation model using ALS
31 | val rank = 10
32 | val numIterations = 10
33 | val model = ALS.train(ratings, rank, numIterations, 0.01)
34 |
35 | // Evaluate the model on rating data
36 | val usersProducts = ratings.map { case Rating(user, product, rate) =>
37 | (user, product)
38 | }
39 |
40 | val predictions =
41 | model.predict(usersProducts).map { case Rating(user, product, rate) =>
42 | ((user, product), rate)
43 | }
44 |
45 | val ratesAndPreds = ratings.map { case Rating(user, product, rate) =>
46 | ((user, product), rate)
47 | }.join(predictions)
48 |
49 | val MSE = ratesAndPreds.map { case ((user, product), (r1, r2)) =>
50 | val err = r1 - r2
51 | err * err
52 | }.mean()
53 |
54 | System.out.println("Mean Squared Error = " + MSE)
55 |
56 | // Save and load model
57 | model.save(sc, "target/tmp/myCollaborativeFilter")
58 | val sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
59 |
60 |
61 | // Recommend products for every user; the results could be stored in redis or hbase keyed by user id
62 | val users = data.map(_.split(",")(0)).distinct().collect()
63 |
64 | for (elem <- users) {
65 |
66 | val res = model.recommendProducts(elem.toInt, numIterations)
67 | res.foreach(itm => println((itm.user, itm.product, itm.rating)))
68 | }
69 | }
70 | }
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/classification/BinaryClassificationWithALS.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.classification
2 |
3 | import org.apache.spark.ml.feature.{HashingTF, IDF, StopWordsRemover, Tokenizer}
4 | import org.apache.spark.mllib.linalg.{Vector, Vectors}
5 | import org.apache.spark.mllib.regression.LabeledPoint
6 | import org.apache.spark.sql.{Row, SQLContext}
7 | import org.apache.spark.{SparkConf, SparkContext}
8 |
9 | import scala.io.Source
10 |
11 |
12 |
13 | /**
14 | * Created by li on 16/4/8.
15 | */
16 | object BinaryClassificationWithALS {
17 |
18 | val conf = new SparkConf().setMaster("local").setAppName("StopWordRemove")
19 | val sc = new SparkContext(conf)
20 | val sqlContext = new SQLContext(sc)
21 | // val hivecontext = new HiveContext(sc)
22 | import sqlContext.implicits._
23 |
24 |
25 | // DataFrame-type data set import
26 | // val src = Source.fromFile("/users/li/Intellij/Native-Byes/nativebyes/wordseg_881156.txt").getLines().toArray
27 |
28 | // Full (unbalanced) data set loading
29 | // case class RawDataRecord( category: String ,labels: Double ,text: String)
30 | //
31 | // val src = Source.fromFile("/Users/li/Downloads/traningset/HGHQ.txt").getLines().toArray.map{
32 | // line =>
33 | // val data = line.split("\t")
34 | // RawDataRecord(data(1),data(0).toDouble,data(2))
35 | // }
36 |
37 |
38 | // // Balanced data set loading
39 | case class RawDataRecord(labels: Double ,text: String)
40 | val src = sc.textFile("/Users/li/Downloads/trainingSets/保险").map{
41 | line =>
42 | val data = line.split("\t")
43 | RawDataRecord(data(0).toDouble, data(1))
44 | }
45 |
46 |
47 | val srcDF = sqlContext.createDataFrame(src)
48 |
49 |
50 | // RDD type
51 | // val srcRDD = sc.textFile("/users/li/Intellij/Native-Byes/nativebyes/wordseg_881156.txt").map {
52 | // x =>
53 | // val data = x.split("\t")
54 | // RawDataRecord(data(0),data(1),labels = if(data(1) == "881108" ) 1.0 else 0.0, data(2))
55 | // }.toDF()//to DataFrame
56 |
57 | var tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
58 | var wordsData = tokenizer.transform(srcDF)
59 |
60 | // Remove stop words
61 | // Load the stop-word list
62 | // val filter = Source.fromFile("/users/li/Intellij/Native-Byes/nativebyes/1.txt" ).getLines().toArray
63 | val filter = Source.fromFile("/users/li/Intellij/Native-Byes/nativebyes/stop_words_CN" ).getLines().toArray
64 |
65 | val remover = new StopWordsRemover()
66 | .setInputCol("words")
67 | .setOutputCol("filtered")
68 | .setStopWords(filter)
69 |
70 | val removeword = remover.transform(wordsData)
71 |
72 |
73 | // 70% of the data for training, 30% for testing
74 | val splits = removeword.randomSplit(Array(0.7, 0.3),seed = 11L)
75 | //splits.foreach(println)
76 | var trainingDF = splits(0)
77 | var testDF = splits(1)
78 |
79 |
80 |
81 | // Use hashingTF to compute the term frequency of each word in a document
82 | val hashingTF = new HashingTF().setNumFeatures(2000).setInputCol("filtered").setOutputCol("rawFeatures")
83 | val featurizedData = hashingTF.transform(trainingDF)
84 | // println("output2:")
85 | // featurizedData.select($"category", $"words", $"rawFeatures").foreach(println)
86 | // featurizedData.show()
87 |
88 |
89 | // Compute the TF-IDF of each word
90 | var idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
91 | val idfModel = idf.fit(featurizedData)
92 | var rescaledData = idfModel.transform(featurizedData)
93 | // println("output3:")
94 | // rescaledData.select($"category", $"features").foreach(println)
95 | // rescaledData.select($"labels",$"features").show()
96 |
97 |
98 | // Convert to the Bayes input format
99 | var trainDataRdd = rescaledData.select($"labels",$"features").map {
100 | case Row(label: Double, features: Vector) =>
101 | LabeledPoint(label , Vectors.dense(features.toArray))
102 | }.cache()
103 |
104 | //trainDataRdd.foreach(println)
105 |
106 |
107 | /** Dimensionality reduction with ALS */
108 | // val pca = new PCA(trainDataRdd.first().features.size/2).fit(trainDataRdd.map(_.features))
109 | // val als = new ALSModel()
110 | // val pcl = new ALS().setNonnegative(true).setMaxIter(100).fit(trainDataRdd.map(_.features))
111 |
112 |
113 |
114 |
115 |
116 |
117 | }
118 |
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/classification/PCAtest.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.classification
2 |
3 | import org.apache.spark.mllib.feature.PCA
4 | import org.apache.spark.mllib.linalg.Vectors
5 | import org.apache.spark.mllib.regression.{LinearRegressionWithSGD, LabeledPoint}
6 | import org.apache.spark.{SparkConf, SparkContext}
7 |
8 |
9 | /**
10 | * Created by li on 16/4/7.
11 | */
12 | object PCAtest extends App{
13 |
14 | val conf = new SparkConf().setAppName("test").setMaster("local")
15 | val sc = new SparkContext(conf)
16 |
17 | val data = sc.textFile("/Users/li/Downloads/lpsa.data").map { line =>
18 | val parts = line.split(',')
19 | LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
20 | }.cache()
21 |
22 |
23 |
24 |
25 | val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
26 | val training = splits(0).cache()
27 | val test = splits(1)
28 |
29 | // training.foreach(println)
30 | // println(training.first())
31 | // println(training.first().features.size/2)
32 |
33 |
34 | val pca = new PCA(training.first().features.size/2).fit(data.map(_.features))
35 |
36 | val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
37 | val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))
38 |
39 | val numIterations = 100
40 | val model = LinearRegressionWithSGD.train(training, numIterations)
41 | val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)
42 |
43 | val valuesAndPreds = test.map { point =>
44 | val score = model.predict(point.features)
45 | (score, point.label)
46 | }
47 |
48 | val valuesAndPreds_pca = test_pca.map { point =>
49 | val score = model_pca.predict(point.features)
50 | (score, point.label)
51 | }
52 |
53 | val MSE = valuesAndPreds.map{case(v, p) => math.pow((v - p), 2)}.mean()
54 | val MSE_pca = valuesAndPreds_pca.map{case(v, p) => math.pow((v - p), 2)}.mean()
55 |
56 | println("Mean Squared Error = " + MSE)
57 | println("PCA Mean Squared Error = " + MSE_pca)
58 |
59 | }
60 |
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/correlationanalysis/correlationAnalysis.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.correlationanalysis
2 |
3 | /**
4 | * Created by li on 16/7/5.
5 | */
6 | object correlationAnalysis {
7 |
8 | }
9 |
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/hotdegreecalculate/CommunityFrequencyStatistics.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.hotdegreecalculate
2 |
3 | import org.apache.spark.rdd.RDD
4 |
5 | import scala.collection.mutable
6 |
7 | /**
8 | * Created by li on 16/7/5.
9 | * Module for computing community hotness; it also folds in the hotness of the community's hot words.
10 | */
11 | object CommunityFrequencyStatistics {
12 |
13 |
14 | /**
15 | * Select the articles that contain at least one word from the community
16 | *
17 | * @param communityWords words in the community
18 | * @param textWords words of the news article
19 | * @return true if the article contains any community word
20 | * @author Li Yu
21 | * @note rowNum: 11
22 | */
23 | def filterFunc(communityWords: Array[String],
24 | textWords: Array[String]): Boolean = {
25 |
26 | communityWords.foreach {
27 | word => {
28 |
29 | if (textWords.contains(word)) {
30 |
31 | return true
32 | }
33 | }
34 | }
35 |
36 | false
37 | }
38 |
39 | /**
40 | * Count, in the current corpus, the documents that contain keywords extracted for each community; duplicates are merged by document id (url).
41 | * This targets community (event) words: a community holds several words, and the community stays the same even when its words change.
42 | *
43 | * @param fileList current documents
44 | * @param communityWordList keywords extracted by textRank for each community
45 | * @return [community id, number of documents containing any of its keywords]
46 | * @author Li Yu
47 | * @note rowNum: 13
48 | */
49 | def communityFrequencyStatisticsRDD(fileList: RDD[Array[String]],
50 | communityWordList: Array[(String, Array[String])]): Array[(String, Double)] = {
51 |
52 | val communityList = new mutable.HashMap[String, Double]
53 |
54 | communityWordList.foreach {
55 | community => {
56 |
57 | val communityID = community._1
58 | val communityWords = community._2
59 | val temp = fileList.filter(content => filterFunc(communityWords, content)).count().toDouble
60 |
61 | communityList.+=((communityID, temp))
62 | }
63 | }
64 |
65 | communityList.toArray
66 | }
67 |
68 |
69 | /**
70 | * Count, in the current corpus, the documents that contain keywords extracted for each community; duplicates are merged by document id (url).
71 | * This targets community (event) words: a community holds several words, and the community stays the same even when its words change.
72 | *
73 | * @param fileList current documents
74 | * @param communityWordList keywords extracted by textRank for each community
75 | * @return [community id, number of documents containing any of its keywords]
76 | * @author Li Yu
77 | * @note rowNum: 22
78 | */
79 | def communityFrequencyStatistics(fileList: Array[(String, Array[String])],
80 | communityWordList: Array[(String, Array[String])]): Array[(String, Double)] = {
81 |
82 | val communityList = new mutable.HashMap[String, Double]
83 |
84 | communityWordList.foreach {
85 | line => {
86 |
87 | val item = new mutable.ArrayBuffer[String]
88 | val communityId = line._1
89 | val communityWords = line._2
90 |
91 | fileList.foreach {
92 | file => {
93 |
94 | val fileId = file._1
95 | val fileWordsList = file._2.distinct
96 |
97 | communityWords.foreach { word => {
98 |
99 | if (fileWordsList.contains(word)) item.append(fileId)
100 | }
101 |
102 | communityList.put(communityId, item.distinct.length)
103 | }
104 | }
105 | }
106 | }
107 | }
108 |
109 | communityList.toArray
110 | }
111 |
112 | }
113 |
114 |
115 |
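A tiny worked example of communityFrequencyStatistics (the ids and words are made up): with two documents and one community, the count for a community is the number of distinct document ids that contain at least one of its words.

val fileList = Array(
  ("url1", Array("rally", "bank")),
  ("url2", Array("bond", "drop"))
)
val communityWordList = Array(("c1", Array("rally", "surge")))

// Only url1 contains a community word, so this prints (c1,1.0).
val counts = CommunityFrequencyStatistics.communityFrequencyStatistics(fileList, communityWordList)
counts.foreach(println)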
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/hotdegreecalculate/fileIO.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.hotdegreecalculate
2 |
3 | import java.io.{File, PrintWriter}
4 |
5 | import _root_.util.TimeUtil
6 |
7 | import scala.collection.mutable
8 | import scala.io.Source
9 |
10 | /**
11 | * Created by li on 16/7/11.
12 | * While computing community hotness, save the results to the local file system and read the previous hour's data back from local files.
13 | */
14 | object fileIO {
15 |
16 | /** Save the results locally: each hour's data goes into one txt file, with one directory per day.
17 | *
18 | * @param dir directory where the files are saved
19 | * @param result results to write, one (key, value) pair per line
20 | */
21 | def saveAsTextFile(dir: String, result: Array[(String, Double)]): Unit ={
22 |
23 | val day = TimeUtil.getDay
24 | val hour = TimeUtil.getCurrentHour
25 |
26 | val writer = new PrintWriter(new File(dir +"%s".format(day) + "-" + "%s".format(hour) + ".txt"))
27 |
28 | for (line <- result) {
29 |
30 | writer.write(line._1 + "\t" + line._2 + "\n")
31 |
32 | }
33 |
34 | writer.close()
35 | }
36 |
37 |
38 | /**
39 | * Read the previous hour's results back from the local file.
40 | *
41 | * @param dir directory where the data is saved
42 | * @return
43 | */
44 | def readFromFile(dir: String): Array[(String, Double)] ={
45 |
46 | val date = TimeUtil.getPreHourStr
47 | val res = new mutable.ArrayBuffer[(String, Double)]
48 |
49 | if (new File(dir + "%s".format(date) + ".txt").exists()) {
50 | val temp = Source.fromFile(dir + "%s".format(date) + ".txt" )
51 |
52 | temp.getLines().foreach{
53 | line =>{
54 | val temp = line.split("\t")
55 | res.+=((temp(0), temp(1).toDouble))
56 | }
57 | }
58 | } else {
59 |
60 | res.+=(("init", 0.0))
61 | }
62 |
63 | res.toArray
64 | }
65 |
66 | }
67 |
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/textrank/AbstractExtract.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.textrank
2 |
3 | import org.graphstream.graph.implementations.SingleGraph
4 |
5 | import scala.collection.mutable.ListBuffer
6 |
7 | /**
8 | * Created by li on 16/6/23.
9 | */
10 | class AbstractExtract (val graphName: String, val segWord: ListBuffer[ListBuffer[(String)]] ){
11 |
12 | var graph = new SingleGraph(graphName)
13 |
14 | // Add the sentence nodes of the text graph
15 | segWord.foreach {
16 | sentenceList => {
17 | val sentence = sentenceList.toString
18 | if (graph.getNode(sentence) == null) graph.addNode(sentence)
19 | }
20 | }
21 |
22 | // Edge construction, by computing sentence similarity, would go here (not implemented)
23 |
24 |
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/textrank/ConstructTextGraph.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.textrank
2 |
3 | import org.graphstream.graph.implementations.SingleGraph
4 |
5 | import scala.collection.mutable
6 | import scala.collection.mutable.ListBuffer
7 |
8 | /**
9 | * Created by li on 16/6/23.
10 | */
11 |
12 | /**
13 | * Build the candidate keyword graph
14 | * @param graphName graph identifier
15 | * @param winSize window size
16 | * @param segWord segmented words
17 | * @return candidate keyword graph
18 | * @author LiYu
19 | */
20 | class ConstructTextGraph(val graphName: String, val winSize: Int, val segWord: List[String]) {
21 |
22 | /**
23 | * Build the candidate keyword graph
24 | * @return candidate keyword graph
25 | */
26 | def constructGraph: SingleGraph = {
27 |
28 | val graph = new SingleGraph(graphName)
29 |
30 | // Add the nodes of the text graph
31 | segWord.foreach(
32 | word => if (graph.getNode(word) == null) graph.addNode(word)
33 | )
34 |
35 | // Slide the window of size winSize over the segmented words
36 | var wordSeg = new ListBuffer[(ListBuffer[(String)])]
37 |
38 | val num = segWord.size - winSize
39 |
40 | for (i <- 0 to num) {
41 |
42 | val item = new ListBuffer[(String)]
43 |
44 | for (j <- 0 until winSize) {
45 |
46 | item += segWord(i + j)
47 | }
48 |
49 | wordSeg += item
50 |
51 | }
52 |
53 | // For each vertex, collect its neighbours within the window
54 | val wordSet = segWord.toSet
55 |
56 | val edgeSet = wordSet.map {
57 | word => {
58 | val edgeList = new mutable.HashSet[(String)]
59 | wordSeg.foreach {
60 | list => {
61 | if (list.contains(word)){
62 | list.foreach(x => edgeList.+=(x))
63 | }
64 | }
65 | }
66 |
67 | (word, edgeList -= word)
68 |
69 | }
70 | }
71 |
72 | // Build the edges of the keyword graph
73 | edgeSet.toArray.foreach {
74 | edge => {
75 | edge._2.toList.foreach {
76 | edges =>
77 |
78 | if (graph.getEdge(s"${edge._1}-${edges}") == null &&
79 | graph.getEdge(s"${edges}-${edge._1}") == null) {
80 | graph.addEdge(s"${edge._1}-${edges}", edge._1, edges)
81 | None
82 | }
83 | }
84 | }
85 | }
86 |
87 | graph
88 |
89 | }
90 | }
91 |
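A small sketch of the window behaviour (the tokens are arbitrary): with winSize = 2 the windows over List("a", "b", "c", "d") are (a, b), (b, c), (c, d), so the resulting graph has 4 nodes and 3 edges.

val graph = new ConstructTextGraph("doc-1", 2, List("a", "b", "c", "d")).constructGraph

println(graph.getNodeCount) // 4
println(graph.getEdgeCount) // 3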
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/textrank/PropertyExtractor.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.textrank
2 |
3 | import org.graphstream.graph.implementations.SingleGraph
4 | import org.graphstream.graph.{Edge, Node}
5 |
6 | import scala.collection.mutable
7 |
8 | /**
9 | * Created by li on 16/6/24.
10 | */
11 |
12 | /**
13 | * Keyword extraction: outputs the keywords extracted from an article; the undirected graph is named after the article url
14 | *
15 | * @param graph node graph
16 | * @param keywordNum number of keywords
17 | * @return keywords of the text
18 | * @author LiYu
19 | */
20 | class PropertyExtractor(val graph: SingleGraph, val keywordNum: Int) {
21 |
22 | /**
23 | *
24 | * @param iterator number of textRank iterations
25 | * @param df damping factor
26 | * @return keywords and their scores
27 | */
28 | // Extract keywords with textRank
29 | def extractKeywords(iterator: Int, df: Double) = {
30 |
31 | val nodes = graph.getNodeSet.toArray.map(_.asInstanceOf[Node])
32 | val scoreMap = new mutable.HashMap[String, Double]
33 |
34 | // Initialize the node weights
35 | nodes.foreach(node => scoreMap.put(node.getId, 1.0f))
36 |
37 | // Iteratively propagate the node weights until convergence.
38 | (1 to iterator).foreach {
39 | i =>
40 | nodes.foreach {
41 | node =>
42 | val edges = node.getEdgeSet.toArray.map(_.asInstanceOf[Edge])
43 | var score = 1.0f - df
44 | edges.foreach {
45 | edge =>
46 | val node0 = edge.getNode0.asInstanceOf[Node]
47 | val node1 = edge.getNode1.asInstanceOf[Node]
48 | val tempNode = if (node0.getId.equals(node.getId)) node1 else node0
49 | score += df * (1.0f * scoreMap(tempNode.getId) / tempNode.getDegree)
50 | }
51 | scoreMap.put(node.getId, score)
52 | }
53 | }
54 |
55 | // Sort the node weights in descending order and keep the top keywordNum words as candidate keywords.
56 | scoreMap.toList.sortWith(_._2 > _._2).slice(0, keywordNum)
57 |
58 | }
59 | }
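The update applied inside the loop is the standard unweighted TextRank score, where d is the damping factor df above and Adj(V_i) are the neighbours of node V_i in the co-occurrence graph:

S(V_i) = (1 - d) + d \sum_{V_j \in \mathrm{Adj}(V_i)} \frac{S(V_j)}{\deg(V_j)}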
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/textrank/TextRank.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.textrank
2 |
3 | /**
4 | * Created by li on 16/6/24.
5 | */
6 | object TextRank {
7 | /**
8 | *
9 | * @param graphName graph identifier
10 | * @param window word window size
11 | * @param doc text to extract from, as a token list
12 | * @param keywordNum number of keywords to extract
13 | * @param iterator number of textRank iterations
14 | * @param df damping factor
15 | * @return (keyword, score) pairs
16 | */
17 | def run(graphName: String, window: Int, doc: List[String],
18 | keywordNum: Int, iterator: Int, df: Double): List[(String, Double)] = {
19 |
20 | // Build the keyword graph
21 | val constructTextGraph = new ConstructTextGraph(graphName, window, doc)
22 | val textGraph = constructTextGraph.constructGraph
23 |
24 | // Extract the keywords
25 | val keywordExtractor = new PropertyExtractor(textGraph, keywordNum)
26 | val result = keywordExtractor.extractKeywords(iterator, df)
27 |
28 | result
29 | }
30 |
31 | }
32 |
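A minimal end-to-end sketch; the token list and parameter values are illustrative only:

// Tokenized document; in the full pipeline this comes from the segmenter.
val doc = List("market", "stock", "rally", "stock", "bank", "rally", "market")

// graph name, window size, tokens, number of keywords, iterations, damping factor
val keywords = TextRank.run("doc-1", 3, doc, 2, 100, 0.85)

keywords.foreach { case (word, score) => println(s"$word\t$score") }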
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/topicmodel/LDAModel.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.topicmodel
2 |
3 | import org.apache.spark.rdd.RDD
4 | import org.apache.spark.{SparkConf, SparkContext}
5 | //import org.apache.spark.mllib.clustering.LDA
6 | //import org.apache.spark.rdd.RDD
7 |
8 | /**
9 | * Created by li on 2016/4/28.
10 | */
11 |
12 | object LDAModel extends App{
13 |
14 | val conf = new SparkConf().setAppName("TopicModel").setMaster("local")
15 | val sc = new SparkContext(conf)
16 |
17 | // Load documents from text files, 1 document per file
18 | val corpus: RDD[String] = sc.wholeTextFiles("/Users/li/kunyan/docs/*.md").map(_._2)
19 |
20 | // Split each document into a sequence of terms (words)
21 | val tokenized: RDD[Array[String]] =
22 | corpus.map(_.toLowerCase.split("\\s")).map(_.filter(_.length > 3).filter(_.forall(java.lang.Character.isLetter)))
23 |
24 | tokenized.collect().foreach(println)
25 |
26 | // Choose the vocabulary.
27 | // termCounts: Sorted list of (term, termCount) pairs
28 | val termCounts: Array[(String, Long)] =
29 | tokenized.flatMap(_.map(_ -> 1L)).reduceByKey(_ + _).collect().sortBy(-_._2)
30 |
31 | termCounts.foreach(println)
32 |
33 | // vocabArray: Chosen vocab (removing common terms)
34 | val numStopwords = 20
35 | val vocabArray: Array[String] =
36 | termCounts.takeRight(termCounts.length - numStopwords).map(_._1)
37 |
38 | // vocab: Map term -> term index
39 | val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap
40 | // vocab.foreach(println)
41 |
42 | // // Convert documents into term count vectors
43 | // val documents: RDD[(Long, Vector)] =
44 | // tokenized.zipWithIndex.map {
45 | // case (tokens, id) =>
46 | // val counts = new mutable.HashMap[Int, Double]()
47 | // tokens.foreach { term =>
48 | // if (vocab.contains(term)) {
49 | // val idx = vocab(term)
50 | // counts(idx) = counts.getOrElse(idx, 0.0) + 1.0
51 | // }
52 | // }
53 | // (id, Vectors.sparse(vocab.size, counts.toSeq))
54 | // }
55 | //
56 | // documents.foreach(println)
57 | //
58 | // // Set LDA parameters
59 | // val numTopics = 3
60 | // val lda = new LDA().setK(numTopics).setMaxIterations(8)
61 | //
62 | // val ldaModel = lda.run(documents)
63 | //// val avgLogLikelihood = ldaModel.logLikelihood / documents.count()
64 | //
65 | // // Print topics, showing top-weighted 10 terms for each topic.
66 | // val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 10)
67 | // topicIndices.foreach { case (terms, termWeights) =>
68 | // println("TOPIC:")
69 | // terms.zip(termWeights).foreach { case (term, weight) =>
70 | // println(s"${vocabArray(term.toInt)}\t$weight")
71 | // }
72 | // println()
73 | // }
74 | //
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/topicmodel/LatentDirichletAllocationExample.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.topicmodel
2 |
3 | import org.apache.spark.mllib.clustering.LDA
4 | import org.apache.spark.{SparkConf, SparkContext}
5 | import org.apache.spark.mllib.linalg.Vectors
6 |
7 | object LatentDirichletAllocationExample {
8 |
9 | def main(args: Array[String]) {
10 |
11 | val conf = new SparkConf().setAppName("LatentDirichletAllocationExample").setMaster("local")
12 | val sc = new SparkContext(conf)
13 |
14 | // $example on$
15 | // Load and parse the data
16 | val data = sc.textFile("/Users/li/kunyan/spark/data/mllib/sample_lda_data.txt")
17 | data.foreach(println)
18 |
19 | val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble)))
20 | parsedData.foreach(println)
21 |
22 | // Index documents with unique IDs
23 | val corpus = parsedData.zipWithIndex.map(_.swap).cache()
24 |
25 | // Cluster the documents into three topics using LDA
26 | val ldaModel = new LDA().setK(3).run(corpus)
27 | //
28 | // // Output topics. Each is a distribution over words (matching word count vectors)
29 | // println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):")
30 | // val topics = ldaModel.topicsMatrix
31 | // for (topic <- Range(0, 3)) {
32 | // print("Topic " + topic + ":")
33 | // for (word <- Range(0, ldaModel.vocabSize)) { print(" " + topics(word, topic)); }
34 | // println()
35 | // }
36 | //
37 | // // Save and load model.
38 | // ldaModel.save(sc, "/Users/li/kunyan/spark/LatentDirichletAllocationExample/LDAModel")
39 | // val sameModel = DistributedLDAModel.load(sc,
40 | // "/Users/li/kunyan/spark/LatentDirichletAllocationExample/LDAModel")
41 | // // $example off$
42 | //
43 | // sc.stop()
44 | }
45 | }
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/ClassifyModel.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.word2vec
2 |
3 | import java.io.File
4 |
5 | import util.{DirectoryUtil, JSONUtil}
6 | import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
7 | import org.apache.spark.mllib.feature.Word2VecModel
8 | import org.apache.spark.mllib.regression.LabeledPoint
9 | import org.apache.spark.rdd.RDD
10 | import org.apache.spark.{SparkConf, SparkContext}
11 |
12 | /**
13 | * Created by li on 2016/10/13.
14 | *
15 | */
16 | object ClassifyModel {
17 |
18 |
19 | def classify(trainDataRdd: RDD[LabeledPoint]): SVMModel = {
20 |
21 | /** NaiveBayes training model */
22 | // val model = NaiveBayes.train(trainDataRdd, lambda = 1.0, modelType = "multinomial")
23 |
24 | /** SVM training model */
25 | val numIterations = 1000
26 | val model = SVMWithSGD.train(trainDataRdd , numIterations)
27 |
28 | /** RandomForest training model */
29 | // val numClasses = 2
30 | // val categoricalFeaturesInfo = Map[Int, Int]()
31 | // val numTrees = 3
32 | // val featureSubsetStrategy = "auto"
33 | // val impurity = "gini"
34 | // val maxDepth = 4
35 | // val maxBins = 32
36 | // val model = RandomForest.trainClassifier(trainDataRdd, numClasses, categoricalFeaturesInfo,numTrees, featureSubsetStrategy, impurity, maxDepth, maxBins)
37 |
38 | /** GradientBoostedTrees training model */
39 | // // Train a GradientBoostedTrees model.
40 | // // The defaultParams for Classification use LogLoss by default.
41 | // val boostingStrategy = BoostingStrategy.defaultParams("Classification")
42 | // boostingStrategy.numIterations = 3 // Note: Use more iterations in practice.
43 | // boostingStrategy.treeStrategy.numClasses = 2
44 | // boostingStrategy.treeStrategy.maxDepth = 5
45 | // // Empty categoricalFeaturesInfo indicates all features are continuous.
46 | // boostingStrategy.treeStrategy.categoricalFeaturesInfo = Map[Int, Int]()
47 | //
48 | // val model = GradientBoostedTrees.train(trainDataRdd, boostingStrategy)
49 |
50 | model
51 |
52 | }
53 |
54 | def main(args: Array[String]) {
55 |
56 | val conf = new SparkConf().setAppName("textVectors").setMaster("local")
57 | val sc = new SparkContext(conf)
58 |
59 | val jsonPath = "/Users/li/workshop/NaturalLanguageProcessing/src/main/scala/meachinelearning/word2vec/twc/W2VJsonConf.json"
60 |
61 | JSONUtil.initConfig(jsonPath)
62 |
63 | val word2vecModelPath = JSONUtil.getValue("w2v", "w2vmodelPath")
64 | val modelSize = JSONUtil.getValue("w2v", "w2vmodelSize").toInt
65 | val isModel = JSONUtil.getValue("w2v", "isModel").toBoolean
66 |
67 | // val word2vecModelPath = "hdfs://master:9000/home/word2vec/classifyModel-10-100-20/2016-08-16-word2VectorModel"
68 | val w2vModel = Word2VecModel.load(sc, word2vecModelPath)
69 |
70 | // Build the training set in labeledpoint format
71 | // val trainSetPath = "/Users/li/workshop/DataSet/trainingsetUnbalance/BXX.txt"
72 | // val trainSetPath = "/Users/li/workshop/DataSet/trainingSets/计算机"
73 | val trainSetPath = "/Users/li/workshop/DataSet/trainingSets/机械"
74 |
75 | val trainSet = DataPrepare.readData(trainSetPath)
76 | val trainSetRdd = sc.parallelize(trainSet).cache()
77 | //val trainSetRdd = sc.textFile(trainSetPath)
78 |
79 | // val trainSetVec = trainSetRdd.map( row => {
80 | // val x = row.split("\t")
81 | // (x(0), x(1).split(","))}) // if the article is already segmented, the tokens are comma-separated
82 | // //(x(0), AnsjAnalyzer.cutNoTag(x(1)}) // otherwise call ansj to segment it
83 | // .map(row => (row._1.toDouble, DataPrepare.docVec(w2vModel, row._2)))
84 |
85 | val trainDataRdd = TextVectors.textVectorsWithWeight(trainSetRdd, w2vModel, modelSize, isModel).cache()
86 |
87 | val classifyModel = classify(trainDataRdd)
88 |
89 | val classifyModelPath = JSONUtil.getValue("classify", "classifymodelpath")
90 | DirectoryUtil.deleteDir(new File(classifyModelPath))
91 | classifyModel.save(sc, classifyModelPath)
92 | println("Classification model saved.")
93 |
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/ClassifyPredict.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.word2vec
2 |
3 | import org.apache.spark.mllib.classification.SVMModel
4 | import org.apache.spark.mllib.evaluation.MulticlassMetrics
5 | import org.apache.spark.mllib.feature.Word2VecModel
6 | import org.apache.spark.mllib.regression.LabeledPoint
7 | import org.apache.spark.rdd.RDD
8 | import org.apache.spark.{SparkConf, SparkContext}
9 | import util.JSONUtil
10 | import wordSegmentation.AnsjAnalyzer
11 |
12 | /**
13 | * Created by li on 2016/10/17.
14 | */
15 | object ClassifyPredict {
16 |
17 |
18 | /**
19 | * Accuracy statistics
20 | *
21 | * @param predictionAndLabel (prediction, label) pairs
22 | */
23 | def acc(predictionAndLabel: RDD[(Double, Double)],
24 | predictDataRdd: RDD[LabeledPoint]): Unit = {
25 |
26 | // Overall classification accuracy
27 | val testAccuracy = 1.0 * predictionAndLabel.filter(x => x._1 == x._2).count() / predictDataRdd.count()
28 | println("testAccuracy:" + testAccuracy)
29 |
30 | val metrics = new MulticlassMetrics(predictionAndLabel)
31 | println("Confusion matrix:" + metrics.confusionMatrix)
32 |
33 | // Precision by label
34 | val label = metrics.labels
35 | label.foreach { l =>
36 | println(s"Precision($l) = " + metrics.precision(l))
37 | }
38 |
39 | // Recall by label
40 | label.foreach { l =>
41 | println(s"Recall($l) = " + metrics.recall(l))
42 | }
43 |
44 | // False positive rate by label
45 | label.foreach { l =>
46 | println(s"FPR($l) = " + metrics.falsePositiveRate(l))
47 | }
48 |
49 | // F-measure by label
50 | label.foreach { l =>
51 | println(s"F1-Score($l) = " + metrics.fMeasure(l))
52 | }
53 |
54 | // val roc = metrics.roc
55 |
56 | // // AUROC
57 | // val auROC = metrics.areaUnderROC
58 | // println("Area under ROC = " + auROC)
59 |
60 | }
61 |
62 |
63 | def main(args: Array[String]) {
64 |
65 | val conf = new SparkConf().setAppName("textVectors").setMaster("local")
66 | val sc = new SparkContext(conf)
67 |
68 | val jsonPath = "/Users/li/workshop/NaturalLanguageProcessing/src/main/scala/meachinelearning/word2vec/twc/W2VJsonConf.json"
69 |
70 | JSONUtil.initConfig(jsonPath)
71 |
72 | val word2vecModelPath = JSONUtil.getValue("w2v", "w2vmodelPath")
73 | val modelSize = JSONUtil.getValue("w2v", "w2vmodelSize").toInt
74 | val isModel = JSONUtil.getValue("w2v", "isModel").toBoolean
75 | // load word2vec model
76 | val w2vModel = Word2VecModel.load(sc, word2vecModelPath)
77 |
78 | // load classify model
79 | val classifyModelPath = JSONUtil.getValue("classify", "classifymodelpath")
80 | val classifyModel = SVMModel.load(sc, classifyModelPath)
81 |
82 | // Build the test set in labeledpoint format
83 | val predictSetPath = "/Users/li/workshop/DataSet/trainingSets/test"
84 | val predictSet = DataPrepare.readData(predictSetPath)
85 | val predictSetRdd = sc.parallelize(predictSet)
86 |
87 | // For a single article that has not been segmented yet
88 | val predictSetVec = predictSetRdd.map(row => {
89 | 1.0 + "\t" + AnsjAnalyzer.cutNoTag(row)
90 | })
91 | val predictDataRdd = TextVectors.textVectorsWithWeight(predictSetVec, w2vModel, modelSize, isModel).cache()
92 |
93 | // val predictDataRdd = TextVectors.textVectorsWithWeight(predictSetRdd, w2vModel, modelSize, isModel).cache()
94 |
95 | /** Predict labels for the test set with the trained model */
96 | // classifyModel.clearThreshold()
97 | // Compute raw scores on the test set.
98 | val predictionAndLabel = predictDataRdd.map{ point => {
99 | val predictionFeature = classifyModel.predict(point.features)
100 | (predictionFeature, point.label)
101 | }}
102 |
103 | // Accuracy statistics
104 | acc(predictionAndLabel, predictDataRdd)
105 | //predictionAndLabel.foreach(println)
106 | sc.stop()
107 | }
108 | }
109 |
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/DataPrepare.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.word2vec
2 |
3 | import dataprocess.vipstockstatistic.util.AnsjAnalyzer
4 | import org.apache.spark.mllib.feature.Word2VecModel
5 | import org.apache.spark.mllib.linalg.Vectors
6 | import org.apache.spark.mllib.regression.LabeledPoint
7 | import org.apache.spark.rdd.RDD
8 | import org.apache.spark.{SparkConf, SparkContext}
9 |
10 | import scala.io.Source
11 |
12 | /**
13 | * Created by li on 2016/10/14.
14 | */
15 | object DataPrepare {
16 |
17 | /**
18 | * Read a file
19 | *
20 | * @param filePath location of the text file
21 | * @return
22 | */
23 | def readData(filePath: String): Array[String] = {
24 |
25 | val doc = Source.fromFile(filePath).getLines().toArray
26 |
27 | doc
28 | }
29 |
30 |
31 | /**
32 | * 分词
33 | *
34 | * @param doc
35 | * @return
36 | */
37 | def docCut(doc: Array[String]): Array[String] = {
38 |
39 | val docSeg = doc.map(x => AnsjAnalyzer.cutNoTag(x)).flatMap(x =>x)
40 |
41 | docSeg
42 | }
43 |
44 |
45 | /**
46 | * 构建文本向量
47 | *
48 | * @param word2vecModel
49 | * @param docSeg
50 | * @return
51 | */
52 | def docVec(word2vecModel: Word2VecModel, docSeg: Array[String], modelSize: Int): Array[Double] = {
53 |
54 | val docVectors = TextVectors.textVectorsWithModel(docSeg, word2vecModel, modelSize).toArray
55 |
56 | docVectors
57 | }
58 |
59 | /**
60 | * Attach labels: build labeledPoints for a document collection whose articles all share one class
61 | *
62 | * @param label
63 | * @param docVec
64 | * @return
65 | */
66 | def tagAttacheBatchSingle(label: Double, docVec: RDD[Array[Double]]): RDD[LabeledPoint] = {
67 |
68 | docVec.map{
69 | row =>
70 | LabeledPoint(label , Vectors.dense(row))
71 | }
72 | }
73 |
74 | /**
75 | * Attach labels: build labeledPoints for a document collection
76 | *
77 | * @param docVec
78 | * @return
79 | */
80 | def tagAttacheBatchWhole(docVec: RDD[(Double, Array[Double])]): RDD[LabeledPoint] = {
81 |
82 | docVec.map{
83 | row =>
84 | LabeledPoint(row._1 , Vectors.dense(row._2))
85 | }
86 | }
87 |
88 |
89 | /**
90 | * Attach a label: build a labeledPoint for a single document
91 | *
92 | * @param label
93 | * @param docVec
94 | * @return
95 | */
96 | def tagAttacheSingle(label: Double, docVec: Array[Double]): LabeledPoint = {
97 |
98 | LabeledPoint(label, Vectors.dense(docVec))
99 | }
100 |
101 |
102 | /**
103 | * Test code
104 | */
105 | def dataPrepareTest(): Unit ={
106 | val conf = new SparkConf().setMaster("local").setAppName("DataPrepare")
107 | val sc = new SparkContext(conf)
108 |
109 | val filePath = "/Users/li/workshop/DataSet/111.txt"
110 | // val filePath = "/Users/li/workshop/DataSet/SogouC.reduced/Reduced/C000008/10.txt"
111 |
112 | val word2vecModelPath = "/Users/li/workshop/DataSet/word2vec/result/2016-07-18-15-word2VectorModel"
113 | val model = Word2VecModel.load(sc, word2vecModelPath)
114 |
115 | val data = readData(filePath)
116 |
117 | val splitData = docCut(data)
118 |
119 | val doVec = docVec(model, splitData, 100)
120 |
121 | val labeledP = tagAttacheSingle(1.0, doVec)
122 | println(labeledP)
123 |
124 |
125 | }
126 |
127 |
128 | def main(args: Array[String]) {
129 |
130 | dataPrepareTest()
131 |
132 | }
133 |
134 | }
135 |
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/DeleteDirectory.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.word2vec
2 |
3 | import java.io.File
4 |
5 | /**
6 | * Created by li on 16/7/15.
7 | */
8 |
9 | object DeleteDirectory {
10 |
11 | /**
12 | * Delete an empty directory.
13 | * @param dir path of the directory to delete
14 | */
15 | private def doDeleteEmptyDir(dir: String): Unit = {
16 |
17 | val success: Boolean = new File(dir).delete()
18 |
19 | if (success) {
20 |
21 | System.out.println("Successfully deleted empty directory: " + dir)
22 |
23 | } else {
24 |
25 | System.out.println("Failed to delete empty directory: " + dir)
26 | }
27 | }
28 |
29 | /**
30 | * Recursively delete all files in a directory and in all of its subdirectories.
31 | * @param dir the directory to delete
32 | * @return boolean Returns "true" if all deletions were successful.
33 | * If a deletion fails, the method stops attempting to
34 | * delete and returns "false".
35 | */
36 | private def deleteDir(dir: File): Boolean = {
37 |
38 | if (dir.isDirectory) {
39 |
40 | val children = dir.list()
41 |
42 | // Recursively delete the subdirectories of this directory
43 | for (i <- 0 until children.length){
44 |
45 | val success = deleteDir(new File(dir, children(i)))
46 |
47 | if (! success){
48 | return false
49 | }
50 |
51 | }
52 | }
53 | // The directory is now empty, so it can be deleted
54 | dir.delete()
55 | }
56 |
57 |
58 | /**
59 | * Test
60 | */
61 | def main(args: Array[String]): Unit = {
62 |
63 | val dir = "/Users/li/kunyan/DataSet/1111"
64 |
65 | doDeleteEmptyDir(dir)
66 |
67 | val success = deleteDir(new File(dir))
68 |
69 | if (success) System.out.println("Successfully deleted populated directory: " + dir)
70 |
71 | else System.out.println("Failed to delete populated directory: " + dir)
72 | }
73 |
74 | }
75 |
76 |
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/model/data/.part-r-00000-e1c254b3-21ba-4759-b7eb-b69f39950551.gz.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/src/main/scala/meachinelearning/word2vec/model/data/.part-r-00000-e1c254b3-21ba-4759-b7eb-b69f39950551.gz.parquet.crc
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/model/data/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/src/main/scala/meachinelearning/word2vec/model/data/_SUCCESS
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/model/data/_common_metadata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/src/main/scala/meachinelearning/word2vec/model/data/_common_metadata
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/model/data/_metadata:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/src/main/scala/meachinelearning/word2vec/model/data/_metadata
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/model/data/part-r-00000-e1c254b3-21ba-4759-b7eb-b69f39950551.gz.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/src/main/scala/meachinelearning/word2vec/model/data/part-r-00000-e1c254b3-21ba-4759-b7eb-b69f39950551.gz.parquet
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/model/metadata/.part-00000.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/src/main/scala/meachinelearning/word2vec/model/metadata/.part-00000.crc
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/model/metadata/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/src/main/scala/meachinelearning/word2vec/model/metadata/_SUCCESS
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/model/metadata/part-00000:
--------------------------------------------------------------------------------
1 | {"class":"org.apache.spark.mllib.classification.SVMModel","version":"1.0","numFeatures":100,"numClasses":2}
2 |
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/readme.md:
--------------------------------------------------------------------------------
1 | # Classification based on word2vec
2 |
3 | ## TextRank model
4 |
5 | Extract key words from news articles, instead of computing word vectors over all the words of the text.
6 |
7 | ## Word2Vec model
8 |
9 | Build the LabeledPoint (model input) from word2vec vectors, instead of computing TF-IDF, dimensionality reduction and so on.
10 |
11 |
12 | ## Classification model
13 |
14 | Use an SVM to classify.
15 |
16 |
17 | ## Test results
18 | testAccuracy =
19 | Precision(0.0) =
20 | Precision(1.0) =
21 | Recall(0.0) =
22 | Recall(1.0) =
23 | FPR(0.0) =
24 | FPR(1.0) =
25 | F1-Score(0.0) =
26 | F1-Score(1.0) =
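27 |
28 | ## Pipeline sketch (illustrative)
29 |
30 | The end-to-end flow described above can be sketched as follows. This is a minimal, self-contained example rather than the project's actual code: the paths, the input format (one "label<TAB>word1,word2,..." line per already-segmented document) and the iteration count are assumptions made only for illustration.
31 |
32 | ```scala
33 | import org.apache.spark.mllib.classification.SVMWithSGD
34 | import org.apache.spark.mllib.evaluation.MulticlassMetrics
35 | import org.apache.spark.mllib.feature.Word2VecModel
36 | import org.apache.spark.mllib.linalg.Vectors
37 | import org.apache.spark.mllib.regression.LabeledPoint
38 | import org.apache.spark.{SparkConf, SparkContext}
39 |
40 | object W2VClassificationSketch {
41 |
42 |   /** Average the word2vec vectors of the tokens found in the model's vocabulary. */
43 |   def docVector(tokens: Array[String], vectors: Map[String, Array[Float]], size: Int): Array[Double] = {
44 |     val hits = tokens.flatMap(vectors.get)
45 |     if (hits.isEmpty) Array.fill(size)(0.0)
46 |     else hits.map(_.map(_.toDouble)).reduce((a, b) => a.zip(b).map(p => p._1 + p._2)).map(_ / hits.length)
47 |   }
48 |
49 |   def main(args: Array[String]): Unit = {
50 |     val sc = new SparkContext(new SparkConf().setAppName("w2v-svm-sketch").setMaster("local"))
51 |
52 |     // Hypothetical paths; each corpus line is "label<TAB>word1,word2,..." (already segmented).
53 |     val w2vModel = Word2VecModel.load(sc, "/path/to/word2VectorModel")
54 |     val vectors = w2vModel.getVectors
55 |     val modelSize = vectors.head._2.length
56 |
57 |     val data = sc.textFile("/path/to/labeled-segmented-corpus.txt").map { line =>
58 |       val Array(label, text) = line.split("\t", 2)
59 |       LabeledPoint(label.toDouble, Vectors.dense(docVector(text.split(","), vectors, modelSize)))
60 |     }
61 |
62 |     // Train an SVM on 80% of the data and evaluate on the remaining 20%.
63 |     val Array(train, test) = data.randomSplit(Array(0.8, 0.2), seed = 11L)
64 |     val svmModel = SVMWithSGD.train(train, 100)
65 |
66 |     val metrics = new MulticlassMetrics(test.map(p => (svmModel.predict(p.features), p.label)))
67 |     println(s"precision = ${metrics.precision}")
68 |
69 |     sc.stop()
70 |   }
71 | }
72 | ```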
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/twc/W2VJsonConf.json:
--------------------------------------------------------------------------------
1 | {
2 | "kunyan": {
3 | "ip": "61.147.114.88",
4 | "port": "16003",
5 | "stopwords": "16003"
6 | },
7 | "w2v": {
8 | "w2vmodelPath": "hdfs://61.147.114.85:9000/home/word2vec/model-10-100-20/2016-08-16-word2VectorModel",
9 | "w2vmodelSize": "100",
10 | "isModel":"true"
11 | },
12 | "classify": {
13 | "classifymodelpath":"/Users/li/workshop/NaturalLanguageProcessing/src/main/scala/meachinelearning/word2vec/model2"
14 | }
15 | }
--------------------------------------------------------------------------------
/src/main/scala/meachinelearning/word2vec/twc/training.scala:
--------------------------------------------------------------------------------
1 | package meachinelearning.word2vec.twc
2 |
3 | import org.apache.spark.mllib.classification.SVMWithSGD
4 | import org.apache.spark.mllib.evaluation.MulticlassMetrics
5 | import org.apache.spark.mllib.regression.LabeledPoint
6 | import org.apache.spark.{SparkConf, SparkContext}
7 |
8 | /**
9 | * Created by zhangxin on 16-11-9.
10 | *
11 | * Classification model training
12 | */
13 | object training {
14 |
15 |
16 | def training(): Unit ={
17 |
18 | val conf = new SparkConf().setAppName("W2V").setMaster("local")
19 | val sc = new SparkContext(conf)
20 | // val jsonPath = "/home/zhangxin/work/workplace_scala/Sentiment/src/main/scala/classificationW2V/W2VJsonConf.json"
21 | val jsonPath = "/Users/li/workshop/NaturalLanguageProcessing/src/main/scala/meachinelearning/word2vec/twc/W2VJsonConf.json"
22 |
23 | // Unbalanced training set
24 | // val docsPath = "/home/zhangxin/work/workplace_scala/Data/trainingsetUnbalance/JSJ.txt"
25 | // val docs = sc.textFile(docsPath).collect()
26 |
27 | // Balanced training sets
28 | // val docsPath = "/home/zhangxin/work/workplace_scala/Data/trainingSets/房地产"
29 | // val docsPath = "/home/zhangxin/work/workplace_scala/Data/trainingSets/有色金属"
30 | // val docsPath = "/home/zhangxin/work/workplace_scala/Data/trainingSets/保险"
31 | // val docsPath = "/home/zhangxin/work/workplace_scala/Data/trainingSets/医药"
32 | // val docsPath = "/home/zhangxin/work/workplace_scala/Data/trainingSets/计算机"
33 | val docsPath = "/Users/li/workshop/DataSet/trainingSets/工程建筑"
34 |
35 | val docs = sc.textFile(docsPath).collect()
36 |
37 | // inputs
38 | val data = processing.process_weight(docs, sc, jsonPath)
39 | println("[Doc2Vec model built]>>>>>>>>>>>>>>>>>")
40 |
41 | val dataRdd = sc.parallelize(data)
42 | val splits = dataRdd.randomSplit(Array(0.8, 0.2), seed = 11L)
43 | val train = splits(0)
44 | val test = splits(1)
45 |
46 | val model = SVMWithSGD.train(train, 50)
47 | // val model = LogisticRegressionWithSGD.train(train, 5000)
48 | println("[Model training finished]>>>>>>>>>>>>>>>>>>>")
49 |
50 |
51 | val predictAndLabels = test.map{
52 | case LabeledPoint(label, features) =>
53 | val prediction = model.predict(features)
54 | (prediction, label)
55 | }
56 |
57 | val metrics = new MulticlassMetrics(predictAndLabels)
58 | println(s"[Overall_Precision] ${metrics.precision}")
59 | println(s"[Labels] ${metrics.labels.toList}")
60 | metrics.labels.foreach(label => {
61 | println(s"[${label}_Precision] ${metrics.precision(label)}")
62 | })
63 |
64 | }
65 |
66 | def main(args: Array[String]): Unit = {
67 | training()
68 | }
69 |
70 | }
71 |
--------------------------------------------------------------------------------
/src/main/scala/test/regularExpression.scala:
--------------------------------------------------------------------------------
1 | package test
2 |
3 | /**
4 | * Created by li on 16/7/22.
5 | */
6 | object regularExpression {
7 |
8 | def main(args: Array[String]) {
9 |
10 | val numPatten = """([0-9]+) ([a-z]+\s+)""".r
11 |
12 | // val numPatten = """(\s+[0-9]+\s+) ([0-9]+) ()""".r
13 |
14 | val res = numPatten.findAllIn("99 bottles, 89 bottles").toArray
15 |
16 | res.foreach(println)
17 |
18 | }
19 |
20 | }
21 |
--------------------------------------------------------------------------------
/src/main/scala/util/DirectoryUtil.scala:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | import java.io.File
4 |
5 | /**
6 | * Created by li on 16/7/18.
7 | * Directory utilities: delete an empty directory, or delete a non-empty directory together with the files in it.
8 | */
9 | object DirectoryUtil {
10 |
11 | /**
12 | * Delete an empty directory.
13 | *
14 | * @param dir path of the directory to delete
15 | */
16 | def doDeleteEmptyDir(dir: String): Unit = {
17 |
18 | val success: Boolean = new File(dir).delete()
19 |
20 | if (success) {
21 |
22 | System.out.println("Successfully deleted empty directory: " + dir)
23 |
24 | } else {
25 |
26 | System.out.println("Failed to delete empty directory: " + dir)
27 | }
28 | }
29 |
30 | /**
31 | * Recursively delete all files in a directory and in all of its subdirectories.
32 | *
33 | * @param dir the directory to delete
34 | * @return boolean Returns "true" if all deletions were successful.
35 | * If a deletion fails, the method stops attempting to
36 | * delete and returns "false".
37 | */
38 | def deleteDir(dir: File): Boolean = {
39 |
40 | if (dir.isDirectory) {
41 |
42 | val children = dir.list()
43 |
44 | // Recursively delete the subdirectories of this directory
45 | for (i <- 0 until children.length){
46 |
47 | val success = deleteDir(new File(dir, children(i)))
48 |
49 | if (! success){
50 | return false
51 | }
52 |
53 | }
54 | }
55 | // The directory is now empty, so it can be deleted
56 | dir.delete()
57 | }
58 |
59 | }
60 |
--------------------------------------------------------------------------------
/src/main/scala/util/FileUtil.scala:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | import java.io.{File, BufferedReader, FileReader, PrintWriter}
4 |
5 | import scala.collection.mutable
6 | import scala.collection.mutable.ListBuffer
7 | import scala.io.Source
8 |
9 | /**
10 | * Created by li on 2016/2/22.
11 | */
12 | object FileUtil {
13 |
14 | /**
15 | * override the old one
16 | */
17 | def createFile(path: String, lines: Seq[String]): Unit = {
18 |
19 | val writer = new PrintWriter(path, "UTF-8")
20 |
21 | for (line <- lines) {
22 | writer.println(line)
23 | }
24 | writer.close()
25 | }
26 |
27 | def readFile(path: String): ListBuffer[String] = {
28 |
29 | var lines = new ListBuffer[String]()
30 |
31 | val br = new BufferedReader(new FileReader(path))
32 | try {
33 | var line = br.readLine()
34 |
35 | while (line != null) {
36 | lines += line
37 | line = br.readLine()
38 | }
39 | lines
40 | } finally {
41 | br.close()
42 | }
43 | }
44 |
45 | /** Save results locally: each hour's data is saved as one txt file, and one day's data is kept in one folder.
46 | *
47 | * @param dir directory to save the files in
48 | * @param result (word, score) pairs to save
49 | * @author Li Yu
50 | */
51 | def saveAsTextFile(dir: String, result: Array[(String, Double)]): Unit ={
52 |
53 | val day = TimeUtil.getDay
54 | val hour = TimeUtil.getCurrentHour
55 |
56 | val writer = new PrintWriter(new File(dir +"%s".format(day) + "-" + "%s".format(hour) + ".txt"))
57 |
58 | for (line <- result) {
59 |
60 | writer.write(line._1 + "\t" + line._2 + "\n")
61 |
62 | }
63 |
64 | writer.close()
65 | }
66 |
67 | /**
68 | * Read the results of the hour before the current time from the local file.
69 | *
70 | * @param dir directory where the data is saved
71 | * @return
72 | */
73 | def readFromFile(dir: String): Array[(String, Double)] ={
74 |
75 | val date = TimeUtil.getPreHourStr
76 |
77 | val temp = Source.fromFile(dir + "%s".format(date) + ".txt" )
78 |
79 | val res = new mutable.ArrayBuffer[(String, Double)]
80 | temp.getLines().foreach(
81 | line =>{
82 | val temp = line.split("\t")
83 | res.+=((temp(0), temp(1).toDouble))
84 | }
85 | )
86 | res.toArray
87 | }
88 |
89 | }
90 |
--------------------------------------------------------------------------------
/src/main/scala/util/JsonUtil.scala:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | import org.json.JSONObject
4 |
5 | import scala.util.parsing.json.JSON
6 | import scala.io.Source
7 |
8 |
9 | /**
10 | * Created by li on 16/8/29.
11 | * Reads configuration information from a JSON config file.
12 | */
13 | object JSONUtil {
14 |
15 | private var config = new JSONObject()
16 |
17 | /**
18 | * Initialize the configuration object.
19 | *
20 | * @param confDir path of the config file
21 | */
22 | def initConfig(confDir: String): Unit = {
23 |
24 | val jsObj = Source.fromFile(confDir).getLines().mkString("")
25 | config = new JSONObject(jsObj)
26 | }
27 |
28 |
29 | private def readConfigFile(confDir: String): Map[String, Any] = {
30 |
31 | val jsonFile = Source.fromFile(confDir).mkString
32 |
33 | val json = JSON.parseFull(jsonFile)
34 |
35 | json match {
36 |
37 | case Some(map: Map[String, Any]) => map
38 | // case None => println("Parsing failed")
39 | // case other => println("Unknown data structure: " + other)
40 | }
41 |
42 | }
43 |
44 | /**
45 | * Get the corresponding value from the config file.
46 | * @param key1 top-level key
47 | * @param key2 nested key
48 | * @return the value as a string
49 | */
50 | def getValue(key1: String, key2: String): String = {
51 |
52 | config.getJSONObject(key1).getString(key2)
53 | }
54 |
55 | }
56 |
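57 |
58 | /**
59 |  * Illustrative usage only (not part of the original source): a minimal sketch of how the keys in
60 |  * meachinelearning/word2vec/twc/W2VJsonConf.json can be read through JSONUtil. The config path
61 |  * below is a placeholder, not the project's actual location.
62 |  */
63 | object JSONUtilUsageExample {
64 |
65 |   def main(args: Array[String]): Unit = {
66 |
67 |     // Load the JSON config once, then look up values by (section, key).
68 |     JSONUtil.initConfig("/path/to/W2VJsonConf.json")
69 |
70 |     val w2vModelPath  = JSONUtil.getValue("w2v", "w2vmodelPath")
71 |     val w2vModelSize  = JSONUtil.getValue("w2v", "w2vmodelSize").toInt
72 |     val isModel       = JSONUtil.getValue("w2v", "isModel").toBoolean
73 |     val classifyModel = JSONUtil.getValue("classify", "classifymodelpath")
74 |
75 |     println(s"$w2vModelPath $w2vModelSize $isModel $classifyModel")
76 |   }
77 | }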
--------------------------------------------------------------------------------
/src/main/scala/util/LoggerUtil.scala:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | import org.apache.log4j.{BasicConfigurator, Logger}
4 |
5 | /**
6 | * Logging helper operations.
7 | */
8 | object LoggerUtil {
9 |
10 | var logger = Logger.getLogger("word2vector")
11 | BasicConfigurator.configure()
12 | // PropertyConfigurator.configure("/home/mlearning/tdt/conf/log4j.properties")
13 |
14 | def exception(e: Exception) = {
15 |
16 | logger.error(e.getMessage, e) // log message and stack trace; printStackTrace() returns Unit
17 |
18 | }
19 |
20 | def error(msg: String): Unit = {
21 |
22 | logger.error(msg)
23 | }
24 |
25 | def warn(msg: String): Unit = {
26 |
27 | logger.warn(msg)
28 | }
29 |
30 | def info(msg: String): Unit = {
31 |
32 | logger.info(msg)
33 | }
34 |
35 | def debug(msg: String): Unit = {
36 |
37 | logger.debug(msg)
38 | }
39 |
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/scala/util/MySQLUtil.scala:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}
4 |
5 | import scala.collection.mutable.ArrayBuffer
6 | import scala.xml.Elem
7 |
8 | /**
9 | * Created by li on 16/7/12.
10 | */
11 | object MySQLUtil {
12 |
13 | /**
14 | * Read the connection settings from the config file and establish a connection.
15 | *
16 | * @param configFile config file
17 | * @return
18 | */
19 | def getConnect(configFile: Elem): Connection = {
20 |
21 | // Taken from the config file
22 | val url = (configFile \ "mysql" \ "url" ).text
23 | val userName = (configFile \ "mysql" \ "username").text
24 | val password = (configFile \ "mysql" \ "password").text
25 |
26 | // Register the JDBC driver
27 | Class.forName("com.mysql.jdbc.Driver")
28 |
29 | // Initialize the connection
30 | val conn = DriverManager.getConnection(url, userName, password)
31 |
32 | conn
33 | }
34 |
35 | /**
36 | * Write data into MySQL.
37 | *
38 | * @param configFile config file
39 | * @param data data to write into the database
40 | * @param sql SQL statement, e.g. sql = "INSERT INTO quotes (quote, author) VALUES (?, ?)"
41 | */
42 | def write2Mysql(configFile: Elem, data: Iterator[String], sql: String): Unit ={
43 |
44 | var conn: Connection = null
45 | var prep: PreparedStatement = null
46 |
47 | try{
48 |
49 | // Read the config file and establish the connection
50 | conn = getConnect(configFile)
51 |
52 | /** Process each line of the data to be written */
53 | data.foreach{ line => {
54 |
55 | val temp = line.split(",")
56 |
57 | /** Prepare and execute the SQL insert statement */
58 | prep = conn.prepareStatement(sql)
59 | prep.setString(1, temp(0))
60 | prep.setString(2, temp(1))
61 |
62 | prep.executeUpdate()
63 | }}
64 | } catch {
65 |
66 | case e: Exception => println("Mysql Exception: " + e.getMessage)
67 | } finally {
68 |
69 | if(conn != null) {
70 |
71 | conn.close()
72 | }
73 |
74 | if(prep != null) {
75 |
76 | prep.close()
77 | }
78 | }
79 | }
80 |
81 | /**
82 | * Read data from MySQL.
83 | *
84 | * @param configFile config file
85 | * @param sql MySQL query statement
86 | */
87 | def readFromMysql(configFile: Elem, sql: String): Array[(String, String)] = {
88 |
89 | var conn: Connection = null
90 |
91 | try {
92 |
93 | // Read the config file and establish the connection
94 | conn = getConnect(configFile)
95 |
96 | val statement = conn.createStatement(ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_UPDATABLE)
97 | // Result of the SQL query
98 | // val sql = "select symbol, sename from bt_stcode where (EXCHANGE = '001002' or EXCHANGE = '001003') and SETYPE = '101' and CUR = 'CNY' and ISVALID = 1 and LISTSTATUS <> '2'"
99 | val result = statement.executeQuery(sql)
100 |
101 | val stocks = ArrayBuffer[(String, String)]()
102 | while(result.next()) {
103 |
104 | /** TODO: process the query result */
105 | val stockID = result.getString("symbol") // symbol: column name
106 | val stock = stockID + "," + result.getString("sename") // sename: column name
107 | stocks +=((stockID, stock))
108 | }
109 |
110 | stocks.toArray
111 | } catch {
112 |
113 | case e: Exception => Array(("error", "error"))
114 | } finally {
115 |
116 | if (conn != null) conn.close()
117 | }
118 | }
119 |
120 | }
121 |
--------------------------------------------------------------------------------
/src/main/scala/util/RedisUtil.scala:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | import redis.clients.jedis.Jedis
4 |
5 | import scala.collection.mutable
6 | import scala.xml.Elem
7 |
8 | /**
9 | * Created by li on 16/7/8.
10 | */
11 | object RedisUtil {
12 |
13 | var jedis: Jedis = null
14 | /**
15 | * Initialize redis.
16 | *
17 | * @param configFile xml object of the config file
18 | * @note rowNum: 10
19 | */
20 | def initRedis(configFile: Elem): Jedis = {
21 |
22 | val redisIp = (configFile \ "redis" \ "ip").text
23 | val redisPort = (configFile \ "redis" \ "port").text.toInt
24 | val redisDB = (configFile \ "redis" \ "db").text.toInt
25 | val redisAuth = (configFile \ "redis" \ "auth").text
26 |
27 | jedis = new Jedis(redisIp, redisPort)
28 | jedis.auth(redisAuth)
29 | jedis.select(redisDB)
30 |
31 | jedis
32 | }
33 |
34 | /**
35 | * Read from redis (placeholder, not yet implemented).
36 | */
37 | def readFromRedis: Unit ={
38 |
39 | }
40 |
41 | /**
42 | * Save results to redis.
43 | *
44 | * @param resultData data to save, in hash (hset) form
45 | * @author LiYu
46 | * @note rowNum: 12
47 | */
48 | def write2RedisWithHset(resultData: Array[(String, String)], time: String, dataType: String): Unit = {
49 |
50 | val resultDataMap = mutable.HashMap[String, String]()
51 |
52 | resultData.foreach{line => {
53 | resultDataMap.put(line._1, line._2)
54 | }}
55 |
56 | val pipeline = jedis.pipelined()
57 |
58 | resultDataMap.toSeq.foreach{ x => {
59 |
60 | pipeline.hset(s"vipstockstatistic_$dataType" + s"_$time", x._1, x._2)
61 | // pipeline.expire("hotwordsrank_test:", 60 * 60 * 12)
62 | }}
63 |
64 | pipeline.sync()
65 | }
66 |
67 | /**
68 | * Save results to redis.
69 | *
70 | * @param resultData data to save, in sorted-set (zset) form
71 | * @author Li Yu
72 | * @note rowNum: 12
73 | */
74 | def write2RedisWithZset(resultData: Array[(String, String)], time: String, dataType: String, jedis: Jedis): Unit = {
75 |
76 | resultData.foreach{x => {
77 |
78 | jedis.zadd(s"vipstockstatistic_$dataType" + s"_$time", x._2.toDouble, x._1)
79 | }}
80 |
81 | jedis.close()
82 | }
83 |
84 |
85 | }
86 |
--------------------------------------------------------------------------------
/src/main/scala/util/TextProcessing.scala:
--------------------------------------------------------------------------------
1 | package util
2 |
3 |
4 | /**
5 | * Created by li on 16/4/11.
6 | */
7 | object TextProcessing {
8 |
9 | }
10 |
--------------------------------------------------------------------------------
/src/main/scala/util/TimeUtil.scala:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | import java.math.BigInteger
4 | import java.text.SimpleDateFormat
5 | import java.util.{Calendar, Date}
6 |
7 | import org.apache.hadoop.hbase.client.Scan
8 | import org.apache.hadoop.hbase.protobuf.ProtobufUtil
9 | import org.apache.hadoop.hbase.protobuf.generated.ClientProtos
10 | import org.apache.hadoop.hbase.util.Base64
11 |
12 | /**
13 | * Utility class for formatting dates and times.
14 | */
15 | object TimeUtil {
16 |
17 |
18 | /**
19 | * Get the time corresponding to a timestamp.
20 | * @param timeStamp the timestamp in milliseconds
21 | * @return
22 | */
23 | def getTime(timeStamp: String): String = {
24 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd_HH:mm:ss")
25 | val bigInt: BigInteger = new BigInteger(timeStamp)
26 | val date: String = sdf.format(bigInt)
27 | date
28 | }
29 |
30 | /**
31 | * Get the current date, formatted as yyyy-MM-dd.
32 | * @return
33 | */
34 | def getDay: String = {
35 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
36 | val date: String = sdf.format(new Date)
37 | date
38 | }
39 |
40 | /**
41 | * Get the current hour of the day.
42 | * @return
43 | */
44 | def getCurrentHour: Int = {
45 | val calendar = Calendar.getInstance
46 | calendar.setTime(new Date)
47 | calendar.get(Calendar.HOUR_OF_DAY)
48 | }
49 |
50 | /**
51 | * Get the hour before the current hour, formatted as yyyy-MM-dd-HH.
52 | * @return
53 | */
54 | def getPreHourStr: String = {
55 | val date = new Date(new Date().getTime - 60 * 60 * 1000)
56 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd-HH")
57 | sdf.format(date)
58 | }
59 |
60 | /**
61 | * Get today's date.
62 | *
63 | * @return
64 | */
65 | def getNowDate(): String = {
66 | val now: Date = new Date()
67 | val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
68 | val res = dateFormat.format(now)
69 | res
70 | }
71 |
72 |
73 | /**
74 | * Get the start time of the current week (placeholder, not yet implemented).
75 | */
76 | def Null(){
77 |
78 | }
79 |
80 | /**
81 | * Get the start time of the current month (not yet implemented; see the reference below).
82 | * http://blog.csdn.net/springlustre/article/details/47273353
83 | */
84 |
85 |
86 | /**
87 | * Set the scan time range: from 30 days ago up to now.
88 | *
89 | * @return Base64-encoded Scan carrying the time range
90 | * @author
91 | */
92 | def setTimeRange(): String = {
93 |
94 | val scan = new Scan()
95 | val date = new Date(new Date().getTime - 30L * 24 * 60 * 60 * 1000) // 30 days in milliseconds; use Long arithmetic to avoid Int overflow
96 | val format = new SimpleDateFormat("yyyy-MM-dd HH")
97 | val time = format.format(date)
98 | val time1 = format.format(new Date().getTime)
99 | val startTime = time + "-00-00"
100 | val stopTime = time1 + "-00-00"
101 | val sdf: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss")
102 | val startRow: Long = sdf.parse(startTime).getTime
103 | val stopRow: Long = sdf.parse(stopTime).getTime
104 |
105 | scan.setTimeRange(startRow, stopRow)
106 | val proto: ClientProtos.Scan = ProtobufUtil.toScan(scan)
107 |
108 | Base64.encodeBytes(proto.toByteArray)
109 | }
110 |
111 | /**
112 | * Set a one-day scan time range for the specified date.
113 | * @param time the specified date
114 | * @return the time range from the previous day to the specified date
115 | */
116 | def setAssignedTimeRange(time: String): String = {
117 |
118 | val format = new SimpleDateFormat("yyyy-MM-dd")
119 |
120 | val date = format.parse(time)
121 |
122 | val endTime = new Date(date.getTime - 24 * 60 * 60 * 1000)
123 |
124 | val stopTime = format.format(endTime)
125 |
126 | val startDate = time + "-00-00-00"
127 | val stopDate = stopTime + "-00-00-00"
128 |
129 | val sdf = new SimpleDateFormat("yyyy-MM-dd HH-mm-ss")
130 | val startRaw = sdf.parse(startDate).getTime
131 | val stopRaw = sdf.parse(stopDate).getTime
132 |
133 | val scan = new Scan()
134 | scan.setTimeRange(stopRaw, startRaw) // minStamp must not exceed maxStamp; stopRaw (the previous day) is the earlier timestamp
135 |
136 | val proto = ProtobufUtil.toScan(scan)
137 |
138 | Base64.encodeBytes(proto.toByteArray)
139 | }
140 |
141 |
142 | }
143 |
--------------------------------------------------------------------------------
/src/main/scala/util/UrlCategoryTrim.scala:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | import java.io.{BufferedWriter, File, FileWriter}
4 |
5 | import org.apache.spark.{SparkConf, SparkContext}
6 |
7 | import scala.collection.mutable.ListBuffer
8 | import scala.io.Source
9 |
10 | /**
11 | * Created by li on 16/4/6.
12 | * In the input data a single url can map to several categories; split the categories that share a URL into separate one-to-one (url, category) pairs.
13 | */
14 | object UrlCategoryTrim {
15 |
16 | // If a line has several categories, split them and pair each one with the url
17 | def splitCategory(tuple:(String,String)): ListBuffer[(String)] = {
18 |
19 | val listBuffer = new ListBuffer[(String)]
20 | val cata = tuple._1.split(",")
21 |
22 | if(cata.length < 1){
23 |
24 | listBuffer.+=(tuple._2 + "\t" + tuple._1)
25 | } else {
26 |
27 | for(item <- cata){
28 |
29 | listBuffer.+=(tuple._2+ "\t" +item)
30 | }
31 | }
32 | listBuffer
33 | }
34 |
35 | def main(args: Array[String]) {
36 | val conf = new SparkConf().setAppName("urlCatagoryTrim").setMaster("local")
37 | val sc = new SparkContext(conf)
38 |
39 |
40 | val data = Source.fromFile("/Users/li/Downloads/trainingLabel(0).new").getLines().toArray.map{
41 | line =>
42 | val tmp = line.split("\t")
43 | (tmp(1), tmp(0))
44 |
45 | }
46 |
47 | // data.flatMap(splitCatagory).foreach(println)
48 | // Save to a file
49 | val dataFile = new File("/users/li/Downloads/trainglabel3.txt")
50 | val fileWriter = new FileWriter(dataFile)
51 | val bufferWriter = new BufferedWriter(fileWriter)
52 |
53 | data.flatMap(x => splitCategory(x)).foreach (
54 | line =>
55 | bufferWriter.write(line + "\n")
56 | )
57 |
58 | bufferWriter.flush()
59 | bufferWriter.close()
60 |
61 | }
62 |
63 |
64 |
65 | }
66 |
67 |
68 |
--------------------------------------------------------------------------------
/src/main/scala/util/XMLUtil.scala:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | import scala.xml.{XML, Elem}
4 |
5 | /**
6 | * Created by li on 16/8/29.
7 | */
8 | object XMLUtil {
9 |
10 | /**
11 | * Load the xml config file.
12 | *
13 | * @param dir path of the config file
14 | * @return
15 | */
16 | def readConfigFile(dir: String): Elem = {
17 |
18 | val configFile = XML.loadFile(dir)
19 |
20 | configFile
21 | }
22 |
23 | }
24 |
--------------------------------------------------------------------------------
/src/main/scala/util/regularExpression.scala:
--------------------------------------------------------------------------------
1 | package util
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 |
5 | import scala.collection.mutable.ListBuffer
6 | import scala.util.matching.Regex
7 |
8 | /**
9 | * Created by li on 16/6/20.
10 | * Regular expressions: the commented-out patterns match the contents inside double quotes; the active pattern matches signed integers, floats and percentages.
11 | */
12 | object regularExpression extends App{
13 | val conf = new SparkConf().setMaster("local").setAppName("regularexpression")
14 | val sc = new SparkContext(conf)
15 |
16 | val data = sc.textFile("file:/Users/li/kunyan/111.txt")
17 |
18 | def quotationMatch(sentence:String): Array[String] = {
19 |
20 | // val regex = new Regex("\"([^\"]*)\"") // match double-quoted text
21 | // val regex = new Regex("(?<=\").{1,}(?=\")") // match double-quoted text
22 | val regex = new Regex("([-+]?\\d+(\\.\\d+)?%)|[-+]?\\d+(\\.\\d+)?") // match signed integers, floats, and percentages
23 |
24 | // val regex = "\"([^\"]*)\"".r
25 | val num = regex.findAllIn(sentence)
26 | val res = new ListBuffer[String]
27 | while(num.hasNext){
28 | val item = num.next()
29 | res += item.replaceAll("\"", "")
30 | }
31 | res.toArray
32 | }
33 |
34 | // val res = quotationMatch(data)
35 | data.foreach {
36 |
37 | x =>{
38 | val res = quotationMatch(x)
39 | res.foreach(println)
40 | }
41 | }
42 |
43 |
44 |
45 |
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/scala/wordSegmentation/AnsjAnalyzer.scala:
--------------------------------------------------------------------------------
1 | package wordSegmentation
2 |
3 | import org.ansj.domain.Term
4 | import org.ansj.library.UserDefineLibrary
5 | import org.ansj.splitWord.analysis.{NlpAnalysis, ToAnalysis}
6 | import org.apache.spark.SparkContext
7 | import org.nlpcn.commons.lang.tire.domain.Value
8 | import org.nlpcn.commons.lang.tire.library.Library
9 |
10 | /**
11 | * Created by zhangxin on 2016/3/8
12 | * Word segmentation tool based on ansj
13 | */
14 | object AnsjAnalyzer {
15 |
16 | /**
17 | * Initialize the ansj analyzer and add user dictionaries.
18 | *
19 | * @param sc the Spark context
20 | * @param userDic array of user dictionary paths
21 | * @return nothing
22 | * @author zhangxin
23 | */
24 | def init(sc: SparkContext, userDic: Array[String]): Unit = {
25 |
26 | val forest = Library.makeForest("library/default.dic")
27 | // val forest = new Forest()
28 |
29 | if(userDic != null ){
30 | userDic.foreach(addUserDic(_, sc))
31 | }
32 |
33 | }
34 |
35 | /**
36 | * Add a user dictionary to the analyzer.
37 | *
38 | * @param dicPath path of the dictionary
39 | * @param sc the Spark context
40 | * @return nothing
41 | * @author zhangxin
42 | */
43 | def addUserDic(dicPath: String, sc: SparkContext): Unit = {
44 |
45 | // Read the dictionary
46 | val dic = sc.textFile(dicPath).collect()
47 |
48 | // Add the entries to ansj
49 | dic.foreach(UserDefineLibrary.insertWord(_, "userDefine", 100))
50 |
51 |
52 | }
53 |
54 | /**
55 | * Standard segmentation, without part-of-speech tagging.
56 | *
57 | * @param sentence sentence to segment
58 | * @return segmentation result
59 | * @author zhangxin
60 | */
61 | def cutNoTag(sentence: String): Array[String] = {
62 |
63 | // Add a dictionary entry; this part has issues and needs further study
64 | val value = new Value("济南 \tn")
65 |
66 | Library.insertWord(UserDefineLibrary.ambiguityForest, value)
67 |
68 | // Segment the sentence
69 | val sent = ToAnalysis.parse(sentence)
70 |
71 | // Extract the segmented words, dropping part-of-speech tags
72 | val words = for(i <- Range(0, sent.size())) yield sent.get(i).getName
73 |
74 | words.toArray
75 | }
76 |
77 | /**
78 | * Natural-language segmentation, with part-of-speech tagging.
79 | *
80 | * @param sentence sentence to segment
81 | * @return segmentation result
82 | * @author zhangxin
83 | */
84 | def cutWithTag(sentence: String): Array[Term] = {
85 |
86 | // Segment the sentence
87 | val sent = NlpAnalysis.parse(sentence)
88 |
89 | // Extract the segmented terms
90 | val words = for(i <- Range(0, sent.size())) yield sent.get(i).next()
91 |
92 | words.toArray
93 | }
94 |
95 |
96 | /**
97 | * Standard segmentation, without part-of-speech tagging.
98 | *
99 | * @param sentence sentence to segment
100 | * @return segmentation result
101 | */
102 | def cutTag(sentence: String, option: Int): Array[String] = {
103 |
104 | val value = new Value("济南\tn")
105 |
106 | Library.insertWord(UserDefineLibrary.ambiguityForest, value)
107 |
108 | // Segment the sentence
109 | val sent = ToAnalysis.parse(sentence)
110 |
111 | option match {
112 | case 0 => {
113 |
114 | // Extract the segmented words, dropping part-of-speech tags
115 | val words = for(i <- Range(0, sent.size())) yield sent.get(i).getName
116 |
117 | words.toArray
118 | }
119 |
120 | case 1 => {
121 |
122 | // Extract the segmented words
123 | val words = for(i <- Range(0, sent.size())) yield sent.get(i).getName
124 |
125 | words.toArray
126 | }
127 | }
128 | }
129 |
130 | }
131 |
--------------------------------------------------------------------------------
/src/main/scala/wordSegmentation/wordSegmentAnalyser.scala:
--------------------------------------------------------------------------------
1 | package wordSegmentation
2 |
3 |
4 | /**
5 | * Created by li on 16/8/29.
6 | * Invokes the ansj word segmentation system
7 | */
8 | object wordSegmentAnalyser {
9 |
10 | val content = "我是中国人,我经济南下车到广州。中国经济南下势头迅猛!"
11 |
12 | def sentenceSegment(content: String): Array[Array[String]] = {
13 |
14 | // Split the article into sentences
15 | val sentenceArr = content.split(",|。|\t|\n|,|:")
16 | // Segment each sentence
17 | val segResult = sentenceArr.map(AnsjAnalyzer.cutNoTag)
18 |
19 | segResult.foreach(x => {
20 |
21 | x.foreach(x => print(x + "| "))
22 | println()
23 | })
24 |
25 | segResult
26 | }
27 |
28 |
29 | // def isElem(sentence: Array[String], candidate: Array[String]): Boolean = {
30 | //
31 | // candidate.map{ line => {
32 | //
33 | // if(sentence.contains(line)) {
34 | //
35 | // return true
36 | //
37 | // } else {
38 | //
39 | // return false
40 | // }
41 | // }}
42 | //
43 | // }
44 | //
45 | // def identify(sentenceSeg: Array[Array[String]],
46 | // candidateDic: (String, Array[String])): Array[(Array[String], Array[String])] = {
47 | //
48 | // sentenceSeg.map{line => {
49 | // if (isElem(line, candidateDic._2)){
50 | //
51 | // (line, candidateDic._1)
52 | // } else {
53 | // (line, "0")
54 | // }
55 | // }}
56 | //
57 | // }
58 |
59 |
60 | def main(args: Array[String]) {
61 |
62 | // Segment each sentence
63 |
64 | sentenceSegment(content)
65 |
66 | // Set the matching window
67 |
68 |
69 | // Extract nouns
70 |
71 |
72 |
73 | }
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/src/test/resources/2016-07-11-15.txt:
--------------------------------------------------------------------------------
1 | 好男儿 2.313289243522607
2 | 太正宵 0.7779809171400112
3 | 婚纱 0.7515285506073754
4 | 俞思远 0.6920085439132682
5 | 董文华 0.6858591525761419
6 | 直播 0.5917747547425979
7 | 六合彩 0.5647028809538401
8 | 李宇春 0.5534632615609104
9 | 男同志 0.43542120073545265
10 | 演唱会 0.415335092651389
11 | 无耻 0.4137490483483452
12 | 敢死队 0.2982491500059149
--------------------------------------------------------------------------------
/src/test/resources/2016-07-12-13.txt:
--------------------------------------------------------------------------------
1 | t1 1.0564168992636667
2 | t3 0.9591311372616367
3 | t2 0.6064584948059578
4 | 敢死队 0.03318315879765851
5 | 好男儿 0.028594324469757446
6 |
--------------------------------------------------------------------------------
/src/test/resources/2016-07-12-15.txt:
--------------------------------------------------------------------------------
1 | 好男儿 0.313289243522607
2 | 太正宵 1.7779809171400112
3 | 婚纱 0.7515285506073754
4 | 俞思远 2.6920085439132682
5 | 董文华 0.6858591525761419
6 | 直播 0.5917747547425979
7 | 六合彩 0.5647028809538401
8 | 李宇春 0.5534632615609104
9 | 男同志 0.43542120073545265
10 | 演唱会 0.415335092651389
11 | 无耻 0.4137490483483452
12 | 敢死队 0.2982491500059149
--------------------------------------------------------------------------------
/src/test/resources/2016-07-12-16.txt:
--------------------------------------------------------------------------------
1 | t3 1.0564168992636667
2 | t1 0.9591311372616367
3 | t2 0.6064584948059578
4 | 敢死队 0.03318315879765851
5 | 好男儿 0.028594324469757446
6 |
--------------------------------------------------------------------------------
/src/test/resources/text/1.txt:
--------------------------------------------------------------------------------
1 | 光伏,中国人民银行,列,入,绿色,债券,支援,专案,目录,2015年12月22日,19:00:00,中国人民银行,发布,2015,第39,号,公告,公告,称为,加快,建设生态文明,引导,金融机构,服务,绿色发展,推动,经济结构转型,升级,经济发展方式转变,支援,金融机构,发行,绿色,金融债券,募集资金,支援,绿色,产业发展,笔者,目录,第5,项,清洁能源,发电,中,风力发电,光伏发电,智慧,电网,能源,因特网,分布式能源,太阳能热利用,水力发电,新能源,利用,列,入,太阳能光伏发电站,太阳能,高,温热,发电站,不含,分布式,太阳能光伏发电,系统,需,限定,条件,多晶硅,电池,组件,光电,转化,效率,≥,15.5%,组件,专案,投产,运行,日,一年,衰减率,≤,2.5%,年,衰减率,≤,0.7%,单晶硅,电池,组件,光电,转化,效率,≥,16%,组件,专案,投产,运行,日,一年,衰减率,≤,3%,年,衰减率,≤,0.7%,高,倍,聚光光伏,组件,光电,转化,效率,≥,28%,项目,投产,运行,日,一年,衰减率,≤,2%,年,衰减率,≤,0.5%,项目全生命周期,衰减率,≤,10%,硅基,薄膜电池,组件,光电,转化,效率,≥,8%,铜铟镓硒,CIGS,薄膜电池,组件,光电,转化,效率,≥,11%,碲化镉,CdTe,薄膜电池,组件,光电,转化,效率,≥,11%,薄膜电池,组件,光电,转化,效率,≥,10%,多晶硅,单晶硅,薄膜电池,项目全生命周期,衰减率,≤,20%,智能电网,能源,因特网,指,提高,供,需,负荷,平衡,回应,能力,改善,电网,综合,能效,降低,输变电,损耗,增强,可再生能源,接,入,能力,电网建设,运营,技术,升级,改造,专案,1.,智能电网,指,采用,智慧,型,电气设备,即时,双向,集成,通信技术,先进技术,电网建设,运营,专案,电网,智慧,化,升级,改造,项目,2.,能源,因特网,指,综合,电力电子,资讯,智慧,管理技术,连接,分布式能源,含,分布式,可再生能源,分布式,储能,装置,类型,负荷,能量,双向,流动,交换,共享,电网,微电网,能源,燃气,网络,设施,建设,运营,专案,分布式能源,指,区域,能源站,包括,天然气,区域,能源站,分布式光伏发电,系统,分布式能源,设施,建设,运营,分布式能源,接,入,峰谷,调节,系统,分布式,电力,交易平台,能源管理系统,建设,运营,附,中国人民银行公告,2015,第39,号,绿色,债券,支援,专案,目录
--------------------------------------------------------------------------------
/src/test/resources/text/2.txt:
--------------------------------------------------------------------------------
1 | 记者,国家电网公司,获悉,9月23日,河北丰宁,二期,山东文登,重庆,蟠龙,抽水蓄能电站,工程,以下简称,丰宁,二期,文登,蟠龙,抽,蓄,座,抽,蓄,电站,正式,开工,总投资,244.4亿,元,总装机容量,480万,千瓦,计划,2022年,竣工,投产,项目,预计,增加,发电,装备制造业,产值,111亿,元,推动,相关,装备制造业,发展,开工,动员大会,国家电网公司,董事长,党组书记,刘振亚,丰宁,二期,文登,蟠龙,抽,蓄,国家电网公司,推进,特高压电网,建设,服务,清洁能源,发展,重大工程,继,2015年6月,安徽金寨,山东沂蒙,河南,天池,座,抽水蓄能电站,第二批,开工,电站,标志,我国,抽水蓄能电站,加快,发展,新,阶段,介绍,河北丰宁,二期,抽水蓄能电站,项目,位于,河北省承德市,丰宁县,装机容量,180万,千瓦,安装,台,30万,千瓦,可逆,式,水轮发电机组,500,千伏,电压,接,入,华北电网,工程投资,87.5亿,元,丰宁抽水蓄能电站,一期,二期,装机容量,360万,千瓦,世界上,装机容量,抽水蓄能电站,山东,文登抽水蓄能电站,位于,山东省,威海市文登区,装机容量,180万,千瓦,安装,台,30万,千瓦,可逆,式,水轮发电机组,500,千伏,电压,接,入,山东电网,工程投资,85.7亿,元,重庆,蟠龙,抽水蓄能电站,位于,重庆市綦江区,装机容量,120万,千瓦,安装,台,30万,千瓦,可逆,式,水轮发电机组,500,千伏,电压,接,入,重庆电网,工程投资,71.2亿,元,国网,座,受,端,电网,地区,抽水蓄能电站,建成,更好地,接纳,区,外,来电,优化,电源,结构,提高,北,西南,地区,清洁能源,消纳,能力,提高,特高压电网,系统安全,可靠性,综合,煤电,机组,消纳,清洁能源,效果,建设,丰宁,二期,文登,蟠龙,抽,蓄,年,节约,原煤,消耗,291万,吨,减排,烟尘,0.3万,吨,二氧化硫,1.4万,吨,氮氧化物,1.3万,吨,二氧化碳,485万,吨,节能减排,大气污染防治,国家电网公司,经营,区域,内在,运,抽水蓄能电站,装机容量,1674.5万,千瓦,建,规模,1880万,千瓦,预计,2017年,我国,抽水蓄能,装机,3300万,千瓦,超过,美国,世界上,抽水蓄能电站,第一,大国
--------------------------------------------------------------------------------
/src/test/resources/text/abstract:
--------------------------------------------------------------------------------
1 | 算法可大致分为基本算法、数据结构的算法、数论算法、计算几何的算法、图的算法、动态规划以及数值分析、加密算法、排序算法、检索算法、随机化算法、并行算法、厄米变形模型、随机森林算法。
2 | 算法可以宽泛的分为三类,
3 | 一,有限的确定性算法,这类算法在有限的一段时间内终止。他们可能要花很长时间来执行指定的任务,但仍将在一定的时间内终止。这类算法得出的结果常取决于输入值。
4 | 二,有限的非确定算法,这类算法在有限的时间内终止。然而,对于一个(或一些)给定的数值,算法的结果并不是唯一的或确定的。
5 | 三,无限的算法,是那些由于没有定义终止定义条件,或定义的条件无法由输入的数据满足而不终止运行的算法。通常,无限算法的产生是由于未能确定的定义终止条件。
--------------------------------------------------------------------------------
/src/test/scala/CNNTest.scala:
--------------------------------------------------------------------------------
1 | import breeze.linalg.{DenseMatrix, DenseVector}
2 |
3 | //import breeze.linalg.{CSCMatrix => BSM, DenseMatrix => BDM, DenseVector => BDV, Matrix => BM, SparseVector => BSV, Vector => BV, accumulate => Accumulate, axpy => brzAxpy, rot90 => Rot90, sum => Bsum, svd => brzSvd, DenseVector}
4 | //import breeze.numerics.{exp => Bexp, tanh => Btanh}
5 | //import org.apache.spark.mllib.linalg.DenseMatrix
6 |
7 |
8 | /**
9 | * Created by li on 16/8/15.
10 | */
11 | object CNNTest {
12 |
13 |
14 | def main(args: Array[String]) {
15 | //
16 | // def sigm(matrix: BDM[Double]): BDM[Double] = {
17 | // val s1 = 1.0 / (Bexp(matrix * (-1.0)) + 1.0)
18 | // s1
19 | // }
20 | //
21 | // val result = BDM.ones[Double](2, 3) + 1.8
22 |
23 |
24 |
25 | val a = DenseVector(1.0, 2.0, 3.0, 4.0, 5.0)
26 |
27 | val b = DenseVector(1.0, 2.0, 3.0, 4.0, 5.0)
28 |
29 | val c = DenseMatrix.ones[Double](5, 2)
30 |
31 | val d = DenseMatrix.ones[Double](5, 5)
32 |
33 | println((a.toDenseMatrix :* d))
34 |
35 |
36 | // val c = (a :* b) :* d
37 | //
38 | // println(c)
39 |
40 |
41 |
42 | }
43 |
44 | }
45 |
--------------------------------------------------------------------------------
/src/test/scala/HDFSUtilTest.scala:
--------------------------------------------------------------------------------
1 | import util.HDFSUtil
2 |
3 | import scala.xml.XML
4 |
5 | /**
6 | * Created by li on 16/7/25.
7 | */
8 | object HDFSUtilTest {
9 |
10 | def main(args: Array[String]) {
11 |
12 | val configFile = XML.loadFile("/Users/li/Kunyan/NaturalLanguageProcessing/src/main/scala/util/config.xml")
13 |
14 | val filesystem = HDFSUtil.setHdfsConfigure(configFile)
15 |
16 | }
17 |
18 |
19 | }
20 |
--------------------------------------------------------------------------------
/src/test/scala/JSONUtilTest.scala:
--------------------------------------------------------------------------------
1 | import util.JSONUtil
2 |
3 | /**
4 | * Created by li on 16/8/29.
5 | */
6 | object JSONUtilTest {
7 |
8 |
9 | def main(args: Array[String]) {
10 |
11 | val confDir = "/Users/li/Kunyan/NaturalLanguageProcessing/src/main/resources/jsonConfig.json"
12 |
13 | JSONUtil.initConfig(confDir)
14 |
15 | val res = JSONUtil.getValue("hbase", "rootDir")
16 |
17 | println(res)
18 | }
19 |
20 | }
21 |
--------------------------------------------------------------------------------
/src/test/scala/MySQLUtilTest.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.{SparkContext, SparkConf}
2 | import util.{XMLUtil, MySQLUtil}
3 |
4 | /**
5 | * Created by li on 16/8/29.
6 | */
7 | object MySQLUtilTest {
8 |
9 | def main(args: Array[String]) {
10 | val conf = new SparkConf().setAppName("MySQLUtilTest").setMaster("local")
11 | val sc = new SparkContext(conf)
12 |
13 | val confDir = "/Users/li/Kunyan/workShop/VipStockStatistic/src/main/scala/util/config.xml"
14 |
15 | val stockSql = "select symbol, sename from bt_stcode where (EXCHANGE = '001002' or EXCHANGE = '001003') " +
16 | "and SETYPE = '101' and CUR = 'CNY' and ISVALID = 1 and LISTSTATUS <> '2'"
17 |
18 | val configFile = XMLUtil.readConfigFile(confDir)
19 |
20 | val stockDic = MySQLUtil.readFromMysql(configFile, stockSql)
21 | .map(row => (row._1, row._2.split(","))).toMap
22 |
23 | stockDic.foreach(x => print(x._1, x._2(0)))
24 |
25 |
26 |
27 |
28 |
29 |
30 | }
31 |
32 | }
33 |
--------------------------------------------------------------------------------
/src/test/scala/Test.scala:
--------------------------------------------------------------------------------
1 | import org.apache.spark.ml.feature.Word2Vec
2 | import org.apache.spark.sql.SQLContext
3 | import org.apache.spark.{SparkContext, SparkConf}
4 |
5 |
6 | /**
7 | * Created by li on 16/4/15.
8 | *
9 | *
10 | *
11 | * import org.apache.spark.ml.feature.Word2Vec
12 |
13 | */
14 | object Test {
15 |
16 | def main(args: Array[String]) {
17 | // val setPath = "/Users/li/kunyan/DataSet/trainingsetUnbalance/YSJS.txt"
18 | // val industry = "化工化纤"
19 | // BinaryClassificationRDD.dataOperation(setPath, industry)
20 | val conf = new SparkConf().setAppName("test").setMaster("local")
21 | val sc = new SparkContext(conf)
22 | val sqlContext = new SQLContext(sc)
23 |
24 |
25 | // Input data: Each row is a bag of words from a sentence or document.
26 | val documentDF = sqlContext.createDataFrame(Seq(
27 | "Hi I heard about Spark".split(" "),
28 | "I wish Java could use case classes".split(" "),
29 | "Logistic regression models are neat".split(" ")
30 | ).map(Tuple1.apply)).toDF("text")
31 |
32 | // Learn a mapping from words to Vectors.
33 | val word2Vec = new Word2Vec()
34 | .setInputCol("text")
35 | .setOutputCol("result")
36 | .setVectorSize(3)
37 | .setMinCount(0)
38 | val model = word2Vec.fit(documentDF)
39 | val result = model.transform(documentDF)
40 | result.select("result").foreach(println)
41 | result.show()
42 |
43 | }
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/src/test/scala/TextRankTest.scala:
--------------------------------------------------------------------------------
1 | import meachinelearning.textrank.{PropertyExtractor, ConstructTextGraph}
2 | import org.graphstream.graph.Node
3 |
4 | import scala.collection.mutable.ListBuffer
5 | import scala.io.Source
6 |
7 | /**
8 | * Created by li on 16/6/23.
9 | */
10 | object TextRankTest {
11 |
12 | def main(args: Array[String]) {
13 |
14 | val doc = new ListBuffer[(String)]
15 |
16 | val text = Source.fromURL(getClass.getResource(s"/text/${2}.txt")).getLines().mkString("\n")
17 | text.split(",").foreach(x => doc.+=(x))
18 |
19 |
20 | // Build the candidate keyword graph and set the window size
21 | val textGraph = new ConstructTextGraph("url", 10, doc.toList).constructGraph
22 |
23 | // Print the edges and vertices of the constructed undirected graph
24 | // textGraph.getEdgeSet.toArray.foreach(println)
25 | // textGraph.getNodeSet.toArray.foreach(println)
26 | // assert(textGraph.getEdgeSet.size() > 0)
27 | println((1 to 30).map(i => "=").mkString)
28 |
29 | // Print the extracted keywords
30 | val keywordExtractor = new PropertyExtractor(textGraph, 5)
31 | keywordExtractor.extractKeywords(100, 0.85f).foreach(
32 | node =>
33 | println(" keyword: " + node._1, " score: " + node._2)
34 | )
35 | println((1 to 30).map(i => "=").mkString)
36 |
37 | // Get the degree of each keyword node
38 | textGraph.getNodeSet.toArray.map(_.asInstanceOf[Node]).foreach {
39 | node =>
40 | println (node.getId, node.getDegree)
41 | }
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/test/scala/classification.scala:
--------------------------------------------------------------------------------
1 | import java.io.{BufferedWriter, File, FileWriter}
2 |
3 | import org.apache.spark.{SparkContext, SparkConf}
4 | import org.apache.spark.rdd.RDD
5 |
6 |
7 | /**
8 | * Created by li on 16/3/31.
9 | */
10 | object classification {
11 |
12 | val conf = new SparkConf().setAppName("meachinelearning/classification").setMaster("local")
13 | val sc = new SparkContext(conf)
14 |
15 |
16 | def getFile(url: String): RDD[(String, String)] ={
17 | val content = sc.textFile(url).map{
18 | line =>
19 | val data = line.split("\t")
20 | if (data.length > 1) data(0) -> data(1)
21 | }.filter( _ != ()).map(_.asInstanceOf[(String, String)])
22 | content
23 | }
24 |
25 |
26 | def getTrainingset(catagory: RDD[(String, String)], content: RDD[(String, String)], label: String, dataFile: String): Unit ={
27 | // val trainingSet = new ArrayBuffer[String ]
28 | val DataFile = new File(dataFile)
29 | val bufferWriter = new BufferedWriter(new FileWriter(DataFile))
30 |     // Collect to the driver and write one line per document; the original nested
31 |     // catagory.map here was a lazy, nested RDD operation and would never have executed.
32 |     content.collect().foreach {
33 |       line =>
34 |         bufferWriter.write((if(label == line._1) "1" else "0") + "\t" + line._1 + "\t"+ line._2 + "\n")
35 |     // val trainingdata = (if(catagory == line._1) "1" else "0") + "\t" + line._1 + "\t"+ line._2
36 |     // trainingSet += trainingdata
37 |     }
38 |
39 | bufferWriter.flush()
40 | bufferWriter.close()
41 |
42 | }
43 |
44 |
45 |
46 | // val DataFile = new File(dataFile)
47 | // val bufferWriter = new BufferedWriter(new FileWriter(DataFile))
48 | // for(item <- list) {
49 | // val cata = map.get(item._1).get
50 | // bufferWriter.write((if(cata == catagory) "1" else "0") + "\t" + cata + "\t"+ item._2 + "\n")
51 | // }
52 | // bufferWriter.flush()
53 | // bufferWriter.close()
54 | // }
55 |
56 | def main(args: Array[String]) {
57 |
58 | // val urlContent = new collection.mutable.HashMap[String , String ]
59 | // val urlCatagory = new ListBuffer[(String, String)]
60 | val catagory1 = "有色金属"
61 | val datafile1 = "/users/li/Downloads/2222.txt"
62 |
63 | val url1 = "/users/li/Downloads/segTraining"
64 | val url2 = "/users/li/Downloads/traininglabel"
65 |
66 | val urlContent = getFile(url1)
67 | val urlCatagory = getFile(url2)
68 |
69 | val res = getTrainingset(urlCatagory, urlContent, catagory1, datafile1)
70 |
71 | }
72 |
73 |
74 |
75 |
76 |
77 | }
78 |
--------------------------------------------------------------------------------
/src/test/scala/keywordExtractorTest.scala:
--------------------------------------------------------------------------------
1 |
2 |
3 | /**
4 | * Created by li on 16/6/27.
5 | */
6 | object keywordExtractorTest {
7 |
8 |
9 | def main(args: Array[String]) {
10 |
11 |
12 | val url = "http://anotherbug.blog.chinajavaworld.com/entry/4545/0/"
13 |
14 | println(url.contains("blog"))
15 | }
16 |
17 | }
18 |
--------------------------------------------------------------------------------
/src/test/scala/telecomDataProcessingTest.scala:
--------------------------------------------------------------------------------
1 | //import org.apache.spark.{SparkConf, SparkContext}
2 | //
3 | //import scala.collection.mutable
4 | //
5 | ///**
6 | // * Created by li on 16/7/20.
7 | // */
8 | //object TelecomDataProcessingTest {
9 | //
10 | //
11 | // def main(args: Array[String]) {
12 | //
13 | // val conf = new SparkConf().setAppName("test").setMaster("local")
14 | // val sc = new SparkContext(conf)
15 | //
16 | // val setTime = "2016-07-15"
17 | //
18 | //    // Set the time ranges, one hour per interval
19 | // val timeRangeHour = TelecomDataProcessing.setAssignedHourRange(setTime)
20 | //
21 | //    // Data on HDFS, one day's worth
22 | // val dir = "hdfs://222.73.57.12:9000/telecom/shdx/origin/data/"
23 | // val dataFromHDFS = TelecomDataProcessing.dataReadFromHDFS(sc, dir, setTime).filter(! _._1.contains("home/telecom"))
24 | //
25 | // println("dataFromHDFS结束")
26 | // // dataFromHDFS.foreach(println)
27 | //
28 | //    // Data on HBase
29 | //    val confDir = "/Users/li/kunyan/NaturalLanguageProcessing/src/main/scala/util/config.xml" // hbase config file path
30 | //    val tableName = "wk_detail" // table name
31 | //
32 | // val result = new mutable.ArrayBuffer[(String, Array[(String, Long)])]
33 | //
34 | // for (item <- 0 until 1) {
35 | //
36 | // val temp = dataFromHDFS.filter { line => {
37 | //
38 | // (timeRangeHour(item)._1 <= line._1.toLong) && (line._1.toLong <= timeRangeHour(item)._2)
39 | //
40 | // }}.map(_._2)
41 | //
42 | // println("temp读取结束")
43 | //
44 | // temp.foreach(println)
45 | //
46 | // val hBaseConf = TelecomDataProcessing.getHBaseConf(sc, confDir, timeRangeHour(item), tableName)
47 | //
48 | // val newsFromHBase = TelecomDataProcessing.newsReadFromHBase(hBaseConf)
49 | //
50 | // newsFromHBase.foreach(println)
51 | //
52 | // val res = TelecomDataProcessing.urlMatching(temp, newsFromHBase)
53 | //
54 | // result.+=((item.toString, res))
55 | //
56 | // }
57 | //
58 | // result.toArray.foreach( x => {
59 | // println(x._1)
60 | // x._2.foreach(x => println((x._1, x._2)))
61 | // })
62 | //
63 | //
64 | // sc.stop()
65 | //
66 | // }
67 | //
68 | //}
69 |
--------------------------------------------------------------------------------
/src/test/scala/testRankTest.scala:
--------------------------------------------------------------------------------
1 |
2 |
3 | import meachinelearning.textrank.TextRank
4 |
5 | import scala.collection.mutable.ListBuffer
6 | import scala.io.Source
7 |
8 | /**
9 | * Created by li on 16/6/24.
10 | */
11 | object testRankTest {
12 |
13 | def main(args: Array[String]) {
14 |
15 | val doc = new ListBuffer[(String)]
16 |
17 | val text = Source.fromURL(getClass.getResource(s"/text/${2}.txt")).getLines().mkString("\n")
18 | text.split(",").foreach(x => doc.+=(x))
19 |
20 | val keyWordList = TextRank.run("url", 5, doc.toList, 3, 100, 0.85f)
21 |
22 | keyWordList.foreach {
23 | word => {
24 | println(word._1, word._2)
25 | }
26 | }
27 | }
28 |
29 | }
30 |
--------------------------------------------------------------------------------
/src/test/scala/timeutilTest.scala:
--------------------------------------------------------------------------------
1 | import util.TimeUtil
2 |
3 | /**
4 | * Created by li on 16/7/19.
5 | */
6 | object TimeUtilTest {
7 |
8 | def main(args: Array[String]) {
9 |
10 | TimeUtil.setAssignedTimeRange("2016-2-1")
11 |
12 | }
13 |
14 | }
15 |
--------------------------------------------------------------------------------
/src/test/scala/word2vecTest.scala:
--------------------------------------------------------------------------------
1 | import meachinelearning.word2vec.Word2Vec
2 | import org.apache.spark.{SparkConf, SparkContext}
3 |
4 | /**
5 | * Created by li on 16/7/15.
6 | */
7 | object word2vecTest {
8 |
9 |
10 | def main(args: Array[String]) {
11 |
12 |
13 | val conf = new SparkConf().setAppName("word2vec").setMaster("local")
14 | val sc = new SparkContext(conf)
15 |
16 | val data = sc.parallelize(List("sadfad\tsdfasdfasdf\tasdfasdfasdfasdfasdf\t中欧,8,美国,成都,;,", "dddddd\tfdasdfvvv\tdfafasfdsadfs\t日本,中欧,.,中国,加州,/,顺分"))
17 |
18 | val punctuation = sc.textFile("/Users/li/kunyan/DataSet/punctuations.txt").collect()
19 |
20 | val s = Word2Vec.formatTransform(data, punctuation)
21 |
22 | s.foreach(println)
23 |
24 | }
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/target/.history:
--------------------------------------------------------------------------------
1 | all
2 | help sbt
3 | help clean
4 | help clear
5 | exit
6 |
--------------------------------------------------------------------------------
/target/resolution-cache/default/classification$sbt_2.10/1.0/resolved.xml.properties:
--------------------------------------------------------------------------------
1 | #default#classification$sbt_2.10;1.0 resolved revisions
2 | #Tue Jul 05 15:26:43 CST 2016
3 | +revision\:\#@\#\:+5.0.4\:\#@\#\:+module\:\#@\#\:+asm-tree\:\#@\#\:+organisation\:\#@\#\:+org.ow2.asm\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=5.0.4 release 5.0.4 null
4 | +revision\:\#@\#\:+1.9.6\:\#@\#\:+module\:\#@\#\:+ant\:\#@\#\:+organisation\:\#@\#\:+org.apache.ant\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.9.6 release 1.9.6 null
5 | +revision\:\#@\#\:+3.0.20\:\#@\#\:+module\:\#@\#\:+plexus-utils\:\#@\#\:+organisation\:\#@\#\:+org.codehaus.plexus\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.0.20 release 3.0.20 null
6 | +revision\:\#@\#\:+1.9.6\:\#@\#\:+module\:\#@\#\:+ant-launcher\:\#@\#\:+organisation\:\#@\#\:+org.apache.ant\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.9.6 release 1.9.6 null
7 | +revision\:\#@\#\:+5.0.4\:\#@\#\:+module\:\#@\#\:+asm\:\#@\#\:+organisation\:\#@\#\:+org.ow2.asm\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=5.0.4 release 5.0.4 null
8 | +revision\:\#@\#\:+2.2.1\:\#@\#\:+module\:\#@\#\:+scalactic_2.10\:\#@\#\:+organisation\:\#@\#\:+org.scalactic\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.2.1 release 2.2.1 null
9 | +revision\:\#@\#\:+0.13.8\:\#@\#\:+module\:\#@\#\:+sbt\:\#@\#\:+organisation\:\#@\#\:+org.scala-sbt\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.13.8 release 0.13.8 null
10 | +revision\:\#@\#\:+1.0\:\#@\#\:+module\:\#@\#\:+jsr250-api\:\#@\#\:+organisation\:\#@\#\:+javax.annotation\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.0 release 1.0 null
11 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-reflect\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:+info.apiURL\:\#@\#\:+http\://www.scala-lang.org/api/2.10.4/\:\#@\#\:=2.10.4 ? 2.10.4 null
12 | +revision\:\#@\#\:+0.3.0\:\#@\#\:+module\:\#@\#\:+org.eclipse.sisu.plexus\:\#@\#\:+organisation\:\#@\#\:+org.eclipse.sisu\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.3.0 release 0.3.0 null
13 | +revision\:\#@\#\:+1.5.5\:\#@\#\:+module\:\#@\#\:+plexus-component-annotations\:\#@\#\:+organisation\:\#@\#\:+org.codehaus.plexus\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.5 release 1.5.5 null
14 | +revision\:\#@\#\:+0.3.0\:\#@\#\:+module\:\#@\#\:+org.eclipse.sisu.inject\:\#@\#\:+organisation\:\#@\#\:+org.eclipse.sisu\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.3.0 release 0.3.0 null
15 | +revision\:\#@\#\:+3.3.3\:\#@\#\:+module\:\#@\#\:+maven-plugin-api\:\#@\#\:+organisation\:\#@\#\:+org.apache.maven\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.3.3 release 3.3.3 null
16 | +revision\:\#@\#\:+1.0\:\#@\#\:+module\:\#@\#\:+cdi-api\:\#@\#\:+organisation\:\#@\#\:+javax.enterprise\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.0 release 1.0 null
17 | +revision\:\#@\#\:+2.5.2\:\#@\#\:+module\:\#@\#\:+plexus-classworlds\:\#@\#\:+organisation\:\#@\#\:+org.codehaus.plexus\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.5.2 release 2.5.2 null
18 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-library\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:+info.apiURL\:\#@\#\:+http\://www.scala-lang.org/api/2.10.4/\:\#@\#\:=2.10.4 ? 2.10.4 null
19 | +revision\:\#@\#\:+3.3.3\:\#@\#\:+module\:\#@\#\:+maven-model\:\#@\#\:+organisation\:\#@\#\:+org.apache.maven\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.3.3 release 3.3.3 null
20 | +revision\:\#@\#\:+5.0.4\:\#@\#\:+module\:\#@\#\:+asm-commons\:\#@\#\:+organisation\:\#@\#\:+org.ow2.asm\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=5.0.4 release 5.0.4 null
21 | +revision\:\#@\#\:+3.3.3\:\#@\#\:+module\:\#@\#\:+maven-artifact\:\#@\#\:+organisation\:\#@\#\:+org.apache.maven\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.3.3 release 3.3.3 null
22 | +revision\:\#@\#\:+1.6.0\:\#@\#\:+module\:\#@\#\:+jarjar\:\#@\#\:+organisation\:\#@\#\:+org.pantsbuild\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.6.0 release 1.6.0 null
23 | +revision\:\#@\#\:+1\:\#@\#\:+module\:\#@\#\:+javax.inject\:\#@\#\:+organisation\:\#@\#\:+javax.inject\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1 release 1 null
24 | +sbtVersion\:\#@\#\:+0.13\:\#@\#\:+revision\:\#@\#\:+0.14.1\:\#@\#\:+module\:\#@\#\:+sbt-assembly\:\#@\#\:+organisation\:\#@\#\:+com.eed3si9n\:\#@\#\:+scalaVersion\:\#@\#\:+2.10\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.14.1 ? 0.14.1 null
25 |
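
Each entry in these resolved.xml.properties files is one escaped Java-properties pair: the key strings together the module's attributes (revision, module, organisation, branch, and sometimes info.apiURL) with the ":#@#:" marker (backslash-escaped in the .properties encoding), and the value after "=" records the resolved revision, its status, and what appear to be the delivered revision and branch. A minimal Scala sketch for decoding one raw dumped line (helper names are illustrative; only the delimiter convention visible above is assumed):

// Sketch: decode one raw "resolved revisions" property line as dumped above
// (pass the line without the dump's "N | " prefix).
object ResolvedRevisionLine {
  def decode(raw: String): (Map[String, String], Array[String]) = {
    val unescaped = raw.replace("\\", "")            // drop the .properties backslash escaping
    val Array(key, value) = unescaped.split("=", 2)  // key holds the attributes, value the resolution result
    val fields = key.split(":#@#:").map(_.stripPrefix("+"))
    val attrs  = fields.grouped(2).collect { case Array(k, v) => k -> v }.toMap
    (attrs, value.trim.split("\\s+"))
  }
}

Applied to line 5 above, this yields Map(revision -> 3.0.20, module -> plexus-utils, organisation -> org.codehaus.plexus, branch -> @#:NULL:#@) on the key side and Array(3.0.20, release, 3.0.20, null) on the value side.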
--------------------------------------------------------------------------------
/target/resolution-cache/default/classification$sbt_2.10/1.0/resolved.xml.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/default/classification_2.10/1.0/resolved.xml.properties:
--------------------------------------------------------------------------------
1 | #default#classification_2.10;1.0 resolved revisions
2 | #Fri Jun 24 11:03:35 CST 2016
3 | +revision\:\#@\#\:+1.5.2\:\#@\#\:+module\:\#@\#\:+spark-graphx_2.10\:\#@\#\:+organisation\:\#@\#\:+org.apache.spark\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.2 release 1.5.2 null
4 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-library\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 ? 2.10.5 null
5 | +revision\:\#@\#\:+2.2.5\:\#@\#\:+module\:\#@\#\:+scalatest_2.10\:\#@\#\:+organisation\:\#@\#\:+org.scalatest\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.2.5 release 2.2.5 null
6 | +revision\:\#@\#\:+3.1.14\:\#@\#\:+module\:\#@\#\:+mysql-connector-java\:\#@\#\:+organisation\:\#@\#\:+mysql\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.1.14 release 3.1.14 null
7 | +revision\:\#@\#\:+2.2.5\:\#@\#\:+module\:\#@\#\:+scalactic_2.10\:\#@\#\:+organisation\:\#@\#\:+org.scalactic\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.2.5 release 2.2.5 null
8 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-compiler\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 release 2.10.4 null
9 | +revision\:\#@\#\:+1.5.2\:\#@\#\:+module\:\#@\#\:+spark-mllib_2.10\:\#@\#\:+organisation\:\#@\#\:+org.apache.spark\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.2 release 1.5.2 null
10 | +revision\:\#@\#\:+1.1.2\:\#@\#\:+module\:\#@\#\:+gs-core\:\#@\#\:+organisation\:\#@\#\:+org.graphstream\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.1.2 release 1.1.2 null
11 | +revision\:\#@\#\:+2.7.1\:\#@\#\:+module\:\#@\#\:+hadoop-common\:\#@\#\:+organisation\:\#@\#\:+org.apache.hadoop\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.7.1 release 2.7.1 null
12 | +revision\:\#@\#\:+1.5.2\:\#@\#\:+module\:\#@\#\:+spark-core_2.10\:\#@\#\:+organisation\:\#@\#\:+org.apache.spark\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.2 release 1.5.2 null
13 |
--------------------------------------------------------------------------------
/target/resolution-cache/default/classification_2.10/1.0/resolved.xml.xml:
--------------------------------------------------------------------------------
10 | classification
--------------------------------------------------------------------------------
/target/resolution-cache/default/naturallanguageprocessing$sbt_2.10/1.0/resolved.xml.properties:
--------------------------------------------------------------------------------
1 | #default#naturallanguageprocessing$sbt_2.10;1.0 resolved revisions
2 | #Thu Mar 23 16:16:57 CST 2017
3 | +revision\:\#@\#\:+5.0.4\:\#@\#\:+module\:\#@\#\:+asm-tree\:\#@\#\:+organisation\:\#@\#\:+org.ow2.asm\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=5.0.4 release 5.0.4 null
4 | +revision\:\#@\#\:+1.9.6\:\#@\#\:+module\:\#@\#\:+ant\:\#@\#\:+organisation\:\#@\#\:+org.apache.ant\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.9.6 release 1.9.6 null
5 | +revision\:\#@\#\:+3.0.20\:\#@\#\:+module\:\#@\#\:+plexus-utils\:\#@\#\:+organisation\:\#@\#\:+org.codehaus.plexus\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.0.20 release 3.0.20 null
6 | +revision\:\#@\#\:+1.9.6\:\#@\#\:+module\:\#@\#\:+ant-launcher\:\#@\#\:+organisation\:\#@\#\:+org.apache.ant\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.9.6 release 1.9.6 null
7 | +revision\:\#@\#\:+5.0.4\:\#@\#\:+module\:\#@\#\:+asm\:\#@\#\:+organisation\:\#@\#\:+org.ow2.asm\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=5.0.4 release 5.0.4 null
8 | +revision\:\#@\#\:+2.2.1\:\#@\#\:+module\:\#@\#\:+scalactic_2.10\:\#@\#\:+organisation\:\#@\#\:+org.scalactic\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.2.1 release 2.2.1 null
9 | +revision\:\#@\#\:+0.13.8\:\#@\#\:+module\:\#@\#\:+sbt\:\#@\#\:+organisation\:\#@\#\:+org.scala-sbt\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.13.8 release 0.13.8 null
10 | +revision\:\#@\#\:+1.0\:\#@\#\:+module\:\#@\#\:+jsr250-api\:\#@\#\:+organisation\:\#@\#\:+javax.annotation\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.0 release 1.0 null
11 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-reflect\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:+info.apiURL\:\#@\#\:+http\://www.scala-lang.org/api/2.10.4/\:\#@\#\:=2.10.4 ? 2.10.4 null
12 | +revision\:\#@\#\:+0.3.0\:\#@\#\:+module\:\#@\#\:+org.eclipse.sisu.plexus\:\#@\#\:+organisation\:\#@\#\:+org.eclipse.sisu\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.3.0 release 0.3.0 null
13 | +revision\:\#@\#\:+1.5.5\:\#@\#\:+module\:\#@\#\:+plexus-component-annotations\:\#@\#\:+organisation\:\#@\#\:+org.codehaus.plexus\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.5 release 1.5.5 null
14 | +revision\:\#@\#\:+0.3.0\:\#@\#\:+module\:\#@\#\:+org.eclipse.sisu.inject\:\#@\#\:+organisation\:\#@\#\:+org.eclipse.sisu\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.3.0 release 0.3.0 null
15 | +revision\:\#@\#\:+3.3.3\:\#@\#\:+module\:\#@\#\:+maven-plugin-api\:\#@\#\:+organisation\:\#@\#\:+org.apache.maven\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.3.3 release 3.3.3 null
16 | +revision\:\#@\#\:+1.0\:\#@\#\:+module\:\#@\#\:+cdi-api\:\#@\#\:+organisation\:\#@\#\:+javax.enterprise\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.0 release 1.0 null
17 | +revision\:\#@\#\:+2.5.2\:\#@\#\:+module\:\#@\#\:+plexus-classworlds\:\#@\#\:+organisation\:\#@\#\:+org.codehaus.plexus\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.5.2 release 2.5.2 null
18 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-library\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:+info.apiURL\:\#@\#\:+http\://www.scala-lang.org/api/2.10.4/\:\#@\#\:=2.10.4 ? 2.10.4 null
19 | +revision\:\#@\#\:+3.3.3\:\#@\#\:+module\:\#@\#\:+maven-model\:\#@\#\:+organisation\:\#@\#\:+org.apache.maven\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.3.3 release 3.3.3 null
20 | +revision\:\#@\#\:+5.0.4\:\#@\#\:+module\:\#@\#\:+asm-commons\:\#@\#\:+organisation\:\#@\#\:+org.ow2.asm\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=5.0.4 release 5.0.4 null
21 | +revision\:\#@\#\:+3.3.3\:\#@\#\:+module\:\#@\#\:+maven-artifact\:\#@\#\:+organisation\:\#@\#\:+org.apache.maven\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.3.3 release 3.3.3 null
22 | +revision\:\#@\#\:+1.6.0\:\#@\#\:+module\:\#@\#\:+jarjar\:\#@\#\:+organisation\:\#@\#\:+org.pantsbuild\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.6.0 release 1.6.0 null
23 | +revision\:\#@\#\:+1\:\#@\#\:+module\:\#@\#\:+javax.inject\:\#@\#\:+organisation\:\#@\#\:+javax.inject\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1 release 1 null
24 | +sbtVersion\:\#@\#\:+0.13\:\#@\#\:+revision\:\#@\#\:+0.14.1\:\#@\#\:+module\:\#@\#\:+sbt-assembly\:\#@\#\:+organisation\:\#@\#\:+com.eed3si9n\:\#@\#\:+scalaVersion\:\#@\#\:+2.10\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=0.14.1 ? 0.14.1 null
25 |
--------------------------------------------------------------------------------
/target/resolution-cache/default/naturallanguageprocessing$sbt_2.10/1.0/resolved.xml.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/meachinelearning-classification/meachinelearning-classification$sbt_2.10/1.0/resolved.xml.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/meachinelearning-classification/meachinelearning-classification_2.10/1.0/resolved.xml.properties:
--------------------------------------------------------------------------------
1 | #meachinelearning-classification#meachinelearning-classification_2.10;1.0 resolved revisions
2 | #Thu Jul 07 14:51:12 CST 2016
3 | +revision\:\#@\#\:+1.5.2\:\#@\#\:+module\:\#@\#\:+spark-graphx_2.10\:\#@\#\:+organisation\:\#@\#\:+org.apache.spark\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.2 release 1.5.2 null
4 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-library\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 ? 2.10.5 null
5 | +revision\:\#@\#\:+2.2.5\:\#@\#\:+module\:\#@\#\:+scalatest_2.10\:\#@\#\:+organisation\:\#@\#\:+org.scalatest\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.2.5 release 2.2.5 null
6 | +revision\:\#@\#\:+3.1.14\:\#@\#\:+module\:\#@\#\:+mysql-connector-java\:\#@\#\:+organisation\:\#@\#\:+mysql\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=3.1.14 release 3.1.14 null
7 | +revision\:\#@\#\:+2.2.5\:\#@\#\:+module\:\#@\#\:+scalactic_2.10\:\#@\#\:+organisation\:\#@\#\:+org.scalactic\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.2.5 release 2.2.5 null
8 | +revision\:\#@\#\:+2.10.4\:\#@\#\:+module\:\#@\#\:+scala-compiler\:\#@\#\:+organisation\:\#@\#\:+org.scala-lang\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.10.4 release 2.10.4 null
9 | +revision\:\#@\#\:+1.5.2\:\#@\#\:+module\:\#@\#\:+spark-mllib_2.10\:\#@\#\:+organisation\:\#@\#\:+org.apache.spark\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.2 release 1.5.2 null
10 | +revision\:\#@\#\:+1.1.2\:\#@\#\:+module\:\#@\#\:+gs-core\:\#@\#\:+organisation\:\#@\#\:+org.graphstream\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.1.2 release 1.1.2 null
11 | +revision\:\#@\#\:+2.7.1\:\#@\#\:+module\:\#@\#\:+hadoop-common\:\#@\#\:+organisation\:\#@\#\:+org.apache.hadoop\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=2.7.1 release 2.7.1 null
12 | +revision\:\#@\#\:+1.5.2\:\#@\#\:+module\:\#@\#\:+spark-core_2.10\:\#@\#\:+organisation\:\#@\#\:+org.apache.spark\:\#@\#\:+branch\:\#@\#\:+@\#\:NULL\:\#@\:\#@\#\:=1.5.2 release 1.5.2 null
13 |
--------------------------------------------------------------------------------
/target/resolution-cache/meachinelearning-classification/meachinelearning-classification_2.10/1.0/resolved.xml.xml:
--------------------------------------------------------------------------------
10 | MeachineLearning/classification
--------------------------------------------------------------------------------
/target/resolution-cache/reports/default-classification$sources_2.10-docs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/default-classification$sources_2.10-optional.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/default-classification$sources_2.10-plugin.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/default-classification$sources_2.10-pom.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/default-classification$sources_2.10-provided.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/default-classification$sources_2.10-sources.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/default-classification_2.10-docs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/default-classification_2.10-optional.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/default-classification_2.10-plugin.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/default-classification_2.10-pom.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/default-classification_2.10-provided.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/default-classification_2.10-sources.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/meachinelearning-classification-meachinelearning-classification$sources_2.10-docs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/meachinelearning-classification-meachinelearning-classification$sources_2.10-optional.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/meachinelearning-classification-meachinelearning-classification$sources_2.10-plugin.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/meachinelearning-classification-meachinelearning-classification$sources_2.10-pom.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/meachinelearning-classification-meachinelearning-classification$sources_2.10-provided.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/meachinelearning-classification-meachinelearning-classification$sources_2.10-sources.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/meachinelearning-classification-meachinelearning-classification_2.10-docs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/meachinelearning-classification-meachinelearning-classification_2.10-optional.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/meachinelearning-classification-meachinelearning-classification_2.10-plugin.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/meachinelearning-classification-meachinelearning-classification_2.10-pom.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/meachinelearning-classification-meachinelearning-classification_2.10-provided.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/resolution-cache/reports/meachinelearning-classification-meachinelearning-classification_2.10-sources.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/target/scala-2.10/test-classes/text/1.txt:
--------------------------------------------------------------------------------
1 | 光伏,中国人民银行,列,入,绿色,债券,支援,专案,目录,2015年12月22日,19:00:00,中国人民银行,发布,2015,第39,号,公告,公告,称为,加快,建设生态文明,引导,金融机构,服务,绿色发展,推动,经济结构转型,升级,经济发展方式转变,支援,金融机构,发行,绿色,金融债券,募集资金,支援,绿色,产业发展,笔者,目录,第5,项,清洁能源,发电,中,风力发电,光伏发电,智慧,电网,能源,因特网,分布式能源,太阳能热利用,水力发电,新能源,利用,列,入,太阳能光伏发电站,太阳能,高,温热,发电站,不含,分布式,太阳能光伏发电,系统,需,限定,条件,多晶硅,电池,组件,光电,转化,效率,≥,15.5%,组件,专案,投产,运行,日,一年,衰减率,≤,2.5%,年,衰减率,≤,0.7%,单晶硅,电池,组件,光电,转化,效率,≥,16%,组件,专案,投产,运行,日,一年,衰减率,≤,3%,年,衰减率,≤,0.7%,高,倍,聚光光伏,组件,光电,转化,效率,≥,28%,项目,投产,运行,日,一年,衰减率,≤,2%,年,衰减率,≤,0.5%,项目全生命周期,衰减率,≤,10%,硅基,薄膜电池,组件,光电,转化,效率,≥,8%,铜铟镓硒,CIGS,薄膜电池,组件,光电,转化,效率,≥,11%,碲化镉,CdTe,薄膜电池,组件,光电,转化,效率,≥,11%,薄膜电池,组件,光电,转化,效率,≥,10%,多晶硅,单晶硅,薄膜电池,项目全生命周期,衰减率,≤,20%,智能电网,能源,因特网,指,提高,供,需,负荷,平衡,回应,能力,改善,电网,综合,能效,降低,输变电,损耗,增强,可再生能源,接,入,能力,电网建设,运营,技术,升级,改造,专案,1.,智能电网,指,采用,智慧,型,电气设备,即时,双向,集成,通信技术,先进技术,电网建设,运营,专案,电网,智慧,化,升级,改造,项目,2.,能源,因特网,指,综合,电力电子,资讯,智慧,管理技术,连接,分布式能源,含,分布式,可再生能源,分布式,储能,装置,类型,负荷,能量,双向,流动,交换,共享,电网,微电网,能源,燃气,网络,设施,建设,运营,专案,分布式能源,指,区域,能源站,包括,天然气,区域,能源站,分布式光伏发电,系统,分布式能源,设施,建设,运营,分布式能源,接,入,峰谷,调节,系统,分布式,电力,交易平台,能源管理系统,建设,运营,附,中国人民银行公告,2015,第39,号,绿色,债券,支援,专案,目录
--------------------------------------------------------------------------------
/target/scala-2.10/test-classes/text/2.txt:
--------------------------------------------------------------------------------
1 | 记者,国家电网公司,获悉,9月23日,河北丰宁,二期,山东文登,重庆,蟠龙,抽水蓄能电站,工程,以下简称,丰宁,二期,文登,蟠龙,抽,蓄,座,抽,蓄,电站,正式,开工,总投资,244.4亿,元,总装机容量,480万,千瓦,计划,2022年,竣工,投产,项目,预计,增加,发电,装备制造业,产值,111亿,元,推动,相关,装备制造业,发展,开工,动员大会,国家电网公司,董事长,党组书记,刘振亚,丰宁,二期,文登,蟠龙,抽,蓄,国家电网公司,推进,特高压电网,建设,服务,清洁能源,发展,重大工程,继,2015年6月,安徽金寨,山东沂蒙,河南,天池,座,抽水蓄能电站,第二批,开工,电站,标志,我国,抽水蓄能电站,加快,发展,新,阶段,介绍,河北丰宁,二期,抽水蓄能电站,项目,位于,河北省承德市,丰宁县,装机容量,180万,千瓦,安装,台,30万,千瓦,可逆,式,水轮发电机组,500,千伏,电压,接,入,华北电网,工程投资,87.5亿,元,丰宁抽水蓄能电站,一期,二期,装机容量,360万,千瓦,世界上,装机容量,抽水蓄能电站,山东,文登抽水蓄能电站,位于,山东省,威海市文登区,装机容量,180万,千瓦,安装,台,30万,千瓦,可逆,式,水轮发电机组,500,千伏,电压,接,入,山东电网,工程投资,85.7亿,元,重庆,蟠龙,抽水蓄能电站,位于,重庆市綦江区,装机容量,120万,千瓦,安装,台,30万,千瓦,可逆,式,水轮发电机组,500,千伏,电压,接,入,重庆电网,工程投资,71.2亿,元,国网,座,受,端,电网,地区,抽水蓄能电站,建成,更好地,接纳,区,外,来电,优化,电源,结构,提高,北,西南,地区,清洁能源,消纳,能力,提高,特高压电网,系统安全,可靠性,综合,煤电,机组,消纳,清洁能源,效果,建设,丰宁,二期,文登,蟠龙,抽,蓄,年,节约,原煤,消耗,291万,吨,减排,烟尘,0.3万,吨,二氧化硫,1.4万,吨,氮氧化物,1.3万,吨,二氧化碳,485万,吨,节能减排,大气污染防治,国家电网公司,经营,区域,内在,运,抽水蓄能电站,装机容量,1674.5万,千瓦,建,规模,1880万,千瓦,预计,2017年,我国,抽水蓄能,装机,3300万,千瓦,超过,美国,世界上,抽水蓄能电站,第一,大国
--------------------------------------------------------------------------------
/target/scala-2.10/test-classes/text/abstract:
--------------------------------------------------------------------------------
1 | Algorithms can be roughly divided into basic algorithms, data-structure algorithms, number-theoretic algorithms, computational-geometry algorithms, graph algorithms, dynamic programming, numerical analysis, encryption algorithms, sorting algorithms, search algorithms, randomized algorithms, parallel algorithms, the Hermitian deformation model, and random-forest algorithms.
2 | Algorithms can also be broadly divided into three classes:
3 | One, finite deterministic algorithms, which terminate within a finite amount of time. They may take a very long time to carry out the specified task, but they still terminate within some bounded time. The result of such an algorithm usually depends on the input values.
4 | Two, finite non-deterministic algorithms, which terminate within a finite amount of time. However, for a given value (or values), the result of the algorithm is not unique or determined.
5 | Three, infinite algorithms, which do not terminate because no terminating condition is defined, or because the defined condition cannot be satisfied by the input data. Usually, infinite algorithms arise from failing to define the terminating condition precisely.
--------------------------------------------------------------------------------
/target/streams/$global/$global/dumpStructure/$global/streams/out:
--------------------------------------------------------------------------------
1 | [info] Writing structure to /private/var/folders/7j/trxrd6ms0rg3v8tlck57__4h0000gn/T/sbt-structure0.xml...
2 | [info] Done.
3 |
--------------------------------------------------------------------------------
/target/streams/$global/clean/$global/streams/out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/target/streams/$global/clean/$global/streams/out
--------------------------------------------------------------------------------
/target/streams/$global/dependencyPositions/$global/streams/update_cache_2.10/input_dsp:
--------------------------------------------------------------------------------
1 | org.scala-lang
scala-library 2.10.4
2 | com.kunyan nlpsuit-package 0.2.8.3
org.scalactic scalactic 2.2.5 test
org.scalatest scalatest 2.2.5 test org.scala-lang scala-compiler 2.10.4 org.apache.hadoop
hadoop-common 2.7.1
javax.servlet * * org.apache.hadoop hadoop-hdfs 2.7.1 provided org.apache.spark spark-core_2.10 1.5.2 org.apache.spark spark-mllib_2.10 1.5.2 mysql mysql-connector-java 3.1.14 org.graphstream gs-core 1.1.2 org.apache.spark spark-graphx_2.10 1.5.2 com.ibm.icu icu4j 56.1 org.apache.hbase hbase 0.98.2-hadoop2 org.apache.hbase hbase-client 1.1.2 org.apache.hbase hbase-common 1.1.2 org.apache.hbase hbase-server 1.1.2 org.scalanlp breeze-math_2.10 0.4 org.scalanlp breeze-process_2.10 0.3 org.scalanlp breeze-viz_2.10 0.12 org.scalanlp breeze_2.10 * org.scalanlp nak_2.10 1.3
redis.clients jedis 2.8.0 org.ansj ansj_seg 5.0.2 org.json json 20160212 org.nlpcn nlp-lang 1.7
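
The update-cache input above flattens the project's declared dependency coordinates: organisation, module, revision, an optional configuration such as test or provided, and one exclusion of javax.servlet under hadoop-common. A minimal build.sbt-style sketch reconstructed from those coordinates, not copied from the project's actual build.sbt; cross-built artifacts (the _2.10 suffixes seen in the resolution cache) are written with %%, and only a representative subset is shown:

// Sketch reconstructed from the cached coordinates above, assuming scalaVersion 2.10.x.
libraryDependencies ++= Seq(
  "org.scala-lang"    %  "scala-library"        % "2.10.4",
  "com.kunyan"        %  "nlpsuit-package"      % "0.2.8.3",
  "org.scalactic"     %% "scalactic"            % "2.2.5"  % "test",
  "org.scalatest"     %% "scalatest"            % "2.2.5"  % "test",
  // the cache records a javax.servlet exclusion on hadoop-common
  ("org.apache.hadoop" % "hadoop-common"        % "2.7.1").excludeAll(ExclusionRule(organization = "javax.servlet")),
  "org.apache.hadoop" %  "hadoop-hdfs"          % "2.7.1"  % "provided",
  "org.apache.spark"  %% "spark-core"           % "1.5.2",
  "org.apache.spark"  %% "spark-mllib"          % "1.5.2",
  "org.apache.spark"  %% "spark-graphx"         % "1.5.2",
  "mysql"             %  "mysql-connector-java" % "3.1.14",
  "org.graphstream"   %  "gs-core"              % "1.1.2",
  "com.ibm.icu"       %  "icu4j"                % "56.1",
  "redis.clients"     %  "jedis"                % "2.8.0",
  "org.ansj"          %  "ansj_seg"             % "5.0.2"
  // the hbase, breeze, nak, json and nlp-lang coordinates above follow the same pattern
)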
--------------------------------------------------------------------------------
/target/streams/$global/dependencyPositions/$global/streams/update_cache_2.10/output_dsp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/target/streams/$global/dependencyPositions/$global/streams/update_cache_2.10/output_dsp
--------------------------------------------------------------------------------
/target/streams/$global/ivyConfiguration/$global/streams/out:
--------------------------------------------------------------------------------
1 | [debug] Other repositories:
2 | [debug] Default repositories:
3 | [debug] Using inline dependencies specified in Scala.
4 |
--------------------------------------------------------------------------------
/target/streams/$global/ivySbt/$global/streams/out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/target/streams/$global/ivySbt/$global/streams/out
--------------------------------------------------------------------------------
/target/streams/$global/projectDescriptors/$global/streams/out:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/target/streams/$global/projectDescriptors/$global/streams/out
--------------------------------------------------------------------------------
/target/streams/$global/update/$global/streams/update_cache_2.10/inputs:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/target/streams/$global/update/$global/streams/update_cache_2.10/inputs
--------------------------------------------------------------------------------
/target/streams/$global/update/$global/streams/update_cache_2.10/output:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/STHSF/NaturalLanguageProcessing_Spark/43d4b89910f169879606affec6cf04defb6603e2/target/streams/$global/update/$global/streams/update_cache_2.10/output
--------------------------------------------------------------------------------
/target/streams/compile/unmanagedClasspath/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/target/streams/compile/unmanagedJars/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/target/streams/runtime/unmanagedClasspath/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/target/streams/runtime/unmanagedJars/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/target/streams/test/unmanagedClasspath/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/target/streams/test/unmanagedJars/$global/streams/export:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------