├── .github └── workflows │ └── gradle.yml ├── .gitignore ├── CHANGES.md ├── LICENSE ├── README.adoc ├── build.gradle.kts ├── doc ├── advanced.adoc ├── fasttext.adoc ├── highlight │ ├── highlight.min.js │ └── styles │ │ └── github.min.css ├── images │ ├── WordSplitAlgorithm.png │ ├── WordpathProcessor.png │ ├── cli.jpg │ ├── crf_model.jpg │ ├── fasttext-c.png │ ├── lexer.png │ ├── mynlp-pipeline.png │ ├── pipelineLexer.jpg │ ├── weixin.jpeg │ ├── worddict.png │ ├── wordnet-ds.png │ ├── wordnet-framework.jpg │ ├── wordnet-g.png │ ├── wordnet.png │ └── wordpath.png ├── lexer.adoc ├── modules.adoc ├── mynlp-docinfo-footer.html ├── mynlp.adoc ├── mynlp.docx ├── other.adoc ├── perceptron.adoc ├── started.adoc └── update.sh ├── gradle.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── licenses ├── Ansj-LICENSE.txt ├── ApacheCommonsCli-LICENSE.txt ├── DimSim-LICENSE.txt ├── FastText-LICENSE.txt ├── Hanlp-LICENSE.txt └── StartSpace-LICENSE.txt ├── mynlp-all └── build.gradle.kts ├── mynlp-example ├── build.gradle.kts └── src │ ├── main │ └── java │ │ ├── Demo.java │ │ ├── classification │ │ └── HotelCommentExampleTrain.java │ │ ├── pinyin │ │ └── PinyinExample.java │ │ ├── segment │ │ ├── CombineExample.java │ │ ├── CoreSegment.java │ │ ├── CustomSegment.java │ │ ├── HowFast.java │ │ └── UseStreamApi.java │ │ ├── starspace │ │ └── AgNews.kt │ │ └── transform │ │ └── TraditionalExample.java │ └── test │ └── java │ └── TestHighlight.java ├── mynlp-experimental ├── .gitignore └── build.gradle.kts ├── mynlp ├── build.gradle.kts ├── shell │ └── mynlp.sh └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── mayabot │ │ │ └── nlp │ │ │ ├── Mynlp.kt │ │ │ ├── MynlpBuilder.java │ │ │ ├── MynlpConfigs.kt │ │ │ ├── MynlpEnv.java │ │ │ ├── algorithm │ │ │ ├── HMM.kt │ │ │ ├── Heap.kt │ │ │ ├── TopIntMinK.kt │ │ │ ├── TopMaxK.java │ │ │ ├── Viterbi.kt │ │ │ ├── collection │ │ │ │ ├── Trie.java │ │ │ │ ├── ahocorasick │ │ │ │ │ ├── AhoCoraickDoubleArrayTrieBuilder.java │ │ │ │ │ ├── AhoCorasickDoubleArrayTrie.java │ │ │ │ │ ├── Hit.java │ │ │ │ │ ├── IHit.java │ │ │ │ │ ├── IHitFull.java │ │ │ │ │ └── State.java │ │ │ │ ├── bintrie │ │ │ │ │ ├── AbstractTrieNode.java │ │ │ │ │ ├── ArrayTrieNode.java │ │ │ │ │ ├── BinTrieNode.java │ │ │ │ │ ├── BinTrieTree.java │ │ │ │ │ ├── BinTrieTreeBuilder.java │ │ │ │ │ ├── HashTrieNode.java │ │ │ │ │ ├── TrieTreeAllMatcher.java │ │ │ │ │ ├── TrieTreeForwardMaxMatcher.java │ │ │ │ │ └── TrieTreeMatcher.java │ │ │ │ └── dat │ │ │ │ │ ├── DATLongMatcher.java │ │ │ │ │ ├── DATMapMatcher.java │ │ │ │ │ ├── DATMatcher.java │ │ │ │ │ ├── DoubleArrayMaker.java │ │ │ │ │ ├── DoubleArrayTrie.java │ │ │ │ │ ├── DoubleArrayTrieMap.java │ │ │ │ │ ├── DoubleArrayTrieStringIntMap.java │ │ │ │ │ └── FastDatCharSet.java │ │ │ └── distance │ │ │ │ ├── JaroWinklerDistance.java │ │ │ │ ├── LevenshteinDistance.java │ │ │ │ ├── NGramDistance.java │ │ │ │ └── StringDistance.java │ │ │ ├── blas │ │ │ ├── BlasUtils.kt │ │ │ ├── DenseArrayMatrix.kt │ │ │ ├── DenseVector.kt │ │ │ ├── Matrix.kt │ │ │ └── Vector.kt │ │ │ ├── character │ │ │ └── ChineseCharInfos.kt │ │ │ ├── cli │ │ │ └── MynlpCli.kt │ │ │ ├── common │ │ │ ├── ArraySizingStrategy.java │ │ │ ├── BoundedProportionalArraySizingStrategy.java │ │ │ ├── BufferedReaderLFCR.java │ │ │ ├── EncryptionUtil.java │ │ │ ├── FastCharReader.java │ │ │ ├── FastStringBuilder.java │ │ │ ├── Guava.kt │ │ │ ├── IntArrayList.kt │ │ │ ├── LongArrayList.kt │ │ │ ├── Pair.java │ │ │ ├── 
ParagraphIterable.kt │ │ │ ├── ParagraphReader.java │ │ │ ├── ParagraphReaderSmart.java │ │ │ ├── ParagraphReaderString.java │ │ │ ├── SettingItem.java │ │ │ ├── Settings.java │ │ │ ├── TagAndScore.java │ │ │ ├── TokenizerSplitter.java │ │ │ ├── cli │ │ │ │ ├── AlreadySelectedException.java │ │ │ │ ├── AmbiguousOptionException.java │ │ │ │ ├── BasicParser.java │ │ │ │ ├── CommandLine.java │ │ │ │ ├── CommandLineParser.java │ │ │ │ ├── DefaultParser.java │ │ │ │ ├── GnuParser.java │ │ │ │ ├── HelpFormatter.java │ │ │ │ ├── MissingArgumentException.java │ │ │ │ ├── MissingOptionException.java │ │ │ │ ├── Option.java │ │ │ │ ├── OptionBuilder.java │ │ │ │ ├── OptionGroup.java │ │ │ │ ├── OptionValidator.java │ │ │ │ ├── Options.java │ │ │ │ ├── ParseException.java │ │ │ │ ├── Parser.java │ │ │ │ ├── PatternOptionBuilder.java │ │ │ │ ├── PosixParser.java │ │ │ │ ├── TypeHandler.java │ │ │ │ ├── UnrecognizedOptionException.java │ │ │ │ ├── Util.java │ │ │ │ └── package-info.java │ │ │ ├── hash │ │ │ │ ├── ByteUtils.java │ │ │ │ ├── MessageDigests.java │ │ │ │ ├── MurmurHash3.java │ │ │ │ └── MurmurHash3Kotlin.kt │ │ │ ├── hppc │ │ │ │ ├── BufferAllocationException.java │ │ │ │ ├── CharObjectHashMap.java │ │ │ │ ├── CharObjectMap.java │ │ │ │ └── IntArrayList.java │ │ │ ├── injector │ │ │ │ ├── BeanFactory.java │ │ │ │ ├── ImplementedBy.java │ │ │ │ ├── Injector.kt │ │ │ │ └── Singleton.java │ │ │ ├── logging │ │ │ │ ├── AbstractInternalLogger.java │ │ │ │ ├── CommonsLogger.java │ │ │ │ ├── CommonsLoggerFactory.java │ │ │ │ ├── FormattingTuple.java │ │ │ │ ├── InternalLogLevel.java │ │ │ │ ├── InternalLogger.java │ │ │ │ ├── InternalLoggerFactory.java │ │ │ │ ├── JdkLogger.java │ │ │ │ ├── JdkLoggerFactory.java │ │ │ │ ├── Log4J2Logger.java │ │ │ │ ├── Log4J2LoggerFactory.java │ │ │ │ ├── Log4JLogger.java │ │ │ │ ├── Log4JLoggerFactory.java │ │ │ │ ├── MessageFormatter.java │ │ │ │ ├── Slf4JLogger.java │ │ │ │ ├── Slf4JLoggerFactory.java │ │ │ │ └── package-info.java │ │ │ ├── matrix │ │ │ │ └── CSRSparseMatrix.java │ │ │ ├── resources │ │ │ │ ├── ClasspathNlpResourceFactory.java │ │ │ │ ├── FileNlpResourceFactory.kt │ │ │ │ ├── JarNlpResourceFactory.kt │ │ │ │ ├── NlpResource.java │ │ │ │ ├── NlpResourceFactory.java │ │ │ │ ├── URLNlpResource.java │ │ │ │ └── UseLines.kt │ │ │ └── utils │ │ │ │ ├── CartesianList.java │ │ │ │ ├── CharNormUtils.java │ │ │ │ ├── CharSourceLineReader.java │ │ │ │ ├── Characters.java │ │ │ │ ├── CustomCharSequence.java │ │ │ │ ├── DataInOutputUtils.java │ │ │ │ ├── DictResources.kt │ │ │ │ ├── DownloadUtils.kt │ │ │ │ ├── Jars.kt │ │ │ │ ├── MyInts.java │ │ │ │ ├── MynlpFactories.java │ │ │ │ ├── PathUtils.java │ │ │ │ └── StringUtils.java │ │ │ ├── fasttext │ │ │ ├── FastText.kt │ │ │ ├── FasttextTranUtils.kt │ │ │ ├── Meter.kt │ │ │ ├── Model.kt │ │ │ ├── ProductQuant.kt │ │ │ ├── QuantMatrix.kt │ │ │ ├── args │ │ │ │ ├── Args.kt │ │ │ │ └── InputArgs.kt │ │ │ ├── autotune │ │ │ │ └── AutotuneStrategy.kt │ │ │ ├── dictionary │ │ │ │ ├── BuildDictFromSource.kt │ │ │ │ ├── DictUtils.kt │ │ │ │ ├── Dictionary.kt │ │ │ │ ├── DictionaryBuilder.kt │ │ │ │ ├── FastWordMap.kt │ │ │ │ └── LoadDictFromDataInput.kt │ │ │ ├── loss │ │ │ │ ├── HierarchicalSoftmaxLoss.kt │ │ │ │ ├── Loss.kt │ │ │ │ ├── NegativeSamplingLoss.kt │ │ │ │ ├── OneVsAlLoss.kt │ │ │ │ └── SoftmaxLoss.kt │ │ │ ├── train │ │ │ │ ├── FastTextTrain.kt │ │ │ │ ├── LoadPretraindVector.kt │ │ │ │ └── SampleLines.kt │ │ │ └── utils │ │ │ │ ├── AutoDataInput.kt │ │ │ │ ├── ByteUtils.java │ │ │ │ ├── IOUtils.kt 
│ │ │ │ ├── LogUtils.kt │ │ │ │ └── TopMaxK.kt │ │ │ ├── module │ │ │ ├── Highlight.kt │ │ │ ├── TextHash.kt │ │ │ ├── lucene │ │ │ │ ├── BaseSynTokenFilter.kt │ │ │ │ ├── IterableMode.java │ │ │ │ ├── LetterTokenizer.java │ │ │ │ ├── MynlpAnalyzer.java │ │ │ │ ├── MynlpTokenizer.java │ │ │ │ └── PinyinTokenizerFilter.kt │ │ │ ├── nwd │ │ │ │ ├── FilesNewWordFind.kt │ │ │ │ ├── NewWordFindEngine.kt │ │ │ │ ├── TopCounter.kt │ │ │ │ ├── ValueObjects.kt │ │ │ │ └── package-info.java │ │ │ ├── pinyin │ │ │ │ ├── CustomPinyin.java │ │ │ │ ├── PinyinDistance.kt │ │ │ │ ├── PinyinResult.java │ │ │ │ ├── PinyinService.kt │ │ │ │ ├── Tex2PinyinComputer.java │ │ │ │ ├── model │ │ │ │ │ ├── Pinyin.java │ │ │ │ │ ├── PinyinFuzzy.kt │ │ │ │ │ ├── PinyinHead.java │ │ │ │ │ ├── Shengmu.java │ │ │ │ │ ├── SimplePinyin.kt │ │ │ │ │ └── Yunmu.java │ │ │ │ └── split │ │ │ │ │ ├── PinyinSplitApp.kt │ │ │ │ │ └── PinyinSplitDefinition.kt │ │ │ ├── summary │ │ │ │ ├── BM25.java │ │ │ │ ├── KeywordSummary.java │ │ │ │ ├── SentenceSummary.java │ │ │ │ └── TextRankSentence.java │ │ │ └── trans │ │ │ │ ├── BaseTransformDictionary.java │ │ │ │ ├── Simplified2Traditional.kt │ │ │ │ ├── Traditional2Simplified.kt │ │ │ │ └── TransformService.java │ │ │ ├── perceptron │ │ │ ├── ConvertHanlpModel.kt │ │ │ ├── EvaluateFunction.java │ │ │ ├── EvaluateResult.kt │ │ │ ├── EvaluateUtils.kt │ │ │ ├── FeatureSet.kt │ │ │ ├── PerceptronComputer.kt │ │ │ ├── PerceptronDefinition.kt │ │ │ ├── PerceptronModel.kt │ │ │ ├── PerceptronModelImpl.kt │ │ │ └── PerceptronTrainer.kt │ │ │ ├── segment │ │ │ ├── CharNormalize.java │ │ │ ├── FluentLexerBuilder.kt │ │ │ ├── IterableMode.kt │ │ │ ├── KotlinLexers.kt │ │ │ ├── Lexer.java │ │ │ ├── LexerBuilder.java │ │ │ ├── LexerReader.java │ │ │ ├── Lexers.java │ │ │ ├── Nature.java │ │ │ ├── SegmentComponent.java │ │ │ ├── SegmentModule.kt │ │ │ ├── Sentence.java │ │ │ ├── WordAndNature.java │ │ │ ├── WordSplitAlgorithm.java │ │ │ ├── WordTerm.java │ │ │ ├── WordTermSequence.kt │ │ │ ├── WordpathProcessor.java │ │ │ ├── common │ │ │ │ ├── BaseSegmentComponent.java │ │ │ │ ├── DefaultCharNormalize.java │ │ │ │ ├── PerceptronUtils.kt │ │ │ │ ├── String2.java │ │ │ │ └── VertexHelper.java │ │ │ ├── lexer │ │ │ │ ├── bigram │ │ │ │ │ ├── BaseExternalizable.java │ │ │ │ │ ├── BiGramTableDictionary.java │ │ │ │ │ ├── BiGramTableDictionaryImpl.java │ │ │ │ │ ├── BiGramTableReader.kt │ │ │ │ │ ├── CoreDictPatch.kt │ │ │ │ │ ├── CoreDictionary.java │ │ │ │ │ ├── CoreDictionaryImpl.java │ │ │ │ │ ├── CoreDictionaryReader.kt │ │ │ │ │ ├── CoreDictionarySplitAlgorithm.java │ │ │ │ │ ├── DictionaryAbsWords.java │ │ │ │ │ ├── HmmLexerPlugin.java │ │ │ │ │ └── ViterbiBestPathAlgorithm.java │ │ │ │ ├── crf │ │ │ │ │ ├── CWSCrf.kt │ │ │ │ │ ├── FeatureTemplate.kt │ │ │ │ │ ├── NerCrf.kt │ │ │ │ │ ├── tokenizer │ │ │ │ │ │ ├── CrfBaseSegmentInitializer.java │ │ │ │ │ │ └── CrfTokenizerBuilder.java │ │ │ │ │ └── utils │ │ │ │ │ │ └── ConvertCrfText2PerceptronModel.kt │ │ │ │ └── perceptron │ │ │ │ │ ├── PerceptronSegment.kt │ │ │ │ │ ├── PerceptronSegmentAlgorithm.java │ │ │ │ │ ├── PerceptronSegmentDefinition.kt │ │ │ │ │ ├── PerceptronSegmentPatch.kt │ │ │ │ │ ├── PerceptronSegmentPlugin.java │ │ │ │ │ ├── PerceptronsSegmentService.java │ │ │ │ │ └── inner │ │ │ │ │ └── Train.kt │ │ │ ├── pipeline │ │ │ │ ├── PipelineLexer.java │ │ │ │ ├── PipelineLexerBuilder.java │ │ │ │ ├── PipelineLexerBuilderKts.kt │ │ │ │ └── PipelineLexerPlugin.java │ │ │ ├── plugins │ │ │ │ ├── atom │ │ │ │ │ ├── 
AtomSplitAlgorithm.kt │ │ │ │ │ ├── AtomTemplateParser.kt │ │ │ │ │ └── DefaultTemplate.kt │ │ │ │ ├── bestpath │ │ │ │ │ ├── AtomWordViterbiBestPathAlgorithm.java │ │ │ │ │ └── LongpathBestPathAlgorithm.java │ │ │ │ ├── collector │ │ │ │ │ ├── CoreDictSubwordInfoSetup.java │ │ │ │ │ ├── CustomDictSubwordInfoSetup.java │ │ │ │ │ ├── IndexSubwordComputer.java │ │ │ │ │ ├── RuleDictSubwordComputer.kt │ │ │ │ │ ├── SentenceCollector.kt │ │ │ │ │ ├── SentenceCollectorBuilder.kt │ │ │ │ │ ├── SmartSubwordComputer.java │ │ │ │ │ ├── SubwordComputer.kt │ │ │ │ │ ├── SubwordInfoSetup.kt │ │ │ │ │ └── WordTermCollector.kt │ │ │ │ ├── correction │ │ │ │ │ ├── CorrectionDictionary.java │ │ │ │ │ ├── CorrectionPlugin.java │ │ │ │ │ ├── CorrectionWord.kt │ │ │ │ │ ├── CorrectionWordpathProcessor.java │ │ │ │ │ ├── DefaultCorrectionDictionary.java │ │ │ │ │ ├── FileCorrectionDictionary.kt │ │ │ │ │ ├── MemCorrectionDictionary.java │ │ │ │ │ └── package-info.java │ │ │ │ ├── customwords │ │ │ │ │ ├── CustomDictionary.java │ │ │ │ │ ├── CustomDictionaryPlugin.java │ │ │ │ │ ├── CustomDictionaryProcessor.java │ │ │ │ │ ├── DefaultCustomDictionary.java │ │ │ │ │ ├── FileCustomDictionary.java │ │ │ │ │ └── MemCustomDictionary.java │ │ │ │ ├── ner │ │ │ │ │ ├── NERPerceptron.kt │ │ │ │ │ ├── NerPlugin.java │ │ │ │ │ ├── NerProcessor.java │ │ │ │ │ └── PerceptronNerService.java │ │ │ │ ├── pattern │ │ │ │ │ ├── PatternPlugin.java │ │ │ │ │ └── PatternWordpathProcessor.java │ │ │ │ ├── personname │ │ │ │ │ ├── PerceptronPersonNameService.java │ │ │ │ │ ├── PersonNameAlgorithm.java │ │ │ │ │ ├── PersonNamePerceptron.kt │ │ │ │ │ ├── PersonNamePlugin.java │ │ │ │ │ └── PersonNameProcessor.java │ │ │ │ └── pos │ │ │ │ │ ├── CommonPosModel.kt │ │ │ │ │ ├── PerceptronPosService.java │ │ │ │ │ ├── PosPerceptron.kt │ │ │ │ │ ├── PosPerceptronDef.kt │ │ │ │ │ ├── PosPerceptronProcessor.java │ │ │ │ │ ├── PosPerceptronUtils.kt │ │ │ │ │ └── PosPlugin.java │ │ │ ├── reader │ │ │ │ ├── BaseFilterLexerReader.java │ │ │ │ ├── DefaultLexerReader.java │ │ │ │ ├── LexerItreabler.kt │ │ │ │ ├── PunctuationFilter.java │ │ │ │ ├── StopWordDict.kt │ │ │ │ └── StopwordFilter.java │ │ │ └── wordnet │ │ │ │ ├── BestPathAlgorithm.java │ │ │ │ ├── Vertex.java │ │ │ │ ├── VertexRow.java │ │ │ │ ├── WordNetToStringBuilder.java │ │ │ │ ├── Wordnet.java │ │ │ │ ├── Wordpath.java │ │ │ │ └── package-info.java │ │ │ ├── similarity │ │ │ └── BM25.kt │ │ │ └── starspace │ │ │ ├── Args.kt │ │ │ ├── DataHandler.kt │ │ │ ├── Dictionary.kt │ │ │ ├── Evaluate.kt │ │ │ ├── Parser.kt │ │ │ ├── Prediction.kt │ │ │ ├── SparseLinear.kt │ │ │ ├── StarSpace.kt │ │ │ ├── Train.kt │ │ │ └── Utils.kt │ └── resources │ │ ├── META-INF │ │ └── mynlp.factories │ │ ├── com │ │ └── mayabot │ │ │ └── nlp │ │ │ └── common │ │ │ └── utils │ │ │ └── char_norm │ │ ├── mynlp │ │ ├── char_four_code.txt │ │ ├── char_py.txt │ │ ├── char_struct.txt │ │ ├── char_write_num.txt │ │ └── py_hard_code_map.txt │ │ ├── patch │ │ └── cws-default.txt │ │ └── stopwords.txt │ └── test │ ├── java │ └── com │ │ └── mayabot │ │ └── nlp │ │ ├── BM25Test.kt │ │ ├── DoubleArrayTrieTest.java │ │ ├── Highlight.kt │ │ ├── InjectTest.kt │ │ ├── LuceneAnalyzerTest.java │ │ ├── Mynlps.kt │ │ ├── SentenceSummaryTest.java │ │ ├── TestFileMap.java │ │ ├── TransTest.java │ │ ├── XxHashTest.kt │ │ ├── commmon │ │ ├── CsrSparseMatrixTest.kt │ │ └── TokenizerSplitterTest.java │ │ ├── fasttext │ │ ├── CFtzModelBugTest.kt │ │ ├── Java.java │ │ ├── SupTest.kt │ │ ├── TestCModelFTZ.kt │ │ ├── TestSup.kt │ │ 
├── TestWords.kt │ │ └── Utils.kt │ │ ├── module │ │ └── lucene │ │ │ ├── LuceneUtils.kt │ │ │ └── TestPinyinTokenizer.kt │ │ ├── pa │ │ └── GeleiCode.kt │ │ ├── perceptron │ │ └── TestCompresParamBin.kt │ │ ├── pinyin │ │ ├── PinyinDistance.kt │ │ └── PinyinTest.kt │ │ └── segment │ │ ├── CmbSegment.kt │ │ ├── CombineTest.java │ │ ├── CoreTokenizerTest.java │ │ ├── CustomDictTest.kt │ │ ├── IndexSegmentTest.java │ │ ├── KeepOriCharOutputTest.kt │ │ ├── KotlinTest.kt │ │ ├── OffsetTest.kt │ │ ├── PerceptronCwsTest.kt │ │ ├── PosTest.java │ │ ├── SegmentErrorCasesTest.kt │ │ ├── SubwordTest.kt │ │ ├── Test.kt │ │ ├── TestPosAndSubWord.kt │ │ ├── atom │ │ └── AtomSplitAlgorithmTest.kt │ │ ├── collector │ │ └── SentenceIndexWordCollectorTest.java │ │ ├── dictionary │ │ └── CoreGiGramTableDictionaryTest.java │ │ ├── lexer │ │ └── perceptron │ │ │ ├── CWSPerceptronTest.kt │ │ │ ├── NERPerceptronTest.kt │ │ │ ├── POSPerceptronTest.kt │ │ │ ├── PerceptronNerServiceTest.java │ │ │ └── PerceptronServiceTest.kt │ │ ├── ner │ │ ├── OrgTest.java │ │ ├── PersonNameTest.kt │ │ └── PlaceTest.java │ │ ├── utils │ │ └── TokenizerTestHelp.java │ │ └── wordnet │ │ ├── VertexRowTest.java │ │ └── WordpathTest.java │ └── resources │ └── GrapCode.txt └── settings.gradle.kts /.github/workflows/gradle.yml: -------------------------------------------------------------------------------- 1 | name: Java CI 2 | 3 | on: 4 | schedule: 5 | - cron: '0 8 * * *' 6 | 7 | jobs: 8 | build: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v1 14 | - name: Set up JDK 1.8 15 | uses: actions/setup-java@v1 16 | with: 17 | java-version: 1.8 18 | - name: Build with Gradle 19 | run: ./gradlew build 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.ear 17 | # *.zip 18 | *.tar.gz 19 | *.rar 20 | 21 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 22 | hs_err_pid* 23 | 24 | **/out 25 | **/mynlp_work/ 26 | **/mynlp_work/** 27 | /mynlp_work/ 28 | mynlp_work/** 29 | **/temp/ 30 | data/ 31 | /data_dir/ 32 | **/data/ 33 | example.data 34 | 35 | /dependency-reduced-pom.xml 36 | *.iml 37 | /.idea 38 | *.iws 39 | 40 | /.gradle 41 | **/*.iml 42 | *.ipr 43 | 44 | data.work/ 45 | **/model/dependency 46 | **/model/segment 47 | /temp/ 48 | /mynlp-fasttext/xbackjava/ 49 | /mynlp-fasttext/xcsource/ 50 | /mynlp-fasttext/data/ 51 | /testdata/ 52 | !/gradle/wrapper/gradle-wrapper.jar 53 | ### macOS template 54 | # General 55 | .DS_Store 56 | .AppleDouble 57 | .LSOverride 58 | 59 | # Icon must end with two \r 60 | Icon 61 | 62 | # Thumbnails 63 | ._* 64 | 65 | # Files that might appear in the root of a volume 66 | .DocumentRevisions-V100 67 | .fseventsd 68 | .Spotlight-V100 69 | .TemporaryItems 70 | .Trashes 71 | .VolumeIcon.icns 72 | .com.apple.timemachine.donotpresent 73 | 74 | # Directories potentially created on remote AFP share 75 | .AppleDB 76 | .AppleDesktop 77 | Network Trash Folder 78 | Temporary Items 79 | .apdisk 80 | debug 81 | public 82 | docs/public 83 | doc/*.html 84 | doc/*.pdf 85 | *.html 86 | 87 | !/doc/mynlp-docinfo-footer.html -------------------------------------------------------------------------------- /CHANGES.md: 
-------------------------------------------------------------------------------- 1 | # 4.0.0 2 | - 新增StarSpace模块 3 | - 感知机相关实现基于通用框架 4 | - 规划data文件夹,提供下载 5 | - 命令行工具 6 | - 重构训练工具API 7 | - 去除Mynlps类 8 | 9 | # 3.3.0 10 | 11 | # 3.2.2 12 | 13 | - 增加QuickReplacer和Highlight功能 14 | - murmur3 15 | - text hash use xxHash 16 | 17 | # 3.2.1 18 | 19 | - kotlin update to 1.4.0 20 | - fixbug: injector 单例在key不一致的情况下单例实例化两个对象 21 | - 清理了Setting,彻底去除配置文件 22 | - fix junit test内存溢出的问题 23 | 24 | # 3.2.0 25 | 26 | - 代码结构做出改变,合并到mynlp单一项目 27 | - fasttext也合并到mynlp中 28 | - mynlp不再自动依赖词典资源,需要独立引入资源 29 | - mynlp-with-res这个自动引入常用资源,可以通过exclude排除不需要的资源 30 | - elasticsearch-plugin将独立项目,支持7.0以上的版本 31 | 32 | # 3.1.5 33 | 去除Gauva依赖,mynlp只依赖kotlin运行时 34 | 35 | # 3.1.2 36 | - FastText 模型保存为单个文件,也可以从单个文件加载 37 | ```kotlin 38 | fastText.saveModelToSingleFile(File("fastText4j/data/model.fjbin")) 39 | 40 | FastText.loadModelFromSingleFile(File("fastText4j/data/model.fjbin")) 41 | ``` 42 | 43 | # 3.1.1 44 | - fix 标点符号过滤bug 45 | 46 | # v3.1.0 47 | - 合并了mynlp-core,mynlp-perceptron,mynlp-segment模块 48 | - 重构了感知机模块,自定义感知机只需要实现一个接口定义 49 | - 感知机分词、词性分析使用新的感知机API 50 | - 开放词性分析在线学习接口;简化词性感知机特征提取函数 51 | - 在规则层面提高人名识别准确性 52 | - 合并fastText4j代码到mynlp项目 53 | - 按照最新C语言版本fastText重构 54 | - 新增OneVsAiLoss损失函数 55 | - 新增test接口 56 | - fix预测结果数量少一个的bug 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /doc/advanced.adoc: -------------------------------------------------------------------------------- 1 | == 高级主题 2 | 3 | === Wordnet 4 | 5 | :imagesdir: images 6 | 7 | 分词系统中需要一个数据结构来表达一段文字来多种分词可能性。距离来说"商品和服务","商品/和服/务"就是其中一个 错误的切分可能。 各种分词算法的目标就是找出最合理的切分方法。 8 | 9 | image::wordnet-g.png[词图篱笆网络,600] 10 | 11 | 站在每个字的角度来看,会有一个或多个跳转路径。从Start节点到End节点中必定存在一个最优路径,这个路径就是 分词结果了。但是上图如果利用Node和Edge的数据结构来表达的话,性能和方便程度都很差。 12 | 13 | Wordnet是经典的数据结构,mynlp用链表的方式实现了一个高效的Wordnet类。 14 | 15 | 16 | image::wordnet.png[,600] 17 | 每个数字节点,表示一个边,也表示从当前这个字构成的词的长度。 18 | 19 | 对应的Java数据为: 20 | 21 | image::wordnet-ds.png[,600] 22 | 23 | 每个字对应一个`VertexRow`,每个VertexRow指向一个Vertex链表,其中Vertex链表中的数字大小**一定是不可重复且有序的**。 24 | 25 | 分词的基本逻辑就是填充Wordnet,使用路径选算法从多种可能性选出最佳的分词路径。 26 | 27 | === Wordpath 28 | 29 | 类Wordpath表示一个路径,如果路径不在变化,那么也就无所谓采用什么数据结构。但是在Pipeline中,不同的组件和算法还需要对这个 **唯一的路径再进行修改**。会涉及到很多`联合`、`打破-再联合`等操作。在List的基础上操作起来,代码非常复杂且不容易理解。 30 | 31 | 这里我们使用BitSet来表示唯一分词路径。 32 | 33 | image::wordpath.png[,600] 34 | 35 | 图中的字之间的斜线,表示要切断。我们用bitset中和字对应的Index,设置为true。 比如"提高"是一个词,那么设置bitset的下标1为true。 36 | 37 | 就是这么简单,使用这种数据结构的好处是,combine或者划词的操作非常简单,而且内存上消耗非常非常低。 38 | 39 | === Injector IOC容器 -------------------------------------------------------------------------------- /doc/highlight/styles/github.min.css: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | github.com style (c) Vasily Polovnyov 4 | 5 | */ 6 | 7 | .hljs { 8 | display: block; 9 | overflow-x: auto; 10 | padding: 0.5em; 11 | color: #333; 12 | background: #f8f8f8; 13 | } 14 | 15 | .hljs-comment, 16 | .hljs-quote { 17 | color: #998; 18 | font-style: italic; 19 | } 20 | 21 | .hljs-keyword, 22 | .hljs-selector-tag, 23 | .hljs-subst { 24 | color: #333; 25 | font-weight: bold; 26 | } 27 | 28 | .hljs-number, 29 | .hljs-literal, 30 | .hljs-variable, 31 | .hljs-template-variable, 32 | .hljs-tag .hljs-attr { 33 | color: #008080; 34 | } 35 | 36 | .hljs-string, 37 | .hljs-doctag { 38 | color: #d14; 39 | } 40 | 41 | .hljs-title, 42 | .hljs-section, 43 | .hljs-selector-id { 44 | color: #900; 45 | font-weight: bold; 46 | } 47 | 48 | .hljs-subst { 49 | font-weight: 
normal; 50 | } 51 | 52 | .hljs-type, 53 | .hljs-class .hljs-title { 54 | color: #458; 55 | font-weight: bold; 56 | } 57 | 58 | .hljs-tag, 59 | .hljs-name, 60 | .hljs-attribute { 61 | color: #000080; 62 | font-weight: normal; 63 | } 64 | 65 | .hljs-regexp, 66 | .hljs-link { 67 | color: #009926; 68 | } 69 | 70 | .hljs-symbol, 71 | .hljs-bullet { 72 | color: #990073; 73 | } 74 | 75 | .hljs-built_in, 76 | .hljs-builtin-name { 77 | color: #0086b3; 78 | } 79 | 80 | .hljs-meta { 81 | color: #999; 82 | font-weight: bold; 83 | } 84 | 85 | .hljs-deletion { 86 | background: #fdd; 87 | } 88 | 89 | .hljs-addition { 90 | background: #dfd; 91 | } 92 | 93 | .hljs-emphasis { 94 | font-style: italic; 95 | } 96 | 97 | .hljs-strong { 98 | font-weight: bold; 99 | } 100 | -------------------------------------------------------------------------------- /doc/images/WordSplitAlgorithm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/WordSplitAlgorithm.png -------------------------------------------------------------------------------- /doc/images/WordpathProcessor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/WordpathProcessor.png -------------------------------------------------------------------------------- /doc/images/cli.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/cli.jpg -------------------------------------------------------------------------------- /doc/images/crf_model.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/crf_model.jpg -------------------------------------------------------------------------------- /doc/images/fasttext-c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/fasttext-c.png -------------------------------------------------------------------------------- /doc/images/lexer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/lexer.png -------------------------------------------------------------------------------- /doc/images/mynlp-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/mynlp-pipeline.png -------------------------------------------------------------------------------- /doc/images/pipelineLexer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/pipelineLexer.jpg -------------------------------------------------------------------------------- /doc/images/weixin.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/weixin.jpeg 
-------------------------------------------------------------------------------- /doc/images/worddict.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/worddict.png -------------------------------------------------------------------------------- /doc/images/wordnet-ds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/wordnet-ds.png -------------------------------------------------------------------------------- /doc/images/wordnet-framework.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/wordnet-framework.jpg -------------------------------------------------------------------------------- /doc/images/wordnet-g.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/wordnet-g.png -------------------------------------------------------------------------------- /doc/images/wordnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/wordnet.png -------------------------------------------------------------------------------- /doc/images/wordpath.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/wordpath.png -------------------------------------------------------------------------------- /doc/lexer.adoc: -------------------------------------------------------------------------------- 1 | :imagesdir: images 2 | 3 | == 中文分词、词性标注、命名实体 4 | 5 | === lexer架构 6 | 7 | 分词、词性、命名实体这三个任务一起被称为 `词法分析` ,mynlp中使用Lexer接口这个功能进行定义。 8 | 9 | Lexer负责对有限短的文本(一句话、一个段落)进行词法分析。 10 | 11 | nlp中有各种各样的分词算法,mynlp并没有为每个算法定义一个分词器类,而是使用Pipeline方式进行组装。 12 | 13 | .Pipeline架构图 14 | image::lexer.png[width=600] 15 | 16 | .接口描述 17 | - WordSplitAlgorithm: 基础切词算法,词典、感知机、CRF等等 18 | - WordpathProcessor: 对Wordpath进行调整或计算词性等操作 19 | - BestPathAlgorithm: 从Wordnet中选择最优路径 20 | - WordTermCollector: 分词收集器,可以进行索引分词、子词再切分等操作 --- 21 | 22 | .WordSplitAlgorithm接口以及实现类 23 | image::WordSplitAlgorithm.png[width=600] 24 | 25 | .WordpathProcessor接口以及实现类 26 | image::WordpathProcessor.png[width=700] 27 | 28 | === PipelineBuilder 29 | 30 | === CharNormalize 31 | 32 | === WordSplitAlgorithm 33 | 34 | ==== CORE 35 | 36 | ==== 感知机 37 | 38 | ==== ATOM 39 | 40 | === WordpathProcessor 41 | 42 | ==== 人名识别 43 | 44 | ==== NER 45 | 46 | ==== 分词纠错 47 | 48 | ==== 自定义词典 49 | 50 | === WordTermCollector 51 | 52 | === 扩展插件 53 | 54 | === 自定义分词粒度插件示例 -------------------------------------------------------------------------------- /doc/modules.adoc: -------------------------------------------------------------------------------- 1 | == 综合模块 2 | 3 | === 拼音 4 | 5 | === 文本分类 6 | 7 | === 繁简体转换 8 | 9 | [source,java] 10 | ---- 11 | Simplified2Traditional s2t = TransformService.simplified2Traditional(); 12 | System.out.println(s2t.transform("软件和体育的艺术")); 13 | 14 | Traditional2Simplified t2s = TransformService.traditional2Simplified(); 15 | System.out.println(t2s.transform("軟件和體育的藝術")); 
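// Note (added; expected output is inferred, not shown in the original doc):
// the s2t call above should print 軟件和體育的藝術, and the t2s call 软件和体育的艺术.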
16 | 17 | ---- 18 | 19 | === 摘要 20 | 21 | 文本摘要包含了两个简单TextRank的实现。 22 | 23 | .关键字摘要 24 | [source,java] 25 | ---- 26 | KeywordSummary keywordSummary = new KeywordSummary(); 27 | keywordSummary.keyword("text",10); 28 | ---- 29 | 30 | .句子摘要 31 | [source,java] 32 | ---- 33 | SentenceSummary sentenceSummary = new SentenceSummary(); 34 | List result = sentenceSummary.summarySentences(document, 10); 35 | ---- 36 | 37 | === 相似度 38 | 39 | 还没开发。 40 | 41 | === 高亮与关键字替换 42 | 43 | 对文本关键字进行高亮。 44 | 45 | [source,java] 46 | ---- 47 | List keywords = new ArrayList<>(); 48 | 49 | keywords.add("居住证"); 50 | keywords.add("居住"); 51 | 52 | Highlighter highlighter = new Highlighter(keywords);//<1> 53 | 54 | String text = "居住在上海需要办理居住证"; 55 | String text = highlighter.replace(text); 56 | ---- 57 | <1> Highlighter对象可重复使用 58 | 59 | Highlighter内部使用了Trie结构,所以replace的时间复杂度和keywords的数量几乎无关,只对原始text扫描一次。 替换过程采用前向最大匹配算法。 60 | 61 | 另外还可以通过 `QuickReplacer` 类来自定义替换内容。 62 | 63 | [source,java] 64 | ---- 65 | List keywords = new ArrayList<>(); 66 | 67 | keywords.add("居住证"); 68 | keywords.add("居住"); 69 | 70 | QuickReplacer quickReplacer = new QuickReplacer(keywords); 71 | 72 | String result = quickReplacer.replace("居住在上海需要办理居住证", 73 | (Function) word -> ""+word+""); 74 | ---- 75 | 76 | Kotlin便捷扩展函数 77 | 78 | [source,kotlin] 79 | ---- 80 | "居住在上海需要办理居住证".highlight(listOf("居住证","居住")) 81 | ---- 82 | 83 | === 新词发现 84 | 85 | 这个文档怎么写 -------------------------------------------------------------------------------- /doc/mynlp-docinfo-footer.html: -------------------------------------------------------------------------------- 1 | 10 | -------------------------------------------------------------------------------- /doc/mynlp.adoc: -------------------------------------------------------------------------------- 1 | = Mynlp技术参考手册 2 | Jimi 3 | :doctype: book 4 | :toc: left 5 | :toc-title: 目录 6 | :toclevels: 5 7 | :icons: font 8 | :docinfo: shared,private-footer 9 | :imagesdir: images 10 | :source-highlighter: highlightjs 11 | :source-indent: 1 12 | :source-language: java 13 | :highlightjsdir: highlight 14 | 15 | == Mynlp介绍 16 | 17 | image::https://cdn.mayabot.com/mynlp/mynlp-banner.png[Logo,400,] 18 | MYNLP是一个Java实现的高性能、柔性API、可扩展的中文NLP工具包。 19 | 20 | .功能 21 | - 感知机分词 22 | - CORE二元语言模型&词典分词 23 | - 词性标注 24 | - 通用感知机 25 | - 命名实体识别(人名、地名、组织机构名) 26 | - fastText 27 | - 文本分类 28 | - 新词发现 29 | - 拼音转换&切分 30 | - 简繁体转换 31 | 32 | .欢迎关注微信公众号,获取最新动态和相关文章 33 | image::weixin.jpeg[weixin,150,,,align="center"] 34 | 35 | include::started.adoc[] 36 | 37 | include::lexer.adoc[] 38 | 39 | include::perceptron.adoc[] 40 | 41 | include::fasttext.adoc[] 42 | 43 | include::modules.adoc[] 44 | 45 | include::advanced.adoc[] 46 | 47 | include::other.adoc[] 48 | 49 | == 致谢以下优秀开源项目 50 | 51 | - HanLP 52 | - ansj_seg 53 | 54 | mynlp实现参考了他们算法实现和部分代码 -------------------------------------------------------------------------------- /doc/mynlp.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/mynlp.docx -------------------------------------------------------------------------------- /doc/perceptron.adoc: -------------------------------------------------------------------------------- 1 | == 结构化平均感知机框架 2 | 3 | 什么是结构化平均感知机框架?请阅读这一篇文章link:http://www.hankcs.com/nlp/segment/implementation-of-word-segmentation-device-java-based-on-structured-average-perceptron.html[《基于结构化平均感知机的分词器Java实现》] 4 | 5 | 在这里我们用结构化平均感知机框架来解决序列化标注问题,例如BMES标注。因为BMES这四个TAG之前是有转移关系的, 
所以肯定是结构化预测问题。 6 | 7 | mynlp提供了通用的AP框架,方便实现各种自定义标签、特征函数、语料格式。 8 | 9 | 你只需告知感知机框架三件事情: 10 | 11 | - 特征提取函数 12 | - label编码 13 | - 原始语料如果转换为(输入=标签)二元组 14 | 15 | -------------------------------------------------------------------------------- /doc/update.sh: -------------------------------------------------------------------------------- 1 | 2 | scp mynlp.html root@www.mayabot.com:/opt/mynlp-doc 3 | scp -r images root@www.mayabot.com:/opt/mynlp-doc 4 | scp -r highlight root@www.mayabot.com:/opt/mynlp-doc -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | kotlin.code.style=official 2 | 3 | systemProp.org.gradle.internal.http.connectionTimeout=120000 4 | systemProp.org.gradle.internal.http.socketTimeout=120000 5 | //开启kotlin的增量和并行编译 6 | kotlin.incremental=true 7 | kotlin.incremental.java=true 8 | kotlin.incremental.js=true 9 | kotlin.caching.enabled=true 10 | kotlin.parallel.tasks.in.project=true -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | #distributionUrl=https\://services.gradle.org/distributions/gradle-6.7-bin.zip 4 | distributionUrl=https\://mirrors.cloud.tencent.com/gradle/gradle-6.7-bin.zip 5 | zipStoreBase=GRADLE_USER_HOME 6 | zipStorePath=wrapper/dists 7 | wrapper.keep=true -------------------------------------------------------------------------------- /licenses/FastText-LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016-present, Facebook, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /licenses/StartSpace-LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Facebook, Inc. and its affiliates. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /mynlp-all/build.gradle.kts: -------------------------------------------------------------------------------- 1 | 2 | description = "自动依赖必要的资源" 3 | 4 | dependencies { 5 | 6 | api("org.jetbrains.kotlin:kotlin-stdlib") 7 | 8 | api(project(":mynlp")) 9 | 10 | // 核心词典 11 | implementation("com.mayabot.mynlp.resource:mynlp-resource-coredict:1.0.0") 12 | // 词性标注 13 | implementation("com.mayabot.mynlp.resource:mynlp-resource-pos:1.0.0") 14 | // 命名实体 15 | implementation("com.mayabot.mynlp.resource:mynlp-resource-ner:1.0.0") 16 | // pinyin 17 | implementation("com.mayabot.mynlp.resource:mynlp-resource-pinyin:1.1.0") 18 | // 繁简体转换 19 | implementation("com.mayabot.mynlp.resource:mynlp-resource-transform:1.0.1") 20 | 21 | 22 | // 感知机分词模型 23 | // implementation 'com.mayabot.mynlp.resource:mynlp-resource-cws:1.0.0' 24 | 25 | // 自定义扩展词库 26 | // implementation 'com.mayabot.mynlp.resource:mynlp-resource-custom:1.0.0' 27 | } -------------------------------------------------------------------------------- /mynlp-example/build.gradle.kts: -------------------------------------------------------------------------------- 1 | description = "Example" 2 | 3 | project.afterEvaluate { 4 | project.tasks.withType{ 5 | enabled = false 6 | } 7 | } 8 | 9 | dependencies { 10 | 11 | implementation(project(":mynlp-all")) 12 | 13 | implementation( "com.mayabot.mynlp.resource:mynlp-resource-cws:1.0.0") 14 | implementation( "com.mayabot.mynlp.resource:mynlp-resource-custom:1.0.0") 15 | implementation( "org.fusesource.jansi:jansi:1.16") 16 | implementation( "ch.qos.logback:logback-classic:1.2.3") 17 | 18 | 19 | } -------------------------------------------------------------------------------- /mynlp-example/src/main/java/Demo.java: -------------------------------------------------------------------------------- 1 | import com.mayabot.nlp.Mynlp; 2 | 3 | public class Demo { 4 | public static void main(String[] args) { 5 | // Mynlp.configer() 6 | // .set("a","1"); 7 | // System.out.println(Mynlp.instance().segment("扫描二维码即可下载")); 8 | // 9 | Mynlp mynlp = Mynlp.instance(); 10 | // 11 | // System.out.println(mynlp.segment("请勿大声喧哗")); 12 | //// 13 | // 14 | // System.out.println(mynlp.convertPinyin("信息公开")); 15 | // 16 | // 17 | // System.out.println(mynlp.splitPinyin("xinxigongkai")); 18 | 19 | } 20 | } 21 | -------------------------------------------------------------------------------- 
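A minimal usage sketch (not a file in the repository) assembled from the calls that Demo.java above leaves commented out; Mynlp.instance(), segment(), convertPinyin() and splitPinyin() are copied from those commented lines and are assumed rather than re-verified here:

import com.mayabot.nlp.Mynlp;

public class DemoSketch {
    public static void main(String[] args) {
        Mynlp mynlp = Mynlp.instance();
        // Chinese word segmentation
        System.out.println(mynlp.segment("请勿大声喧哗"));
        // convert text to pinyin
        System.out.println(mynlp.convertPinyin("信息公开"));
        // split a continuous pinyin string into syllables
        System.out.println(mynlp.splitPinyin("xinxigongkai"));
    }
}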
/mynlp-example/src/main/java/pinyin/PinyinExample.java: -------------------------------------------------------------------------------- 1 | package pinyin; 2 | 3 | 4 | public class PinyinExample { 5 | public static void main(String[] args) { 6 | // PinyinResult result = Pinyins.convert("朝朝暮暮"); 7 | // 8 | // System.out.println(result.asString()); 9 | // System.out.println(result.asHeadList()); 10 | // System.out.println(result.asList()); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /mynlp-example/src/main/java/segment/CombineExample.java: -------------------------------------------------------------------------------- 1 | package segment; 2 | 3 | public class CombineExample { 4 | public static void main(String[] args) { 5 | 6 | 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /mynlp-example/src/main/java/segment/CoreSegment.java: -------------------------------------------------------------------------------- 1 | package segment; 2 | 3 | import com.mayabot.nlp.segment.*; 4 | 5 | import java.io.Reader; 6 | import java.io.StringReader; 7 | 8 | public class CoreSegment { 9 | 10 | public static void main(String[] args) { 11 | long t1 = System.currentTimeMillis(); 12 | 13 | Lexer tokenizer = Lexers.core(); 14 | 15 | 16 | Sentence sentence = tokenizer.scan("mynlp是mayabot开源的中文NLP工具包。"); 17 | 18 | System.out.println(sentence.toWordList()); 19 | 20 | 21 | LexerReader analyzer = tokenizer.reader(); 22 | 23 | Reader reader = new StringReader("假装这是一个大文本"); 24 | WordTermSequence result = analyzer.scan(reader); 25 | long t2 = System.currentTimeMillis(); 26 | System.out.println(t2 - t1); 27 | System.out.printf("result" + result.toSentence()); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /mynlp-example/src/main/java/segment/CustomSegment.java: -------------------------------------------------------------------------------- 1 | package segment; 2 | 3 | import com.mayabot.nlp.segment.FluentLexerBuilder; 4 | import com.mayabot.nlp.segment.Lexer; 5 | import com.mayabot.nlp.segment.Lexers; 6 | import com.mayabot.nlp.segment.plugins.customwords.CustomDictionaryPlugin; 7 | import com.mayabot.nlp.segment.plugins.customwords.MemCustomDictionary; 8 | 9 | public class CustomSegment { 10 | 11 | public static void main(String[] args) { 12 | 13 | MemCustomDictionary memCustomDictionary = new MemCustomDictionary(); 14 | 15 | FluentLexerBuilder builder = Lexers.coreBuilder(); 16 | 17 | builder.with(new CustomDictionaryPlugin(memCustomDictionary)); 18 | 19 | Lexer tokenizer = builder.build(); 20 | 21 | System.out.println(tokenizer); 22 | 23 | System.out.println(tokenizer.scan("欢迎来到松江临港科技城")); 24 | 25 | memCustomDictionary.addWord("临港科技城"); 26 | memCustomDictionary.rebuild(); 27 | 28 | System.out.println(tokenizer.scan("欢迎来到松江临港科技城")); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /mynlp-example/src/main/java/segment/UseStreamApi.java: -------------------------------------------------------------------------------- 1 | package segment; 2 | 3 | import com.mayabot.nlp.segment.LexerReader; 4 | import com.mayabot.nlp.segment.Lexers; 5 | import com.mayabot.nlp.segment.WordTerm; 6 | 7 | import java.io.BufferedReader; 8 | import java.io.File; 9 | import java.io.FileInputStream; 10 | import java.io.InputStreamReader; 11 | import java.util.stream.Stream; 12 | 13 | public class UseStreamApi { 14 | 15 | public static void 
main(String[] args) throws Exception { 16 | 17 | LexerReader lexerReader = Lexers.core().reader(); 18 | 19 | try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream( 20 | new File("data/红楼梦.txt"))))) { 21 | 22 | Stream stream = lexerReader.scan(bufferedReader) 23 | .stream() 24 | .filter(it -> it.word.length() > 1); 25 | stream.forEach(term -> { 26 | 27 | }); 28 | 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /mynlp-example/src/main/java/transform/TraditionalExample.java: -------------------------------------------------------------------------------- 1 | package transform; 2 | 3 | import com.mayabot.nlp.module.trans.TransformService; 4 | 5 | public class TraditionalExample { 6 | 7 | public static void main(String[] args) { 8 | String text = "軟件和體育的藝術"; 9 | System.out.println(TransformService.t2s(text)); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /mynlp-example/src/test/java/TestHighlight.java: -------------------------------------------------------------------------------- 1 | import com.mayabot.nlp.module.QuickReplacer; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.function.Function; 6 | 7 | public class TestHighlight { 8 | 9 | public static void main(String[] args) { 10 | List keywords = new ArrayList<>(); 11 | 12 | keywords.add("居住证"); 13 | keywords.add("居住"); 14 | 15 | QuickReplacer quickReplacer = new QuickReplacer(keywords); 16 | 17 | String result = quickReplacer.replaceForJava("居住在上海需要办理居住证", 18 | (Function) word -> "" + word + ""); 19 | 20 | System.out.println(result); 21 | 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /mynlp-experimental/.gitignore: -------------------------------------------------------------------------------- 1 | JRNN-master 2 | src 3 | backup -------------------------------------------------------------------------------- /mynlp-experimental/build.gradle.kts: -------------------------------------------------------------------------------- 1 | dependencies { 2 | api("org.jetbrains.kotlin:kotlin-stdlib") 3 | } 4 | // dependencies { 5 | //// implementation 'org.jblas:jblas:1.2.5' 6 | //// compile 'org.apache.commons:commons-lang3:3.3.2' 7 | //// compile 'com.google.guava:guava:18.0' 8 | //// compile 'commons-io:commons-io:2.4' 9 | // compile ("org.jetbrains.kotlin:kotlin-stdlib") { 10 | // exclude module:"kotlin-stdlib-jdk7" 11 | // exclude module:"kotlin-stdlib-jdk8" 12 | // } 13 | //} 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /mynlp/shell/mynlp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # Attempt to set APP_HOME 3 | # Resolve links: $0 may be a link 4 | PRG="$0" 5 | # Need this for relative symlinks. 
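# Note (comment added, not in the original script): the loop below follows $0 through
# any chain of symlinks. `ls -ld` prints the "name -> target" form, the `expr` call
# extracts the target, an absolute target replaces $PRG directly, and a relative target
# is resolved against the directory of the current $PRG, so APP_HOME later points at
# the real install directory rather than at a symlink.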
6 | while [ -h "$PRG" ] ; do 7 | ls=`ls -ld "$PRG"` 8 | link=`expr "$ls" : '.*-> \(.*\)$'` 9 | if expr "$link" : '/.*' > /dev/null; then 10 | PRG="$link" 11 | else 12 | PRG=`dirname "$PRG"`"/$link" 13 | fi 14 | done 15 | SAVED="`pwd`" 16 | cd "`dirname \"$PRG\"`/" >/dev/null 17 | APP_HOME="`pwd -P`" 18 | cd "$SAVED" >/dev/null 19 | 20 | APP_NAME="Mynlp" 21 | APP_BASE_NAME=`basename "$0"` 22 | 23 | # 下载mynlp-bin.jar 24 | # 准备JDK环境变量 25 | # exe jar -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/MynlpConfigs.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp 2 | 3 | import com.mayabot.nlp.common.SettingItem 4 | import com.mayabot.nlp.common.SettingItem.stringSetting 5 | 6 | object MynlpConfigs { 7 | 8 | @JvmField 9 | val server: SettingItem = stringSetting("mynlp.server", "") 10 | 11 | /** 12 | * AP分词器的模型名 13 | */ 14 | @JvmField 15 | val cwsModelItem: SettingItem = stringSetting("cws.model", "cws-model") 16 | 17 | /** 18 | * 自定义词典的路径 19 | * value可以是用逗号分隔的多个值,表示多个文件 20 | */ 21 | @JvmField 22 | val dictPathSetting: SettingItem = stringSetting( 23 | "custom.dictionary.path", "custom-dict/CustomDictionary.txt" 24 | ) 25 | 26 | /** 27 | * 主要拼音的资源文件名 28 | */ 29 | @JvmField 30 | val pinyinSetting: SettingItem = stringSetting("pinyin.dict", "mynlp-pinyin.txt") 31 | 32 | /** 33 | * 拼音自定义扩展词典的文件名(可选) 34 | */ 35 | @JvmField 36 | val pinyinExtDicSetting: SettingItem = stringSetting("pinyin.ext.dict", null) 37 | 38 | /** 39 | * 分词纠错词典配置 40 | */ 41 | @JvmField 42 | val correctionDict: SettingItem = stringSetting("correction.dict", "dictionary/correction.txt") 43 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/Heap.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.algorithm 2 | 3 | /** 4 | * 默认小顶堆。如果需要大顶堆 5 | * 6 | * 7 | */ 8 | class TopHeap( 9 | val maxSize: Int, 10 | val comparator: Comparator, 11 | /** 12 | * false 表示大顶堆 13 | */ 14 | val minTop: Boolean = true 15 | ) { 16 | 17 | private val data = arrayOfNulls(maxSize) 18 | 19 | private var size: Int = 0 20 | 21 | fun push(data: T) { 22 | 23 | } 24 | 25 | private fun heapify() { 26 | 27 | } 28 | 29 | fun root(): T { 30 | TODO() 31 | } 32 | 33 | /** 34 | * 获取里面的所有元素,但是并不是排好序的 35 | */ 36 | fun toList(): List { 37 | TODO() 38 | } 39 | 40 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/TopIntMinK.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.algorithm 2 | 3 | /** 4 | * Top K 最小值。 5 | */ 6 | class TopIntMinK(private val k: Int) { 7 | 8 | private val heap = FloatArray(k) 9 | private val idIndex = IntArray(k) { -1 } 10 | 11 | var size = 0 12 | 13 | fun push(id: Int, score: Float) { 14 | if (size < k) { 15 | heap[size] = score 16 | idIndex[size] = id 17 | size++ 18 | 19 | if (size == k) { 20 | buildMinHeap() 21 | } 22 | } else { 23 | // 如果这个数据小于最大值,那么有资格进入 24 | if (score < heap[0]) { 25 | heap[0] = score 26 | idIndex[0] = id 27 | 28 | topify(0) 29 | } 30 | } 31 | } 32 | 33 | fun result(): ArrayList> { 34 | val top = Math.min(k, size) 35 | val list = ArrayList>(top) 36 | 37 | for (i in 0 until top) { 38 | list += idIndex[i] to heap[i] 39 | } 40 | 41 | list.sortBy { it.second } 42 | return list 43 | } 44 | 45 | private fun buildMinHeap() { 
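        // Note (comment added, not in the original source): although the class keeps the
        // k smallest scores, the backing array is arranged as a max-heap, with heap[0]
        // holding the largest of the kept scores; push() therefore only compares a new
        // candidate against heap[0] and sifts it down via topify() in O(log k).
        // The bottom-up build below starts at the last internal node (k / 2 - 1)
        // and heapifies the whole array in O(k).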
46 | for (i in k / 2 - 1 downTo 0) { 47 | topify(i)// 依次向上将当前子树最大堆化 48 | } 49 | } 50 | 51 | private fun topify(i: Int) { 52 | val l = 2 * i + 1 53 | val r = 2 * i + 2 54 | var max: Int 55 | 56 | if (l < k && heap[l] > heap[i]) 57 | max = l 58 | else 59 | max = i 60 | 61 | if (r < k && heap[r] > heap[max]) { 62 | max = r 63 | } 64 | 65 | if (max == i || max >= k) 66 | // 如果largest等于i说明i是最大元素 67 | // largest超出heap范围说明不存在比i节点大的子女 68 | return 69 | 70 | swap(i, max) 71 | topify(max) 72 | } 73 | 74 | private fun swap(i: Int, j: Int) { 75 | val tmp = heap[i] 76 | heap[i] = heap[j] 77 | heap[j] = tmp 78 | 79 | val tmp2 = idIndex[i] 80 | idIndex[i] = idIndex[j] 81 | idIndex[j] = tmp2 82 | } 83 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/Trie.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.algorithm.collection; 17 | 18 | 19 | /** 20 | * 字典树接口 21 | * 22 | * @author jimichan 23 | */ 24 | public interface Trie { 25 | 26 | T get(char[] key); 27 | 28 | T get(CharSequence key); 29 | 30 | T get(char[] key, int offset, int len); 31 | 32 | boolean containsKey(String key); 33 | 34 | } 35 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/ahocorasick/Hit.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | /* 18 | * 19 | * He Han 20 | * hankcs.cn@gmail.com 21 | * 源代码来自于 https://github.com/hankcs/HanLP 22 | */ 23 | package com.mayabot.nlp.algorithm.collection.ahocorasick; 24 | 25 | /** 26 | * 一个命中结果 27 | * 28 | * @param 29 | */ 30 | public class Hit { 31 | /** 32 | * 模式串在母文本中的起始位置 33 | */ 34 | public final int begin; 35 | /** 36 | * 模式串在母文本中的终止位置 37 | */ 38 | public final int end; 39 | /** 40 | * 模式串对应的值 41 | */ 42 | public final V value; 43 | 44 | public Hit(int begin, int end, V value) { 45 | this.begin = begin; 46 | this.end = end; 47 | this.value = value; 48 | } 49 | 50 | @Override 51 | public String toString() { 52 | return String.format("[%d:%d]=%s", begin, end, value); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/ahocorasick/IHit.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * 19 | * He Han 20 | * hankcs.cn@gmail.com 21 | * 源代码来自于 https://github.com/hankcs/HanLP 22 | */ 23 | package com.mayabot.nlp.algorithm.collection.ahocorasick; 24 | 25 | /** 26 | * 命中一个模式串的处理方法 27 | */ 28 | public interface IHit { 29 | /** 30 | * 命中一个模式串 31 | * 32 | * @param begin 模式串在母文本中的起始位置 33 | * @param end 模式串在母文本中的终止位置 34 | * @param value 模式串对应的值 35 | */ 36 | void hit(int begin, int end, V value); 37 | } 38 | 39 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/ahocorasick/IHitFull.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | /* 18 | * 19 | * He Han 20 | * hankcs.cn@gmail.com 21 | * 源代码来自于 https://github.com/hankcs/HanLP 22 | */ 23 | package com.mayabot.nlp.algorithm.collection.ahocorasick; 24 | 25 | public interface IHitFull { 26 | /** 27 | * 命中一个模式串 28 | * 29 | * @param begin 模式串在母文本中的起始位置 30 | * @param end 模式串在母文本中的终止位置 31 | * @param value 模式串对应的值 32 | * @param index 模式串对应的值的下标 33 | */ 34 | void hit(int begin, int end, V value, int index); 35 | } 36 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/bintrie/BinTrieNode.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.algorithm.collection.bintrie; 18 | 19 | /** 20 | * @param 21 | * @author jimichan 22 | */ 23 | public interface BinTrieNode { 24 | 25 | BinTrieNode addChildNode(BinTrieNode nodeToInsert); 26 | 27 | BinTrieNode findChild(char c); 28 | 29 | byte getStatus(); 30 | 31 | T getValue(); 32 | 33 | int compareTo(char c); 34 | 35 | boolean contains(char c); 36 | 37 | 38 | default BinTrieNode findNode(char[] keyWord) { 39 | BinTrieNode point = this; 40 | for (int j = 0; j < keyWord.length; j++) { 41 | point = point.findChild(keyWord[j]); 42 | if (point == null) { 43 | return null; 44 | } 45 | } 46 | return point; 47 | } 48 | 49 | /** 50 | * 寻找到这个路径的最后一个节点 51 | * 52 | * @param key 53 | * @return BinTrieNode 54 | */ 55 | default BinTrieNode findNode(CharSequence key) { 56 | BinTrieNode branch = this; 57 | int len = key.length(); 58 | for (int i = 0; i < len; i++) { 59 | char _char = key.charAt(i); 60 | if (branch == null) { 61 | return null; 62 | } 63 | branch = branch.findChild(_char); 64 | } 65 | return branch; 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/bintrie/TrieTreeMatcher.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.mayabot.nlp.algorithm.collection.bintrie; 18 | 19 | /** 20 | * @param 21 | * @author jimichan 22 | */ 23 | public interface TrieTreeMatcher { 24 | 25 | /** 26 | * 詞典中全部命中的詞語 27 | * 28 | * @return String 29 | */ 30 | String next(); 31 | 32 | 33 | /** 34 | * 得到全部参数 35 | * 36 | * @return String 37 | */ 38 | T getParams(); 39 | 40 | /** 41 | * 当参数对象是列表或者数组的时候,返回指定下标的内容。否则返回null 42 | * 43 | * @param i 44 | * @return String 45 | */ 46 | String getParam(int i); 47 | 48 | 49 | int getOffset(); 50 | 51 | } 52 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/dat/DATMapMatcher.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * 源代码参考和部分引用来自 https://github.com/hankcs/HanLP https://github.com/NLPchina/ansj_seg 19 | */ 20 | package com.mayabot.nlp.algorithm.collection.dat; 21 | 22 | /** 23 | * 一个搜索工具(注意,当调用next()返回false后不应该继续调用next(),除非reset状态) 24 | *
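TrieTreeMatcher above is drained by calling next() until it returns null. A sketch of that loop, generic over however the matcher was obtained (the concrete matcher classes in this package are not shown here):

```kotlin
import com.mayabot.nlp.algorithm.collection.bintrie.TrieTreeMatcher

/** Collects every matched word together with its offset in the input. */
fun <T> drain(matcher: TrieTreeMatcher<T>): List<Pair<Int, String>> {
    val hits = ArrayList<Pair<Int, String>>()
    var word = matcher.next()
    while (word != null) {
        hits += matcher.offset to word   // getOffset(): position of the current hit
        word = matcher.next()
    }
    return hits
}
```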
25 | * DAT的匹配器是一个多匹配器,把各种可能都计算出来 26 | * 27 | * @author jimichan 28 | */ 29 | public interface DATMapMatcher { 30 | 31 | boolean next(); 32 | 33 | int getBegin(); 34 | 35 | int getLength(); 36 | 37 | V getValue(); 38 | 39 | int getIndex(); 40 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/dat/FastDatCharSet.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.algorithm.collection.dat; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | import java.util.TreeSet; 6 | 7 | public class FastDatCharSet { 8 | 9 | private DoubleArrayTrie map; 10 | 11 | public FastDatCharSet(char... chars) { 12 | HashSet set = new HashSet<>(); 13 | for (char aChar : chars) { 14 | set.add(aChar); 15 | } 16 | set(set); 17 | } 18 | 19 | public FastDatCharSet(Set characterSet) { 20 | set(characterSet); 21 | } 22 | 23 | private void set(Set characterSet) { 24 | TreeSet treeMap = new TreeSet<>(); 25 | 26 | for (Character character : characterSet) { 27 | treeMap.add(character.toString()); 28 | } 29 | 30 | this.map = new DoubleArrayTrie(treeMap); 31 | } 32 | 33 | public boolean contains(char ch) { 34 | return map.indexOf(ch) != -1; 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/distance/StringDistance.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package com.mayabot.nlp.algorithm.distance; 18 | 19 | /** 20 | * Interface for string distances. 21 | */ 22 | public interface StringDistance { 23 | 24 | /** 25 | * Returns a float between 0 and 1 based on how similar the specified strings are to one another. 26 | * Returning a value of 1 means the specified strings are identical and 0 means the 27 | * string are maximally different. 28 | * @param s1 The first string. 29 | * @param s2 The second string. 30 | * @return a float between 0 and 1 based on how similar the specified strings are to one another. 
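Two short sketches for the double-array-trie utilities above: draining a DATMapMatcher (respecting the contract that next() must not be called again once it has returned false), and the fully shown FastDatCharSet:

```kotlin
import com.mayabot.nlp.algorithm.collection.dat.DATMapMatcher
import com.mayabot.nlp.algorithm.collection.dat.FastDatCharSet

/** Walks all candidate matches produced by a DATMapMatcher. */
fun <V> allMatches(matcher: DATMapMatcher<V>): List<Triple<Int, Int, V>> {
    val out = ArrayList<Triple<Int, Int, V>>()
    while (matcher.next()) {
        // begin is inclusive, begin + length is exclusive
        out += Triple(matcher.begin, matcher.begin + matcher.length, matcher.value)
    }
    return out
}

fun main() {
    val vowels = FastDatCharSet('a', 'e', 'i', 'o', 'u')
    println(vowels.contains('e'))   // true
    println(vowels.contains('x'))   // false
}
```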
31 | */ 32 | public float getDistance(String s1,String s2); 33 | 34 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/blas/BlasUtils.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.blas 2 | 3 | import java.nio.ByteBuffer 4 | import kotlin.math.sqrt 5 | 6 | /** 7 | * 向量点积 8 | */ 9 | fun dot(a: Vector, b: Vector): Float { 10 | return a * b 11 | } 12 | 13 | /** 14 | * 向量余弦 15 | */ 16 | fun cosine(a: Vector, b: Vector): Float { 17 | val normA = a * a 18 | val normB = b * b 19 | return if (normA == 0.0f || normB == 0.0f) { 20 | 0.0f 21 | } else { 22 | (a * b / sqrt((normA * normB).toDouble())).toFloat() 23 | } 24 | } 25 | 26 | fun floatArrayVector(size: Int) = DenseVector(size) 27 | fun byteBufferVector(size: Int) = ByteBufferDenseVector(ByteBuffer.allocate(size shl 2), 0, size) 28 | fun directByteBufferVector(size: Int) = ByteBufferDenseVector(ByteBuffer.allocateDirect(size shl 2), 0, size) 29 | 30 | fun floatArrayMatrix(rows: Int, cols: Int, data: FloatArray) = DenseArrayMatrix(rows, cols, data) 31 | fun floatArrayMatrix(rows: Int, cols: Int) = DenseArrayMatrix(rows, cols) 32 | fun byteBufferMatrix(rows: Int, cols: Int) = ByteBufferMatrix(rows, cols, false) 33 | fun directByteBufferMatrix(rows: Int, cols: Int) = ByteBufferMatrix(rows, cols, true) 34 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/blas/Matrix.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.blas 2 | 3 | import java.io.File 4 | import java.io.Serializable 5 | import java.nio.channels.FileChannel 6 | 7 | /** 8 | * Float矩阵 9 | */ 10 | interface Matrix : Serializable { 11 | 12 | val row: Int 13 | val col: Int 14 | 15 | /** 16 | * 矩阵的第i行和vec进行点积计算 17 | */ 18 | fun dotRow(vec: Vector, i: Int): Float 19 | 20 | /** 21 | * 把[vector]加到指定的[row] , [a]是系数 22 | */ 23 | fun addVectorToRow(vector: Vector, row: Int, a: Float) 24 | 25 | fun addRowToVector(target: Vector, i: Int, a: Double? 
= null) 26 | 27 | fun save(file: File) 28 | 29 | fun save(channel: FileChannel) 30 | 31 | 32 | } 33 | 34 | interface DenseMatrix : Matrix { 35 | 36 | fun zero() 37 | // fun fill(v: Float) 38 | fun uniform(number: Number) 39 | 40 | operator fun get(row: Int): Vector 41 | operator fun get(i: Int, j: Int): Float 42 | 43 | operator fun set(i: Int, j: Int, v: Float) 44 | 45 | /** 46 | * 乘法 47 | * 48 | * 从ib到ie这些行,系数存在vector里面 49 | */ 50 | fun multiplyRow(nums: Vector, ib: Int = 0, ie: Int = -1) 51 | 52 | /** 53 | * 除法 54 | */ 55 | fun divideRow(nums: Vector, ib: Int = 0, ie: Int = -1) 56 | 57 | fun l2NormRow(i: Int): Float 58 | fun l2NormRow(norms: Vector) 59 | 60 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/cli/MynlpCli.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.cli 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.common.logging.InternalLogLevel 5 | import com.mayabot.nlp.common.logging.JdkLogger 6 | import com.mayabot.nlp.segment.segment 7 | 8 | fun main(args: Array) { 9 | 10 | JdkLogger.defaultLevel = InternalLogLevel.WARN; 11 | 12 | Mynlp.configer().setAutoDownloadRes(true) 13 | 14 | println("2012年的冬天".segment()) 15 | 16 | if (args.isEmpty()) { 17 | printTopHelp() 18 | return 19 | } 20 | val subcommand = args.first() 21 | val commandArgs = args.drop(1).toTypedArray() 22 | 23 | } 24 | 25 | fun printTopHelp() { 26 | println( 27 | """ 28 | Usage: mynlp subcommand [OPTION]... 29 | 30 | Mynlp实用工具,提供多个subcommand执行不同的功能. 31 | 32 | Subcommand List: 33 | 34 | segment 中文分词 35 | ner 命名实体 36 | pos 词性分析 37 | name 人名模型 38 | perceptron 通用AP训练和评估 39 | train 内部模型训练入口 40 | nwd 新词发现 41 | fastText 分类模型和词嵌入 42 | t2s 繁简体转换 43 | pinyin 文字转拼音 44 | pinyin-split 拼音流切分(nihaoshijie --> ni hao shi jie) 45 | hash 46 | classify 便捷的文本分类 47 | 48 | 49 | """.trimIndent() 50 | ) 51 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/ArraySizingStrategy.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common; 2 | 3 | /** 4 | * Resizing (growth) strategy for array-backed buffers. 5 | */ 6 | public interface ArraySizingStrategy { 7 | /** 8 | * @param currentBufferLength Current size of the array (buffer). This number should comply with 9 | * the strategy's policies (it is a result of initial rounding or 10 | * further growCalls). It can also be zero, indicating the growth 11 | * from an empty buffer. 12 | * @param elementsCount Number of elements stored in the buffer. 13 | * @param expectedAdditions Expected number of additions (resize hint). 14 | * @return Must return a new size at least as big as to hold 15 | * elementsCount + expectedAdditions. 
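A quick sketch tying the BlasUtils helpers to the matrix API above: pack two rows into a dense matrix with floatArrayMatrix, pull them back out as Vectors, and compare them with dot/cosine/dotRow. It assumes DenseArrayMatrix exposes the DenseMatrix row accessor declared above:

```kotlin
import com.mayabot.nlp.blas.cosine
import com.mayabot.nlp.blas.dot
import com.mayabot.nlp.blas.floatArrayMatrix

fun main() {
    // Two 3-dimensional rows stored in row-major order.
    val m = floatArrayMatrix(2, 3, floatArrayOf(
        1f, 2f, 3f,
        2f, 4f, 6f
    ))

    val a = m[0]              // DenseMatrix#get(row) returns the row as a Vector
    val b = m[1]

    println(dot(a, b))        // 28.0
    println(cosine(a, b))     // ~1.0, the rows are parallel
    println(m.dotRow(a, 1))   // dot product of vector a with row 1, also 28.0
}
```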
16 | */ 17 | int grow(int currentBufferLength, int elementsCount, int expectedAdditions); 18 | } 19 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/Pair.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common; 2 | 3 | public final class Pair { 4 | 5 | public T first; 6 | public R second; 7 | 8 | public Pair(T first, R second) { 9 | this.first = first; 10 | this.second = second; 11 | } 12 | 13 | @Override 14 | public String toString() { 15 | return "(" + first + ", " + second + ')'; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/ParagraphIterable.kt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.common 18 | 19 | 20 | /** 21 | * ParagraphReader包装成iterable对象 22 | * 23 | * @author jimichan 24 | */ 25 | 26 | class ParagraphIterable(private val reader: ParagraphReader) : Iterable { 27 | 28 | override fun iterator(): Iterator { 29 | 30 | return object : AbstractIterator() { 31 | override fun computeNext() { 32 | val n = reader.next() 33 | if (n == null) { 34 | done() 35 | }else{ 36 | setNext(n) 37 | } 38 | } 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/ParagraphReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.common; 18 | 19 | import java.io.IOException; 20 | 21 | /** 22 | * 分段接口 23 | * 24 | * @author jimichan 25 | */ 26 | public interface ParagraphReader { 27 | /** 28 | * 返回一个段落,最后返回null 29 | * 30 | * @return String 31 | * @throws IOException 32 | */ 33 | String next() throws IOException; 34 | } 35 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/ParagraphReaderString.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. 
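As an illustration of the ArraySizingStrategy contract above (not necessarily the policy mynlp itself ships): grow by roughly 50%, but never return less than the caller needs.

```kotlin
import com.mayabot.nlp.common.ArraySizingStrategy

/** Minimal growth policy: at least 1.5x the current buffer, and always large enough. */
class GrowByHalfStrategy : ArraySizingStrategy {
    override fun grow(currentBufferLength: Int, elementsCount: Int, expectedAdditions: Int): Int {
        val needed = elementsCount + expectedAdditions
        val grown = currentBufferLength + (currentBufferLength shr 1)
        return maxOf(needed, grown)
    }
}
```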
All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.common; 18 | 19 | /** 20 | * 有的时候给定的文本很短,那么就做个假的 21 | * 22 | * @author jimichan 23 | */ 24 | public class ParagraphReaderString implements ParagraphReader { 25 | 26 | private String string = null; 27 | 28 | public ParagraphReaderString(String string) { 29 | this.string = string; 30 | } 31 | 32 | @Override 33 | public String next() { 34 | String old = string; 35 | string = null; 36 | return old; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/TagAndScore.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common; 2 | 3 | /** 4 | * @author jimichan 5 | */ 6 | public class TagAndScore { 7 | 8 | private String tag; 9 | 10 | private float score; 11 | 12 | public TagAndScore(String tag, float score) { 13 | this.tag = tag; 14 | this.score = score; 15 | } 16 | 17 | public String getTag() { 18 | return tag; 19 | } 20 | 21 | public void setTag(String tag) { 22 | this.tag = tag; 23 | } 24 | 25 | public float getScore() { 26 | return score; 27 | } 28 | 29 | public void setScore(float score) { 30 | this.score = score; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/cli/ParseException.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | *
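ParagraphReaderString and ParagraphIterable above compose directly: the reader hands out its single string and then signals the end with null, which the Iterable wrapper turns into ordinary iteration:

```kotlin
import com.mayabot.nlp.common.ParagraphIterable
import com.mayabot.nlp.common.ParagraphReaderString

fun main() {
    val reader = ParagraphReaderString("这是一个很短的段落")
    for (paragraph in ParagraphIterable(reader)) {
        println(paragraph)   // printed exactly once
    }
}
```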
9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | *
11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.mayabot.nlp.common.cli; 19 | 20 | /** 21 | * Base for Exceptions thrown during parsing of a command-line. 22 | * 23 | * @version $Id: ParseException.java 1443102 2013-02-06 18:12:16Z tn $ 24 | */ 25 | public class ParseException extends Exception { 26 | /** 27 | * This exception {@code serialVersionUID}. 28 | */ 29 | private static final long serialVersionUID = 9112808380089253192L; 30 | 31 | /** 32 | * Construct a new ParseException 33 | * with the specified detail message. 34 | * 35 | * @param message the detail message 36 | */ 37 | public ParseException(String message) { 38 | super(message); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/cli/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | *
11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | *
17 | * Commons CLI 1.3 18 | * 19 | * @version $Id: package-info.java 1443102 2013-02-06 18:12:16Z tn $ 20 | */ 21 | 22 | /** 23 | * Commons CLI 1.3 24 | * 25 | * @version $Id: package-info.java 1443102 2013-02-06 18:12:16Z tn $ 26 | */ 27 | package com.mayabot.nlp.common.cli; 28 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/hash/MurmurHash3Kotlin.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.hash 2 | 3 | object MurmurHash3Utils { 4 | 5 | fun hashBytes(byteArray: ByteArray, offset: Int, length: Int, seed: Long = 0L, hash: MurmurHash3.Hash128 = MurmurHash3.Hash128()): MurmurHash3.Hash128 { 6 | return MurmurHash3.hash128(byteArray, offset, length, seed, hash) 7 | } 8 | 9 | fun hashBytes(byteArray: ByteArray): MurmurHash3.Hash128 { 10 | return this.hashBytes(byteArray, 0, byteArray.size) 11 | } 12 | 13 | fun hashString(text: String): Long { 14 | val bytes = text.toByteArray(Charsets.UTF_8) 15 | return hashBytes(bytes).h1 16 | } 17 | 18 | } 19 | 20 | fun String.murmur3(): Long { 21 | return MurmurHash3Utils.hashString(this) 22 | } 23 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/hppc/BufferAllocationException.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.hppc; 2 | 3 | import java.util.IllegalFormatException; 4 | import java.util.Locale; 5 | 6 | public class BufferAllocationException extends RuntimeException { 7 | public BufferAllocationException(String message) { 8 | super(message); 9 | } 10 | 11 | public BufferAllocationException(String message, Object... args) { 12 | this(message, null, args); 13 | } 14 | 15 | public BufferAllocationException(String message, Throwable t, Object... args) { 16 | super(formatMessage(message, t, args), t); 17 | } 18 | 19 | private static String formatMessage(String message, Throwable t, Object... args) { 20 | try { 21 | return String.format(Locale.ROOT, message, args); 22 | } catch (IllegalFormatException e) { 23 | BufferAllocationException substitute = 24 | new BufferAllocationException(message + " [ILLEGAL FORMAT, ARGS SUPPRESSED]"); 25 | if (t != null) { 26 | substitute.addSuppressed(t); 27 | } 28 | substitute.addSuppressed(e); 29 | throw substitute; 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/injector/BeanFactory.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.injector; 2 | 3 | import org.jetbrains.annotations.NotNull; 4 | 5 | public interface BeanFactory { 6 | public Object create(@NotNull Injector injector) ; 7 | } 8 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/injector/ImplementedBy.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.injector; 2 | 3 | import java.lang.annotation.Retention; 4 | import java.lang.annotation.Target; 5 | 6 | import static java.lang.annotation.ElementType.TYPE; 7 | import static java.lang.annotation.RetentionPolicy.RUNTIME; 8 | 9 | @Retention(RUNTIME) 10 | @Target(TYPE) 11 | public @interface ImplementedBy { 12 | /** The implementation type. 
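The MurmurHash3Utils helpers and the murmur3() extension above produce a 64-bit fingerprint (the h1 half of the 128-bit hash) for a UTF-8 string:

```kotlin
import com.mayabot.nlp.common.hash.MurmurHash3Utils
import com.mayabot.nlp.common.hash.murmur3

fun main() {
    println("mynlp".murmur3())                    // extension-function form
    println(MurmurHash3Utils.hashString("mynlp")) // same value via the object
}
```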
*/ 13 | Class value(); 14 | } 15 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/injector/Singleton.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.injector; 2 | 3 | import java.lang.annotation.Retention; 4 | import java.lang.annotation.Target; 5 | 6 | import static java.lang.annotation.ElementType.TYPE; 7 | import static java.lang.annotation.RetentionPolicy.RUNTIME; 8 | 9 | @Retention(RUNTIME) 10 | @Target(TYPE) 11 | public @interface Singleton { 12 | } 13 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/logging/FormattingTuple.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | 18 | package com.mayabot.nlp.common.logging; 19 | 20 | /** 21 | * Holds the results of formatting done by {@link MessageFormatter}. 22 | */ 23 | final class FormattingTuple { 24 | 25 | private final String message; 26 | private final Throwable throwable; 27 | 28 | FormattingTuple(String message, Throwable throwable) { 29 | this.message = message; 30 | this.throwable = throwable; 31 | } 32 | 33 | public String getMessage() { 34 | return message; 35 | } 36 | 37 | public Throwable getThrowable() { 38 | return throwable; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/logging/InternalLogLevel.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * Copyright 2012 The Netty Project 19 | * 20 | * The Netty Project licenses this file to you under the Apache License, 21 | * version 2.0 (the "License"); you may not use this file except in compliance 22 | * with the License. 
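A sketch of how the injector annotations above are meant to be applied. MyService and MyServiceImpl are hypothetical names, and how Injector actually resolves the binding is not shown in these files:

```kotlin
import com.mayabot.nlp.common.injector.ImplementedBy
import com.mayabot.nlp.common.injector.Singleton

// The interface declares its default implementation; the implementation
// is marked as a singleton so the injector can reuse one instance.
@ImplementedBy(MyServiceImpl::class)
interface MyService {
    fun ping(): String
}

@Singleton
class MyServiceImpl : MyService {
    override fun ping() = "pong"
}
```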
You may obtain a copy of the License at: 23 | * 24 | * http://www.apache.org/licenses/LICENSE-2.0 25 | * 26 | * Unless required by applicable law or agreed to in writing, software 27 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 28 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 29 | * License for the specific language governing permissions and limitations 30 | * under the License. 31 | */ 32 | package com.mayabot.nlp.common.logging; 33 | 34 | /** 35 | * The log level that {@link InternalLogger} can log at. 36 | */ 37 | public enum InternalLogLevel { 38 | /** 39 | * 'TRACE' log level. 40 | */ 41 | TRACE, 42 | /** 43 | * 'DEBUG' log level. 44 | */ 45 | DEBUG, 46 | /** 47 | * 'INFO' log level. 48 | */ 49 | INFO, 50 | /** 51 | * 'WARN' log level. 52 | */ 53 | WARN, 54 | /** 55 | * 'ERROR' log level. 56 | */ 57 | ERROR 58 | } 59 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/logging/Log4J2LoggerFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * Copyright 2016 The Netty Project 19 | * 20 | * The Netty Project licenses this file to you under the Apache License, 21 | * version 2.0 (the "License"); you may not use this file except in compliance 22 | * with the License. You may obtain a copy of the License at: 23 | * 24 | * http://www.apache.org/licenses/LICENSE-2.0 25 | * 26 | * Unless required by applicable law or agreed to in writing, software 27 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 28 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 29 | * License for the specific language governing permissions and limitations 30 | * under the License. 31 | */ 32 | package com.mayabot.nlp.common.logging; 33 | 34 | import org.apache.logging.log4j.LogManager; 35 | 36 | public final class Log4J2LoggerFactory extends InternalLoggerFactory { 37 | 38 | public static final InternalLoggerFactory INSTANCE = new Log4J2LoggerFactory(); 39 | 40 | /** 41 | * @deprecated Use {@link #INSTANCE} instead. 42 | */ 43 | @Deprecated 44 | public Log4J2LoggerFactory() { 45 | } 46 | 47 | @Override 48 | public InternalLogger newInstance(String name) { 49 | return new Log4J2Logger(LogManager.getLogger(name)); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/logging/Log4JLoggerFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * Copyright 2012 The Netty Project 19 | * 20 | * The Netty Project licenses this file to you under the Apache License, 21 | * version 2.0 (the "License"); you may not use this file except in compliance 22 | * with the License. You may obtain a copy of the License at: 23 | * 24 | * http://www.apache.org/licenses/LICENSE-2.0 25 | * 26 | * Unless required by applicable law or agreed to in writing, software 27 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 28 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 29 | * License for the specific language governing permissions and limitations 30 | * under the License. 31 | */ 32 | package com.mayabot.nlp.common.logging; 33 | 34 | import org.apache.log4j.Logger; 35 | 36 | /** 37 | * Logger factory which creates an 38 | * Apache Log4J 39 | * logger. 40 | */ 41 | public class Log4JLoggerFactory extends InternalLoggerFactory { 42 | 43 | public static final InternalLoggerFactory INSTANCE = new Log4JLoggerFactory(); 44 | 45 | /** 46 | * @deprecated Use {@link #INSTANCE} instead. 47 | */ 48 | @Deprecated 49 | public Log4JLoggerFactory() { 50 | } 51 | 52 | @Override 53 | public InternalLogger newInstance(String name) { 54 | return new Log4JLogger(Logger.getLogger(name)); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/logging/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * Copyright 2013 The Netty Project 19 | * 20 | * The Netty Project licenses this file to you under the Apache License, 21 | * version 2.0 (the "License"); you may not use this file except in compliance 22 | * with the License. You may obtain a copy of the License at: 23 | * 24 | * http://www.apache.org/licenses/LICENSE-2.0 25 | * 26 | * Unless required by applicable law or agreed to in writing, software 27 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 28 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 29 | * License for the specific language governing permissions and limitations 30 | * under the License. 31 | */ 32 | 33 | /** 34 | * Internal-use-only logging API which is not allowed to be used outside Netty. 
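In-project use of the logging facade above is a static factory lookup per class; which backend ends up behind it is decided by InternalLoggerFactory, and the info(String)-style methods are assumed from the Netty lineage rather than shown here:

```kotlin
import com.mayabot.nlp.common.logging.InternalLogger
import com.mayabot.nlp.common.logging.InternalLoggerFactory

class MyComponent {
    private val logger: InternalLogger = InternalLoggerFactory.getInstance(MyComponent::class.java)

    fun run() {
        logger.info("component started")   // assumed SLF4J-like logging methods
    }
}
```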
35 | */ 36 | package com.mayabot.nlp.common.logging; 37 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/resources/ClasspathNlpResourceFactory.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.resources; 2 | 3 | import java.net.URL; 4 | import java.nio.charset.Charset; 5 | 6 | /** 7 | * 从Claspath下面的路径下加载资源 8 | * 9 | * @author jimichan 10 | */ 11 | public class ClasspathNlpResourceFactory implements NlpResourceFactory { 12 | 13 | private ClassLoader classLoader; 14 | 15 | public ClasspathNlpResourceFactory(ClassLoader classLoader) { 16 | this.classLoader = classLoader; 17 | } 18 | 19 | @Override 20 | public NlpResource load(String resourceName, Charset charset) { 21 | 22 | if (resourceName.startsWith("/")) { 23 | resourceName = resourceName.substring(1); 24 | } 25 | String path = resourceName; 26 | 27 | URL resource = classLoader.getResource(path); 28 | 29 | if (resource != null) { 30 | return new URLNlpResource(resource, charset); 31 | } 32 | 33 | return null; 34 | } 35 | } 36 | 37 | 38 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/resources/FileNlpResourceFactory.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.resources 2 | 3 | import java.io.File 4 | import java.io.IOException 5 | import java.io.InputStream 6 | import java.nio.charset.Charset 7 | 8 | /** 9 | * @author jimichan 10 | */ 11 | class FileNlpResourceFactory(private val baseDir: File) : NlpResourceFactory { 12 | 13 | override fun load(resourceName: String, charset: Charset): NlpResource? { 14 | if (!baseDir.exists() || baseDir.isFile) { 15 | return null 16 | } 17 | 18 | val file = File(baseDir, resourceName.replace('/', File.separatorChar)) 19 | 20 | return if (file.exists() && file.canRead()) { 21 | FileMynlpResource(file, charset) 22 | } else null 23 | } 24 | 25 | class FileMynlpResource(private val file: File, private val charset: Charset) : NlpResource { 26 | @Throws(IOException::class) 27 | override fun inputStream(): InputStream { 28 | return file.inputStream().buffered() 29 | } 30 | 31 | override fun toString(): String { 32 | return file.absolutePath 33 | } 34 | } 35 | 36 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/resources/NlpResource.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.mayabot.nlp.common.resources; 18 | 19 | import com.mayabot.nlp.common.EncryptionUtil; 20 | 21 | import java.io.IOException; 22 | import java.io.InputStream; 23 | 24 | /** 25 | * 读取的模型是基于文本的。一般一行一个数据。 26 | * 项目中和外部系统驳接,比如数据库、HDSF 27 | * 28 | * @author jimichan 29 | */ 30 | public interface NlpResource { 31 | 32 | InputStream inputStream() throws IOException; 33 | 34 | /** 35 | * 有很多实现办法。要么对文件或数据进行计算,还有他同名文件 abc.txt 对应一个文件 abc.txt.hash 进行记录 36 | * 37 | * @return String 38 | */ 39 | default String hash() { 40 | 41 | try { 42 | InputStream inputStream = inputStream(); 43 | 44 | try { 45 | return EncryptionUtil.md5(inputStream); 46 | } finally { 47 | inputStream.close(); 48 | } 49 | } catch (Exception e) { 50 | throw new RuntimeException(e); 51 | } 52 | } 53 | 54 | 55 | } 56 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/resources/NlpResourceFactory.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.resources; 2 | 3 | import kotlin.text.Charsets; 4 | 5 | import java.nio.charset.Charset; 6 | 7 | /** 8 | * 资源文件的来源。比如从文件系统里面的加载,或者从classpath里面去加载 9 | * 10 | * @author jimichan 11 | */ 12 | public interface NlpResourceFactory { 13 | 14 | /** 15 | * 加载资源 16 | * 17 | * @param resourceName 格式为 dict/abc.dict 18 | * @param charset 字符集 19 | * @return 如果资源不存在那么返回NULL 20 | */ 21 | NlpResource load(String resourceName, Charset charset); 22 | 23 | /** 24 | * 加载资源 25 | * 26 | * @param resourceName 27 | * @return NlpResource 28 | */ 29 | default NlpResource load(String resourceName) { 30 | return load(resourceName, Charsets.UTF_8); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/resources/URLNlpResource.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
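Putting the resource abstractions above together: try the classpath first, fall back to a directory on disk, then read through the NlpResource handle. The dict/abc.dict name is just the placeholder from the NlpResourceFactory javadoc, and the data directory is an assumption for the sketch:

```kotlin
import com.mayabot.nlp.common.resources.ClasspathNlpResourceFactory
import com.mayabot.nlp.common.resources.FileNlpResourceFactory
import com.mayabot.nlp.common.resources.NlpResource
import java.io.File

fun findResource(name: String): NlpResource? {
    val fromClasspath =
        ClasspathNlpResourceFactory(Thread.currentThread().contextClassLoader).load(name)
    if (fromClasspath != null) return fromClasspath
    return FileNlpResourceFactory(File("data")).load(name)
}

fun main() {
    val resource = findResource("dict/abc.dict")
    if (resource == null) {
        println("resource not found")
        return
    }
    println(resource.hash())   // md5 of the underlying content
    resource.inputStream().use { input -> println(input.available()) }
}
```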
15 | */ 16 | 17 | package com.mayabot.nlp.common.resources; 18 | 19 | import com.mayabot.nlp.common.logging.InternalLogger; 20 | import com.mayabot.nlp.common.logging.InternalLoggerFactory; 21 | 22 | import java.io.BufferedInputStream; 23 | import java.io.IOException; 24 | import java.io.InputStream; 25 | import java.net.URL; 26 | import java.nio.charset.Charset; 27 | 28 | /** 29 | * @author jimichan 30 | */ 31 | public class URLNlpResource implements NlpResource { 32 | 33 | static InternalLogger logger = InternalLoggerFactory.getInstance(URLNlpResource.class); 34 | 35 | private final URL url; 36 | private final Charset charset; 37 | 38 | public URLNlpResource(URL url, Charset charset) { 39 | this.url = url; 40 | this.charset = charset; 41 | } 42 | 43 | @Override 44 | public InputStream inputStream() throws IOException { 45 | return new BufferedInputStream(url.openStream()); 46 | } 47 | 48 | @Override 49 | public String toString() { 50 | return url.toString(); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/resources/UseLines.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.resources 2 | 3 | import com.mayabot.nlp.common.utils.CharSourceLineReader 4 | import java.io.InputStream 5 | import java.util.function.Consumer 6 | 7 | object UseLines { 8 | 9 | @JvmOverloads 10 | @JvmStatic 11 | fun forEachLine(inputStream: InputStream, 12 | trim: Boolean = true, 13 | skipBlank: Boolean = true, 14 | consumer: Consumer) { 15 | inputStream.bufferedReader().forEachLine { x -> 16 | var line = x 17 | if (trim) { 18 | line = line.trim() 19 | } 20 | if (skipBlank && line.isBlank()) { 21 | 22 | } else { 23 | consumer.accept(line) 24 | } 25 | } 26 | } 27 | 28 | @JvmOverloads 29 | @JvmStatic 30 | fun useLines(inputStream: InputStream, 31 | trim: Boolean = true, 32 | skipBlank: Boolean = true, 33 | consumer: Consumer) { 34 | inputStream.bufferedReader().forEachLine { x -> 35 | var line = x 36 | if (trim) { 37 | line = line.trim() 38 | } 39 | if (skipBlank && line.isBlank()) { 40 | 41 | } else { 42 | consumer.accept(line) 43 | } 44 | } 45 | } 46 | 47 | @JvmStatic 48 | fun lineReader(inputStream: InputStream): CharSourceLineReader { 49 | return CharSourceLineReader(inputStream.bufferedReader(charset = Charsets.UTF_8)) 50 | } 51 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/utils/CharSourceLineReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
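UseLines above trims lines and skips blank ones by default. Because the Consumer parameter comes after the two defaulted flags, it reads most naturally as a named argument from Kotlin (Java callers get the @JvmOverloads overloads instead); lineReader() gives the same data as a closeable iterator:

```kotlin
import com.mayabot.nlp.common.resources.UseLines
import java.util.function.Consumer

fun main() {
    val text = "第一行\n\n   第二行   \n"

    UseLines.forEachLine(text.byteInputStream(Charsets.UTF_8), consumer = Consumer { println(it) })

    UseLines.lineReader(text.byteInputStream(Charsets.UTF_8)).use { reader ->
        for (line in reader) println(line)   // CharSourceLineReader is an Iterator<String>
    }
}
```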
15 | */ 16 | 17 | package com.mayabot.nlp.common.utils; 18 | 19 | 20 | import kotlin.collections.AbstractIterator; 21 | 22 | import java.io.BufferedReader; 23 | 24 | public class CharSourceLineReader extends AbstractIterator implements AutoCloseable { 25 | 26 | private final BufferedReader reader; 27 | 28 | public CharSourceLineReader(BufferedReader reader) { 29 | this.reader = reader; 30 | } 31 | 32 | @Override 33 | protected void computeNext() { 34 | try { 35 | String line = reader.readLine(); 36 | if (line == null) { 37 | done(); 38 | return; 39 | } else { 40 | setNext(line); 41 | return; 42 | // return line; 43 | } 44 | } catch (Exception e) { 45 | throw new RuntimeException(e); 46 | } 47 | 48 | } 49 | 50 | @Override 51 | public void close() { 52 | try { 53 | reader.close(); 54 | } catch (Exception e) { 55 | throw new RuntimeException(e); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/utils/DownloadUtils.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.utils 2 | 3 | import java.io.File 4 | import java.io.IOException 5 | import java.net.URL 6 | import java.util.zip.ZipInputStream 7 | 8 | object DownloadUtils { 9 | 10 | /** 11 | * 下载文件 12 | * 13 | * @param url 14 | * @param file 15 | */ 16 | @Throws(IOException::class) 17 | @JvmStatic 18 | fun download(url: String, file: File) { 19 | //先完全读入到内存中去。然后一次性写入文件 20 | file.writeBytes(URL(url).readBytes()) 21 | } 22 | 23 | /** 24 | * unzip file 25 | * 26 | * @param file 27 | * @throws Exception 28 | */ 29 | @Throws(Exception::class) 30 | @JvmStatic 31 | fun unzip(file: File) { 32 | 33 | ZipInputStream(file.inputStream().buffered()).use { zipInputStream -> 34 | var entry = zipInputStream.nextEntry 35 | 36 | while (entry != null) { 37 | val name = entry.name 38 | 39 | File(file.parent, name).outputStream().buffered().use { 40 | zipInputStream.copyTo(it) 41 | } 42 | 43 | entry = zipInputStream.nextEntry 44 | } 45 | 46 | } 47 | 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/utils/MyInts.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
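DownloadUtils above reads the whole URL into memory before writing, so it suits modest file sizes; unzip() extracts entries next to the archive. The URL and target path below are placeholders:

```kotlin
import com.mayabot.nlp.common.utils.DownloadUtils
import java.io.File

fun main() {
    val zip = File(System.getProperty("java.io.tmpdir"), "mynlp-demo.zip")

    DownloadUtils.download("https://example.com/some-resource.zip", zip)
    DownloadUtils.unzip(zip)   // entries land in the zip's parent directory
}
```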
15 | */ 16 | 17 | package com.mayabot.nlp.common.utils; 18 | 19 | /** 20 | * @author jimichan 21 | */ 22 | public class MyInts { 23 | 24 | 25 | public static byte[] toByteArray(int[] value, int fromIndex, int toIndex) { 26 | toIndex = Math.min(value.length, toIndex); 27 | byte[] bytes = new byte[(toIndex - fromIndex) * 4]; 28 | int point = 0; 29 | for (int i = fromIndex; i < toIndex; i++) { 30 | int v = value[i]; 31 | bytes[point++] = (byte) (v >> 24); 32 | bytes[point++] = (byte) (v >> 16); 33 | bytes[point++] = (byte) (v >> 8); 34 | bytes[point++] = (byte) v; 35 | } 36 | return bytes; 37 | } 38 | 39 | 40 | public static int[] fromByteArrayToArray(byte[] bytes) { 41 | return fromByteArrayToArray(bytes, new int[bytes.length / 4], bytes.length); 42 | } 43 | 44 | public static int[] fromByteArrayToArray(byte[] bytes, int[] result, int bytesLen) { 45 | int intCount = bytesLen / 4; 46 | for (int i = 0, len = intCount; i < len; i++) { 47 | int from = i * 4; 48 | byte b1 = bytes[from++]; 49 | byte b2 = bytes[from++]; 50 | byte b3 = bytes[from++]; 51 | byte b4 = bytes[from++]; 52 | result[i] = b1 << 24 | (b2 & 0xFF) << 16 | (b3 & 0xFF) << 8 | (b4 & 0xFF); 53 | } 54 | return result; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/utils/MynlpFactories.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.utils; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStreamReader; 5 | import java.net.URL; 6 | import java.nio.charset.StandardCharsets; 7 | import java.util.Enumeration; 8 | import java.util.HashMap; 9 | import java.util.List; 10 | import java.util.Map; 11 | 12 | import static com.mayabot.nlp.common.Guava.mutiadd; 13 | 14 | /** 15 | * @author jimichan 16 | */ 17 | public class MynlpFactories { 18 | 19 | public static final String GuiceModule = "GuiceModule"; 20 | 21 | 22 | public static Map> load() throws Exception { 23 | 24 | Map> map = new HashMap<>(); 25 | 26 | { 27 | String[] split1 = System.getProperty(GuiceModule, "").trim().split(","); 28 | for (String k : split1) { 29 | if (!k.isEmpty()) { 30 | mutiadd(map, GuiceModule, Class.forName(k)); 31 | } 32 | } 33 | } 34 | 35 | Enumeration resources = MynlpFactories.class.getClassLoader(). 
36 | getResources("META-INF/mynlp.factories"); 37 | 38 | while (resources.hasMoreElements()) { 39 | URL url = resources.nextElement(); 40 | 41 | BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream(), StandardCharsets.UTF_8)); 42 | 43 | String line = reader.readLine(); 44 | 45 | while (line != null) { 46 | 47 | String[] split = line.split("="); 48 | 49 | if (split.length == 2) { 50 | mutiadd(map, split[0].trim(), Class.forName(split[1].trim())); 51 | } 52 | 53 | line = reader.readLine(); 54 | } 55 | reader.close(); 56 | } 57 | 58 | return map; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/fasttext/FasttextTranUtils.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext 2 | 3 | import com.mayabot.nlp.segment.LexerReader 4 | import com.mayabot.nlp.segment.Lexers 5 | import java.io.File 6 | 7 | class FasttextTranUtils { 8 | 9 | companion object { 10 | 11 | /** 12 | * 处理没有分词的语料 13 | * __label__xxxx 语料文本,语料文本,语料文本 14 | */ 15 | @JvmOverloads 16 | @JvmStatic 17 | fun prepareBySegment(from: File, 18 | to: File, 19 | label: String = "__label__", 20 | lexer: LexerReader = Lexers.coreBuilder().build().filterReader(true, true)) { 21 | 22 | fun processLine(line:String): String{ 23 | val list = ArrayList() 24 | line.split(" ").forEach { part-> 25 | if(part.startsWith(label)){ 26 | list += part 27 | }else{ 28 | lexer.scan(part).toWordSequence().forEach { word-> 29 | list += word 30 | } 31 | } 32 | } 33 | return list.joinToString(" ") 34 | } 35 | 36 | from.useLines { lines-> 37 | to.bufferedWriter(Charsets.UTF_8).use { writer-> 38 | lines.forEach { line-> 39 | writer.write(processLine(line)) 40 | writer.write("\n") 41 | } 42 | } 43 | } 44 | } 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/fasttext/dictionary/LoadDictFromDataInput.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext.dictionary 2 | 3 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/fasttext/loss/NegativeSamplingLoss.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext.loss 2 | 3 | import com.mayabot.nlp.blas.Matrix 4 | import com.mayabot.nlp.common.IntArrayList 5 | import com.mayabot.nlp.fasttext.Model 6 | import kotlin.random.Random 7 | 8 | 9 | class NegativeSamplingLoss(wo: Matrix, val neg: Int, targetCounts: LongArray) : BinaryLogisticLoss(wo) { 10 | companion object { 11 | const val NEGATIVE_TABLE_SIZE = 10000000 12 | } 13 | 14 | val negatives = IntArrayList() 15 | 16 | 17 | val uniform: (random: Random) -> Int 18 | 19 | init { 20 | var z = 0.0 21 | for (i in 0 until targetCounts.size) { 22 | z += Math.pow(targetCounts[i].toDouble(), 0.5) 23 | } 24 | 25 | for (i in 0 until targetCounts.size) { 26 | val c = Math.pow(targetCounts[i].toDouble(), 0.5) 27 | for (j in 0 until (c * NEGATIVE_TABLE_SIZE / z).toInt()) { 28 | negatives.add(i) 29 | } 30 | } 31 | val ns = negatives.size() 32 | //uniform_ = std::uniform_int_distribution(0, negatives_.size()); 33 | uniform = { random -> random.nextInt(ns) } 34 | } 35 | 36 | override fun forward(targets: IntArrayList, targetIndex: Int, state: Model.State, lr: Float, backprop: Boolean): Float { 37 | val target = 
targets[targetIndex] 38 | var loss = binaryLogistic(target, state, true, lr, backprop) 39 | for (n in 0 until neg) { 40 | var negativeTarget = getNegative(target, state.rng) 41 | loss += binaryLogistic(negativeTarget, state, false, lr, backprop) 42 | } 43 | return loss 44 | } 45 | 46 | private fun getNegative(target: Int, rng: Random): Int { 47 | var negative = -1 48 | do { 49 | negative = negatives[uniform(rng)] 50 | } while (target == negative) 51 | return negative 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/fasttext/loss/OneVsAlLoss.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext.loss 2 | 3 | import com.mayabot.nlp.blas.Matrix 4 | import com.mayabot.nlp.common.IntArrayList 5 | import com.mayabot.nlp.fasttext.Model 6 | 7 | class OneVsAlLoss(wo: Matrix) : BinaryLogisticLoss(wo) { 8 | 9 | override fun forward(targets: IntArrayList, t_: Int, state: Model.State, lr: Float, backprop: Boolean): Float { 10 | var loss = 0f 11 | val osz = state.output.length() 12 | for (i in 0 until osz) { 13 | val isMatch = targets.contains(i) 14 | loss += binaryLogistic(i, state, isMatch, lr, backprop) 15 | } 16 | return loss 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/fasttext/loss/SoftmaxLoss.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext.loss 2 | 3 | import com.mayabot.nlp.blas.Matrix 4 | import com.mayabot.nlp.common.IntArrayList 5 | import com.mayabot.nlp.fasttext.Model 6 | import java.lang.Math.max 7 | 8 | class SoftmaxLoss(wo: Matrix) : Loss(wo) { 9 | override fun computeOutput(state: Model.State) { 10 | val output = state.output 11 | 12 | // matrixMulVector(wo, state.hidden, output) 13 | output.mul(wo, state.hidden) 14 | 15 | var max = output[0] 16 | var z = 0.0f 17 | 18 | val osz = output.length() 19 | 20 | for (i in 0 until osz) { 21 | max = max(output[i], max) 22 | } 23 | 24 | for (i in 0 until osz) { 25 | output[i] = kotlin.math.exp((output[i] - max).toDouble()).toFloat() 26 | z += output[i] 27 | } 28 | // 归一化? 
29 | for (i in 0 until osz) { 30 | output[i] = output[i] / z 31 | } 32 | } 33 | 34 | override fun forward(targets: IntArrayList, targetIndex: Int, state: Model.State, lr: Float, backprop: Boolean): Float { 35 | 36 | computeOutput(state) 37 | 38 | val target = targets[targetIndex] 39 | if (backprop) { 40 | val osz = wo.row 41 | for (i in 0 until osz) { 42 | val label = if (i == target) 1.0f else 0.0f 43 | val alpha = lr * (label - state.output[i]) 44 | 45 | state.grad.addRow(wo, i, alpha.toDouble()) 46 | wo.addVectorToRow(state.hidden, i, alpha) 47 | } 48 | 49 | } 50 | 51 | val t = -log(state.output[target]) 52 | return t 53 | } 54 | 55 | 56 | } 57 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/fasttext/utils/ByteUtils.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext.utils;//package blas; 2 | 3 | // 4 | // 5 | public class ByteUtils { 6 | 7 | public static short byte2UInt(byte b) { 8 | return (short) (b & 0xFF); 9 | } 10 | 11 | public static byte short2Byte(short b) { 12 | return (byte) b; 13 | } 14 | 15 | 16 | public static final long readLITTLELong(byte[] readBuffer) { 17 | return (((long) readBuffer[7] << 56) + 18 | ((long) (readBuffer[6] & 255) << 48) + 19 | ((long) (readBuffer[5] & 255) << 40) + 20 | ((long) (readBuffer[4] & 255) << 32) + 21 | ((long) (readBuffer[3] & 255) << 24) + 22 | ((readBuffer[2] & 255) << 16) + 23 | ((readBuffer[1] & 255) << 8) + 24 | ((readBuffer[0] & 255) << 0)); 25 | } 26 | 27 | 28 | // public static void main(String[] args) { 29 | // for (int i = 0; i < 256; i++) { 30 | // byte b = short2Byte((short) i); 31 | // short x = byte2UInt(b); 32 | // System.out.loggerln(b + " = " + x); 33 | // } 34 | // } 35 | } 36 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/fasttext/utils/LogUtils.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext.utils 2 | 3 | private var doLog = true 4 | 5 | fun disableLog() { 6 | doLog = false 7 | } 8 | 9 | fun enableLog() { 10 | doLog = true 11 | } 12 | 13 | fun logger(s: Any) { 14 | if (doLog) print(s) 15 | } 16 | 17 | fun loggerln(s: Any) { 18 | if (doLog) println(s) 19 | } 20 | 21 | fun loggerln() { 22 | if (doLog) println() 23 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/lucene/BaseSynTokenFilter.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.lucene 2 | 3 | import org.apache.lucene.analysis.TokenFilter 4 | import org.apache.lucene.analysis.TokenStream 5 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute 6 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute 7 | import java.util.* 8 | 9 | /** 10 | * 基础类;对词进行扩展 11 | */ 12 | abstract class BaseSynTokenFilter(input: TokenStream) : TokenFilter(input) { 13 | 14 | /** 15 | * 当前词 16 | */ 17 | private val termAtt = addAttribute(CharTermAttribute::class.java) 18 | 19 | /** 20 | * Position Increment 21 | */ 22 | private val positionAttr = addAttribute( 23 | PositionIncrementAttribute::class.java 24 | ) 25 | 26 | private val buffer = LinkedList() 27 | 28 | override fun incrementToken(): Boolean { 29 | 30 | if (buffer.isNotEmpty()) { 31 | val ele = buffer.pollFirst() 32 | termAtt.setEmpty().append(ele) 33 | 
positionAttr.positionIncrement = 0 34 | return true 35 | } 36 | 37 | val hasNext = input.incrementToken() 38 | if (!hasNext) { 39 | return false 40 | } 41 | 42 | val item = termAtt as CharSequence 43 | 44 | val extended = extend(item) 45 | buffer.addAll(extended) 46 | 47 | // buffer 肯定不能是空 48 | termAtt.setEmpty().append(buffer.pollFirst()) 49 | 50 | return true 51 | } 52 | 53 | /** 54 | * 返回的list不能为空,至少要包括自己吧 55 | */ 56 | abstract fun extend(item: CharSequence): List 57 | 58 | override fun reset() { 59 | super.reset() 60 | this.buffer.clear() 61 | } 62 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/lucene/IterableMode.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.lucene; 2 | 3 | 4 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/lucene/MynlpAnalyzer.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.lucene; 2 | 3 | import com.mayabot.nlp.segment.LexerReader; 4 | import com.mayabot.nlp.segment.WordTermIterableMode; 5 | import org.apache.lucene.analysis.Analyzer; 6 | 7 | /** 8 | * @author jimichan 9 | */ 10 | public class MynlpAnalyzer extends Analyzer { 11 | 12 | private final LexerReader lexerReader; 13 | 14 | private WordTermIterableMode mode = WordTermIterableMode.TOP; 15 | 16 | 17 | public MynlpAnalyzer(LexerReader lexerReader) { 18 | this.lexerReader = lexerReader; 19 | } 20 | 21 | public MynlpAnalyzer(LexerReader lexerReader, WordTermIterableMode mode) { 22 | this.lexerReader = lexerReader; 23 | this.mode = mode; 24 | } 25 | 26 | 27 | @Override 28 | protected TokenStreamComponents createComponents(final String fieldName) { 29 | return new TokenStreamComponents(new MynlpTokenizer(lexerReader, mode)); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/nwd/TopCounter.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.nwd 2 | 3 | /** 4 | * @author jimichan 5 | * 统计元素重复最高 6 | */ 7 | class TopCounter(private val size: Int = 2000000, 8 | private var minCount: Int = 10) { 9 | 10 | private var verbose = false 11 | 12 | var data = HashMap(size) 13 | 14 | private var topList = listOf() 15 | 16 | private var lastMinCount = 2 17 | 18 | fun put(key: String) { 19 | 20 | val v = data[key] 21 | if (v == null) { 22 | data[key] = IntCount() 23 | } else { 24 | v.value++ 25 | } 26 | 27 | if (data.size >= size) { 28 | reduce() 29 | } 30 | } 31 | 32 | private fun reduce() { 33 | //1. 
remove count less min 34 | if (verbose) println("清洗前有${data.size}条数据") 35 | 36 | val target = size / 4 //压缩为1/4 37 | 38 | var max = 0 39 | 40 | for (min in lastMinCount until minCount) { 41 | if (data.size > target) { 42 | //data.removeAll { _, value -> value <= min } 43 | data = data.filterTo(HashMap()) { it.value.value > min } 44 | if (verbose) println("删除小于 ${min} 的数量,剩余${data.size}") 45 | max = min 46 | } 47 | } 48 | 49 | lastMinCount = max - 1 50 | if (lastMinCount <= 2) { 51 | lastMinCount = 2 52 | } 53 | 54 | //还超出一半 55 | if (data.size > size / 2) { 56 | minCount++ 57 | } 58 | 59 | if (verbose) println("-".repeat(20)) 60 | } 61 | 62 | fun clean() { 63 | data = data.filterTo(HashMap()) { it.value.value > minCount } 64 | } 65 | 66 | fun getListResult(): List { 67 | clean() 68 | val list = ArrayList(data.size) 69 | 70 | data.forEach { 71 | list += WordCount(it.key, it.value.value) 72 | } 73 | 74 | list.sort() 75 | return list 76 | } 77 | 78 | } 79 | 80 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/nwd/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 新词发现 (new word discovery) 3 | */ 4 | package com.mayabot.nlp.module.nwd; -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/pinyin/CustomPinyin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.pinyin; 2 | 3 | import java.util.Map; 4 | import java.util.TreeMap; 5 | 6 | /** 7 | * @author jimichan 8 | */ 9 | public class CustomPinyin { 10 | 11 | private Map map = new TreeMap<>(); 12 | 13 | public void put(String text, String pinyin) { 14 | map.put(text, pinyin); 15 | } 16 | 17 | public void remove(String text) { 18 | map.remove(text); 19 | } 20 | 21 | public Map getMap() { 22 | return map; 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/pinyin/model/PinyinHead.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | *

19 | * He Han 20 | * hankcs.cn@gmail.com 21 | * 2014/11/6 10:36 22 | * 23 | * 24 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 25 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 26 | * 27 | */ 28 | package com.mayabot.nlp.module.pinyin.model; 29 | 30 | /** 31 | * 拼音输入法头 32 | * 33 | * @author hankcs 34 | */ 35 | public enum PinyinHead { 36 | a, 37 | b, 38 | c, 39 | ch, 40 | d, 41 | e, 42 | f, 43 | g, 44 | h, 45 | j, 46 | k, 47 | l, 48 | m, 49 | n, 50 | o, 51 | p, 52 | q, 53 | r, 54 | s, 55 | sh, 56 | t, 57 | w, 58 | x, 59 | y, 60 | z, 61 | zh, 62 | none, 63 | } 64 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/pinyin/split/PinyinSplitApp.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.pinyin.split 2 | 3 | import com.mayabot.nlp.MynlpEnv 4 | import com.mayabot.nlp.common.injector.Singleton 5 | import com.mayabot.nlp.common.utils.CharNormUtils 6 | import com.mayabot.nlp.perceptron.PerceptronModel 7 | import com.mayabot.nlp.perceptron.PerceptronComputer 8 | import java.io.File 9 | 10 | @Singleton 11 | class PinyinSplitService(env: MynlpEnv) { 12 | 13 | val app = PinyinSplitApp.loadDefault(env) 14 | 15 | fun split(text: String) = app.decodeToWordList(text) 16 | } 17 | 18 | class PinyinSplitApp(val model: PerceptronModel) { 19 | 20 | private val logic = define.modelComputer(model) 21 | 22 | fun decodeToWordList(sentence: String, convert: Boolean = true): List { 23 | val result = ArrayList() 24 | val input = sentence.toCharArray() 25 | if (convert) { 26 | CharNormUtils.convert(input) 27 | } 28 | 29 | val output = logic.decodeModel(input) 30 | 31 | var p = 0 32 | for (i in 0 until output.size) { 33 | val f = output[i] 34 | if (f == "S" || f == "E") { 35 | result += sentence.substring(p, i + 1) 36 | p = i + 1 37 | } 38 | } 39 | if (p < sentence.length) { 40 | result += sentence.substring(p, sentence.length) 41 | } 42 | 43 | return result 44 | } 45 | 46 | companion object { 47 | 48 | const val modelPrefix = "pinyin-split-model" 49 | 50 | val define = PinyinSplitDefinition() 51 | 52 | fun load(file: File): PinyinSplitApp { 53 | return PinyinSplitApp(PerceptronModel.load(file)) 54 | } 55 | 56 | fun loadDefault(env: MynlpEnv): PinyinSplitApp { 57 | return PinyinSplitApp(PerceptronModel.loadFromNlpResource(modelPrefix, env)) 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/trans/Simplified2Traditional.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.trans 2 | 3 | import com.mayabot.nlp.common.injector.Singleton 4 | import java.util.* 5 | 6 | /** 7 | * 简体转繁体的词典 8 | * 9 | * @author jimichan 10 | */ 11 | @Singleton 12 | class Simplified2Traditional : BaseTransformDictionary() { 13 | 14 | override fun loadDictionary(): TreeMap { 15 | return loadFromResource(RS_NAME) 16 | } 17 | 18 | companion object { 19 | private val RS_NAME = "ts-dict/s2t.txt" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/trans/Traditional2Simplified.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.trans 2 | 3 | import 
com.mayabot.nlp.common.injector.Singleton 4 | import java.util.* 5 | 6 | /** 7 | * 繁体转简体的词典 8 | * 9 | * @author jimichan 10 | */ 11 | @Singleton 12 | class Traditional2Simplified : BaseTransformDictionary() { 13 | 14 | override fun loadDictionary(): TreeMap { 15 | return loadFromResource(RS_NAME) 16 | } 17 | 18 | companion object { 19 | 20 | private val RS_NAME = "ts-dict/t2s.txt" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/trans/TransformService.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.trans; 2 | 3 | import com.mayabot.nlp.Mynlp; 4 | 5 | /** 6 | * 繁简体转换 7 | * 8 | * @author jimichan 9 | */ 10 | @Deprecated 11 | public class TransformService { 12 | 13 | private static Mynlp mynlp = Mynlp.instance(); 14 | 15 | /** 16 | * 简体转繁体 17 | * 18 | * @param text 简体文字 19 | * @return 繁体文字 20 | */ 21 | @Deprecated 22 | public static String s2t(String text) { 23 | return mynlp.s2t(text); 24 | } 25 | 26 | /** 27 | * 繁体转简体 28 | * 29 | * @param text 繁体内容 30 | * @return 简体字符串 31 | */ 32 | @Deprecated 33 | public static String t2s(String text) { 34 | return mynlp.t2s(text); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/perceptron/EvaluateFunction.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.perceptron; 2 | 3 | import java.util.List; 4 | 5 | public interface EvaluateFunction { 6 | EvaluateResult evaluate(List sample ); 7 | } 8 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/perceptron/EvaluateResult.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.perceptron 2 | 3 | /** 4 | * 评估结果 5 | */ 6 | data class EvaluateResult( 7 | /** 8 | * 正确率 9 | */ 10 | val precision: Float, 11 | /** 12 | * 召回率 13 | */ 14 | val recall: Float 15 | ) { 16 | 17 | constructor(goldTotal: Int, predTotal: Int, correct: Int) : this( 18 | (correct * 100.0 / predTotal).toFloat(), 19 | (correct * 100.0 / goldTotal).toFloat() 20 | ) 21 | 22 | /** 23 | * F1综合指标 24 | */ 25 | val f1: Float 26 | get() = (2.0 * precision * recall / (precision + recall)).toFloat() 27 | 28 | override fun toString(): String { 29 | return "正确率(P) %.2f , 召回率(R) %.2f , F1 %.2f".format(precision, recall, f1) 30 | } 31 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/CharNormalize.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.segment; 18 | 19 | /** 20 | * 字符规范化接口 21 | *

22 | * 分词之前可以对char进行转换。一般完成大小写、半全角、归一化转换的需求. 23 | * 24 | * @author jimichan 25 | * @see com.mayabot.nlp.segment.common.DefaultCharNormalize 26 | */ 27 | public interface CharNormalize { 28 | 29 | /** 30 | * 对char数组里面的字符进行规范化操作,常见的有最小化和宽体字符处理 31 | * 32 | * @param text 33 | */ 34 | void normal(char[] text); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/KotlinLexers.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import java.io.File 5 | 6 | private val defaultLexer = Mynlp.instance().lexerBuilder() 7 | .bigram().withPersonName().build() 8 | 9 | fun String.segment(): List = defaultLexer.scan(this).toWordList() 10 | fun String.lexer(): Sentence = defaultLexer.scan(this) 11 | 12 | 13 | /** 14 | */ 15 | fun File.segment(outPath: String) { 16 | val lexerReader = defaultLexer.reader() 17 | 18 | val file = File(outPath) 19 | 20 | if (!file.parentFile.exists()) { 21 | file.parentFile.mkdirs() 22 | } 23 | 24 | val lines = inputStream().bufferedReader().lines() 25 | 26 | file.outputStream().bufferedWriter().use { writer -> 27 | lines.filter { it.isNotBlank() } 28 | .map { 29 | lexerReader.scan(it).toWordSequence() 30 | }.forEach { x -> 31 | writer.write(x.joinToString(separator = " ")) 32 | writer.newLine() 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/LexerBuilder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment; 17 | 18 | /** 19 | * Lexer构建器接口 20 | * 21 | * @author jimichan 22 | */ 23 | public interface LexerBuilder { 24 | 25 | /** 26 | * 构建一个Lexer 27 | * 28 | * @return Lexer 29 | */ 30 | Lexer build(); 31 | 32 | } 33 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/Lexers.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.mayabot.nlp.segment; 17 | 18 | import com.mayabot.nlp.Mynlp; 19 | 20 | /** 21 | * Lexer系列便捷方法。 22 | * 23 | * @author jimichan 24 | */ 25 | @Deprecated 26 | public class Lexers { 27 | 28 | /** 29 | * @return FluentLexerBuilder 30 | * @since 3.0.0 31 | */ 32 | public static FluentLexerBuilder builder() { 33 | return Mynlp.instance().lexerBuilder(); 34 | } 35 | 36 | public static Lexer core() { 37 | return coreBuilder() 38 | .withPos() 39 | .withPersonName().build(); 40 | } 41 | 42 | public static FluentLexerBuilder coreBuilder() { 43 | return builder().core(); 44 | } 45 | 46 | public static Lexer perceptron() { 47 | return perceptronBuilder().withPos().build(); 48 | } 49 | 50 | public static FluentLexerBuilder perceptronBuilder() { 51 | return builder().perceptron(); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/SegmentComponent.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment; 17 | 18 | /** 19 | * 分词组件需要有个Name和设置是否启用的 20 | *

21 | * Name : 组件的名称 22 | * Enable : 是否启用 23 | * Order : 排序。 越小越靠前。 24 | * 25 | * @author jimichan 26 | */ 27 | public interface SegmentComponent extends Comparable { 28 | 29 | 30 | /** 31 | * return component name 32 | * 33 | * @return name 34 | */ 35 | String getName(); 36 | 37 | /** 38 | * 组件是否启用。默认返回true,启用 39 | * 40 | * @return enabled 41 | */ 42 | boolean isEnabled(); 43 | 44 | void setEnabled(boolean enable); 45 | 46 | void enable(); 47 | 48 | void disable(); 49 | 50 | int getOrder(); 51 | 52 | void setOrder(int order); 53 | 54 | @Override 55 | default int compareTo(SegmentComponent o) { 56 | return Integer.compare(this.getOrder(), o.getOrder()); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/SegmentModule.kt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment 17 | 18 | import com.mayabot.nlp.MynlpEnv 19 | import com.mayabot.nlp.common.injector.AbstractModule 20 | 21 | class SegmentModule(private val env: MynlpEnv) : AbstractModule() { 22 | 23 | override fun configure() { 24 | // if (env.get(MynlpConfigs.server).isNotBlank()) { 25 | // bind(CoreDictPatch::class.java).toClass(NlpCoreDictPatchClient::class.java) 26 | // } 27 | } 28 | 29 | } 30 | 31 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/WordAndNature.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment; 17 | 18 | /** 19 | * 词和词性访问接口 20 | */ 21 | public interface WordAndNature { 22 | 23 | String getWord(); 24 | 25 | String getNatureName(); 26 | } 27 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/WordSplitAlgorithm.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.segment; 18 | 19 | import com.mayabot.nlp.segment.lexer.bigram.CoreDictionarySplitAlgorithm; 20 | import com.mayabot.nlp.segment.lexer.perceptron.PerceptronSegmentAlgorithm; 21 | import com.mayabot.nlp.segment.plugins.atom.AtomSplitAlgorithm; 22 | import com.mayabot.nlp.segment.wordnet.Wordnet; 23 | import org.jetbrains.annotations.NotNull; 24 | 25 | /** 26 | * 分词算法。 27 | * 分词逻辑基本上是面向字符的处理程序。 28 | * 分词算法的作用是对文本分析后,产生一种或多种分词路径,结果保存在Wordnet数据结构里面。 29 | *

30 | * 1. 基于词典 31 | * 2. 基于字分割 32 | * 3. 基于规则 33 | *

34 | * 在一个具体的分词器中,有可能综合同时使用多个分词算法。 35 | * 36 | * @author jimichan 37 | * @see AtomSplitAlgorithm 38 | * @see PerceptronSegmentAlgorithm 39 | * @see CoreDictionarySplitAlgorithm 40 | * @see com.mayabot.nlp.segment.plugins.personname.PersonNameAlgorithm 41 | */ 42 | public interface WordSplitAlgorithm extends SegmentComponent { 43 | 44 | /** 45 | * 填充Wordnet实例 46 | * 47 | * @param wordnet 48 | */ 49 | void fill(@NotNull Wordnet wordnet); 50 | 51 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/WordpathProcessor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.segment; 18 | 19 | import com.mayabot.nlp.segment.wordnet.Wordpath; 20 | 21 | /** 22 | * Wordpath处理器 23 | * 24 | * @author jimichan 25 | */ 26 | public interface WordpathProcessor extends SegmentComponent { 27 | 28 | /** 29 | * 对传入的Wordpath进行处理,然后返回一个旧的或者新的对象 30 | * 31 | * @param wordPath 32 | * @return 一般对传入的wordPath修改,返回对象本身 33 | */ 34 | Wordpath process(Wordpath wordPath); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/common/BaseSegmentComponent.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.mayabot.nlp.segment.common; 17 | 18 | import com.mayabot.nlp.segment.SegmentComponent; 19 | 20 | /** 21 | * @author jimichan 22 | */ 23 | public abstract class BaseSegmentComponent implements SegmentComponent { 24 | 25 | private boolean enabled = true; 26 | 27 | public static final int LEVEL1 = -1000; 28 | 29 | public static final int LEVEL2 = -500; 30 | 31 | public static final int LEVEL3 = 0; 32 | 33 | public static final int LEVEL4 = 500; 34 | 35 | public static final int LEVEL5 = 1000; 36 | 37 | private int order = LEVEL3; 38 | 39 | public BaseSegmentComponent(int order) { 40 | this.order = order; 41 | } 42 | 43 | @Override 44 | public String getName() { 45 | return this.getClass().getSimpleName(); 46 | } 47 | 48 | @Override 49 | public boolean isEnabled() { 50 | return enabled; 51 | } 52 | 53 | @Override 54 | public void setEnabled(boolean enable) { 55 | this.enabled = enable; 56 | } 57 | 58 | @Override 59 | public void enable() { 60 | this.enabled = true; 61 | } 62 | 63 | @Override 64 | public void disable() { 65 | this.enabled = false; 66 | } 67 | 68 | @Override 69 | public int getOrder() { 70 | return order; 71 | } 72 | 73 | @Override 74 | public void setOrder(int order) { 75 | this.order = order; 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/common/DefaultCharNormalize.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.common; 2 | 3 | import com.mayabot.nlp.common.utils.CharNormUtils; 4 | import com.mayabot.nlp.segment.CharNormalize; 5 | 6 | /** 7 | * 大小转小写。 8 | * 全角转半角,其他字符归一化。 9 | * 10 | * @author jimichan 11 | */ 12 | public class DefaultCharNormalize implements CharNormalize { 13 | @Override 14 | public void normal(char[] text) { 15 | CharNormUtils.convert(text); 16 | } 17 | 18 | public static final DefaultCharNormalize instance = new DefaultCharNormalize(); 19 | } 20 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/common/String2.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.common; 2 | 3 | import org.jetbrains.annotations.NotNull; 4 | 5 | final public class String2 implements CharSequence { 6 | 7 | @NotNull 8 | private char[] chars; 9 | 10 | private int start = 0; 11 | private int end = 0; 12 | 13 | private int len = 0; 14 | 15 | public String2(@NotNull char[] chars) { 16 | this.chars = chars; 17 | start = 0; 18 | this.end = chars.length; 19 | len = chars.length; 20 | } 21 | 22 | public String2(@NotNull char[] chars, int start, int end) { 23 | this.chars = chars; 24 | this.start = start; 25 | this.end = end; 26 | this.len = end - start; 27 | } 28 | 29 | public void setStartEnd(int start, int end) { 30 | this.start = start; 31 | this.end = end; 32 | this.len = end - start; 33 | } 34 | 35 | public int getStart() { 36 | return start; 37 | } 38 | 39 | @Override 40 | public int length() { 41 | return len; 42 | } 43 | 44 | @Override 45 | public char charAt(int index) { 46 | return chars[index + start]; 47 | } 48 | 49 | @Override 50 | public CharSequence subSequence(int start, int end) { 51 | return new String2(chars, this.start + start, this.start + end); 52 | } 53 | 54 | @Override 55 | public String toString() { 56 | return new String(chars, start, len); 57 | } 58 | 59 | } 60 | 
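The component contracts collected above (SegmentComponent for name/order/enable bookkeeping, WordSplitAlgorithm for filling the Wordnet with candidate words, WordpathProcessor for adjusting the selected Wordpath, and BaseSegmentComponent as the shared base class) are easiest to read against a tiny concrete case. The sketch below is an editor's illustration and not a file from the repository: it assumes the Wordnet.getCharArray() / Wordnet.put(offset, vertex) calls and the Vertex(length) constructor used by the bundled algorithms, and the class name DigitRunSplitAlgorithm is invented. It offers every run of ASCII digits as a candidate word and leaves path selection to the configured best-path computer.

import com.mayabot.nlp.segment.WordSplitAlgorithm;
import com.mayabot.nlp.segment.common.BaseSegmentComponent;
import com.mayabot.nlp.segment.wordnet.Vertex;
import com.mayabot.nlp.segment.wordnet.Wordnet;

/**
 * Editor's sketch: a minimal WordSplitAlgorithm that proposes each run of
 * ASCII digits as one candidate word. Not part of the mynlp code base.
 */
public class DigitRunSplitAlgorithm extends BaseSegmentComponent implements WordSplitAlgorithm {

    public DigitRunSplitAlgorithm() {
        // LEVEL3 is the default priority constant defined in BaseSegmentComponent
        super(LEVEL3);
    }

    @Override
    public void fill(Wordnet wordnet) {
        char[] text = wordnet.getCharArray();
        int i = 0;
        while (i < text.length) {
            if (Character.isDigit(text[i])) {
                int start = i;
                while (i < text.length && Character.isDigit(text[i])) {
                    i++;
                }
                // offer the digit run [start, i) as one vertex in the word graph
                wordnet.put(start, new Vertex(i - start));
            } else {
                i++;
            }
        }
    }
}

Because BaseSegmentComponent already supplies the name/enabled/order plumbing, a split algorithm only has to implement fill(); for real input this ground is covered by the algorithms referenced in WordSplitAlgorithm's Javadoc (AtomSplitAlgorithm, CoreDictionarySplitAlgorithm, PerceptronSegmentAlgorithm), so the class above exists purely to show the shape of the contract.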
-------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/common/VertexHelper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.segment.common; 18 | 19 | import com.mayabot.nlp.segment.Nature; 20 | import com.mayabot.nlp.segment.wordnet.Vertex; 21 | 22 | /** 23 | * 顶点管理器 24 | * 25 | * @author jimichan 26 | */ 27 | public abstract class VertexHelper { 28 | 29 | private static final int total = 25146057 / 10; 30 | 31 | /** 32 | * 生成线程安全的起始节点 33 | * begin 34 | * 35 | * @return Begin Vertex 36 | */ 37 | public static Vertex newBegin() { 38 | Vertex v = new Vertex(1); 39 | v.setAbsWordNatureAndFreq(Nature.newWord, total); 40 | return v; 41 | } 42 | 43 | /** 44 | * @return End Vertex 45 | */ 46 | public static Vertex newEnd() { 47 | Vertex v = new Vertex(0); 48 | v.setAbsWordNatureAndFreq(Nature.end, total); 49 | return v; 50 | } 51 | 52 | 53 | } 54 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/bigram/BiGramTableDictionary.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.bigram; 2 | 3 | 4 | import com.mayabot.nlp.common.injector.ImplementedBy; 5 | 6 | /** 7 | * 查询词ID,两个接续ID中间的共现频率。 8 | * 9 | * @author jimichan 10 | */ 11 | @ImplementedBy(value = BiGramTableDictionaryImpl.class) 12 | public interface BiGramTableDictionary { 13 | int getBiFrequency(int idA, int idB); 14 | 15 | public void refresh() throws Exception; 16 | } 17 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/bigram/BiGramTableReader.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.bigram 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.MynlpEnv 5 | import com.mayabot.nlp.common.Guava.split 6 | 7 | class BiGramTableReader(private val env: MynlpEnv) { 8 | constructor(mynlp: Mynlp) : this(mynlp.env) 9 | 10 | fun read(blocker: (String, String, Int) -> Unit) { 11 | 12 | val dictResource = env.loadResource(BiGramTableDictionaryImpl.path) 13 | ?: throw RuntimeException("Not Found dict resource " + BiGramTableDictionaryImpl.path) 14 | 15 | var firstWord: String? = null 16 | 17 | dictResource.inputStream().bufferedReader(Charsets.UTF_8).useLines { lines -> 18 | lines.forEach { line -> 19 | if (line.startsWith("\t")) { 20 | val firstWh = line.indexOf(" ") 21 | val numString = line.substring(1, firstWh) 22 | val num = numString.toInt() 23 | val words = split(line.substring(firstWh + 1), " ") 24 | val wordA = firstWord!! 
25 | 26 | for (wordB in words) { 27 | blocker(wordA, wordB, num) 28 | } 29 | } else { 30 | firstWord = line 31 | } 32 | } 33 | } 34 | 35 | } 36 | } 37 | 38 | fun readCoreBigramTable(blocker: (String, String, Int) -> Unit) { 39 | BiGramTableReader(Mynlp.instance()).read(blocker) 40 | } 41 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/bigram/CoreDictPatch.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.bigram 2 | 3 | import com.mayabot.nlp.common.injector.Singleton 4 | import org.jetbrains.annotations.Nullable 5 | 6 | interface CoreDictPatch { 7 | fun appendDict(): List> 8 | fun deleteDict(): List 9 | fun appendBiGram(): List 10 | fun dictVersion(): String 11 | fun biGramVersion(): String 12 | } 13 | 14 | data class BiGram( 15 | val wordA: String, val wordB: String, val count: Int 16 | ) 17 | 18 | @Singleton 19 | class CoreDictPathWrap { 20 | 21 | @Nullable 22 | val coreDictPatch: CoreDictPatch? = null 23 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/bigram/CoreDictionary.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.bigram; 2 | 3 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieStringIntMap.DATMapMatcherInt; 4 | import com.mayabot.nlp.common.injector.ImplementedBy; 5 | import org.jetbrains.annotations.NotNull; 6 | 7 | /** 8 | * @author jimichan 9 | */ 10 | @ImplementedBy(CoreDictionaryImpl.class) 11 | public interface CoreDictionary { 12 | 13 | /** 14 | * 匹配算法 15 | * 16 | * @param text 17 | * @param offset 18 | * @return DATMapMatcherInt 19 | */ 20 | DATMapMatcherInt match(char[] text, int offset); 21 | 22 | /** 23 | * 词频总量 24 | * 25 | * @return int 词频总量 26 | */ 27 | int totalFreq(); 28 | 29 | void refresh() throws Exception; 30 | 31 | int wordId(char[] chars, int pos, int len); 32 | 33 | int wordId(CharSequence word); 34 | 35 | public int wordFreq(int wordID); 36 | 37 | boolean contains(@NotNull String word); 38 | 39 | int getWordID(String word); 40 | 41 | int size(); 42 | } 43 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/bigram/CoreDictionaryReader.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.bigram 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.MynlpEnv 5 | import com.mayabot.nlp.common.Guava.split 6 | 7 | class CoreDictionaryReader(val env: MynlpEnv) { 8 | 9 | constructor(mynlp: Mynlp) : this(mynlp.env) 10 | 11 | var totalFreq = 0 12 | 13 | fun read(blocker: (String, Int) -> Unit) { 14 | 15 | val dictResource = env.loadResource(CoreDictionaryImpl.path) 16 | ?: throw RuntimeException("Not Found dict resource " + CoreDictionaryImpl.path) 17 | 18 | dictResource.inputStream().bufferedReader(Charsets.UTF_8).useLines { lines -> 19 | lines.forEach { line -> 20 | val param = split(line, " ") 21 | if (param.size == 2) { 22 | val count = Integer.valueOf(param[1]) 23 | blocker(param[0], count) 24 | totalFreq += count 25 | } 26 | } 27 | } 28 | } 29 | } 30 | 31 | fun readCoreDict(blocker: (String, Int) -> Unit) { 32 | CoreDictionaryReader(Mynlp.instance()).read(blocker) 33 | } -------------------------------------------------------------------------------- 
/mynlp/src/main/java/com/mayabot/nlp/segment/lexer/bigram/CoreDictionarySplitAlgorithm.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.bigram; 2 | 3 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieStringIntMap.DATMapMatcherInt; 4 | import com.mayabot.nlp.segment.WordSplitAlgorithm; 5 | import com.mayabot.nlp.segment.common.BaseSegmentComponent; 6 | import com.mayabot.nlp.segment.wordnet.Vertex; 7 | import com.mayabot.nlp.segment.wordnet.Wordnet; 8 | 9 | /** 10 | * 基于核心词典的基础切词器 11 | * 12 | * @author jimichan 13 | */ 14 | public class CoreDictionarySplitAlgorithm extends BaseSegmentComponent implements WordSplitAlgorithm { 15 | 16 | private CoreDictionary coreDictionary; 17 | 18 | 19 | public CoreDictionarySplitAlgorithm(CoreDictionary coreDictionary) { 20 | super(LEVEL1); 21 | this.coreDictionary = coreDictionary; 22 | } 23 | 24 | @Override 25 | public void fill(Wordnet wordnet) { 26 | char[] text = wordnet.getCharArray(); 27 | 28 | // 核心词典查询 29 | DATMapMatcherInt searcher = coreDictionary.match(text, 0); 30 | 31 | while (searcher.next()) { 32 | int offset = searcher.getBegin(); 33 | int length = searcher.getLength(); 34 | int wordId = searcher.getIndex(); 35 | 36 | Vertex v = new Vertex(length, wordId, searcher.getValue()); 37 | 38 | wordnet.put(offset, v); 39 | } 40 | } 41 | 42 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/bigram/HmmLexerPlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.bigram; 2 | 3 | import com.mayabot.nlp.Mynlp; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 5 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 6 | import com.mayabot.nlp.segment.plugins.atom.AtomSplitAlgorithm; 7 | 8 | /** 9 | * @author jimichan 10 | */ 11 | public class HmmLexerPlugin implements PipelineLexerPlugin { 12 | 13 | private CoreDictionary dictionaryMatcher; 14 | 15 | public HmmLexerPlugin(CoreDictionary dictionaryMatcher) { 16 | this.dictionaryMatcher = dictionaryMatcher; 17 | } 18 | 19 | public HmmLexerPlugin(Mynlp mynlp) { 20 | this.dictionaryMatcher = mynlp.getInstance(CoreDictionary.class); 21 | } 22 | 23 | @Override 24 | public void init(PipelineLexerBuilder builder) { 25 | 26 | builder.setBestPathComputer(ViterbiBestPathAlgorithm.class); 27 | 28 | 29 | builder.addWordSplitAlgorithm(new CoreDictionarySplitAlgorithm( 30 | dictionaryMatcher 31 | )); 32 | 33 | builder.addWordSplitAlgorithm(AtomSplitAlgorithm.class); 34 | 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/crf/tokenizer/CrfTokenizerBuilder.java: -------------------------------------------------------------------------------- 1 | //package com.mayabot.nlp.segment.crf.tokenizer; 2 | // 3 | //import com.mayabot.nlp.segment.PipelineTokenizerBuilder; 4 | //import com.mayabot.nlp.segment.crf.tokenizer.CrfBaseSegmentInitializer; 5 | //import com.mayabot.nlp.segment.tokenizer.BaseTokenizerBuilder; 6 | //import com.mayabot.nlp.segment.tokenizer.bestpath.ViterbiBestPathAlgorithm; 7 | //import SentenceCollector; 8 | //import com.mayabot.nlp.segment.tokenizer.xprocessor.CommonSplitAlgorithm; 9 | //import com.mayabot.nlp.segment.tokenizer.xprocessor.CommonRuleWordpathProcessor; 10 | //import 
com.mayabot.nlp.segment.tokenizer.xprocessor.CustomDictionaryProcessor; 11 | //import com.mayabot.nlp.segment.tokenizer.xprocessor.TimeSplitAlgorithm; 12 | // 13 | //public class CrfTokenizerBuilder extends BaseTokenizerBuilder { 14 | // 15 | // 16 | // @Override 17 | // protected void setUp(PipelineTokenizerBuilder builder) { 18 | // 19 | // //wordnet初始化填充 20 | // builder.addWordSplitAlgorithm( 21 | // CrfBaseSegmentInitializer.class, 22 | // CommonSplitAlgorithm.class, 23 | // TimeSplitAlgorithm.class 24 | // ); 25 | // 26 | // //最优路径算法w 27 | // builder.setBestPathComputer(ViterbiBestPathAlgorithm.class); 28 | // 29 | // 30 | // // Pipeline处理器 31 | // builder.addProcessor(CustomDictionaryProcessor.class); 32 | // builder.addProcessor(CommonRuleWordpathProcessor.class); 33 | // 34 | // 35 | // builder.setTermCollector(new SentenceCollector()); 36 | // 37 | // 38 | // } 39 | // 40 | //} 41 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/perceptron/PerceptronSegmentPatch.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.perceptron 2 | 3 | import com.mayabot.nlp.MynlpEnv 4 | import com.mayabot.nlp.common.injector.Singleton 5 | 6 | @Singleton 7 | class PerceptronSegmentPatch 8 | 9 | constructor(val mynlpEnv: MynlpEnv) { 10 | 11 | val examples = ArrayList() 12 | 13 | init { 14 | examples += loadExample("patch/cws-default.txt") 15 | examples += loadExample("patch/cws.txt") 16 | } 17 | 18 | fun addExample(line: String) { 19 | examples += line 20 | } 21 | 22 | fun removeExample(line: String) { 23 | examples.remove(line) 24 | } 25 | 26 | fun addResources(rsName: String) { 27 | examples += loadExample(rsName) 28 | } 29 | 30 | private fun loadExample(rsName: String): List { 31 | val resource = mynlpEnv.tryLoadResource(rsName, Charsets.UTF_8) 32 | if (resource != null) { 33 | return resource.inputStream().bufferedReader().readLines() 34 | .map { it.trim() }.filter { 35 | it.isNotBlank() && !it.startsWith("#") 36 | } 37 | } 38 | return listOf() 39 | } 40 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/perceptron/PerceptronSegmentPlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.perceptron; 2 | 3 | import com.mayabot.nlp.segment.lexer.bigram.ViterbiBestPathAlgorithm; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 5 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 6 | import com.mayabot.nlp.segment.plugins.atom.AtomSplitAlgorithm; 7 | 8 | public class PerceptronSegmentPlugin implements PipelineLexerPlugin { 9 | 10 | @Override 11 | public void init(PipelineLexerBuilder builder) { 12 | 13 | //切词算法 14 | builder.addWordSplitAlgorithm(PerceptronSegmentAlgorithm.class); 15 | 16 | 17 | builder.addWordSplitAlgorithm(AtomSplitAlgorithm.class); 18 | 19 | 20 | //最优路径算法 21 | builder.setBestPathComputer(ViterbiBestPathAlgorithm.class); 22 | 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/perceptron/PerceptronsSegmentService.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.perceptron; 2 | 3 | import com.mayabot.nlp.MynlpConfigs; 4 | import com.mayabot.nlp.MynlpEnv; 5 | 
import com.mayabot.nlp.common.injector.Singleton; 6 | import com.mayabot.nlp.common.logging.InternalLogger; 7 | import com.mayabot.nlp.common.logging.InternalLoggerFactory; 8 | import com.mayabot.nlp.common.resources.NlpResource; 9 | import com.mayabot.nlp.segment.plugins.ner.PerceptronNerService; 10 | 11 | import java.util.List; 12 | 13 | /** 14 | * 感知机分词服务 15 | */ 16 | @Singleton 17 | public class PerceptronsSegmentService { 18 | 19 | private PerceptronSegment ps; 20 | 21 | 22 | static InternalLogger logger = InternalLoggerFactory.getInstance(PerceptronNerService.class); 23 | 24 | public PerceptronsSegmentService(MynlpEnv mynlp, 25 | PerceptronSegmentPatch perceptronSegmentPatch) throws Exception { 26 | 27 | //cws-model or cws-hanlp-model 28 | String modelName = mynlp.get(MynlpConfigs.cwsModelItem); 29 | 30 | long t1 = System.currentTimeMillis(); 31 | NlpResource parameterResource = mynlp.loadResource(modelName + "/parameter.bin"); 32 | NlpResource featureResource = mynlp.loadResource(modelName + "/feature.dat"); 33 | 34 | ps = PerceptronSegment.load( 35 | parameterResource.inputStream(), 36 | featureResource.inputStream()); 37 | 38 | for (String example : perceptronSegmentPatch.getExamples()) { 39 | ps.learn(example); 40 | } 41 | 42 | long t2 = System.currentTimeMillis(); 43 | 44 | logger.info("PerceptronCwsService init use " + (t2 - t1) + " ms"); 45 | } 46 | 47 | public List splitWord(String sentence) { 48 | return ps.decode(sentence); 49 | } 50 | 51 | /** 52 | * 词使用空格分开。 53 | * @param example 54 | */ 55 | public void learn(String example){ 56 | ps.learn(example); 57 | } 58 | 59 | public PerceptronSegment getPerceptron() { 60 | return ps; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/perceptron/inner/Train.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.perceptron.inner 2 | 3 | import com.mayabot.nlp.perceptron.PerceptronComputer 4 | import com.mayabot.nlp.segment.lexer.perceptron.PerceptronSegmentDefinition 5 | import java.io.File 6 | 7 | /** 8 | * 参数 9 | * Iter 150 10 | * thread 2 11 | */ 12 | fun main() { 13 | 14 | val runner = PerceptronComputer(PerceptronSegmentDefinition()) 15 | 16 | // val trainFile = File("data.work/corpus.segment/backoff2005/msr_training.txt") 17 | // val evaluateFile = File("data.work/corpus.segment/backoff2005/msr_test_gold.txt") 18 | // 19 | val trainFile = File("data.work/cws/pku/199801.txt") 20 | val evaluateFile = File("data.work/cws/pku/199802.txt") 21 | 22 | var model = runner.train( 23 | trainFile, 24 | evaluateFile, 25 | 10, 8) 26 | 27 | println("compress") 28 | model = model.compress(0.2, 1e-3) 29 | 30 | println("After compress ...") 31 | val evlResult = runner.evaluateModel(model,evaluateFile) 32 | println(evlResult) 33 | 34 | model.save(File("data.work/cws-model")) 35 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/pipeline/PipelineLexerBuilderKts.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.pipeline 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.segment.FluentLexerBuilder 5 | import com.mayabot.nlp.segment.Lexer 6 | 7 | fun lexerBuilder(blocker: FluentLexerBuilder.() -> Unit): Lexer { 8 | val builder = FluentLexerBuilder(Mynlp.instance()) 9 | builder.blocker() 10 | return builder.build() 11 | } 
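Following the fluent and Kotlin DSL builders above, a short end-to-end usage sketch ties the pieces together. This is an editor's illustration rather than a repository file: the builder chain is assembled from calls that appear elsewhere in this dump (Mynlp.instance().lexerBuilder(), bigram(), withPersonName(), withPos(), build(), scan()); the exact combination and the sample sentence are illustrative only.

import com.mayabot.nlp.Mynlp;
import com.mayabot.nlp.segment.Lexer;

/**
 * Editor's sketch: building a bigram-based Lexer through the fluent builder
 * and scanning one sentence. Not part of the mynlp code base.
 */
public class LexerQuickStart {

    public static void main(String[] args) {
        Lexer lexer = Mynlp.instance()
                .lexerBuilder()
                .bigram()            // dictionary + bigram segmentation pipeline
                .withPersonName()    // enable person-name recognition
                .withPos()           // attach part-of-speech tags
                .build();

        // scan(...) returns a Sentence; printing it shows the segmented words
        System.out.println(lexer.scan("mynlp是一个开源的中文NLP工具包"));
    }
}

The Kotlin helpers in KotlinLexers.kt above build essentially this default lexer once and expose it through String.segment() and String.lexer(), while plugins such as HmmLexerPlugin and PerceptronSegmentPlugin show how the underlying pipeline obtains its split algorithms and best-path computer.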
-------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/pipeline/PipelineLexerPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment.pipeline; 17 | 18 | /** 19 | * @author jimichan 20 | */ 21 | public interface PipelineLexerPlugin { 22 | void init(PipelineLexerBuilder builder); 23 | } 24 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/bestpath/LongpathBestPathAlgorithm.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.bestpath; 2 | 3 | import com.mayabot.nlp.segment.wordnet.BestPathAlgorithm; 4 | import com.mayabot.nlp.segment.wordnet.VertexRow; 5 | import com.mayabot.nlp.segment.wordnet.Wordnet; 6 | import com.mayabot.nlp.segment.wordnet.Wordpath; 7 | 8 | /** 9 | * 前向最大路径算法 10 | * 11 | * @author jimichan 12 | */ 13 | public class LongpathBestPathAlgorithm implements BestPathAlgorithm { 14 | 15 | @Override 16 | public Wordpath select(Wordnet wordnet) { 17 | //从后到前,获得完整的路径 18 | final Wordpath wordPath = new Wordpath(wordnet); 19 | 20 | int point = 0; 21 | final int len = wordnet.length() - 1; 22 | 23 | while (point <= len) { 24 | 25 | VertexRow row = wordnet.row(point); 26 | 27 | int wordLen = row.lastLen(); 28 | if (wordLen == 0) { 29 | wordLen = 1; 30 | } 31 | 32 | wordPath.combine(point, wordLen); 33 | 34 | point += wordLen; 35 | } 36 | 37 | // 最后一个point必定指向start节点 38 | if (point == len) { 39 | throw new IllegalStateException("非完整路径,有可能wordnet初始化的时候就路径不完整"); 40 | } 41 | // Preconditions.checkState(point != len,"非完整路径,有可能wordnet初始化的时候就路径不完整" ); 42 | return wordPath; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/CoreDictSubwordInfoSetup.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.collector; 2 | 3 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieStringIntMap; 4 | import com.mayabot.nlp.segment.lexer.bigram.CoreDictionary; 5 | import com.mayabot.nlp.segment.wordnet.Vertex; 6 | import com.mayabot.nlp.segment.wordnet.Wordnet; 7 | import com.mayabot.nlp.segment.wordnet.Wordpath; 8 | import org.jetbrains.annotations.NotNull; 9 | 10 | /** 11 | * 基于词典的子词补全. 
12 | * 一般在感知机分词器,需要补全 13 | */ 14 | public class CoreDictSubwordInfoSetup implements SubwordInfoSetup { 15 | 16 | private CoreDictionary dictionary; 17 | 18 | public CoreDictSubwordInfoSetup(CoreDictionary dictionary) { 19 | this.dictionary = dictionary; 20 | } 21 | 22 | @Override 23 | public void fill(@NotNull Wordnet wordnet, @NotNull Wordpath wordPath) { 24 | char[] text = wordnet.getCharArray(); 25 | // 核心词典查询 26 | DoubleArrayTrieStringIntMap.DATMapMatcherInt searcher = dictionary.match(text, 0); 27 | 28 | while (searcher.next()) { 29 | int offset = searcher.getBegin(); 30 | int length = searcher.getLength(); 31 | int wordId = searcher.getIndex(); 32 | 33 | Vertex v = new Vertex(length, wordId, searcher.getValue()); 34 | if(!wordnet.row(offset).contains(length)){ 35 | wordnet.put(offset, v); 36 | } 37 | } 38 | } 39 | 40 | 41 | } 42 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/CustomDictSubwordInfoSetup.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.collector; 2 | 3 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieStringIntMap; 4 | import com.mayabot.nlp.segment.plugins.customwords.CustomDictionary; 5 | import com.mayabot.nlp.segment.wordnet.Vertex; 6 | import com.mayabot.nlp.segment.wordnet.Wordnet; 7 | import com.mayabot.nlp.segment.wordnet.Wordpath; 8 | import org.jetbrains.annotations.NotNull; 9 | 10 | /** 11 | * 基于词典的子词补全. 12 | * 一般在感知机分词器,需要补全 13 | */ 14 | public class CustomDictSubwordInfoSetup implements SubwordInfoSetup { 15 | 16 | private CustomDictionary dictionary; 17 | 18 | public CustomDictSubwordInfoSetup(CustomDictionary dictionary) { 19 | this.dictionary = dictionary; 20 | } 21 | 22 | @Override 23 | public void fill(@NotNull Wordnet wordnet, @NotNull Wordpath wordPath) { 24 | DoubleArrayTrieStringIntMap trie = dictionary.getTrie(); 25 | if (trie == null) { 26 | return; 27 | } 28 | char[] text = wordnet.getCharArray(); 29 | DoubleArrayTrieStringIntMap.DATMapMatcherInt searcher = trie.match(text, 0); 30 | 31 | while (searcher.next()) { 32 | int offset = searcher.getBegin(); 33 | int length = searcher.getLength(); 34 | 35 | Vertex v = new Vertex(length, -1, searcher.getValue()); 36 | if (!wordnet.row(offset).contains(length)) { 37 | wordnet.put(offset, v); 38 | } 39 | } 40 | } 41 | 42 | 43 | } 44 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/SentenceCollector.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.collector 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.common.utils.StringUtils 5 | import com.mayabot.nlp.segment.WordTerm 6 | import com.mayabot.nlp.segment.wordnet.Wordnet 7 | import com.mayabot.nlp.segment.wordnet.Wordpath 8 | import java.util.function.Consumer 9 | 10 | /** 11 | * WordTermCollector的默认实现,从各种数据结构中收集和生成词序列 12 | * 13 | * @author jimichan 14 | */ 15 | class SentenceCollector( 16 | private val mynlp: Mynlp, 17 | private val subwordComputer: List = emptyList(), 18 | private val setupList: List = emptyList() 19 | ) : WordTermCollector { 20 | 21 | override fun collect(txtChars: CharArray?, wordnet: Wordnet, wordPath: Wordpath, consumer: Consumer) { 22 | 23 | val vertexIterator = wordPath.iteratorVertex() 24 | 25 | setupList.forEach { 26 | it.fill(wordnet, wordPath) 
27 | } 28 | 29 | while (vertexIterator.hasNext()) { 30 | val vertex = vertexIterator.next() 31 | 32 | val word = if (txtChars == null) { 33 | vertex.realWord() 34 | } else { 35 | String(chars = txtChars, vertex.offset(), vertex.length) 36 | } 37 | 38 | val term = WordTerm(word, vertex.nature, vertex.offset()) 39 | 40 | if (StringUtils.isWhiteSpace(term.word)) { 41 | continue 42 | } 43 | 44 | // 如果运行成功,后面的就不运行了 45 | subwordComputer.forEach { 46 | if (it.run(term, wordnet, wordPath)) { 47 | return@forEach 48 | } 49 | } 50 | 51 | consumer.accept(term) 52 | } 53 | } 54 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/SubwordComputer.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.collector 2 | 3 | import com.mayabot.nlp.segment.WordTerm 4 | import com.mayabot.nlp.segment.wordnet.Wordnet 5 | import com.mayabot.nlp.segment.wordnet.Wordpath 6 | import org.jetbrains.annotations.NotNull 7 | 8 | /** 9 | * 子词切分计算器接口 10 | * 11 | * 从wordnet中计算出子词的所需要的基本信息,计算结果保存在WordTerm的subword字段里面 12 | * @author jimichan 13 | */ 14 | interface SubwordComputer { 15 | 16 | /** 17 | * [term] 一个待切分的子词 18 | * [wordnet] 当前 19 | * @return 如果处理了当前term返回true,没有返回false 20 | */ 21 | fun run( 22 | @NotNull term: WordTerm, 23 | @NotNull wordnet: Wordnet, 24 | @NotNull wordPath: Wordpath 25 | ): Boolean 26 | 27 | } 28 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/SubwordInfoSetup.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.collector 2 | 3 | import com.mayabot.nlp.segment.wordnet.Wordnet 4 | import com.mayabot.nlp.segment.wordnet.Wordpath 5 | 6 | /** 7 | * 感知机、crf等分词,wordnet中没有子词信息。那么通过这个接口在收集结果之前,通过词典新增子词信息。 8 | * @author jimichan 9 | */ 10 | interface SubwordInfoSetup { 11 | fun fill(wordnet: Wordnet, wordPath: Wordpath) 12 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/WordTermCollector.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.collector 2 | 3 | import com.mayabot.nlp.segment.WordTerm 4 | import com.mayabot.nlp.segment.wordnet.Wordnet 5 | import com.mayabot.nlp.segment.wordnet.Wordpath 6 | import java.util.function.Consumer 7 | 8 | /** 9 | * Mynlp WordTerm 收集器 10 | * 11 | * 12 | * 从wordPath、wordnet这两个数据结构中获得最终的分词结果。 13 | * 14 | * 15 | * 通过这个接口,可以让相同的分词器,获得不同的用途的分词结果。 16 | * 17 | * @author jimichan 18 | */ 19 | interface WordTermCollector { 20 | 21 | /** 22 | * 收集分词结果,最终发送到consumer中。 23 | * 这样外面是流水线还是list保存结果,由外部决定。 24 | * 25 | * @param txtChars 词图 26 | * @param KeepChar 词图 27 | * @param wordnet 词图 28 | * @param wordPath 最后的WordPath路径 29 | * @param consumer 接受WordTerm的消费者 30 | */ 31 | fun collect(txtChars:CharArray?,wordnet: Wordnet, wordPath: Wordpath, consumer: Consumer) 32 | 33 | 34 | } 35 | 36 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/correction/CorrectionDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment.plugins.correction; 17 | 18 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieMap; 19 | import com.mayabot.nlp.common.injector.ImplementedBy; 20 | 21 | /** 22 | * 分词纠错词典结构. 23 | * 对外提供一个DoubleArrayTrie 24 | * 25 | * @author jimichan 26 | */ 27 | @ImplementedBy(DefaultCorrectionDictionary.class) 28 | public interface CorrectionDictionary { 29 | 30 | DoubleArrayTrieMap getTrie(); 31 | 32 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/correction/CorrectionPlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.correction; 2 | 3 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 5 | import org.jetbrains.annotations.NotNull; 6 | 7 | /** 8 | * @author jimichan 9 | */ 10 | public class CorrectionPlugin implements PipelineLexerPlugin { 11 | 12 | CorrectionDictionary dictionary = null; 13 | 14 | public CorrectionPlugin(@NotNull CorrectionDictionary dictionary) { 15 | this.dictionary = dictionary; 16 | } 17 | 18 | public CorrectionPlugin() { 19 | } 20 | 21 | @Override 22 | public void init(PipelineLexerBuilder builder) { 23 | 24 | CorrectionDictionary temp = dictionary; 25 | if (temp == null) { 26 | temp = builder.getMynlp().getInstance(CorrectionDictionary.class); 27 | } 28 | 29 | builder.addProcessor(new CorrectionWordpathProcessor(temp)); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/correction/CorrectionWord.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.correction 2 | 3 | /** 4 | * 第几套/房 5 | */ 6 | class CorrectionWord( 7 | val raw: String, 8 | @JvmField 9 | val path: String, 10 | val words: IntArray 11 | ) { 12 | 13 | 14 | override fun toString(): String { 15 | return "CorrectionWord{" + "path='" + path + '\'' + 16 | ", raw='" + raw + '\'' + 17 | ", words=" + words + 18 | '}' 19 | } 20 | 21 | companion object { 22 | // var splitter = Splitter.on("/").trimResults().omitEmptyStrings() 23 | 24 | /** 25 | * 第几套/房 26 | * 27 | * @param line 28 | * @return CorrectionWord 29 | */ 30 | @kotlin.jvm.JvmStatic 31 | fun parse(line: String): CorrectionWord { 32 | 33 | val raw = line.trim() 34 | val list = raw.split("/").map { it.trim() }.filter { it.isNotEmpty() } 35 | val path = list.joinToString("") 36 | val words = list.map { it.length }.toIntArray() 37 | return CorrectionWord(raw, path, words) 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/correction/FileCorrectionDictionary.kt: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment.plugins.correction 17 | 18 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieMap 19 | import com.mayabot.nlp.segment.plugins.correction.CorrectionWord.Companion.parse 20 | import java.io.File 21 | import java.nio.charset.Charset 22 | import java.util.* 23 | 24 | /** 25 | * File版本CorrectionDictionary 26 | * 文件内容格式: 27 | * 第几套/房 28 | * 29 | * 30 | * 一行一个规则 31 | * 32 | * @author jimichan 33 | */ 34 | class FileCorrectionDictionary(file: File, charset: Charset = Charsets.UTF_8) : CorrectionDictionary { 35 | 36 | private val dict: TreeMap = TreeMap() 37 | 38 | private val trie: DoubleArrayTrieMap 39 | 40 | override fun getTrie(): DoubleArrayTrieMap { 41 | return trie 42 | } 43 | 44 | init { 45 | val lines = file.readLines(charset) 46 | for (line in lines) { 47 | val adjustWord = parse(line) 48 | dict[adjustWord.path] = adjustWord 49 | } 50 | trie = DoubleArrayTrieMap(dict) 51 | } 52 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/correction/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 分词纠错 3 | */ 4 | package com.mayabot.nlp.segment.plugins.correction; -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/customwords/CustomDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment.plugins.customwords; 17 | 18 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieStringIntMap; 19 | import com.mayabot.nlp.common.injector.ImplementedBy; 20 | 21 | /** 22 | * 自定义词典结构. 
23 | * 对外提供一个DoubleArrayTrie 24 | * 25 | * @author jimichan 26 | */ 27 | @ImplementedBy(DefaultCustomDictionary.class) 28 | public interface CustomDictionary { 29 | 30 | DoubleArrayTrieStringIntMap getTrie(); 31 | 32 | } 33 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/customwords/CustomDictionaryPlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.customwords; 2 | 3 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 5 | 6 | public class CustomDictionaryPlugin implements PipelineLexerPlugin { 7 | 8 | private CustomDictionary customDictionary; 9 | 10 | public CustomDictionaryPlugin(CustomDictionary customDictionary) { 11 | this.customDictionary = customDictionary; 12 | } 13 | 14 | public CustomDictionaryPlugin() { 15 | } 16 | 17 | 18 | @Override 19 | public void init(PipelineLexerBuilder builder) { 20 | CustomDictionary temp; 21 | if (customDictionary == null) { 22 | temp = builder.getMynlp().getInstance(CustomDictionary.class); 23 | } else { 24 | temp = customDictionary; 25 | } 26 | 27 | builder.addProcessor(new CustomDictionaryProcessor(temp)); 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/customwords/FileCustomDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.mayabot.nlp.segment.plugins.customwords; 18 | 19 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieStringIntMap; 20 | import com.mayabot.nlp.common.Guava; 21 | import com.mayabot.nlp.common.utils.CharNormUtils; 22 | 23 | import java.io.File; 24 | import java.io.IOException; 25 | import java.nio.charset.Charset; 26 | import java.util.List; 27 | import java.util.TreeMap; 28 | 29 | /** 30 | * File版本CustomDictionary 31 | * 不管什么格式 壁式网球 1 32 | * 只取第一段,后面的忽略 33 | * 34 | * @author jimichan 35 | */ 36 | public class FileCustomDictionary implements CustomDictionary { 37 | 38 | private DoubleArrayTrieStringIntMap trie; 39 | 40 | public FileCustomDictionary(File file, Charset charset) throws IOException { 41 | TreeMap dict = new TreeMap(); 42 | 43 | List lines = Guava.readLines(file, charset); 44 | 45 | for (String line : lines) { 46 | 47 | String[] params = line.split("\\s"); 48 | 49 | String w = params[0]; 50 | String n = CharNormUtils.convert(params[0]); 51 | 52 | dict.put(w, 1000); 53 | dict.put(n, 1000); 54 | 55 | } 56 | 57 | trie = new DoubleArrayTrieStringIntMap(dict); 58 | } 59 | 60 | @Override 61 | public DoubleArrayTrieStringIntMap getTrie() { 62 | return trie; 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/ner/NerPlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.ner; 2 | 3 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 5 | import com.mayabot.nlp.segment.plugins.pos.PosPerceptronProcessor; 6 | import com.mayabot.nlp.segment.plugins.pos.PosPlugin; 7 | 8 | /** 9 | * @author jimichan 10 | */ 11 | public class NerPlugin implements PipelineLexerPlugin { 12 | 13 | @Override 14 | public void init(PipelineLexerBuilder builder) { 15 | 16 | //如果不存在那么自行安装Pos模块 17 | if (!builder.existWordPathProcessor(PosPerceptronProcessor.class)) { 18 | builder.install(new PosPlugin()); 19 | } 20 | 21 | builder.addProcessor(NerProcessor.class); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/pattern/PatternPlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.pattern; 2 | 3 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 5 | 6 | import java.util.regex.Pattern; 7 | 8 | /** 9 | * 基于正则表达式的分词插件 10 | * 11 | * @author jimichan 12 | */ 13 | public class PatternPlugin implements PipelineLexerPlugin { 14 | 15 | private Pattern pattern; 16 | 17 | public static PatternPlugin of(Pattern pattern) { 18 | return new PatternPlugin(pattern); 19 | } 20 | 21 | public PatternPlugin(Pattern pattern) { 22 | this.pattern = pattern; 23 | } 24 | 25 | @Override 26 | public void init(PipelineLexerBuilder builder) { 27 | builder.addProcessor(new PatternWordpathProcessor(pattern)); 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/personname/PersonNameAlgorithm.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.personname; 2 | 3 | import 
com.mayabot.nlp.common.injector.Singleton; 4 | import com.mayabot.nlp.segment.Nature; 5 | import com.mayabot.nlp.segment.WordSplitAlgorithm; 6 | import com.mayabot.nlp.segment.common.BaseSegmentComponent; 7 | import com.mayabot.nlp.segment.wordnet.Vertex; 8 | import com.mayabot.nlp.segment.wordnet.Wordnet; 9 | 10 | import java.util.List; 11 | /** 12 | * 采用感知机或者将来CRF制作的人名识别模型。 13 | * 这个切分算法,为了配合词典分词算法。 14 | * 我们在构造词图阶段就提取人名。 15 | */ 16 | @Singleton 17 | public class PersonNameAlgorithm extends BaseSegmentComponent implements WordSplitAlgorithm { 18 | 19 | private final PerceptronPersonNameService service; 20 | 21 | public PersonNameAlgorithm( 22 | PerceptronPersonNameService service) { 23 | super(LEVEL3); 24 | this.service = service; 25 | } 26 | 27 | @Override 28 | public void fill(Wordnet wordnet) { 29 | 30 | char[] charArray = wordnet.getCharArray(); 31 | 32 | List names = service.findName(charArray); 33 | 34 | wordnet.set(PersonNamePlugin.key,names); 35 | 36 | if (!names.isEmpty()) { 37 | for (PersonName name : names) { 38 | 39 | // 人名<=3,可能性高,作为初始词汇。防止被切断。陈宝奇怪别人不好 40 | if (name.getName().length() <= 3) { 41 | //如果已经存在 42 | if (wordnet.row(name.getOffset()).contains(name.getName().length())) { 43 | continue; 44 | } 45 | Vertex v = new Vertex(name.getName().length()); 46 | v.setAbsWordNatureAndFreq(Nature.nr); 47 | wordnet.put(name.getOffset(), v); 48 | } 49 | } 50 | } 51 | 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/personname/PersonNamePlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.personname; 2 | 3 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 5 | 6 | /** 7 | * 人名识别插件。 8 | * 3.1.0 新增了Processor。和PersonNameAlgorithm并存。 9 | * PersonNameAlgorithm处理长度小于等于3的人名。其他的人名,如果没有破坏其他词汇的切分, 10 | * 那么合并和为人名。 11 | * 修复了这种类型的bug 12 | * 阿里/nr 云/u 仓库/n 地址/n 正确/a ,/w 陈宝奇/nr 怪/a 别人/r 不好/a 13 | * 以前会把 阿里云仓 认为是人名。 14 | * 陈宝 奇怪 别人 ,人名又会被忽略的问题。 15 | * @author jimichan 16 | */ 17 | public class PersonNamePlugin implements PipelineLexerPlugin { 18 | 19 | public static final String key = "__person_name__"; 20 | 21 | @Override 22 | public void init(PipelineLexerBuilder builder) { 23 | builder.addWordSplitAlgorithm(PersonNameAlgorithm.class); 24 | builder.addProcessor(PersonNameProcessor.class); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/pos/CommonPosModel.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.pos 2 | 3 | import com.mayabot.nlp.perceptron.PerceptronModel 4 | import com.mayabot.nlp.perceptron.PerceptronComputer 5 | import java.io.File 6 | 7 | /** 8 | * 通用的词性标注。 9 | * 格式 word/pos word/pos 10 | */ 11 | open class CommonPosModel(val labels: Array, 12 | val perceptron: PerceptronModel) { 13 | 14 | init { 15 | perceptron.decodeQuickMode(true) 16 | } 17 | 18 | protected val runner = PerceptronComputer(PosPerceptronDef(labels)) 19 | 20 | /** 21 | * 解码 22 | */ 23 | fun decodeWithIndex(list: List): IntArray { 24 | return runner.decode(perceptron, list) 25 | } 26 | 27 | fun save(dir: File) { 28 | perceptron.save(dir) 29 | } 30 | 31 | fun learn(sample: String) { 32 | runner.learnModel(perceptron,sample) 33 | } 34 | 35 | /** 36 | * 解码 37 | */ 38 | 
fun decode(list: List): List { 39 | val decodeResult = runner.decode(perceptron, list) 40 | return decodeResult.map { labels[it] } 41 | } 42 | 43 | companion object { 44 | 45 | fun train(labels: List, 46 | trainFile: File, 47 | evaluateFile: File?, 48 | iter: Int, 49 | threadNum: Int): PerceptronModel { 50 | val runner = PerceptronComputer(PosPerceptronDef(labels.toTypedArray())) 51 | return runner.train(trainFile, evaluateFile, iter, threadNum, true) 52 | } 53 | } 54 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/pos/PosPerceptronUtils.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.pos 2 | 3 | import com.mayabot.nlp.segment.common.allFiles 4 | import com.mayabot.nlp.segment.common.parseToFlatWords 5 | import java.io.File 6 | 7 | fun main() { 8 | genTrainData() 9 | } 10 | fun genTrainData() { 11 | val cn = File("data.work/corpus/cncorpus") 12 | val pk = File("data.work/corpus/pku") 13 | 14 | fun read(file: File,list: MutableList){ 15 | file.allFiles().forEach { f -> 16 | f.forEachLine { line -> 17 | if (line.isNotBlank()) { 18 | val x = line.parseToFlatWords().filter { it.pos.isNotBlank() }.joinToString(separator = " ") 19 | if(x.isNotBlank()) { 20 | list += x 21 | } 22 | } 23 | } 24 | } 25 | } 26 | 27 | val list = ArrayList() 28 | 29 | read(cn,list) 30 | read(pk,list) 31 | 32 | list.shuffle() 33 | 34 | val out = File("data.work/pos.data") 35 | out.mkdirs() 36 | var k = 0 37 | list.asSequence().chunked(50000).forEach { part-> 38 | k++ 39 | File(out,"part-${k}.txt").writer(Charsets.UTF_8).use { 40 | part.forEach { line-> 41 | it.write(line) 42 | it.write("\n") 43 | } 44 | } 45 | } 46 | 47 | } 48 | 49 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/pos/PosPlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.pos; 2 | 3 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 5 | 6 | /** 7 | * 词性模块 8 | * 9 | * @author jimichan 10 | */ 11 | public class PosPlugin implements PipelineLexerPlugin { 12 | 13 | @Override 14 | public void init(PipelineLexerBuilder builder) { 15 | builder.addProcessor(PosPerceptronProcessor.class); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/reader/BaseFilterLexerReader.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.reader; 2 | 3 | import com.mayabot.nlp.segment.LexerReader; 4 | import com.mayabot.nlp.segment.WordTerm; 5 | import com.mayabot.nlp.segment.WordTermSequence; 6 | 7 | import java.io.Reader; 8 | import java.util.Iterator; 9 | import java.util.function.Predicate; 10 | 11 | public abstract class BaseFilterLexerReader implements LexerReader, Predicate { 12 | 13 | private final LexerReader source; 14 | 15 | private boolean enable = true; 16 | 17 | public BaseFilterLexerReader(LexerReader source) { 18 | this.source = source; 19 | } 20 | 21 | public LexerReader getSource() { 22 | return source; 23 | } 24 | 25 | @Override 26 | public WordTermSequence scan(Reader reader) { 27 | WordTermSequence wts = source.scan(reader); 28 | if (!enable) { 29 | return wts; 30 | } 31 | Iterator 
iterator = wts.iterator(); 32 | Iterator change = new FilterWordItemIterator(iterator, this); 33 | return new WordTermSequence(change); 34 | } 35 | 36 | @Override 37 | public WordTermSequence scan(String text) { 38 | WordTermSequence wts = source.scan(text); 39 | if (!enable) { 40 | return wts; 41 | } 42 | Iterator iterator = wts.iterator(); 43 | Iterator change = new FilterWordItemIterator(iterator, this); 44 | return new WordTermSequence(change); 45 | } 46 | 47 | public boolean isEnable() { 48 | return enable; 49 | } 50 | 51 | public void setEnable(boolean enable) { 52 | this.enable = enable; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/reader/DefaultLexerReader.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.reader; 2 | 3 | import com.mayabot.nlp.segment.Lexer; 4 | import com.mayabot.nlp.segment.LexerReader; 5 | import com.mayabot.nlp.segment.WordTermSequence; 6 | 7 | import java.io.Reader; 8 | 9 | /** 10 | * @author jimichan 11 | */ 12 | public class DefaultLexerReader implements LexerReader { 13 | 14 | private final Lexer lexer; 15 | 16 | public DefaultLexerReader(Lexer lexer) { 17 | this.lexer = lexer; 18 | } 19 | 20 | @Override 21 | public WordTermSequence scan(Reader reader) { 22 | return new WordTermSequence(lexer, reader); 23 | } 24 | 25 | @Override 26 | public WordTermSequence scan(String text) { 27 | return new WordTermSequence(lexer, text); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/wordnet/BestPathAlgorithm.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.segment.wordnet; 18 | 19 | import com.mayabot.nlp.segment.lexer.bigram.ViterbiBestPathAlgorithm; 20 | 21 | /** 22 | * 选择最佳路径接口。具体实现有,viterbi 维特比 dijkstra算法 NShort算法 前向最大路径算法 23 | * 24 | * @author jimichan 25 | * @see ViterbiBestPathAlgorithm 26 | * @see com.mayabot.nlp.segment.plugins.bestpath.LongpathBestPathAlgorithm 27 | */ 28 | public interface BestPathAlgorithm { 29 | 30 | /** 31 | * 从词图网络中选择一条从头到尾的路径 32 | * 33 | * @param wordnet 输入词图 34 | * @return Wordpath 35 | */ 36 | Wordpath select(Wordnet wordnet); 37 | } 38 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/wordnet/package-info.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.wordnet; 2 | 3 | /** 4 | * Wordnet是一个在分词使用的数据结构。 5 | * 所谓词图,指的是句子中所有词可能构成的图。 6 | *

7 | * 这里提供了优化的Wordnet的实现,更快的速度、更低的内存、尽量zero-copy。 8 | *
9 | * 还提供了Wordpath数据结构,wordpath采用bitSet去实现对选中路径的描述,避免和wordnet数据结构的纠缠, 10 | * 让规则程序更容易去进行重新划分词语,为识别器和业务规则的编码带来便利,降低了程序复杂度。 11 | * 12 | * @author jimichan 13 | **/ -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/starspace/Prediction.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.starspace 2 | 3 | import com.mayabot.nlp.blas.Vector 4 | 5 | 6 | data class Prediction(var score: Float, var second: Int) 7 | 8 | class StarSpacePrediction(private val model: StarSpace, basedoc: String?) { 9 | 10 | var baseDocVectors: MutableList = ArrayList() 11 | 12 | var baseDocs: MutableList> = ArrayList() 13 | 14 | init { 15 | val (x, y) = model.loadBaseDocs(basedoc) 16 | baseDocs = y 17 | baseDocVectors = x 18 | } 19 | 20 | fun predictOne(doc: String): List { 21 | return predictOne(model.dict.parseDoc(doc), 5) 22 | } 23 | 24 | fun predictOne(doc: String, k: Int): List { 25 | return predictOne(model.dict.parseDoc(doc), k) 26 | } 27 | 28 | fun predictOne(input: List, k: Int): List { 29 | 30 | val lhsM = model.projectLHS(input) 31 | 32 | val topMax = TopMaxK(k) 33 | 34 | for (i in baseDocVectors.indices) { 35 | val score = model.args.similarity(lhsM, baseDocVectors[i]) 36 | topMax.push(i, score) 37 | } 38 | 39 | return topMax.resort().map { Prediction(it.second, it.first) } 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/starspace/SparseLinear.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.starspace 2 | 3 | import com.mayabot.nlp.blas.DenseMatrix 4 | import com.mayabot.nlp.blas.DenseVector 5 | import com.mayabot.nlp.blas.Vector 6 | 7 | 8 | open class TheMatrix(val matrix: DenseMatrix) { 9 | 10 | fun numRows(): Int { 11 | return matrix.row 12 | } 13 | 14 | fun numCols(): Int { 15 | return matrix.col 16 | } 17 | 18 | } 19 | 20 | 21 | class SparseLinear(matrix: DenseMatrix) : TheMatrix(matrix) { 22 | 23 | fun forward(row: Int): Vector { 24 | return matrix[row] 25 | } 26 | 27 | fun forward(list: List): Vector { 28 | 29 | val vector = DenseVector(this.numCols()) 30 | 31 | for ((row, scale) in list) { 32 | vector += scale to matrix[row] 33 | } 34 | 35 | return vector 36 | } 37 | 38 | } -------------------------------------------------------------------------------- /mynlp/src/main/resources/META-INF/mynlp.factories: -------------------------------------------------------------------------------- 1 | GuiceModule=com.mayabot.nlp.segment.SegmentModule -------------------------------------------------------------------------------- /mynlp/src/main/resources/mynlp/py_hard_code_map.txt: -------------------------------------------------------------------------------- 1 | # - 表示互换 2 | # -> 表示单向变化 3 | # 大概的逻辑是 前面是嘴瓢的说法,后面是正确的音。 4 | # 比如 灰机 --> 飞机 huiji,feiji 5 | hua - fa 6 | huan - fan 7 | hui -> fei 8 | jie -> zhe 9 | kou -> ke 10 | gou -> ge 11 | zhong -> zen 12 | san -> shang 13 | -------------------------------------------------------------------------------- /mynlp/src/main/resources/patch/cws-default.txt: -------------------------------------------------------------------------------- 1 | # ------------------ # 2 | # 感知机分词在线学习补丁 3 | # 随着新版本发现在这里默认修复感知机分词错误案例 4 | # ------------------ # 5 | 6 | X 临时 分居 -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/BM25Test.kt: 
-------------------------------------------------------------------------------- 1 | package com.mayabot.nlp 2 | 3 | import com.mayabot.nlp.similarity.BM25ModelBuilder 4 | 5 | 6 | fun main() { 7 | val doc = listOf( 8 | "黄浦区人民政府在哪", 9 | "黄浦区人民政府要怎么去", 10 | "区政府在哪?", 11 | "区政府怎么走?", 12 | "区人民政府在哪儿", 13 | "黄浦区人民政府的工作地址", 14 | "黄浦区政府在哪", 15 | "人民政府要怎么去", 16 | "人民政府的工作地址", 17 | "黄浦区人民政府的电话是多少", 18 | "黄浦区人民政府的电话号码是多少", 19 | "区政府电话", 20 | "区政府的联系方式", 21 | "黄浦区行政服务中心在哪", 22 | "黄浦区行政服务中心要怎么去", 23 | "行政服务中心在哪", 24 | "行政服务中心要怎么去", 25 | "行政服务中心的工作地址", 26 | "区行政服务中心在哪?", 27 | "黄浦区金融发展服务中心都几点有人", 28 | "黄浦区金融发展服务中心都几点有人在工作", 29 | "我什么时候去金融发展服务中心比较合适", 30 | "金融发展服务中心的工作时间是几点", 31 | "我几点去金融发展服务中心比较合适", 32 | "金融发展服务中心的工作时间是几点", 33 | "金融发展服务中心一周的工作时间都是几点到几点", 34 | "区金融发展服务中心上班时间", 35 | "请问一下金融发展服务中心电话", 36 | "请给我金融发展服务中心的电话", 37 | "请把金融发展服务中心的电话给我", 38 | "区金融发展服务中心的电话", 39 | "区金融发展服务中心的联系方式", 40 | "黄浦区金融发展服务中心在哪", 41 | "黄浦区金融发展服务中心要怎么去", 42 | "区金融发展服务中心在哪?", 43 | "黄浦区金融发展服务中心的工作地址", 44 | "金融发展服务中心在哪", 45 | "金融发展服务中心要怎么去", 46 | "金融发展服务中心的工作地址", 47 | "怎么去", 48 | // "请问一下行政服务的电话" 49 | ) 50 | 51 | val bm25 = BM25ModelBuilder(doc).b(0.75f).build() 52 | 53 | bm25.search("怎么去").forEach { 54 | println(" ${doc[it.docId]} $it") 55 | } 56 | 57 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/Highlight.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp 2 | 3 | import com.mayabot.nlp.module.Highlighter 4 | import com.mayabot.nlp.module.highlight 5 | import org.junit.Assert 6 | import org.junit.Test 7 | 8 | class HighlightTest { 9 | 10 | private val words = listOf("居住证", "居住", "住宅", "hello") 11 | 12 | @Test 13 | fun test() { 14 | 15 | val highlighter = Highlighter(words) 16 | val text = "这个居住证,怎么办,居住和住宅----" 17 | 18 | Assert.assertEquals(highlighter.replace(text), "这个居住证,怎么办,居住住宅----") 19 | } 20 | 21 | @Test 22 | fun test2() { 23 | val highlighter = Highlighter(words, "div") 24 | val text = "这个居住证,怎么办,居住和住宅----" 25 | 26 | Assert.assertEquals(highlighter.replace(text), "这个

居住证
,怎么办,
居住
住宅
----") 27 | } 28 | 29 | @Test 30 | fun test3() { 31 | val text = "这个居住证,怎么办,居住和住宅----" 32 | 33 | val result = text.highlight(words) 34 | 35 | Assert.assertEquals(result, "这个居住证,怎么办,居住住宅----") 36 | } 37 | 38 | /** 39 | * 大小写 40 | */ 41 | @Test 42 | fun test4() { 43 | val text = "Hello word !" 44 | 45 | val result = text.highlight(words) 46 | 47 | Assert.assertEquals("Hello word !", result) 48 | } 49 | 50 | /** 51 | * 大小写 52 | */ 53 | @Test 54 | fun test5() { 55 | val text = "HEllo word !" 56 | 57 | val result = text.highlight(words) 58 | 59 | Assert.assertEquals("HEllo word !", result) 60 | } 61 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/TransTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp; 2 | 3 | import com.mayabot.nlp.Mynlp; 4 | import org.junit.Assert; 5 | 6 | public class TransTest { 7 | 8 | @org.junit.Test 9 | public void test() { 10 | 11 | Mynlp mynlp = Mynlp.instance(); 12 | 13 | String text = "軟件和體育的藝術"; 14 | String text_s = "软件和体育的艺术"; 15 | 16 | Assert.assertTrue(text.equals(mynlp.s2t(text_s))); 17 | 18 | Assert.assertTrue(text_s.equals(mynlp.t2s(text))); 19 | } 20 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/XxHashTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp 2 | 3 | import net.openhft.hashing.LongHashFunction 4 | import org.junit.Assert 5 | import org.junit.Test 6 | 7 | class XxHashTest { 8 | 9 | @Test 10 | fun test() { 11 | //7958582187431989116 12 | val hash = LongHashFunction.xx().hashChars("要闻汲取奋力前行力量李强龚正等参观我们众志成城上海防控新冠肺炎疫情主题展览") 13 | Assert.assertEquals(hash,7958582187431989116) 14 | } 15 | 16 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/commmon/CsrSparseMatrixTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.commmon 2 | 3 | import com.mayabot.nlp.common.TreeBasedTable 4 | import com.mayabot.nlp.common.matrix.CSRSparseMatrix 5 | import org.junit.Assert 6 | import org.junit.Test 7 | 8 | class CsrSparseMatrixTest { 9 | 10 | @Test 11 | fun test() { 12 | val table: TreeBasedTable = TreeBasedTable() 13 | 14 | table.put(2, 0, 6) 15 | table.put(3, 2, 4) 16 | table.put(0, 0, 5) 17 | table.put(0, 3, 2) 18 | table.put(4, 1, 2) 19 | table.put(4, 4, 9) 20 | 21 | val csr = CSRSparseMatrix(table, 5) 22 | 23 | Assert.assertTrue(csr[2, 0] == 6) 24 | Assert.assertTrue(csr[3, 2] == 4) 25 | Assert.assertTrue(csr[0, 0] == 5) 26 | Assert.assertTrue(csr[0, 3] == 2) 27 | Assert.assertTrue(csr[4, 1] == 2) 28 | Assert.assertTrue(csr[4, 4] == 9) 29 | } 30 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/commmon/TokenizerSplitterTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.commmon; 2 | 3 | 4 | import org.junit.Assert; 5 | import org.junit.Test; 6 | 7 | import static com.mayabot.nlp.common.TokenizerSplitter.parts; 8 | 9 | public class TokenizerSplitterTest { 10 | 11 | @Test 12 | public void test() { 13 | System.out.println(); 14 | Assert.assertTrue(parts("").isEmpty()); 15 | Assert.assertEquals(parts(",abc,efg").toString(), "[abc, efg]"); 16 | Assert.assertEquals(parts(",,abc efg.").toString(), "[abc, efg]"); 17 | 
Assert.assertEquals(parts("abcefg").toString(), "[abcefg]"); 18 | Assert.assertEquals(parts("ou may skip through a book, reading only those passages concerned ").toString(), 19 | "[ou, may, skip, through, a, book, reading, only, those, passages, concerned]"); 20 | 21 | Assert.assertEquals(parts("你可以跳读一本书,只拣那些有关的段落读一下即可。").toString(), 22 | "[你可以跳读一本书, 只拣那些有关的段落读一下即可]"); 23 | 24 | // long t1 = System.currentTimeMillis(); 25 | // for (int i = 0; i < 100000; i++) { 26 | // parts("你可以跳读一本书,只拣那些有关的段落读一下即可。"); 27 | // } 28 | // long t2 = System.currentTimeMillis(); 29 | // long time = t2 - t1; 30 | // Assert.assertTrue(time < 5000); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/fasttext/CFtzModelBugTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext 2 | 3 | import java.io.File 4 | 5 | fun main() { 6 | val model = FastText.loadCppModel(File("fastText4j/data/ChineseJapaneseKoreanLangIder.ftz")) 7 | 8 | val list = model.predict(listOf("こんにちは"), 3, 0.1f) 9 | 10 | list.forEach { 11 | println(it) 12 | } 13 | 14 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/fasttext/Java.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext; 2 | 3 | public class Java { 4 | public static void main(String[] args) { 5 | 6 | // FastText fastText = FastText.loadModelFormZip(new File("data/agnews/model.zip")); 7 | // 8 | // System.out.println(); 9 | 10 | // File trainFile = new File("data/agnews/ag.train"); 11 | // InputArgs inputArgs = new InputArgs(); 12 | // inputArgs.setLoss(LossName.softmax); 13 | // inputArgs.setLr(0.1); 14 | // inputArgs.setDim(100); 15 | // inputArgs.setEpoch(20); 16 | // 17 | // FastText model = FastText.trainSupervised(trainFile, inputArgs); 18 | // 19 | // model.test(new File("data/agnews/ag.test"),1,0,true); 20 | // 21 | // model.predict() 22 | 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/fasttext/SupTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext 2 | 3 | import com.mayabot.nlp.fasttext.args.InputArgs 4 | import com.mayabot.nlp.fasttext.loss.LossName 5 | import java.io.File 6 | 7 | 8 | val trainFile = File("fastText4j/data/agnews/ag.train") 9 | val testFile = File("fastText4j/data/agnews/ag.test") 10 | 11 | fun main() { 12 | 13 | val args = InputArgs().apply { 14 | this.loss = LossName.softmax 15 | lr = 0.1 16 | dim = 100 17 | minn = 0 18 | maxn = 0 19 | } 20 | 21 | // var fastText = FastText.trainSupervised(trainFile, args) 22 | // 23 | // fastText = fastText.quantize() 24 | // 25 | // fastText.saveModelToSingleFile(File("fastText4j/data/model.fjbin")) 26 | 27 | // fastText.saveModel("fasttext/data/agnews/model") 28 | // 29 | // val qFastText = fastText.quantize(dsub = 10) 30 | // qFastText.saveModel("fasttext/data/agnews/model.q") 31 | ////// 32 | //fastText.test(testFile, 1) 33 | // qFastText.test(testFile, 1) 34 | 35 | // val fastText = FastText.loadModel(File("fasttext/data/agnews/model"),true) 36 | // val fastText = FastText.loadCppModel(File("fasttext/data/agnews/model.ftz")) 37 | val fastText = FastText.loadModelFromSingleFile(File("fastText4j/data/model.fjbin")) 38 | fastText.test(testFile, 1) 39 | 40 | } 41 | 
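The fastText test files above (SupTest.kt, TestSup.kt, Java.java) exercise the supervised pipeline only in fragments, with much of the train/quantize/save flow commented out. A minimal end-to-end sketch, assuming only the calls that already appear in those files (trainSupervised, quantize, saveModelToSingleFile, loadModelFromSingleFile, test, predict) and using placeholder file paths:

import com.mayabot.nlp.fasttext.FastText
import com.mayabot.nlp.fasttext.args.InputArgs
import com.mayabot.nlp.fasttext.loss.LossName
import java.io.File

fun main() {
    // Training arguments, mirroring SupTest.kt / Java.java above.
    val args = InputArgs().apply {
        loss = LossName.softmax
        lr = 0.1
        dim = 100
        epoch = 20
    }

    // Train a supervised classifier; the input is a fastText-style labelled text file.
    val model = FastText.trainSupervised(File("data/agnews/ag.train"), args)

    // Optionally quantize to shrink the model, then persist it as a single file.
    val quantized = model.quantize()
    quantized.saveModelToSingleFile(File("data/agnews/model.fjbin"))

    // Reload, evaluate and predict, as SupTest.kt and CFtzModelBugTest.kt do.
    val loaded = FastText.loadModelFromSingleFile(File("data/agnews/model.fjbin"))
    loaded.test(File("data/agnews/ag.test"), 1)
    println(loaded.predict(listOf("sample", "tokens"), 3, 0.0f))
}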
-------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/fasttext/TestCModelFTZ.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext 2 | 3 | import java.io.File 4 | 5 | fun main() { 6 | val file = File("/Users/jimichan/Downloads/ChineseJapaneseKoreanLangIder.ftz") 7 | 8 | val model = FastText.loadCppModel(file) 9 | 10 | val x = model.predict(listOf("hello", "hi"), 1, 0.0f) 11 | println(x) 12 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/fasttext/TestSup.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext 2 | 3 | import com.mayabot.nlp.fasttext.args.InputArgs 4 | import com.mayabot.nlp.fasttext.args.ModelName 5 | import com.mayabot.nlp.fasttext.loss.LossName 6 | import com.mayabot.nlp.fasttext.utils.disableLog 7 | import java.io.File 8 | 9 | class TestSup { 10 | 11 | val trainFile = File("data/agnews/ag.train") 12 | val testFile = File("data/agnews/ag.test") 13 | 14 | fun testSub(){ 15 | disableLog() 16 | val lossNames = listOf(LossName.softmax,LossName.ns,LossName.hs,LossName.ova) 17 | 18 | lossNames.forEach { loss-> 19 | check(test(loss)){ 20 | "Loss Name ${loss.name} ERROR" 21 | } 22 | } 23 | } 24 | 25 | 26 | fun test(lossName: LossName) : Boolean { 27 | // val trainSources = listOf(loadTrainFile("ag.train.txt")) 28 | // val testSources = loadTrainFile("ag.test.txt") 29 | 30 | val trainArgs = InputArgs() 31 | trainArgs.loss = lossName 32 | 33 | val fastText = FastText.train(trainFile, ModelName.sup, trainArgs) 34 | 35 | fastText.quantize() 36 | 37 | val meter = fastText.test(testFile) 38 | 39 | return meter.f1Score() > 0.9 40 | } 41 | 42 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/fasttext/TestWords.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext 2 | 3 | import com.mayabot.nlp.blas.cosine 4 | import java.io.File 5 | 6 | fun main() { 7 | 8 | // val file = File("/Users/jimichan/Downloads/wiki.zh.bin") 9 | // 10 | // val fastText = FastText.loadCppModel(file) 11 | 12 | val fastText = FastText.loadModel(File("/Users/jimichan/mynlp.data/wordvec.vec"), true) 13 | 14 | println("加载模型到内存完成") 15 | 16 | // val k = fastText.nearestNeighbor("丢失",5) 17 | 18 | fastText.like("丢", "丢失") 19 | fastText.like("遗落", "丢失") 20 | fastText.like("偷走", "丢失") 21 | fastText.like("遗失", "丢失") 22 | fastText.like("遗失", "遗落") 23 | fastText.like("失去", "丢失") 24 | fastText.like("上海", "丢失") 25 | fastText.like("挂失", "补办") 26 | 27 | println("----------------") 28 | fastText.senLike("卡 丢失 了", "卡 被 偷走 了") 29 | fastText.senLike("卡 丢失 了", "信用卡 忘记 密码 ") 30 | 31 | // println(fastText.analogies("柏林","德国","法国",5)) 32 | 33 | } 34 | 35 | private fun FastText.like(word1: String, word2: String) { 36 | val cos = cosine(this.getWordVector(word1), this.getWordVector(word2)) 37 | println("$word1 <-> $word2 : ${cos}") 38 | } 39 | 40 | private fun FastText.senLike(word1: String, word2: String) { 41 | val cos = cosine(this.getSentenceVector(word1.split(" ")), this.getSentenceVector(word2.split(" "))) 42 | println("$word1 <-> $word2 : ${cos}") 43 | } 44 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/fasttext/Utils.kt: 
-------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext 2 | 3 | import com.mayabot.nlp.fasttext.train.MemSampleLineList 4 | import com.mayabot.nlp.fasttext.train.SampleLine 5 | 6 | fun loadTrainFile( resouceName:String ) : MemSampleLineList{ 7 | 8 | val path = "/"+resouceName 9 | 10 | val ins = TestSup::class.java.getResourceAsStream(path) 11 | 12 | val list = ArrayList() 13 | ins.bufferedReader().lines().forEach { 14 | list += it 15 | } 16 | 17 | val x = list.map { SampleLine(it.split(" ").toList()) }.toMutableList() 18 | 19 | return MemSampleLineList(x) 20 | 21 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/module/lucene/LuceneUtils.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.lucene 2 | 3 | import org.apache.lucene.analysis.TokenStream 4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute 5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute 6 | 7 | fun TokenStream.iterable():Iterable{ 8 | return Iterable { 9 | TokenStreamIterator(this) 10 | } 11 | } 12 | 13 | class TokenStreamIterator(private val tokenStream:TokenStream) : AbstractIterator() { 14 | init { 15 | tokenStream.reset() 16 | } 17 | 18 | private val charTermAttr = tokenStream.getAttribute(CharTermAttribute::class.java) 19 | private val offsetAttr = tokenStream.getAttribute(OffsetAttribute::class.java) 20 | 21 | override fun computeNext() { 22 | val hasNext = tokenStream.incrementToken() 23 | if (hasNext) { 24 | this.setNext(charTermAttr.toString()) 25 | }else{ 26 | tokenStream.end() 27 | tokenStream.close() 28 | done() 29 | } 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/module/lucene/TestPinyinTokenizer.kt: -------------------------------------------------------------------------------- 1 | //package com.mayabot.nlp.module.lucene 2 | // 3 | //import com.mayabot.nlp.Mynlp 4 | //import org.apache.lucene.analysis.tokenattributes.CharTermAttribute 5 | //import org.apache.lucene.analysis.tokenattributes.OffsetAttribute 6 | //import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute 7 | //import org.junit.Test 8 | //import java.io.StringReader 9 | // 10 | //class TestPinyinTokenizer { 11 | // 12 | // val pinyin = Mynlp.instance().pinyin(); 13 | // 14 | // @Test 15 | // fun test() { 16 | // val tok = PinyinAnalyzer(pinyin,true,true,false) 17 | // 18 | // tok.pinyinTokens("飞机").forEach { 19 | // println(it) 20 | // } 21 | // } 22 | // 23 | // @Test 24 | // fun test2() { 25 | // val tok = PinyinAnalyzer(pinyin,true,false,false) 26 | // 27 | // tok.pinyinTokens("三个 小猪").forEach { 28 | // println(it) 29 | // } 30 | // } 31 | // 32 | // private fun PinyinAnalyzer.pinyinTokens(text:String):List { 33 | // val tk = this.tokenStream("title",text) 34 | // tk.reset() 35 | // 36 | // val charTermAttr = tk.getAttribute(CharTermAttribute::class.java) 37 | // val offsetAttr = tk.getAttribute(OffsetAttribute::class.java) 38 | // val posAttr = tk.getAttribute(PositionIncrementAttribute::class.java) 39 | // val list = ArrayList() 40 | // 41 | // while (tk.incrementToken()) { 42 | // list += Item(charTermAttr.toString(),offsetAttr.startOffset(),offsetAttr.endOffset(),posAttr.positionIncrement) 43 | // } 44 | // tk.end() 45 | // tk.close() 46 | // return list 47 | // } 48 | // 49 | // data class Item( 
50 | // val py:String, 51 | // val offsetStart:Int, 52 | // val offsetEnd:Int, 53 | // val inc:Int, 54 | // ) 55 | //} -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/pa/GeleiCode.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.pa 2 | 3 | import kotlin.math.pow 4 | 5 | 6 | //void GrayCode(int n, string *data) 7 | //{ 8 | // if (n == 1)//终止条件,先生成1位的格雷码 9 | // { 10 | // data[0] = "0"; 11 | // data[1] = "1"; 12 | // return; 13 | // } 14 | // GrayCode(n - 1, data);//生成n位的格雷码首先需要生成n-1的格雷码 15 | // int len = (int)pow(2, n); 16 | // for (int i = len / 2; i < len; i++)//先处理后半部分,注意对称 17 | // { 18 | // data[i] = "1" + data[len - i - 1]; 19 | // } 20 | // for (int i = 0; i < len / 2; i++)//对于前半部分直接+'0' 21 | // { 22 | // data[i] = "0" + data[i]; 23 | // } 24 | 25 | fun grapCode(n: Int, data: Array) { 26 | if (n == 1) { 27 | data[0] = "0" 28 | data[1] = "1" 29 | return 30 | } 31 | grapCode(n - 1, data) 32 | val len = 2.0.pow(n).toInt() 33 | for (i in len / 2 until len) { 34 | data[i] = "1" + data[len - i - 1] 35 | } 36 | for (i in 0 until len / 2) { 37 | data[i] = "0" + data[i] 38 | } 39 | } 40 | 41 | fun main() { 42 | val n = 5 43 | val data = Array(2.0.pow(n * 1.0).toInt()) { null } 44 | grapCode(n, data) 45 | 46 | var i = 0 47 | for (line in data) { 48 | println("$i\t" + line!!.padStart(n, '0')) 49 | i++ 50 | } 51 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/pinyin/PinyinDistance.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.pinyin 2 | 3 | import com.mayabot.nlp.module.pinyin.PinyinDistance 4 | import org.junit.Test 5 | 6 | class PinyinDistance { 7 | 8 | @Test 9 | fun test() { 10 | PinyinDistance.distance("灰机", "飞机") 11 | PinyinDistance.distance("粉丝", "大侠") 12 | PinyinDistance.distance("粉丝中", "大侠梦") 13 | } 14 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/pinyin/PinyinTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.pinyin 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.Mynlp.Companion.instance 5 | import org.junit.Assert 6 | import org.junit.Test 7 | 8 | class PinyinTest { 9 | 10 | @Test 11 | fun test() { 12 | Assert.assertEquals("[zhao, zhao, mu, mu]", "朝朝暮暮".py()) 13 | } 14 | 15 | @Test 16 | fun test2() { 17 | println( 18 | instance().convertPinyin("转战") 19 | .fuzzy(true).asList() 20 | ) 21 | } 22 | 23 | // @Test 24 | // fun test3() { 25 | // var pinyin = Mynlp.instance().pinyin() 26 | // for (py in pinyin.charPinyin('行')) { 27 | // println(py) 28 | // } 29 | // } 30 | 31 | private fun String.py() = Mynlp.instance().convertPinyin(this).asList().toString() 32 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/CmbSegment.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import org.junit.Test 4 | 5 | /** 6 | * 招行分词需求 7 | */ 8 | class CmbSegment { 9 | 10 | @Test 11 | fun test() { 12 | val text = "" + 13 | "2018年年度收入\n" + 14 | "2018年收入\n" + 15 | "17年账单\n" + 16 | "我要找1到3个月出入账\n" + 17 | "周一到周三花了多少钱\n" + 18 | "最近三天花了多少钱\n" + 19 | "最近一周转账记录\n" + 20 | "6月账单\n" + 21 | "半年流水\n" + 22 | "二月份明细账\n" + 23 | "最近6个月全部账单\n" + 24 | 
"一年流水\n" + 25 | "四个月流水\n" + 26 | "四月份收入\n" + 27 | "上一月支出\n" + 28 | "6月1号账单\n" + 29 | "6月28号流水\n" + 30 | "这是陈汝烨和张帆副院长的生日" 31 | 32 | // 1. 自定义词库 33 | // 2. 人工纠错规则 34 | 35 | val tokenizer = Lexers.core() 36 | 37 | for (line in text.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) { 38 | print(line + "\t") 39 | 40 | println(tokenizer.scan(line)) 41 | } 42 | } 43 | 44 | 45 | } 46 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/CombineTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | /** 7 | * CoreTokenizer自带Combine逻辑,不再需要后置处理了。 8 | */ 9 | public class CombineTest { 10 | 11 | @Test 12 | public void test() { 13 | 14 | Lexer tokenizer = Lexers.core(); 15 | 16 | String test = "体重182kg\n" + 17 | "五十八公斤\n" + 18 | "产品编号BN-598\n" + 19 | "产品编号BN-598-122N\n" + 20 | "我买了一台very cool iPhone7\n" + 21 | "分词标签是__lable__"; 22 | 23 | 24 | String[] result = ("体重 182kg\n" + 25 | "五十八公斤\n" + 26 | "产品 编号 bn-598\n" + 27 | "产品 编号 bn-598-122n\n" + 28 | "我 买 了 一台 very cool iphone7\n" + 29 | "分词 标签 是 __lable__").split("\n"); 30 | 31 | int i = 0; 32 | for (String text : test.split("\n")) { 33 | String t = tokenizer.scan(text).toPlainString(); 34 | Assert.assertTrue(t + "--->" + result[i], t.equals(result[i].toLowerCase())); 35 | i++; 36 | } 37 | 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/CustomDictTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import com.mayabot.nlp.segment.plugins.customwords.MemCustomDictionary 4 | import org.junit.Test 5 | 6 | class CustomDictTest { 7 | 8 | 9 | @Test 10 | fun test() { 11 | val mem = MemCustomDictionary() 12 | mem.addWord("长江1号"); 13 | mem.addWord("ECS固收"); 14 | mem.addWord("固收"); 15 | mem.rebuild() 16 | 17 | mem.clear() 18 | 19 | mem.addWord("固收"); 20 | mem.rebuild() 21 | 22 | val lexer = Lexers.coreBuilder() 23 | .withCustomDictionary(mem) 24 | .customSentenceCollector { 25 | it.smartSubword() 26 | it.fillCustomDict(mem) 27 | } 28 | .build() 29 | 30 | println(lexer.scan("ECS固收")) 31 | println("----") 32 | lexer.scan("ECS固收").forEach { w -> 33 | println(w.subword) 34 | } 35 | } 36 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/IndexSegmentTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment; 2 | 3 | import com.mayabot.nlp.segment.plugins.collector.SentenceCollectorBuilder; 4 | import org.junit.Assert; 5 | import org.junit.Test; 6 | 7 | public class IndexSegmentTest { 8 | 9 | 10 | @Test 11 | public void test() { 12 | 13 | Lexer mynlpTokenizer = Lexers. 
14 | coreBuilder() 15 | .customSentenceCollector(SentenceCollectorBuilder::indexSubword) 16 | .build(); 17 | 18 | String str = mynlpTokenizer.scan("中华人民共和国的利益").toString(); 19 | 20 | Assert.assertEquals("[中华 华人 人民 人民共和国 共和 共和国] 的 利益",str); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/KeepOriCharOutputTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import org.junit.Assert 4 | import org.junit.Test 5 | 6 | class KeepOriCharOutputTest { 7 | 8 | @Test 9 | fun test(){ 10 | val lerxer = Lexers.coreBuilder() 11 | .keepOriCharOutput() 12 | .build() 13 | Assert.assertEquals("看看 下面 这 中文 逗号 , Keep 大小写", 14 | lerxer.scan("看看下面这中文逗号,Keep 大小写").toPlainString() 15 | ) 16 | } 17 | 18 | @Test 19 | fun test2(){ 20 | val lerxer = Lexers.coreBuilder() 21 | .build() 22 | Assert.assertEquals("看看 下面 这 中文 逗号 , keep 大小写", 23 | lerxer.scan("看看下面这中文逗号,Keep 大小写").toPlainString() 24 | ) 25 | } 26 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/KotlinTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | 4 | fun main() { 5 | println("科学之门".lexer()) 6 | println("录音曝光!朴槿惠就职总统前 听崔顺实90分钟指导".segment()) 7 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/PosTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment; 2 | 3 | 4 | public class PosTest { 5 | 6 | public static void main(String[] args) { 7 | 8 | // PerceptronPosService service = Mynlps.instanceOf(PerceptronPosService.class); 9 | // List words = Lists.newArrayList("第三 章".split(" ")); 10 | // List pos = service.pos(words); 11 | // 12 | // for (int i = 0; i < words.size(); i++) { 13 | // System.out.println(words.get(i)+"/"+pos.get(i)); 14 | // } 15 | 16 | System.out.println(Lexers.core().scan("第三章,章先生")); 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/SegmentErrorCasesTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import org.junit.Test 4 | 5 | /** 6 | * 收集分词异常报错 7 | */ 8 | class SegmentErrorCasesTest { 9 | 10 | @Test 11 | fun carwords() { 12 | val tokenizer = Lexers.core() 13 | val lines = arrayOf( 14 | "你好离合器片的生产日期是2013-05-034S回复人635110101001", 15 | "第一次维修更换中间轴前轴承和倒档惰轮总成第二次是20170年6", 16 | "六万一千公里", 17 | "此车20171年12月19号来我站报修前照灯进水", 18 | "我站一辆宝骏5602017年2月16日到我站反映六档挡不进档") 19 | 20 | 21 | for (s in lines) { 22 | println(s) 23 | println(tokenizer.scan(s)) 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/SubwordTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.segment.plugins.collector.DefaultSubwordRuleDict 5 | import org.junit.Test 6 | 7 | class SubwordTest { 8 | 9 | @Test 10 | fun test() { 11 | val mynlp = Mynlp.instance() 12 | 13 | val x = DefaultSubwordRuleDict() 14 | x.add("副/市长") 15 | x.rebuild() 16 | 17 | val lexer = mynlp.lexerBuilder() 18 | .hmm() 19 | 
.withPos() 20 | .customSentenceCollector { 21 | it.smartSubword() 22 | // it.ruleBaseSubword(listOf(x)) 23 | } 24 | .build() 25 | 26 | println(lexer.scan("这是副市长的快递").toList()) 27 | } 28 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/Test.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.segment.plugins.correction.MemCorrectionDictionary 5 | 6 | fun main() { 7 | val mynlp = Mynlp.instance() 8 | val mem = MemCorrectionDictionary() 9 | 10 | mem.addRule("近期/待还") 11 | mem.rebuild() 12 | 13 | // val lexer = mynlp.lexerBuilder() 14 | // .bigram() 15 | // .withPos() 16 | // .withPersonName() 17 | // .collector().smartPickup { 18 | // it.setBlackListCallback { 19 | // it[0] == '副' && it[it.length - 1] == '长' 20 | // } 21 | // } 22 | // .done() 23 | // .withCorrection(mem) 24 | // .build() 25 | // 26 | // lexer.scan("近期待还").forEach { 27 | // print(it) 28 | // println("\t has sub " + it.hasSubword()) 29 | // } 30 | 31 | 32 | //default core 33 | // val lexer2 = Lexers.coreBuilder() 34 | // .withPersonName() 35 | //// .withPos() 36 | // .collector().smartPickup() 37 | // .done() 38 | // .build() 39 | // 40 | // println(lexer2.scan("基础设施")) 41 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/TestPosAndSubWord.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import org.junit.Assert 5 | import org.junit.Test 6 | 7 | /** 8 | * 同时开启词性和subword,导致词性失效 9 | */ 10 | class TestPosAndSubWord { 11 | 12 | @Test 13 | fun test() { 14 | val mynlp = Mynlp.instance() 15 | 16 | val lexer = mynlp.lexerBuilder().hmm() 17 | .withPos() 18 | .customSentenceCollector { 19 | it.smartSubword() 20 | it.fillCoreDict() 21 | } 22 | .build() 23 | 24 | val result = lexer.scan("这次是北京大学拿到第一名").toString() 25 | Assert.assertEquals("这次/r 是/v [北京 大学]/nt 拿到/v 第一名/mq",result) 26 | } 27 | 28 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/atom/AtomSplitAlgorithmTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.atom 2 | 3 | import com.mayabot.nlp.Mynlps 4 | import com.mayabot.nlp.segment.plugins.atom.AtomSplitAlgorithm 5 | import com.mayabot.nlp.segment.wordnet.Wordnet 6 | import org.junit.Test 7 | 8 | 9 | class SimpleTest { 10 | @Test 11 | fun unitTestingWorks() { 12 | val text = listOf("这个是你jimi@mayabot.com邮箱地址么2017-10-12", 13 | "你的ipad3么 ,最近三天花了多少钱 a-ff -102 @163.com,一万八千八百八十八,FM98.1,jimi@mayabot.com,周一下午九点钟,一九九八年三月,2018年2月2日,2013年,周一下午三点半有个重量为11225.6公斤,123234" 14 | ) 15 | val atom = Mynlps.instanceOf(AtomSplitAlgorithm::class.java) 16 | val atom2 = Mynlps.instanceOf() 17 | 18 | text.forEach { line -> 19 | val wordnet = Wordnet(line.toCharArray()) 20 | 21 | atom.fill(wordnet) 22 | println(wordnet.toMoreString()) 23 | } 24 | 25 | 26 | } 27 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/lexer/perceptron/NERPerceptronTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.perceptron 2 | 3 | import com.mayabot.nlp.segment.Lexers 4 
| import com.mayabot.nlp.segment.plugins.ner.NERPerceptron 5 | import com.mayabot.nlp.segment.plugins.ner.NERPerceptronTrainer 6 | import com.mayabot.nlp.segment.plugins.ner.PerceptronNerService 7 | import java.io.File 8 | 9 | 10 | object NERPerceptronTest { 11 | 12 | @JvmStatic 13 | fun main(args: Array) { 14 | // train() 15 | test() 16 | } 17 | 18 | fun train() { 19 | val trainer = NERPerceptronTrainer() 20 | 21 | val trainFile = File("data.work/ner") 22 | val evaluateFile = File("data.work/ner-test/ner_1.txt") 23 | 24 | val model = trainer.train( 25 | trainFile, evaluateFile, 26 | 130, 1) 27 | 28 | model.save(File("data.work/ner.model")) 29 | } 30 | 31 | fun test() { 32 | // val evaluateFile = File("data/pku/199802.txt") 33 | 34 | val tokenizer = Lexers.core() 35 | val text = "这是陈建国的快递,来自上海万行信息科技有限公司的报告" 36 | 37 | val termList = tokenizer.scan(text).toList() 38 | 39 | 40 | val ner = NERPerceptron.load(File("data.work/ner.model")) 41 | 42 | 43 | println(termList) 44 | 45 | ner.decode(termList) 46 | 47 | println(PerceptronNerService.toNerComposite(termList).joinToString(separator = ",")) 48 | 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/lexer/perceptron/POSPerceptronTest.kt: -------------------------------------------------------------------------------- 1 | //package com.mayabot.nlp.segment.lexer.perceptron 2 | // 3 | //import com.mayabot.nlp.segment.plugins.pos.POSPerceptronTrainer 4 | //import com.mayabot.nlp.utils.CharNormUtils 5 | //import java.io.File 6 | // 7 | //fun main(args: Array) { 8 | // val model = POSPerceptronTrainer().train(File("data/pku/199801.txt"), File("data/cncorpus/cncorpus_9.txt"), 1, 1) 9 | // model.save(File("data/pos/model")) 10 | // 11 | // 12 | //////// 13 | //// println(model.decode("陈汝烨")) 14 | // 15 | //// val model = POSPerceptron.load(File("data/pos/model")) 16 | // val words = "陈汝烨 余额宝 的 规模 增长 一直 呈现 不断 加速 , 的 状态".split(" ") 17 | //// 18 | ////// val train = POSPerceptronTrainer() 19 | ////// train.train(File("data/pku"),1,4) 20 | ////// val sampleList = train.loadSamples(File("data/pku").allFiles()) 21 | ////// val eva = POSEvaluateRunner(0, sampleList) 22 | ////// eva.run(model.model) 23 | //// 24 | //// 25 | // val words2 = CharNormUtils.convert("陈汝烨 陈勤勤 余额宝 的 规模 增长 一直 呈现 不断 加速 , 的 状态 四十 年 , 我 的 心里 从未 这么 安静 过").split(" ") 26 | // val result = model.decode(words2) 27 | // println(words2.zip(result)) 28 | // 29 | //// val lines = File("data/pos/model/feature.txt").readLines() 30 | //// 31 | //// var index = DoubleArrayTrie(lines) 32 | //// 33 | //// 34 | //// println(index.wordId("望京☺")) 35 | //} 36 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/lexer/perceptron/PerceptronNerServiceTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.perceptron; 2 | 3 | import com.mayabot.nlp.Mynlps; 4 | import com.mayabot.nlp.segment.Sentence; 5 | import com.mayabot.nlp.segment.plugins.ner.PerceptronNerService; 6 | 7 | import java.util.List; 8 | 9 | public class PerceptronNerServiceTest { 10 | 11 | 12 | public static void main(String[] args) { 13 | PerceptronNerService ner = Mynlps.instanceOf(PerceptronNerService.class); 14 | PerceptronsSegmentService cws = Mynlps.instanceOf(PerceptronsSegmentService.class); 15 | 16 | 17 | List words = 
cws.splitWord("悦胜公司成立之初系杭州市体育发展集团(杭州市体育局所属事业单位)下属的全资子公司,主要经营体育事业相关业务,后为服务2018年第14届FINA世界游泳锦标赛,增资扩股为国有控股公司。\n" + 18 | "\n" + 19 | "\n"); 20 | 21 | Sentence ner1 = ner.ner(words); 22 | 23 | System.out.println(ner1); 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/lexer/perceptron/PerceptronServiceTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.perceptron 2 | 3 | object PerceptronServiceTest { 4 | 5 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/ner/OrgTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.ner; 2 | 3 | import com.mayabot.nlp.segment.Lexer; 4 | import com.mayabot.nlp.segment.Lexers; 5 | import com.mayabot.nlp.segment.utils.TokenizerTestHelp; 6 | import org.junit.Test; 7 | 8 | public class OrgTest { 9 | 10 | @Test 11 | public void test() { 12 | { 13 | String text = "这|是|上海|万|行|信息|科技|有限公司|的|财务|报表"; 14 | 15 | Lexer tokenizer = Lexers.coreBuilder() 16 | .build(); 17 | 18 | TokenizerTestHelp.test(tokenizer, text); 19 | } 20 | 21 | 22 | { 23 | String text = "这|是|上海万行信息科技有限公司|的|财务|报表"; 24 | 25 | Lexer tokenizer = Lexers.coreBuilder() 26 | .withNer() 27 | .build(); 28 | 29 | 30 | TokenizerTestHelp.test(tokenizer, text); 31 | } 32 | 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/ner/PersonNameTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.ner 2 | 3 | import com.mayabot.nlp.segment.Lexers 4 | import com.mayabot.nlp.segment.utils.TokenizerTestHelp 5 | import org.junit.Test 6 | 7 | class PersonNameTest { 8 | 9 | @Test 10 | fun test() { 11 | run { 12 | val text = "这|是|陈|建国|的|快递" 13 | 14 | val tokenizer = Lexers.builder().core() 15 | .build() 16 | 17 | 18 | TokenizerTestHelp.test(tokenizer, text) 19 | } 20 | 21 | 22 | run { 23 | val text = "这|是|陈建国|的|快递" 24 | 25 | val tokenizer = Lexers.builder().core() 26 | .withPersonName().build() 27 | 28 | TokenizerTestHelp.test(tokenizer, text) 29 | } 30 | } 31 | 32 | @Test 33 | fun test2() { 34 | val tokenizer = Lexers.perceptronBuilder().core() 35 | .withPersonName().build() 36 | 37 | val strings = arrayOf("先后视察了华鑫海欣楼宇党建(群团)服务站和江阴顺天村项目", "签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。", "武大靖创世界纪录夺冠,中国代表团平昌首金", "区长庄木弟新年致辞", "朱立伦:两岸都希望共创双赢 习朱历史会晤在即", "陕西首富吴一坚被带走 与令计划妻子有交集", "据美国之音电台网站4月28日报道,8岁的凯瑟琳·克罗尔(凤甫娟)和很多华裔美国小朋友一样,小小年纪就开始学小提琴了。她的妈妈是位虎妈么?", "凯瑟琳和露西(庐瑞媛),跟她们的哥哥们有一些不同。", "王国强、高峰、汪洋、张朝阳光着头、韩寒、小四", "张浩和胡健康复员回家了", "王总和小丽结婚了", "编剧邵钧林和稽道青说", "这里有关天培的有关事迹", "先后视察了华鑫海欣楼宇党建(群团)服务站和江阴顺天村项目", "龚学平等领导说,邓颖超生前杜绝超生") 38 | 39 | for (line in strings) { 40 | println(line + "\n") 41 | println(tokenizer.scan(line)) 42 | println("\n") 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/ner/PlaceTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.ner; 2 | 3 | import com.mayabot.nlp.segment.Lexer; 4 | import com.mayabot.nlp.segment.Lexers; 5 | import com.mayabot.nlp.segment.utils.TokenizerTestHelp; 6 | import org.junit.Test; 7 | 8 | public class PlaceTest { 9 | 10 | @Test 11 | public void test() { 12 
| 13 | 14 | { 15 | String text = "中央|大街|浪漫|永|存"; 16 | 17 | Lexer tokenizer = Lexers.coreBuilder() 18 | 19 | .build(); 20 | 21 | TokenizerTestHelp.test(tokenizer, text); 22 | } 23 | 24 | 25 | { 26 | String text = "中央大街|浪漫|永|存"; 27 | 28 | Lexer tokenizer = Lexers.coreBuilder() 29 | .withNer() 30 | .build(); 31 | 32 | 33 | TokenizerTestHelp.test(tokenizer, text); 34 | } 35 | 36 | 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/utils/TokenizerTestHelp.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.utils; 2 | 3 | import com.mayabot.nlp.common.Guava; 4 | import com.mayabot.nlp.segment.Lexer; 5 | import org.junit.Assert; 6 | 7 | public class TokenizerTestHelp { 8 | 9 | /** 10 | * 测试分词器 11 | * 输入文本的格式 你好|世界 12 | * 输入分词器是会把|去除掉 13 | * 14 | * @param tokenizer 15 | * @param text 16 | * @return 17 | */ 18 | public static void test( 19 | Lexer tokenizer, 20 | String text) { 21 | 22 | text = text.trim(); 23 | 24 | String input = text.replace("|", ""); 25 | 26 | 27 | String out = Guava.join(tokenizer.scan(input).toWordList(), "|"); 28 | 29 | Assert.assertTrue("Out is " + out + " ,Input " + text, text.equalsIgnoreCase(out)); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /mynlp/src/test/resources/GrapCode.txt: -------------------------------------------------------------------------------- 1 | 0 00000 2 | 1 00001 3 | 2 00011 4 | 3 00010 5 | 4 00110 6 | 5 00111 7 | 6 00101 8 | 7 00100 9 | 8 01100 10 | 9 01101 11 | 10 01111 12 | 11 01110 13 | 12 01010 14 | 13 01011 15 | 14 01001 16 | 15 01000 17 | 16 11000 18 | 17 11001 19 | 18 11011 20 | 19 11010 21 | 20 11110 22 | 21 11111 23 | 22 11101 24 | 23 11100 25 | 24 10100 26 | 25 10101 27 | 26 10111 28 | 27 10110 29 | 28 10010 30 | 29 10011 31 | 30 10001 32 | 31 10000 -------------------------------------------------------------------------------- /settings.gradle.kts: -------------------------------------------------------------------------------- 1 | rootProject.name = "mynlp" 2 | 3 | include("mynlp", "mynlp-all", "mynlp-example", "mynlp-experimental") --------------------------------------------------------------------------------
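The correction and custom-dictionary plugins listed earlier (CorrectionWord, FileCorrectionDictionary, CustomDictionary, CustomDictionaryPlugin) are easiest to see in a short usage sketch. The calls below are limited to APIs the test files in this dump already use (CorrectionWord.parse, MemCustomDictionary, Lexers.coreBuilder().withCustomDictionary); the input strings are illustrative only:

import com.mayabot.nlp.segment.Lexers
import com.mayabot.nlp.segment.plugins.correction.CorrectionWord
import com.mayabot.nlp.segment.plugins.customwords.MemCustomDictionary

fun main() {
    // A correction rule is a '/'-separated path: parse() keeps the joined text as `path`
    // and the length of each segment in `words`, e.g. "第几套/房" -> path "第几套房", words [3, 1].
    val rule = CorrectionWord.parse("第几套/房")
    println(rule)

    // Custom words live in a DoubleArrayTrie-backed dictionary; the tests above always
    // call rebuild() after addWord() so the lexer sees the new entries.
    val dict = MemCustomDictionary()
    dict.addWord("ECS固收")
    dict.rebuild()

    val lexer = Lexers.coreBuilder()
        .withCustomDictionary(dict)
        .build()
    println(lexer.scan("ECS固收"))
}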