├── .github └── workflows │ └── gradle.yml ├── .gitignore ├── CHANGES.md ├── LICENSE ├── README.adoc ├── build.gradle.kts ├── doc ├── advanced.adoc ├── fasttext.adoc ├── highlight │ ├── highlight.min.js │ └── styles │ │ └── github.min.css ├── images │ ├── WordSplitAlgorithm.png │ ├── WordpathProcessor.png │ ├── cli.jpg │ ├── crf_model.jpg │ ├── fasttext-c.png │ ├── lexer.png │ ├── mynlp-pipeline.png │ ├── pipelineLexer.jpg │ ├── weixin.jpeg │ ├── worddict.png │ ├── wordnet-ds.png │ ├── wordnet-framework.jpg │ ├── wordnet-g.png │ ├── wordnet.png │ └── wordpath.png ├── lexer.adoc ├── modules.adoc ├── mynlp-docinfo-footer.html ├── mynlp.adoc ├── mynlp.docx ├── other.adoc ├── perceptron.adoc ├── started.adoc └── update.sh ├── gradle.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── licenses ├── Ansj-LICENSE.txt ├── ApacheCommonsCli-LICENSE.txt ├── DimSim-LICENSE.txt ├── FastText-LICENSE.txt ├── Hanlp-LICENSE.txt └── StartSpace-LICENSE.txt ├── mynlp-all └── build.gradle.kts ├── mynlp-example ├── build.gradle.kts └── src │ ├── main │ └── java │ │ ├── Demo.java │ │ ├── classification │ │ └── HotelCommentExampleTrain.java │ │ ├── pinyin │ │ └── PinyinExample.java │ │ ├── segment │ │ ├── CombineExample.java │ │ ├── CoreSegment.java │ │ ├── CustomSegment.java │ │ ├── HowFast.java │ │ └── UseStreamApi.java │ │ ├── starspace │ │ └── AgNews.kt │ │ └── transform │ │ └── TraditionalExample.java │ └── test │ └── java │ └── TestHighlight.java ├── mynlp-experimental ├── .gitignore └── build.gradle.kts ├── mynlp ├── build.gradle.kts ├── shell │ └── mynlp.sh └── src │ ├── main │ ├── java │ │ └── com │ │ │ └── mayabot │ │ │ └── nlp │ │ │ ├── Mynlp.kt │ │ │ ├── MynlpBuilder.java │ │ │ ├── MynlpConfigs.kt │ │ │ ├── MynlpEnv.java │ │ │ ├── algorithm │ │ │ ├── HMM.kt │ │ │ ├── Heap.kt │ │ │ ├── TopIntMinK.kt │ │ │ ├── TopMaxK.java │ │ │ ├── Viterbi.kt │ │ │ ├── collection │ │ │ │ ├── Trie.java │ │ │ │ ├── ahocorasick │ │ │ │ │ ├── AhoCoraickDoubleArrayTrieBuilder.java │ │ │ │ │ ├── AhoCorasickDoubleArrayTrie.java │ │ │ │ │ ├── Hit.java │ │ │ │ │ ├── IHit.java │ │ │ │ │ ├── IHitFull.java │ │ │ │ │ └── State.java │ │ │ │ ├── bintrie │ │ │ │ │ ├── AbstractTrieNode.java │ │ │ │ │ ├── ArrayTrieNode.java │ │ │ │ │ ├── BinTrieNode.java │ │ │ │ │ ├── BinTrieTree.java │ │ │ │ │ ├── BinTrieTreeBuilder.java │ │ │ │ │ ├── HashTrieNode.java │ │ │ │ │ ├── TrieTreeAllMatcher.java │ │ │ │ │ ├── TrieTreeForwardMaxMatcher.java │ │ │ │ │ └── TrieTreeMatcher.java │ │ │ │ └── dat │ │ │ │ │ ├── DATLongMatcher.java │ │ │ │ │ ├── DATMapMatcher.java │ │ │ │ │ ├── DATMatcher.java │ │ │ │ │ ├── DoubleArrayMaker.java │ │ │ │ │ ├── DoubleArrayTrie.java │ │ │ │ │ ├── DoubleArrayTrieMap.java │ │ │ │ │ ├── DoubleArrayTrieStringIntMap.java │ │ │ │ │ └── FastDatCharSet.java │ │ │ └── distance │ │ │ │ ├── JaroWinklerDistance.java │ │ │ │ ├── LevenshteinDistance.java │ │ │ │ ├── NGramDistance.java │ │ │ │ └── StringDistance.java │ │ │ ├── blas │ │ │ ├── BlasUtils.kt │ │ │ ├── DenseArrayMatrix.kt │ │ │ ├── DenseVector.kt │ │ │ ├── Matrix.kt │ │ │ └── Vector.kt │ │ │ ├── character │ │ │ └── ChineseCharInfos.kt │ │ │ ├── cli │ │ │ └── MynlpCli.kt │ │ │ ├── common │ │ │ ├── ArraySizingStrategy.java │ │ │ ├── BoundedProportionalArraySizingStrategy.java │ │ │ ├── BufferedReaderLFCR.java │ │ │ ├── EncryptionUtil.java │ │ │ ├── FastCharReader.java │ │ │ ├── FastStringBuilder.java │ │ │ ├── Guava.kt │ │ │ ├── IntArrayList.kt │ │ │ ├── LongArrayList.kt │ │ │ ├── Pair.java │ │ │ ├── 
ParagraphIterable.kt │ │ │ ├── ParagraphReader.java │ │ │ ├── ParagraphReaderSmart.java │ │ │ ├── ParagraphReaderString.java │ │ │ ├── SettingItem.java │ │ │ ├── Settings.java │ │ │ ├── TagAndScore.java │ │ │ ├── TokenizerSplitter.java │ │ │ ├── cli │ │ │ │ ├── AlreadySelectedException.java │ │ │ │ ├── AmbiguousOptionException.java │ │ │ │ ├── BasicParser.java │ │ │ │ ├── CommandLine.java │ │ │ │ ├── CommandLineParser.java │ │ │ │ ├── DefaultParser.java │ │ │ │ ├── GnuParser.java │ │ │ │ ├── HelpFormatter.java │ │ │ │ ├── MissingArgumentException.java │ │ │ │ ├── MissingOptionException.java │ │ │ │ ├── Option.java │ │ │ │ ├── OptionBuilder.java │ │ │ │ ├── OptionGroup.java │ │ │ │ ├── OptionValidator.java │ │ │ │ ├── Options.java │ │ │ │ ├── ParseException.java │ │ │ │ ├── Parser.java │ │ │ │ ├── PatternOptionBuilder.java │ │ │ │ ├── PosixParser.java │ │ │ │ ├── TypeHandler.java │ │ │ │ ├── UnrecognizedOptionException.java │ │ │ │ ├── Util.java │ │ │ │ └── package-info.java │ │ │ ├── hash │ │ │ │ ├── ByteUtils.java │ │ │ │ ├── MessageDigests.java │ │ │ │ ├── MurmurHash3.java │ │ │ │ └── MurmurHash3Kotlin.kt │ │ │ ├── hppc │ │ │ │ ├── BufferAllocationException.java │ │ │ │ ├── CharObjectHashMap.java │ │ │ │ ├── CharObjectMap.java │ │ │ │ └── IntArrayList.java │ │ │ ├── injector │ │ │ │ ├── BeanFactory.java │ │ │ │ ├── ImplementedBy.java │ │ │ │ ├── Injector.kt │ │ │ │ └── Singleton.java │ │ │ ├── logging │ │ │ │ ├── AbstractInternalLogger.java │ │ │ │ ├── CommonsLogger.java │ │ │ │ ├── CommonsLoggerFactory.java │ │ │ │ ├── FormattingTuple.java │ │ │ │ ├── InternalLogLevel.java │ │ │ │ ├── InternalLogger.java │ │ │ │ ├── InternalLoggerFactory.java │ │ │ │ ├── JdkLogger.java │ │ │ │ ├── JdkLoggerFactory.java │ │ │ │ ├── Log4J2Logger.java │ │ │ │ ├── Log4J2LoggerFactory.java │ │ │ │ ├── Log4JLogger.java │ │ │ │ ├── Log4JLoggerFactory.java │ │ │ │ ├── MessageFormatter.java │ │ │ │ ├── Slf4JLogger.java │ │ │ │ ├── Slf4JLoggerFactory.java │ │ │ │ └── package-info.java │ │ │ ├── matrix │ │ │ │ └── CSRSparseMatrix.java │ │ │ ├── resources │ │ │ │ ├── ClasspathNlpResourceFactory.java │ │ │ │ ├── FileNlpResourceFactory.kt │ │ │ │ ├── JarNlpResourceFactory.kt │ │ │ │ ├── NlpResource.java │ │ │ │ ├── NlpResourceFactory.java │ │ │ │ ├── URLNlpResource.java │ │ │ │ └── UseLines.kt │ │ │ └── utils │ │ │ │ ├── CartesianList.java │ │ │ │ ├── CharNormUtils.java │ │ │ │ ├── CharSourceLineReader.java │ │ │ │ ├── Characters.java │ │ │ │ ├── CustomCharSequence.java │ │ │ │ ├── DataInOutputUtils.java │ │ │ │ ├── DictResources.kt │ │ │ │ ├── DownloadUtils.kt │ │ │ │ ├── Jars.kt │ │ │ │ ├── MyInts.java │ │ │ │ ├── MynlpFactories.java │ │ │ │ ├── PathUtils.java │ │ │ │ └── StringUtils.java │ │ │ ├── fasttext │ │ │ ├── FastText.kt │ │ │ ├── FasttextTranUtils.kt │ │ │ ├── Meter.kt │ │ │ ├── Model.kt │ │ │ ├── ProductQuant.kt │ │ │ ├── QuantMatrix.kt │ │ │ ├── args │ │ │ │ ├── Args.kt │ │ │ │ └── InputArgs.kt │ │ │ ├── autotune │ │ │ │ └── AutotuneStrategy.kt │ │ │ ├── dictionary │ │ │ │ ├── BuildDictFromSource.kt │ │ │ │ ├── DictUtils.kt │ │ │ │ ├── Dictionary.kt │ │ │ │ ├── DictionaryBuilder.kt │ │ │ │ ├── FastWordMap.kt │ │ │ │ └── LoadDictFromDataInput.kt │ │ │ ├── loss │ │ │ │ ├── HierarchicalSoftmaxLoss.kt │ │ │ │ ├── Loss.kt │ │ │ │ ├── NegativeSamplingLoss.kt │ │ │ │ ├── OneVsAlLoss.kt │ │ │ │ └── SoftmaxLoss.kt │ │ │ ├── train │ │ │ │ ├── FastTextTrain.kt │ │ │ │ ├── LoadPretraindVector.kt │ │ │ │ └── SampleLines.kt │ │ │ └── utils │ │ │ │ ├── AutoDataInput.kt │ │ │ │ ├── ByteUtils.java │ │ │ │ ├── IOUtils.kt 
│ │ │ │ ├── LogUtils.kt │ │ │ │ └── TopMaxK.kt │ │ │ ├── module │ │ │ ├── Highlight.kt │ │ │ ├── TextHash.kt │ │ │ ├── lucene │ │ │ │ ├── BaseSynTokenFilter.kt │ │ │ │ ├── IterableMode.java │ │ │ │ ├── LetterTokenizer.java │ │ │ │ ├── MynlpAnalyzer.java │ │ │ │ ├── MynlpTokenizer.java │ │ │ │ └── PinyinTokenizerFilter.kt │ │ │ ├── nwd │ │ │ │ ├── FilesNewWordFind.kt │ │ │ │ ├── NewWordFindEngine.kt │ │ │ │ ├── TopCounter.kt │ │ │ │ ├── ValueObjects.kt │ │ │ │ └── package-info.java │ │ │ ├── pinyin │ │ │ │ ├── CustomPinyin.java │ │ │ │ ├── PinyinDistance.kt │ │ │ │ ├── PinyinResult.java │ │ │ │ ├── PinyinService.kt │ │ │ │ ├── Tex2PinyinComputer.java │ │ │ │ ├── model │ │ │ │ │ ├── Pinyin.java │ │ │ │ │ ├── PinyinFuzzy.kt │ │ │ │ │ ├── PinyinHead.java │ │ │ │ │ ├── Shengmu.java │ │ │ │ │ ├── SimplePinyin.kt │ │ │ │ │ └── Yunmu.java │ │ │ │ └── split │ │ │ │ │ ├── PinyinSplitApp.kt │ │ │ │ │ └── PinyinSplitDefinition.kt │ │ │ ├── summary │ │ │ │ ├── BM25.java │ │ │ │ ├── KeywordSummary.java │ │ │ │ ├── SentenceSummary.java │ │ │ │ └── TextRankSentence.java │ │ │ └── trans │ │ │ │ ├── BaseTransformDictionary.java │ │ │ │ ├── Simplified2Traditional.kt │ │ │ │ ├── Traditional2Simplified.kt │ │ │ │ └── TransformService.java │ │ │ ├── perceptron │ │ │ ├── ConvertHanlpModel.kt │ │ │ ├── EvaluateFunction.java │ │ │ ├── EvaluateResult.kt │ │ │ ├── EvaluateUtils.kt │ │ │ ├── FeatureSet.kt │ │ │ ├── PerceptronComputer.kt │ │ │ ├── PerceptronDefinition.kt │ │ │ ├── PerceptronModel.kt │ │ │ ├── PerceptronModelImpl.kt │ │ │ └── PerceptronTrainer.kt │ │ │ ├── segment │ │ │ ├── CharNormalize.java │ │ │ ├── FluentLexerBuilder.kt │ │ │ ├── IterableMode.kt │ │ │ ├── KotlinLexers.kt │ │ │ ├── Lexer.java │ │ │ ├── LexerBuilder.java │ │ │ ├── LexerReader.java │ │ │ ├── Lexers.java │ │ │ ├── Nature.java │ │ │ ├── SegmentComponent.java │ │ │ ├── SegmentModule.kt │ │ │ ├── Sentence.java │ │ │ ├── WordAndNature.java │ │ │ ├── WordSplitAlgorithm.java │ │ │ ├── WordTerm.java │ │ │ ├── WordTermSequence.kt │ │ │ ├── WordpathProcessor.java │ │ │ ├── common │ │ │ │ ├── BaseSegmentComponent.java │ │ │ │ ├── DefaultCharNormalize.java │ │ │ │ ├── PerceptronUtils.kt │ │ │ │ ├── String2.java │ │ │ │ └── VertexHelper.java │ │ │ ├── lexer │ │ │ │ ├── bigram │ │ │ │ │ ├── BaseExternalizable.java │ │ │ │ │ ├── BiGramTableDictionary.java │ │ │ │ │ ├── BiGramTableDictionaryImpl.java │ │ │ │ │ ├── BiGramTableReader.kt │ │ │ │ │ ├── CoreDictPatch.kt │ │ │ │ │ ├── CoreDictionary.java │ │ │ │ │ ├── CoreDictionaryImpl.java │ │ │ │ │ ├── CoreDictionaryReader.kt │ │ │ │ │ ├── CoreDictionarySplitAlgorithm.java │ │ │ │ │ ├── DictionaryAbsWords.java │ │ │ │ │ ├── HmmLexerPlugin.java │ │ │ │ │ └── ViterbiBestPathAlgorithm.java │ │ │ │ ├── crf │ │ │ │ │ ├── CWSCrf.kt │ │ │ │ │ ├── FeatureTemplate.kt │ │ │ │ │ ├── NerCrf.kt │ │ │ │ │ ├── tokenizer │ │ │ │ │ │ ├── CrfBaseSegmentInitializer.java │ │ │ │ │ │ └── CrfTokenizerBuilder.java │ │ │ │ │ └── utils │ │ │ │ │ │ └── ConvertCrfText2PerceptronModel.kt │ │ │ │ └── perceptron │ │ │ │ │ ├── PerceptronSegment.kt │ │ │ │ │ ├── PerceptronSegmentAlgorithm.java │ │ │ │ │ ├── PerceptronSegmentDefinition.kt │ │ │ │ │ ├── PerceptronSegmentPatch.kt │ │ │ │ │ ├── PerceptronSegmentPlugin.java │ │ │ │ │ ├── PerceptronsSegmentService.java │ │ │ │ │ └── inner │ │ │ │ │ └── Train.kt │ │ │ ├── pipeline │ │ │ │ ├── PipelineLexer.java │ │ │ │ ├── PipelineLexerBuilder.java │ │ │ │ ├── PipelineLexerBuilderKts.kt │ │ │ │ └── PipelineLexerPlugin.java │ │ │ ├── plugins │ │ │ │ ├── atom │ │ │ │ │ ├── 
AtomSplitAlgorithm.kt │ │ │ │ │ ├── AtomTemplateParser.kt │ │ │ │ │ └── DefaultTemplate.kt │ │ │ │ ├── bestpath │ │ │ │ │ ├── AtomWordViterbiBestPathAlgorithm.java │ │ │ │ │ └── LongpathBestPathAlgorithm.java │ │ │ │ ├── collector │ │ │ │ │ ├── CoreDictSubwordInfoSetup.java │ │ │ │ │ ├── CustomDictSubwordInfoSetup.java │ │ │ │ │ ├── IndexSubwordComputer.java │ │ │ │ │ ├── RuleDictSubwordComputer.kt │ │ │ │ │ ├── SentenceCollector.kt │ │ │ │ │ ├── SentenceCollectorBuilder.kt │ │ │ │ │ ├── SmartSubwordComputer.java │ │ │ │ │ ├── SubwordComputer.kt │ │ │ │ │ ├── SubwordInfoSetup.kt │ │ │ │ │ └── WordTermCollector.kt │ │ │ │ ├── correction │ │ │ │ │ ├── CorrectionDictionary.java │ │ │ │ │ ├── CorrectionPlugin.java │ │ │ │ │ ├── CorrectionWord.kt │ │ │ │ │ ├── CorrectionWordpathProcessor.java │ │ │ │ │ ├── DefaultCorrectionDictionary.java │ │ │ │ │ ├── FileCorrectionDictionary.kt │ │ │ │ │ ├── MemCorrectionDictionary.java │ │ │ │ │ └── package-info.java │ │ │ │ ├── customwords │ │ │ │ │ ├── CustomDictionary.java │ │ │ │ │ ├── CustomDictionaryPlugin.java │ │ │ │ │ ├── CustomDictionaryProcessor.java │ │ │ │ │ ├── DefaultCustomDictionary.java │ │ │ │ │ ├── FileCustomDictionary.java │ │ │ │ │ └── MemCustomDictionary.java │ │ │ │ ├── ner │ │ │ │ │ ├── NERPerceptron.kt │ │ │ │ │ ├── NerPlugin.java │ │ │ │ │ ├── NerProcessor.java │ │ │ │ │ └── PerceptronNerService.java │ │ │ │ ├── pattern │ │ │ │ │ ├── PatternPlugin.java │ │ │ │ │ └── PatternWordpathProcessor.java │ │ │ │ ├── personname │ │ │ │ │ ├── PerceptronPersonNameService.java │ │ │ │ │ ├── PersonNameAlgorithm.java │ │ │ │ │ ├── PersonNamePerceptron.kt │ │ │ │ │ ├── PersonNamePlugin.java │ │ │ │ │ └── PersonNameProcessor.java │ │ │ │ └── pos │ │ │ │ │ ├── CommonPosModel.kt │ │ │ │ │ ├── PerceptronPosService.java │ │ │ │ │ ├── PosPerceptron.kt │ │ │ │ │ ├── PosPerceptronDef.kt │ │ │ │ │ ├── PosPerceptronProcessor.java │ │ │ │ │ ├── PosPerceptronUtils.kt │ │ │ │ │ └── PosPlugin.java │ │ │ ├── reader │ │ │ │ ├── BaseFilterLexerReader.java │ │ │ │ ├── DefaultLexerReader.java │ │ │ │ ├── LexerItreabler.kt │ │ │ │ ├── PunctuationFilter.java │ │ │ │ ├── StopWordDict.kt │ │ │ │ └── StopwordFilter.java │ │ │ └── wordnet │ │ │ │ ├── BestPathAlgorithm.java │ │ │ │ ├── Vertex.java │ │ │ │ ├── VertexRow.java │ │ │ │ ├── WordNetToStringBuilder.java │ │ │ │ ├── Wordnet.java │ │ │ │ ├── Wordpath.java │ │ │ │ └── package-info.java │ │ │ ├── similarity │ │ │ └── BM25.kt │ │ │ └── starspace │ │ │ ├── Args.kt │ │ │ ├── DataHandler.kt │ │ │ ├── Dictionary.kt │ │ │ ├── Evaluate.kt │ │ │ ├── Parser.kt │ │ │ ├── Prediction.kt │ │ │ ├── SparseLinear.kt │ │ │ ├── StarSpace.kt │ │ │ ├── Train.kt │ │ │ └── Utils.kt │ └── resources │ │ ├── META-INF │ │ └── mynlp.factories │ │ ├── com │ │ └── mayabot │ │ │ └── nlp │ │ │ └── common │ │ │ └── utils │ │ │ └── char_norm │ │ ├── mynlp │ │ ├── char_four_code.txt │ │ ├── char_py.txt │ │ ├── char_struct.txt │ │ ├── char_write_num.txt │ │ └── py_hard_code_map.txt │ │ ├── patch │ │ └── cws-default.txt │ │ └── stopwords.txt │ └── test │ ├── java │ └── com │ │ └── mayabot │ │ └── nlp │ │ ├── BM25Test.kt │ │ ├── DoubleArrayTrieTest.java │ │ ├── Highlight.kt │ │ ├── InjectTest.kt │ │ ├── LuceneAnalyzerTest.java │ │ ├── Mynlps.kt │ │ ├── SentenceSummaryTest.java │ │ ├── TestFileMap.java │ │ ├── TransTest.java │ │ ├── XxHashTest.kt │ │ ├── commmon │ │ ├── CsrSparseMatrixTest.kt │ │ └── TokenizerSplitterTest.java │ │ ├── fasttext │ │ ├── CFtzModelBugTest.kt │ │ ├── Java.java │ │ ├── SupTest.kt │ │ ├── TestCModelFTZ.kt │ │ ├── TestSup.kt │ │ 
├── TestWords.kt │ │ └── Utils.kt │ │ ├── module │ │ └── lucene │ │ │ ├── LuceneUtils.kt │ │ │ └── TestPinyinTokenizer.kt │ │ ├── pa │ │ └── GeleiCode.kt │ │ ├── perceptron │ │ └── TestCompresParamBin.kt │ │ ├── pinyin │ │ ├── PinyinDistance.kt │ │ └── PinyinTest.kt │ │ └── segment │ │ ├── CmbSegment.kt │ │ ├── CombineTest.java │ │ ├── CoreTokenizerTest.java │ │ ├── CustomDictTest.kt │ │ ├── IndexSegmentTest.java │ │ ├── KeepOriCharOutputTest.kt │ │ ├── KotlinTest.kt │ │ ├── OffsetTest.kt │ │ ├── PerceptronCwsTest.kt │ │ ├── PosTest.java │ │ ├── SegmentErrorCasesTest.kt │ │ ├── SubwordTest.kt │ │ ├── Test.kt │ │ ├── TestPosAndSubWord.kt │ │ ├── atom │ │ └── AtomSplitAlgorithmTest.kt │ │ ├── collector │ │ └── SentenceIndexWordCollectorTest.java │ │ ├── dictionary │ │ └── CoreGiGramTableDictionaryTest.java │ │ ├── lexer │ │ └── perceptron │ │ │ ├── CWSPerceptronTest.kt │ │ │ ├── NERPerceptronTest.kt │ │ │ ├── POSPerceptronTest.kt │ │ │ ├── PerceptronNerServiceTest.java │ │ │ └── PerceptronServiceTest.kt │ │ ├── ner │ │ ├── OrgTest.java │ │ ├── PersonNameTest.kt │ │ └── PlaceTest.java │ │ ├── utils │ │ └── TokenizerTestHelp.java │ │ └── wordnet │ │ ├── VertexRowTest.java │ │ └── WordpathTest.java │ └── resources │ └── GrapCode.txt └── settings.gradle.kts /.github/workflows/gradle.yml: -------------------------------------------------------------------------------- 1 | name: Java CI 2 | 3 | on: 4 | schedule: 5 | - cron: '0 8 * * *' 6 | 7 | jobs: 8 | build: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v1 14 | - name: Set up JDK 1.8 15 | uses: actions/setup-java@v1 16 | with: 17 | java-version: 1.8 18 | - name: Build with Gradle 19 | run: ./gradlew build 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled class file 2 | *.class 3 | 4 | # Log file 5 | *.log 6 | 7 | # BlueJ files 8 | *.ctxt 9 | 10 | # Mobile Tools for Java (J2ME) 11 | .mtj.tmp/ 12 | 13 | # Package Files # 14 | *.jar 15 | *.war 16 | *.ear 17 | # *.zip 18 | *.tar.gz 19 | *.rar 20 | 21 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 22 | hs_err_pid* 23 | 24 | **/out 25 | **/mynlp_work/ 26 | **/mynlp_work/** 27 | /mynlp_work/ 28 | mynlp_work/** 29 | **/temp/ 30 | data/ 31 | /data_dir/ 32 | **/data/ 33 | example.data 34 | 35 | /dependency-reduced-pom.xml 36 | *.iml 37 | /.idea 38 | *.iws 39 | 40 | /.gradle 41 | **/*.iml 42 | *.ipr 43 | 44 | data.work/ 45 | **/model/dependency 46 | **/model/segment 47 | /temp/ 48 | /mynlp-fasttext/xbackjava/ 49 | /mynlp-fasttext/xcsource/ 50 | /mynlp-fasttext/data/ 51 | /testdata/ 52 | !/gradle/wrapper/gradle-wrapper.jar 53 | ### macOS template 54 | # General 55 | .DS_Store 56 | .AppleDouble 57 | .LSOverride 58 | 59 | # Icon must end with two \r 60 | Icon 61 | 62 | # Thumbnails 63 | ._* 64 | 65 | # Files that might appear in the root of a volume 66 | .DocumentRevisions-V100 67 | .fseventsd 68 | .Spotlight-V100 69 | .TemporaryItems 70 | .Trashes 71 | .VolumeIcon.icns 72 | .com.apple.timemachine.donotpresent 73 | 74 | # Directories potentially created on remote AFP share 75 | .AppleDB 76 | .AppleDesktop 77 | Network Trash Folder 78 | Temporary Items 79 | .apdisk 80 | debug 81 | public 82 | docs/public 83 | doc/*.html 84 | doc/*.pdf 85 | *.html 86 | 87 | !/doc/mynlp-docinfo-footer.html -------------------------------------------------------------------------------- /CHANGES.md: 
-------------------------------------------------------------------------------- 1 | # 4.0.0 2 | - 新增StarSpace模块 3 | - 感知机相关实现基于通用框架 4 | - 规划data文件夹,提供下载 5 | - 命令行工具 6 | - 重构训练工具API 7 | - 去除Mynlps类 8 | 9 | # 3.3.0 10 | 11 | # 3.2.2 12 | 13 | - 增加QuickReplacer和Highlight功能 14 | - murmur3 15 | - text hash use xxHash 16 | 17 | # 3.2.1 18 | 19 | - kotlin update to 1.4.0 20 | - fixbug: injector 单例在key不一致的情况下单例实例化两个对象 21 | - 清理了Setting,彻底去除配置文件 22 | - fix junit test内存溢出的问题 23 | 24 | # 3.2.0 25 | 26 | - 代码结构做出改变,合并到mynlp单一项目 27 | - fasttext也合并到mynlp中 28 | - mynlp不再自动依赖词典资源,需要独立引入资源 29 | - mynlp-with-res这个自动引入常用资源,可以通过exclude排除不需要的资源 30 | - elasticsearch-plugin将独立项目,支持7.0以上的版本 31 | 32 | # 3.1.5 33 | 去除Gauva依赖,mynlp只依赖kotlin运行时 34 | 35 | # 3.1.2 36 | - FastText 模型保存为单个文件,也可以从单个文件加载 37 | ```kotlin 38 | fastText.saveModelToSingleFile(File("fastText4j/data/model.fjbin")) 39 | 40 | FastText.loadModelFromSingleFile(File("fastText4j/data/model.fjbin")) 41 | ``` 42 | 43 | # 3.1.1 44 | - fix 标点符号过滤bug 45 | 46 | # v3.1.0 47 | - 合并了mynlp-core,mynlp-perceptron,mynlp-segment模块 48 | - 重构了感知机模块,自定义感知机只需要实现一个接口定义 49 | - 感知机分词、词性分析使用新的感知机API 50 | - 开放词性分析在线学习接口;简化词性感知机特征提取函数 51 | - 在规则层面提高人名识别准确性 52 | - 合并fastText4j代码到mynlp项目 53 | - 按照最新C语言版本fastText重构 54 | - 新增OneVsAiLoss损失函数 55 | - 新增test接口 56 | - fix预测结果数量少一个的bug 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /doc/advanced.adoc: -------------------------------------------------------------------------------- 1 | == 高级主题 2 | 3 | === Wordnet 4 | 5 | :imagesdir: images 6 | 7 | 分词系统中需要一个数据结构来表达一段文字来多种分词可能性。距离来说"商品和服务","商品/和服/务"就是其中一个 错误的切分可能。 各种分词算法的目标就是找出最合理的切分方法。 8 | 9 | image::wordnet-g.png[词图篱笆网络,600] 10 | 11 | 站在每个字的角度来看,会有一个或多个跳转路径。从Start节点到End节点中必定存在一个最优路径,这个路径就是 分词结果了。但是上图如果利用Node和Edge的数据结构来表达的话,性能和方便程度都很差。 12 | 13 | Wordnet是经典的数据结构,mynlp用链表的方式实现了一个高效的Wordnet类。 14 | 15 | 16 | image::wordnet.png[,600] 17 | 每个数字节点,表示一个边,也表示从当前这个字构成的词的长度。 18 | 19 | 对应的Java数据为: 20 | 21 | image::wordnet-ds.png[,600] 22 | 23 | 每个字对应一个`VertexRow`,每个VertexRow指向一个Vertex链表,其中Vertex链表中的数字大小**一定是不可重复且有序的**。 24 | 25 | 分词的基本逻辑就是填充Wordnet,使用路径选算法从多种可能性选出最佳的分词路径。 26 | 27 | === Wordpath 28 | 29 | 类Wordpath表示一个路径,如果路径不在变化,那么也就无所谓采用什么数据结构。但是在Pipeline中,不同的组件和算法还需要对这个 **唯一的路径再进行修改**。会涉及到很多`联合`、`打破-再联合`等操作。在List的基础上操作起来,代码非常复杂且不容易理解。 30 | 31 | 这里我们使用BitSet来表示唯一分词路径。 32 | 33 | image::wordpath.png[,600] 34 | 35 | 图中的字之间的斜线,表示要切断。我们用bitset中和字对应的Index,设置为true。 比如"提高"是一个词,那么设置bitset的下标1为true。 36 | 37 | 就是这么简单,使用这种数据结构的好处是,combine或者划词的操作非常简单,而且内存上消耗非常非常低。 38 | 39 | === Injector IOC容器 -------------------------------------------------------------------------------- /doc/highlight/styles/github.min.css: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | github.com style (c) Vasily Polovnyov 4 | 5 | */ 6 | 7 | .hljs { 8 | display: block; 9 | overflow-x: auto; 10 | padding: 0.5em; 11 | color: #333; 12 | background: #f8f8f8; 13 | } 14 | 15 | .hljs-comment, 16 | .hljs-quote { 17 | color: #998; 18 | font-style: italic; 19 | } 20 | 21 | .hljs-keyword, 22 | .hljs-selector-tag, 23 | .hljs-subst { 24 | color: #333; 25 | font-weight: bold; 26 | } 27 | 28 | .hljs-number, 29 | .hljs-literal, 30 | .hljs-variable, 31 | .hljs-template-variable, 32 | .hljs-tag .hljs-attr { 33 | color: #008080; 34 | } 35 | 36 | .hljs-string, 37 | .hljs-doctag { 38 | color: #d14; 39 | } 40 | 41 | .hljs-title, 42 | .hljs-section, 43 | .hljs-selector-id { 44 | color: #900; 45 | font-weight: bold; 46 | } 47 | 48 | .hljs-subst { 49 | font-weight: 
normal; 50 | } 51 | 52 | .hljs-type, 53 | .hljs-class .hljs-title { 54 | color: #458; 55 | font-weight: bold; 56 | } 57 | 58 | .hljs-tag, 59 | .hljs-name, 60 | .hljs-attribute { 61 | color: #000080; 62 | font-weight: normal; 63 | } 64 | 65 | .hljs-regexp, 66 | .hljs-link { 67 | color: #009926; 68 | } 69 | 70 | .hljs-symbol, 71 | .hljs-bullet { 72 | color: #990073; 73 | } 74 | 75 | .hljs-built_in, 76 | .hljs-builtin-name { 77 | color: #0086b3; 78 | } 79 | 80 | .hljs-meta { 81 | color: #999; 82 | font-weight: bold; 83 | } 84 | 85 | .hljs-deletion { 86 | background: #fdd; 87 | } 88 | 89 | .hljs-addition { 90 | background: #dfd; 91 | } 92 | 93 | .hljs-emphasis { 94 | font-style: italic; 95 | } 96 | 97 | .hljs-strong { 98 | font-weight: bold; 99 | } 100 | -------------------------------------------------------------------------------- /doc/images/WordSplitAlgorithm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/WordSplitAlgorithm.png -------------------------------------------------------------------------------- /doc/images/WordpathProcessor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/WordpathProcessor.png -------------------------------------------------------------------------------- /doc/images/cli.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/cli.jpg -------------------------------------------------------------------------------- /doc/images/crf_model.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/crf_model.jpg -------------------------------------------------------------------------------- /doc/images/fasttext-c.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/fasttext-c.png -------------------------------------------------------------------------------- /doc/images/lexer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/lexer.png -------------------------------------------------------------------------------- /doc/images/mynlp-pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/mynlp-pipeline.png -------------------------------------------------------------------------------- /doc/images/pipelineLexer.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/pipelineLexer.jpg -------------------------------------------------------------------------------- /doc/images/weixin.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/weixin.jpeg 
-------------------------------------------------------------------------------- /doc/images/worddict.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/worddict.png -------------------------------------------------------------------------------- /doc/images/wordnet-ds.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/wordnet-ds.png -------------------------------------------------------------------------------- /doc/images/wordnet-framework.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/wordnet-framework.jpg -------------------------------------------------------------------------------- /doc/images/wordnet-g.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/wordnet-g.png -------------------------------------------------------------------------------- /doc/images/wordnet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/wordnet.png -------------------------------------------------------------------------------- /doc/images/wordpath.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/images/wordpath.png -------------------------------------------------------------------------------- /doc/lexer.adoc: -------------------------------------------------------------------------------- 1 | :imagesdir: images 2 | 3 | == 中文分词、词性标注、命名实体 4 | 5 | === lexer架构 6 | 7 | 分词、词性、命名实体这三个任务一起被称为 `词法分析` ,mynlp中使用Lexer接口这个功能进行定义。 8 | 9 | Lexer负责对有限短的文本(一句话、一个段落)进行词法分析。 10 | 11 | nlp中有各种各样的分词算法,mynlp并没有为每个算法定义一个分词器类,而是使用Pipeline方式进行组装。 12 | 13 | .Pipeline架构图 14 | image::lexer.png[width=600] 15 | 16 | .接口描述 17 | - WordSplitAlgorithm: 基础切词算法,词典、感知机、CRF等等 18 | - WordpathProcessor: 对Wordpath进行调整或计算词性等操作 19 | - BestPathAlgorithm: 从Wordnet中选择最优路径 20 | - WordTermCollector: 分词收集器,可以进行索引分词、子词再切分等操作 --- 21 | 22 | .WordSplitAlgorithm接口以及实现类 23 | image::WordSplitAlgorithm.png[width=600] 24 | 25 | .WordpathProcessor接口以及实现类 26 | image::WordpathProcessor.png[width=700] 27 | 28 | === PipelineBuilder 29 | 30 | === CharNormalize 31 | 32 | === WordSplitAlgorithm 33 | 34 | ==== CORE 35 | 36 | ==== 感知机 37 | 38 | ==== ATOM 39 | 40 | === WordpathProcessor 41 | 42 | ==== 人名识别 43 | 44 | ==== NER 45 | 46 | ==== 分词纠错 47 | 48 | ==== 自定义词典 49 | 50 | === WordTermCollector 51 | 52 | === 扩展插件 53 | 54 | === 自定义分词粒度插件示例 -------------------------------------------------------------------------------- /doc/modules.adoc: -------------------------------------------------------------------------------- 1 | == 综合模块 2 | 3 | === 拼音 4 | 5 | === 文本分类 6 | 7 | === 繁简体转换 8 | 9 | [source,java] 10 | ---- 11 | Simplified2Traditional s2t = TransformService.simplified2Traditional(); 12 | System.out.println(s2t.transform("软件和体育的艺术")); 13 | 14 | Traditional2Simplified t2s = TransformService.traditional2Simplified(); 15 | System.out.println(t2s.transform("軟件和體育的藝術")); 
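// Note (added; expected output is inferred, not shown in the original doc):
// the s2t call above should print 軟件和體育的藝術, and the t2s call 软件和体育的艺术.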
16 | 17 | ---- 18 | 19 | === 摘要 20 | 21 | 文本摘要包含了两个简单TextRank的实现。 22 | 23 | .关键字摘要 24 | [source,java] 25 | ---- 26 | KeywordSummary keywordSummary = new KeywordSummary(); 27 | keywordSummary.keyword("text",10); 28 | ---- 29 | 30 | .句子摘要 31 | [source,java] 32 | ---- 33 | SentenceSummary sentenceSummary = new SentenceSummary(); 34 | List result = sentenceSummary.summarySentences(document, 10); 35 | ---- 36 | 37 | === 相似度 38 | 39 | 还没开发。 40 | 41 | === 高亮与关键字替换 42 | 43 | 对文本关键字进行高亮。 44 | 45 | [source,java] 46 | ---- 47 | List keywords = new ArrayList<>(); 48 | 49 | keywords.add("居住证"); 50 | keywords.add("居住"); 51 | 52 | Highlighter highlighter = new Highlighter(keywords);//<1> 53 | 54 | String text = "居住在上海需要办理居住证"; 55 | String text = highlighter.replace(text); 56 | ---- 57 | <1> Highlighter对象可重复使用 58 | 59 | Highlighter内部使用了Trie结构,所以replace的时间复杂度和keywords的数量几乎无关,只对原始text扫描一次。 替换过程采用前向最大匹配算法。 60 | 61 | 另外还可以通过 `QuickReplacer` 类来自定义替换内容。 62 | 63 | [source,java] 64 | ---- 65 | List keywords = new ArrayList<>(); 66 | 67 | keywords.add("居住证"); 68 | keywords.add("居住"); 69 | 70 | QuickReplacer quickReplacer = new QuickReplacer(keywords); 71 | 72 | String result = quickReplacer.replace("居住在上海需要办理居住证", 73 | (Function) word -> ""+word+""); 74 | ---- 75 | 76 | Kotlin便捷扩展函数 77 | 78 | [source,kotlin] 79 | ---- 80 | "居住在上海需要办理居住证".highlight(listOf("居住证","居住")) 81 | ---- 82 | 83 | === 新词发现 84 | 85 | 这个文档怎么写 -------------------------------------------------------------------------------- /doc/mynlp-docinfo-footer.html: -------------------------------------------------------------------------------- 1 | 10 | -------------------------------------------------------------------------------- /doc/mynlp.adoc: -------------------------------------------------------------------------------- 1 | = Mynlp技术参考手册 2 | Jimi 3 | :doctype: book 4 | :toc: left 5 | :toc-title: 目录 6 | :toclevels: 5 7 | :icons: font 8 | :docinfo: shared,private-footer 9 | :imagesdir: images 10 | :source-highlighter: highlightjs 11 | :source-indent: 1 12 | :source-language: java 13 | :highlightjsdir: highlight 14 | 15 | == Mynlp介绍 16 | 17 | image::https://cdn.mayabot.com/mynlp/mynlp-banner.png[Logo,400,] 18 | MYNLP是一个Java实现的高性能、柔性API、可扩展的中文NLP工具包。 19 | 20 | .功能 21 | - 感知机分词 22 | - CORE二元语言模型&词典分词 23 | - 词性标注 24 | - 通用感知机 25 | - 命名实体识别(人名、地名、组织机构名) 26 | - fastText 27 | - 文本分类 28 | - 新词发现 29 | - 拼音转换&切分 30 | - 简繁体转换 31 | 32 | .欢迎关注微信公众号,获取最新动态和相关文章 33 | image::weixin.jpeg[weixin,150,,,align="center"] 34 | 35 | include::started.adoc[] 36 | 37 | include::lexer.adoc[] 38 | 39 | include::perceptron.adoc[] 40 | 41 | include::fasttext.adoc[] 42 | 43 | include::modules.adoc[] 44 | 45 | include::advanced.adoc[] 46 | 47 | include::other.adoc[] 48 | 49 | == 致谢以下优秀开源项目 50 | 51 | - HanLP 52 | - ansj_seg 53 | 54 | mynlp实现参考了他们算法实现和部分代码 -------------------------------------------------------------------------------- /doc/mynlp.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/doc/mynlp.docx -------------------------------------------------------------------------------- /doc/perceptron.adoc: -------------------------------------------------------------------------------- 1 | == 结构化平均感知机框架 2 | 3 | 什么是结构化平均感知机框架?请阅读这一篇文章link:http://www.hankcs.com/nlp/segment/implementation-of-word-segmentation-device-java-based-on-structured-average-perceptron.html[《基于结构化平均感知机的分词器Java实现》] 4 | 5 | 在这里我们用结构化平均感知机框架来解决序列化标注问题,例如BMES标注。因为BMES这四个TAG之前是有转移关系的, 
所以肯定是结构化预测问题。 6 | 7 | mynlp提供了通用的AP框架,方便实现各种自定义标签、特征函数、语料格式。 8 | 9 | 你只需告知感知机框架三件事情: 10 | 11 | - 特征提取函数 12 | - label编码 13 | - 原始语料如果转换为(输入=标签)二元组 14 | 15 | -------------------------------------------------------------------------------- /doc/update.sh: -------------------------------------------------------------------------------- 1 | 2 | scp mynlp.html root@www.mayabot.com:/opt/mynlp-doc 3 | scp -r images root@www.mayabot.com:/opt/mynlp-doc 4 | scp -r highlight root@www.mayabot.com:/opt/mynlp-doc -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | kotlin.code.style=official 2 | 3 | systemProp.org.gradle.internal.http.connectionTimeout=120000 4 | systemProp.org.gradle.internal.http.socketTimeout=120000 5 | //开启kotlin的增量和并行编译 6 | kotlin.incremental=true 7 | kotlin.incremental.java=true 8 | kotlin.incremental.js=true 9 | kotlin.caching.enabled=true 10 | kotlin.parallel.tasks.in.project=true -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jimichan/mynlp/b980da3a6f9cdcb83e0800f6cab50656df94a22a/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | #distributionUrl=https\://services.gradle.org/distributions/gradle-6.7-bin.zip 4 | distributionUrl=https\://mirrors.cloud.tencent.com/gradle/gradle-6.7-bin.zip 5 | zipStoreBase=GRADLE_USER_HOME 6 | zipStorePath=wrapper/dists 7 | wrapper.keep=true -------------------------------------------------------------------------------- /licenses/FastText-LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016-present, Facebook, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /licenses/StartSpace-LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Facebook, Inc. and its affiliates. 
4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /mynlp-all/build.gradle.kts: -------------------------------------------------------------------------------- 1 | 2 | description = "自动依赖必要的资源" 3 | 4 | dependencies { 5 | 6 | api("org.jetbrains.kotlin:kotlin-stdlib") 7 | 8 | api(project(":mynlp")) 9 | 10 | // 核心词典 11 | implementation("com.mayabot.mynlp.resource:mynlp-resource-coredict:1.0.0") 12 | // 词性标注 13 | implementation("com.mayabot.mynlp.resource:mynlp-resource-pos:1.0.0") 14 | // 命名实体 15 | implementation("com.mayabot.mynlp.resource:mynlp-resource-ner:1.0.0") 16 | // pinyin 17 | implementation("com.mayabot.mynlp.resource:mynlp-resource-pinyin:1.1.0") 18 | // 繁简体转换 19 | implementation("com.mayabot.mynlp.resource:mynlp-resource-transform:1.0.1") 20 | 21 | 22 | // 感知机分词模型 23 | // implementation 'com.mayabot.mynlp.resource:mynlp-resource-cws:1.0.0' 24 | 25 | // 自定义扩展词库 26 | // implementation 'com.mayabot.mynlp.resource:mynlp-resource-custom:1.0.0' 27 | } -------------------------------------------------------------------------------- /mynlp-example/build.gradle.kts: -------------------------------------------------------------------------------- 1 | description = "Example" 2 | 3 | project.afterEvaluate { 4 | project.tasks.withType{ 5 | enabled = false 6 | } 7 | } 8 | 9 | dependencies { 10 | 11 | implementation(project(":mynlp-all")) 12 | 13 | implementation( "com.mayabot.mynlp.resource:mynlp-resource-cws:1.0.0") 14 | implementation( "com.mayabot.mynlp.resource:mynlp-resource-custom:1.0.0") 15 | implementation( "org.fusesource.jansi:jansi:1.16") 16 | implementation( "ch.qos.logback:logback-classic:1.2.3") 17 | 18 | 19 | } -------------------------------------------------------------------------------- /mynlp-example/src/main/java/Demo.java: -------------------------------------------------------------------------------- 1 | import com.mayabot.nlp.Mynlp; 2 | 3 | public class Demo { 4 | public static void main(String[] args) { 5 | // Mynlp.configer() 6 | // .set("a","1"); 7 | // System.out.println(Mynlp.instance().segment("扫描二维码即可下载")); 8 | // 9 | Mynlp mynlp = Mynlp.instance(); 10 | // 11 | // System.out.println(mynlp.segment("请勿大声喧哗")); 12 | //// 13 | // 14 | // System.out.println(mynlp.convertPinyin("信息公开")); 15 | // 16 | // 17 | // System.out.println(mynlp.splitPinyin("xinxigongkai")); 18 | 19 | } 20 | } 21 | -------------------------------------------------------------------------------- 
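A minimal usage sketch (not a file in the repository) assembled from the calls that Demo.java above leaves commented out; Mynlp.instance(), segment(), convertPinyin() and splitPinyin() are copied from those commented lines and are assumed rather than re-verified here:

import com.mayabot.nlp.Mynlp;

public class DemoSketch {
    public static void main(String[] args) {
        Mynlp mynlp = Mynlp.instance();
        // Chinese word segmentation
        System.out.println(mynlp.segment("请勿大声喧哗"));
        // convert text to pinyin
        System.out.println(mynlp.convertPinyin("信息公开"));
        // split a continuous pinyin string into syllables
        System.out.println(mynlp.splitPinyin("xinxigongkai"));
    }
}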
/mynlp-example/src/main/java/pinyin/PinyinExample.java: -------------------------------------------------------------------------------- 1 | package pinyin; 2 | 3 | 4 | public class PinyinExample { 5 | public static void main(String[] args) { 6 | // PinyinResult result = Pinyins.convert("朝朝暮暮"); 7 | // 8 | // System.out.println(result.asString()); 9 | // System.out.println(result.asHeadList()); 10 | // System.out.println(result.asList()); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /mynlp-example/src/main/java/segment/CombineExample.java: -------------------------------------------------------------------------------- 1 | package segment; 2 | 3 | public class CombineExample { 4 | public static void main(String[] args) { 5 | 6 | 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /mynlp-example/src/main/java/segment/CoreSegment.java: -------------------------------------------------------------------------------- 1 | package segment; 2 | 3 | import com.mayabot.nlp.segment.*; 4 | 5 | import java.io.Reader; 6 | import java.io.StringReader; 7 | 8 | public class CoreSegment { 9 | 10 | public static void main(String[] args) { 11 | long t1 = System.currentTimeMillis(); 12 | 13 | Lexer tokenizer = Lexers.core(); 14 | 15 | 16 | Sentence sentence = tokenizer.scan("mynlp是mayabot开源的中文NLP工具包。"); 17 | 18 | System.out.println(sentence.toWordList()); 19 | 20 | 21 | LexerReader analyzer = tokenizer.reader(); 22 | 23 | Reader reader = new StringReader("假装这是一个大文本"); 24 | WordTermSequence result = analyzer.scan(reader); 25 | long t2 = System.currentTimeMillis(); 26 | System.out.println(t2 - t1); 27 | System.out.printf("result" + result.toSentence()); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /mynlp-example/src/main/java/segment/CustomSegment.java: -------------------------------------------------------------------------------- 1 | package segment; 2 | 3 | import com.mayabot.nlp.segment.FluentLexerBuilder; 4 | import com.mayabot.nlp.segment.Lexer; 5 | import com.mayabot.nlp.segment.Lexers; 6 | import com.mayabot.nlp.segment.plugins.customwords.CustomDictionaryPlugin; 7 | import com.mayabot.nlp.segment.plugins.customwords.MemCustomDictionary; 8 | 9 | public class CustomSegment { 10 | 11 | public static void main(String[] args) { 12 | 13 | MemCustomDictionary memCustomDictionary = new MemCustomDictionary(); 14 | 15 | FluentLexerBuilder builder = Lexers.coreBuilder(); 16 | 17 | builder.with(new CustomDictionaryPlugin(memCustomDictionary)); 18 | 19 | Lexer tokenizer = builder.build(); 20 | 21 | System.out.println(tokenizer); 22 | 23 | System.out.println(tokenizer.scan("欢迎来到松江临港科技城")); 24 | 25 | memCustomDictionary.addWord("临港科技城"); 26 | memCustomDictionary.rebuild(); 27 | 28 | System.out.println(tokenizer.scan("欢迎来到松江临港科技城")); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /mynlp-example/src/main/java/segment/UseStreamApi.java: -------------------------------------------------------------------------------- 1 | package segment; 2 | 3 | import com.mayabot.nlp.segment.LexerReader; 4 | import com.mayabot.nlp.segment.Lexers; 5 | import com.mayabot.nlp.segment.WordTerm; 6 | 7 | import java.io.BufferedReader; 8 | import java.io.File; 9 | import java.io.FileInputStream; 10 | import java.io.InputStreamReader; 11 | import java.util.stream.Stream; 12 | 13 | public class UseStreamApi { 14 | 15 | public static void 
main(String[] args) throws Exception { 16 | 17 | LexerReader lexerReader = Lexers.core().reader(); 18 | 19 | try (BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(new FileInputStream( 20 | new File("data/红楼梦.txt"))))) { 21 | 22 | Stream stream = lexerReader.scan(bufferedReader) 23 | .stream() 24 | .filter(it -> it.word.length() > 1); 25 | stream.forEach(term -> { 26 | 27 | }); 28 | 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /mynlp-example/src/main/java/transform/TraditionalExample.java: -------------------------------------------------------------------------------- 1 | package transform; 2 | 3 | import com.mayabot.nlp.module.trans.TransformService; 4 | 5 | public class TraditionalExample { 6 | 7 | public static void main(String[] args) { 8 | String text = "軟件和體育的藝術"; 9 | System.out.println(TransformService.t2s(text)); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /mynlp-example/src/test/java/TestHighlight.java: -------------------------------------------------------------------------------- 1 | import com.mayabot.nlp.module.QuickReplacer; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.function.Function; 6 | 7 | public class TestHighlight { 8 | 9 | public static void main(String[] args) { 10 | List keywords = new ArrayList<>(); 11 | 12 | keywords.add("居住证"); 13 | keywords.add("居住"); 14 | 15 | QuickReplacer quickReplacer = new QuickReplacer(keywords); 16 | 17 | String result = quickReplacer.replaceForJava("居住在上海需要办理居住证", 18 | (Function) word -> "" + word + ""); 19 | 20 | System.out.println(result); 21 | 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /mynlp-experimental/.gitignore: -------------------------------------------------------------------------------- 1 | JRNN-master 2 | src 3 | backup -------------------------------------------------------------------------------- /mynlp-experimental/build.gradle.kts: -------------------------------------------------------------------------------- 1 | dependencies { 2 | api("org.jetbrains.kotlin:kotlin-stdlib") 3 | } 4 | // dependencies { 5 | //// implementation 'org.jblas:jblas:1.2.5' 6 | //// compile 'org.apache.commons:commons-lang3:3.3.2' 7 | //// compile 'com.google.guava:guava:18.0' 8 | //// compile 'commons-io:commons-io:2.4' 9 | // compile ("org.jetbrains.kotlin:kotlin-stdlib") { 10 | // exclude module:"kotlin-stdlib-jdk7" 11 | // exclude module:"kotlin-stdlib-jdk8" 12 | // } 13 | //} 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /mynlp/shell/mynlp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # Attempt to set APP_HOME 3 | # Resolve links: $0 may be a link 4 | PRG="$0" 5 | # Need this for relative symlinks. 
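# Note (comment added, not in the original script): the loop below follows $0 through
# any chain of symlinks. `ls -ld` prints the "name -> target" form, the `expr` call
# extracts the target, an absolute target replaces $PRG directly, and a relative target
# is resolved against the directory of the current $PRG, so APP_HOME later points at
# the real install directory rather than at a symlink.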
6 | while [ -h "$PRG" ] ; do 7 | ls=`ls -ld "$PRG"` 8 | link=`expr "$ls" : '.*-> \(.*\)$'` 9 | if expr "$link" : '/.*' > /dev/null; then 10 | PRG="$link" 11 | else 12 | PRG=`dirname "$PRG"`"/$link" 13 | fi 14 | done 15 | SAVED="`pwd`" 16 | cd "`dirname \"$PRG\"`/" >/dev/null 17 | APP_HOME="`pwd -P`" 18 | cd "$SAVED" >/dev/null 19 | 20 | APP_NAME="Mynlp" 21 | APP_BASE_NAME=`basename "$0"` 22 | 23 | # 下载mynlp-bin.jar 24 | # 准备JDK环境变量 25 | # exe jar -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/MynlpConfigs.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp 2 | 3 | import com.mayabot.nlp.common.SettingItem 4 | import com.mayabot.nlp.common.SettingItem.stringSetting 5 | 6 | object MynlpConfigs { 7 | 8 | @JvmField 9 | val server: SettingItem = stringSetting("mynlp.server", "") 10 | 11 | /** 12 | * AP分词器的模型名 13 | */ 14 | @JvmField 15 | val cwsModelItem: SettingItem = stringSetting("cws.model", "cws-model") 16 | 17 | /** 18 | * 自定义词典的路径 19 | * value可以是用逗号分隔的多个值,表示多个文件 20 | */ 21 | @JvmField 22 | val dictPathSetting: SettingItem = stringSetting( 23 | "custom.dictionary.path", "custom-dict/CustomDictionary.txt" 24 | ) 25 | 26 | /** 27 | * 主要拼音的资源文件名 28 | */ 29 | @JvmField 30 | val pinyinSetting: SettingItem = stringSetting("pinyin.dict", "mynlp-pinyin.txt") 31 | 32 | /** 33 | * 拼音自定义扩展词典的文件名(可选) 34 | */ 35 | @JvmField 36 | val pinyinExtDicSetting: SettingItem = stringSetting("pinyin.ext.dict", null) 37 | 38 | /** 39 | * 分词纠错词典配置 40 | */ 41 | @JvmField 42 | val correctionDict: SettingItem = stringSetting("correction.dict", "dictionary/correction.txt") 43 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/Heap.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.algorithm 2 | 3 | /** 4 | * 默认小顶堆。如果需要大顶堆 5 | * 6 | * 7 | */ 8 | class TopHeap( 9 | val maxSize: Int, 10 | val comparator: Comparator, 11 | /** 12 | * false 表示大顶堆 13 | */ 14 | val minTop: Boolean = true 15 | ) { 16 | 17 | private val data = arrayOfNulls(maxSize) 18 | 19 | private var size: Int = 0 20 | 21 | fun push(data: T) { 22 | 23 | } 24 | 25 | private fun heapify() { 26 | 27 | } 28 | 29 | fun root(): T { 30 | TODO() 31 | } 32 | 33 | /** 34 | * 获取里面的所有元素,但是并不是排好序的 35 | */ 36 | fun toList(): List { 37 | TODO() 38 | } 39 | 40 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/TopIntMinK.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.algorithm 2 | 3 | /** 4 | * Top K 最小值。 5 | */ 6 | class TopIntMinK(private val k: Int) { 7 | 8 | private val heap = FloatArray(k) 9 | private val idIndex = IntArray(k) { -1 } 10 | 11 | var size = 0 12 | 13 | fun push(id: Int, score: Float) { 14 | if (size < k) { 15 | heap[size] = score 16 | idIndex[size] = id 17 | size++ 18 | 19 | if (size == k) { 20 | buildMinHeap() 21 | } 22 | } else { 23 | // 如果这个数据小于最大值,那么有资格进入 24 | if (score < heap[0]) { 25 | heap[0] = score 26 | idIndex[0] = id 27 | 28 | topify(0) 29 | } 30 | } 31 | } 32 | 33 | fun result(): ArrayList> { 34 | val top = Math.min(k, size) 35 | val list = ArrayList>(top) 36 | 37 | for (i in 0 until top) { 38 | list += idIndex[i] to heap[i] 39 | } 40 | 41 | list.sortBy { it.second } 42 | return list 43 | } 44 | 45 | private fun buildMinHeap() { 
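        // Note (comment added, not in the original source): although the class keeps the
        // k smallest scores, the backing array is arranged as a max-heap, with heap[0]
        // holding the largest of the kept scores; push() therefore only compares a new
        // candidate against heap[0] and sifts it down via topify() in O(log k).
        // The bottom-up build below starts at the last internal node (k / 2 - 1)
        // and heapifies the whole array in O(k).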
46 | for (i in k / 2 - 1 downTo 0) { 47 | topify(i)// 依次向上将当前子树最大堆化 48 | } 49 | } 50 | 51 | private fun topify(i: Int) { 52 | val l = 2 * i + 1 53 | val r = 2 * i + 2 54 | var max: Int 55 | 56 | if (l < k && heap[l] > heap[i]) 57 | max = l 58 | else 59 | max = i 60 | 61 | if (r < k && heap[r] > heap[max]) { 62 | max = r 63 | } 64 | 65 | if (max == i || max >= k) 66 | // 如果largest等于i说明i是最大元素 67 | // largest超出heap范围说明不存在比i节点大的子女 68 | return 69 | 70 | swap(i, max) 71 | topify(max) 72 | } 73 | 74 | private fun swap(i: Int, j: Int) { 75 | val tmp = heap[i] 76 | heap[i] = heap[j] 77 | heap[j] = tmp 78 | 79 | val tmp2 = idIndex[i] 80 | idIndex[i] = idIndex[j] 81 | idIndex[j] = tmp2 82 | } 83 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/Trie.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.algorithm.collection; 17 | 18 | 19 | /** 20 | * 字典树接口 21 | * 22 | * @author jimichan 23 | */ 24 | public interface Trie { 25 | 26 | T get(char[] key); 27 | 28 | T get(CharSequence key); 29 | 30 | T get(char[] key, int offset, int len); 31 | 32 | boolean containsKey(String key); 33 | 34 | } 35 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/ahocorasick/Hit.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | /* 18 | * 19 | * He Han 20 | * hankcs.cn@gmail.com 21 | * 源代码来自于 https://github.com/hankcs/HanLP 22 | */ 23 | package com.mayabot.nlp.algorithm.collection.ahocorasick; 24 | 25 | /** 26 | * 一个命中结果 27 | * 28 | * @param 29 | */ 30 | public class Hit { 31 | /** 32 | * 模式串在母文本中的起始位置 33 | */ 34 | public final int begin; 35 | /** 36 | * 模式串在母文本中的终止位置 37 | */ 38 | public final int end; 39 | /** 40 | * 模式串对应的值 41 | */ 42 | public final V value; 43 | 44 | public Hit(int begin, int end, V value) { 45 | this.begin = begin; 46 | this.end = end; 47 | this.value = value; 48 | } 49 | 50 | @Override 51 | public String toString() { 52 | return String.format("[%d:%d]=%s", begin, end, value); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/ahocorasick/IHit.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * 19 | * He Han 20 | * hankcs.cn@gmail.com 21 | * 源代码来自于 https://github.com/hankcs/HanLP 22 | */ 23 | package com.mayabot.nlp.algorithm.collection.ahocorasick; 24 | 25 | /** 26 | * 命中一个模式串的处理方法 27 | */ 28 | public interface IHit { 29 | /** 30 | * 命中一个模式串 31 | * 32 | * @param begin 模式串在母文本中的起始位置 33 | * @param end 模式串在母文本中的终止位置 34 | * @param value 模式串对应的值 35 | */ 36 | void hit(int begin, int end, V value); 37 | } 38 | 39 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/ahocorasick/IHitFull.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | /* 18 | * 19 | * He Han 20 | * hankcs.cn@gmail.com 21 | * 源代码来自于 https://github.com/hankcs/HanLP 22 | */ 23 | package com.mayabot.nlp.algorithm.collection.ahocorasick; 24 | 25 | public interface IHitFull { 26 | /** 27 | * 命中一个模式串 28 | * 29 | * @param begin 模式串在母文本中的起始位置 30 | * @param end 模式串在母文本中的终止位置 31 | * @param value 模式串对应的值 32 | * @param index 模式串对应的值的下标 33 | */ 34 | void hit(int begin, int end, V value, int index); 35 | } 36 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/bintrie/BinTrieNode.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.algorithm.collection.bintrie; 18 | 19 | /** 20 | * @param 21 | * @author jimichan 22 | */ 23 | public interface BinTrieNode { 24 | 25 | BinTrieNode addChildNode(BinTrieNode nodeToInsert); 26 | 27 | BinTrieNode findChild(char c); 28 | 29 | byte getStatus(); 30 | 31 | T getValue(); 32 | 33 | int compareTo(char c); 34 | 35 | boolean contains(char c); 36 | 37 | 38 | default BinTrieNode findNode(char[] keyWord) { 39 | BinTrieNode point = this; 40 | for (int j = 0; j < keyWord.length; j++) { 41 | point = point.findChild(keyWord[j]); 42 | if (point == null) { 43 | return null; 44 | } 45 | } 46 | return point; 47 | } 48 | 49 | /** 50 | * 寻找到这个路径的最后一个节点 51 | * 52 | * @param key 53 | * @return BinTrieNode 54 | */ 55 | default BinTrieNode findNode(CharSequence key) { 56 | BinTrieNode branch = this; 57 | int len = key.length(); 58 | for (int i = 0; i < len; i++) { 59 | char _char = key.charAt(i); 60 | if (branch == null) { 61 | return null; 62 | } 63 | branch = branch.findChild(_char); 64 | } 65 | return branch; 66 | } 67 | 68 | } 69 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/bintrie/TrieTreeMatcher.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.mayabot.nlp.algorithm.collection.bintrie; 18 | 19 | /** 20 | * @param 21 | * @author jimichan 22 | */ 23 | public interface TrieTreeMatcher { 24 | 25 | /** 26 | * 詞典中全部命中的詞語 27 | * 28 | * @return String 29 | */ 30 | String next(); 31 | 32 | 33 | /** 34 | * 得到全部参数 35 | * 36 | * @return String 37 | */ 38 | T getParams(); 39 | 40 | /** 41 | * 当参数对象是列表或者数组的时候,返回指定下标的内容。否则返回null 42 | * 43 | * @param i 44 | * @return String 45 | */ 46 | String getParam(int i); 47 | 48 | 49 | int getOffset(); 50 | 51 | } 52 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/dat/DATMapMatcher.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * 源代码参考和部分引用来自 https://github.com/hankcs/HanLP https://github.com/NLPchina/ansj_seg 19 | */ 20 | package com.mayabot.nlp.algorithm.collection.dat; 21 | 22 | /** 23 | * 一个搜索工具(注意,当调用next()返回false后不应该继续调用next(),除非reset状态) 24 | *
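TrieTreeMatcher above is drained by calling next() until it returns null. A sketch of that loop, generic over however the matcher was obtained (the concrete matcher classes in this package are not shown here):

```kotlin
import com.mayabot.nlp.algorithm.collection.bintrie.TrieTreeMatcher

/** Collects every matched word together with its offset in the input. */
fun <T> drain(matcher: TrieTreeMatcher<T>): List<Pair<Int, String>> {
    val hits = ArrayList<Pair<Int, String>>()
    var word = matcher.next()
    while (word != null) {
        hits += matcher.offset to word   // getOffset(): position of the current hit
        word = matcher.next()
    }
    return hits
}
```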
25 | * DAT的匹配器是一个多匹配器,把各种可能都计算出来 26 | * 27 | * @author jimichan 28 | */ 29 | public interface DATMapMatcher { 30 | 31 | boolean next(); 32 | 33 | int getBegin(); 34 | 35 | int getLength(); 36 | 37 | V getValue(); 38 | 39 | int getIndex(); 40 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/collection/dat/FastDatCharSet.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.algorithm.collection.dat; 2 | 3 | import java.util.HashSet; 4 | import java.util.Set; 5 | import java.util.TreeSet; 6 | 7 | public class FastDatCharSet { 8 | 9 | private DoubleArrayTrie map; 10 | 11 | public FastDatCharSet(char... chars) { 12 | HashSet set = new HashSet<>(); 13 | for (char aChar : chars) { 14 | set.add(aChar); 15 | } 16 | set(set); 17 | } 18 | 19 | public FastDatCharSet(Set characterSet) { 20 | set(characterSet); 21 | } 22 | 23 | private void set(Set characterSet) { 24 | TreeSet treeMap = new TreeSet<>(); 25 | 26 | for (Character character : characterSet) { 27 | treeMap.add(character.toString()); 28 | } 29 | 30 | this.map = new DoubleArrayTrie(treeMap); 31 | } 32 | 33 | public boolean contains(char ch) { 34 | return map.indexOf(ch) != -1; 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/algorithm/distance/StringDistance.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package com.mayabot.nlp.algorithm.distance; 18 | 19 | /** 20 | * Interface for string distances. 21 | */ 22 | public interface StringDistance { 23 | 24 | /** 25 | * Returns a float between 0 and 1 based on how similar the specified strings are to one another. 26 | * Returning a value of 1 means the specified strings are identical and 0 means the 27 | * string are maximally different. 28 | * @param s1 The first string. 29 | * @param s2 The second string. 30 | * @return a float between 0 and 1 based on how similar the specified strings are to one another. 
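Two short sketches for the double-array-trie utilities above: draining a DATMapMatcher (respecting the contract that next() must not be called again once it has returned false), and the fully shown FastDatCharSet:

```kotlin
import com.mayabot.nlp.algorithm.collection.dat.DATMapMatcher
import com.mayabot.nlp.algorithm.collection.dat.FastDatCharSet

/** Walks all candidate matches produced by a DATMapMatcher. */
fun <V> allMatches(matcher: DATMapMatcher<V>): List<Triple<Int, Int, V>> {
    val out = ArrayList<Triple<Int, Int, V>>()
    while (matcher.next()) {
        // begin is inclusive, begin + length is exclusive
        out += Triple(matcher.begin, matcher.begin + matcher.length, matcher.value)
    }
    return out
}

fun main() {
    val vowels = FastDatCharSet('a', 'e', 'i', 'o', 'u')
    println(vowels.contains('e'))   // true
    println(vowels.contains('x'))   // false
}
```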
31 | */ 32 | public float getDistance(String s1,String s2); 33 | 34 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/blas/BlasUtils.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.blas 2 | 3 | import java.nio.ByteBuffer 4 | import kotlin.math.sqrt 5 | 6 | /** 7 | * 向量点积 8 | */ 9 | fun dot(a: Vector, b: Vector): Float { 10 | return a * b 11 | } 12 | 13 | /** 14 | * 向量余弦 15 | */ 16 | fun cosine(a: Vector, b: Vector): Float { 17 | val normA = a * a 18 | val normB = b * b 19 | return if (normA == 0.0f || normB == 0.0f) { 20 | 0.0f 21 | } else { 22 | (a * b / sqrt((normA * normB).toDouble())).toFloat() 23 | } 24 | } 25 | 26 | fun floatArrayVector(size: Int) = DenseVector(size) 27 | fun byteBufferVector(size: Int) = ByteBufferDenseVector(ByteBuffer.allocate(size shl 2), 0, size) 28 | fun directByteBufferVector(size: Int) = ByteBufferDenseVector(ByteBuffer.allocateDirect(size shl 2), 0, size) 29 | 30 | fun floatArrayMatrix(rows: Int, cols: Int, data: FloatArray) = DenseArrayMatrix(rows, cols, data) 31 | fun floatArrayMatrix(rows: Int, cols: Int) = DenseArrayMatrix(rows, cols) 32 | fun byteBufferMatrix(rows: Int, cols: Int) = ByteBufferMatrix(rows, cols, false) 33 | fun directByteBufferMatrix(rows: Int, cols: Int) = ByteBufferMatrix(rows, cols, true) 34 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/blas/Matrix.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.blas 2 | 3 | import java.io.File 4 | import java.io.Serializable 5 | import java.nio.channels.FileChannel 6 | 7 | /** 8 | * Float矩阵 9 | */ 10 | interface Matrix : Serializable { 11 | 12 | val row: Int 13 | val col: Int 14 | 15 | /** 16 | * 矩阵的第i行和vec进行点积计算 17 | */ 18 | fun dotRow(vec: Vector, i: Int): Float 19 | 20 | /** 21 | * 把[vector]加到指定的[row] , [a]是系数 22 | */ 23 | fun addVectorToRow(vector: Vector, row: Int, a: Float) 24 | 25 | fun addRowToVector(target: Vector, i: Int, a: Double? 
= null) 26 | 27 | fun save(file: File) 28 | 29 | fun save(channel: FileChannel) 30 | 31 | 32 | } 33 | 34 | interface DenseMatrix : Matrix { 35 | 36 | fun zero() 37 | // fun fill(v: Float) 38 | fun uniform(number: Number) 39 | 40 | operator fun get(row: Int): Vector 41 | operator fun get(i: Int, j: Int): Float 42 | 43 | operator fun set(i: Int, j: Int, v: Float) 44 | 45 | /** 46 | * 乘法 47 | * 48 | * 从ib到ie这些行,系数存在vector里面 49 | */ 50 | fun multiplyRow(nums: Vector, ib: Int = 0, ie: Int = -1) 51 | 52 | /** 53 | * 除法 54 | */ 55 | fun divideRow(nums: Vector, ib: Int = 0, ie: Int = -1) 56 | 57 | fun l2NormRow(i: Int): Float 58 | fun l2NormRow(norms: Vector) 59 | 60 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/cli/MynlpCli.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.cli 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.common.logging.InternalLogLevel 5 | import com.mayabot.nlp.common.logging.JdkLogger 6 | import com.mayabot.nlp.segment.segment 7 | 8 | fun main(args: Array) { 9 | 10 | JdkLogger.defaultLevel = InternalLogLevel.WARN; 11 | 12 | Mynlp.configer().setAutoDownloadRes(true) 13 | 14 | println("2012年的冬天".segment()) 15 | 16 | if (args.isEmpty()) { 17 | printTopHelp() 18 | return 19 | } 20 | val subcommand = args.first() 21 | val commandArgs = args.drop(1).toTypedArray() 22 | 23 | } 24 | 25 | fun printTopHelp() { 26 | println( 27 | """ 28 | Usage: mynlp subcommand [OPTION]... 29 | 30 | Mynlp实用工具,提供多个subcommand执行不同的功能. 31 | 32 | Subcommand List: 33 | 34 | segment 中文分词 35 | ner 命名实体 36 | pos 词性分析 37 | name 人名模型 38 | perceptron 通用AP训练和评估 39 | train 内部模型训练入口 40 | nwd 新词发现 41 | fastText 分类模型和词嵌入 42 | t2s 繁简体转换 43 | pinyin 文字转拼音 44 | pinyin-split 拼音流切分(nihaoshijie --> ni hao shi jie) 45 | hash 46 | classify 便捷的文本分类 47 | 48 | 49 | """.trimIndent() 50 | ) 51 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/ArraySizingStrategy.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common; 2 | 3 | /** 4 | * Resizing (growth) strategy for array-backed buffers. 5 | */ 6 | public interface ArraySizingStrategy { 7 | /** 8 | * @param currentBufferLength Current size of the array (buffer). This number should comply with 9 | * the strategy's policies (it is a result of initial rounding or 10 | * further growCalls). It can also be zero, indicating the growth 11 | * from an empty buffer. 12 | * @param elementsCount Number of elements stored in the buffer. 13 | * @param expectedAdditions Expected number of additions (resize hint). 14 | * @return Must return a new size at least as big as to hold 15 | * elementsCount + expectedAdditions. 
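A quick sketch tying the BlasUtils helpers to the matrix API above: pack two rows into a dense matrix with floatArrayMatrix, pull them back out as Vectors, and compare them with dot/cosine/dotRow. It assumes DenseArrayMatrix exposes the DenseMatrix row accessor declared above:

```kotlin
import com.mayabot.nlp.blas.cosine
import com.mayabot.nlp.blas.dot
import com.mayabot.nlp.blas.floatArrayMatrix

fun main() {
    // Two 3-dimensional rows stored in row-major order.
    val m = floatArrayMatrix(2, 3, floatArrayOf(
        1f, 2f, 3f,
        2f, 4f, 6f
    ))

    val a = m[0]              // DenseMatrix#get(row) returns the row as a Vector
    val b = m[1]

    println(dot(a, b))        // 28.0
    println(cosine(a, b))     // ~1.0, the rows are parallel
    println(m.dotRow(a, 1))   // dot product of vector a with row 1, also 28.0
}
```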
16 | */ 17 | int grow(int currentBufferLength, int elementsCount, int expectedAdditions); 18 | } 19 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/Pair.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common; 2 | 3 | public final class Pair { 4 | 5 | public T first; 6 | public R second; 7 | 8 | public Pair(T first, R second) { 9 | this.first = first; 10 | this.second = second; 11 | } 12 | 13 | @Override 14 | public String toString() { 15 | return "(" + first + ", " + second + ')'; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/ParagraphIterable.kt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.common 18 | 19 | 20 | /** 21 | * ParagraphReader包装成iterable对象 22 | * 23 | * @author jimichan 24 | */ 25 | 26 | class ParagraphIterable(private val reader: ParagraphReader) : Iterable { 27 | 28 | override fun iterator(): Iterator { 29 | 30 | return object : AbstractIterator() { 31 | override fun computeNext() { 32 | val n = reader.next() 33 | if (n == null) { 34 | done() 35 | }else{ 36 | setNext(n) 37 | } 38 | } 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/ParagraphReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.common; 18 | 19 | import java.io.IOException; 20 | 21 | /** 22 | * 分段接口 23 | * 24 | * @author jimichan 25 | */ 26 | public interface ParagraphReader { 27 | /** 28 | * 返回一个段落,最后返回null 29 | * 30 | * @return String 31 | * @throws IOException 32 | */ 33 | String next() throws IOException; 34 | } 35 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/ParagraphReaderString.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. 
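As an illustration of the ArraySizingStrategy contract above (not necessarily the policy mynlp itself ships): grow by roughly 50%, but never return less than the caller needs.

```kotlin
import com.mayabot.nlp.common.ArraySizingStrategy

/** Minimal growth policy: at least 1.5x the current buffer, and always large enough. */
class GrowByHalfStrategy : ArraySizingStrategy {
    override fun grow(currentBufferLength: Int, elementsCount: Int, expectedAdditions: Int): Int {
        val needed = elementsCount + expectedAdditions
        val grown = currentBufferLength + (currentBufferLength shr 1)
        return maxOf(needed, grown)
    }
}
```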
All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.common; 18 | 19 | /** 20 | * 有的时候给定的文本很短,那么就做个假的 21 | * 22 | * @author jimichan 23 | */ 24 | public class ParagraphReaderString implements ParagraphReader { 25 | 26 | private String string = null; 27 | 28 | public ParagraphReaderString(String string) { 29 | this.string = string; 30 | } 31 | 32 | @Override 33 | public String next() { 34 | String old = string; 35 | string = null; 36 | return old; 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/TagAndScore.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common; 2 | 3 | /** 4 | * @author jimichan 5 | */ 6 | public class TagAndScore { 7 | 8 | private String tag; 9 | 10 | private float score; 11 | 12 | public TagAndScore(String tag, float score) { 13 | this.tag = tag; 14 | this.score = score; 15 | } 16 | 17 | public String getTag() { 18 | return tag; 19 | } 20 | 21 | public void setTag(String tag) { 22 | this.tag = tag; 23 | } 24 | 25 | public float getScore() { 26 | return score; 27 | } 28 | 29 | public void setScore(float score) { 30 | this.score = score; 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/cli/ParseException.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | *
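ParagraphReaderString and ParagraphIterable above compose directly: the reader hands out its single string and then signals the end with null, which the Iterable wrapper turns into ordinary iteration:

```kotlin
import com.mayabot.nlp.common.ParagraphIterable
import com.mayabot.nlp.common.ParagraphReaderString

fun main() {
    val reader = ParagraphReaderString("这是一个很短的段落")
    for (paragraph in ParagraphIterable(reader)) {
        println(paragraph)   // printed exactly once
    }
}
```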
9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | *
11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | 18 | package com.mayabot.nlp.common.cli; 19 | 20 | /** 21 | * Base for Exceptions thrown during parsing of a command-line. 22 | * 23 | * @version $Id: ParseException.java 1443102 2013-02-06 18:12:16Z tn $ 24 | */ 25 | public class ParseException extends Exception { 26 | /** 27 | * This exception {@code serialVersionUID}. 28 | */ 29 | private static final long serialVersionUID = 9112808380089253192L; 30 | 31 | /** 32 | * Construct a new ParseException 33 | * with the specified detail message. 34 | * 35 | * @param message the detail message 36 | */ 37 | public ParseException(String message) { 38 | super(message); 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/cli/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | *
9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | *
11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | *
17 | * Commons CLI 1.3 18 | * 19 | * @version $Id: package-info.java 1443102 2013-02-06 18:12:16Z tn $ 20 | */ 21 | 22 | /** 23 | * Commons CLI 1.3 24 | * 25 | * @version $Id: package-info.java 1443102 2013-02-06 18:12:16Z tn $ 26 | */ 27 | package com.mayabot.nlp.common.cli; 28 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/hash/MurmurHash3Kotlin.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.hash 2 | 3 | object MurmurHash3Utils { 4 | 5 | fun hashBytes(byteArray: ByteArray, offset: Int, length: Int, seed: Long = 0L, hash: MurmurHash3.Hash128 = MurmurHash3.Hash128()): MurmurHash3.Hash128 { 6 | return MurmurHash3.hash128(byteArray, offset, length, seed, hash) 7 | } 8 | 9 | fun hashBytes(byteArray: ByteArray): MurmurHash3.Hash128 { 10 | return this.hashBytes(byteArray, 0, byteArray.size) 11 | } 12 | 13 | fun hashString(text: String): Long { 14 | val bytes = text.toByteArray(Charsets.UTF_8) 15 | return hashBytes(bytes).h1 16 | } 17 | 18 | } 19 | 20 | fun String.murmur3(): Long { 21 | return MurmurHash3Utils.hashString(this) 22 | } 23 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/hppc/BufferAllocationException.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.hppc; 2 | 3 | import java.util.IllegalFormatException; 4 | import java.util.Locale; 5 | 6 | public class BufferAllocationException extends RuntimeException { 7 | public BufferAllocationException(String message) { 8 | super(message); 9 | } 10 | 11 | public BufferAllocationException(String message, Object... args) { 12 | this(message, null, args); 13 | } 14 | 15 | public BufferAllocationException(String message, Throwable t, Object... args) { 16 | super(formatMessage(message, t, args), t); 17 | } 18 | 19 | private static String formatMessage(String message, Throwable t, Object... args) { 20 | try { 21 | return String.format(Locale.ROOT, message, args); 22 | } catch (IllegalFormatException e) { 23 | BufferAllocationException substitute = 24 | new BufferAllocationException(message + " [ILLEGAL FORMAT, ARGS SUPPRESSED]"); 25 | if (t != null) { 26 | substitute.addSuppressed(t); 27 | } 28 | substitute.addSuppressed(e); 29 | throw substitute; 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/injector/BeanFactory.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.injector; 2 | 3 | import org.jetbrains.annotations.NotNull; 4 | 5 | public interface BeanFactory { 6 | public Object create(@NotNull Injector injector) ; 7 | } 8 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/injector/ImplementedBy.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.injector; 2 | 3 | import java.lang.annotation.Retention; 4 | import java.lang.annotation.Target; 5 | 6 | import static java.lang.annotation.ElementType.TYPE; 7 | import static java.lang.annotation.RetentionPolicy.RUNTIME; 8 | 9 | @Retention(RUNTIME) 10 | @Target(TYPE) 11 | public @interface ImplementedBy { 12 | /** The implementation type. 
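The MurmurHash3Utils helpers and the murmur3() extension above produce a 64-bit fingerprint (the h1 half of the 128-bit hash) for a UTF-8 string:

```kotlin
import com.mayabot.nlp.common.hash.MurmurHash3Utils
import com.mayabot.nlp.common.hash.murmur3

fun main() {
    println("mynlp".murmur3())                    // extension-function form
    println(MurmurHash3Utils.hashString("mynlp")) // same value via the object
}
```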
*/ 13 | Class value(); 14 | } 15 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/injector/Singleton.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.injector; 2 | 3 | import java.lang.annotation.Retention; 4 | import java.lang.annotation.Target; 5 | 6 | import static java.lang.annotation.ElementType.TYPE; 7 | import static java.lang.annotation.RetentionPolicy.RUNTIME; 8 | 9 | @Retention(RUNTIME) 10 | @Target(TYPE) 11 | public @interface Singleton { 12 | } 13 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/logging/FormattingTuple.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | 18 | package com.mayabot.nlp.common.logging; 19 | 20 | /** 21 | * Holds the results of formatting done by {@link MessageFormatter}. 22 | */ 23 | final class FormattingTuple { 24 | 25 | private final String message; 26 | private final Throwable throwable; 27 | 28 | FormattingTuple(String message, Throwable throwable) { 29 | this.message = message; 30 | this.throwable = throwable; 31 | } 32 | 33 | public String getMessage() { 34 | return message; 35 | } 36 | 37 | public Throwable getThrowable() { 38 | return throwable; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/logging/InternalLogLevel.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * Copyright 2012 The Netty Project 19 | * 20 | * The Netty Project licenses this file to you under the Apache License, 21 | * version 2.0 (the "License"); you may not use this file except in compliance 22 | * with the License. 
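A sketch of how the injector annotations above are meant to be applied. MyService and MyServiceImpl are hypothetical names, and how Injector actually resolves the binding is not shown in these files:

```kotlin
import com.mayabot.nlp.common.injector.ImplementedBy
import com.mayabot.nlp.common.injector.Singleton

// The interface declares its default implementation; the implementation
// is marked as a singleton so the injector can reuse one instance.
@ImplementedBy(MyServiceImpl::class)
interface MyService {
    fun ping(): String
}

@Singleton
class MyServiceImpl : MyService {
    override fun ping() = "pong"
}
```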
You may obtain a copy of the License at: 23 | * 24 | * http://www.apache.org/licenses/LICENSE-2.0 25 | * 26 | * Unless required by applicable law or agreed to in writing, software 27 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 28 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 29 | * License for the specific language governing permissions and limitations 30 | * under the License. 31 | */ 32 | package com.mayabot.nlp.common.logging; 33 | 34 | /** 35 | * The log level that {@link InternalLogger} can log at. 36 | */ 37 | public enum InternalLogLevel { 38 | /** 39 | * 'TRACE' log level. 40 | */ 41 | TRACE, 42 | /** 43 | * 'DEBUG' log level. 44 | */ 45 | DEBUG, 46 | /** 47 | * 'INFO' log level. 48 | */ 49 | INFO, 50 | /** 51 | * 'WARN' log level. 52 | */ 53 | WARN, 54 | /** 55 | * 'ERROR' log level. 56 | */ 57 | ERROR 58 | } 59 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/logging/Log4J2LoggerFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * Copyright 2016 The Netty Project 19 | * 20 | * The Netty Project licenses this file to you under the Apache License, 21 | * version 2.0 (the "License"); you may not use this file except in compliance 22 | * with the License. You may obtain a copy of the License at: 23 | * 24 | * http://www.apache.org/licenses/LICENSE-2.0 25 | * 26 | * Unless required by applicable law or agreed to in writing, software 27 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 28 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 29 | * License for the specific language governing permissions and limitations 30 | * under the License. 31 | */ 32 | package com.mayabot.nlp.common.logging; 33 | 34 | import org.apache.logging.log4j.LogManager; 35 | 36 | public final class Log4J2LoggerFactory extends InternalLoggerFactory { 37 | 38 | public static final InternalLoggerFactory INSTANCE = new Log4J2LoggerFactory(); 39 | 40 | /** 41 | * @deprecated Use {@link #INSTANCE} instead. 42 | */ 43 | @Deprecated 44 | public Log4J2LoggerFactory() { 45 | } 46 | 47 | @Override 48 | public InternalLogger newInstance(String name) { 49 | return new Log4J2Logger(LogManager.getLogger(name)); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/logging/Log4JLoggerFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * Copyright 2012 The Netty Project 19 | * 20 | * The Netty Project licenses this file to you under the Apache License, 21 | * version 2.0 (the "License"); you may not use this file except in compliance 22 | * with the License. You may obtain a copy of the License at: 23 | * 24 | * http://www.apache.org/licenses/LICENSE-2.0 25 | * 26 | * Unless required by applicable law or agreed to in writing, software 27 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 28 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 29 | * License for the specific language governing permissions and limitations 30 | * under the License. 31 | */ 32 | package com.mayabot.nlp.common.logging; 33 | 34 | import org.apache.log4j.Logger; 35 | 36 | /** 37 | * Logger factory which creates an 38 | * Apache Log4J 39 | * logger. 40 | */ 41 | public class Log4JLoggerFactory extends InternalLoggerFactory { 42 | 43 | public static final InternalLoggerFactory INSTANCE = new Log4JLoggerFactory(); 44 | 45 | /** 46 | * @deprecated Use {@link #INSTANCE} instead. 47 | */ 48 | @Deprecated 49 | public Log4JLoggerFactory() { 50 | } 51 | 52 | @Override 53 | public InternalLogger newInstance(String name) { 54 | return new Log4JLogger(Logger.getLogger(name)); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/logging/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * Copyright 2013 The Netty Project 19 | * 20 | * The Netty Project licenses this file to you under the Apache License, 21 | * version 2.0 (the "License"); you may not use this file except in compliance 22 | * with the License. You may obtain a copy of the License at: 23 | * 24 | * http://www.apache.org/licenses/LICENSE-2.0 25 | * 26 | * Unless required by applicable law or agreed to in writing, software 27 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 28 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 29 | * License for the specific language governing permissions and limitations 30 | * under the License. 31 | */ 32 | 33 | /** 34 | * Internal-use-only logging API which is not allowed to be used outside Netty. 
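In-project use of the logging facade above is a static factory lookup per class; which backend ends up behind it is decided by InternalLoggerFactory, and the info(String)-style methods are assumed from the Netty lineage rather than shown here:

```kotlin
import com.mayabot.nlp.common.logging.InternalLogger
import com.mayabot.nlp.common.logging.InternalLoggerFactory

class MyComponent {
    private val logger: InternalLogger = InternalLoggerFactory.getInstance(MyComponent::class.java)

    fun run() {
        logger.info("component started")   // assumed SLF4J-like logging methods
    }
}
```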
35 | */ 36 | package com.mayabot.nlp.common.logging; 37 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/resources/ClasspathNlpResourceFactory.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.resources; 2 | 3 | import java.net.URL; 4 | import java.nio.charset.Charset; 5 | 6 | /** 7 | * 从Claspath下面的路径下加载资源 8 | * 9 | * @author jimichan 10 | */ 11 | public class ClasspathNlpResourceFactory implements NlpResourceFactory { 12 | 13 | private ClassLoader classLoader; 14 | 15 | public ClasspathNlpResourceFactory(ClassLoader classLoader) { 16 | this.classLoader = classLoader; 17 | } 18 | 19 | @Override 20 | public NlpResource load(String resourceName, Charset charset) { 21 | 22 | if (resourceName.startsWith("/")) { 23 | resourceName = resourceName.substring(1); 24 | } 25 | String path = resourceName; 26 | 27 | URL resource = classLoader.getResource(path); 28 | 29 | if (resource != null) { 30 | return new URLNlpResource(resource, charset); 31 | } 32 | 33 | return null; 34 | } 35 | } 36 | 37 | 38 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/resources/FileNlpResourceFactory.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.resources 2 | 3 | import java.io.File 4 | import java.io.IOException 5 | import java.io.InputStream 6 | import java.nio.charset.Charset 7 | 8 | /** 9 | * @author jimichan 10 | */ 11 | class FileNlpResourceFactory(private val baseDir: File) : NlpResourceFactory { 12 | 13 | override fun load(resourceName: String, charset: Charset): NlpResource? { 14 | if (!baseDir.exists() || baseDir.isFile) { 15 | return null 16 | } 17 | 18 | val file = File(baseDir, resourceName.replace('/', File.separatorChar)) 19 | 20 | return if (file.exists() && file.canRead()) { 21 | FileMynlpResource(file, charset) 22 | } else null 23 | } 24 | 25 | class FileMynlpResource(private val file: File, private val charset: Charset) : NlpResource { 26 | @Throws(IOException::class) 27 | override fun inputStream(): InputStream { 28 | return file.inputStream().buffered() 29 | } 30 | 31 | override fun toString(): String { 32 | return file.absolutePath 33 | } 34 | } 35 | 36 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/resources/NlpResource.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.mayabot.nlp.common.resources; 18 | 19 | import com.mayabot.nlp.common.EncryptionUtil; 20 | 21 | import java.io.IOException; 22 | import java.io.InputStream; 23 | 24 | /** 25 | * 读取的模型是基于文本的。一般一行一个数据。 26 | * 项目中和外部系统驳接,比如数据库、HDSF 27 | * 28 | * @author jimichan 29 | */ 30 | public interface NlpResource { 31 | 32 | InputStream inputStream() throws IOException; 33 | 34 | /** 35 | * 有很多实现办法。要么对文件或数据进行计算,还有他同名文件 abc.txt 对应一个文件 abc.txt.hash 进行记录 36 | * 37 | * @return String 38 | */ 39 | default String hash() { 40 | 41 | try { 42 | InputStream inputStream = inputStream(); 43 | 44 | try { 45 | return EncryptionUtil.md5(inputStream); 46 | } finally { 47 | inputStream.close(); 48 | } 49 | } catch (Exception e) { 50 | throw new RuntimeException(e); 51 | } 52 | } 53 | 54 | 55 | } 56 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/resources/NlpResourceFactory.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.resources; 2 | 3 | import kotlin.text.Charsets; 4 | 5 | import java.nio.charset.Charset; 6 | 7 | /** 8 | * 资源文件的来源。比如从文件系统里面的加载,或者从classpath里面去加载 9 | * 10 | * @author jimichan 11 | */ 12 | public interface NlpResourceFactory { 13 | 14 | /** 15 | * 加载资源 16 | * 17 | * @param resourceName 格式为 dict/abc.dict 18 | * @param charset 字符集 19 | * @return 如果资源不存在那么返回NULL 20 | */ 21 | NlpResource load(String resourceName, Charset charset); 22 | 23 | /** 24 | * 加载资源 25 | * 26 | * @param resourceName 27 | * @return NlpResource 28 | */ 29 | default NlpResource load(String resourceName) { 30 | return load(resourceName, Charsets.UTF_8); 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/resources/URLNlpResource.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
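Putting the resource abstractions above together: try the classpath first, fall back to a directory on disk, then read through the NlpResource handle. The dict/abc.dict name is just the placeholder from the NlpResourceFactory javadoc, and the data directory is an assumption for the sketch:

```kotlin
import com.mayabot.nlp.common.resources.ClasspathNlpResourceFactory
import com.mayabot.nlp.common.resources.FileNlpResourceFactory
import com.mayabot.nlp.common.resources.NlpResource
import java.io.File

fun findResource(name: String): NlpResource? {
    val fromClasspath =
        ClasspathNlpResourceFactory(Thread.currentThread().contextClassLoader).load(name)
    if (fromClasspath != null) return fromClasspath
    return FileNlpResourceFactory(File("data")).load(name)
}

fun main() {
    val resource = findResource("dict/abc.dict")
    if (resource == null) {
        println("resource not found")
        return
    }
    println(resource.hash())   // md5 of the underlying content
    resource.inputStream().use { input -> println(input.available()) }
}
```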
15 | */ 16 | 17 | package com.mayabot.nlp.common.resources; 18 | 19 | import com.mayabot.nlp.common.logging.InternalLogger; 20 | import com.mayabot.nlp.common.logging.InternalLoggerFactory; 21 | 22 | import java.io.BufferedInputStream; 23 | import java.io.IOException; 24 | import java.io.InputStream; 25 | import java.net.URL; 26 | import java.nio.charset.Charset; 27 | 28 | /** 29 | * @author jimichan 30 | */ 31 | public class URLNlpResource implements NlpResource { 32 | 33 | static InternalLogger logger = InternalLoggerFactory.getInstance(URLNlpResource.class); 34 | 35 | private final URL url; 36 | private final Charset charset; 37 | 38 | public URLNlpResource(URL url, Charset charset) { 39 | this.url = url; 40 | this.charset = charset; 41 | } 42 | 43 | @Override 44 | public InputStream inputStream() throws IOException { 45 | return new BufferedInputStream(url.openStream()); 46 | } 47 | 48 | @Override 49 | public String toString() { 50 | return url.toString(); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/resources/UseLines.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.resources 2 | 3 | import com.mayabot.nlp.common.utils.CharSourceLineReader 4 | import java.io.InputStream 5 | import java.util.function.Consumer 6 | 7 | object UseLines { 8 | 9 | @JvmOverloads 10 | @JvmStatic 11 | fun forEachLine(inputStream: InputStream, 12 | trim: Boolean = true, 13 | skipBlank: Boolean = true, 14 | consumer: Consumer) { 15 | inputStream.bufferedReader().forEachLine { x -> 16 | var line = x 17 | if (trim) { 18 | line = line.trim() 19 | } 20 | if (skipBlank && line.isBlank()) { 21 | 22 | } else { 23 | consumer.accept(line) 24 | } 25 | } 26 | } 27 | 28 | @JvmOverloads 29 | @JvmStatic 30 | fun useLines(inputStream: InputStream, 31 | trim: Boolean = true, 32 | skipBlank: Boolean = true, 33 | consumer: Consumer) { 34 | inputStream.bufferedReader().forEachLine { x -> 35 | var line = x 36 | if (trim) { 37 | line = line.trim() 38 | } 39 | if (skipBlank && line.isBlank()) { 40 | 41 | } else { 42 | consumer.accept(line) 43 | } 44 | } 45 | } 46 | 47 | @JvmStatic 48 | fun lineReader(inputStream: InputStream): CharSourceLineReader { 49 | return CharSourceLineReader(inputStream.bufferedReader(charset = Charsets.UTF_8)) 50 | } 51 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/utils/CharSourceLineReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
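UseLines above trims lines and skips blank ones by default. Because the Consumer parameter comes after the two defaulted flags, it reads most naturally as a named argument from Kotlin (Java callers get the @JvmOverloads overloads instead); lineReader() gives the same data as a closeable iterator:

```kotlin
import com.mayabot.nlp.common.resources.UseLines
import java.util.function.Consumer

fun main() {
    val text = "第一行\n\n   第二行   \n"

    UseLines.forEachLine(text.byteInputStream(Charsets.UTF_8), consumer = Consumer { println(it) })

    UseLines.lineReader(text.byteInputStream(Charsets.UTF_8)).use { reader ->
        for (line in reader) println(line)   // CharSourceLineReader is an Iterator<String>
    }
}
```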
15 | */ 16 | 17 | package com.mayabot.nlp.common.utils; 18 | 19 | 20 | import kotlin.collections.AbstractIterator; 21 | 22 | import java.io.BufferedReader; 23 | 24 | public class CharSourceLineReader extends AbstractIterator implements AutoCloseable { 25 | 26 | private final BufferedReader reader; 27 | 28 | public CharSourceLineReader(BufferedReader reader) { 29 | this.reader = reader; 30 | } 31 | 32 | @Override 33 | protected void computeNext() { 34 | try { 35 | String line = reader.readLine(); 36 | if (line == null) { 37 | done(); 38 | return; 39 | } else { 40 | setNext(line); 41 | return; 42 | // return line; 43 | } 44 | } catch (Exception e) { 45 | throw new RuntimeException(e); 46 | } 47 | 48 | } 49 | 50 | @Override 51 | public void close() { 52 | try { 53 | reader.close(); 54 | } catch (Exception e) { 55 | throw new RuntimeException(e); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/utils/DownloadUtils.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.utils 2 | 3 | import java.io.File 4 | import java.io.IOException 5 | import java.net.URL 6 | import java.util.zip.ZipInputStream 7 | 8 | object DownloadUtils { 9 | 10 | /** 11 | * 下载文件 12 | * 13 | * @param url 14 | * @param file 15 | */ 16 | @Throws(IOException::class) 17 | @JvmStatic 18 | fun download(url: String, file: File) { 19 | //先完全读入到内存中去。然后一次性写入文件 20 | file.writeBytes(URL(url).readBytes()) 21 | } 22 | 23 | /** 24 | * unzip file 25 | * 26 | * @param file 27 | * @throws Exception 28 | */ 29 | @Throws(Exception::class) 30 | @JvmStatic 31 | fun unzip(file: File) { 32 | 33 | ZipInputStream(file.inputStream().buffered()).use { zipInputStream -> 34 | var entry = zipInputStream.nextEntry 35 | 36 | while (entry != null) { 37 | val name = entry.name 38 | 39 | File(file.parent, name).outputStream().buffered().use { 40 | zipInputStream.copyTo(it) 41 | } 42 | 43 | entry = zipInputStream.nextEntry 44 | } 45 | 46 | } 47 | 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/utils/MyInts.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
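DownloadUtils above reads the whole URL into memory before writing, so it suits modest file sizes; unzip() extracts entries next to the archive. The URL and target path below are placeholders:

```kotlin
import com.mayabot.nlp.common.utils.DownloadUtils
import java.io.File

fun main() {
    val zip = File(System.getProperty("java.io.tmpdir"), "mynlp-demo.zip")

    DownloadUtils.download("https://example.com/some-resource.zip", zip)
    DownloadUtils.unzip(zip)   // entries land in the zip's parent directory
}
```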
15 | */ 16 | 17 | package com.mayabot.nlp.common.utils; 18 | 19 | /** 20 | * @author jimichan 21 | */ 22 | public class MyInts { 23 | 24 | 25 | public static byte[] toByteArray(int[] value, int fromIndex, int toIndex) { 26 | toIndex = Math.min(value.length, toIndex); 27 | byte[] bytes = new byte[(toIndex - fromIndex) * 4]; 28 | int point = 0; 29 | for (int i = fromIndex; i < toIndex; i++) { 30 | int v = value[i]; 31 | bytes[point++] = (byte) (v >> 24); 32 | bytes[point++] = (byte) (v >> 16); 33 | bytes[point++] = (byte) (v >> 8); 34 | bytes[point++] = (byte) v; 35 | } 36 | return bytes; 37 | } 38 | 39 | 40 | public static int[] fromByteArrayToArray(byte[] bytes) { 41 | return fromByteArrayToArray(bytes, new int[bytes.length / 4], bytes.length); 42 | } 43 | 44 | public static int[] fromByteArrayToArray(byte[] bytes, int[] result, int bytesLen) { 45 | int intCount = bytesLen / 4; 46 | for (int i = 0, len = intCount; i < len; i++) { 47 | int from = i * 4; 48 | byte b1 = bytes[from++]; 49 | byte b2 = bytes[from++]; 50 | byte b3 = bytes[from++]; 51 | byte b4 = bytes[from++]; 52 | result[i] = b1 << 24 | (b2 & 0xFF) << 16 | (b3 & 0xFF) << 8 | (b4 & 0xFF); 53 | } 54 | return result; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/common/utils/MynlpFactories.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.common.utils; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStreamReader; 5 | import java.net.URL; 6 | import java.nio.charset.StandardCharsets; 7 | import java.util.Enumeration; 8 | import java.util.HashMap; 9 | import java.util.List; 10 | import java.util.Map; 11 | 12 | import static com.mayabot.nlp.common.Guava.mutiadd; 13 | 14 | /** 15 | * @author jimichan 16 | */ 17 | public class MynlpFactories { 18 | 19 | public static final String GuiceModule = "GuiceModule"; 20 | 21 | 22 | public static Map> load() throws Exception { 23 | 24 | Map> map = new HashMap<>(); 25 | 26 | { 27 | String[] split1 = System.getProperty(GuiceModule, "").trim().split(","); 28 | for (String k : split1) { 29 | if (!k.isEmpty()) { 30 | mutiadd(map, GuiceModule, Class.forName(k)); 31 | } 32 | } 33 | } 34 | 35 | Enumeration resources = MynlpFactories.class.getClassLoader(). 
36 | getResources("META-INF/mynlp.factories"); 37 | 38 | while (resources.hasMoreElements()) { 39 | URL url = resources.nextElement(); 40 | 41 | BufferedReader reader = new BufferedReader(new InputStreamReader(url.openStream(), StandardCharsets.UTF_8)); 42 | 43 | String line = reader.readLine(); 44 | 45 | while (line != null) { 46 | 47 | String[] split = line.split("="); 48 | 49 | if (split.length == 2) { 50 | mutiadd(map, split[0].trim(), Class.forName(split[1].trim())); 51 | } 52 | 53 | line = reader.readLine(); 54 | } 55 | reader.close(); 56 | } 57 | 58 | return map; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/fasttext/FasttextTranUtils.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext 2 | 3 | import com.mayabot.nlp.segment.LexerReader 4 | import com.mayabot.nlp.segment.Lexers 5 | import java.io.File 6 | 7 | class FasttextTranUtils { 8 | 9 | companion object { 10 | 11 | /** 12 | * 处理没有分词的语料 13 | * __label__xxxx 语料文本,语料文本,语料文本 14 | */ 15 | @JvmOverloads 16 | @JvmStatic 17 | fun prepareBySegment(from: File, 18 | to: File, 19 | label: String = "__label__", 20 | lexer: LexerReader = Lexers.coreBuilder().build().filterReader(true, true)) { 21 | 22 | fun processLine(line:String): String{ 23 | val list = ArrayList() 24 | line.split(" ").forEach { part-> 25 | if(part.startsWith(label)){ 26 | list += part 27 | }else{ 28 | lexer.scan(part).toWordSequence().forEach { word-> 29 | list += word 30 | } 31 | } 32 | } 33 | return list.joinToString(" ") 34 | } 35 | 36 | from.useLines { lines-> 37 | to.bufferedWriter(Charsets.UTF_8).use { writer-> 38 | lines.forEach { line-> 39 | writer.write(processLine(line)) 40 | writer.write("\n") 41 | } 42 | } 43 | } 44 | } 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/fasttext/dictionary/LoadDictFromDataInput.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext.dictionary 2 | 3 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/fasttext/loss/NegativeSamplingLoss.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext.loss 2 | 3 | import com.mayabot.nlp.blas.Matrix 4 | import com.mayabot.nlp.common.IntArrayList 5 | import com.mayabot.nlp.fasttext.Model 6 | import kotlin.random.Random 7 | 8 | 9 | class NegativeSamplingLoss(wo: Matrix, val neg: Int, targetCounts: LongArray) : BinaryLogisticLoss(wo) { 10 | companion object { 11 | const val NEGATIVE_TABLE_SIZE = 10000000 12 | } 13 | 14 | val negatives = IntArrayList() 15 | 16 | 17 | val uniform: (random: Random) -> Int 18 | 19 | init { 20 | var z = 0.0 21 | for (i in 0 until targetCounts.size) { 22 | z += Math.pow(targetCounts[i].toDouble(), 0.5) 23 | } 24 | 25 | for (i in 0 until targetCounts.size) { 26 | val c = Math.pow(targetCounts[i].toDouble(), 0.5) 27 | for (j in 0 until (c * NEGATIVE_TABLE_SIZE / z).toInt()) { 28 | negatives.add(i) 29 | } 30 | } 31 | val ns = negatives.size() 32 | //uniform_ = std::uniform_int_distribution(0, negatives_.size()); 33 | uniform = { random -> random.nextInt(ns) } 34 | } 35 | 36 | override fun forward(targets: IntArrayList, targetIndex: Int, state: Model.State, lr: Float, backprop: Boolean): Float { 37 | val target = 
targets[targetIndex] 38 | var loss = binaryLogistic(target, state, true, lr, backprop) 39 | for (n in 0 until neg) { 40 | var negativeTarget = getNegative(target, state.rng) 41 | loss += binaryLogistic(negativeTarget, state, false, lr, backprop) 42 | } 43 | return loss 44 | } 45 | 46 | private fun getNegative(target: Int, rng: Random): Int { 47 | var negative = -1 48 | do { 49 | negative = negatives[uniform(rng)] 50 | } while (target == negative) 51 | return negative 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/fasttext/loss/OneVsAlLoss.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext.loss 2 | 3 | import com.mayabot.nlp.blas.Matrix 4 | import com.mayabot.nlp.common.IntArrayList 5 | import com.mayabot.nlp.fasttext.Model 6 | 7 | class OneVsAlLoss(wo: Matrix) : BinaryLogisticLoss(wo) { 8 | 9 | override fun forward(targets: IntArrayList, t_: Int, state: Model.State, lr: Float, backprop: Boolean): Float { 10 | var loss = 0f 11 | val osz = state.output.length() 12 | for (i in 0 until osz) { 13 | val isMatch = targets.contains(i) 14 | loss += binaryLogistic(i, state, isMatch, lr, backprop) 15 | } 16 | return loss 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/fasttext/loss/SoftmaxLoss.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext.loss 2 | 3 | import com.mayabot.nlp.blas.Matrix 4 | import com.mayabot.nlp.common.IntArrayList 5 | import com.mayabot.nlp.fasttext.Model 6 | import java.lang.Math.max 7 | 8 | class SoftmaxLoss(wo: Matrix) : Loss(wo) { 9 | override fun computeOutput(state: Model.State) { 10 | val output = state.output 11 | 12 | // matrixMulVector(wo, state.hidden, output) 13 | output.mul(wo, state.hidden) 14 | 15 | var max = output[0] 16 | var z = 0.0f 17 | 18 | val osz = output.length() 19 | 20 | for (i in 0 until osz) { 21 | max = max(output[i], max) 22 | } 23 | 24 | for (i in 0 until osz) { 25 | output[i] = kotlin.math.exp((output[i] - max).toDouble()).toFloat() 26 | z += output[i] 27 | } 28 | // 归一化? 
29 | for (i in 0 until osz) { 30 | output[i] = output[i] / z 31 | } 32 | } 33 | 34 | override fun forward(targets: IntArrayList, targetIndex: Int, state: Model.State, lr: Float, backprop: Boolean): Float { 35 | 36 | computeOutput(state) 37 | 38 | val target = targets[targetIndex] 39 | if (backprop) { 40 | val osz = wo.row 41 | for (i in 0 until osz) { 42 | val label = if (i == target) 1.0f else 0.0f 43 | val alpha = lr * (label - state.output[i]) 44 | 45 | state.grad.addRow(wo, i, alpha.toDouble()) 46 | wo.addVectorToRow(state.hidden, i, alpha) 47 | } 48 | 49 | } 50 | 51 | val t = -log(state.output[target]) 52 | return t 53 | } 54 | 55 | 56 | } 57 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/fasttext/utils/ByteUtils.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext.utils;//package blas; 2 | 3 | // 4 | // 5 | public class ByteUtils { 6 | 7 | public static short byte2UInt(byte b) { 8 | return (short) (b & 0xFF); 9 | } 10 | 11 | public static byte short2Byte(short b) { 12 | return (byte) b; 13 | } 14 | 15 | 16 | public static final long readLITTLELong(byte[] readBuffer) { 17 | return (((long) readBuffer[7] << 56) + 18 | ((long) (readBuffer[6] & 255) << 48) + 19 | ((long) (readBuffer[5] & 255) << 40) + 20 | ((long) (readBuffer[4] & 255) << 32) + 21 | ((long) (readBuffer[3] & 255) << 24) + 22 | ((readBuffer[2] & 255) << 16) + 23 | ((readBuffer[1] & 255) << 8) + 24 | ((readBuffer[0] & 255) << 0)); 25 | } 26 | 27 | 28 | // public static void main(String[] args) { 29 | // for (int i = 0; i < 256; i++) { 30 | // byte b = short2Byte((short) i); 31 | // short x = byte2UInt(b); 32 | // System.out.loggerln(b + " = " + x); 33 | // } 34 | // } 35 | } 36 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/fasttext/utils/LogUtils.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext.utils 2 | 3 | private var doLog = true 4 | 5 | fun disableLog() { 6 | doLog = false 7 | } 8 | 9 | fun enableLog() { 10 | doLog = true 11 | } 12 | 13 | fun logger(s: Any) { 14 | if (doLog) print(s) 15 | } 16 | 17 | fun loggerln(s: Any) { 18 | if (doLog) println(s) 19 | } 20 | 21 | fun loggerln() { 22 | if (doLog) println() 23 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/lucene/BaseSynTokenFilter.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.lucene 2 | 3 | import org.apache.lucene.analysis.TokenFilter 4 | import org.apache.lucene.analysis.TokenStream 5 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute 6 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute 7 | import java.util.* 8 | 9 | /** 10 | * 基础类;对词进行扩展 11 | */ 12 | abstract class BaseSynTokenFilter(input: TokenStream) : TokenFilter(input) { 13 | 14 | /** 15 | * 当前词 16 | */ 17 | private val termAtt = addAttribute(CharTermAttribute::class.java) 18 | 19 | /** 20 | * Position Increment 21 | */ 22 | private val positionAttr = addAttribute( 23 | PositionIncrementAttribute::class.java 24 | ) 25 | 26 | private val buffer = LinkedList() 27 | 28 | override fun incrementToken(): Boolean { 29 | 30 | if (buffer.isNotEmpty()) { 31 | val ele = buffer.pollFirst() 32 | termAtt.setEmpty().append(ele) 33 | 
positionAttr.positionIncrement = 0 34 | return true 35 | } 36 | 37 | val hasNext = input.incrementToken() 38 | if (!hasNext) { 39 | return false 40 | } 41 | 42 | val item = termAtt as CharSequence 43 | 44 | val extended = extend(item) 45 | buffer.addAll(extended) 46 | 47 | // buffer 肯定不能是空 48 | termAtt.setEmpty().append(buffer.pollFirst()) 49 | 50 | return true 51 | } 52 | 53 | /** 54 | * 返回的list不能为空,至少要包括自己吧 55 | */ 56 | abstract fun extend(item: CharSequence): List 57 | 58 | override fun reset() { 59 | super.reset() 60 | this.buffer.clear() 61 | } 62 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/lucene/IterableMode.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.lucene; 2 | 3 | 4 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/lucene/MynlpAnalyzer.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.lucene; 2 | 3 | import com.mayabot.nlp.segment.LexerReader; 4 | import com.mayabot.nlp.segment.WordTermIterableMode; 5 | import org.apache.lucene.analysis.Analyzer; 6 | 7 | /** 8 | * @author jimichan 9 | */ 10 | public class MynlpAnalyzer extends Analyzer { 11 | 12 | private final LexerReader lexerReader; 13 | 14 | private WordTermIterableMode mode = WordTermIterableMode.TOP; 15 | 16 | 17 | public MynlpAnalyzer(LexerReader lexerReader) { 18 | this.lexerReader = lexerReader; 19 | } 20 | 21 | public MynlpAnalyzer(LexerReader lexerReader, WordTermIterableMode mode) { 22 | this.lexerReader = lexerReader; 23 | this.mode = mode; 24 | } 25 | 26 | 27 | @Override 28 | protected TokenStreamComponents createComponents(final String fieldName) { 29 | return new TokenStreamComponents(new MynlpTokenizer(lexerReader, mode)); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/nwd/TopCounter.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.nwd 2 | 3 | /** 4 | * @author jimichan 5 | * 统计元素重复最高 6 | */ 7 | class TopCounter(private val size: Int = 2000000, 8 | private var minCount: Int = 10) { 9 | 10 | private var verbose = false 11 | 12 | var data = HashMap(size) 13 | 14 | private var topList = listOf() 15 | 16 | private var lastMinCount = 2 17 | 18 | fun put(key: String) { 19 | 20 | val v = data[key] 21 | if (v == null) { 22 | data[key] = IntCount() 23 | } else { 24 | v.value++ 25 | } 26 | 27 | if (data.size >= size) { 28 | reduce() 29 | } 30 | } 31 | 32 | private fun reduce() { 33 | //1. 
remove count less min 34 | if (verbose) println("清洗前有${data.size}条数据") 35 | 36 | val target = size / 4 //压缩为1/4 37 | 38 | var max = 0 39 | 40 | for (min in lastMinCount until minCount) { 41 | if (data.size > target) { 42 | //data.removeAll { _, value -> value <= min } 43 | data = data.filterTo(HashMap()) { it.value.value > min } 44 | if (verbose) println("删除小于 ${min} 的数量,剩余${data.size}") 45 | max = min 46 | } 47 | } 48 | 49 | lastMinCount = max - 1 50 | if (lastMinCount <= 2) { 51 | lastMinCount = 2 52 | } 53 | 54 | //还超出一半 55 | if (data.size > size / 2) { 56 | minCount++ 57 | } 58 | 59 | if (verbose) println("-".repeat(20)) 60 | } 61 | 62 | fun clean() { 63 | data = data.filterTo(HashMap()) { it.value.value > minCount } 64 | } 65 | 66 | fun getListResult(): List { 67 | clean() 68 | val list = ArrayList(data.size) 69 | 70 | data.forEach { 71 | list += WordCount(it.key, it.value.value) 72 | } 73 | 74 | list.sort() 75 | return list 76 | } 77 | 78 | } 79 | 80 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/nwd/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 新词发现 (new word discovery) 3 | */ 4 | package com.mayabot.nlp.module.nwd; -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/pinyin/CustomPinyin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.pinyin; 2 | 3 | import java.util.Map; 4 | import java.util.TreeMap; 5 | 6 | /** 7 | * @author jimichan 8 | */ 9 | public class CustomPinyin { 10 | 11 | private Map map = new TreeMap<>(); 12 | 13 | public void put(String text, String pinyin) { 14 | map.put(text, pinyin); 15 | } 16 | 17 | public void remove(String text) { 18 | map.remove(text); 19 | } 20 | 21 | public Map getMap() { 22 | return map; 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/pinyin/model/PinyinHead.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | *

19 | * He Han 20 | * hankcs.cn@gmail.com 21 | * 2014/11/6 10:36 22 | * 23 | * 24 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 25 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 26 | * 27 | */ 28 | package com.mayabot.nlp.module.pinyin.model; 29 | 30 | /** 31 | * 拼音输入法头 32 | * 33 | * @author hankcs 34 | */ 35 | public enum PinyinHead { 36 | a, 37 | b, 38 | c, 39 | ch, 40 | d, 41 | e, 42 | f, 43 | g, 44 | h, 45 | j, 46 | k, 47 | l, 48 | m, 49 | n, 50 | o, 51 | p, 52 | q, 53 | r, 54 | s, 55 | sh, 56 | t, 57 | w, 58 | x, 59 | y, 60 | z, 61 | zh, 62 | none, 63 | } 64 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/pinyin/split/PinyinSplitApp.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.pinyin.split 2 | 3 | import com.mayabot.nlp.MynlpEnv 4 | import com.mayabot.nlp.common.injector.Singleton 5 | import com.mayabot.nlp.common.utils.CharNormUtils 6 | import com.mayabot.nlp.perceptron.PerceptronModel 7 | import com.mayabot.nlp.perceptron.PerceptronComputer 8 | import java.io.File 9 | 10 | @Singleton 11 | class PinyinSplitService(env: MynlpEnv) { 12 | 13 | val app = PinyinSplitApp.loadDefault(env) 14 | 15 | fun split(text: String) = app.decodeToWordList(text) 16 | } 17 | 18 | class PinyinSplitApp(val model: PerceptronModel) { 19 | 20 | private val logic = define.modelComputer(model) 21 | 22 | fun decodeToWordList(sentence: String, convert: Boolean = true): List { 23 | val result = ArrayList() 24 | val input = sentence.toCharArray() 25 | if (convert) { 26 | CharNormUtils.convert(input) 27 | } 28 | 29 | val output = logic.decodeModel(input) 30 | 31 | var p = 0 32 | for (i in 0 until output.size) { 33 | val f = output[i] 34 | if (f == "S" || f == "E") { 35 | result += sentence.substring(p, i + 1) 36 | p = i + 1 37 | } 38 | } 39 | if (p < sentence.length) { 40 | result += sentence.substring(p, sentence.length) 41 | } 42 | 43 | return result 44 | } 45 | 46 | companion object { 47 | 48 | const val modelPrefix = "pinyin-split-model" 49 | 50 | val define = PinyinSplitDefinition() 51 | 52 | fun load(file: File): PinyinSplitApp { 53 | return PinyinSplitApp(PerceptronModel.load(file)) 54 | } 55 | 56 | fun loadDefault(env: MynlpEnv): PinyinSplitApp { 57 | return PinyinSplitApp(PerceptronModel.loadFromNlpResource(modelPrefix, env)) 58 | } 59 | } 60 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/trans/Simplified2Traditional.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.trans 2 | 3 | import com.mayabot.nlp.common.injector.Singleton 4 | import java.util.* 5 | 6 | /** 7 | * 简体转繁体的词典 8 | * 9 | * @author jimichan 10 | */ 11 | @Singleton 12 | class Simplified2Traditional : BaseTransformDictionary() { 13 | 14 | override fun loadDictionary(): TreeMap { 15 | return loadFromResource(RS_NAME) 16 | } 17 | 18 | companion object { 19 | private val RS_NAME = "ts-dict/s2t.txt" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/trans/Traditional2Simplified.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.trans 2 | 3 | import 
com.mayabot.nlp.common.injector.Singleton 4 | import java.util.* 5 | 6 | /** 7 | * 繁体转简体的词典 8 | * 9 | * @author jimichan 10 | */ 11 | @Singleton 12 | class Traditional2Simplified : BaseTransformDictionary() { 13 | 14 | override fun loadDictionary(): TreeMap { 15 | return loadFromResource(RS_NAME) 16 | } 17 | 18 | companion object { 19 | 20 | private val RS_NAME = "ts-dict/t2s.txt" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/module/trans/TransformService.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.trans; 2 | 3 | import com.mayabot.nlp.Mynlp; 4 | 5 | /** 6 | * 繁简体转换 7 | * 8 | * @author jimichan 9 | */ 10 | @Deprecated 11 | public class TransformService { 12 | 13 | private static Mynlp mynlp = Mynlp.instance(); 14 | 15 | /** 16 | * 简体转繁体 17 | * 18 | * @param text 简体文字 19 | * @return 繁体文字 20 | */ 21 | @Deprecated 22 | public static String s2t(String text) { 23 | return mynlp.s2t(text); 24 | } 25 | 26 | /** 27 | * 繁体转简体 28 | * 29 | * @param text 繁体内容 30 | * @return 简体字符串 31 | */ 32 | @Deprecated 33 | public static String t2s(String text) { 34 | return mynlp.t2s(text); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/perceptron/EvaluateFunction.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.perceptron; 2 | 3 | import java.util.List; 4 | 5 | public interface EvaluateFunction { 6 | EvaluateResult evaluate(List sample ); 7 | } 8 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/perceptron/EvaluateResult.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.perceptron 2 | 3 | /** 4 | * 评估结果 5 | */ 6 | data class EvaluateResult( 7 | /** 8 | * 正确率 9 | */ 10 | val precision: Float, 11 | /** 12 | * 召回率 13 | */ 14 | val recall: Float 15 | ) { 16 | 17 | constructor(goldTotal: Int, predTotal: Int, correct: Int) : this( 18 | (correct * 100.0 / predTotal).toFloat(), 19 | (correct * 100.0 / goldTotal).toFloat() 20 | ) 21 | 22 | /** 23 | * F1综合指标 24 | */ 25 | val f1: Float 26 | get() = (2.0 * precision * recall / (precision + recall)).toFloat() 27 | 28 | override fun toString(): String { 29 | return "正确率(P) %.2f , 召回率(R) %.2f , F1 %.2f".format(precision, recall, f1) 30 | } 31 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/CharNormalize.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.segment; 18 | 19 | /** 20 | * 字符规范化接口 21 | *

22 | * 分词之前可以对char进行转换。一般完成大小写、半全角、归一化转换的需求. 23 | * 24 | * @author jimichan 25 | * @see com.mayabot.nlp.segment.common.DefaultCharNormalize 26 | */ 27 | public interface CharNormalize { 28 | 29 | /** 30 | * 对char数组里面的字符进行规范化操作,常见的有最小化和宽体字符处理 31 | * 32 | * @param text 33 | */ 34 | void normal(char[] text); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/KotlinLexers.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import java.io.File 5 | 6 | private val defaultLexer = Mynlp.instance().lexerBuilder() 7 | .bigram().withPersonName().build() 8 | 9 | fun String.segment(): List = defaultLexer.scan(this).toWordList() 10 | fun String.lexer(): Sentence = defaultLexer.scan(this) 11 | 12 | 13 | /** 14 | */ 15 | fun File.segment(outPath: String) { 16 | val lexerReader = defaultLexer.reader() 17 | 18 | val file = File(outPath) 19 | 20 | if (!file.parentFile.exists()) { 21 | file.parentFile.mkdirs() 22 | } 23 | 24 | val lines = inputStream().bufferedReader().lines() 25 | 26 | file.outputStream().bufferedWriter().use { writer -> 27 | lines.filter { it.isNotBlank() } 28 | .map { 29 | lexerReader.scan(it).toWordSequence() 30 | }.forEach { x -> 31 | writer.write(x.joinToString(separator = " ")) 32 | writer.newLine() 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/LexerBuilder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment; 17 | 18 | /** 19 | * Lexer构建器接口 20 | * 21 | * @author jimichan 22 | */ 23 | public interface LexerBuilder { 24 | 25 | /** 26 | * 构建一个Lexer 27 | * 28 | * @return Lexer 29 | */ 30 | Lexer build(); 31 | 32 | } 33 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/Lexers.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.mayabot.nlp.segment; 17 | 18 | import com.mayabot.nlp.Mynlp; 19 | 20 | /** 21 | * Lexer系列便捷方法。 22 | * 23 | * @author jimichan 24 | */ 25 | @Deprecated 26 | public class Lexers { 27 | 28 | /** 29 | * @return FluentLexerBuilder 30 | * @since 3.0.0 31 | */ 32 | public static FluentLexerBuilder builder() { 33 | return Mynlp.instance().lexerBuilder(); 34 | } 35 | 36 | public static Lexer core() { 37 | return coreBuilder() 38 | .withPos() 39 | .withPersonName().build(); 40 | } 41 | 42 | public static FluentLexerBuilder coreBuilder() { 43 | return builder().core(); 44 | } 45 | 46 | public static Lexer perceptron() { 47 | return perceptronBuilder().withPos().build(); 48 | } 49 | 50 | public static FluentLexerBuilder perceptronBuilder() { 51 | return builder().perceptron(); 52 | } 53 | 54 | } 55 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/SegmentComponent.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment; 17 | 18 | /** 19 | * 分词组件需要有个Name和设置是否启用的 20 | *

21 | * Name : 组件的名称 22 | * Enable : 是否启用 23 | * Order : 排序。 越小越靠前。 24 | * 25 | * @author jimichan 26 | */ 27 | public interface SegmentComponent extends Comparable { 28 | 29 | 30 | /** 31 | * return component name 32 | * 33 | * @return name 34 | */ 35 | String getName(); 36 | 37 | /** 38 | * 组件是否启用。默认返回true,启用 39 | * 40 | * @return enabled 41 | */ 42 | boolean isEnabled(); 43 | 44 | void setEnabled(boolean enable); 45 | 46 | void enable(); 47 | 48 | void disable(); 49 | 50 | int getOrder(); 51 | 52 | void setOrder(int order); 53 | 54 | @Override 55 | default int compareTo(SegmentComponent o) { 56 | return Integer.compare(this.getOrder(), o.getOrder()); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/SegmentModule.kt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment 17 | 18 | import com.mayabot.nlp.MynlpEnv 19 | import com.mayabot.nlp.common.injector.AbstractModule 20 | 21 | class SegmentModule(private val env: MynlpEnv) : AbstractModule() { 22 | 23 | override fun configure() { 24 | // if (env.get(MynlpConfigs.server).isNotBlank()) { 25 | // bind(CoreDictPatch::class.java).toClass(NlpCoreDictPatchClient::class.java) 26 | // } 27 | } 28 | 29 | } 30 | 31 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/WordAndNature.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment; 17 | 18 | /** 19 | * 词和词性访问接口 20 | */ 21 | public interface WordAndNature { 22 | 23 | String getWord(); 24 | 25 | String getNatureName(); 26 | } 27 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/WordSplitAlgorithm.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.segment; 18 | 19 | import com.mayabot.nlp.segment.lexer.bigram.CoreDictionarySplitAlgorithm; 20 | import com.mayabot.nlp.segment.lexer.perceptron.PerceptronSegmentAlgorithm; 21 | import com.mayabot.nlp.segment.plugins.atom.AtomSplitAlgorithm; 22 | import com.mayabot.nlp.segment.wordnet.Wordnet; 23 | import org.jetbrains.annotations.NotNull; 24 | 25 | /** 26 | * 分词算法。 27 | * 分词逻辑基本上是面向字符的处理程序。 28 | * 分词算法的作用是对文本分析后,产生一种或多种分词路径,结果保存在Wordnet数据结构里面。 29 | *

30 | * 1. 基于词典 31 | * 2. 基于字分割 32 | * 3. 基于规则 33 | *

34 | * 在一个具体的分词器中,有可能综合同时使用多个分词算法。 35 | * 36 | * @author jimichan 37 | * @see AtomSplitAlgorithm 38 | * @see PerceptronSegmentAlgorithm 39 | * @see CoreDictionarySplitAlgorithm 40 | * @see com.mayabot.nlp.segment.plugins.personname.PersonNameAlgorithm 41 | */ 42 | public interface WordSplitAlgorithm extends SegmentComponent { 43 | 44 | /** 45 | * 填充Wordnet实例 46 | * 47 | * @param wordnet 48 | */ 49 | void fill(@NotNull Wordnet wordnet); 50 | 51 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/WordpathProcessor.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.segment; 18 | 19 | import com.mayabot.nlp.segment.wordnet.Wordpath; 20 | 21 | /** 22 | * Wordpath处理器 23 | * 24 | * @author jimichan 25 | */ 26 | public interface WordpathProcessor extends SegmentComponent { 27 | 28 | /** 29 | * 对传入的Wordpath进行处理,然后返回一个旧的或者新的对象 30 | * 31 | * @param wordPath 32 | * @return 一般对传入的wordPath修改,返回对象本身 33 | */ 34 | Wordpath process(Wordpath wordPath); 35 | 36 | } 37 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/common/BaseSegmentComponent.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | package com.mayabot.nlp.segment.common; 17 | 18 | import com.mayabot.nlp.segment.SegmentComponent; 19 | 20 | /** 21 | * @author jimichan 22 | */ 23 | public abstract class BaseSegmentComponent implements SegmentComponent { 24 | 25 | private boolean enabled = true; 26 | 27 | public static final int LEVEL1 = -1000; 28 | 29 | public static final int LEVEL2 = -500; 30 | 31 | public static final int LEVEL3 = 0; 32 | 33 | public static final int LEVEL4 = 500; 34 | 35 | public static final int LEVEL5 = 1000; 36 | 37 | private int order = LEVEL3; 38 | 39 | public BaseSegmentComponent(int order) { 40 | this.order = order; 41 | } 42 | 43 | @Override 44 | public String getName() { 45 | return this.getClass().getSimpleName(); 46 | } 47 | 48 | @Override 49 | public boolean isEnabled() { 50 | return enabled; 51 | } 52 | 53 | @Override 54 | public void setEnabled(boolean enable) { 55 | this.enabled = enable; 56 | } 57 | 58 | @Override 59 | public void enable() { 60 | this.enabled = true; 61 | } 62 | 63 | @Override 64 | public void disable() { 65 | this.enabled = false; 66 | } 67 | 68 | @Override 69 | public int getOrder() { 70 | return order; 71 | } 72 | 73 | @Override 74 | public void setOrder(int order) { 75 | this.order = order; 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/common/DefaultCharNormalize.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.common; 2 | 3 | import com.mayabot.nlp.common.utils.CharNormUtils; 4 | import com.mayabot.nlp.segment.CharNormalize; 5 | 6 | /** 7 | * 大小转小写。 8 | * 全角转半角,其他字符归一化。 9 | * 10 | * @author jimichan 11 | */ 12 | public class DefaultCharNormalize implements CharNormalize { 13 | @Override 14 | public void normal(char[] text) { 15 | CharNormUtils.convert(text); 16 | } 17 | 18 | public static final DefaultCharNormalize instance = new DefaultCharNormalize(); 19 | } 20 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/common/String2.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.common; 2 | 3 | import org.jetbrains.annotations.NotNull; 4 | 5 | final public class String2 implements CharSequence { 6 | 7 | @NotNull 8 | private char[] chars; 9 | 10 | private int start = 0; 11 | private int end = 0; 12 | 13 | private int len = 0; 14 | 15 | public String2(@NotNull char[] chars) { 16 | this.chars = chars; 17 | start = 0; 18 | this.end = chars.length; 19 | len = chars.length; 20 | } 21 | 22 | public String2(@NotNull char[] chars, int start, int end) { 23 | this.chars = chars; 24 | this.start = start; 25 | this.end = end; 26 | this.len = end - start; 27 | } 28 | 29 | public void setStartEnd(int start, int end) { 30 | this.start = start; 31 | this.end = end; 32 | this.len = end - start; 33 | } 34 | 35 | public int getStart() { 36 | return start; 37 | } 38 | 39 | @Override 40 | public int length() { 41 | return len; 42 | } 43 | 44 | @Override 45 | public char charAt(int index) { 46 | return chars[index + start]; 47 | } 48 | 49 | @Override 50 | public CharSequence subSequence(int start, int end) { 51 | return new String2(chars, this.start + start, this.start + end); 52 | } 53 | 54 | @Override 55 | public String toString() { 56 | return new String(chars, start, len); 57 | } 58 | 59 | } 60 | 
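The component contracts collected above (SegmentComponent for name/order/enable bookkeeping, WordSplitAlgorithm for filling the Wordnet with candidate words, WordpathProcessor for adjusting the selected Wordpath, and BaseSegmentComponent as the shared base class) are easiest to read against a tiny concrete case. The sketch below is an editor's illustration and not a file from the repository: it assumes the Wordnet.getCharArray() / Wordnet.put(offset, vertex) calls and the Vertex(length) constructor used by the bundled algorithms, and the class name DigitRunSplitAlgorithm is invented. It offers every run of ASCII digits as a candidate word and leaves path selection to the configured best-path computer.

import com.mayabot.nlp.segment.WordSplitAlgorithm;
import com.mayabot.nlp.segment.common.BaseSegmentComponent;
import com.mayabot.nlp.segment.wordnet.Vertex;
import com.mayabot.nlp.segment.wordnet.Wordnet;

/**
 * Editor's sketch: a minimal WordSplitAlgorithm that proposes each run of
 * ASCII digits as one candidate word. Not part of the mynlp code base.
 */
public class DigitRunSplitAlgorithm extends BaseSegmentComponent implements WordSplitAlgorithm {

    public DigitRunSplitAlgorithm() {
        // LEVEL3 is the default priority constant defined in BaseSegmentComponent
        super(LEVEL3);
    }

    @Override
    public void fill(Wordnet wordnet) {
        char[] text = wordnet.getCharArray();
        int i = 0;
        while (i < text.length) {
            if (Character.isDigit(text[i])) {
                int start = i;
                while (i < text.length && Character.isDigit(text[i])) {
                    i++;
                }
                // offer the digit run [start, i) as one vertex in the word graph
                wordnet.put(start, new Vertex(i - start));
            } else {
                i++;
            }
        }
    }
}

Because BaseSegmentComponent already supplies the name/enabled/order plumbing, a split algorithm only has to implement fill(); for real input this ground is covered by the algorithms referenced in WordSplitAlgorithm's Javadoc (AtomSplitAlgorithm, CoreDictionarySplitAlgorithm, PerceptronSegmentAlgorithm), so the class above exists purely to show the shape of the contract.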
-------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/common/VertexHelper.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.segment.common; 18 | 19 | import com.mayabot.nlp.segment.Nature; 20 | import com.mayabot.nlp.segment.wordnet.Vertex; 21 | 22 | /** 23 | * 顶点管理器 24 | * 25 | * @author jimichan 26 | */ 27 | public abstract class VertexHelper { 28 | 29 | private static final int total = 25146057 / 10; 30 | 31 | /** 32 | * 生成线程安全的起始节点 33 | * begin 34 | * 35 | * @return Begin Vertex 36 | */ 37 | public static Vertex newBegin() { 38 | Vertex v = new Vertex(1); 39 | v.setAbsWordNatureAndFreq(Nature.newWord, total); 40 | return v; 41 | } 42 | 43 | /** 44 | * @return End Vertex 45 | */ 46 | public static Vertex newEnd() { 47 | Vertex v = new Vertex(0); 48 | v.setAbsWordNatureAndFreq(Nature.end, total); 49 | return v; 50 | } 51 | 52 | 53 | } 54 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/bigram/BiGramTableDictionary.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.bigram; 2 | 3 | 4 | import com.mayabot.nlp.common.injector.ImplementedBy; 5 | 6 | /** 7 | * 查询词ID,两个接续ID中间的共现频率。 8 | * 9 | * @author jimichan 10 | */ 11 | @ImplementedBy(value = BiGramTableDictionaryImpl.class) 12 | public interface BiGramTableDictionary { 13 | int getBiFrequency(int idA, int idB); 14 | 15 | public void refresh() throws Exception; 16 | } 17 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/bigram/BiGramTableReader.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.bigram 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.MynlpEnv 5 | import com.mayabot.nlp.common.Guava.split 6 | 7 | class BiGramTableReader(private val env: MynlpEnv) { 8 | constructor(mynlp: Mynlp) : this(mynlp.env) 9 | 10 | fun read(blocker: (String, String, Int) -> Unit) { 11 | 12 | val dictResource = env.loadResource(BiGramTableDictionaryImpl.path) 13 | ?: throw RuntimeException("Not Found dict resource " + BiGramTableDictionaryImpl.path) 14 | 15 | var firstWord: String? = null 16 | 17 | dictResource.inputStream().bufferedReader(Charsets.UTF_8).useLines { lines -> 18 | lines.forEach { line -> 19 | if (line.startsWith("\t")) { 20 | val firstWh = line.indexOf(" ") 21 | val numString = line.substring(1, firstWh) 22 | val num = numString.toInt() 23 | val words = split(line.substring(firstWh + 1), " ") 24 | val wordA = firstWord!! 
25 | 26 | for (wordB in words) { 27 | blocker(wordA, wordB, num) 28 | } 29 | } else { 30 | firstWord = line 31 | } 32 | } 33 | } 34 | 35 | } 36 | } 37 | 38 | fun readCoreBigramTable(blocker: (String, String, Int) -> Unit) { 39 | BiGramTableReader(Mynlp.instance()).read(blocker) 40 | } 41 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/bigram/CoreDictPatch.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.bigram 2 | 3 | import com.mayabot.nlp.common.injector.Singleton 4 | import org.jetbrains.annotations.Nullable 5 | 6 | interface CoreDictPatch { 7 | fun appendDict(): List> 8 | fun deleteDict(): List 9 | fun appendBiGram(): List 10 | fun dictVersion(): String 11 | fun biGramVersion(): String 12 | } 13 | 14 | data class BiGram( 15 | val wordA: String, val wordB: String, val count: Int 16 | ) 17 | 18 | @Singleton 19 | class CoreDictPathWrap { 20 | 21 | @Nullable 22 | val coreDictPatch: CoreDictPatch? = null 23 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/bigram/CoreDictionary.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.bigram; 2 | 3 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieStringIntMap.DATMapMatcherInt; 4 | import com.mayabot.nlp.common.injector.ImplementedBy; 5 | import org.jetbrains.annotations.NotNull; 6 | 7 | /** 8 | * @author jimichan 9 | */ 10 | @ImplementedBy(CoreDictionaryImpl.class) 11 | public interface CoreDictionary { 12 | 13 | /** 14 | * 匹配算法 15 | * 16 | * @param text 17 | * @param offset 18 | * @return DATMapMatcherInt 19 | */ 20 | DATMapMatcherInt match(char[] text, int offset); 21 | 22 | /** 23 | * 词频总量 24 | * 25 | * @return int 词频总量 26 | */ 27 | int totalFreq(); 28 | 29 | void refresh() throws Exception; 30 | 31 | int wordId(char[] chars, int pos, int len); 32 | 33 | int wordId(CharSequence word); 34 | 35 | public int wordFreq(int wordID); 36 | 37 | boolean contains(@NotNull String word); 38 | 39 | int getWordID(String word); 40 | 41 | int size(); 42 | } 43 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/bigram/CoreDictionaryReader.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.bigram 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.MynlpEnv 5 | import com.mayabot.nlp.common.Guava.split 6 | 7 | class CoreDictionaryReader(val env: MynlpEnv) { 8 | 9 | constructor(mynlp: Mynlp) : this(mynlp.env) 10 | 11 | var totalFreq = 0 12 | 13 | fun read(blocker: (String, Int) -> Unit) { 14 | 15 | val dictResource = env.loadResource(CoreDictionaryImpl.path) 16 | ?: throw RuntimeException("Not Found dict resource " + CoreDictionaryImpl.path) 17 | 18 | dictResource.inputStream().bufferedReader(Charsets.UTF_8).useLines { lines -> 19 | lines.forEach { line -> 20 | val param = split(line, " ") 21 | if (param.size == 2) { 22 | val count = Integer.valueOf(param[1]) 23 | blocker(param[0], count) 24 | totalFreq += count 25 | } 26 | } 27 | } 28 | } 29 | } 30 | 31 | fun readCoreDict(blocker: (String, Int) -> Unit) { 32 | CoreDictionaryReader(Mynlp.instance()).read(blocker) 33 | } -------------------------------------------------------------------------------- 
/mynlp/src/main/java/com/mayabot/nlp/segment/lexer/bigram/CoreDictionarySplitAlgorithm.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.bigram; 2 | 3 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieStringIntMap.DATMapMatcherInt; 4 | import com.mayabot.nlp.segment.WordSplitAlgorithm; 5 | import com.mayabot.nlp.segment.common.BaseSegmentComponent; 6 | import com.mayabot.nlp.segment.wordnet.Vertex; 7 | import com.mayabot.nlp.segment.wordnet.Wordnet; 8 | 9 | /** 10 | * 基于核心词典的基础切词器 11 | * 12 | * @author jimichan 13 | */ 14 | public class CoreDictionarySplitAlgorithm extends BaseSegmentComponent implements WordSplitAlgorithm { 15 | 16 | private CoreDictionary coreDictionary; 17 | 18 | 19 | public CoreDictionarySplitAlgorithm(CoreDictionary coreDictionary) { 20 | super(LEVEL1); 21 | this.coreDictionary = coreDictionary; 22 | } 23 | 24 | @Override 25 | public void fill(Wordnet wordnet) { 26 | char[] text = wordnet.getCharArray(); 27 | 28 | // 核心词典查询 29 | DATMapMatcherInt searcher = coreDictionary.match(text, 0); 30 | 31 | while (searcher.next()) { 32 | int offset = searcher.getBegin(); 33 | int length = searcher.getLength(); 34 | int wordId = searcher.getIndex(); 35 | 36 | Vertex v = new Vertex(length, wordId, searcher.getValue()); 37 | 38 | wordnet.put(offset, v); 39 | } 40 | } 41 | 42 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/bigram/HmmLexerPlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.bigram; 2 | 3 | import com.mayabot.nlp.Mynlp; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 5 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 6 | import com.mayabot.nlp.segment.plugins.atom.AtomSplitAlgorithm; 7 | 8 | /** 9 | * @author jimichan 10 | */ 11 | public class HmmLexerPlugin implements PipelineLexerPlugin { 12 | 13 | private CoreDictionary dictionaryMatcher; 14 | 15 | public HmmLexerPlugin(CoreDictionary dictionaryMatcher) { 16 | this.dictionaryMatcher = dictionaryMatcher; 17 | } 18 | 19 | public HmmLexerPlugin(Mynlp mynlp) { 20 | this.dictionaryMatcher = mynlp.getInstance(CoreDictionary.class); 21 | } 22 | 23 | @Override 24 | public void init(PipelineLexerBuilder builder) { 25 | 26 | builder.setBestPathComputer(ViterbiBestPathAlgorithm.class); 27 | 28 | 29 | builder.addWordSplitAlgorithm(new CoreDictionarySplitAlgorithm( 30 | dictionaryMatcher 31 | )); 32 | 33 | builder.addWordSplitAlgorithm(AtomSplitAlgorithm.class); 34 | 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/crf/tokenizer/CrfTokenizerBuilder.java: -------------------------------------------------------------------------------- 1 | //package com.mayabot.nlp.segment.crf.tokenizer; 2 | // 3 | //import com.mayabot.nlp.segment.PipelineTokenizerBuilder; 4 | //import com.mayabot.nlp.segment.crf.tokenizer.CrfBaseSegmentInitializer; 5 | //import com.mayabot.nlp.segment.tokenizer.BaseTokenizerBuilder; 6 | //import com.mayabot.nlp.segment.tokenizer.bestpath.ViterbiBestPathAlgorithm; 7 | //import SentenceCollector; 8 | //import com.mayabot.nlp.segment.tokenizer.xprocessor.CommonSplitAlgorithm; 9 | //import com.mayabot.nlp.segment.tokenizer.xprocessor.CommonRuleWordpathProcessor; 10 | //import 
com.mayabot.nlp.segment.tokenizer.xprocessor.CustomDictionaryProcessor; 11 | //import com.mayabot.nlp.segment.tokenizer.xprocessor.TimeSplitAlgorithm; 12 | // 13 | //public class CrfTokenizerBuilder extends BaseTokenizerBuilder { 14 | // 15 | // 16 | // @Override 17 | // protected void setUp(PipelineTokenizerBuilder builder) { 18 | // 19 | // //wordnet初始化填充 20 | // builder.addWordSplitAlgorithm( 21 | // CrfBaseSegmentInitializer.class, 22 | // CommonSplitAlgorithm.class, 23 | // TimeSplitAlgorithm.class 24 | // ); 25 | // 26 | // //最优路径算法w 27 | // builder.setBestPathComputer(ViterbiBestPathAlgorithm.class); 28 | // 29 | // 30 | // // Pipeline处理器 31 | // builder.addProcessor(CustomDictionaryProcessor.class); 32 | // builder.addProcessor(CommonRuleWordpathProcessor.class); 33 | // 34 | // 35 | // builder.setTermCollector(new SentenceCollector()); 36 | // 37 | // 38 | // } 39 | // 40 | //} 41 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/perceptron/PerceptronSegmentPatch.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.perceptron 2 | 3 | import com.mayabot.nlp.MynlpEnv 4 | import com.mayabot.nlp.common.injector.Singleton 5 | 6 | @Singleton 7 | class PerceptronSegmentPatch 8 | 9 | constructor(val mynlpEnv: MynlpEnv) { 10 | 11 | val examples = ArrayList() 12 | 13 | init { 14 | examples += loadExample("patch/cws-default.txt") 15 | examples += loadExample("patch/cws.txt") 16 | } 17 | 18 | fun addExample(line: String) { 19 | examples += line 20 | } 21 | 22 | fun removeExample(line: String) { 23 | examples.remove(line) 24 | } 25 | 26 | fun addResources(rsName: String) { 27 | examples += loadExample(rsName) 28 | } 29 | 30 | private fun loadExample(rsName: String): List { 31 | val resource = mynlpEnv.tryLoadResource(rsName, Charsets.UTF_8) 32 | if (resource != null) { 33 | return resource.inputStream().bufferedReader().readLines() 34 | .map { it.trim() }.filter { 35 | it.isNotBlank() && !it.startsWith("#") 36 | } 37 | } 38 | return listOf() 39 | } 40 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/perceptron/PerceptronSegmentPlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.perceptron; 2 | 3 | import com.mayabot.nlp.segment.lexer.bigram.ViterbiBestPathAlgorithm; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 5 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 6 | import com.mayabot.nlp.segment.plugins.atom.AtomSplitAlgorithm; 7 | 8 | public class PerceptronSegmentPlugin implements PipelineLexerPlugin { 9 | 10 | @Override 11 | public void init(PipelineLexerBuilder builder) { 12 | 13 | //切词算法 14 | builder.addWordSplitAlgorithm(PerceptronSegmentAlgorithm.class); 15 | 16 | 17 | builder.addWordSplitAlgorithm(AtomSplitAlgorithm.class); 18 | 19 | 20 | //最优路径算法 21 | builder.setBestPathComputer(ViterbiBestPathAlgorithm.class); 22 | 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/perceptron/PerceptronsSegmentService.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.perceptron; 2 | 3 | import com.mayabot.nlp.MynlpConfigs; 4 | import com.mayabot.nlp.MynlpEnv; 5 | 
import com.mayabot.nlp.common.injector.Singleton; 6 | import com.mayabot.nlp.common.logging.InternalLogger; 7 | import com.mayabot.nlp.common.logging.InternalLoggerFactory; 8 | import com.mayabot.nlp.common.resources.NlpResource; 9 | import com.mayabot.nlp.segment.plugins.ner.PerceptronNerService; 10 | 11 | import java.util.List; 12 | 13 | /** 14 | * 感知机分词服务 15 | */ 16 | @Singleton 17 | public class PerceptronsSegmentService { 18 | 19 | private PerceptronSegment ps; 20 | 21 | 22 | static InternalLogger logger = InternalLoggerFactory.getInstance(PerceptronNerService.class); 23 | 24 | public PerceptronsSegmentService(MynlpEnv mynlp, 25 | PerceptronSegmentPatch perceptronSegmentPatch) throws Exception { 26 | 27 | //cws-model or cws-hanlp-model 28 | String modelName = mynlp.get(MynlpConfigs.cwsModelItem); 29 | 30 | long t1 = System.currentTimeMillis(); 31 | NlpResource parameterResource = mynlp.loadResource(modelName + "/parameter.bin"); 32 | NlpResource featureResource = mynlp.loadResource(modelName + "/feature.dat"); 33 | 34 | ps = PerceptronSegment.load( 35 | parameterResource.inputStream(), 36 | featureResource.inputStream()); 37 | 38 | for (String example : perceptronSegmentPatch.getExamples()) { 39 | ps.learn(example); 40 | } 41 | 42 | long t2 = System.currentTimeMillis(); 43 | 44 | logger.info("PerceptronCwsService init use " + (t2 - t1) + " ms"); 45 | } 46 | 47 | public List splitWord(String sentence) { 48 | return ps.decode(sentence); 49 | } 50 | 51 | /** 52 | * 词使用空格分开。 53 | * @param example 54 | */ 55 | public void learn(String example){ 56 | ps.learn(example); 57 | } 58 | 59 | public PerceptronSegment getPerceptron() { 60 | return ps; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/lexer/perceptron/inner/Train.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.perceptron.inner 2 | 3 | import com.mayabot.nlp.perceptron.PerceptronComputer 4 | import com.mayabot.nlp.segment.lexer.perceptron.PerceptronSegmentDefinition 5 | import java.io.File 6 | 7 | /** 8 | * 参数 9 | * Iter 150 10 | * thread 2 11 | */ 12 | fun main() { 13 | 14 | val runner = PerceptronComputer(PerceptronSegmentDefinition()) 15 | 16 | // val trainFile = File("data.work/corpus.segment/backoff2005/msr_training.txt") 17 | // val evaluateFile = File("data.work/corpus.segment/backoff2005/msr_test_gold.txt") 18 | // 19 | val trainFile = File("data.work/cws/pku/199801.txt") 20 | val evaluateFile = File("data.work/cws/pku/199802.txt") 21 | 22 | var model = runner.train( 23 | trainFile, 24 | evaluateFile, 25 | 10, 8) 26 | 27 | println("compress") 28 | model = model.compress(0.2, 1e-3) 29 | 30 | println("After compress ...") 31 | val evlResult = runner.evaluateModel(model,evaluateFile) 32 | println(evlResult) 33 | 34 | model.save(File("data.work/cws-model")) 35 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/pipeline/PipelineLexerBuilderKts.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.pipeline 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.segment.FluentLexerBuilder 5 | import com.mayabot.nlp.segment.Lexer 6 | 7 | fun lexerBuilder(blocker: FluentLexerBuilder.() -> Unit): Lexer { 8 | val builder = FluentLexerBuilder(Mynlp.instance()) 9 | builder.blocker() 10 | return builder.build() 11 | } 
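Following the fluent and Kotlin DSL builders above, a short end-to-end usage sketch ties the pieces together. This is an editor's illustration rather than a repository file: the builder chain is assembled from calls that appear elsewhere in this dump (Mynlp.instance().lexerBuilder(), bigram(), withPersonName(), withPos(), build(), scan()); the exact combination and the sample sentence are illustrative only.

import com.mayabot.nlp.Mynlp;
import com.mayabot.nlp.segment.Lexer;

/**
 * Editor's sketch: building a bigram-based Lexer through the fluent builder
 * and scanning one sentence. Not part of the mynlp code base.
 */
public class LexerQuickStart {

    public static void main(String[] args) {
        Lexer lexer = Mynlp.instance()
                .lexerBuilder()
                .bigram()            // dictionary + bigram segmentation pipeline
                .withPersonName()    // enable person-name recognition
                .withPos()           // attach part-of-speech tags
                .build();

        // scan(...) returns a Sentence; printing it shows the segmented words
        System.out.println(lexer.scan("mynlp是一个开源的中文NLP工具包"));
    }
}

The Kotlin helpers in KotlinLexers.kt above build essentially this default lexer once and expose it through String.segment() and String.lexer(), while plugins such as HmmLexerPlugin and PerceptronSegmentPlugin show how the underlying pipeline obtains its split algorithms and best-path computer.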
-------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/pipeline/PipelineLexerPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment.pipeline; 17 | 18 | /** 19 | * @author jimichan 20 | */ 21 | public interface PipelineLexerPlugin { 22 | void init(PipelineLexerBuilder builder); 23 | } 24 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/bestpath/LongpathBestPathAlgorithm.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.bestpath; 2 | 3 | import com.mayabot.nlp.segment.wordnet.BestPathAlgorithm; 4 | import com.mayabot.nlp.segment.wordnet.VertexRow; 5 | import com.mayabot.nlp.segment.wordnet.Wordnet; 6 | import com.mayabot.nlp.segment.wordnet.Wordpath; 7 | 8 | /** 9 | * 前向最大路径算法 10 | * 11 | * @author jimichan 12 | */ 13 | public class LongpathBestPathAlgorithm implements BestPathAlgorithm { 14 | 15 | @Override 16 | public Wordpath select(Wordnet wordnet) { 17 | //从后到前,获得完整的路径 18 | final Wordpath wordPath = new Wordpath(wordnet); 19 | 20 | int point = 0; 21 | final int len = wordnet.length() - 1; 22 | 23 | while (point <= len) { 24 | 25 | VertexRow row = wordnet.row(point); 26 | 27 | int wordLen = row.lastLen(); 28 | if (wordLen == 0) { 29 | wordLen = 1; 30 | } 31 | 32 | wordPath.combine(point, wordLen); 33 | 34 | point += wordLen; 35 | } 36 | 37 | // 最后一个point必定指向start节点 38 | if (point == len) { 39 | throw new IllegalStateException("非完整路径,有可能wordnet初始化的时候就路径不完整"); 40 | } 41 | // Preconditions.checkState(point != len,"非完整路径,有可能wordnet初始化的时候就路径不完整" ); 42 | return wordPath; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/CoreDictSubwordInfoSetup.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.collector; 2 | 3 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieStringIntMap; 4 | import com.mayabot.nlp.segment.lexer.bigram.CoreDictionary; 5 | import com.mayabot.nlp.segment.wordnet.Vertex; 6 | import com.mayabot.nlp.segment.wordnet.Wordnet; 7 | import com.mayabot.nlp.segment.wordnet.Wordpath; 8 | import org.jetbrains.annotations.NotNull; 9 | 10 | /** 11 | * 基于词典的子词补全. 
12 | * 一般在感知机分词器,需要补全 13 | */ 14 | public class CoreDictSubwordInfoSetup implements SubwordInfoSetup { 15 | 16 | private CoreDictionary dictionary; 17 | 18 | public CoreDictSubwordInfoSetup(CoreDictionary dictionary) { 19 | this.dictionary = dictionary; 20 | } 21 | 22 | @Override 23 | public void fill(@NotNull Wordnet wordnet, @NotNull Wordpath wordPath) { 24 | char[] text = wordnet.getCharArray(); 25 | // 核心词典查询 26 | DoubleArrayTrieStringIntMap.DATMapMatcherInt searcher = dictionary.match(text, 0); 27 | 28 | while (searcher.next()) { 29 | int offset = searcher.getBegin(); 30 | int length = searcher.getLength(); 31 | int wordId = searcher.getIndex(); 32 | 33 | Vertex v = new Vertex(length, wordId, searcher.getValue()); 34 | if(!wordnet.row(offset).contains(length)){ 35 | wordnet.put(offset, v); 36 | } 37 | } 38 | } 39 | 40 | 41 | } 42 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/CustomDictSubwordInfoSetup.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.collector; 2 | 3 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieStringIntMap; 4 | import com.mayabot.nlp.segment.plugins.customwords.CustomDictionary; 5 | import com.mayabot.nlp.segment.wordnet.Vertex; 6 | import com.mayabot.nlp.segment.wordnet.Wordnet; 7 | import com.mayabot.nlp.segment.wordnet.Wordpath; 8 | import org.jetbrains.annotations.NotNull; 9 | 10 | /** 11 | * 基于词典的子词补全. 12 | * 一般在感知机分词器,需要补全 13 | */ 14 | public class CustomDictSubwordInfoSetup implements SubwordInfoSetup { 15 | 16 | private CustomDictionary dictionary; 17 | 18 | public CustomDictSubwordInfoSetup(CustomDictionary dictionary) { 19 | this.dictionary = dictionary; 20 | } 21 | 22 | @Override 23 | public void fill(@NotNull Wordnet wordnet, @NotNull Wordpath wordPath) { 24 | DoubleArrayTrieStringIntMap trie = dictionary.getTrie(); 25 | if (trie == null) { 26 | return; 27 | } 28 | char[] text = wordnet.getCharArray(); 29 | DoubleArrayTrieStringIntMap.DATMapMatcherInt searcher = trie.match(text, 0); 30 | 31 | while (searcher.next()) { 32 | int offset = searcher.getBegin(); 33 | int length = searcher.getLength(); 34 | 35 | Vertex v = new Vertex(length, -1, searcher.getValue()); 36 | if (!wordnet.row(offset).contains(length)) { 37 | wordnet.put(offset, v); 38 | } 39 | } 40 | } 41 | 42 | 43 | } 44 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/SentenceCollector.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.collector 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.common.utils.StringUtils 5 | import com.mayabot.nlp.segment.WordTerm 6 | import com.mayabot.nlp.segment.wordnet.Wordnet 7 | import com.mayabot.nlp.segment.wordnet.Wordpath 8 | import java.util.function.Consumer 9 | 10 | /** 11 | * WordTermCollector的默认实现,从各种数据结构中收集和生成词序列 12 | * 13 | * @author jimichan 14 | */ 15 | class SentenceCollector( 16 | private val mynlp: Mynlp, 17 | private val subwordComputer: List = emptyList(), 18 | private val setupList: List = emptyList() 19 | ) : WordTermCollector { 20 | 21 | override fun collect(txtChars: CharArray?, wordnet: Wordnet, wordPath: Wordpath, consumer: Consumer) { 22 | 23 | val vertexIterator = wordPath.iteratorVertex() 24 | 25 | setupList.forEach { 26 | it.fill(wordnet, wordPath) 
27 | } 28 | 29 | while (vertexIterator.hasNext()) { 30 | val vertex = vertexIterator.next() 31 | 32 | val word = if (txtChars == null) { 33 | vertex.realWord() 34 | } else { 35 | String(chars = txtChars, vertex.offset(), vertex.length) 36 | } 37 | 38 | val term = WordTerm(word, vertex.nature, vertex.offset()) 39 | 40 | if (StringUtils.isWhiteSpace(term.word)) { 41 | continue 42 | } 43 | 44 | // 如果运行成功,后面的就不运行了 45 | subwordComputer.forEach { 46 | if (it.run(term, wordnet, wordPath)) { 47 | return@forEach 48 | } 49 | } 50 | 51 | consumer.accept(term) 52 | } 53 | } 54 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/SubwordComputer.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.collector 2 | 3 | import com.mayabot.nlp.segment.WordTerm 4 | import com.mayabot.nlp.segment.wordnet.Wordnet 5 | import com.mayabot.nlp.segment.wordnet.Wordpath 6 | import org.jetbrains.annotations.NotNull 7 | 8 | /** 9 | * 子词切分计算器接口 10 | * 11 | * 从wordnet中计算出子词的所需要的基本信息,计算结果保存在WordTerm的subword字段里面 12 | * @author jimichan 13 | */ 14 | interface SubwordComputer { 15 | 16 | /** 17 | * [term] 一个待切分的子词 18 | * [wordnet] 当前 19 | * @return 如果处理了当前term返回true,没有返回false 20 | */ 21 | fun run( 22 | @NotNull term: WordTerm, 23 | @NotNull wordnet: Wordnet, 24 | @NotNull wordPath: Wordpath 25 | ): Boolean 26 | 27 | } 28 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/SubwordInfoSetup.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.collector 2 | 3 | import com.mayabot.nlp.segment.wordnet.Wordnet 4 | import com.mayabot.nlp.segment.wordnet.Wordpath 5 | 6 | /** 7 | * 感知机、crf等分词,wordnet中没有子词信息。那么通过这个接口在收集结果之前,通过词典新增子词信息。 8 | * @author jimichan 9 | */ 10 | interface SubwordInfoSetup { 11 | fun fill(wordnet: Wordnet, wordPath: Wordpath) 12 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/collector/WordTermCollector.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.collector 2 | 3 | import com.mayabot.nlp.segment.WordTerm 4 | import com.mayabot.nlp.segment.wordnet.Wordnet 5 | import com.mayabot.nlp.segment.wordnet.Wordpath 6 | import java.util.function.Consumer 7 | 8 | /** 9 | * Mynlp WordTerm 收集器 10 | * 11 | * 12 | * 从wordPath、wordnet这两个数据结构中获得最终的分词结果。 13 | * 14 | * 15 | * 通过这个接口,可以让相同的分词器,获得不同的用途的分词结果。 16 | * 17 | * @author jimichan 18 | */ 19 | interface WordTermCollector { 20 | 21 | /** 22 | * 收集分词结果,最终发送到consumer中。 23 | * 这样外面是流水线还是list保存结果,由外部决定。 24 | * 25 | * @param txtChars 词图 26 | * @param KeepChar 词图 27 | * @param wordnet 词图 28 | * @param wordPath 最后的WordPath路径 29 | * @param consumer 接受WordTerm的消费者 30 | */ 31 | fun collect(txtChars:CharArray?,wordnet: Wordnet, wordPath: Wordpath, consumer: Consumer) 32 | 33 | 34 | } 35 | 36 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/correction/CorrectionDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment.plugins.correction; 17 | 18 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieMap; 19 | import com.mayabot.nlp.common.injector.ImplementedBy; 20 | 21 | /** 22 | * 分词纠错词典结构. 23 | * 对外提供一个DoubleArrayTrie 24 | * 25 | * @author jimichan 26 | */ 27 | @ImplementedBy(DefaultCorrectionDictionary.class) 28 | public interface CorrectionDictionary { 29 | 30 | DoubleArrayTrieMap getTrie(); 31 | 32 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/correction/CorrectionPlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.correction; 2 | 3 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 5 | import org.jetbrains.annotations.NotNull; 6 | 7 | /** 8 | * @author jimichan 9 | */ 10 | public class CorrectionPlugin implements PipelineLexerPlugin { 11 | 12 | CorrectionDictionary dictionary = null; 13 | 14 | public CorrectionPlugin(@NotNull CorrectionDictionary dictionary) { 15 | this.dictionary = dictionary; 16 | } 17 | 18 | public CorrectionPlugin() { 19 | } 20 | 21 | @Override 22 | public void init(PipelineLexerBuilder builder) { 23 | 24 | CorrectionDictionary temp = dictionary; 25 | if (temp == null) { 26 | temp = builder.getMynlp().getInstance(CorrectionDictionary.class); 27 | } 28 | 29 | builder.addProcessor(new CorrectionWordpathProcessor(temp)); 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/correction/CorrectionWord.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.correction 2 | 3 | /** 4 | * 第几套/房 5 | */ 6 | class CorrectionWord( 7 | val raw: String, 8 | @JvmField 9 | val path: String, 10 | val words: IntArray 11 | ) { 12 | 13 | 14 | override fun toString(): String { 15 | return "CorrectionWord{" + "path='" + path + '\'' + 16 | ", raw='" + raw + '\'' + 17 | ", words=" + words + 18 | '}' 19 | } 20 | 21 | companion object { 22 | // var splitter = Splitter.on("/").trimResults().omitEmptyStrings() 23 | 24 | /** 25 | * 第几套/房 26 | * 27 | * @param line 28 | * @return CorrectionWord 29 | */ 30 | @kotlin.jvm.JvmStatic 31 | fun parse(line: String): CorrectionWord { 32 | 33 | val raw = line.trim() 34 | val list = raw.split("/").map { it.trim() }.filter { it.isNotEmpty() } 35 | val path = list.joinToString("") 36 | val words = list.map { it.length }.toIntArray() 37 | return CorrectionWord(raw, path, words) 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/correction/FileCorrectionDictionary.kt: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment.plugins.correction 17 | 18 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieMap 19 | import com.mayabot.nlp.segment.plugins.correction.CorrectionWord.Companion.parse 20 | import java.io.File 21 | import java.nio.charset.Charset 22 | import java.util.* 23 | 24 | /** 25 | * File版本CorrectionDictionary 26 | * 文件内容格式: 27 | * 第几套/房 28 | * 29 | * 30 | * 一行一个规则 31 | * 32 | * @author jimichan 33 | */ 34 | class FileCorrectionDictionary(file: File, charset: Charset = Charsets.UTF_8) : CorrectionDictionary { 35 | 36 | private val dict: TreeMap = TreeMap() 37 | 38 | private val trie: DoubleArrayTrieMap 39 | 40 | override fun getTrie(): DoubleArrayTrieMap { 41 | return trie 42 | } 43 | 44 | init { 45 | val lines = file.readLines(charset) 46 | for (line in lines) { 47 | val adjustWord = parse(line) 48 | dict[adjustWord.path] = adjustWord 49 | } 50 | trie = DoubleArrayTrieMap(dict) 51 | } 52 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/correction/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 分词纠错 3 | */ 4 | package com.mayabot.nlp.segment.plugins.correction; -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/customwords/CustomDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | package com.mayabot.nlp.segment.plugins.customwords; 17 | 18 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieStringIntMap; 19 | import com.mayabot.nlp.common.injector.ImplementedBy; 20 | 21 | /** 22 | * 自定义词典结构. 
23 | * 对外提供一个DoubleArrayTrie 24 | * 25 | * @author jimichan 26 | */ 27 | @ImplementedBy(DefaultCustomDictionary.class) 28 | public interface CustomDictionary { 29 | 30 | DoubleArrayTrieStringIntMap getTrie(); 31 | 32 | } 33 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/customwords/CustomDictionaryPlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.customwords; 2 | 3 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 5 | 6 | public class CustomDictionaryPlugin implements PipelineLexerPlugin { 7 | 8 | private CustomDictionary customDictionary; 9 | 10 | public CustomDictionaryPlugin(CustomDictionary customDictionary) { 11 | this.customDictionary = customDictionary; 12 | } 13 | 14 | public CustomDictionaryPlugin() { 15 | } 16 | 17 | 18 | @Override 19 | public void init(PipelineLexerBuilder builder) { 20 | CustomDictionary temp; 21 | if (customDictionary == null) { 22 | temp = builder.getMynlp().getInstance(CustomDictionary.class); 23 | } else { 24 | temp = customDictionary; 25 | } 26 | 27 | builder.addProcessor(new CustomDictionaryProcessor(temp)); 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/customwords/FileCustomDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.mayabot.nlp.segment.plugins.customwords; 18 | 19 | import com.mayabot.nlp.algorithm.collection.dat.DoubleArrayTrieStringIntMap; 20 | import com.mayabot.nlp.common.Guava; 21 | import com.mayabot.nlp.common.utils.CharNormUtils; 22 | 23 | import java.io.File; 24 | import java.io.IOException; 25 | import java.nio.charset.Charset; 26 | import java.util.List; 27 | import java.util.TreeMap; 28 | 29 | /** 30 | * File版本CustomDictionary 31 | * 不管什么格式 壁式网球 1 32 | * 只取第一段,后面的忽略 33 | * 34 | * @author jimichan 35 | */ 36 | public class FileCustomDictionary implements CustomDictionary { 37 | 38 | private DoubleArrayTrieStringIntMap trie; 39 | 40 | public FileCustomDictionary(File file, Charset charset) throws IOException { 41 | TreeMap dict = new TreeMap(); 42 | 43 | List lines = Guava.readLines(file, charset); 44 | 45 | for (String line : lines) { 46 | 47 | String[] params = line.split("\\s"); 48 | 49 | String w = params[0]; 50 | String n = CharNormUtils.convert(params[0]); 51 | 52 | dict.put(w, 1000); 53 | dict.put(n, 1000); 54 | 55 | } 56 | 57 | trie = new DoubleArrayTrieStringIntMap(dict); 58 | } 59 | 60 | @Override 61 | public DoubleArrayTrieStringIntMap getTrie() { 62 | return trie; 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/ner/NerPlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.ner; 2 | 3 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 5 | import com.mayabot.nlp.segment.plugins.pos.PosPerceptronProcessor; 6 | import com.mayabot.nlp.segment.plugins.pos.PosPlugin; 7 | 8 | /** 9 | * @author jimichan 10 | */ 11 | public class NerPlugin implements PipelineLexerPlugin { 12 | 13 | @Override 14 | public void init(PipelineLexerBuilder builder) { 15 | 16 | //如果不存在那么自行安装Pos模块 17 | if (!builder.existWordPathProcessor(PosPerceptronProcessor.class)) { 18 | builder.install(new PosPlugin()); 19 | } 20 | 21 | builder.addProcessor(NerProcessor.class); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/pattern/PatternPlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.pattern; 2 | 3 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 5 | 6 | import java.util.regex.Pattern; 7 | 8 | /** 9 | * 基于正则表达式的分词插件 10 | * 11 | * @author jimichan 12 | */ 13 | public class PatternPlugin implements PipelineLexerPlugin { 14 | 15 | private Pattern pattern; 16 | 17 | public static PatternPlugin of(Pattern pattern) { 18 | return new PatternPlugin(pattern); 19 | } 20 | 21 | public PatternPlugin(Pattern pattern) { 22 | this.pattern = pattern; 23 | } 24 | 25 | @Override 26 | public void init(PipelineLexerBuilder builder) { 27 | builder.addProcessor(new PatternWordpathProcessor(pattern)); 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/personname/PersonNameAlgorithm.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.personname; 2 | 3 | import 
com.mayabot.nlp.common.injector.Singleton; 4 | import com.mayabot.nlp.segment.Nature; 5 | import com.mayabot.nlp.segment.WordSplitAlgorithm; 6 | import com.mayabot.nlp.segment.common.BaseSegmentComponent; 7 | import com.mayabot.nlp.segment.wordnet.Vertex; 8 | import com.mayabot.nlp.segment.wordnet.Wordnet; 9 | 10 | import java.util.List; 11 | /** 12 | * 采用感知机或者将来CRF制作的人名识别模型。 13 | * 这个切分算法,为了配合词典分词算法。 14 | * 我们在构造词图阶段就提取人名。 15 | */ 16 | @Singleton 17 | public class PersonNameAlgorithm extends BaseSegmentComponent implements WordSplitAlgorithm { 18 | 19 | private final PerceptronPersonNameService service; 20 | 21 | public PersonNameAlgorithm( 22 | PerceptronPersonNameService service) { 23 | super(LEVEL3); 24 | this.service = service; 25 | } 26 | 27 | @Override 28 | public void fill(Wordnet wordnet) { 29 | 30 | char[] charArray = wordnet.getCharArray(); 31 | 32 | List names = service.findName(charArray); 33 | 34 | wordnet.set(PersonNamePlugin.key,names); 35 | 36 | if (!names.isEmpty()) { 37 | for (PersonName name : names) { 38 | 39 | // 人名<=3,可能性高,作为初始词汇。防止被切断。陈宝奇怪别人不好 40 | if (name.getName().length() <= 3) { 41 | //如果已经存在 42 | if (wordnet.row(name.getOffset()).contains(name.getName().length())) { 43 | continue; 44 | } 45 | Vertex v = new Vertex(name.getName().length()); 46 | v.setAbsWordNatureAndFreq(Nature.nr); 47 | wordnet.put(name.getOffset(), v); 48 | } 49 | } 50 | } 51 | 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/personname/PersonNamePlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.personname; 2 | 3 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 5 | 6 | /** 7 | * 人名识别插件。 8 | * 3.1.0 新增了Processor。和PersonNameAlgorithm并存。 9 | * PersonNameAlgorithm处理长度小于等于3的人名。其他的人名,如果没有破坏其他词汇的切分, 10 | * 那么合并和为人名。 11 | * 修复了这种类型的bug 12 | * 阿里/nr 云/u 仓库/n 地址/n 正确/a ,/w 陈宝奇/nr 怪/a 别人/r 不好/a 13 | * 以前会把 阿里云仓 认为是人名。 14 | * 陈宝 奇怪 别人 ,人名又会被忽略的问题。 15 | * @author jimichan 16 | */ 17 | public class PersonNamePlugin implements PipelineLexerPlugin { 18 | 19 | public static final String key = "__person_name__"; 20 | 21 | @Override 22 | public void init(PipelineLexerBuilder builder) { 23 | builder.addWordSplitAlgorithm(PersonNameAlgorithm.class); 24 | builder.addProcessor(PersonNameProcessor.class); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/pos/CommonPosModel.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.pos 2 | 3 | import com.mayabot.nlp.perceptron.PerceptronModel 4 | import com.mayabot.nlp.perceptron.PerceptronComputer 5 | import java.io.File 6 | 7 | /** 8 | * 通用的词性标注。 9 | * 格式 word/pos word/pos 10 | */ 11 | open class CommonPosModel(val labels: Array, 12 | val perceptron: PerceptronModel) { 13 | 14 | init { 15 | perceptron.decodeQuickMode(true) 16 | } 17 | 18 | protected val runner = PerceptronComputer(PosPerceptronDef(labels)) 19 | 20 | /** 21 | * 解码 22 | */ 23 | fun decodeWithIndex(list: List): IntArray { 24 | return runner.decode(perceptron, list) 25 | } 26 | 27 | fun save(dir: File) { 28 | perceptron.save(dir) 29 | } 30 | 31 | fun learn(sample: String) { 32 | runner.learnModel(perceptron,sample) 33 | } 34 | 35 | /** 36 | * 解码 37 | */ 38 | 
fun decode(list: List): List { 39 | val decodeResult = runner.decode(perceptron, list) 40 | return decodeResult.map { labels[it] } 41 | } 42 | 43 | companion object { 44 | 45 | fun train(labels: List, 46 | trainFile: File, 47 | evaluateFile: File?, 48 | iter: Int, 49 | threadNum: Int): PerceptronModel { 50 | val runner = PerceptronComputer(PosPerceptronDef(labels.toTypedArray())) 51 | return runner.train(trainFile, evaluateFile, iter, threadNum, true) 52 | } 53 | } 54 | } -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/pos/PosPerceptronUtils.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.pos 2 | 3 | import com.mayabot.nlp.segment.common.allFiles 4 | import com.mayabot.nlp.segment.common.parseToFlatWords 5 | import java.io.File 6 | 7 | fun main() { 8 | genTrainData() 9 | } 10 | fun genTrainData() { 11 | val cn = File("data.work/corpus/cncorpus") 12 | val pk = File("data.work/corpus/pku") 13 | 14 | fun read(file: File,list: MutableList){ 15 | file.allFiles().forEach { f -> 16 | f.forEachLine { line -> 17 | if (line.isNotBlank()) { 18 | val x = line.parseToFlatWords().filter { it.pos.isNotBlank() }.joinToString(separator = " ") 19 | if(x.isNotBlank()) { 20 | list += x 21 | } 22 | } 23 | } 24 | } 25 | } 26 | 27 | val list = ArrayList() 28 | 29 | read(cn,list) 30 | read(pk,list) 31 | 32 | list.shuffle() 33 | 34 | val out = File("data.work/pos.data") 35 | out.mkdirs() 36 | var k = 0 37 | list.asSequence().chunked(50000).forEach { part-> 38 | k++ 39 | File(out,"part-${k}.txt").writer(Charsets.UTF_8).use { 40 | part.forEach { line-> 41 | it.write(line) 42 | it.write("\n") 43 | } 44 | } 45 | } 46 | 47 | } 48 | 49 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/plugins/pos/PosPlugin.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.plugins.pos; 2 | 3 | import com.mayabot.nlp.segment.pipeline.PipelineLexerBuilder; 4 | import com.mayabot.nlp.segment.pipeline.PipelineLexerPlugin; 5 | 6 | /** 7 | * 词性模块 8 | * 9 | * @author jimichan 10 | */ 11 | public class PosPlugin implements PipelineLexerPlugin { 12 | 13 | @Override 14 | public void init(PipelineLexerBuilder builder) { 15 | builder.addProcessor(PosPerceptronProcessor.class); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/reader/BaseFilterLexerReader.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.reader; 2 | 3 | import com.mayabot.nlp.segment.LexerReader; 4 | import com.mayabot.nlp.segment.WordTerm; 5 | import com.mayabot.nlp.segment.WordTermSequence; 6 | 7 | import java.io.Reader; 8 | import java.util.Iterator; 9 | import java.util.function.Predicate; 10 | 11 | public abstract class BaseFilterLexerReader implements LexerReader, Predicate { 12 | 13 | private final LexerReader source; 14 | 15 | private boolean enable = true; 16 | 17 | public BaseFilterLexerReader(LexerReader source) { 18 | this.source = source; 19 | } 20 | 21 | public LexerReader getSource() { 22 | return source; 23 | } 24 | 25 | @Override 26 | public WordTermSequence scan(Reader reader) { 27 | WordTermSequence wts = source.scan(reader); 28 | if (!enable) { 29 | return wts; 30 | } 31 | Iterator 
iterator = wts.iterator(); 32 | Iterator change = new FilterWordItemIterator(iterator, this); 33 | return new WordTermSequence(change); 34 | } 35 | 36 | @Override 37 | public WordTermSequence scan(String text) { 38 | WordTermSequence wts = source.scan(text); 39 | if (!enable) { 40 | return wts; 41 | } 42 | Iterator iterator = wts.iterator(); 43 | Iterator change = new FilterWordItemIterator(iterator, this); 44 | return new WordTermSequence(change); 45 | } 46 | 47 | public boolean isEnable() { 48 | return enable; 49 | } 50 | 51 | public void setEnable(boolean enable) { 52 | this.enable = enable; 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/reader/DefaultLexerReader.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.reader; 2 | 3 | import com.mayabot.nlp.segment.Lexer; 4 | import com.mayabot.nlp.segment.LexerReader; 5 | import com.mayabot.nlp.segment.WordTermSequence; 6 | 7 | import java.io.Reader; 8 | 9 | /** 10 | * @author jimichan 11 | */ 12 | public class DefaultLexerReader implements LexerReader { 13 | 14 | private final Lexer lexer; 15 | 16 | public DefaultLexerReader(Lexer lexer) { 17 | this.lexer = lexer; 18 | } 19 | 20 | @Override 21 | public WordTermSequence scan(Reader reader) { 22 | return new WordTermSequence(lexer, reader); 23 | } 24 | 25 | @Override 26 | public WordTermSequence scan(String text) { 27 | return new WordTermSequence(lexer, text); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/wordnet/BestPathAlgorithm.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2018 mayabot.com authors. All rights reserved. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.mayabot.nlp.segment.wordnet; 18 | 19 | import com.mayabot.nlp.segment.lexer.bigram.ViterbiBestPathAlgorithm; 20 | 21 | /** 22 | * 选择最佳路径接口。具体实现有,viterbi 维特比 dijkstra算法 NShort算法 前向最大路径算法 23 | * 24 | * @author jimichan 25 | * @see ViterbiBestPathAlgorithm 26 | * @see com.mayabot.nlp.segment.plugins.bestpath.LongpathBestPathAlgorithm 27 | */ 28 | public interface BestPathAlgorithm { 29 | 30 | /** 31 | * 从词图网络中选择一条从头到尾的路径 32 | * 33 | * @param wordnet 输入词图 34 | * @return Wordpath 35 | */ 36 | Wordpath select(Wordnet wordnet); 37 | } 38 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/segment/wordnet/package-info.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.wordnet; 2 | 3 | /** 4 | * Wordnet是一个在分词使用的数据结构。 5 | * 所谓词图,指的是句子中所有词可能构成的图。 6 | *

7 | * 这里提供了优化的Wordnet的实现,更快的速度、更低的内存、尽量zero-copy。 8 | *
9 | * 还提供了Wordpath数据结构,wordpath采用bitSet去实现对选中路径的描述,避免和wordnet数据结构的纠缠, 10 | * 让规则程序更容易去进行重新划分词语,为识别器和业务规则的编码带来便利,降低了程序复杂度。 11 | * 12 | * @author jimichan 13 | **/ -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/starspace/Prediction.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.starspace 2 | 3 | import com.mayabot.nlp.blas.Vector 4 | 5 | 6 | data class Prediction(var score: Float, var second: Int) 7 | 8 | class StarSpacePrediction(private val model: StarSpace, basedoc: String?) { 9 | 10 | var baseDocVectors: MutableList = ArrayList() 11 | 12 | var baseDocs: MutableList> = ArrayList() 13 | 14 | init { 15 | val (x, y) = model.loadBaseDocs(basedoc) 16 | baseDocs = y 17 | baseDocVectors = x 18 | } 19 | 20 | fun predictOne(doc: String): List { 21 | return predictOne(model.dict.parseDoc(doc), 5) 22 | } 23 | 24 | fun predictOne(doc: String, k: Int): List { 25 | return predictOne(model.dict.parseDoc(doc), k) 26 | } 27 | 28 | fun predictOne(input: List, k: Int): List { 29 | 30 | val lhsM = model.projectLHS(input) 31 | 32 | val topMax = TopMaxK(k) 33 | 34 | for (i in baseDocVectors.indices) { 35 | val score = model.args.similarity(lhsM, baseDocVectors[i]) 36 | topMax.push(i, score) 37 | } 38 | 39 | return topMax.resort().map { Prediction(it.second, it.first) } 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /mynlp/src/main/java/com/mayabot/nlp/starspace/SparseLinear.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.starspace 2 | 3 | import com.mayabot.nlp.blas.DenseMatrix 4 | import com.mayabot.nlp.blas.DenseVector 5 | import com.mayabot.nlp.blas.Vector 6 | 7 | 8 | open class TheMatrix(val matrix: DenseMatrix) { 9 | 10 | fun numRows(): Int { 11 | return matrix.row 12 | } 13 | 14 | fun numCols(): Int { 15 | return matrix.col 16 | } 17 | 18 | } 19 | 20 | 21 | class SparseLinear(matrix: DenseMatrix) : TheMatrix(matrix) { 22 | 23 | fun forward(row: Int): Vector { 24 | return matrix[row] 25 | } 26 | 27 | fun forward(list: List): Vector { 28 | 29 | val vector = DenseVector(this.numCols()) 30 | 31 | for ((row, scale) in list) { 32 | vector += scale to matrix[row] 33 | } 34 | 35 | return vector 36 | } 37 | 38 | } -------------------------------------------------------------------------------- /mynlp/src/main/resources/META-INF/mynlp.factories: -------------------------------------------------------------------------------- 1 | GuiceModule=com.mayabot.nlp.segment.SegmentModule -------------------------------------------------------------------------------- /mynlp/src/main/resources/mynlp/py_hard_code_map.txt: -------------------------------------------------------------------------------- 1 | # - 表示互换 2 | # -> 表示单向变化 3 | # 大概的逻辑是 前面是嘴瓢的说法,后面是正确的音。 4 | # 比如 灰机 --> 飞机 huiji,feiji 5 | hua - fa 6 | huan - fan 7 | hui -> fei 8 | jie -> zhe 9 | kou -> ke 10 | gou -> ge 11 | zhong -> zen 12 | san -> shang 13 | -------------------------------------------------------------------------------- /mynlp/src/main/resources/patch/cws-default.txt: -------------------------------------------------------------------------------- 1 | # ------------------ # 2 | # 感知机分词在线学习补丁 3 | # 随着新版本发现在这里默认修复感知机分词错误案例 4 | # ------------------ # 5 | 6 | X 临时 分居 -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/BM25Test.kt: 
-------------------------------------------------------------------------------- 1 | package com.mayabot.nlp 2 | 3 | import com.mayabot.nlp.similarity.BM25ModelBuilder 4 | 5 | 6 | fun main() { 7 | val doc = listOf( 8 | "黄浦区人民政府在哪", 9 | "黄浦区人民政府要怎么去", 10 | "区政府在哪?", 11 | "区政府怎么走?", 12 | "区人民政府在哪儿", 13 | "黄浦区人民政府的工作地址", 14 | "黄浦区政府在哪", 15 | "人民政府要怎么去", 16 | "人民政府的工作地址", 17 | "黄浦区人民政府的电话是多少", 18 | "黄浦区人民政府的电话号码是多少", 19 | "区政府电话", 20 | "区政府的联系方式", 21 | "黄浦区行政服务中心在哪", 22 | "黄浦区行政服务中心要怎么去", 23 | "行政服务中心在哪", 24 | "行政服务中心要怎么去", 25 | "行政服务中心的工作地址", 26 | "区行政服务中心在哪?", 27 | "黄浦区金融发展服务中心都几点有人", 28 | "黄浦区金融发展服务中心都几点有人在工作", 29 | "我什么时候去金融发展服务中心比较合适", 30 | "金融发展服务中心的工作时间是几点", 31 | "我几点去金融发展服务中心比较合适", 32 | "金融发展服务中心的工作时间是几点", 33 | "金融发展服务中心一周的工作时间都是几点到几点", 34 | "区金融发展服务中心上班时间", 35 | "请问一下金融发展服务中心电话", 36 | "请给我金融发展服务中心的电话", 37 | "请把金融发展服务中心的电话给我", 38 | "区金融发展服务中心的电话", 39 | "区金融发展服务中心的联系方式", 40 | "黄浦区金融发展服务中心在哪", 41 | "黄浦区金融发展服务中心要怎么去", 42 | "区金融发展服务中心在哪?", 43 | "黄浦区金融发展服务中心的工作地址", 44 | "金融发展服务中心在哪", 45 | "金融发展服务中心要怎么去", 46 | "金融发展服务中心的工作地址", 47 | "怎么去", 48 | // "请问一下行政服务的电话" 49 | ) 50 | 51 | val bm25 = BM25ModelBuilder(doc).b(0.75f).build() 52 | 53 | bm25.search("怎么去").forEach { 54 | println(" ${doc[it.docId]} $it") 55 | } 56 | 57 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/Highlight.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp 2 | 3 | import com.mayabot.nlp.module.Highlighter 4 | import com.mayabot.nlp.module.highlight 5 | import org.junit.Assert 6 | import org.junit.Test 7 | 8 | class HighlightTest { 9 | 10 | private val words = listOf("居住证", "居住", "住宅", "hello") 11 | 12 | @Test 13 | fun test() { 14 | 15 | val highlighter = Highlighter(words) 16 | val text = "这个居住证,怎么办,居住和住宅----" 17 | 18 | Assert.assertEquals(highlighter.replace(text), "这个居住证,怎么办,居住住宅----") 19 | } 20 | 21 | @Test 22 | fun test2() { 23 | val highlighter = Highlighter(words, "div") 24 | val text = "这个居住证,怎么办,居住和住宅----" 25 | 26 | Assert.assertEquals(highlighter.replace(text), "这个

居住证
,怎么办,
居住
住宅
----") 27 | } 28 | 29 | @Test 30 | fun test3() { 31 | val text = "这个居住证,怎么办,居住和住宅----" 32 | 33 | val result = text.highlight(words) 34 | 35 | Assert.assertEquals(result, "这个居住证,怎么办,居住住宅----") 36 | } 37 | 38 | /** 39 | * 大小写 40 | */ 41 | @Test 42 | fun test4() { 43 | val text = "Hello word !" 44 | 45 | val result = text.highlight(words) 46 | 47 | Assert.assertEquals("Hello word !", result) 48 | } 49 | 50 | /** 51 | * 大小写 52 | */ 53 | @Test 54 | fun test5() { 55 | val text = "HEllo word !" 56 | 57 | val result = text.highlight(words) 58 | 59 | Assert.assertEquals("HEllo word !", result) 60 | } 61 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/TransTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp; 2 | 3 | import com.mayabot.nlp.Mynlp; 4 | import org.junit.Assert; 5 | 6 | public class TransTest { 7 | 8 | @org.junit.Test 9 | public void test() { 10 | 11 | Mynlp mynlp = Mynlp.instance(); 12 | 13 | String text = "軟件和體育的藝術"; 14 | String text_s = "软件和体育的艺术"; 15 | 16 | Assert.assertTrue(text.equals(mynlp.s2t(text_s))); 17 | 18 | Assert.assertTrue(text_s.equals(mynlp.t2s(text))); 19 | } 20 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/XxHashTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp 2 | 3 | import net.openhft.hashing.LongHashFunction 4 | import org.junit.Assert 5 | import org.junit.Test 6 | 7 | class XxHashTest { 8 | 9 | @Test 10 | fun test() { 11 | //7958582187431989116 12 | val hash = LongHashFunction.xx().hashChars("要闻汲取奋力前行力量李强龚正等参观我们众志成城上海防控新冠肺炎疫情主题展览") 13 | Assert.assertEquals(hash,7958582187431989116) 14 | } 15 | 16 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/commmon/CsrSparseMatrixTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.commmon 2 | 3 | import com.mayabot.nlp.common.TreeBasedTable 4 | import com.mayabot.nlp.common.matrix.CSRSparseMatrix 5 | import org.junit.Assert 6 | import org.junit.Test 7 | 8 | class CsrSparseMatrixTest { 9 | 10 | @Test 11 | fun test() { 12 | val table: TreeBasedTable = TreeBasedTable() 13 | 14 | table.put(2, 0, 6) 15 | table.put(3, 2, 4) 16 | table.put(0, 0, 5) 17 | table.put(0, 3, 2) 18 | table.put(4, 1, 2) 19 | table.put(4, 4, 9) 20 | 21 | val csr = CSRSparseMatrix(table, 5) 22 | 23 | Assert.assertTrue(csr[2, 0] == 6) 24 | Assert.assertTrue(csr[3, 2] == 4) 25 | Assert.assertTrue(csr[0, 0] == 5) 26 | Assert.assertTrue(csr[0, 3] == 2) 27 | Assert.assertTrue(csr[4, 1] == 2) 28 | Assert.assertTrue(csr[4, 4] == 9) 29 | } 30 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/commmon/TokenizerSplitterTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.commmon; 2 | 3 | 4 | import org.junit.Assert; 5 | import org.junit.Test; 6 | 7 | import static com.mayabot.nlp.common.TokenizerSplitter.parts; 8 | 9 | public class TokenizerSplitterTest { 10 | 11 | @Test 12 | public void test() { 13 | System.out.println(); 14 | Assert.assertTrue(parts("").isEmpty()); 15 | Assert.assertEquals(parts(",abc,efg").toString(), "[abc, efg]"); 16 | Assert.assertEquals(parts(",,abc efg.").toString(), "[abc, efg]"); 17 | 
Assert.assertEquals(parts("abcefg").toString(), "[abcefg]"); 18 | Assert.assertEquals(parts("ou may skip through a book, reading only those passages concerned ").toString(), 19 | "[ou, may, skip, through, a, book, reading, only, those, passages, concerned]"); 20 | 21 | Assert.assertEquals(parts("你可以跳读一本书,只拣那些有关的段落读一下即可。").toString(), 22 | "[你可以跳读一本书, 只拣那些有关的段落读一下即可]"); 23 | 24 | // long t1 = System.currentTimeMillis(); 25 | // for (int i = 0; i < 100000; i++) { 26 | // parts("你可以跳读一本书,只拣那些有关的段落读一下即可。"); 27 | // } 28 | // long t2 = System.currentTimeMillis(); 29 | // long time = t2 - t1; 30 | // Assert.assertTrue(time < 5000); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/fasttext/CFtzModelBugTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext 2 | 3 | import java.io.File 4 | 5 | fun main() { 6 | val model = FastText.loadCppModel(File("fastText4j/data/ChineseJapaneseKoreanLangIder.ftz")) 7 | 8 | val list = model.predict(listOf("こんにちは"), 3, 0.1f) 9 | 10 | list.forEach { 11 | println(it) 12 | } 13 | 14 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/fasttext/Java.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext; 2 | 3 | public class Java { 4 | public static void main(String[] args) { 5 | 6 | // FastText fastText = FastText.loadModelFormZip(new File("data/agnews/model.zip")); 7 | // 8 | // System.out.println(); 9 | 10 | // File trainFile = new File("data/agnews/ag.train"); 11 | // InputArgs inputArgs = new InputArgs(); 12 | // inputArgs.setLoss(LossName.softmax); 13 | // inputArgs.setLr(0.1); 14 | // inputArgs.setDim(100); 15 | // inputArgs.setEpoch(20); 16 | // 17 | // FastText model = FastText.trainSupervised(trainFile, inputArgs); 18 | // 19 | // model.test(new File("data/agnews/ag.test"),1,0,true); 20 | // 21 | // model.predict() 22 | 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/fasttext/SupTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext 2 | 3 | import com.mayabot.nlp.fasttext.args.InputArgs 4 | import com.mayabot.nlp.fasttext.loss.LossName 5 | import java.io.File 6 | 7 | 8 | val trainFile = File("fastText4j/data/agnews/ag.train") 9 | val testFile = File("fastText4j/data/agnews/ag.test") 10 | 11 | fun main() { 12 | 13 | val args = InputArgs().apply { 14 | this.loss = LossName.softmax 15 | lr = 0.1 16 | dim = 100 17 | minn = 0 18 | maxn = 0 19 | } 20 | 21 | // var fastText = FastText.trainSupervised(trainFile, args) 22 | // 23 | // fastText = fastText.quantize() 24 | // 25 | // fastText.saveModelToSingleFile(File("fastText4j/data/model.fjbin")) 26 | 27 | // fastText.saveModel("fasttext/data/agnews/model") 28 | // 29 | // val qFastText = fastText.quantize(dsub = 10) 30 | // qFastText.saveModel("fasttext/data/agnews/model.q") 31 | ////// 32 | //fastText.test(testFile, 1) 33 | // qFastText.test(testFile, 1) 34 | 35 | // val fastText = FastText.loadModel(File("fasttext/data/agnews/model"),true) 36 | // val fastText = FastText.loadCppModel(File("fasttext/data/agnews/model.ftz")) 37 | val fastText = FastText.loadModelFromSingleFile(File("fastText4j/data/model.fjbin")) 38 | fastText.test(testFile, 1) 39 | 40 | } 41 | 
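The fastText test files above (SupTest.kt, TestSup.kt, Java.java) exercise the supervised pipeline only in fragments, with much of the train/quantize/save flow commented out. A minimal end-to-end sketch, assuming only the calls that already appear in those files (trainSupervised, quantize, saveModelToSingleFile, loadModelFromSingleFile, test, predict) and using placeholder file paths:

import com.mayabot.nlp.fasttext.FastText
import com.mayabot.nlp.fasttext.args.InputArgs
import com.mayabot.nlp.fasttext.loss.LossName
import java.io.File

fun main() {
    // Training arguments, mirroring SupTest.kt / Java.java above.
    val args = InputArgs().apply {
        loss = LossName.softmax
        lr = 0.1
        dim = 100
        epoch = 20
    }

    // Train a supervised classifier; the input is a fastText-style labelled text file.
    val model = FastText.trainSupervised(File("data/agnews/ag.train"), args)

    // Optionally quantize to shrink the model, then persist it as a single file.
    val quantized = model.quantize()
    quantized.saveModelToSingleFile(File("data/agnews/model.fjbin"))

    // Reload, evaluate and predict, as SupTest.kt and CFtzModelBugTest.kt do.
    val loaded = FastText.loadModelFromSingleFile(File("data/agnews/model.fjbin"))
    loaded.test(File("data/agnews/ag.test"), 1)
    println(loaded.predict(listOf("sample", "tokens"), 3, 0.0f))
}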
-------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/fasttext/TestCModelFTZ.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext 2 | 3 | import java.io.File 4 | 5 | fun main() { 6 | val file = File("/Users/jimichan/Downloads/ChineseJapaneseKoreanLangIder.ftz") 7 | 8 | val model = FastText.loadCppModel(file) 9 | 10 | val x = model.predict(listOf("hello", "hi"), 1, 0.0f) 11 | println(x) 12 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/fasttext/TestSup.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext 2 | 3 | import com.mayabot.nlp.fasttext.args.InputArgs 4 | import com.mayabot.nlp.fasttext.args.ModelName 5 | import com.mayabot.nlp.fasttext.loss.LossName 6 | import com.mayabot.nlp.fasttext.utils.disableLog 7 | import java.io.File 8 | 9 | class TestSup { 10 | 11 | val trainFile = File("data/agnews/ag.train") 12 | val testFile = File("data/agnews/ag.test") 13 | 14 | fun testSub(){ 15 | disableLog() 16 | val lossNames = listOf(LossName.softmax,LossName.ns,LossName.hs,LossName.ova) 17 | 18 | lossNames.forEach { loss-> 19 | check(test(loss)){ 20 | "Loss Name ${loss.name} ERROR" 21 | } 22 | } 23 | } 24 | 25 | 26 | fun test(lossName: LossName) : Boolean { 27 | // val trainSources = listOf(loadTrainFile("ag.train.txt")) 28 | // val testSources = loadTrainFile("ag.test.txt") 29 | 30 | val trainArgs = InputArgs() 31 | trainArgs.loss = lossName 32 | 33 | val fastText = FastText.train(trainFile, ModelName.sup, trainArgs) 34 | 35 | fastText.quantize() 36 | 37 | val meter = fastText.test(testFile) 38 | 39 | return meter.f1Score() > 0.9 40 | } 41 | 42 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/fasttext/TestWords.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext 2 | 3 | import com.mayabot.nlp.blas.cosine 4 | import java.io.File 5 | 6 | fun main() { 7 | 8 | // val file = File("/Users/jimichan/Downloads/wiki.zh.bin") 9 | // 10 | // val fastText = FastText.loadCppModel(file) 11 | 12 | val fastText = FastText.loadModel(File("/Users/jimichan/mynlp.data/wordvec.vec"), true) 13 | 14 | println("加载模型到内存完成") 15 | 16 | // val k = fastText.nearestNeighbor("丢失",5) 17 | 18 | fastText.like("丢", "丢失") 19 | fastText.like("遗落", "丢失") 20 | fastText.like("偷走", "丢失") 21 | fastText.like("遗失", "丢失") 22 | fastText.like("遗失", "遗落") 23 | fastText.like("失去", "丢失") 24 | fastText.like("上海", "丢失") 25 | fastText.like("挂失", "补办") 26 | 27 | println("----------------") 28 | fastText.senLike("卡 丢失 了", "卡 被 偷走 了") 29 | fastText.senLike("卡 丢失 了", "信用卡 忘记 密码 ") 30 | 31 | // println(fastText.analogies("柏林","德国","法国",5)) 32 | 33 | } 34 | 35 | private fun FastText.like(word1: String, word2: String) { 36 | val cos = cosine(this.getWordVector(word1), this.getWordVector(word2)) 37 | println("$word1 <-> $word2 : ${cos}") 38 | } 39 | 40 | private fun FastText.senLike(word1: String, word2: String) { 41 | val cos = cosine(this.getSentenceVector(word1.split(" ")), this.getSentenceVector(word2.split(" "))) 42 | println("$word1 <-> $word2 : ${cos}") 43 | } 44 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/fasttext/Utils.kt: 
-------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.fasttext 2 | 3 | import com.mayabot.nlp.fasttext.train.MemSampleLineList 4 | import com.mayabot.nlp.fasttext.train.SampleLine 5 | 6 | fun loadTrainFile( resouceName:String ) : MemSampleLineList{ 7 | 8 | val path = "/"+resouceName 9 | 10 | val ins = TestSup::class.java.getResourceAsStream(path) 11 | 12 | val list = ArrayList() 13 | ins.bufferedReader().lines().forEach { 14 | list += it 15 | } 16 | 17 | val x = list.map { SampleLine(it.split(" ").toList()) }.toMutableList() 18 | 19 | return MemSampleLineList(x) 20 | 21 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/module/lucene/LuceneUtils.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.module.lucene 2 | 3 | import org.apache.lucene.analysis.TokenStream 4 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute 5 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute 6 | 7 | fun TokenStream.iterable():Iterable{ 8 | return Iterable { 9 | TokenStreamIterator(this) 10 | } 11 | } 12 | 13 | class TokenStreamIterator(private val tokenStream:TokenStream) : AbstractIterator() { 14 | init { 15 | tokenStream.reset() 16 | } 17 | 18 | private val charTermAttr = tokenStream.getAttribute(CharTermAttribute::class.java) 19 | private val offsetAttr = tokenStream.getAttribute(OffsetAttribute::class.java) 20 | 21 | override fun computeNext() { 22 | val hasNext = tokenStream.incrementToken() 23 | if (hasNext) { 24 | this.setNext(charTermAttr.toString()) 25 | }else{ 26 | tokenStream.end() 27 | tokenStream.close() 28 | done() 29 | } 30 | } 31 | 32 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/module/lucene/TestPinyinTokenizer.kt: -------------------------------------------------------------------------------- 1 | //package com.mayabot.nlp.module.lucene 2 | // 3 | //import com.mayabot.nlp.Mynlp 4 | //import org.apache.lucene.analysis.tokenattributes.CharTermAttribute 5 | //import org.apache.lucene.analysis.tokenattributes.OffsetAttribute 6 | //import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute 7 | //import org.junit.Test 8 | //import java.io.StringReader 9 | // 10 | //class TestPinyinTokenizer { 11 | // 12 | // val pinyin = Mynlp.instance().pinyin(); 13 | // 14 | // @Test 15 | // fun test() { 16 | // val tok = PinyinAnalyzer(pinyin,true,true,false) 17 | // 18 | // tok.pinyinTokens("飞机").forEach { 19 | // println(it) 20 | // } 21 | // } 22 | // 23 | // @Test 24 | // fun test2() { 25 | // val tok = PinyinAnalyzer(pinyin,true,false,false) 26 | // 27 | // tok.pinyinTokens("三个 小猪").forEach { 28 | // println(it) 29 | // } 30 | // } 31 | // 32 | // private fun PinyinAnalyzer.pinyinTokens(text:String):List { 33 | // val tk = this.tokenStream("title",text) 34 | // tk.reset() 35 | // 36 | // val charTermAttr = tk.getAttribute(CharTermAttribute::class.java) 37 | // val offsetAttr = tk.getAttribute(OffsetAttribute::class.java) 38 | // val posAttr = tk.getAttribute(PositionIncrementAttribute::class.java) 39 | // val list = ArrayList() 40 | // 41 | // while (tk.incrementToken()) { 42 | // list += Item(charTermAttr.toString(),offsetAttr.startOffset(),offsetAttr.endOffset(),posAttr.positionIncrement) 43 | // } 44 | // tk.end() 45 | // tk.close() 46 | // return list 47 | // } 48 | // 49 | // data class Item( 
50 | // val py:String, 51 | // val offsetStart:Int, 52 | // val offsetEnd:Int, 53 | // val inc:Int, 54 | // ) 55 | //} -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/pa/GeleiCode.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.pa 2 | 3 | import kotlin.math.pow 4 | 5 | 6 | //void GrayCode(int n, string *data) 7 | //{ 8 | // if (n == 1)//终止条件,先生成1位的格雷码 9 | // { 10 | // data[0] = "0"; 11 | // data[1] = "1"; 12 | // return; 13 | // } 14 | // GrayCode(n - 1, data);//生成n位的格雷码首先需要生成n-1的格雷码 15 | // int len = (int)pow(2, n); 16 | // for (int i = len / 2; i < len; i++)//先处理后半部分,注意对称 17 | // { 18 | // data[i] = "1" + data[len - i - 1]; 19 | // } 20 | // for (int i = 0; i < len / 2; i++)//对于前半部分直接+'0' 21 | // { 22 | // data[i] = "0" + data[i]; 23 | // } 24 | 25 | fun grapCode(n: Int, data: Array) { 26 | if (n == 1) { 27 | data[0] = "0" 28 | data[1] = "1" 29 | return 30 | } 31 | grapCode(n - 1, data) 32 | val len = 2.0.pow(n).toInt() 33 | for (i in len / 2 until len) { 34 | data[i] = "1" + data[len - i - 1] 35 | } 36 | for (i in 0 until len / 2) { 37 | data[i] = "0" + data[i] 38 | } 39 | } 40 | 41 | fun main() { 42 | val n = 5 43 | val data = Array(2.0.pow(n * 1.0).toInt()) { null } 44 | grapCode(n, data) 45 | 46 | var i = 0 47 | for (line in data) { 48 | println("$i\t" + line!!.padStart(n, '0')) 49 | i++ 50 | } 51 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/pinyin/PinyinDistance.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.pinyin 2 | 3 | import com.mayabot.nlp.module.pinyin.PinyinDistance 4 | import org.junit.Test 5 | 6 | class PinyinDistance { 7 | 8 | @Test 9 | fun test() { 10 | PinyinDistance.distance("灰机", "飞机") 11 | PinyinDistance.distance("粉丝", "大侠") 12 | PinyinDistance.distance("粉丝中", "大侠梦") 13 | } 14 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/pinyin/PinyinTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.pinyin 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.Mynlp.Companion.instance 5 | import org.junit.Assert 6 | import org.junit.Test 7 | 8 | class PinyinTest { 9 | 10 | @Test 11 | fun test() { 12 | Assert.assertEquals("[zhao, zhao, mu, mu]", "朝朝暮暮".py()) 13 | } 14 | 15 | @Test 16 | fun test2() { 17 | println( 18 | instance().convertPinyin("转战") 19 | .fuzzy(true).asList() 20 | ) 21 | } 22 | 23 | // @Test 24 | // fun test3() { 25 | // var pinyin = Mynlp.instance().pinyin() 26 | // for (py in pinyin.charPinyin('行')) { 27 | // println(py) 28 | // } 29 | // } 30 | 31 | private fun String.py() = Mynlp.instance().convertPinyin(this).asList().toString() 32 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/CmbSegment.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import org.junit.Test 4 | 5 | /** 6 | * 招行分词需求 7 | */ 8 | class CmbSegment { 9 | 10 | @Test 11 | fun test() { 12 | val text = "" + 13 | "2018年年度收入\n" + 14 | "2018年收入\n" + 15 | "17年账单\n" + 16 | "我要找1到3个月出入账\n" + 17 | "周一到周三花了多少钱\n" + 18 | "最近三天花了多少钱\n" + 19 | "最近一周转账记录\n" + 20 | "6月账单\n" + 21 | "半年流水\n" + 22 | "二月份明细账\n" + 23 | "最近6个月全部账单\n" + 24 | 
"一年流水\n" + 25 | "四个月流水\n" + 26 | "四月份收入\n" + 27 | "上一月支出\n" + 28 | "6月1号账单\n" + 29 | "6月28号流水\n" + 30 | "这是陈汝烨和张帆副院长的生日" 31 | 32 | // 1. 自定义词库 33 | // 2. 人工纠错规则 34 | 35 | val tokenizer = Lexers.core() 36 | 37 | for (line in text.split("\n".toRegex()).dropLastWhile { it.isEmpty() }.toTypedArray()) { 38 | print(line + "\t") 39 | 40 | println(tokenizer.scan(line)) 41 | } 42 | } 43 | 44 | 45 | } 46 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/CombineTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment; 2 | 3 | import org.junit.Assert; 4 | import org.junit.Test; 5 | 6 | /** 7 | * CoreTokenizer自带Combine逻辑,不再需要后置处理了。 8 | */ 9 | public class CombineTest { 10 | 11 | @Test 12 | public void test() { 13 | 14 | Lexer tokenizer = Lexers.core(); 15 | 16 | String test = "体重182kg\n" + 17 | "五十八公斤\n" + 18 | "产品编号BN-598\n" + 19 | "产品编号BN-598-122N\n" + 20 | "我买了一台very cool iPhone7\n" + 21 | "分词标签是__lable__"; 22 | 23 | 24 | String[] result = ("体重 182kg\n" + 25 | "五十八公斤\n" + 26 | "产品 编号 bn-598\n" + 27 | "产品 编号 bn-598-122n\n" + 28 | "我 买 了 一台 very cool iphone7\n" + 29 | "分词 标签 是 __lable__").split("\n"); 30 | 31 | int i = 0; 32 | for (String text : test.split("\n")) { 33 | String t = tokenizer.scan(text).toPlainString(); 34 | Assert.assertTrue(t + "--->" + result[i], t.equals(result[i].toLowerCase())); 35 | i++; 36 | } 37 | 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/CustomDictTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import com.mayabot.nlp.segment.plugins.customwords.MemCustomDictionary 4 | import org.junit.Test 5 | 6 | class CustomDictTest { 7 | 8 | 9 | @Test 10 | fun test() { 11 | val mem = MemCustomDictionary() 12 | mem.addWord("长江1号"); 13 | mem.addWord("ECS固收"); 14 | mem.addWord("固收"); 15 | mem.rebuild() 16 | 17 | mem.clear() 18 | 19 | mem.addWord("固收"); 20 | mem.rebuild() 21 | 22 | val lexer = Lexers.coreBuilder() 23 | .withCustomDictionary(mem) 24 | .customSentenceCollector { 25 | it.smartSubword() 26 | it.fillCustomDict(mem) 27 | } 28 | .build() 29 | 30 | println(lexer.scan("ECS固收")) 31 | println("----") 32 | lexer.scan("ECS固收").forEach { w -> 33 | println(w.subword) 34 | } 35 | } 36 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/IndexSegmentTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment; 2 | 3 | import com.mayabot.nlp.segment.plugins.collector.SentenceCollectorBuilder; 4 | import org.junit.Assert; 5 | import org.junit.Test; 6 | 7 | public class IndexSegmentTest { 8 | 9 | 10 | @Test 11 | public void test() { 12 | 13 | Lexer mynlpTokenizer = Lexers. 
14 | coreBuilder() 15 | .customSentenceCollector(SentenceCollectorBuilder::indexSubword) 16 | .build(); 17 | 18 | String str = mynlpTokenizer.scan("中华人民共和国的利益").toString(); 19 | 20 | Assert.assertEquals("[中华 华人 人民 人民共和国 共和 共和国] 的 利益",str); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/KeepOriCharOutputTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import org.junit.Assert 4 | import org.junit.Test 5 | 6 | class KeepOriCharOutputTest { 7 | 8 | @Test 9 | fun test(){ 10 | val lerxer = Lexers.coreBuilder() 11 | .keepOriCharOutput() 12 | .build() 13 | Assert.assertEquals("看看 下面 这 中文 逗号 , Keep 大小写", 14 | lerxer.scan("看看下面这中文逗号,Keep 大小写").toPlainString() 15 | ) 16 | } 17 | 18 | @Test 19 | fun test2(){ 20 | val lerxer = Lexers.coreBuilder() 21 | .build() 22 | Assert.assertEquals("看看 下面 这 中文 逗号 , keep 大小写", 23 | lerxer.scan("看看下面这中文逗号,Keep 大小写").toPlainString() 24 | ) 25 | } 26 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/KotlinTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | 4 | fun main() { 5 | println("科学之门".lexer()) 6 | println("录音曝光!朴槿惠就职总统前 听崔顺实90分钟指导".segment()) 7 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/PosTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment; 2 | 3 | 4 | public class PosTest { 5 | 6 | public static void main(String[] args) { 7 | 8 | // PerceptronPosService service = Mynlps.instanceOf(PerceptronPosService.class); 9 | // List words = Lists.newArrayList("第三 章".split(" ")); 10 | // List pos = service.pos(words); 11 | // 12 | // for (int i = 0; i < words.size(); i++) { 13 | // System.out.println(words.get(i)+"/"+pos.get(i)); 14 | // } 15 | 16 | System.out.println(Lexers.core().scan("第三章,章先生")); 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/SegmentErrorCasesTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import org.junit.Test 4 | 5 | /** 6 | * 收集分词异常报错 7 | */ 8 | class SegmentErrorCasesTest { 9 | 10 | @Test 11 | fun carwords() { 12 | val tokenizer = Lexers.core() 13 | val lines = arrayOf( 14 | "你好离合器片的生产日期是2013-05-034S回复人635110101001", 15 | "第一次维修更换中间轴前轴承和倒档惰轮总成第二次是20170年6", 16 | "六万一千公里", 17 | "此车20171年12月19号来我站报修前照灯进水", 18 | "我站一辆宝骏5602017年2月16日到我站反映六档挡不进档") 19 | 20 | 21 | for (s in lines) { 22 | println(s) 23 | println(tokenizer.scan(s)) 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/SubwordTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.segment.plugins.collector.DefaultSubwordRuleDict 5 | import org.junit.Test 6 | 7 | class SubwordTest { 8 | 9 | @Test 10 | fun test() { 11 | val mynlp = Mynlp.instance() 12 | 13 | val x = DefaultSubwordRuleDict() 14 | x.add("副/市长") 15 | x.rebuild() 16 | 17 | val lexer = mynlp.lexerBuilder() 18 | .hmm() 19 | 
.withPos() 20 | .customSentenceCollector { 21 | it.smartSubword() 22 | // it.ruleBaseSubword(listOf(x)) 23 | } 24 | .build() 25 | 26 | println(lexer.scan("这是副市长的快递").toList()) 27 | } 28 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/Test.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import com.mayabot.nlp.segment.plugins.correction.MemCorrectionDictionary 5 | 6 | fun main() { 7 | val mynlp = Mynlp.instance() 8 | val mem = MemCorrectionDictionary() 9 | 10 | mem.addRule("近期/待还") 11 | mem.rebuild() 12 | 13 | // val lexer = mynlp.lexerBuilder() 14 | // .bigram() 15 | // .withPos() 16 | // .withPersonName() 17 | // .collector().smartPickup { 18 | // it.setBlackListCallback { 19 | // it[0] == '副' && it[it.length - 1] == '长' 20 | // } 21 | // } 22 | // .done() 23 | // .withCorrection(mem) 24 | // .build() 25 | // 26 | // lexer.scan("近期待还").forEach { 27 | // print(it) 28 | // println("\t has sub " + it.hasSubword()) 29 | // } 30 | 31 | 32 | //default core 33 | // val lexer2 = Lexers.coreBuilder() 34 | // .withPersonName() 35 | //// .withPos() 36 | // .collector().smartPickup() 37 | // .done() 38 | // .build() 39 | // 40 | // println(lexer2.scan("基础设施")) 41 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/TestPosAndSubWord.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment 2 | 3 | import com.mayabot.nlp.Mynlp 4 | import org.junit.Assert 5 | import org.junit.Test 6 | 7 | /** 8 | * 同时开启词性和subword,导致词性失效 9 | */ 10 | class TestPosAndSubWord { 11 | 12 | @Test 13 | fun test() { 14 | val mynlp = Mynlp.instance() 15 | 16 | val lexer = mynlp.lexerBuilder().hmm() 17 | .withPos() 18 | .customSentenceCollector { 19 | it.smartSubword() 20 | it.fillCoreDict() 21 | } 22 | .build() 23 | 24 | val result = lexer.scan("这次是北京大学拿到第一名").toString() 25 | Assert.assertEquals("这次/r 是/v [北京 大学]/nt 拿到/v 第一名/mq",result) 26 | } 27 | 28 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/atom/AtomSplitAlgorithmTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.atom 2 | 3 | import com.mayabot.nlp.Mynlps 4 | import com.mayabot.nlp.segment.plugins.atom.AtomSplitAlgorithm 5 | import com.mayabot.nlp.segment.wordnet.Wordnet 6 | import org.junit.Test 7 | 8 | 9 | class SimpleTest { 10 | @Test 11 | fun unitTestingWorks() { 12 | val text = listOf("这个是你jimi@mayabot.com邮箱地址么2017-10-12", 13 | "你的ipad3么 ,最近三天花了多少钱 a-ff -102 @163.com,一万八千八百八十八,FM98.1,jimi@mayabot.com,周一下午九点钟,一九九八年三月,2018年2月2日,2013年,周一下午三点半有个重量为11225.6公斤,123234" 14 | ) 15 | val atom = Mynlps.instanceOf(AtomSplitAlgorithm::class.java) 16 | val atom2 = Mynlps.instanceOf() 17 | 18 | text.forEach { line -> 19 | val wordnet = Wordnet(line.toCharArray()) 20 | 21 | atom.fill(wordnet) 22 | println(wordnet.toMoreString()) 23 | } 24 | 25 | 26 | } 27 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/lexer/perceptron/NERPerceptronTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.perceptron 2 | 3 | import com.mayabot.nlp.segment.Lexers 4 
| import com.mayabot.nlp.segment.plugins.ner.NERPerceptron 5 | import com.mayabot.nlp.segment.plugins.ner.NERPerceptronTrainer 6 | import com.mayabot.nlp.segment.plugins.ner.PerceptronNerService 7 | import java.io.File 8 | 9 | 10 | object NERPerceptronTest { 11 | 12 | @JvmStatic 13 | fun main(args: Array) { 14 | // train() 15 | test() 16 | } 17 | 18 | fun train() { 19 | val trainer = NERPerceptronTrainer() 20 | 21 | val trainFile = File("data.work/ner") 22 | val evaluateFile = File("data.work/ner-test/ner_1.txt") 23 | 24 | val model = trainer.train( 25 | trainFile, evaluateFile, 26 | 130, 1) 27 | 28 | model.save(File("data.work/ner.model")) 29 | } 30 | 31 | fun test() { 32 | // val evaluateFile = File("data/pku/199802.txt") 33 | 34 | val tokenizer = Lexers.core() 35 | val text = "这是陈建国的快递,来自上海万行信息科技有限公司的报告" 36 | 37 | val termList = tokenizer.scan(text).toList() 38 | 39 | 40 | val ner = NERPerceptron.load(File("data.work/ner.model")) 41 | 42 | 43 | println(termList) 44 | 45 | ner.decode(termList) 46 | 47 | println(PerceptronNerService.toNerComposite(termList).joinToString(separator = ",")) 48 | 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/lexer/perceptron/POSPerceptronTest.kt: -------------------------------------------------------------------------------- 1 | //package com.mayabot.nlp.segment.lexer.perceptron 2 | // 3 | //import com.mayabot.nlp.segment.plugins.pos.POSPerceptronTrainer 4 | //import com.mayabot.nlp.utils.CharNormUtils 5 | //import java.io.File 6 | // 7 | //fun main(args: Array) { 8 | // val model = POSPerceptronTrainer().train(File("data/pku/199801.txt"), File("data/cncorpus/cncorpus_9.txt"), 1, 1) 9 | // model.save(File("data/pos/model")) 10 | // 11 | // 12 | //////// 13 | //// println(model.decode("陈汝烨")) 14 | // 15 | //// val model = POSPerceptron.load(File("data/pos/model")) 16 | // val words = "陈汝烨 余额宝 的 规模 增长 一直 呈现 不断 加速 , 的 状态".split(" ") 17 | //// 18 | ////// val train = POSPerceptronTrainer() 19 | ////// train.train(File("data/pku"),1,4) 20 | ////// val sampleList = train.loadSamples(File("data/pku").allFiles()) 21 | ////// val eva = POSEvaluateRunner(0, sampleList) 22 | ////// eva.run(model.model) 23 | //// 24 | //// 25 | // val words2 = CharNormUtils.convert("陈汝烨 陈勤勤 余额宝 的 规模 增长 一直 呈现 不断 加速 , 的 状态 四十 年 , 我 的 心里 从未 这么 安静 过").split(" ") 26 | // val result = model.decode(words2) 27 | // println(words2.zip(result)) 28 | // 29 | //// val lines = File("data/pos/model/feature.txt").readLines() 30 | //// 31 | //// var index = DoubleArrayTrie(lines) 32 | //// 33 | //// 34 | //// println(index.wordId("望京☺")) 35 | //} 36 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/lexer/perceptron/PerceptronNerServiceTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.perceptron; 2 | 3 | import com.mayabot.nlp.Mynlps; 4 | import com.mayabot.nlp.segment.Sentence; 5 | import com.mayabot.nlp.segment.plugins.ner.PerceptronNerService; 6 | 7 | import java.util.List; 8 | 9 | public class PerceptronNerServiceTest { 10 | 11 | 12 | public static void main(String[] args) { 13 | PerceptronNerService ner = Mynlps.instanceOf(PerceptronNerService.class); 14 | PerceptronsSegmentService cws = Mynlps.instanceOf(PerceptronsSegmentService.class); 15 | 16 | 17 | List words = 
cws.splitWord("悦胜公司成立之初系杭州市体育发展集团(杭州市体育局所属事业单位)下属的全资子公司,主要经营体育事业相关业务,后为服务2018年第14届FINA世界游泳锦标赛,增资扩股为国有控股公司。\n" + 18 | "\n" + 19 | "\n"); 20 | 21 | Sentence ner1 = ner.ner(words); 22 | 23 | System.out.println(ner1); 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/lexer/perceptron/PerceptronServiceTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.lexer.perceptron 2 | 3 | object PerceptronServiceTest { 4 | 5 | } -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/ner/OrgTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.ner; 2 | 3 | import com.mayabot.nlp.segment.Lexer; 4 | import com.mayabot.nlp.segment.Lexers; 5 | import com.mayabot.nlp.segment.utils.TokenizerTestHelp; 6 | import org.junit.Test; 7 | 8 | public class OrgTest { 9 | 10 | @Test 11 | public void test() { 12 | { 13 | String text = "这|是|上海|万|行|信息|科技|有限公司|的|财务|报表"; 14 | 15 | Lexer tokenizer = Lexers.coreBuilder() 16 | .build(); 17 | 18 | TokenizerTestHelp.test(tokenizer, text); 19 | } 20 | 21 | 22 | { 23 | String text = "这|是|上海万行信息科技有限公司|的|财务|报表"; 24 | 25 | Lexer tokenizer = Lexers.coreBuilder() 26 | .withNer() 27 | .build(); 28 | 29 | 30 | TokenizerTestHelp.test(tokenizer, text); 31 | } 32 | 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/ner/PersonNameTest.kt: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.ner 2 | 3 | import com.mayabot.nlp.segment.Lexers 4 | import com.mayabot.nlp.segment.utils.TokenizerTestHelp 5 | import org.junit.Test 6 | 7 | class PersonNameTest { 8 | 9 | @Test 10 | fun test() { 11 | run { 12 | val text = "这|是|陈|建国|的|快递" 13 | 14 | val tokenizer = Lexers.builder().core() 15 | .build() 16 | 17 | 18 | TokenizerTestHelp.test(tokenizer, text) 19 | } 20 | 21 | 22 | run { 23 | val text = "这|是|陈建国|的|快递" 24 | 25 | val tokenizer = Lexers.builder().core() 26 | .withPersonName().build() 27 | 28 | TokenizerTestHelp.test(tokenizer, text) 29 | } 30 | } 31 | 32 | @Test 33 | fun test2() { 34 | val tokenizer = Lexers.perceptronBuilder().core() 35 | .withPersonName().build() 36 | 37 | val strings = arrayOf("先后视察了华鑫海欣楼宇党建(群团)服务站和江阴顺天村项目", "签约仪式前,秦光荣、李纪恒、仇和等一同会见了参加签约的企业家。", "武大靖创世界纪录夺冠,中国代表团平昌首金", "区长庄木弟新年致辞", "朱立伦:两岸都希望共创双赢 习朱历史会晤在即", "陕西首富吴一坚被带走 与令计划妻子有交集", "据美国之音电台网站4月28日报道,8岁的凯瑟琳·克罗尔(凤甫娟)和很多华裔美国小朋友一样,小小年纪就开始学小提琴了。她的妈妈是位虎妈么?", "凯瑟琳和露西(庐瑞媛),跟她们的哥哥们有一些不同。", "王国强、高峰、汪洋、张朝阳光着头、韩寒、小四", "张浩和胡健康复员回家了", "王总和小丽结婚了", "编剧邵钧林和稽道青说", "这里有关天培的有关事迹", "先后视察了华鑫海欣楼宇党建(群团)服务站和江阴顺天村项目", "龚学平等领导说,邓颖超生前杜绝超生") 38 | 39 | for (line in strings) { 40 | println(line + "\n") 41 | println(tokenizer.scan(line)) 42 | println("\n") 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/ner/PlaceTest.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.ner; 2 | 3 | import com.mayabot.nlp.segment.Lexer; 4 | import com.mayabot.nlp.segment.Lexers; 5 | import com.mayabot.nlp.segment.utils.TokenizerTestHelp; 6 | import org.junit.Test; 7 | 8 | public class PlaceTest { 9 | 10 | @Test 11 | public void test() { 12 
| 13 | 14 | { 15 | String text = "中央|大街|浪漫|永|存"; 16 | 17 | Lexer tokenizer = Lexers.coreBuilder() 18 | 19 | .build(); 20 | 21 | TokenizerTestHelp.test(tokenizer, text); 22 | } 23 | 24 | 25 | { 26 | String text = "中央大街|浪漫|永|存"; 27 | 28 | Lexer tokenizer = Lexers.coreBuilder() 29 | .withNer() 30 | .build(); 31 | 32 | 33 | TokenizerTestHelp.test(tokenizer, text); 34 | } 35 | 36 | 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /mynlp/src/test/java/com/mayabot/nlp/segment/utils/TokenizerTestHelp.java: -------------------------------------------------------------------------------- 1 | package com.mayabot.nlp.segment.utils; 2 | 3 | import com.mayabot.nlp.common.Guava; 4 | import com.mayabot.nlp.segment.Lexer; 5 | import org.junit.Assert; 6 | 7 | public class TokenizerTestHelp { 8 | 9 | /** 10 | * 测试分词器 11 | * 输入文本的格式 你好|世界 12 | * 输入分词器是会把|去除掉 13 | * 14 | * @param tokenizer 15 | * @param text 16 | * @return 17 | */ 18 | public static void test( 19 | Lexer tokenizer, 20 | String text) { 21 | 22 | text = text.trim(); 23 | 24 | String input = text.replace("|", ""); 25 | 26 | 27 | String out = Guava.join(tokenizer.scan(input).toWordList(), "|"); 28 | 29 | Assert.assertTrue("Out is " + out + " ,Input " + text, text.equalsIgnoreCase(out)); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /mynlp/src/test/resources/GrapCode.txt: -------------------------------------------------------------------------------- 1 | 0 00000 2 | 1 00001 3 | 2 00011 4 | 3 00010 5 | 4 00110 6 | 5 00111 7 | 6 00101 8 | 7 00100 9 | 8 01100 10 | 9 01101 11 | 10 01111 12 | 11 01110 13 | 12 01010 14 | 13 01011 15 | 14 01001 16 | 15 01000 17 | 16 11000 18 | 17 11001 19 | 18 11011 20 | 19 11010 21 | 20 11110 22 | 21 11111 23 | 22 11101 24 | 23 11100 25 | 24 10100 26 | 25 10101 27 | 26 10111 28 | 27 10110 29 | 28 10010 30 | 29 10011 31 | 30 10001 32 | 31 10000 -------------------------------------------------------------------------------- /settings.gradle.kts: -------------------------------------------------------------------------------- 1 | rootProject.name = "mynlp" 2 | 3 | include("mynlp", "mynlp-all", "mynlp-example", "mynlp-experimental") --------------------------------------------------------------------------------
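The correction and custom-dictionary plugins listed earlier (CorrectionWord, FileCorrectionDictionary, CustomDictionary, CustomDictionaryPlugin) are easiest to see in a short usage sketch. The calls below are limited to APIs the test files in this dump already use (CorrectionWord.parse, MemCustomDictionary, Lexers.coreBuilder().withCustomDictionary); the input strings are illustrative only:

import com.mayabot.nlp.segment.Lexers
import com.mayabot.nlp.segment.plugins.correction.CorrectionWord
import com.mayabot.nlp.segment.plugins.customwords.MemCustomDictionary

fun main() {
    // A correction rule is a '/'-separated path: parse() keeps the joined text as `path`
    // and the length of each segment in `words`, e.g. "第几套/房" -> path "第几套房", words [3, 1].
    val rule = CorrectionWord.parse("第几套/房")
    println(rule)

    // Custom words live in a DoubleArrayTrie-backed dictionary; the tests above always
    // call rebuild() after addWord() so the lexer sees the new entries.
    val dict = MemCustomDictionary()
    dict.addWord("ECS固收")
    dict.rebuild()

    val lexer = Lexers.coreBuilder()
        .withCustomDictionary(dict)
        .build()
    println(lexer.scan("ECS固收"))
}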