├── .classpath ├── .gitignore ├── .project ├── .settings ├── org.eclipse.core.resources.prefs ├── org.eclipse.jdt.core.prefs └── org.eclipse.m2e.core.prefs ├── Changes.txt ├── LICENSE ├── PageHeader.txt ├── README.md ├── Thanks.txt ├── example-data ├── data-classification.txt ├── data-lda.txt ├── data-tag.txt ├── seg-bad-case.txt ├── sequence │ ├── data.txt │ ├── seq.res │ ├── shell.cmd │ ├── template │ ├── template_dynamic │ ├── template_pro │ ├── test.txt │ ├── test0.txt │ └── train.txt ├── text-classification │ ├── Eco.data │ ├── Sport.data │ └── Tech.data └── text │ ├── 1.txt │ └── 2.txt ├── fnlp-app ├── .gitignore ├── pom.xml └── src │ ├── main │ └── java │ │ └── org │ │ └── fnlp │ │ └── app │ │ ├── keyword │ │ ├── AbstractExtractor.java │ │ ├── Graph.java │ │ ├── Vertex.java │ │ ├── WordExtract.java │ │ └── package-info.java │ │ ├── lucene │ │ ├── FNLPAnalyzer.java │ │ ├── FilteringTokenFilter.java │ │ ├── POSAttribute.java │ │ ├── POSAttributeImpl.java │ │ ├── POSTaggingFilter.java │ │ ├── SentenceTokenizer.java │ │ ├── WordTokenFilter.java │ │ ├── WordType.java │ │ ├── demo │ │ │ ├── BuildIndex.java │ │ │ ├── Search.java │ │ │ └── package-info.java │ │ └── package-info.java │ │ ├── num │ │ ├── CNExpression.java │ │ ├── Test.java │ │ └── package-info.java │ │ └── tc │ │ ├── TextClassifier.java │ │ └── package-info.java │ └── test │ └── java │ └── org │ └── fnlp │ └── app │ └── num │ └── CNExpressionTest.java ├── fnlp-core ├── .classpath ├── .project ├── .settings │ ├── org.eclipse.core.resources.prefs │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.m2e.core.prefs ├── pom.xml └── src │ ├── main │ └── java │ │ └── org │ │ └── fnlp │ │ ├── data │ │ └── reader │ │ │ ├── DocumentReader.java │ │ │ ├── FileReader.java │ │ │ ├── ListReader.java │ │ │ ├── Reader.java │ │ │ ├── SequenceReader.java │ │ │ ├── SimpleFileReader.java │ │ │ ├── StringReader.java │ │ │ ├── package-info.java │ │ │ └── svmFileReader.java │ │ ├── ml │ │ ├── classifier │ │ │ ├── 
AbstractClassifier.java │ │ │ ├── LabelParser.java │ │ │ ├── LinkedPredict.java │ │ │ ├── Predict.java │ │ │ ├── TPredict.java │ │ │ ├── bayes │ │ │ │ ├── BayesClassifier.java │ │ │ │ ├── BayesTrainer.java │ │ │ │ ├── Heap.java │ │ │ │ └── ItemFrequency.java │ │ │ ├── hier │ │ │ │ ├── Linear.java │ │ │ │ ├── Mean.java │ │ │ │ ├── ModelAnalysis.java │ │ │ │ ├── PATrainer.java │ │ │ │ ├── Predict.java │ │ │ │ ├── Statistic.java │ │ │ │ ├── Tree.java │ │ │ │ ├── inf │ │ │ │ │ ├── MultiLinearMax.java │ │ │ │ │ └── package.html │ │ │ │ └── package.html │ │ │ ├── knn │ │ │ │ ├── KNN.java │ │ │ │ ├── KNNClassifier.java │ │ │ │ ├── VotePredict.java │ │ │ │ └── package-info.java │ │ │ ├── linear │ │ │ │ ├── AbstractTrainer.java │ │ │ │ ├── ClassifierPool.java │ │ │ │ ├── Linear.java │ │ │ │ ├── OnlineTrainer.java │ │ │ │ ├── inf │ │ │ │ │ ├── Inferencer.java │ │ │ │ │ ├── LinearMax.java │ │ │ │ │ └── package-info.java │ │ │ │ ├── package.html │ │ │ │ └── update │ │ │ │ │ ├── AbstractPAUpdate.java │ │ │ │ │ ├── LinearMaxPAUpdate.java │ │ │ │ │ ├── Update.java │ │ │ │ │ └── package-info.java │ │ │ ├── package.html │ │ │ └── struct │ │ │ │ ├── OnlineHybridTrainer.java │ │ │ │ ├── PATrainer.java │ │ │ │ ├── inf │ │ │ │ ├── AbstractViterbi.java │ │ │ │ ├── ConstraintViterbi.java │ │ │ │ ├── HigherOrderViterbi.java │ │ │ │ ├── HybridViterbi.java │ │ │ │ ├── LinearViterbi.java │ │ │ │ └── package-info.java │ │ │ │ ├── package.html │ │ │ │ └── update │ │ │ │ ├── HigherOrderViterbiPAUpdate.java │ │ │ │ ├── HybridViterbiPAUpdate.java │ │ │ │ ├── LinearViterbiPAUpdate.java │ │ │ │ └── package-info.java │ │ ├── cluster │ │ │ ├── Kmeans.java │ │ │ └── package.html │ │ ├── eval │ │ │ ├── Entity.java │ │ │ ├── Evaluation.java │ │ │ ├── Score.java │ │ │ ├── ScoreUsage.java │ │ │ ├── SeqEval.java │ │ │ └── package-info.java │ │ ├── feature │ │ │ ├── BaseGenerator.java │ │ │ ├── FeatureSelect.java │ │ │ ├── Generator.java │ │ │ ├── SFGenerator.java │ │ │ └── package.html │ │ ├── loss │ │ │ 
├── Loss.java │ │ │ ├── ZeroOneLoss.java │ │ │ ├── package-info.java │ │ │ └── struct │ │ │ │ ├── HammingLoss.java │ │ │ │ ├── HybridHammingLoss.java │ │ │ │ ├── SequenceLoss.java │ │ │ │ ├── ZeroOneLoss.java │ │ │ │ └── package-info.java │ │ ├── nmf │ │ │ ├── Nmf.java │ │ │ └── package.html │ │ └── types │ │ │ ├── Dictionary.java │ │ │ ├── DynamicInfo.java │ │ │ ├── Instance.java │ │ │ ├── InstanceSet.java │ │ │ ├── LinearSparseVector.java │ │ │ ├── alphabet │ │ │ ├── AlphabetFactory.java │ │ │ ├── ClusterFeatureAlphabet.java │ │ │ ├── HashFeatureAlphabet.java │ │ │ ├── IAlphabet.java │ │ │ ├── IFeatureAlphabet.java │ │ │ ├── ILabelAlphabet.java │ │ │ ├── LabelAlphabet.java │ │ │ ├── LabelAlphabetEnum.java │ │ │ ├── StringFeatureAlphabet.java │ │ │ └── package-info.java │ │ │ ├── featurecluster │ │ │ ├── AbstractCluster.java │ │ │ ├── AbstractDistance.java │ │ │ ├── ClassData.java │ │ │ ├── Cluster.java │ │ │ ├── ClusterFix.java │ │ │ ├── ClusterKmeans.java │ │ │ ├── ClusterOri.java │ │ │ ├── ClusterSame.java │ │ │ ├── InstanceSet2ClassData.java │ │ │ ├── JSDistance.java │ │ │ └── SimpleDistance.java │ │ │ ├── package.html │ │ │ └── sv │ │ │ ├── BinarySparseVector.java │ │ │ ├── HashSparseVector.java │ │ │ ├── ISparseVector.java │ │ │ ├── SparseMatrix.java │ │ │ ├── SparseVector.java │ │ │ └── Vector.java │ │ ├── nlp │ │ ├── cn │ │ │ ├── CNFactory.java │ │ │ ├── Chars.java │ │ │ ├── ChineseTrans.java │ │ │ ├── LangDetection.java │ │ │ ├── PartOfSpeech.java │ │ │ ├── Sentenizer.java │ │ │ ├── Tags.java │ │ │ ├── anaphora │ │ │ │ ├── ARInstanceGetter.java │ │ │ │ ├── AR_Reader.java │ │ │ │ ├── Anaphora.java │ │ │ │ ├── EntitiesGetter.java │ │ │ │ ├── Entity.java │ │ │ │ ├── EntityGroup.java │ │ │ │ ├── FeatureGeter.java │ │ │ │ ├── FormChanger.java │ │ │ │ ├── WeightGetter.java │ │ │ │ ├── package-info.java │ │ │ │ ├── rule │ │ │ │ │ └── RuleAnaphora.java │ │ │ │ └── train │ │ │ │ │ ├── ARClassifier.java │ │ │ │ │ ├── DocFilter.java │ │ │ │ │ ├── DocGroupMacher.java 
│ │ │ │ │ ├── FileGroup.java │ │ │ │ │ ├── FileGroupReader.java │ │ │ │ │ ├── MarkFileManager.java │ │ │ │ │ ├── MyDocumentWriter.java │ │ │ │ │ └── package-info.java │ │ │ ├── ner │ │ │ │ ├── Address.java │ │ │ │ ├── TimeNormalizer.java │ │ │ │ ├── TimeUnit.java │ │ │ │ ├── ne │ │ │ │ │ ├── PreProcessor.java │ │ │ │ │ ├── TimeNormalizer.java │ │ │ │ │ ├── TimeUnit.java │ │ │ │ │ └── package-info.java │ │ │ │ ├── package-info.java │ │ │ │ └── stringPreHandlingModule.java │ │ │ ├── package-info.java │ │ │ └── tag │ │ │ │ ├── AbstractTagger.java │ │ │ │ ├── CWSTagger.java │ │ │ │ ├── NERTagger.java │ │ │ │ ├── POSTagger.java │ │ │ │ ├── POSTaggerX.java │ │ │ │ ├── TaggerPool.java │ │ │ │ ├── format │ │ │ │ ├── BasicFormatter.java │ │ │ │ ├── FormatCWS.java │ │ │ │ ├── Seq2ArrayWithTag.java │ │ │ │ ├── Seq2StrWithTag.java │ │ │ │ ├── SimpleFormatter.java │ │ │ │ └── package-info.java │ │ │ │ └── package-info.java │ │ ├── corpus │ │ │ ├── CharEnc.java │ │ │ ├── CharSets.java │ │ │ ├── CiLin.java │ │ │ ├── CorpusCount.java │ │ │ ├── CreateNounTrainFile.java │ │ │ ├── StopWords.java │ │ │ ├── Tags.java │ │ │ ├── Unlabeled.java │ │ │ ├── WikiClean.java │ │ │ ├── WordCount.java │ │ │ ├── WordList.java │ │ │ ├── WordMap.java │ │ │ ├── ctbconvert │ │ │ │ ├── CTB2CONLL.java │ │ │ │ ├── DepClassProducter.java │ │ │ │ ├── DependentTreeProducter.java │ │ │ │ ├── FCTB2CONLL.java │ │ │ │ ├── FCTB2CONLLTest.java │ │ │ │ ├── MyTreebankReader.java │ │ │ │ ├── Node.java │ │ │ │ ├── Tree.java │ │ │ │ └── package-info.java │ │ │ ├── fnlp │ │ │ │ ├── FNLPCorpus.java │ │ │ │ ├── FNLPDoc.java │ │ │ │ ├── FNLPSent.java │ │ │ │ ├── filter │ │ │ │ │ ├── Filter.java │ │ │ │ │ └── package-info.java │ │ │ │ └── package-info.java │ │ │ ├── package.html │ │ │ └── third │ │ │ │ ├── SougouScelReader.java │ │ │ │ └── package-info.java │ │ ├── duplicate │ │ │ ├── DocSim.java │ │ │ ├── Documents.java │ │ │ ├── DupDetector.java │ │ │ ├── FingerPrint.java │ │ │ ├── ISimilarity.java │ │ │ ├── 
Similarity.java │ │ │ └── SimilaritySlow.java │ │ ├── langmodel │ │ │ ├── NGramModel.java │ │ │ ├── NGramModelTest.java │ │ │ ├── lda │ │ │ │ ├── LdaGibbsSampler.java │ │ │ │ └── package-info.java │ │ │ └── package-info.java │ │ ├── parser │ │ │ ├── Sentence.java │ │ │ ├── Target.java │ │ │ ├── Util.java │ │ │ ├── dep │ │ │ │ ├── DependencyTree.java │ │ │ │ ├── JointParser.java │ │ │ │ ├── JointParsingState.java │ │ │ │ ├── Merge.java │ │ │ │ ├── ParsingState.java │ │ │ │ ├── TreeCache.java │ │ │ │ ├── TreeCacheSent.java │ │ │ │ ├── YamadaParser.java │ │ │ │ ├── analysis │ │ │ │ │ ├── AnalysisSentence.java │ │ │ │ │ ├── AnalysisTest.java │ │ │ │ │ └── ResultReader.java │ │ │ │ ├── package-info.java │ │ │ │ ├── reader │ │ │ │ │ ├── CoNLLReader.java │ │ │ │ │ ├── FNLPReader.java │ │ │ │ │ ├── Malt2Reader.java │ │ │ │ │ └── MaltReader.java │ │ │ │ └── train │ │ │ │ │ ├── JointParerTester.java │ │ │ │ │ ├── JointParerTrainer.java │ │ │ │ │ ├── ParserTester.java │ │ │ │ │ ├── ParserTrainer.java │ │ │ │ │ ├── YamadaOptimization.java │ │ │ │ │ └── package-info.java │ │ │ └── package-info.java │ │ ├── pipe │ │ │ ├── NGram.java │ │ │ ├── Normalize.java │ │ │ ├── NumericPipe.java │ │ │ ├── Pipe.java │ │ │ ├── SeriesPipes.java │ │ │ ├── String2Dep.java │ │ │ ├── StringArray2IndexArray.java │ │ │ ├── StringArray2SV.java │ │ │ ├── TF2IDF.java │ │ │ ├── TFIDF.java │ │ │ ├── Target2Label.java │ │ │ ├── WeightPipe.java │ │ │ ├── nlp │ │ │ │ ├── CNPipe.java │ │ │ │ └── package-info.java │ │ │ ├── package.html │ │ │ ├── seq │ │ │ │ ├── AddCharRange.java │ │ │ │ ├── DictLabel.java │ │ │ │ ├── DictPOSLabel.java │ │ │ │ ├── MixedString2Sequence.java │ │ │ │ ├── Sequence2DynamicFeatureSequence.java │ │ │ │ ├── Sequence2FeatureSequence.java │ │ │ │ ├── SplitDataAndTarget.java │ │ │ │ ├── String2Sequence.java │ │ │ │ ├── TokenNormalize.java │ │ │ │ ├── package.html │ │ │ │ └── templet │ │ │ │ │ ├── BaseTemplet.java │ │ │ │ │ ├── CharClassTemplet.java │ │ │ │ │ ├── CharClassTemplet2.java │ 
│ │ │ │ ├── CharClassTemplet3.java │ │ │ │ │ ├── CharInStringTemplet.java │ │ │ │ │ ├── ClusterTemplet.java │ │ │ │ │ ├── CustomTemplet.java │ │ │ │ │ ├── DictionaryTemplet.java │ │ │ │ │ ├── DynamicTemplet.java │ │ │ │ │ ├── HybridTemplet.java │ │ │ │ │ ├── ProTemplet.java │ │ │ │ │ ├── StringTypeTemplet.java │ │ │ │ │ ├── Templet.java │ │ │ │ │ ├── TempletGroup.java │ │ │ │ │ └── package.html │ │ │ └── templet │ │ │ │ ├── BaseTemplet.java │ │ │ │ ├── RETemplate.java │ │ │ │ ├── RETemplateGroup.java │ │ │ │ ├── Sequence2SVWithTemplate.java │ │ │ │ ├── TemplatePipe.java │ │ │ │ ├── Templet.java │ │ │ │ ├── TempletGroup.java │ │ │ │ └── package.html │ │ ├── similarity │ │ │ ├── Cluster.java │ │ │ ├── DrawTree.java │ │ │ ├── EditDistance.java │ │ │ ├── EditDistanceWithSemantic.java │ │ │ ├── ISimilarity.java │ │ │ ├── JaccardSimilarity.java │ │ │ ├── SparseVectorSimilarity.java │ │ │ ├── TreeKernel.java │ │ │ ├── package.html │ │ │ └── train │ │ │ │ ├── KMeansWordCluster.java │ │ │ │ ├── SougouCA.java │ │ │ │ ├── WordCluster.java │ │ │ │ ├── WordClusterM.java │ │ │ │ ├── WordSimilarity.java │ │ │ │ └── package-info.java │ │ └── tag │ │ │ ├── AbstractTagger.java │ │ │ ├── CRF2FudanNLP.java │ │ │ ├── ModelIO.java │ │ │ ├── Tagger.java │ │ │ └── package.html │ │ ├── ontology │ │ ├── CharClassDictionary.java │ │ ├── Dictionary.java │ │ ├── graph │ │ │ ├── Direction.java │ │ │ ├── SparseMatrix.java │ │ │ ├── SparseMatrixT.java │ │ │ ├── Word.java │ │ │ ├── WordGraph.java │ │ │ ├── WordRelationEnum.java │ │ │ └── package-info.java │ │ └── package-info.java │ │ └── util │ │ ├── AV.java │ │ ├── ICallback.java │ │ ├── MultiValueMap.java │ │ ├── MyArrays.java │ │ ├── MyCollection.java │ │ ├── MyFiles.java │ │ ├── MyHashSparseArrays.java │ │ ├── MyLinearSparseArrays.java │ │ ├── MyStrings.java │ │ ├── Options.java │ │ ├── PMI.java │ │ ├── UnicodeInputStream.java │ │ ├── UnicodeReader.java │ │ ├── ValueComparator.java │ │ ├── exception │ │ ├── LoadModelException.java │ │ ├── 
NotImplementedException.java │ │ └── UnsupportedDataTypeException.java │ │ ├── hash │ │ ├── AbstractHashCode.java │ │ ├── JavaHash.java │ │ └── MurmurHash.java │ │ └── package-info.java │ └── test │ └── java │ └── org │ └── fnlp │ ├── ml │ ├── classifier │ │ └── knn │ │ │ └── KNNTest.java │ ├── eval │ │ └── SeqEvalTest.java │ └── types │ │ ├── HashSparseVectorTest.java │ │ ├── LinearSparseVectorTest.java │ │ └── alphabet │ │ └── LabelAlphabetEnumTest.java │ ├── nlp │ ├── cn │ │ ├── CharsTest.java │ │ ├── ChineseTransTest.java │ │ ├── SentenizerTest.java │ │ ├── anaphora │ │ │ └── AR_ReaderTest.java │ │ └── tag │ │ │ ├── CWSTaggerTest.java │ │ │ └── POSTaggerTest.java │ ├── corpus │ │ ├── StopWordsTest.java │ │ └── fnlp │ │ │ └── FNLPCorpusTest.java │ ├── ner │ │ └── time │ │ │ ├── Demo_NumberTranslator.java │ │ │ └── Demo_TimeNormalizer.java │ ├── parser │ │ └── dep │ │ │ └── yamada │ │ │ └── YamadaParserTest.java │ ├── pipe │ │ └── seq │ │ │ └── String2SequenceTest.java │ ├── sighen │ │ └── DataProcessor.java │ ├── similarity │ │ ├── TreeKernelTest.java │ │ └── WordClusterTest.java │ └── tag │ │ ├── MemoryStatic.java │ │ ├── TaggerTest.java │ │ ├── TestDepParser.java │ │ ├── TestDictSEG.java │ │ ├── TestNER.java │ │ ├── TestPOS.java │ │ ├── TestSEG.java │ │ └── TestTime.java │ ├── test │ ├── CharacterType.java │ ├── Speed.java │ └── Speed2.java │ └── util │ ├── MyArraysTest.java │ ├── MyFilesTest.java │ ├── MyHashSparseArraysTest.java │ └── hash │ └── MurmurHashTest.java ├── fnlp-demo ├── .gitignore ├── pom.xml └── src │ ├── main │ └── java │ │ └── org │ │ └── fnlp │ │ └── demo │ │ ├── ml │ │ ├── HierClassifierUsage1.java │ │ ├── HierClassifierUsage2.java │ │ ├── SequenceLabeling.java │ │ ├── SimpleClassifier2.java │ │ └── package-info.java │ │ └── nlp │ │ ├── AnaphoraResolution.java │ │ ├── ChineseWordSegmentation.java │ │ ├── DepParser.java │ │ ├── KeyWordExtraction.java │ │ ├── NamedEntityRecognition.java │ │ ├── PartsOfSpeechTag.java │ │ ├── 
TimeExpressionRecognition.java │ │ ├── package-info.java │ │ └── tc │ │ ├── MyDocumentReader.java │ │ ├── RemoveWords.java │ │ ├── Strings2StringArray.java │ │ ├── TextClassificationBasedOnBayes.java │ │ ├── TextClassificationBasedOnBayes2.java │ │ ├── TextClassificationBasedOnKNN.java │ │ ├── TextClassificationCustom.java │ │ ├── TextClassificationCustom1.java │ │ ├── TextClassificationSimple.java │ │ └── TextClassificationTest.java │ └── test │ └── java │ └── org │ └── fnlp │ └── demo │ ├── MLTest.java │ └── NLPTest.java ├── fnlp-dev ├── .classpath ├── .gitignore ├── .project ├── .settings │ ├── org.eclipse.core.resources.prefs │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.m2e.core.prefs ├── pom.xml └── src │ ├── main │ └── java │ │ └── org │ │ └── fnlp │ │ └── dev │ │ └── App.java │ └── test │ └── java │ └── org │ └── fnlp │ └── dev │ └── AppTest.java ├── fnlp-train ├── .classpath ├── .gitignore ├── .project ├── .settings │ ├── org.eclipse.core.resources.prefs │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.m2e.core.prefs ├── pom.xml └── src │ └── main │ └── java │ ├── BatchComment.java │ ├── Txt2Bin.java │ └── org │ └── fnlp │ ├── nlp │ └── cn │ │ └── rl │ │ ├── Freq.java │ │ ├── RLSeg.java │ │ ├── SearchByBaidu.java │ │ ├── Seg.java │ │ └── Seg2.java │ └── train │ ├── corpus │ ├── CoNLL2FNLP.java │ ├── TagCorrect.java │ └── TrainTestSplit.java │ ├── parsing │ ├── DepPrepare.java │ ├── DepRun.java │ └── DepRunFinal.java │ ├── pos │ ├── DictPOS.java │ ├── FNLP2POS.java │ ├── POSAddEnTag.java │ ├── POSPrepare.java │ ├── POSRun.java │ ├── POSRunFinal.java │ ├── POSTrain.java │ └── PosC2E.java │ ├── seg │ ├── DICT.java │ ├── FNLP2BMES.java │ ├── SegPrepare.java │ ├── SegRun.java │ ├── SegRunFinal.java │ └── SegTrain.java │ └── tag │ ├── BatchTrain.java │ ├── Clean2word.java │ ├── ModelOptimization.java │ ├── ProcessCorpus.java │ ├── TemplateSelection.java │ ├── TestTemplates.java │ ├── TrainTagger.java │ ├── addedTagger.java │ └── package-info.java ├── 
models ├── .gitignore ├── ExtractPattern.txt ├── Stock-Tree.txt ├── ar.m ├── dict.txt ├── dict_ambiguity.txt ├── dict_dep.txt ├── stopwords │ ├── ErrorWords.txt │ ├── NoSenseWords.txt │ └── StopWords.txt ├── time.m └── wordgraph.txt └── pom.xml /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Maven # 2 | target 3 | /fnlp-dev/target 4 | 5 | # IDEA # 6 | *.iml 7 | 8 | # Eclipse # 9 | .classpath 10 | .project 11 | .settings 12 | 13 | #large files# 14 | /data 15 | /tmp 16 | 17 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | fnlp-all 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.m2e.core.maven2Nature 21 | org.eclipse.jdt.core.javanature 22 | 23 | 24 | -------------------------------------------------------------------------------- /.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding/=UTF-8 3 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.6 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | 
org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 12 | org.eclipse.jdt.core.compiler.source=1.6 13 | -------------------------------------------------------------------------------- /.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /Changes.txt: -------------------------------------------------------------------------------- 1 | 2.00 2014.3.25 2 | --- 3 | 1. FudanNLP改为FNLP. 4 | 2. package名由edu.fudan换成了 org.fnlp 5 | 3. 用maven管理 6 | 7 | 1.55 2013.3.25 8 | ---- 9 | 1. 优化了分词、句法分析模型 10 | 2. 调整分类器结构 11 | 3. 增加lucene接口 apps/org.fnlp.app.lucene 12 | 4. 修改小的BUG 13 | 14 | 1.5 2012.11.16 15 | ---- 16 | 1.所有模型基于自主标注语料,不受第三方语料伴随的协议限制; 17 | 2.增加指代消解功能; 18 | 3.词性标注支持自定义词典; 19 | 4.改进依存句法分析算法,提高准确率; 20 | 5.依存句法分析支持输出依赖关系类型; 21 | 6.改进分词、词性标注特征,提高准确率; 22 | 7.重新梳理了程序结构。 23 | 24 | 1.05 2011.10.14 25 | ----- 26 | *增加程序注释 27 | *修正一些bug 28 | *支持并行化 29 | *支持自定义词典 30 | *高速关键词抽取 31 | 32 | 1.0 2011.8.1 33 | ----- 34 | *修改了基本数据结构,优化速度和减少模型文件大小。 35 | *增加关键词提取 36 | *分类器一致性改进 37 | *Solver类改为Inferencer类 -------------------------------------------------------------------------------- /PageHeader.txt: -------------------------------------------------------------------------------- 1 | This file is part of FNLP (formerly FudanNLP). 2 | 3 | FNLP is free software: you can redistribute it and/or modify 4 | it under the terms of the GNU Lesser General Public License as published by 5 | the Free Software Foundation, either version 3 of the License, or 6 | (at your option) any later version. 
7 | 8 | FNLP is distributed in the hope that it will be useful, 9 | but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | GNU Lesser General Public License for more details. 12 | 13 | You should have received a copy of the GNU General Public License 14 | along with FudanNLP. If not, see <http://www.gnu.org/licenses/>. 15 | 16 | Copyright 2009-2014 www.fnlp.org. All rights reserved. 17 | -------------------------------------------------------------------------------- /Thanks.txt: -------------------------------------------------------------------------------- 1 | #summary 项目开发人员(参与项目时间段) 2 | 3 | We would like to thank the following people for their help and support. 4 | 5 | =项目负责= 6 | * 邱锡鹏 7 | 8 | =当前开发人员= 9 | * 陈丹双(2012-) 10 | * 范雄雄(2012-) 11 | * 赵建双(2011-) 12 | * 刘昭(2010-) 13 | * 计峰 (2009-) 14 | * 曹零 (2010-) 15 | * 赵嘉亿 (2010-) 16 | * 田乐 (2010-) 17 | 18 | =过去开发人员= 19 | * 高文君 (2009-2010) 20 | * 缪有栋 (2009-2010) 21 | * 沈超 (2009) 22 | 23 | =其他开发人员= -------------------------------------------------------------------------------- /example-data/data-tag.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 《菲律宾每日问询者报》5月16日报道说,菲律宾渔业与水产资源局当天表示,将从5月16日开始禁止菲律宾渔民前往黄岩岛捕鱼,禁渔令将持续两个月。菲律宾渔业与水产资源局局长阿西斯·皮瑞斯表示,这一行动和中国的休渔令在同一天生效,不过,“我们仍将在禁渔令期间在那里(指黄岩岛海域)进行科学研究”。与此同时,菲律宾总统阿基诺16日表示,已经任命了两名“总统特使”前往中国。 4 | 5 | 《菲律宾星报》的评论认为,中国的休渔令可能成为缓和形势的一种方式。该报在15日的评论中表示,菲政府在黄岩岛事件中表现出令人难以置信的僵硬,将导致形势陷入“死结”。评论称,菲律宾的东盟伙伴认为菲律宾无视东盟的共识搞“单飞”,并采取战斗性的姿态对待中国。这导致东盟和菲律宾保持相当的距离。文章还对菲律宾总统至今仍未确定驻华大使的提名提出了严厉批评,认为阿基诺在这一重要问题上行动迟缓得让人震惊,因为“现在我们比任何时候都迫切需要一位驻华大使在北京处理对华关系”。 6 | 7 | 在马尼拉街头,目前很难看到中国游客的身影,不过当地市民普遍认为菲中的紧张关系目前正在逐步缓解。接受本报记者采访的市民多数对黄岩岛事件的和平解决抱有信心,一位名叫何塞·马林诺的市民对记者说,中国实施休渔令是一个积极的信号,他相信随着菲律宾也宣布自己的休渔措施后,两国的外交争端将会结束,事情将会回归正常。 8 | 9 | 菲律宾外长德尔罗萨里奥16日发表演讲时声称,菲愿意将黄岩岛事件抛诸脑后,并且期待着与中国发展积极的关系。他说,菲律宾政府相信通过持续磋商能够和平解决危机。他强调对两国最终能找到解决问题的办法是“乐观的”,并澄清说,媒体上一些所谓“中国对菲律宾实施经济制裁”的报道“并不准确”。 10 | 11 | 
德尔罗萨里奥说,中国是菲律宾的长期伙伴和近邻,“我们从与中国的关系中获益,如同中国也从同菲律宾的关系中获益一样”。 12 | 13 | 不过,菲律宾媒体仍然在继续炒作黄岩岛事件。菲律宾主流媒体16日都对美军攻击型核潜艇停泊菲律宾苏比克湾进行了报道,并刻意将这一事件同黄岩岛事件联系起来。 14 | 15 | 《菲律宾每日问询者报》网站16日援引法新社的报道称,阿基诺当天表示,菲正在考虑从美国以外购买战斗机。阿基诺称,菲政府原计划购入美国二手的F—16战机,但考虑到后期维护费用高昂,政府正在考虑其他选择。 16 | 17 | 《菲律宾每日问询者报》16日的报道称,德尔罗萨里奥在接受该报采访时表示,计划将菲律宾同中国的争端提交即将召开的东盟—美国对话会议。菲外交部表示,会议将在本月20日至22日在菲律宾召开,会议“将就地区和国际形势交换意见”。 18 | 19 | 菲律宾国家青年组织批评说,“菲律宾是美国在东盟国家里的‘内应’,它在东盟中代表美国的利益”。该组织秘书长雷纳多·拉亚斯表示:“菲律宾将黄岩岛事件提交东盟—美国对话会议,这和增加在东南亚的军事存在一样符合美国的利益。美国可能会淡化在中菲争端中的角色,但同时也在利用争端寻求在亚太地区更多的存在。这将使形势更加复杂化。” 20 | 21 | Java平台和语言最开始只是SUN公司在1990年12月开始研究的一个内部项目。SUN公司的一个叫做帕特里克•诺顿的工程师被自己开发的C和C语言编译器搞得焦头烂额,因为其中的API极其难用。帕特里克决定改用NeXT,同时他也获得了研究公司的一个叫做“Stealth 计划”的项目的机会。 22 | “Stealth 计划”后来改名为“Green计划”,James Gosling(詹姆斯•高斯林)和麦克•舍林丹也加入了帕特里克的工作小组。他们和其他几个工程师一起在加利福尼亚州门罗帕克市沙丘路的一个小工作室里面研究开发新技术,瞄准下一代智能家电(如微波炉)的程序设计,SUN公司预料未来科技将在家用电器领域大显身手。团队最初考虑使用C语言,但是很多成员包括SUN的首席科学家比尔•乔伊,发现C和可用的API在某些方面存在很大问题。工作小组使用的是内嵌类型平台,可以用的资源极其有限。很多成员发现C太复杂以至很多开发者经常错误使用。他们发现C缺少垃圾回收系统,还有可移植的安全性、分布程序设计、和多线程功能。最后,他们想要一种易于移植到各种设备上的平台。 -------------------------------------------------------------------------------- /example-data/seg-bad-case.txt: -------------------------------------------------------------------------------- 1 | 2013年 03月 25日 16:29 2 | 2011年 9月 6日 22:11 3 | 2012年 5月 5日 24:00 4 | 1999年 7月 4日 02:10 5 | 98年 8月 10日 05:20 6 | 浙江 省 了 大 批 投资 7 | 浙江省 了解 这个 情况 的 人 不 多 8 | 从 北京 经 济南 下 徐州 9 | 发展中 国家 服装 需求 大 增 10 | 我们 提供 高档 和服 务必 前来 选购 11 | 我们 提供 高档 设备 和 服务 。 12 | 服务 13 | 穿 上 日本 和服 装嫩 14 | 这 台 计算机 系统盘 出 了 故障 15 | 丹东 西安 全 是 我 喜欢 的 地方 16 | 南京 的 市长 江大桥 说 南京市 长江 大桥 好 长 17 | 这 事儿 的确 定 不 下 来 18 | 去 网吧 19 | 去 酒吧 20 | 看 A片 21 | 常见 软件 22 | 我 的 小米 2 买 来 半 个 月 目前 出现 的 问题 是 手机 连 不 上 数据线 冲 不 了 电 , 也 连 不 上 电脑 数据线 是 OK 的 我 式 过 是 中 病毒 23 | 操作 系统 24 | 我 雅思 成绩 , 听力 7 , 阅读 6 , 写作 9 25 | VB 对 26 | C++ 指南 27 | 晚上 21:10 28 | 这 招 行不通 的 29 | 招行 工行 30 | 小米 手机 31 | 内核 中 等待 队列 32 | 跟 我 妻小 说 33 | 哽咽 向 对方 妻小 道歉 34 | 王菲 与 谢霆锋 恋情 公开 35 | 刘菲 和 李亚鹏 结婚 36 | 刘鹏 和 李亚鹏 结婚 37 | 刘鹏 与 李亚鹏 结婚 38 | 我 是 屌丝 好不 39 | 自动 雨刮 
@rem Trains a tagger model from train.txt using the feature template file,
@rem tags test.txt with that model into result.txt, then deletes both outputs.
@rem The main class follows the edu.fudan -> org.fnlp package rename (Changes.txt, 2.00).
@rem NOTE(review): assumes the FNLP jar is available as ../../fudannlp.jar with its
@rem dependencies under ../../lib - confirm against the Maven build output.
java -classpath ../../fudannlp.jar;../../lib/*; org.fnlp.nlp.tag.Tagger -train template train.txt model
java -classpath ../../fudannlp.jar;../../lib/*; org.fnlp.nlp.tag.Tagger model test.txt result.txt
@echo delete model file
del model
@echo press any key to delete result.txt file
pause>nul
del result.txt
-------------------------------------------------------------------------------- 1 | 菲 2 | 律 3 | 宾 4 | 国 5 | 家 6 | 青 7 | 年 8 | 组 9 | 织 10 | 批 11 | 评 12 | 说 13 | , 14 | “ 15 | 菲 16 | 律 17 | 宾 18 | 是 19 | 美 20 | 国 21 | 在 22 | 东 23 | 盟 24 | 国 25 | 家 26 | 里 27 | 的 28 | ‘ 29 | 内 30 | 应 31 | ’ 32 | , 33 | 它 34 | 在 35 | 东 36 | 盟 37 | 中 38 | 代 39 | 表 40 | 美 41 | 国 42 | 的 43 | 利 44 | 益 45 | ” 46 | 。 47 | 该 48 | 组 49 | 织 50 | 秘 51 | 书 52 | 长 53 | 雷 54 | 纳 55 | 多 56 | · 57 | 拉 58 | 亚 59 | 斯 60 | 表 61 | 示 62 | : 63 | “ 64 | 菲 65 | 律 66 | 宾 67 | 将 68 | 黄 69 | 岩 70 | 岛 71 | 事 72 | 件 73 | 提 74 | 交 75 | 东 76 | 盟 77 | — 78 | 美 79 | 国 80 | 对 81 | 话 82 | 会 83 | 议 84 | , 85 | 这 86 | 和 87 | 增 88 | 加 89 | 在 90 | 东 91 | 南 92 | 亚 93 | 的 94 | 军 95 | 事 96 | 存 97 | 在 98 | 一 99 | 样 100 | 符 101 | 合 102 | 美 103 | 国 104 | 的 105 | 利 106 | 益 107 | 。 108 | 美 109 | 国 110 | 可 111 | 能 112 | 会 113 | 淡 114 | 化 115 | 在 116 | 中 117 | 菲 118 | 争 119 | 端 120 | 中 121 | 的 122 | 角 123 | 色 124 | , 125 | 但 126 | 同 127 | 时 128 | 也 129 | 在 130 | 利 131 | 用 132 | 争 133 | 端 134 | 寻 135 | 求 136 | 在 137 | 亚 138 | 太 139 | 地 140 | 区 141 | 更 142 | 多 143 | 的 144 | 存 145 | 在 146 | 。 147 | 这 148 | 将 149 | 使 150 | 形 151 | 势 152 | 更 153 | 加 154 | 复 155 | 杂 156 | 化 157 | 。 158 | ” -------------------------------------------------------------------------------- /example-data/sequence/test0.txt: -------------------------------------------------------------------------------- 1 | 菲 2 | 律 3 | 宾 4 | 国 5 | 家 6 | 青 7 | 年 8 | 组 9 | 织 10 | 批 11 | 评 12 | 说 13 | , 14 | “ 15 | 菲 16 | 律 17 | 宾 18 | 是 19 | 美 20 | 国 21 | 在 22 | 东 23 | 盟 24 | 国 25 | 家 26 | 里 27 | 的 28 | ‘ 29 | 内 30 | 应 31 | ’ 32 | , 33 | 它 34 | 在 35 | 东 36 | 盟 37 | 中 38 | 代 39 | 表 40 | 美 41 | 国 42 | 的 43 | 利 44 | 益 45 | ” 46 | 。 47 | 该 48 | 组 49 | 织 50 | 秘 51 | 书 52 | 长 53 | 雷 54 | 纳 55 | 多 56 | · 57 | 拉 58 | 亚 59 | 斯 60 | 表 61 | 示 62 | : 63 | “ 64 | 菲 65 | 律 66 | 宾 67 | 将 68 | 黄 69 | 岩 70 | 岛 71 | 事 72 | 件 73 | 提 74 | 交 75 | 东 76 | 盟 77 | — 78 | 美 79 | 国 80 | 对 81 | 话 82 | 会 
83 | 议 84 | , 85 | 这 86 | 和 87 | 增 88 | 加 89 | 在 90 | 东 91 | 南 92 | 亚 93 | 的 94 | 军 95 | 事 96 | 存 97 | 在 98 | 一 99 | 样 100 | 符 101 | 合 102 | 美 103 | 国 104 | 的 105 | 利 106 | 益 107 | 。 108 | 美 109 | 国 110 | 可 111 | 能 112 | 会 113 | 淡 114 | 化 115 | 在 116 | 中 117 | 菲 118 | 争 119 | 端 120 | 中 121 | 的 122 | 角 123 | 色 124 | , 125 | 但 126 | 同 127 | 时 128 | 也 129 | 在 130 | 利 131 | 用 132 | 争 133 | 端 134 | 寻 135 | 求 136 | 在 137 | 亚 138 | 太 139 | 地 140 | 区 141 | 更 142 | 多 143 | 的 144 | 存 145 | 在 146 | 。 147 | 这 148 | 将 149 | 使 150 | 形 151 | 势 152 | 更 153 | 加 154 | 复 155 | 杂 156 | 化 157 | 。 158 | ” -------------------------------------------------------------------------------- /example-data/text/2.txt: -------------------------------------------------------------------------------- 1 | 穆里尼奥:皇马已死!裁判让我们输 这世界真恶心穆里尼奥:今天的一切再次证明,我们本来就没有任何机会  新浪体育讯 穆里尼奥已经举手投降了?从表面上看来是这样的。在主场0比2输给巴萨后,穆里尼奥在赛后的新闻发布会上表示,欧冠半决赛的结果在首轮比赛过后就已经确定,巴萨已进入了本赛季欧冠的决赛。“是的,我们已经被淘汰了。0比0本来是一个合理的结果,但裁判不让我们这样结束。以现在这样的情形来看,我们下一场同样根本没可能……他们再一次杀死视频-穆帅竖大拇指讽裁判 鸟叔被罚上看台媒体来源:新浪体育了我们。今天的一切再次证明:我们本来就没有任何机会。 ”  但穆里尼奥显然还是心有不甘:“有时候生活在这样的世界里让人感觉恶心,但这就是我们的世界。我们必须带着全部的尊严前往巴塞罗那。佩佩没法前往,这场他可没做什么;拉莫斯也不能去,他也一样没做什么;球队的主教练也无法坐在教练席上。”(塞尔吉奥/任语之) 新闻图片博客视频分享到:更多关于  的新闻 -------------------------------------------------------------------------------- /fnlp-app/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | -------------------------------------------------------------------------------- /fnlp-app/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | org.fnlp 6 | fnlp-all 7 | 2.1-SNAPSHOT 8 | 9 | org.fnlp 10 | fnlp-app 11 | 2.1-SNAPSHOT 12 | fnlp-app 13 | http://maven.apache.org 14 | 15 | UTF-8 16 | 17 | 18 | 19 | junit 20 | junit 21 | 4.11 22 | test 23 | 24 | 25 | org.fnlp 26 | fnlp-core 27 | 2.1-SNAPSHOT 28 | 29 | 30 | org.apache.lucene 31 | lucene-core 32 | 4.7.0 33 | 34 | 35 | org.apache.lucene 36 | lucene-queryparser 37 | 4.7.0 38 
| 39 | 40 | 41 | -------------------------------------------------------------------------------- /fnlp-app/src/main/java/org/fnlp/app/keyword/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 关键词抽取包. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.app.keyword; -------------------------------------------------------------------------------- /fnlp-app/src/main/java/org/fnlp/app/lucene/POSAttribute.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.app.lucene; 21 | 22 | import org.apache.lucene.util.Attribute; 23 | 24 | public interface POSAttribute extends Attribute { 25 | 26 | public void setPartOfSpeech(String pos); 27 | 28 | public String getPartOfSpeech(); 29 | } -------------------------------------------------------------------------------- /fnlp-app/src/main/java/org/fnlp/app/lucene/POSAttributeImpl.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 
3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.app.lucene; 21 | 22 | import org.apache.lucene.util.AttributeImpl; 23 | 24 | /** POSAttribute implementation that keeps the part-of-speech tag in a single String field. */ public final class POSAttributeImpl extends AttributeImpl 25 | implements POSAttribute { 26 | 27 | /** Current tag; the empty string is both the initial value and the value after clear(). */ private String pos = ""; 28 | 29 | public void setPartOfSpeech(String pos) { 30 | this.pos = pos; 31 | } 32 | 33 | public String getPartOfSpeech() { 34 | return pos; 35 | } 36 | /** Resets the tag to the empty string. */ @Override 37 | public void clear() { 38 | pos = ""; 39 | } 40 | /** Copies the tag into target; target is cast to POSAttribute without a type check, so any other type fails with ClassCastException. */ @Override 41 | public void copyTo(AttributeImpl target) { 42 | ((POSAttribute) target).setPartOfSpeech(pos); 43 | } 44 | } -------------------------------------------------------------------------------- /fnlp-app/src/main/java/org/fnlp/app/lucene/POSTaggingFilter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 
8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.app.lucene; 21 | 22 | import java.io.IOException; 23 | 24 | import org.apache.lucene.analysis.TokenStream; 25 | 26 | import org.fnlp.nlp.cn.Tags; 27 | 28 | /** Filtering token filter whose accept() decision is based on the POSAttribute of the stream: a token is kept only when Tags.isStopword returns false for its tag. */ public final class POSTaggingFilter extends FilteringTokenFilter { 29 | 30 | 31 | /** Part-of-speech attribute registered on this stream via addAttribute. */ private final POSAttribute posAtt = addAttribute(POSAttribute.class); 32 | 33 | 34 | /** Passes both arguments unchanged to the FilteringTokenFilter constructor. */ public POSTaggingFilter(boolean enablePositionIncrements, TokenStream in) { 35 | super(enablePositionIncrements, in); 36 | } 37 | 38 | /** Returns true when the current part-of-speech tag is not reported as a stopword tag by Tags.isStopword. */ @Override 39 | public boolean accept() throws IOException { 40 | String pos = posAtt.getPartOfSpeech(); 41 | return !Tags.isStopword(pos); 42 | } 43 | } -------------------------------------------------------------------------------- /fnlp-app/src/main/java/org/fnlp/app/lucene/WordType.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 
13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.app.lucene; 21 | 22 | /** 23 | * Internal SmartChineseAnalyzer token type constants: eight distinct int codes, 0 through 7. 24 | * @lucene.experimental 25 | */ 26 | public class WordType { 27 | 28 | /** 29 | * Start of a Sentence 30 | */ 31 | public final static int SENTENCE_BEGIN = 0; 32 | 33 | /** 34 | * End of a Sentence 35 | */ 36 | public final static int SENTENCE_END = 1; 37 | 38 | /** 39 | * Chinese Word 40 | */ 41 | public final static int CHINESE_WORD = 2; 42 | 43 | /** 44 | * ASCII String 45 | */ 46 | public final static int STRING = 3; 47 | 48 | /** 49 | * ASCII Alphanumeric (NOTE(review): the constant is named NUMBER -- confirm whether letters are really included) 50 | */ 51 | public final static int NUMBER = 4; 52 | 53 | /** 54 | * Punctuation Symbol 55 | */ 56 | public final static int DELIMITER = 5; 57 | 58 | /** 59 | * Full-Width String 60 | */ 61 | public final static int FULLWIDTH_STRING = 6; 62 | 63 | /** 64 | * Full-Width Alphanumeric (NOTE(review): the constant is named FULLWIDTH_NUMBER -- confirm whether letters are really included) 65 | */ 66 | public final static int FULLWIDTH_NUMBER = 7; 67 | 68 | } -------------------------------------------------------------------------------- /fnlp-app/src/main/java/org/fnlp/app/lucene/demo/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author xpqiu 6 | * 7 | */ 8 | package org.fnlp.app.lucene.demo; -------------------------------------------------------------------------------- /fnlp-app/src/main/java/org/fnlp/app/lucene/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author xpqiu 6 | * 7 | */ 8 | package org.fnlp.app.lucene; -------------------------------------------------------------------------------- /fnlp-app/src/main/java/org/fnlp/app/num/package-info.java: 
-------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author xpqiu 6 | * 7 | */ 8 | package org.fnlp.app.num; -------------------------------------------------------------------------------- /fnlp-app/src/main/java/org/fnlp/app/tc/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 文本分类包. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU Lesser General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.app.tc; -------------------------------------------------------------------------------- /fnlp-core/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /fnlp-core/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | fnlp-core 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.wst.common.project.facet.core.builder 10 | 11 | 12 | 13 | 14 | org.eclipse.jdt.core.javabuilder 15 | 16 | 17 | 18 | 19 | org.eclipse.wst.validation.validationbuilder 20 | 21 | 22 | 23 | 24 | org.eclipse.m2e.core.maven2Builder 25 | 26 | 27 | 28 | 29 | 30 | org.eclipse.jem.workbench.JavaEMFNature 31 | org.eclipse.wst.common.modulecore.ModuleCoreNature 32 | org.eclipse.m2e.core.maven2Nature 33 | org.eclipse.jdt.core.javanature 34 | org.eclipse.wst.common.project.facet.core.nature 35 | 36 | 37 | -------------------------------------------------------------------------------- /fnlp-core/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/test/java=UTF-8 4 | encoding/=UTF-8 5 | -------------------------------------------------------------------------------- /fnlp-core/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | #Tue Mar 11 13:39:47 CST 2014 2 | encoding/src/test/java=UTF-8 3 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 4 | org.eclipse.jdt.core.compiler.compliance=1.6 5 | 
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 6 | encoding/src/main/resources=UTF-8 7 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 8 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 9 | encoding/src/main/java=UTF-8 10 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 11 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 12 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 13 | eclipse.preferences.version=1 14 | encoding/src/test/resources=UTF-8 15 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 16 | org.eclipse.jdt.core.compiler.source=1.6 17 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 18 | -------------------------------------------------------------------------------- /fnlp-core/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/data/reader/ListReader.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 
16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.data.reader; 21 | 22 | import java.util.List; 23 | 24 | import org.fnlp.ml.types.Instance; 25 | 26 | /** Reader over an array of lists: each array element becomes the data of one Instance, with a null target. */ public class ListReader extends Reader{ 27 | 28 | //Written temporarily for testing coreference resolution (jszhao) 29 | List[] data; 30 | int index; 31 | 32 | public ListReader (List[] data) 33 | { 34 | this.data = data; 35 | this.index = 0; 36 | } 37 | 38 | /** Wraps data[index] in a new Instance (target null) and advances index. There is no bounds check here, so calling it after hasNext() has returned false fails with ArrayIndexOutOfBoundsException. */ public Instance next () 39 | { 40 | return new Instance (data[index++], null); 41 | } 42 | 43 | /** True while index is still inside the array. */ public boolean hasNext () { return index < data.length; } 44 | 45 | 46 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/data/reader/Reader.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 
18 | */ 19 | 20 | package org.fnlp.data.reader; 21 | 22 | import java.util.Iterator; 23 | 24 | import org.fnlp.ml.types.Instance; 25 | import org.fnlp.ml.types.InstanceSet; 26 | 27 | /** 28 | * @author xpqiu 29 | * @version 1.0 30 | * Reader is the data-input interface: an iterator that reads the data sequentially and returns one Instance per call, 31 | * so that data processing is independent of how the data is read. 32 | * package edu.fudan.data.reader 33 | */ 34 | public abstract class Reader implements Iterator { 35 | 36 | /** Removal is not supported; always throws IllegalStateException. NOTE(review): java.util.Iterator specifies UnsupportedOperationException for an unsupported remove() -- confirm no caller relies on the current type before changing it. */ public void remove () { 37 | throw new IllegalStateException ("This Iterator does not support remove()."); 38 | } 39 | 40 | 41 | /** Drains this reader through hasNext()/next() into a new InstanceSet, skipping null instances, and returns that set. */ public InstanceSet read(){ 42 | InstanceSet instSet = new InstanceSet(); 43 | while (hasNext()) { 44 | Instance inst = next(); 45 | if(inst!=null){ 46 | instSet.add(inst); 47 | } 48 | } 49 | return instSet; 50 | } 51 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/data/reader/StringReader.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.data.reader; 21 | 22 | import org.fnlp.ml.types.Instance; 23 | 24 | /** 25 | * Reader over a string array: each element is one sample, with no label information 26 | * @author xpqiu 27 | * @version 1.0 28 | * StringReader 29 | * package edu.fudan.ml.data 30 | */ 31 | public class StringReader extends Reader 32 | { 33 | String[] data; 34 | int index; 35 | 36 | public StringReader (String[] data) 37 | { 38 | this.data = data; 39 | this.index = 0; 40 | } 41 | 42 | /** Wraps data[index] in a new Instance (target null) and advances index. There is no bounds check here, so calling it after hasNext() has returned false fails with ArrayIndexOutOfBoundsException. */ public Instance next () 43 | { 44 | return new Instance (data[index++], null); 45 | } 46 | 47 | /** True while index is still inside the array. */ public boolean hasNext () { return index < data.length; } 48 | 49 | 50 | 51 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/data/reader/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | /** 21 | * 数据读取包:处理不同类型格式的数据。 22 | */ 23 | package org.fnlp.data.reader; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/classifier/TPredict.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.ml.classifier; 21 | 22 | /** 23 | * Prediction result interface 24 | * @author xpqiu 25 | * @version 2.0 26 | * @since 1.5 27 | */ 28 | public interface TPredict { 29 | /** 30 | * Gets a prediction 31 | * @param i position 32 | * @return the i-th prediction; NULL if it does not exist 33 | */ 34 | public T getLabel(int i); 35 | /** 36 | * Gets the score of a prediction 37 | * @param i position 38 | * @return the score of the i-th prediction; Double.NEGATIVE_INFINITY if it does not exist (NOTE(review): the declared return type is float -- confirm the sentinel implementations actually use) 39 | */ 40 | public float getScore(int i); 41 | /** 42 | * Normalizes the scores 43 | */ 44 | public void normalize(); 45 | /** 46 | * Number of predictions 47 | * @return the number of predictions 48 | */ 49 | public int size(); 50 | /** 51 | * Gets all labels 52 | * @return all labels 53 | */ 54 | public T[] getLabels(); 55 | /** 56 | * Removes the information at position i 57 | * @param i position 58 | */ 59 | public void remove(int i); 60 | 61 | 62 | 63 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/classifier/bayes/BayesTrainer.java: -------------------------------------------------------------------------------- 1 | package org.fnlp.ml.classifier.bayes; 2 | 3 | import gnu.trove.iterator.TIntFloatIterator; 4 | 5 | import java.util.List; 6 | 7 | import org.fnlp.ml.classifier.AbstractClassifier; 8 | import org.fnlp.ml.classifier.linear.AbstractTrainer; 9 | import org.fnlp.ml.types.Instance; 10 | import org.fnlp.ml.types.InstanceSet; 11 | import org.fnlp.ml.types.alphabet.AlphabetFactory; 12 | import org.fnlp.ml.types.sv.HashSparseVector; 13 | import org.fnlp.nlp.pipe.Pipe; 14 | import org.fnlp.nlp.pipe.SeriesPipes; 15 | /** 16 | * Trainer for the Bayes text-classification model. 17 | * The input training data is a sparse matrix. 18 | * @author sywu 19 | * 20 | */ 21 | public class BayesTrainer{ 22 | 23 | /** Trains from trainset alone: takes its alphabet factory and its pipes, casts the pipes to SeriesPipes (ClassCastException for any other pipe type), calls removeTargetPipe() on them, then delegates to the three-argument overload. */ public AbstractClassifier train(InstanceSet trainset) { 24 | AlphabetFactory af=trainset.getAlphabetFactory(); 25 | SeriesPipes pp=(SeriesPipes) trainset.getPipes(); 26 | pp.removeTargetPipe(); 27 | return train(trainset,af,pp); 28 | } 29 | /** Builds an ItemFrequency from trainset and af, and returns a new BayesClassifier configured with af, pp and that ItemFrequency. */ public AbstractClassifier train(InstanceSet trainset,AlphabetFactory af,Pipe pp) { 30 | ItemFrequency tf=new ItemFrequency(trainset,af); 
31 | BayesClassifier classifier=new BayesClassifier(); 32 | classifier.setFactory(af); 33 | classifier.setPipe(pp); 34 | classifier.setTf(tf); 35 | return classifier; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/classifier/hier/Statistic.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.ml.classifier.hier; 21 | 22 | /** Placeholder class: it declares only a main method whose body is empty. */ public class Statistic { 23 | 24 | /** 25 | * Entry point; currently does nothing. 26 | * @param args command-line arguments (unused) 27 | */ 28 | public static void main(String[] args) { 29 | 30 | 31 | } 32 | 33 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/classifier/knn/VotePredict.java: -------------------------------------------------------------------------------- 1 | package org.fnlp.ml.classifier.knn; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | import org.fnlp.ml.classifier.Predict; 7 | 8 | public class VotePredict extends Predict { 9 | 10 | public VotePredict(int k){ 11 | super(k); 12 | } 13 | public T getLabel() { 14 | T label=labels[0]; 15 | int count=0; 16 | Map labelCount = new HashMap(); 17 | for(int pos=0;poscount){ 29 | count=tempCount; 30 | label=labels[pos]; 31 | } 32 | } 33 | return label; 34 | } 35 | public Predict getNLabels(int labels_num){ 36 | Predict pred=new Predict(labels_num); 37 | 38 | Map labelCount = new HashMap(); 39 | for(int i=0;iThis file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU Lesser General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | */ 25 | package org.fnlp.ml.classifier.knn; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/classifier/linear/AbstractTrainer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.ml.classifier.linear; 21 | 22 | import org.fnlp.ml.classifier.AbstractClassifier; 23 | import org.fnlp.ml.types.InstanceSet; 24 | 25 | /** 26 | * Abstract parameter-training class 27 | * @author Feng Ji 28 | * 29 | */ 30 | public abstract class AbstractTrainer { 31 | 32 | /** 33 | * Abstract parameter-training method 34 | * @param trainset the training data set 35 | * @param devset the data set used to evaluate performance; may be NULL 36 | * @return the classifier 37 | */ 38 | public abstract AbstractClassifier train(InstanceSet trainset, InstanceSet devset); 39 | 40 | /** 41 | * Parameter-training method; delegates to train(trainset, null) 42 | * @param trainset the training data set 43 | * @return the classifier 44 | */ 45 | public AbstractClassifier train(InstanceSet trainset){ 46 | return train(trainset,null); 47 | } 48 | 49 | /** 50 | * Performance-evaluation method 51 | * @param devset the data set used to evaluate performance 52 | */ 53 | protected abstract void evaluate(InstanceSet devset); 54 | 55 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/classifier/linear/inf/Inferencer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.ml.classifier.linear.inf; 21 | 22 | import java.io.Serializable; 23 | 24 | import org.fnlp.ml.classifier.TPredict; 25 | import org.fnlp.ml.types.Instance; 26 | 27 | /** 28 | * Inference class: serializable base type that holds a weight array and a use-target flag and declares the getBest operations 29 | * @author xpqiu 30 | * 31 | */ 32 | public abstract class Inferencer implements Serializable { 33 | 34 | private static final long serialVersionUID = -7254946709189008567L; 35 | 36 | protected float[] weights; 37 | 38 | protected boolean isUseTarget; 39 | 40 | /** 41 | * Gets the top n most likely predictions (NOTE(review): this overload takes no n -- presumably it returns only the best one; confirm against implementations) 42 | * @param inst the instance to predict for 43 | * @return the prediction result 44 | * Sep 9, 2009 45 | */ 46 | public abstract TPredict getBest(Instance inst); 47 | 48 | /** Overload that additionally takes n; presumably the number of predictions requested -- confirm against implementations. */ public abstract TPredict getBest(Instance inst, int n); 49 | 50 | /** Returns the weight array itself, not a copy. */ public float[] getWeights() { 51 | return weights; 52 | } 53 | 54 | /** Stores the given array reference as the weights, without copying. */ public void setWeights(float[] weights) { 55 | this.weights = weights; 56 | } 57 | 58 | /** Setter despite the is-prefix: assigns b to the isUseTarget flag. */ public void isUseTarget(boolean b) { 59 | isUseTarget = b; 60 | } 61 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/classifier/linear/inf/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 解码器包,配合 edu.fudan.ml.classifier.linear中的分类器使用. 3 | * 4 | *

This file is part of FudanNLP. 5 | 6 | *

FudanNLP is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU Lesser General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | 11 | *

FudanNLP is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Lesser General Public License for more details. 15 | 16 | *

You should have received a copy of the GNU Lesser General Public License 17 | * along with FudanNLP. If not, see 18 | * http://www.gnu.org/licenses/. 19 | 20 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 21 | * 22 | * @author fnlp.org 23 | * @since FudanNLP 1.5 24 | * @version 1.0.0 25 | * 26 | */ 27 | package org.fnlp.ml.classifier.linear.inf; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/classifier/linear/update/LinearMaxPAUpdate.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.ml.classifier.linear.update; 21 | 22 | import org.fnlp.ml.loss.Loss; 23 | import org.fnlp.ml.types.Instance; 24 | 25 | /** 26 | * Parameter-update class for linear classification, using the PA algorithm 27 | */ 28 | public class LinearMaxPAUpdate extends AbstractPAUpdate { 29 | 30 | public LinearMaxPAUpdate(Loss loss) { 31 | super(loss); 32 | } 33 | 34 | /** Builds the feature difference between the gold and the predicted label. The gold label is the target argument, or inst.getTarget() when target is null. For every entry of the instance's int[] data other than -1, records +1 in diffv at data[i]+gold and -1 at data[i]+pred, and adds weights[data[i]+gold]-weights[data[i]+pred] to diffw. diffv and diffw are not declared in this class (presumably inherited from AbstractPAUpdate). Always returns 1. */ @Override 35 | protected int diff(Instance inst, float[] weights, Object target, 36 | Object predict) { 37 | 38 | int[] data = (int[]) inst.getData(); 39 | int gold; 40 | if (target == null) 41 | gold = (Integer) inst.getTarget(); 42 | else 43 | gold = (Integer) target; 44 | int pred = (Integer) predict; 45 | 46 | for (int i = 0; i < data.length; i++) { 47 | if (data[i] != -1) { 48 | int ts = data[i] + gold; 49 | int ps = data[i] + pred; 50 | diffv.put(ts, 1.0f); 51 | diffv.put(ps, -1.0f); 52 | diffw += weights[ts]-weights[ps]; // w^T(f(x,y)-f(x,ybar)) 53 | } 54 | } 55 | 56 | return 1; 57 | } 58 | 59 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/classifier/linear/update/Update.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.ml.classifier.linear.update; 21 | 22 | import org.fnlp.ml.types.Instance; 23 | 24 | public interface Update { 25 | 26 | /** 27 | * 28 | * @param inst the sample instance 29 | * @param weights the weights 30 | * @param k the number of samples traversed so far 31 | * @param extraweight the weights that the averaged perceptron needs to subtract 32 | * @param predictLabel the predicted label 33 | * @param c the step-size threshold 34 | * @return the loss between the predicted label and the true label 35 | */ 36 | public float update(Instance inst, float[] weights, int k, float[] extraweight, Object predictLabel, 37 | float c); 38 | 39 | /** 40 | * 41 | * @param inst the sample instance 42 | * @param weights the weights 43 | * @param k the number of samples traversed so far 44 | * @param extraweight the weights that the averaged perceptron needs to subtract 45 | * @param predictLabel the predicted label 46 | * @param goldenLabel the true label 47 | * @param c the step-size threshold 48 | * @return the loss between the predicted label and the true label 49 | */ 50 | public float update(Instance inst, float[] weights, int k, float[] extraweight, Object predictLabel, 51 | Object goldenLabel, float c); 52 | 53 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/classifier/linear/update/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 在线学习权重调整,配合 edu.fudan.ml.classifier.linear中的分类器使用 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU Lesser General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.ml.classifier.linear.update; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/classifier/struct/inf/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 结构化学习推理,配合 org.fnlp.ml.classifier.linear中的分类器使用. 3 | * 4 | *

This file is part of FudanNLP. 5 | 6 | *

FudanNLP is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU Lesser General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | 11 | *

FudanNLP is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Lesser General Public License for more details. 15 | 16 | *

You should have received a copy of the GNU General Public License 17 | * along with FudanNLP. If not, see 18 | * http://www.gnu.org/licenses/. 19 | 20 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 21 | * 22 | * @author fnlp.org 23 | * @since FudanNLP 1.5 24 | * @version 1.0.0 25 | * 26 | */ 27 | package org.fnlp.ml.classifier.struct.inf; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/classifier/struct/update/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 结构化学习权重调整包,配合 org.fnlp.ml.classifier.linear中的分类器使用. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.ml.classifier.struct.update; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/eval/ScoreUsage.java: -------------------------------------------------------------------------------- 1 | package org.fnlp.ml.eval; 2 | 3 | import java.io.IOException; 4 | 5 | import org.fnlp.util.MyFiles; 6 | 7 | public class ScoreUsage { 8 | 9 | public static void main(String[] args) throws IOException { 10 | 11 | Score ss = new Score(); 12 | 13 | int numofclass = 10; 14 | 15 | String str = MyFiles.loadString("../tmp/Sogou_SVM"); 16 | 17 | String[] s = str.split("\n"); 18 | Integer[] golden= new Integer[s.length]; 19 | Integer[] pred = new Integer[s.length]; 20 | for (int i = 0; i < s.length; i++) { 21 | String[] ele = s[i].split("\\s"); 22 | int g = Integer.parseInt(ele[0]); 23 | int p = Integer.parseInt(ele[1]); 24 | golden[i] = g; 25 | pred[i] = p; 26 | } 27 | String res = ss.score(pred, golden, numofclass); 28 | System.out.println(res); 29 | 30 | } 31 | 32 | } 33 | -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/eval/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 分类结果评测包. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.ml.eval; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/feature/BaseGenerator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.ml.feature; 21 | 22 | import org.fnlp.ml.types.Instance; 23 | import org.fnlp.ml.types.sv.HashSparseVector; 24 | import org.fnlp.ml.types.sv.ISparseVector; 25 | 26 | /** 27 | * 简单将data返回 特征不包含类别信息 28 | * 29 | * @author xpqiu 30 | * 31 | */ 32 | public class BaseGenerator extends Generator { 33 | 34 | private static final long serialVersionUID = 5209575930740335391L; 35 | 36 | 37 | public ISparseVector getVector(Instance inst) { 38 | 39 | return (ISparseVector) inst.getData(); 40 | } 41 | 42 | public ISparseVector getVector(Instance inst, Object object) { 43 | return getVector(inst); 44 | } 45 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/feature/Generator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.ml.feature; 21 | 22 | import java.io.Serializable; 23 | 24 | import org.fnlp.ml.types.Instance; 25 | import org.fnlp.ml.types.sv.HashSparseVector; 26 | import org.fnlp.ml.types.sv.ISparseVector; 27 | 28 | /** 29 | * 生成特征向量,包含类别信息 30 | * 31 | * @author xpqiu 32 | * @version 1.0 33 | */ 34 | public abstract class Generator implements Serializable { 35 | 36 | private static final long serialVersionUID = 8640098825477722199L; 37 | 38 | public Generator() { 39 | } 40 | 41 | public ISparseVector getVector(Instance inst) { 42 | return getVector(inst, inst.getTarget()); 43 | } 44 | 45 | public abstract ISparseVector getVector(Instance inst, Object object); 46 | 47 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/feature/SFGenerator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.ml.feature; 21 | 22 | import org.fnlp.ml.feature.Generator; 23 | import org.fnlp.ml.types.Instance; 24 | import org.fnlp.ml.types.sv.BinarySparseVector; 25 | import org.fnlp.ml.types.sv.HashSparseVector; 26 | import org.fnlp.ml.types.sv.ISparseVector; 27 | import org.fnlp.ml.types.sv.SparseVector; 28 | 29 | /** 30 | * 结构化特征生成类 31 | * 32 | * @version Feb 16, 2009 33 | */ 34 | public class SFGenerator extends Generator { 35 | 36 | private static final long serialVersionUID = 6404015214630864081L; 37 | 38 | /** 39 | * 构造函数 40 | */ 41 | public SFGenerator() { 42 | } 43 | 44 | @Override 45 | public ISparseVector getVector(Instance inst, Object label) { 46 | int[] data = (int[]) inst.getData(); 47 | ISparseVector fv = new BinarySparseVector(data.length); 48 | for(int i = 0; i < data.length; i++) { 49 | int idx = data[i]+(Integer)label; 50 | fv.put(idx); 51 | } 52 | return fv; 53 | } 54 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/loss/Loss.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
/**
 * A loss function over two objects.
 */
public interface Loss {

    /**
     * Computes the loss between l1 and l2.
     *
     * @param l1 object 1
     * @param l2 object 2
     * @return the loss
     */
    float calc(Object l1, Object l2);
}
18 | */ 19 | 20 | package org.fnlp.ml.loss; 21 | 22 | public class ZeroOneLoss implements Loss { 23 | 24 | private float calc(Integer i1, Integer i2) { 25 | return i1==i2?0:1; 26 | } 27 | 28 | private float calc(String l1, String l2) { 29 | return l1.equals(l2)?0:1; 30 | } 31 | 32 | public float calc(Object l1, Object l2) { 33 | if (!l1.getClass().equals(l2.getClass())) { 34 | throw new IllegalArgumentException("Exception in ZeroOneLoss: l1 and l2 have different types"); 35 | } 36 | 37 | float ret = 0; 38 | if (l1 instanceof Integer) { 39 | ret = calc((Integer)l1, (Integer)l2); 40 | }else if (l1 instanceof String) { 41 | ret = calc((String)l1, (String)l2); 42 | } 43 | 44 | return ret; 45 | } 46 | 47 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/loss/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 损失计算函数. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.ml.loss; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/loss/struct/HybridHammingLoss.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.ml.loss.struct; 21 | 22 | import org.fnlp.ml.loss.struct.HammingLoss; 23 | 24 | /** 25 | * 计算双链的Hamming距离 26 | * @author Feng Ji 27 | * 28 | */ 29 | public class HybridHammingLoss extends HammingLoss { 30 | 31 | /** 32 | * 计算o1和o2之间的Hamming距离,o1和o2必须是同类型的对象 33 | * @param o1 对象1(支持二维整型数组) 34 | * @param o2 对象2(支持二维整型数组) 35 | * @return Hamming距离 36 | */ 37 | @Override 38 | public float calc(Object o1, Object o2) { 39 | if (!o1.getClass().equals(o2.getClass())) 40 | throw new IllegalArgumentException("Exception in HybridHammingLoss: o1 and o2 have different types"); 41 | 42 | int[][] l1 = (int[][]) o1; 43 | int[][] l2 = (int[][]) o2; 44 | int ne = 0; 45 | for (int i = 0; i < l1[0].length; i++) { 46 | for (int j = 0; j < l1.length; j++) { 47 | if (l1[j][i] != l2[j][i]) { 48 | ne++; 49 | break; 50 | } 51 | } 52 | } 53 | return ne; 54 | } 55 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/loss/struct/SequenceLoss.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.ml.loss.struct; 21 | 22 | import org.fnlp.ml.loss.Loss; 23 | 24 | public class SequenceLoss implements Loss { 25 | /** 26 | * 27 | * @author xpqiu 28 | * 29 | */ 30 | public static enum Type { 31 | POINT, EDGE 32 | } 33 | 34 | Type type; 35 | 36 | public SequenceLoss(Type type) { 37 | this.type = type; 38 | } 39 | 40 | public float calc(Object o1, Object o2) { 41 | 42 | float errCount = 0; 43 | if (o1 instanceof int[] && o2 instanceof int[]) { 44 | int[] pred = (int[]) o1; 45 | int[] gold = (int[]) o2; 46 | 47 | if (type == Type.POINT) { 48 | for (int i = 0; i < pred.length; i++) { 49 | if (pred[i] != gold[i]) 50 | errCount++; 51 | } 52 | }else if (type == Type.EDGE) { 53 | for (int i = 1; i < pred.length; i++) { 54 | if (pred[i - 1] != gold[i - 1] || pred[i] != gold[i]) 55 | errCount++; 56 | } 57 | } 58 | } 59 | 60 | return errCount; 61 | } 62 | 63 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/loss/struct/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 针对结构化分类结果的损失计算函数. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.ml.loss.struct; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/nmf/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Insert title here 6 | 7 | 8 |

矩阵分解. 9 |

This file is part of FudanNLP. 10 | 11 |

FudanNLP is free software: you can redistribute it and/or modify 12 | it under the terms of the GNU Lesser General Public License as published by 13 | the Free Software Foundation, either version 3 of the License, or 14 | (at your option) any later version. 15 | 16 |

FudanNLP is distributed in the hope that it will be useful, 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | GNU Lesser General Public License for more details. 20 | 21 |

You should have received a copy of the GNU General Public License 22 | along with FudanNLP. If not, see 23 | http://www.gnu.org/licenses/. 24 | 25 |

Copyright 2009-2012 fnlp.org. All rights reserved. 26 | 27 | 28 | -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/types/DynamicInfo.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
/**
 * Mutable holder for a word, its part-of-speech tag and a length value.
 */
public class DynamicInfo {
    private String pos;
    private String word;
    private int len;

    /**
     * @param pos  part-of-speech tag
     * @param word word text
     * @param len  length value stored alongside the word
     */
    public DynamicInfo(String pos, String word, int len) {
        this.pos = pos;
        this.word = word;
        this.len = len;
    }

    /** @return the part-of-speech tag */
    public String getPos() {
        return this.pos;
    }

    /** @param pos new part-of-speech tag */
    public void setPos(String pos) {
        this.pos = pos;
    }

    /** @return the word text */
    public String getWord() {
        return this.word;
    }

    /** @param word new word text */
    public void setWord(String word) {
        this.word = word;
    }

    /** @return the stored length value */
    public int getLen() {
        return this.len;
    }

    /** @param len new length value */
    public void setLen(int len) {
        this.len = len;
    }

    /** @return {@code word/pos/len} */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder();
        text.append(word).append("/").append(pos).append("/").append(len);
        return text.toString();
    }

}
18 | */ 19 | 20 | package org.fnlp.ml.types.alphabet; 21 | 22 | 23 | 24 | /** 25 | * 标记词典,以自增的方式存放标记 26 | * @version 1.0 27 | * 28 | */ 29 | public interface ILabelAlphabet extends IAlphabet { 30 | 31 | 32 | /** 33 | * 查找索引编号对应的标记 34 | * @param id 索引编号 35 | * @return 标记 36 | */ 37 | public T lookupString(int id); 38 | 39 | /** 40 | * 查找一组编号对应的标记 41 | * @param ids 索引编号数组 42 | * @return 标记数组 43 | */ 44 | public T[] lookupString(int[] ids); 45 | 46 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/types/alphabet/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 特征字典. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.6 23 | * @version 1.0.0 24 | */ 25 | package org.fnlp.ml.types.alphabet; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/types/featurecluster/AbstractCluster.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.ml.types.featurecluster; 21 | 22 | import java.util.HashMap; 23 | 24 | public abstract class AbstractCluster { 25 | public abstract void process(); 26 | public abstract HashMap getMap(); 27 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/types/featurecluster/AbstractDistance.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 
3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.ml.types.featurecluster; 21 | 22 | public abstract class AbstractDistance { 23 | public abstract double cal(ClassData cd1, ClassData cd2); 24 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/types/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Insert title here 6 | 7 | 8 |

基本数据类型。 9 |

This file is part of FudanNLP. 10 | 11 |

FudanNLP is free software: you can redistribute it and/or modify 12 | it under the terms of the GNU Lesser General Public License as published by 13 | the Free Software Foundation, either version 3 of the License, or 14 | (at your option) any later version. 15 | 16 |

FudanNLP is distributed in the hope that it will be useful, 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | GNU Lesser General Public License for more details. 20 | 21 |

You should have received a copy of the GNU General Public License 22 | along with FudanNLP. If not, see 23 | http://www.gnu.org/licenses/. 24 | 25 |

Copyright 2009-2012 fnlp.org. All rights reserved. 26 | 27 | 28 | -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/types/sv/ISparseVector.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.ml.types.sv; 21 | 22 | import java.io.Serializable; 23 | 24 | 25 | /** 26 | * 稀疏向量,并实现各种向量运算 27 | * 28 | */ 29 | public interface ISparseVector extends Serializable { 30 | 31 | /** 32 | * 点积 33 | * @param vector 34 | * @return 35 | */ 36 | public float dotProduct(float[] vector); 37 | 38 | /** 39 | * 40 | * @param sv 41 | * @return 42 | */ 43 | public float dotProduct(HashSparseVector sv); 44 | 45 | /** 46 | * 增加元素 47 | */ 48 | public void put(int i); 49 | /** 50 | * 增加多个元素 51 | */ 52 | public void put(int[] idx); 53 | /** 54 | * L2模 55 | */ 56 | public float l2Norm2(); 57 | 58 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ml/types/sv/Vector.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 
3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.ml.types.sv; 21 | 22 | 23 | /** 24 | * 一般向量,只是封装为统一接口 25 | * @author Xipeng 26 | * 27 | */ 28 | public class Vector implements ISparseVector { 29 | 30 | private static final long serialVersionUID = -7805496876863128028L; 31 | 32 | float[] data; 33 | 34 | public Vector(int size){ 35 | data = new float[size]; 36 | } 37 | 38 | public Vector(float[] data){ 39 | this.data = data; 40 | } 41 | 42 | @Override 43 | public float dotProduct(float[] vector) { 44 | System.out.println("未实现"); 45 | return 0; 46 | } 47 | 48 | @Override 49 | public float dotProduct(HashSparseVector sv) { 50 | return sv.dotProduct(data); 51 | } 52 | 53 | @Override 54 | public void put(int i) { 55 | System.out.println("未实现"); 56 | 57 | } 58 | 59 | @Override 60 | public void put(int[] idx) { 61 | System.out.println("未实现"); 62 | 63 | } 64 | 65 | @Override 66 | public float l2Norm2() { 67 | // TODO Auto-generated method stub 68 | return 0; 69 | } 70 | 71 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/cn/Tags.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 
// 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */

package org.fnlp.nlp.cn;

import java.util.regex.Pattern;


/**
 * Helper predicates over Chinese part-of-speech tag names.
 *
 * @author xpqiu
 * @version 1.0
 * @since FudanNLP 1.5
 */
public class Tags {

    /** Tags treated as nouns: noun, person name, place name, organization name, proper name. */
    static Pattern nounsPattern = Pattern.compile("名词|人名|地名|机构名|专有名");

    /**
     * Tags treated as carrying no content: anything containing "pronoun",
     * plus punctuation, preposition, subordinating conjunction, modal particle,
     * interjection, structural particle, onomatopoeia and localizer.
     */
    static Pattern stopwordPattern = Pattern.compile(".*代词|标点|介词|从属连词|语气词|叹词|结构助词|拟声词|方位词");

    /**
     * Tells whether a tag names a noun class.
     *
     * @param pos part-of-speech tag
     * @return true when any noun alternative occurs somewhere in the tag
     */
    public static boolean isNoun(String pos) {
        boolean found = nounsPattern.matcher(pos).find();
        return found;
    }

    /**
     * Tells whether a tag marks a word without content meaning.
     *
     * @param pos part-of-speech tag
     * @return true when any stop-word alternative occurs somewhere in the tag
     */
    public static boolean isStopword(String pos) {
        boolean found = stopwordPattern.matcher(pos).find();
        return found;
    }
}
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/cn/anaphora/ARInstanceGetter.java:
// --------------------------------------------------------------------------------
// 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP).
3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.nlp.cn.anaphora; 21 | 22 | import org.fnlp.ml.types.Instance; 23 | 24 | /** 25 | * 获得指代消解的样本 26 | * @author jszhao 27 | * @version 1.0 28 | * @since FudanNLP 1.5 29 | */ 30 | 31 | public class ARInstanceGetter { 32 | 33 | private Instance instance; 34 | public ARInstanceGetter(FeatureGeter fBuilder){ 35 | this.instance = new Instance(fBuilder.getFeature(), 36 | fBuilder.getInst().getTarget()); 37 | this.instance.setSource(fBuilder.getInst().getData()); 38 | } 39 | 40 | public Instance getInstance(){ 41 | return this.instance; 42 | } 43 | 44 | 45 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/cn/anaphora/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 指代消解包. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.nlp.cn.anaphora; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/cn/anaphora/train/FileGroup.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
// 18 | */

package org.fnlp.nlp.cn.anaphora.train;

import java.io.File;
/**
 * A pair of files: the original file and its annotated counterpart.
 *
 * @author jszhao
 * @version 1.0
 * @since FudanNLP 1.5
 */
public class FileGroup {
    /** Original file. */
    private File orgFile;
    /** Annotated (marked) file. */
    private File markFile;

    /**
     * @param orgFile  original file
     * @param markFile annotated file
     */
    public FileGroup(File orgFile,File markFile){
        this.markFile = markFile;
        this.orgFile = orgFile;
    }

    /** @return the original file */
    public File getOrgFile(){
        return this.orgFile;
    }

    /** @param orgFile new original file */
    public void setOrgFile(File orgFile){
        this.orgFile = orgFile;
    }

    /** @return the annotated file */
    public File getMarkFile(){
        return this.markFile;
    }

    /** @param markFile new annotated file */
    public void setMarkFile(File markFile){
        this.markFile = markFile;
    }
}
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/cn/anaphora/train/package-info.java:
// --------------------------------------------------------------------------------

/**
 * @author xpqiu
 *
 */
package org.fnlp.nlp.cn.anaphora.train;

/**
 * Training steps:
 * 1. Use DocFilter.java to filter out files that contain neither third-person pronouns nor demonstrative pronouns;
 * 2. Use MyDocumentWriter.java to generate the feature training file;
 * 3. Use ARClassifier.java to train on the generated feature training file and produce the trained model.
 */
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/cn/ner/ne/package-info.java:
// --------------------------------------------------------------------------------
/**
 *
 */
/**
 * @author Xipeng
 *
 */
package org.fnlp.nlp.cn.ner.ne;
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/cn/ner/package-info.java:
// --------------------------------------------------------------------------------
// 1 | /** 2 | * 专有实体名识别包,使用特定方法进行简单的实体名识别. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.nlp.cn.ner; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/cn/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 中文句、词、字符处理(字符编码、简繁转换等)。. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.nlp.cn; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/cn/tag/format/BasicFormatter.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
// 18 | */

package org.fnlp.nlp.cn.tag.format;

import java.util.List;

import org.fnlp.ml.types.Instance;
import org.fnlp.ml.types.InstanceSet;
/**
 * Renders sequence-labelling results as text, one "item TAB label" pair per line.
 *
 * @author xpqiu
 *
 */
public class BasicFormatter {

    /**
     * Formats every instance of a set, separating instances with an extra newline.
     *
     * @param testSet   instances to print
     * @param labelsSet labels per instance; row i belongs to instance i
     * @return the concatenated text of all instances
     */
    public static String format(InstanceSet testSet, String[][] labelsSet) {
        StringBuilder out = new StringBuilder();
        int idx = 0;
        while (idx < testSet.size()) {
            out.append(format(testSet.getInstance(idx), labelsSet[idx])).append("\n");
            idx++;
        }
        return out.toString();
    }

    /**
     * Formats one instance: each element of its source list followed by a tab,
     * the label at the same position, and a newline.
     *
     * @param inst   instance whose source is read as a {@link List}
     * @param labels label for each position of the source list
     * @return the formatted lines
     */
    public static String format(Instance inst, String[] labels) {

        StringBuilder out = new StringBuilder();
        List items = (List) inst.getSource();

        for (int pos = 0; pos < items.size(); pos++) {
            out.append(items.get(pos)).append('\t').append(labels[pos]).append("\n");
        }
        return out.toString();
    }
}
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/cn/tag/format/package-info.java:
// --------------------------------------------------------------------------------
// 1 | /** 2 | * 序列标注结果格式化包. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.nlp.cn.tag.format; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/cn/tag/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 中文自然语言处理,包括分词、词性,实体名识别. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.nlp.cn.tag; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/corpus/ctbconvert/FCTB2CONLL.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
// 18 | */

package org.fnlp.nlp.corpus.ctbconvert;

import java.io.IOException;
import java.nio.charset.Charset;

import org.fnlp.ml.types.InstanceSet;
/**
 * Converts CTB data to the FNLP format.
 *
 * @author Xipeng
 *
 */
public class FCTB2CONLL {

    /**
     * Reads the trees under ../data/ctb/data and writes the converted result
     * to ../data/ctb/result.txt using ../data/headrules.txt.
     *
     * @param args unused
     * @throws IOException propagated from reading or writing
     */
    public static void main(String[] args) throws IOException{
        DependentTreeProducter producer = new DependentTreeProducter();
        InstanceSet trees = MyTreebankReader.readTrees("../data/ctb/data", null,Charset.forName("UTF8"));
        // Alternative reader kept from the original source:
//      InstanceSet ins = MyTreebankReader.readNewTrees("./data/ctb/data", null,Charset.forName("UTF8"));

        producer.write(trees, "../data/ctb/result.txt", "../data/headrules.txt");
        System.out.print("Done!");
    }
}
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/corpus/ctbconvert/FCTB2CONLLTest.java:
// --------------------------------------------------------------------------------
// 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved.
// 18 | */

package org.fnlp.nlp.corpus.ctbconvert;

import java.io.IOException;
import java.nio.charset.Charset;

import org.fnlp.ml.types.InstanceSet;
/**
 * Converts CTB data to the FNLP format, run on a single test file with the
 * producer's debug flag switched on.
 *
 * @author Xipeng
 *
 */
public class FCTB2CONLLTest {

    /**
     * Reads ./data/ctb/test.txt and writes the converted result to
     * ./data/ctb/result.txt using ./data/headrules.txt.
     *
     * @param args unused
     * @throws IOException propagated from reading or writing
     */
    public static void main(String[] args) throws IOException{
        DependentTreeProducter producer = new DependentTreeProducter();
        producer.debug = true;
        InstanceSet trees = MyTreebankReader.readTrees("./data/ctb/test.txt", null,Charset.forName("UTF8"));

        producer.write(trees, "./data/ctb/result.txt", "./data/headrules.txt");
        System.out.print("Done!");
    }
}
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/corpus/ctbconvert/package-info.java:
// --------------------------------------------------------------------------------
/**
 * CTB treebank conversion.
 */
package org.fnlp.nlp.corpus.ctbconvert;
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/corpus/fnlp/filter/package-info.java:
// --------------------------------------------------------------------------------
/**
 *
 */
/**
 * @author Xipeng
 *
 */
package org.fnlp.nlp.corpus.fnlp.filter;
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/corpus/fnlp/package-info.java:
// --------------------------------------------------------------------------------
/**
 * FNLP internal data format.
 * @author Xipeng
 *
 */
package org.fnlp.nlp.corpus.fnlp;
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/corpus/package.html:
// --------------------------------------------------------------------------------
// 1 | 2 | 3 | 4 | 5 | Insert title here 6 | 7 | 8 |

自然语言处理语料处理包。 9 |

This file is part of FudanNLP. 10 | 11 |

FudanNLP is free software: you can redistribute it and/or modify 12 | it under the terms of the GNU Lesser General Public License as published by 13 | the Free Software Foundation, either version 3 of the License, or 14 | (at your option) any later version. 15 | 16 |

FudanNLP is distributed in the hope that it will be useful, 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | GNU Lesser General Public License for more details. 20 | 21 |

You should have received a copy of the GNU General Public License 22 | along with FudanNLP. If not, see 23 | http://www.gnu.org/licenses/. 24 | 25 |

Copyright 2009-2012 fnlp.org. All rights reserved. 26 | 27 | 28 | -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/corpus/third/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 第三方数据处理 3 | */ 4 | package org.fnlp.nlp.corpus.third; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/duplicate/DocSim.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
// 18 | */

package org.fnlp.nlp.duplicate;

import java.util.ArrayList;



/**
 * A group of document ids, ordered so that larger groups sort first.
 * <p>
 * The type argument of {@code Comparable} had been lost (everything between
 * angle brackets is missing from this copy of the source); without it
 * {@code compareTo(DocSim)} does not implement the interface and the class
 * does not compile.
 */
public class DocSim implements Comparable<DocSim> {
    /**
     * Ids in this group.
     * NOTE(review): the element type was also stripped from this copy and is
     * left raw here; restore it from the upstream source.
     */
    public ArrayList ids;

    /**
     * @param ids ids belonging to this group (stored, not copied)
     */
    public DocSim(ArrayList ids) {
        this.ids = ids;
    }

    /**
     * Orders groups by descending size.
     * <p>
     * Two distinct groups of equal size deliberately never compare as equal:
     * a sorted set would otherwise treat them as duplicates and keep only one.
     * An object compared with itself now yields 0, as the
     * {@code Comparable} contract requires.
     *
     * @param ds group to compare with
     * @return 0 for the same object, 1 when this group is smaller, otherwise -1
     */
    @Override
    public int compareTo(DocSim ds) {
        if (this == ds)
            return 0;
        if (ids.size() < ds.ids.size())
            return 1;
        else
            return -1;
    }
}
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/duplicate/Documents.java:
// --------------------------------------------------------------------------------
/**
 * This file is part of FNLP (formerly FudanNLP).
 *
 * FNLP is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * FNLP is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FudanNLP. If not, see <http://www.gnu.org/licenses/>.
 *
 * Copyright 2009-2014 www.fnlp.org. All rights reserved.
 */

package org.fnlp.nlp.duplicate;

/**
 * Holder for the text of one document.
 */
public class Documents {

    /** Creates a document whose content is left unset. */
    public Documents() {

    }

    /**
     * @param ss document text
     */
    public Documents(String ss) {
        content = ss;
    }

    /** Document text; null until assigned. */
    public String content;
}
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/duplicate/ISimilarity.java:
// --------------------------------------------------------------------------------
// 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP).
// 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */

package org.fnlp.nlp.duplicate;

import java.util.ArrayList;
import java.util.TreeSet;


/**
 * Contract for duplicate detection over a list of documents.
 */
public interface ISimilarity {

    /**
     * Finds duplicates among the given documents.
     * NOTE(review): the generic arguments are missing from this copy of the
     * source; presumably the input holds {@code Documents} and the result
     * holds {@code DocSim} -- confirm against the upstream file.
     *
     * @param docs documents to examine
     * @return sorted set describing the duplicates found
     * @throws Exception when an implementation fails
     */
    TreeSet duplicate(ArrayList docs) throws Exception;

}
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/langmodel/NGramModelTest.java:
// --------------------------------------------------------------------------------
// 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved.
// 18 | */

package org.fnlp.nlp.langmodel;

import java.io.IOException;

/**
 * Manual driver: builds a bigram model from a segmented corpus file and prints
 * the probability of one sample string.
 */
public class NGramModelTest{


    /**
     * @param args unused
     * @throws Exception propagated from building or querying the model
     */
    public static void main(String[] args) throws Exception {
        //
        String corpusPath = "../tmp/wiki_mini_simp_seg";

        NGramModel bigram = new NGramModel(2);
        bigram.build(corpusPath);

        // Perplexity check kept from the original source:
//      System.out.println("perplexity:" + model.computePerplexity("tmp/poi.dic"));
        System.out.println(bigram.getProbability("利用 符号"));
    }


}
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/langmodel/lda/package-info.java:
// --------------------------------------------------------------------------------
/**
 *
 */
/**
 * @author Xipeng
 *
 */
package org.fnlp.nlp.langmodel.lda;
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/langmodel/package-info.java:
// --------------------------------------------------------------------------------
/**
 *
 */
/**
 * @author xpqiu
 *
 */
package org.fnlp.nlp.langmodel;
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/analysis/AnalysisSentence.java:
// --------------------------------------------------------------------------------
package org.fnlp.nlp.parser.dep.analysis;

/**
 * Plain holder for the parallel per-token arrays of one sentence.
 * NOTE(review): going by the field names, the gold* arrays hold reference
 * heads/relations and the pred* arrays hold predicted ones -- confirm with callers.
 */
public class AnalysisSentence {
    public String[] forms;
    public String[] tags;
    public int[] goldhead;
    public String[] goldrel;
    public int[] predhead;
    public String[] predrel;

    /**
     * Stores the given arrays as-is (no copies are made).
     */
    public AnalysisSentence(String[] forms, String[] tags, int[] goldhead,
            String[] goldrel, int[] predhead, String[] predrel) {
        this.forms = forms;
        this.tags = tags;
        this.goldhead = goldhead;
        this.goldrel = goldrel;
        this.predhead = predhead;
        this.predrel = predrel;
    }

    /**
     * @return number of entries in {@code forms}
     */
    public int length(){
        int count = forms.length;
        return count;
    }
}

// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/package-info.java:
// --------------------------------------------------------------------------------
// 1 | /** 2 | * 依存句法分析包. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.nlp.parser.dep; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/parser/dep/train/package-info.java: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * 句法分析训练 4 | * @author xpqiu 5 | * 6 | */ 7 | package org.fnlp.nlp.parser.dep.train; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/parser/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 句法分析包. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | 27 | package org.fnlp.nlp.parser; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/pipe/Normalize.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
// 18 | */

package org.fnlp.nlp.pipe;

import java.io.Serializable;

import org.fnlp.ml.types.Instance;
import org.fnlp.ml.types.sv.SparseVector;

/**
 * Normalization pipe; the instance data must be a {@link SparseVector}.
 *
 * @author xpqiu
 *
 */
public class Normalize extends Pipe implements Serializable {

    private static final long serialVersionUID = -4740915822925015609L;

    /**
     * Casts the instance data to {@link SparseVector} and calls its
     * {@code normalize()} method.
     *
     * @param instance instance whose data is normalized
     */
    @Override
    public void addThruPipe(Instance instance) {
        ((SparseVector) instance.getData()).normalize();
    }

}
// --------------------------------------------------------------------------------
// /fnlp-core/src/main/java/org/fnlp/nlp/pipe/WeightPipe.java:
// --------------------------------------------------------------------------------
// 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved.
18 | */ 19 | 20 | package org.fnlp.nlp.pipe; 21 | 22 | import org.fnlp.ml.types.Instance; 23 | 24 | public class WeightPipe extends Pipe { 25 | 26 | private static final long serialVersionUID = 1L; 27 | private static float[] weight = {}; 28 | 29 | public WeightPipe(boolean b){ 30 | if(b){ 31 | weight = new float[10]; 32 | int i=0; 33 | for(;i<5;i++){ 34 | weight[i] = 2f; 35 | } 36 | for(;i<10;i++){ 37 | weight[i] = 1.5f; 38 | } 39 | } 40 | } 41 | 42 | @Override 43 | public void addThruPipe(Instance inst) throws Exception { 44 | 45 | Object sdata = inst.getData(); 46 | int len; 47 | if(sdata instanceof int[][]){//转换后的特征 48 | int[][] data = (int[][]) sdata; 49 | len = data.length; 50 | }else{ 51 | System.err.println("WeightPipe: Error"); 52 | return; 53 | } 54 | 55 | float w; 56 | if(len. 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.nlp.pipe.nlp; 21 | 22 | import org.fnlp.ml.types.Instance; 23 | import org.fnlp.nlp.cn.tag.CWSTagger; 24 | import org.fnlp.nlp.pipe.Pipe; 25 | 26 | /** 27 | * 进行分词等操作 28 | * @author xpqiu 29 | * 30 | */ 31 | public class CNPipe extends Pipe{ 32 | 33 | private static final long serialVersionUID = -2329969202592736092L; 34 | private transient CWSTagger seg; 35 | 36 | public CNPipe() { 37 | } 38 | 39 | public CNPipe(CWSTagger seg) { 40 | this.seg = seg; 41 | } 42 | 43 | @Override 44 | public void addThruPipe(Instance inst) { 45 | String data = (String) inst.getData(); 46 | String[] newdata = seg.tag2Array(data); 47 | inst.setData(newdata); 48 | } 49 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/pipe/nlp/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 使用了NLP处理工具的数据特征转换器。 3 | *

4 | *

This file is part of FudanNLP. 5 | 6 | *

FudanNLP is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU Lesser General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | * 11 | *

FudanNLP is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Lesser General Public License for more details. 15 | * 16 | *

You should have received a copy of the GNU General Public License 17 | * along with FudanNLP. If not, see 18 | * http://www.gnu.org/licenses/. 19 | * 20 | *

Copyright 2009-2012 Fudan University. All rights reserved. 21 | */ 22 | package org.fnlp.nlp.pipe.nlp; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/pipe/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Insert title here 6 | 7 | 8 |

数据特征转换器。 9 |

10 |

This file is part of FudanNLP. 11 | 12 |

FudanNLP is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU Lesser General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 |

FudanNLP is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU Lesser General Public License for more details. 21 | 22 |

You should have received a copy of the GNU General Public License 23 | along with FudanNLP. If not, see 24 | http://www.gnu.org/licenses/. 25 | 26 |

Copyright 2009-2012 fnlp.org. All rights reserved. 27 | 28 | 29 | -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/pipe/seq/MixedString2Sequence.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.nlp.pipe.seq; 21 | 22 | import org.fnlp.ml.types.Instance; 23 | import org.fnlp.nlp.cn.Chars; 24 | import org.fnlp.nlp.pipe.Pipe; 25 | 26 | /** 27 | * Handles mixed-language strings. 28 | * @author Feng Ji 29 | * 30 | */ 31 | public class MixedString2Sequence extends Pipe { 32 | 33 | /** Collects every character accepted by Chars.isChar, each followed by a space, into a buffer. NOTE(review): the buffer is never written back to the instance or returned, so as written this method has no visible effect; it looks unfinished, confirm the intended output before relying on this pipe. */ @Override 34 | public void addThruPipe(Instance inst) throws Exception { 35 | String str = (String) inst.getData(); 36 | char[] toks = str.toCharArray(); 37 | StringBuilder sb = new StringBuilder(); 38 | for(int i = 0; i < toks.length; i++) { 39 | if (Chars.isChar(toks[i])) { 40 | sb.append(toks[i]); 41 | sb.append(" "); 42 | } 43 | } 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/pipe/seq/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Insert title here 6 | 7 | 8 |

数据特征转换器,针对序列标注数据。 9 |

10 |

This file is part of FudanNLP. 11 | 12 |

FudanNLP is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU Lesser General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 |

FudanNLP is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU Lesser General Public License for more details. 21 | 22 |

You should have received a copy of the GNU General Public License 23 | along with FudanNLP. If not, see 24 | http://www.gnu.org/licenses/. 25 | 26 |

Copyright 2009-2012 fnlp.org. All rights reserved. 27 | 28 | 29 | -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/pipe/seq/templet/Templet.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.nlp.pipe.seq.templet; 21 | 22 | import java.io.Serializable; 23 | 24 | import org.fnlp.ml.types.Instance; 25 | import org.fnlp.ml.types.alphabet.IFeatureAlphabet; 26 | /** 27 | * Template interface. 28 | * @author xpqiu 29 | * 30 | */ 31 | public interface Templet extends Serializable{ 32 | 33 | /** 34 | * Returns the order of this template. 35 | * @return the order 36 | */ 37 | public int getOrder(); 38 | 39 | /** 40 | * Extracts features at the given position of the given instance. 41 | * @param instance the given instance 42 | * @param pos the given position 43 | * @param numLabels the number of labels 44 | * @throws Exception 45 | */ 46 | public int generateAt( Instance instance, 47 | IFeatureAlphabet features, 48 | int pos, 49 | int ... 
numLabels ) throws Exception; 50 | 51 | public int[] getVars(); 52 | 53 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/pipe/seq/templet/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Insert title here 6 | 7 | 8 |

特征生成。 9 |

10 |

This file is part of FudanNLP. 11 | 12 |

FudanNLP is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU Lesser General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 |

FudanNLP is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU Lesser General Public License for more details. 21 | 22 |

You should have received a copy of the GNU General Public License 23 | along with FudanNLP. If not, see 24 | http://www.gnu.org/licenses/. 25 | 26 |

Copyright 2009-2012 fnlp.org. All rights reserved. 27 | 28 | 29 | -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/pipe/templet/Templet.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.nlp.pipe.templet; 21 | 22 | import java.io.Serializable; 23 | 24 | import org.fnlp.ml.types.Instance; 25 | import org.fnlp.ml.types.alphabet.IFeatureAlphabet; 26 | /** 27 | * Template interface. 28 | * @author xpqiu 29 | * 30 | */ 31 | public interface Templet extends Serializable{ 32 | 33 | /** 34 | * Extracts features from the given instance (the original comment speaks of a given position, but this signature takes no position argument). 35 | * @param instance the given instance 36 | * @param numLabels the number of labels 37 | * @throws Exception 38 | */ 39 | public int[] generateAt( Instance instance, 40 | IFeatureAlphabet features, 41 | int numLabels ) throws Exception; 42 | 43 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/pipe/templet/TempletGroup.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 
3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.nlp.pipe.templet; 21 | 22 | import java.util.ArrayList; 23 | /** 24 | * Group of feature templates for sequence labeling, holding different ways of generating features. 25 | * @author xpqiu 26 | * 27 | */ 28 | public class TempletGroup extends ArrayList { 29 | 30 | private static final long serialVersionUID = 2584759562263226861L; 31 | /** 32 | * Template identifier. 33 | */ 34 | public int gid; 35 | 36 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/pipe/templet/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Insert title here 6 | 7 | 8 |

特征生成模板。 9 |

10 |

This file is part of FudanNLP. 11 | 12 |

FudanNLP is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU Lesser General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 |

FudanNLP is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU Lesser General Public License for more details. 21 | 22 |

You should have received a copy of the GNU General Public License 23 | along with FudanNLP. If not, see 24 | http://www.gnu.org/licenses/. 25 | 26 |

Copyright 2009-2012 fnlp.org. All rights reserved. 27 | 28 | 29 | -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/similarity/ISimilarity.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.nlp.similarity; 21 | 22 | /** 23 | * @author xpqiu 24 | * @version 1.0 25 | * @since 1.0 26 | * ISimilarity: computes a similarity score between two items of the same type E. 27 | */ 28 | public interface ISimilarity { 29 | /** Returns the similarity of item1 and item2 as a float; scale and range are left to each implementation. */ 30 | public float calc(E item1,E item2); 31 | 32 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/similarity/JaccardSimilarity.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 
8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.nlp.similarity; 21 | 22 | import gnu.trove.iterator.hash.TObjectHashIterator; 23 | import gnu.trove.set.hash.THashSet; 24 | 25 | public class JaccardSimilarity implements ISimilarity> { 26 | /** Jaccard similarity: size of the intersection divided by size of the union. Returns 0 when either set is null, and also when both sets are empty (previously that case divided 0 by 0 and returned NaN). */ 27 | public float calc(THashSet s1, THashSet s2) { 28 | int com = 0; 29 | if (s1 == null || s2 == null) 30 | return 0; 31 | TObjectHashIterator it = s1.iterator(); 32 | for ( int i = s1.size(); i-- > 0; ) { 33 | Object v = it.next(); 34 | if(s2.contains(v)) 35 | com++; 36 | } 37 | int union = s1.size()+s2.size()-com; /* size of the union: |s1| + |s2| - |intersection| */ 38 | return union == 0 ? 0f : ((float) com)/union; 39 | } 40 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/similarity/SparseVectorSimilarity.java: -------------------------------------------------------------------------------- 1 | package org.fnlp.nlp.similarity; 2 | 3 | import java.io.Serializable; 4 | 5 | import org.fnlp.ml.types.sv.HashSparseVector; 6 | 7 | 8 | 9 | public class SparseVectorSimilarity implements ISimilarity ,Serializable{ 10 | /** Similarity of two sparse vectors, delegated to item1.cos(item2). */ 11 | @Override 12 | public float calc(HashSparseVector item1, HashSparseVector item2) { 13 | //return item1.dotProduct(item2); 14 | return item1.cos(item2); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/similarity/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Insert title here 6 | 7 | 8 |

文本相似度计算包。 9 |

10 |

This file is part of FudanNLP. 11 | 12 |

FudanNLP is free software: you can redistribute it and/or modify 13 | it under the terms of the GNU Lesser General Public License as published by 14 | the Free Software Foundation, either version 3 of the License, or 15 | (at your option) any later version. 16 | 17 |

FudanNLP is distributed in the hope that it will be useful, 18 | but WITHOUT ANY WARRANTY; without even the implied warranty of 19 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 20 | GNU Lesser General Public License for more details. 21 | 22 |

You should have received a copy of the GNU General Public License 23 | along with FudanNLP. If not, see 24 | http://www.gnu.org/licenses/. 25 | 26 |

Copyright 2009-2012 fnlp.org. All rights reserved. 27 | 28 | 29 | -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/similarity/train/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Xipeng 6 | * 7 | */ 8 | package org.fnlp.nlp.similarity.train; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/nlp/tag/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Insert title here 6 | 7 | 8 |

基于序列标注的序列标注。 9 |

This file is part of FudanNLP. 10 | 11 |

FudanNLP is free software: you can redistribute it and/or modify 12 | it under the terms of the GNU Lesser General Public License as published by 13 | the Free Software Foundation, either version 3 of the License, or 14 | (at your option) any later version. 15 | 16 |

FudanNLP is distributed in the hope that it will be useful, 17 | but WITHOUT ANY WARRANTY; without even the implied warranty of 18 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 19 | GNU Lesser General Public License for more details. 20 | 21 |

You should have received a copy of the GNU General Public License 22 | along with FudanNLP. If not, see 23 | http://www.gnu.org/licenses/. 24 | 25 |

Copyright 2009-2012 fnlp.org. All rights reserved. 26 | 27 | 28 | -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ontology/graph/Direction.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.ontology.graph; 21 | /** Direction attached to a word relation (WordRelationEnum in this package stores one per relation). NOTE(review): UP and SUB presumably point toward the broader and the narrower term respectively; confirm. */ 22 | public enum Direction{ 23 | BOTH, 24 | UP, 25 | SUB; 26 | }; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ontology/graph/Word.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.ontology.graph; 21 | /** 22 | * A word. 23 | * @author Xipeng 24 | * 25 | */ 26 | public class Word { 27 | 28 | /* Not set by the constructor, so it keeps the default 0 until assigned elsewhere. */ int id; 29 | String word; 30 | 31 | public Word(String word) { 32 | this.word = word; 33 | } 34 | 35 | /** 36 | * @param args unused; the method body is an empty placeholder 37 | */ 38 | public static void main(String[] args) { 39 | // TODO Auto-generated method stub 40 | 41 | } 42 | 43 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ontology/graph/WordRelationEnum.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.ontology.graph; 21 | 22 | 23 | /** Kinds of relation between words; each constant carries a Chinese display name (cname) and a Direction. */ 24 | public enum WordRelationEnum { 25 | 26 | 27 | 28 | /* cname means "synonym" */ SYM("同义词",Direction.BOTH), 29 | /* cname means "antonym" */ ANTONYM("反义词",Direction.BOTH); 30 | 31 | private String cname; 32 | 33 | private Direction direction; 34 | 35 | public Direction getDirection() { 36 | return direction; 37 | } 38 | 39 | /* Enum constants are shared singletons, so this changes the direction for every user of the constant. */ public void setDirection(Direction direction) { 40 | this.direction = direction; 41 | } 42 | 43 | private WordRelationEnum(String name,Direction direction){ 44 | this.cname = name; 45 | this.direction = direction; 46 | } 47 | 48 | /** Looks a constant up by its cname (exact equals match); returns null when no constant matches. */ public static WordRelationEnum getWithName(String name) { 49 | WordRelationEnum[] tasks = WordRelationEnum.values(); 50 | for(WordRelationEnum task:tasks){ 51 | if(task.cname.equals(name)) 52 | return task; 53 | } 54 | return null; 55 | } 56 | 57 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ontology/graph/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | /** 5 | * @author Xipeng 6 | * 7 | */ 8 | package org.fnlp.ontology.graph; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/ontology/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 知识库管理。 3 | * 4 | *

This file is part of FudanNLP. 5 | 6 | *

FudanNLP is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU Lesser General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | 11 | *

FudanNLP is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Lesser General Public License for more details. 15 | 16 | *

You should have received a copy of the GNU General Public License 17 | * along with FudanNLP. If not, see 18 | * http://www.gnu.org/licenses/. 19 | 20 | *

Copyright 2009-2012 Fudan University. All rights reserved. 21 | * 22 | * @author fnlp.org 23 | * @since FudanNLP 1.5 24 | * @version 1.0.0 25 | */ 26 | package org.fnlp.ontology; -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/util/ICallback.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.util; 21 | /** 22 | * Callback interface. 23 | * @author xpqiu 24 | * 25 | */ 26 | public interface ICallback { 27 | public void execute(); 28 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/util/Options.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package org.fnlp.util; 5 | 6 | import java.util.ArrayList; 7 | import java.util.HashMap; 8 | 9 | /** 10 | * @author Xipeng Qiu E-mail: xpqiu@fudan.edu.cn 11 | * @version 创建时间:2015年1月6日 下午4:39:23 12 | * @since fnlp 2.1 13 | */ 14 | public class Options { 15 | HashMap options = new HashMap(); 16 | ArrayList rootArgs = new ArrayList(); 17 | 18 | 19 | public void parsing(String[] args) { 20 | 21 | for (int i=0; i. 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.util; 21 | 22 | import java.util.Map; 23 | /** 24 | * Comparator that orders map keys by their mapped values, largest value first (the compareTo result is negated). NOTE(review): negating compareTo gives the wrong sign if it ever returns Integer.MIN_VALUE, and a key that is missing from the map or mapped to null makes compare throw NullPointerException; comparing v2 against v1 instead of negating would avoid the first problem. 25 | * @author xpqiu 26 | * @version 1.0 27 | * @since FudanNLP 1.5 28 | */ 29 | public class ValueComparator implements java.util.Comparator { 30 | private Map m; // the original map 31 | 32 | public ValueComparator(Map m) { 33 | this.m = m; 34 | } 35 | 36 | public int compare(Object o1, Object o2) { 37 | // handle some exceptions here 38 | Object v1 = m.get(o1); 39 | Object v2 = m.get(o2); 40 | // make sure the values implement Comparable 41 | return -((Comparable) v1).compareTo(v2); 42 | } 43 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/util/exception/LoadModelException.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 
3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.util.exception; 21 | 22 | import java.io.FileNotFoundException; 23 | import java.io.IOException; 24 | /** Checked exception for a model that could not be loaded. Both constructors print diagnostics as a side effect of construction. */ 25 | public class LoadModelException extends Exception { 26 | 27 | private static final long serialVersionUID = -3933859344026018386L; 28 | /** Wraps e as the cause, prints a message chosen by the type of e to standard output, then prints e's stack trace. FileNotFoundException is tested before IOException because it is a subclass of it. NOTE(review): a null e would throw NullPointerException at e.printStackTrace(). */ 29 | public LoadModelException(Exception e, String file) { 30 | super(e); 31 | if( e instanceof FileNotFoundException) { 32 | System.out.println("模型文件不存在: "+ file); /* message: model file does not exist */ 33 | } else if (e instanceof ClassNotFoundException) { 34 | System.out.println("模型文件版本错误。"); /* message: model file version error */ 35 | } else if (e instanceof IOException) { 36 | System.out.println("模型文件读入错误: "+file); /* message: error reading model file */ 37 | 38 | } 39 | e.printStackTrace(); 40 | } 41 | /** Creates the exception with the given message and immediately prints its own stack trace. */ 42 | public LoadModelException(String msg) { 43 | super(msg); 44 | printStackTrace(); 45 | } 46 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/util/exception/NotImplementedException.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 
3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.util.exception; 21 | 22 | /** 23 | * Signals that a method is not implemented yet. (The original comment read "unsupported data type", apparently copied from UnsupportedDataTypeException.) Both constructors print the stack trace at construction time. 24 | * @author xpqiu 25 | * 26 | */ 27 | public class NotImplementedException extends Exception { 28 | 29 | private static final long serialVersionUID = -7879174759276938120L; 30 | 31 | 32 | public NotImplementedException(String msg) { 33 | super(msg); 34 | printStackTrace(); 35 | } 36 | 37 | /* The default message below means "this method is not implemented yet". */ 38 | public NotImplementedException() { 39 | super("该方法暂未实现"); 40 | printStackTrace(); 41 | } 42 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/util/exception/UnsupportedDataTypeException.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.util.exception; 21 | 22 | /** 23 | * Signals an unsupported data type. The constructor prints the stack trace at construction time. 24 | * @author xpqiu 25 | * 26 | */ 27 | public class UnsupportedDataTypeException extends Exception { 28 | 29 | /* NOTE(review): same value as the serialVersionUID declared in NotImplementedException; looks copied, confirm it is intentional. */ private static final long serialVersionUID = -7879174759276938120L; 30 | 31 | 32 | public UnsupportedDataTypeException(String msg) { 33 | super(msg); 34 | printStackTrace(); 35 | } 36 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/util/hash/AbstractHashCode.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.util.hash; 21 | 22 | import java.io.Serializable; 23 | /** Serializable base class for pluggable string hash functions. */ 24 | public abstract class AbstractHashCode implements Serializable { 25 | private static final long serialVersionUID = -6803250687142456011L; 26 | /** Returns an int hash code for str. */ 27 | public abstract int hashcode(String str); 28 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/util/hash/JavaHash.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.util.hash; 21 | /** Hash implementation backed by String.hashCode(). Declares no serialVersionUID of its own although it is Serializable through its parent. */ 22 | public class JavaHash extends AbstractHashCode { 23 | /* A null str throws NullPointerException here. */ public int hashcode(String str) { 24 | return str.hashCode(); 25 | } 26 | } -------------------------------------------------------------------------------- /fnlp-core/src/main/java/org/fnlp/util/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 基础工具包。 3 | * 4 | *

This file is part of FudanNLP. 5 | 6 | *

FudanNLP is free software: you can redistribute it and/or modify 7 | * it under the terms of the GNU Lesser General Public License as published by 8 | * the Free Software Foundation, either version 3 of the License, or 9 | * (at your option) any later version. 10 | 11 | *

FudanNLP is distributed in the hope that it will be useful, 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | * GNU Lesser General Public License for more details. 15 | 16 | *

You should have received a copy of the GNU General Public License 17 | * along with FudanNLP. If not, see 18 | * http://www.gnu.org/licenses/. 19 | 20 | *

Copyright 2009-2012 Fudan University. All rights reserved. 21 | * 22 | * @author fnlp.org 23 | * @since FNLP 1.5 24 | * @version 1.0.0 25 | */ 26 | package org.fnlp.util; -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/ml/classifier/knn/KNNTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.ml.classifier.knn; 21 | 22 | import static org.junit.Assert.*; 23 | 24 | 25 | import org.junit.After; 26 | import org.junit.AfterClass; 27 | import org.junit.Before; 28 | import org.junit.BeforeClass; 29 | import org.junit.Test; 30 | 31 | import org.fnlp.ml.types.Instance; 32 | /** 33 | * 分类测试,输出测试结果的相关信息 34 | * @author xpqiu 35 | * 36 | */ 37 | public class KNNTest { 38 | 39 | @BeforeClass 40 | public static void setUpBeforeClass() throws Exception { 41 | } 42 | 43 | @AfterClass 44 | public static void tearDownAfterClass() throws Exception { 45 | } 46 | 47 | @Before 48 | public void setUp() throws Exception { 49 | } 50 | 51 | @After 52 | public void tearDown() throws Exception { 53 | } 54 | 55 | @Test 56 | public void testClassifyInstanceSetInt() { 57 | 58 | 59 | } 60 | 61 | 62 | } -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/ml/eval/SeqEvalTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package org.fnlp.ml.eval; 5 | 6 | import static org.junit.Assert.*; 7 | 8 | import java.io.IOException; 9 | 10 | import org.junit.AfterClass; 11 | import org.junit.BeforeClass; 12 | import org.junit.Test; 13 | 14 | /** 15 | * @author Xipeng Qiu E-mail: xpqiu@fudan.edu.cn 16 | * @version 创建时间:2015年5月6日 下午4:54:05 17 | */ 18 | public class SeqEvalTest { 19 | 20 | /** 21 | * @throws java.lang.Exception 22 | * 下午4:54:05 23 | */ 24 | @BeforeClass 25 | public static void setUpBeforeClass() throws Exception { 26 | } 27 | 28 | /** 29 | * @throws java.lang.Exception 30 | * 下午4:54:05 31 | */ 32 | @AfterClass 33 | public static void tearDownAfterClass() throws Exception { 34 | } 35 | 36 | @Test 37 | public void test() throws IOException { 38 | 39 | 40 | 41 | // String filePath = "./paperdata/ctb6-seg/work/ctb_三列式结果_0.txt"; 42 | String dictpath = "D:\\项目\\9.评测\\NLPCC2015分词\\data_4.0.1\\all.dict"; 43 | String filePath = 
"D:\\项目\\9.评测\\NLPCC2015分词\\data_4.0.1\\testSeg.txt"; 44 | // String dictpath = "D:\\项目\\9.评测\\NLPCC2015分词\\data21_No_0\\all.dict"; 45 | 46 | // filePath = "./example-data/sequence/seq.res"; 47 | 48 | // //读取评测结果文件,并输出到outputPath 49 | // SeqEval ne1; 50 | // ne1 = new SeqEval(); 51 | // ne1.readOOV(dictpath); 52 | // ne1.read(filePath); 53 | //// ne1.NeEvl(null); 54 | // double[] res = ne1.calcPRF(); 55 | // System.out.print(res[0] +" " + res[1]+" " +res[2]+" "+res[3]); 56 | 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/ml/types/HashSparseVectorTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.ml.types; 21 | 22 | import static org.junit.Assert.*; 23 | 24 | import org.junit.After; 25 | import org.junit.AfterClass; 26 | import org.junit.Before; 27 | import org.junit.BeforeClass; 28 | import org.junit.Test; 29 | 30 | import org.fnlp.ml.types.sv.HashSparseVector; 31 | 32 | public class HashSparseVectorTest { 33 | 34 | @BeforeClass 35 | public static void setUpBeforeClass() throws Exception { 36 | } 37 | 38 | @AfterClass 39 | public static void tearDownAfterClass() throws Exception { 40 | } 41 | 42 | private HashSparseVector sv; 43 | 44 | @Before 45 | public void setUp() throws Exception { 46 | sv = new HashSparseVector(); 47 | } 48 | 49 | @After 50 | public void tearDown() throws Exception { 51 | } 52 | 53 | @Test 54 | public void testGet() { 55 | sv.put(1, 1); 56 | System.out.println(sv.containsKey(2)); 57 | System.out.println(sv.get(2)); 58 | System.out.println(sv.get(1)); 59 | } 60 | 61 | @Test 62 | public void testPut() { 63 | sv.put(1, 1); 64 | 65 | } 66 | 67 | } -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/ml/types/alphabet/LabelAlphabetEnumTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 
16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.ml.types.alphabet; 21 | 22 | import org.fnlp.nlp.cn.PartOfSpeech; 23 | import org.junit.AfterClass; 24 | import org.junit.BeforeClass; 25 | import org.junit.Test; 26 | 27 | public class LabelAlphabetEnumTest { 28 | static LabelAlphabetEnum label ; 29 | 30 | @BeforeClass 31 | public static void setUpBeforeClass() throws Exception { 32 | label = new LabelAlphabetEnum(PartOfSpeech.class); 33 | } 34 | 35 | @AfterClass 36 | public static void tearDownAfterClass() throws Exception { 37 | } 38 | 39 | @Test 40 | public void test() { 41 | System.out.println(label.lookupIndex(PartOfSpeech.人称代词.name())); 42 | System.out.println(label.lookupIndex("人称代词")); 43 | } 44 | 45 | } -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/nlp/cn/SentenizerTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.nlp.cn; 21 | 22 | import static org.junit.Assert.*; 23 | 24 | import org.junit.After; 25 | import org.junit.AfterClass; 26 | import org.junit.Before; 27 | import org.junit.BeforeClass; 28 | import org.junit.Test; 29 | 30 | import org.fnlp.util.MyStrings; 31 | 32 | public class SentenizerTest { 33 | 34 | @BeforeClass 35 | public static void setUpBeforeClass() throws Exception { 36 | } 37 | 38 | @AfterClass 39 | public static void tearDownAfterClass() throws Exception { 40 | } 41 | 42 | @Before 43 | public void setUp() throws Exception { 44 | } 45 | 46 | @After 47 | public void tearDown() throws Exception { 48 | } 49 | 50 | @Test 51 | public void testSplit() { 52 | String sent = " 回顾王适娴职业生涯成长历程,2008年只参加了两站国内进行的公开赛?呵呵"; 53 | String[] subsents = Sentenizer.split(sent); 54 | System.out.println(MyStrings.toString(subsents,"\n")); 55 | } 56 | 57 | } -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/nlp/cn/tag/POSTaggerTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.nlp.cn.tag; 21 | 22 | import static org.junit.Assert.*; 23 | 24 | import org.junit.After; 25 | import org.junit.AfterClass; 26 | import org.junit.Before; 27 | import org.junit.BeforeClass; 28 | import org.junit.Test; 29 | 30 | import org.fnlp.util.MyCollection; 31 | import org.fnlp.util.MyFiles; 32 | 33 | public class POSTaggerTest { 34 | static String s1; 35 | static POSTagger pos; 36 | 37 | @BeforeClass 38 | public static void setUpBeforeClass() throws Exception { 39 | s1 = MyFiles.loadString("../example-data/data-tag.txt"); 40 | pos = new POSTagger("../models/seg.m", "../models/pos.m"); 41 | } 42 | 43 | @AfterClass 44 | public static void tearDownAfterClass() throws Exception { 45 | } 46 | 47 | @Before 48 | public void setUp() throws Exception { 49 | 50 | } 51 | 52 | @After 53 | public void tearDown() throws Exception { 54 | } 55 | 56 | 57 | 58 | 59 | @Test 60 | public void testTagString() { 61 | String o1 = pos.tag(s1); 62 | System.out.println(o1); 63 | } 64 | 65 | 66 | } -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/nlp/corpus/StopWordsTest.java: -------------------------------------------------------------------------------- 1 | package org.fnlp.nlp.corpus; 2 | 3 | import static org.junit.Assert.*; 4 | 5 | import org.junit.AfterClass; 6 | import org.junit.BeforeClass; 7 | import org.junit.Test; 8 | 9 | public class StopWordsTest { 10 | 11 | @BeforeClass 12 | public static void setUpBeforeClass() throws Exception { 13 | } 14 | 15 | @AfterClass 16 | public static void tearDownAfterClass() throws Exception { 17 | } 18 | 19 | @Test 20 | public void testIsStopWordStringIntInt() { 21 | StopWords sw = new StopWords(); 22 | sw.read("../models/stopwords/StopWords.txt"); 23 | assertTrue(!sw.isStopWord("现在我",2,4)); 24 | assertTrue(sw.isStopWord("我0",2,4)); 25 | assertTrue(sw.isStopWord("我#",2,4)); 26 | assertTrue(sw.isStopWord(" ",2,4)); 27 | } 28 | 29 | @Test 30 | 
public void testIsStopWordString() { 31 | StopWords sw = new StopWords(); 32 | sw.read("../models/stopwords/StopWords.txt"); 33 | assertTrue(!sw.isStopWord("现在我")); 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/nlp/corpus/fnlp/FNLPCorpusTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.nlp.corpus.fnlp; 21 | 22 | import static org.junit.Assert.*; 23 | 24 | import java.io.IOException; 25 | 26 | import org.junit.After; 27 | import org.junit.AfterClass; 28 | import org.junit.Before; 29 | import org.junit.BeforeClass; 30 | import org.junit.Test; 31 | 32 | 33 | public class FNLPCorpusTest { 34 | 35 | @BeforeClass 36 | public static void setUpBeforeClass() throws Exception { 37 | } 38 | 39 | @AfterClass 40 | public static void tearDownAfterClass() throws Exception { 41 | } 42 | 43 | @Before 44 | public void setUp() throws Exception { 45 | } 46 | 47 | @After 48 | public void tearDown() throws Exception { 49 | } 50 | 51 | 52 | @Test 53 | public void testReadCWS() throws IOException { 54 | FNLPCorpus corpus = new FNLPCorpus(); 55 | corpus.readCWS("../example-data/seg-bad-case.txt",".txt","UTF8"); 56 | corpus.writeOne("../tmp/seg.dat"); 57 | // corpus.count("./tmp", true); 58 | System.out.println("Done!"); 59 | } 60 | 61 | 62 | } -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/nlp/ner/time/Demo_NumberTranslator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. 
All rights reserved. 18 | */ 19 | 20 | package org.fnlp.nlp.ner.time; 21 | 22 | import org.fnlp.nlp.cn.ner.stringPreHandlingModule; 23 | 24 | /** 25 | * 大写数字转化模块的演示demo 26 | * 27 | * @author 曹零07300720158 28 | * 29 | */ 30 | public class Demo_NumberTranslator { 31 | public static void main(String[] args){ 32 | String target = "七千零五十一万零三百零五"; 33 | String s = stringPreHandlingModule.numberTranslator(target); 34 | System.out.println(s); 35 | 36 | target = "一千六加一五八零"; 37 | s = stringPreHandlingModule.numberTranslator(target); 38 | System.out.println(s); 39 | 40 | target = "周三十三点"; 41 | s = stringPreHandlingModule.numberTranslator(target); 42 | System.out.println(s); 43 | 44 | target = "三十三点"; 45 | s = stringPreHandlingModule.numberTranslator(target); 46 | System.out.println(s); 47 | } 48 | } -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/nlp/ner/time/Demo_TimeNormalizer.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.nlp.ner.time; 21 | 22 | import org.fnlp.nlp.cn.ner.TimeNormalizer; 23 | import org.fnlp.nlp.cn.ner.TimeUnit; 24 | 25 | /** 26 | * TimeNormalizer的演示demo 27 | * 28 | * @author 曹零07300720158 29 | * 30 | */ 31 | public class Demo_TimeNormalizer { 32 | public static void main(String[] args){ 33 | String target = "08年北京申办奥运会,8月8号开幕式,九月十八号闭幕式。" + 34 | "1年后的7月21号发生了件大事。" + 35 | "今天我本想去世博会,但是人太多了,直到晚上9点人还是那么多。" + 36 | "考虑到明天和后天人还是那么多,决定下周日再去。"; 37 | TimeNormalizer normalizer; 38 | // normalizer= new TimeNormalizer(); 39 | // 40 | // try { 41 | // normalizer.text2binModel("./model/TimeExp-Rules.txt","./model/TimeExp.gz"); 42 | // } catch (Exception e) { 43 | // // TODO Auto-generated catch block 44 | // e.printStackTrace(); 45 | // } 46 | normalizer = new TimeNormalizer("./model/TimeExp.gz"); 47 | normalizer.parse(target); 48 | TimeUnit[] unit = normalizer.getTimeUnit(); 49 | for(int i = 0; i < unit.length; i++){ 50 | System.out.println(unit[i]); 51 | } 52 | } 53 | } -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/nlp/pipe/seq/String2SequenceTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 
16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.nlp.pipe.seq; 21 | 22 | import org.junit.After; 23 | import org.junit.AfterClass; 24 | import org.junit.Before; 25 | import org.junit.BeforeClass; 26 | import org.junit.Test; 27 | 28 | import org.fnlp.util.MyStrings; 29 | 30 | public class String2SequenceTest { 31 | 32 | @BeforeClass 33 | public static void setUpBeforeClass() throws Exception { 34 | } 35 | 36 | @AfterClass 37 | public static void tearDownAfterClass() throws Exception { 38 | } 39 | 40 | @Before 41 | public void setUp() throws Exception { 42 | } 43 | 44 | @After 45 | public void tearDown() throws Exception { 46 | } 47 | 48 | @Test 49 | public void testGenSequence() { 50 | String input = "我000们ss001 在 这里 哈哈ssss哈s。"; 51 | String[][] s = String2Sequence.genSequence(input); 52 | System.out.println(MyStrings.toString(s, ",", "\n")); 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/nlp/tag/MemoryStatic.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
/**
 * Rough heap-usage probe: call {@link #start()} before the code under
 * measurement and {@link #end()} after it to get the change in used heap
 * (total minus free) in bytes.
 *
 * The baseline is kept in a single static field, so measurements cannot be
 * nested or taken from several threads at once.
 */
public class MemoryStatic {
	private static final Runtime s_runtime = Runtime.getRuntime();

	/** Upper bound on GC rounds in {@link #runGC()}, so the loop always terminates. */
	private static final int MAX_GC_ROUNDS = 500;

	/** Used heap, in bytes, recorded by the most recent {@link #start()}. */
	private static long start;

	/** @return bytes of heap currently in use (total minus free) */
	private static long usedMemory() {
		return s_runtime.totalMemory() - s_runtime.freeMemory();
	}

	/**
	 * Requests finalization and garbage collection repeatedly for as long as
	 * each round lowers the used-heap figure, up to {@link #MAX_GC_ROUNDS}.
	 */
	private static void runGC() {
		long usedMem1 = usedMemory(), usedMem2 = Long.MAX_VALUE;
		for (int i = 0; (usedMem1 < usedMem2) && (i < MAX_GC_ROUNDS); ++i) {
			s_runtime.runFinalization();
			s_runtime.gc();
			// yield() is static; calling it via currentThread() was misleading.
			Thread.yield();
			usedMem2 = usedMem1;
			usedMem1 = usedMemory();
		}
	}

	/** Collects garbage, then records the current used heap as the baseline. */
	public static void start() {
		runGC();
		start = usedMemory();
	}

	/**
	 * Collects garbage, then reports how much the used heap changed since
	 * the last {@link #start()}.
	 *
	 * @return used-heap difference in bytes; negative if usage dropped
	 */
	public static long end() {
		runGC();
		return usedMemory() - start;
	}

}
36 | */ 37 | @Test 38 | public void testMain() { 39 | try { 40 | Tagger.main("-train ../example-data/sequence/template ../example-data/sequence/train.txt ../tmp/tmp.m".split("\\s+")); 41 | Tagger.main("../tmp/tmp.m ../example-data/sequence/test.txt ../tmp/res.txt".split("\\s+")); 42 | } catch (Exception e) { 43 | // TODO Auto-generated catch block 44 | e.printStackTrace(); 45 | } 46 | 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/nlp/tag/TestDictSEG.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.nlp.tag; 21 | 22 | import java.util.ArrayList; 23 | 24 | import org.fnlp.ml.types.Dictionary; 25 | import org.fnlp.nlp.cn.tag.CWSTagger; 26 | import org.fnlp.util.MyCollection; 27 | 28 | /** 29 | * 分词使用示例 30 | * @author xpqiu 31 | * 32 | */ 33 | public class TestDictSEG { 34 | 35 | public static void main(String[] args) throws Exception { 36 | CWSTagger tag = new CWSTagger("./models/seg.m"); 37 | Dictionary dict=new Dictionary(); 38 | dict.addFile("./models/dict.txt"); 39 | tag.setDictionary(dict); 40 | ArrayList str = MyCollection.loadList("./testcase/test case seg.txt",null); 41 | for(String s:str){ 42 | String t = tag.tag(s); 43 | // t = tag.tag(t); 44 | System.out.println(t); 45 | } 46 | tag.setEnFilter(false); 47 | for(String s:str){ 48 | String t = tag.tag(s); 49 | System.out.println(t); 50 | } 51 | 52 | String t = tag.tagFile("data/FNLPDATA/seg/bad case.txt"); 53 | System.out.println(t); 54 | 55 | } 56 | 57 | 58 | } -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/nlp/tag/TestPOS.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.nlp.tag; 21 | 22 | import java.util.ArrayList; 23 | 24 | import org.fnlp.nlp.cn.tag.POSTagger; 25 | import org.fnlp.util.MyCollection; 26 | 27 | /** 28 | * 词性标注使用示例 29 | * @author xpqiu 30 | * 31 | */ 32 | public class TestPOS { 33 | 34 | static POSTagger tag; 35 | 36 | 37 | 38 | public static void main(String[] args) throws Exception { 39 | // TODO Auto-generated method stub 40 | tag = new POSTagger("models/seg.m","models/pos.m"); 41 | 42 | ArrayList str = MyCollection.loadList("./testcase/test case pos.txt",null); 43 | str.add("周杰伦 生 于 台湾\n我们"); 44 | str.add("分析和比较"); 45 | 46 | for(String s:str){ 47 | String t = tag.tag(s); 48 | System.out.println(t); 49 | } 50 | 51 | str.clear(); 52 | str.add("周杰伦 生 于 台湾\n我们"); 53 | 54 | for(String s:str){ 55 | String t = tag.tagSeged2StringALL(s.split(" ")); 56 | System.out.println(t); 57 | } 58 | 59 | } 60 | 61 | } -------------------------------------------------------------------------------- /fnlp-core/src/test/java/org/fnlp/test/CharacterType.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
/**
 * Prints the {@link Character#getType(char)} general-category code of a few
 * sample characters (CJK, Latin, space, punctuation, digits).
 */
public class CharacterType {

	/**
	 * Writes one category code per line to stdout, in the order the samples
	 * are listed.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		final char[] samples = { '我', 'I', ' ', '。', '.', '6', '⑴', '十' };
		for (char sample : samples) {
			System.out.println(Character.getType(sample));
		}
	}

}
long elapsed = System.currentTimeMillis() - t; 22 | System.out.println("Fnlp loaded in " + elapsed + " ms."); 23 | } catch (LoadModelException e) { 24 | throw new RuntimeException("Failed to load fnlp", e); 25 | } 26 | } 27 | 28 | 29 | protected String[] getCws(String text) { 30 | return cws.tag2Array(text); 31 | } 32 | 33 | protected void benchmark() { 34 | long t = System.currentTimeMillis(); 35 | String input = "12月1日,长江经济带海关区域通关一体化改革实现流域全覆盖,南昌、武汉、长沙、成都、重庆、贵阳、昆明等7个海关加入改革。当天,流域12个关区的海关特殊监管区域也纳入区域通关一体化,长江全流域真正实现了“12关如1关”。这标志着京津冀、长江经济带、广东地区三大区域通关一体化改革全面实施"; 36 | for (int i = 0; i < 5000; i++) { 37 | getCws(input); 38 | } 39 | long elapsed = System.currentTimeMillis() - t; 40 | System.out.println("Benchmarked " + elapsed + " ms."); 41 | } 42 | 43 | public void run() { 44 | // warm up the code, and perform benchmark 45 | for (int k = 0; k < 10; k ++) { 46 | benchmark(); 47 | } 48 | } 49 | 50 | public static void main(String[] args) throws InterruptedException { 51 | new MurmurHashTest().run(); 52 | } 53 | } 54 | 55 | -------------------------------------------------------------------------------- /fnlp-demo/.gitignore: -------------------------------------------------------------------------------- 1 | /bin 2 | -------------------------------------------------------------------------------- /fnlp-demo/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | org.fnlp 6 | fnlp-all 7 | 2.1-SNAPSHOT 8 | 9 | org.fnlp 10 | fnlp-demo 11 | 2.1-SNAPSHOT 12 | fnlp-demo 13 | http://maven.apache.org 14 | 15 | UTF-8 16 | 17 | 18 | 19 | junit 20 | junit 21 | 4.11 22 | test 23 | 24 | 25 | org.fnlp 26 | fnlp-core 27 | 2.1-SNAPSHOT 28 | 29 | 30 | org.fnlp 31 | fnlp-app 32 | 2.1-SNAPSHOT 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /fnlp-demo/src/main/java/org/fnlp/demo/ml/package-info.java: -------------------------------------------------------------------------------- 
1 | /** 2 | * FudanNLP分类器使用示例. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.demo.ml; -------------------------------------------------------------------------------- /fnlp-demo/src/main/java/org/fnlp/demo/nlp/NamedEntityRecognition.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.demo.nlp; 21 | 22 | 23 | import java.util.HashMap; 24 | 25 | import org.fnlp.nlp.cn.tag.NERTagger; 26 | 27 | /** 28 | * 实体名识别使用示例 29 | * @author xpqiu 30 | * 31 | */ 32 | public class NamedEntityRecognition { 33 | 34 | 35 | /** 36 | * 主程序 37 | * @param args 38 | * @throws Exception 39 | * @throws 40 | */ 41 | public static void main(String[] args) throws Exception { 42 | 43 | NERTagger tag = new NERTagger("../models/seg.m","../models/pos.m"); 44 | String str = " 新浪体育讯 北京时间4月15日03:00(英国当地时间14日20:00),2009/10赛季英格兰足球超级联赛第34轮一场焦点战在白鹿巷球场展开角逐,阿森纳客场1比2不敌托特纳姆热刺,丹尼-罗斯和拜尔先入两球,本特纳扳回一城。阿森纳仍落后切尔西6分(净胜球少15个),夺冠几成泡影。热刺近 7轮联赛取得6胜,继续以1分之差紧逼曼城。"; 45 | HashMap map = new HashMap(); 46 | tag.tag(str,map); 47 | System.out.println(map); 48 | map = tag.tagFile("../example-data/data-tag.txt"); 49 | System.out.println(map); 50 | System.out.println("Done!"); 51 | } 52 | } -------------------------------------------------------------------------------- /fnlp-demo/src/main/java/org/fnlp/demo/nlp/TimeExpressionRecognition.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.demo.nlp; 21 | 22 | 23 | import org.fnlp.nlp.cn.ner.TimeNormalizer; 24 | import org.fnlp.nlp.cn.ner.TimeUnit; 25 | 26 | /** 27 | * 时间表达式识别实例 28 | * 29 | * @author 曹零 07300720158 30 | * 31 | */ 32 | public class TimeExpressionRecognition { 33 | public static void main(String[] args){ 34 | String target = "08年北京申办奥运会,8月8号开幕式,九月十八号闭幕式。" + 35 | "1年后的7月21号发生了件大事。" + 36 | "今天我本想去世博会,但是人太多了,直到晚上9点人还是那么多。" + 37 | "考虑到明天和后天人还是那么多,决定下周日再去。"; 38 | TimeNormalizer normalizer; 39 | normalizer = new TimeNormalizer("../models/time.m"); 40 | normalizer.parse(target); 41 | TimeUnit[] unit = normalizer.getTimeUnit(); 42 | for(int i = 0; i < unit.length; i++){ 43 | System.out.println(unit[i]); 44 | } 45 | } 46 | } -------------------------------------------------------------------------------- /fnlp-demo/src/main/java/org/fnlp/demo/nlp/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * FudanNLP中文处理使用示例。. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * 25 | */ 26 | package org.fnlp.demo.nlp; -------------------------------------------------------------------------------- /fnlp-demo/src/main/java/org/fnlp/demo/nlp/tc/MyDocumentReader.java: -------------------------------------------------------------------------------- 1 | package org.fnlp.demo.nlp.tc; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileInputStream; 6 | import java.io.FileNotFoundException; 7 | import java.io.IOException; 8 | import java.io.InputStreamReader; 9 | import java.nio.charset.Charset; 10 | import java.util.List; 11 | 12 | import org.fnlp.data.reader.DocumentReader; 13 | import org.fnlp.data.reader.Reader; 14 | import org.fnlp.ml.types.Instance; 15 | import org.fnlp.util.MyFiles; 16 | 17 | public class MyDocumentReader extends Reader{ 18 | 19 | List files; 20 | Instance cur; 21 | Charset charset; 22 | public MyDocumentReader(String path) { 23 | this(path, "UTF-8"); 24 | } 25 | public MyDocumentReader(String path, String charsetName) { 26 | files = MyFiles.getAllFiles(path,null); 27 | charset = Charset.forName(charsetName); 28 | } 29 | public boolean hasNext() { 30 | if (files.isEmpty()) 31 | return false; 32 | nextDocument(); 33 | return true; 34 | } 35 | public Instance next() { 36 | return cur; 37 | } 38 | void nextDocument() { 39 | StringBuffer buff = new StringBuffer(); 40 | File f = files.remove(files.size()-1); 41 | try { 42 | BufferedReader cf = new BufferedReader(new InputStreamReader( 43 | new FileInputStream(f), charset)); 44 | String line = null; 45 | while((line = cf.readLine()) != null) { 46 | buff.append(line); 47 | buff.append('\n'); 48 | } 49 | cf.close(); 50 | } catch (FileNotFoundException e) { 51 | e.printStackTrace(); 52 | } catch (IOException e) { 53 | e.printStackTrace(); 54 | } 55 | String path=f.getPath(); 56 | int pos=path.lastIndexOf("\\"); 
57 | path=path.substring(0, pos); 58 | pos=path.lastIndexOf("\\"); 59 | path=path.substring(pos+1); 60 | cur = new Instance(buff.toString(), path); 61 | cur.setTempData(f.getPath()); 62 | buff = null; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /fnlp-demo/src/main/java/org/fnlp/demo/nlp/tc/RemoveWords.java: -------------------------------------------------------------------------------- 1 | package org.fnlp.demo.nlp.tc; 2 | 3 | import java.util.ArrayList; 4 | 5 | import org.fnlp.ml.types.Instance; 6 | import org.fnlp.nlp.pipe.Pipe; 7 | 8 | public class RemoveWords extends Pipe{ 9 | String[] list=new String[]{" "," "}; 10 | public void addThruPipe(Instance inst) { 11 | String data = (String) inst.getData(); 12 | for(int i=0;i newdata=new ArrayList(); 16 | for(int i=0;i 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /fnlp-dev/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | -------------------------------------------------------------------------------- /fnlp-dev/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | fnlp-dev 4 | NO_M2ECLIPSE_SUPPORT: Project files created with the maven-eclipse-plugin are not supported in M2Eclipse. 
5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.m2e.core.maven2Nature 21 | org.eclipse.jdt.core.javanature 22 | 23 | 24 | -------------------------------------------------------------------------------- /fnlp-dev/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding//src/test/java=UTF-8 4 | encoding/=UTF-8 5 | -------------------------------------------------------------------------------- /fnlp-dev/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | #Tue Mar 11 13:39:47 CST 2014 2 | encoding/src/test/java=UTF-8 3 | org.eclipse.jdt.core.compiler.compliance=1.6 4 | encoding/src/main/resources=UTF-8 5 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 6 | encoding/src/main/java=UTF-8 7 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 8 | eclipse.preferences.version=1 9 | encoding/src/test/resources=UTF-8 10 | org.eclipse.jdt.core.compiler.source=1.6 11 | -------------------------------------------------------------------------------- /fnlp-dev/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /fnlp-dev/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | org.fnlp 6 | fnlp-all 7 | 2.1-SNAPSHOT 8 | 9 | org.fnlp 10 | fnlp-dev 11 | 2.1-SNAPSHOT 12 | fnlp-dev 13 | http://maven.apache.org 14 | 15 | UTF-8 16 | 17 | 18 | 19 | junit 20 | junit 21 | 4.11 22 | test 23 | 24 | 25 | org.fnlp 26 | fnlp-core 27 | 
2.1-SNAPSHOT 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /fnlp-dev/src/main/java/org/fnlp/dev/App.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.dev; 21 | 22 | /** 23 | * Hello world! 24 | * 25 | */ 26 | public class App 27 | { 28 | public static void main( String[] args ) 29 | { 30 | System.out.println( "Hello World!" ); 31 | } 32 | } -------------------------------------------------------------------------------- /fnlp-dev/src/test/java/org/fnlp/dev/AppTest.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 18 | */ 19 | 20 | package org.fnlp.dev; 21 | 22 | import junit.framework.Test; 23 | import junit.framework.TestCase; 24 | import junit.framework.TestSuite; 25 |
/**
 * Unit test for the simple App class.
 */
public class AppTest extends TestCase {

    /**
     * Creates the test case.
     *
     * @param testName name of the test case
     */
    public AppTest(String testName) {
        super(testName);
    }

    /**
     * @return the suite of tests built from this class
     */
    public static Test suite() {
        final TestSuite suite = new TestSuite(AppTest.class);
        return suite;
    }

    /**
     * Placeholder test; asserts a constant true value.
     */
    public void testApp() {
        assertTrue(true);
    }
}
-------------------------------------------------------------------------------- /fnlp-train/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /fnlp-train/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /pom.xml 3 | -------------------------------------------------------------------------------- /fnlp-train/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | fnlp-train 4 | NO_M2ECLIPSE_SUPPORT: Project files created with the maven-eclipse-plugin are not supported in M2Eclipse.
5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.m2e.core.maven2Nature 21 | org.eclipse.jdt.core.javanature 22 | 23 | 24 | -------------------------------------------------------------------------------- /fnlp-train/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding/=UTF-8 4 | -------------------------------------------------------------------------------- /fnlp-train/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.6 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 12 | org.eclipse.jdt.core.compiler.source=1.6 13 | -------------------------------------------------------------------------------- /fnlp-train/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /fnlp-train/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | 5 | org.fnlp 6 | 
fnlp-all 7 | 2.1-SNAPSHOT 8 | 9 | org.fnlp 10 | fnlp-train 11 | 2.1-SNAPSHOT 12 | fnlp-train 13 | http://maven.apache.org 14 | 15 | UTF-8 16 | 17 | 18 | 19 | junit 20 | junit 21 | 4.11 22 | test 23 | 24 | 25 | org.fnlp 26 | fnlp-core 27 | 2.1-SNAPSHOT 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /fnlp-train/src/main/java/org/fnlp/train/corpus/TrainTestSplit.java: -------------------------------------------------------------------------------- 1 | package org.fnlp.train.corpus; 2 | 3 | import java.util.LinkedList; 4 | import java.util.List; 5 | 6 | import org.fnlp.nlp.corpus.fnlp.FNLPCorpus; 7 | import org.fnlp.nlp.corpus.fnlp.FNLPDoc; 8 | import org.fnlp.nlp.corpus.fnlp.FNLPSent; 9 |
/**
 * Splits the first document of the WeiboFTB corpus file into a training file
 * and a test file.
 */
public class TrainTestSplit {

    /** Number of leading sentences that go into the training file. */
    private static final int TRAIN_SIZE = 3000;

    /**
     * Reads the corpus, prints its document count, sentence count and POS set,
     * then writes the first {@code TRAIN_SIZE} sentences of document 0 to the
     * "-train" file and the remaining sentences to the "-test" file. After each
     * write the sentence count and the POS-set size of the corpus are printed.
     *
     * @param args command-line arguments; not read
     * @throws Exception propagated from corpus reading or writing
     */
    public static void main(String[] args) throws Exception {

        String datapath = "../data";
        FNLPCorpus corpus = new FNLPCorpus();
        corpus.read(datapath + "/FNLPDATA/WeiboFTB(v1.0).dat", null);

        System.out.println(corpus.getDocumenNum());
        System.out.println(corpus.getSentenceNum());
        System.out.println(corpus.getAllPOS());

        FNLPDoc doc = corpus.docs.get(0);

        // Clamp the split point: subList throws IndexOutOfBoundsException when
        // the document holds fewer than TRAIN_SIZE sentences.
        int total = doc.sentences.size();
        int split = Math.min(TRAIN_SIZE, total);

        // NOTE(review): the element type FNLPSent is inferred from the
        // otherwise unused import; the generics were lost in this listing.
        // Both sublists are views on the original list, which is never modified.
        List<FNLPSent> train = doc.sentences.subList(0, split);
        List<FNLPSent> test = doc.sentences.subList(split, total);

        // Swap in a fresh list per part so writeOne only sees that part.
        doc.sentences = new LinkedList<FNLPSent>();
        doc.sentences.addAll(train);
        corpus.writeOne(datapath + "/FNLPDATA/WeiboFTB(v1.0)-train.dat");
        System.out.println(corpus.getSentenceNum());
        System.out.println(corpus.getAllPOS().size());

        doc.sentences = new LinkedList<FNLPSent>();
        doc.sentences.addAll(test);
        corpus.writeOne(datapath + "/FNLPDATA/WeiboFTB(v1.0)-test.dat");
        System.out.println(corpus.getSentenceNum());
        System.out.println(corpus.getAllPOS().size());
    }

}
41 | --------------------------------------------------------------------------------
/fnlp-train/src/main/java/org/fnlp/train/parsing/DepRunFinal.java: -------------------------------------------------------------------------------- 1 | package org.fnlp.train.parsing; 2 | 3 | import java.io.File; 4 | import java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | import java.io.PrintWriter; 7 | import java.io.UnsupportedEncodingException; 8 | 9 | import org.fnlp.ml.eval.SeqEval; 10 | import org.fnlp.nlp.corpus.fnlp.FNLPCorpus; 11 | import org.fnlp.nlp.parser.dep.train.JointParerTester; 12 | import org.fnlp.nlp.parser.dep.train.JointParerTrainer; 13 | import org.fnlp.train.seg.SegTrain; 14 | import org.fnlp.train.tag.ModelOptimization; 15 | import org.fnlp.util.MyFiles; 16 |
/**
 * Trains the dependency model ({@code ../models/dep.m}) on the combined
 * train and test files, then runs {@link ModelOptimization} on the result.
 */
public class DepRunFinal {

    /**
     * @param args command-line arguments; not read
     * @throws Exception propagated from file handling, training or optimization
     */
    public static void main(String[] args) throws Exception {

        final String dataDir = "../data";
        final String modelPath = "../models/dep.m";

        // Merge the training files; the previous merged file is deleted first.
        final String mergedPath = dataDir + "/FNLPDATA/all.dep";
        final String testPath = dataDir + "/FNLPDATA/test.dep";
        final String trainPath = dataDir + "/FNLPDATA/train.dep";
        MyFiles.delete(mergedPath);
        MyFiles.combine(mergedPath, trainPath, testPath);

        final int maxIterations = 100;
        final float c = 0.01f;
        JointParerTrainer parserTrainer = new JointParerTrainer(modelPath);
        parserTrainer.train(mergedPath, maxIterations, c);

        final float threshold = 1.0E-3f;
        new ModelOptimization(threshold).optimizeDep(modelPath);
    }

}
46 | -------------------------------------------------------------------------------- /fnlp-train/src/main/java/org/fnlp/train/pos/POSRunFinal.java: -------------------------------------------------------------------------------- 1 | package org.fnlp.train.pos; 2 | 3 | import org.fnlp.train.tag.ModelOptimization; 4 | import org.fnlp.util.MyFiles; 5 |
/**
 * Trains the POS model ({@code ../models/pos.m}) on the combined train and
 * test files, runs {@link ModelOptimization} on it, then applies
 * {@code POSAddEnTag.addEnTag} to the model file.
 */
public class POSRunFinal {

    /**
     * @param args command-line arguments; not read
     * @throws Exception propagated from file handling, training or post-processing
     */
    public static void main(String[] args) throws Exception {

        final String dataDir = "../data";
        final String modelPath = "../models/pos.m";
        final String templatePath = "../data/template";

        // Merge the training files.
        // NOTE(review): unlike DepRunFinal, the merged file is not deleted
        // first; whether MyFiles.combine overwrites or appends is not visible
        // here - confirm.
        final String mergedPath = dataDir + "/FNLPDATA/all.pos";
        final String testPath = dataDir + "/FNLPDATA/test.pos";
        final String trainPath = dataDir + "/FNLPDATA/train.pos";
        MyFiles.combine(mergedPath, trainPath, testPath);

        POSTrain trainer = new POSTrain();
        trainer.model = modelPath;
        trainer.train = mergedPath;
        trainer.templateFile = templatePath;
        trainer.iterNum = 100;
        trainer.c = 0.01f;
        trainer.train();

        final float threshold = 1.0E-5f;
        new ModelOptimization(threshold).optimizeTag(modelPath);

        new POSAddEnTag().addEnTag(modelPath);
    }

}
45 | -------------------------------------------------------------------------------- /fnlp-train/src/main/java/org/fnlp/train/seg/SegRunFinal.java: -------------------------------------------------------------------------------- 1 | package org.fnlp.train.seg; 2 | 3 | import org.fnlp.train.tag.ModelOptimization; 4 | import org.fnlp.util.MyFiles; 5 |
/**
 * Trains the segmentation model ({@code ../models/seg.m}) on the combined
 * train and test files, then runs {@link ModelOptimization} on the result.
 */
public class SegRunFinal {

    /**
     * @param args command-line arguments; not read
     * @throws Exception propagated from file handling, training or optimization
     */
    public static void main(String[] args) throws Exception {

        final String dataDir = "../data";
        final String modelPath = "../models/seg.m";
        final String templatePath = "../data/template-seg";

        // Merge the training files.
        // NOTE(review): unlike DepRunFinal, the merged file is not deleted
        // first; whether MyFiles.combine overwrites or appends is not visible
        // here - confirm.
        final String mergedPath = dataDir + "/FNLPDATA/all.seg";
        final String testPath = dataDir + "/FNLPDATA/test.seg";
        final String trainPath = dataDir + "/FNLPDATA/train.seg";
        MyFiles.combine(mergedPath, trainPath, testPath);

        SegTrain trainer = new SegTrain();
        trainer.model = modelPath;
        trainer.train = mergedPath;
        trainer.templateFile = templatePath;
        trainer.iterNum = 100;
        trainer.c = 0.01f;
        trainer.train();

        final float threshold = 0.001f;
        new ModelOptimization(threshold).optimizeTag(modelPath);
    }

}
49 |
-------------------------------------------------------------------------------- /fnlp-train/src/main/java/org/fnlp/train/tag/Clean2word.java: -------------------------------------------------------------------------------- 1 | package org.fnlp.train.tag; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | 6 | import org.fnlp.nlp.cn.Chars; 7 | import org.fnlp.nlp.cn.ChineseTrans; 8 | import org.fnlp.util.MyCollection; 9 | 10 | public class Clean2word { 11 | 12 | public static void main(String[] args) throws IOException { 13 | 14 | HashSet wset = new HashSet(); 15 | String file = "../data/tmp.txt"; 16 | HashSet set = MyCollection.loadSet(file , false); 17 | for(String w: set){ 18 | if(w.length()>=3||w.length()<=1) 19 | continue; 20 | if(Chars.isLetterOrDigitOrPunc(w)) 21 | continue; 22 | 23 | wset.add(w); 24 | } 25 | 26 | MyCollection.write(wset, "../data/word.txt"); 27 | System.out.print("Done"); 28 | 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /fnlp-train/src/main/java/org/fnlp/train/tag/TrainTagger.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This file is part of FNLP (formerly FudanNLP). 3 | * 4 | * FNLP is free software: you can redistribute it and/or modify 5 | * it under the terms of the GNU Lesser General Public License as published by 6 | * the Free Software Foundation, either version 3 of the License, or 7 | * (at your option) any later version. 8 | * 9 | * FNLP is distributed in the hope that it will be useful, 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | * GNU Lesser General Public License for more details. 13 | * 14 | * You should have received a copy of the GNU General Public License 15 | * along with FudanNLP. If not, see . 16 | * 17 | * Copyright 2009-2014 www.fnlp.org. All rights reserved. 
18 | */ 19 | 20 | package org.fnlp.train.tag; 21 | 22 | import org.fnlp.nlp.tag.Tagger; 23 | 24 | public class TrainTagger { 25 | 26 | /* 27 | * Tagger可以用来训练分词、词性标注、实体名识别等 28 | * 训练需要模板和训练语料,这些内容都存放在 \\10.141.200.3\Datasets\TrainData 中 29 | * TrainData中有三个目录 segmentation postagged ner 分别是分词、词性标注和实体名识别 30 | * 将相应的语料拷贝到指定的文件夹,修改下列参数就可以进行训练了 31 | */ 32 | 33 | /** 34 | * @param args 35 | * @throws Exception 36 | */ 37 | public static void main(String[] args) throws Exception { 38 | // 指定训练语料和模型存储的位置 39 | String template = "./tmpdata/template.sighan2005"; 40 | String corpus = "./tmpdata/as_training.utf8"; 41 | String model = "./tmpdata/cws.m"; 42 | 43 | // 如果在训练过程中没有测试文件请保持testfile为"" 44 | String testfile = ""; 45 | 46 | if(testfile != ""){ 47 | Tagger.main(new String[]{"-train",template,corpus,model,testfile}); 48 | }else{ 49 | Tagger.main(new String[]{"-train",template,corpus,model}); 50 | } 51 | 52 | 53 | } 54 | 55 | } -------------------------------------------------------------------------------- /fnlp-train/src/main/java/org/fnlp/train/tag/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 序列标注数据预处理. 3 | *

This file is part of FudanNLP. 4 | 5 | *

FudanNLP is free software: you can redistribute it and/or modify 6 | * it under the terms of the GNU Lesser General Public License as published by 7 | * the Free Software Foundation, either version 3 of the License, or 8 | * (at your option) any later version. 9 | 10 | *

FudanNLP is distributed in the hope that it will be useful, 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | * GNU Lesser General Public License for more details. 14 | 15 | *

You should have received a copy of the GNU Lesser General Public License 16 | * along with FudanNLP. If not, see 17 | * http://www.gnu.org/licenses/. 18 | 19 | *

Copyright 2009-2012 fnlp.org. All rights reserved. 20 | * 21 | * @author fnlp.org 22 | * @since FudanNLP 1.5 23 | * @version 1.0.0 24 | * @date 2012-11-14 25 | */ 26 | package org.fnlp.train.tag; -------------------------------------------------------------------------------- /models/.gitignore: -------------------------------------------------------------------------------- 1 | /dep.m 2 | /pos.m 3 | /seg.m 4 | *.bak 5 | -------------------------------------------------------------------------------- /models/ExtractPattern.txt: -------------------------------------------------------------------------------- 1 | ## 本文件由TreeExtractor类使用 2 | ## 格式由四部分组成,由%%隔开 3 | ## 第一部分为标签词性 4 | ## 第二部分为抽取模版 5 | ## 第三部分为限制 6 | ## 第四部分为同义词声明 7 | ## “##”开头为注释部分 8 | 9 | %% 10 | ##标签词性,添加标签词性以及自定义的词性 11 | #POI# 专有名 12 | #Person# 人名 13 | #s# 地名 14 | #m# 地名 15 | #d# 地名 16 | #a# 地名 17 | #dist# 数词 18 | #unit# 量词 19 | 以内 方位词 20 | 米 量词 21 | 22 | %% 23 | ##抽取模版 24 | A:#a#的 25 | 去#dist##unit#范围内的#d# 26 | 去#dist##unit#以内的#d# 27 | 去#dist##unit#内#d# 28 | 去#dist#公里以内的#d# 29 | 去#a#附近的#d# 30 | 去#a#上的#d# 31 | 从#s#出发经过#m#到#d#怎么走 32 | 从#s#经过#m#到#d#怎么走 33 | #s#出发经过#m#到#d#怎么走 34 | #s#经过#m#到#d#怎么走 35 | 从#s#出发到#d#经过#m# 36 | 从#s#到#d#经过#m# 37 | 从#s#出发经过#m#到#d# 38 | 从#s#经过#m#到#d# 39 | #s#出发经过#m#到#d# 40 | #s#经过#m#到#d# 41 | 从#s#出发到#d#经过#m# 42 | 从#s#到#d#经过#m# 43 | #s#出发到#d#经过#m# 44 | #s#到#d#经过#m# 45 | 目标在#d#经过#m# 46 | 目标#d#经过#m# 47 | 去#d#经过#m# 48 | 去#d#经过#m# 49 | 先去#m#再去#d# 50 | 去#m#再去#d# 51 | 从#s#出发到#d#怎么走 52 | 从#s#到#d#怎么走 53 | #s#出发到#d#怎么走 54 | #s#到#d#怎么走 55 | 从#s#出发到#d# 56 | 从#s#到#d# 57 | 从#s#出发到#d# 58 | 从#s#到#d# 59 | #s#出发到#d# 60 | 去#a#附近的#d# 61 | #a#附近的#d# 62 | 去#a#附近的 63 | 目标在#d# 64 | 目标#d# 65 | 导航去#d# 66 | 起点在#s# 67 | 起点#s# 68 | 从#s#出发 69 | 经过#m# 70 | 到#d#怎么走 71 | #d#怎么走 72 | 去#d# 73 | 找#d# 74 | #dist##unit# 75 | #s#到#d# 76 | #a#路上的 77 | #a#区的 78 | #a#附近的 79 | 80 | 81 | %% 82 | ##限制 83 | POI ONLY_CONTAIN_POS 实体名 人物名 84 | POI ONLY_CONTAIN_CONTENT 复旦 交大 85 | POI NOT_CONTAIN_POS 形容词 人名 助词 86 | POI 
NOT_CONTAIN_CONTENT 歌曲 报告 87 | dist ONLY_CONTAIN_POS 数词 88 | unit ONLY_CONTAIN_CONTENT 百米 千米 公里 米 里 89 | 90 | %% 91 | ##同义词声明 92 | 经过 路过 途经 途径 93 | 走 去 到 找 查 搜 搜索 检索 94 | 要 想 95 | 目的地 目标 终点 96 | 是 在 97 | 再 然后 98 | 起点 出发地 起始地 99 | 附近 旁边 周边 周围 100 | 路 街 路口 101 | 内 以内 之内 左右 102 | 百米 千米 公里 米 里 103 | 104 | %% 105 | ##与父节点的关系限制 106 | #POI# ONLY_PARENT_RELATION 宾语 107 | #d# NOT_PARENT_RELATION 状语 补语 语态 108 | #m# NOT_PARENT_RELATION 状语 补语 语态 109 | #s# NOT_PARENT_RELATION 状语 补语 语态 -------------------------------------------------------------------------------- /models/Stock-Tree.txt: -------------------------------------------------------------------------------- 1 | ## 本文件由TreeExtractor类使用 2 | ## 格式由四部分组成,由%%隔开 3 | ## 第一部分为标签词性 4 | ## 第二部分为抽取模版 5 | ## 第三部分为限制 6 | ## 第四部分为同义词声明 7 | ## “##”开头为注释部分 8 | 9 | %% 10 | ##标签词性 11 | #stock# 实体名 12 | #time# 时间短语 13 | 今天 时间短语 14 | 15 | %% 16 | ##抽取模版 17 | 查#time##stock#股价 18 | 查#time##stock#的股价 19 | #time##stock#的股价 20 | #time##stock#的行情 21 | 查#stock#的股价 22 | #stock#的股价 23 | #time##stock#的股价怎样 24 | #stock#是涨是跌 25 | #stock#涨了吗 26 | #stock#的价格 27 | #stock#的 28 | #time#的 29 | 30 | %% 31 | ##限制 32 | #stock# NOT_CONTAIN_CONTENT 查 查看 查询 看 问 33 | #stock# NOT_CONTAIN_POS 时间短语 34 | #time# ONLY_CONTAIN_POS 时间短语 35 | 36 | 37 | %% 38 | ##同义词声明 39 | 价格 股价 成交价 40 | 查 查看 查询 看 问 41 | 股价 股票 行情 42 | 怎样 样 怎样 43 | 涨 跌 -------------------------------------------------------------------------------- /models/ar.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FudanNLP/fnlp/ce258f3e4a5add2ba0b5e4cbac7cab2190af6659/models/ar.m -------------------------------------------------------------------------------- /models/dict.txt: -------------------------------------------------------------------------------- 1 | 媒体计算 专有名 2 | 数据挖掘 专有名 3 | 乐phone 专有名 4 | 媒体计算研究所 专有名 5 | 数据挖掘 专有名 6 | 数据库系统 专有名 7 | 数据仓库 专有名 8 | 数据密集型计算 9 | 软件工程 10 | 集体智慧编程 mypos 11 | 极其 12 | 门罗帕克市 
-------------------------------------------------------------------------------- /models/dict_ambiguity.txt: -------------------------------------------------------------------------------- 1 | 媒体计算研究所 2 | 中华人民共和国 3 | 宁夏回族自治区 4 | 上海交通大学 5 | 研究所 6 | 数据库系统 7 | 高级数据挖掘 8 | 数据挖掘 9 | 高级 10 | 数据仓库 11 | 数据 12 | 数据密集型计算 13 | 软件工程 14 | 集体智慧编程 15 | 送给 16 | 给力 17 | 量子力学 18 | 力学 19 | 力学系 20 | 同学 21 | 送 22 | 学 23 | 成立了 24 | 成立 25 | 了 26 | 玩具 -------------------------------------------------------------------------------- /models/dict_dep.txt: -------------------------------------------------------------------------------- 1 | 0 * 名词|地名|实体名|机构名 -1 核心词 2 | 1 的 * 0 语态 3 | 4 | 0 * 动词 -1 核心词 5 | 1 * 地名 2 定语 6 | 2 附近 名词 3 的字结构 7 | 3 的 结构助词 4 定语 8 | 4 * 地名 0 宾语 9 | -------------------------------------------------------------------------------- /models/stopwords/ErrorWords.txt: -------------------------------------------------------------------------------- 1 | 下么 2 | 睡哦 3 | 老公你 4 | 厂做 5 | 刚睡一会 6 | 看咯 7 | 得票 8 | 你故意 9 | 思哦 10 | 互相疼 11 | 新婚愉 -------------------------------------------------------------------------------- /models/stopwords/NoSenseWords.txt: -------------------------------------------------------------------------------- 1 | 呵呵 2 | 咋样 3 | zz 4 | 本条 5 | 正在 6 | 永远 7 | 现在 8 | 恩恩 9 | 之前 10 | 昨天 11 | 今天 12 | 明天 13 | 谢谢 14 | 没事 15 | 以为 16 | 短信 17 | 干嘛 18 | 意思 19 | 晚安 20 | 吃饭 21 | 呵呵 22 | 想想 23 | 一起 24 | 事情 25 | 看到 26 | 肯定 27 | 信息 28 | 哈哈哈 29 | 好吧 -------------------------------------------------------------------------------- /models/time.m: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FudanNLP/fnlp/ce258f3e4a5add2ba0b5e4cbac7cab2190af6659/models/time.m -------------------------------------------------------------------------------- /models/wordgraph.txt: -------------------------------------------------------------------------------- 1 | 同义词 去 到 2 | 同义词 是的 对的 好的 3 | 反义词 唱 听 
--------------------------------------------------------------------------------